In [10]:
from openai import OpenAI
import api
from langchain_qdrant import FastEmbedSparse
from langchain_openai import OpenAIEmbeddings
import chatbot
import retrieval
import vector_store as vs
import util
import evaluation as eval
import os
from tqdm import tqdm
import argparse
import util
import api
import os

import pdf_extraction as pdf
import caption_generation as cap
import vector_store as vs
import retrieval
import chatbot

from langchain_qdrant import FastEmbedSparse
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

from tqdm import tqdm
import chunking
from PIL import Image
import torch
import clip

import numpy as np
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from langchain.embeddings.base import Embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models

import evaluation

pdf_paths = ["Dataset/File1.pdf", "Dataset/File2.pdf", "Dataset/File3.pdf", "Dataset/File4.pdf"]
extract_folder = "robust_extracted_content"
singlepage_folder = "single_pages"
evaluation_data_folder = "evaluation_dataset"
evaluation_result_folder = "evaluation_results"

text_collection_name = 'texts_CLIP'
caption_collection_name = 'captions_CLIP'
image_collection_name = 'images_CLIP'

txt_similarity_topk = 5
cos_filtering_threshold = 0.8
cos_filtering_topk = 3
image_similarity_topk = 5
chunk_size = 300
chunk_overlap = 100

query = "query.txt"
evaluation_data = "complete_evaluation_dataset.json"
eval_result = "score.json"
eval_record = "record.json"
text_pred = "text_predictions.json"
image_pred = "image_predictions.json"
context = "retrieved_contexts.json"

# Extract Content

In [5]:
util.clear_output_folder(extract_folder)
util.clear_output_folder(singlepage_folder)

for pdf_path in pdf_paths:
    file = pdf.PDF(pdf_path)
    file.extract_images_and_text(singlepage_folder, extract_folder)
    file.save_imageInfo(extract_folder)
    file.convert_to_images(singlepage_folder)

manual_extracted_content is ready for new content.
single_pages is ready for new content.


Processing pages of file Dataset/File1.pdf: 100%|██████████| 3/3 [00:00<00:00, 93.15it/s]

Text files are combined in: manual_extracted_content/File1/File1_Pages.txt
Data written to manual_extracted_content/File1/File1_imagesInfo.json
Converting pdf pages to individual images.



3it [00:00,  5.00it/s]
Processing pages of file Dataset/File2.pdf: 100%|██████████| 14/14 [00:00<00:00, 309.58it/s]


Text files are combined in: manual_extracted_content/File2/File2_Pages.txt
Data written to manual_extracted_content/File2/File2_imagesInfo.json
Converting pdf pages to individual images.


14it [00:03,  3.81it/s]
Processing pages of file Dataset/File3.pdf: 100%|██████████| 56/56 [00:00<00:00, 576.36it/s]


Text files are combined in: manual_extracted_content/File3/File3_Pages.txt
Data written to manual_extracted_content/File3/File3_imagesInfo.json
Converting pdf pages to individual images.


56it [00:14,  3.94it/s]
Processing pages of file Dataset/File4.pdf: 100%|██████████| 20/20 [00:00<00:00, 590.67it/s]


Text files are combined in: manual_extracted_content/File4/File4_Pages.txt
Data written to manual_extracted_content/File4/File4_imagesInfo.json
Converting pdf pages to individual images.


20it [00:04,  4.02it/s]


# Generate Captions

In [None]:
for pdf_path in pdf_paths:
    print(f'Generating captions for {pdf_path}.')
    file = pdf.PDF(pdf_path)
    file.images = util.create_class_from_json(pdf.Image, f"{extract_folder}/{file.file_name}/{file.file_name}_imagesInfo.json")
    file.append_images_contexts()
    for image in tqdm(file.images, desc='Generating captions for images.'):
        image.generate_caption(singlepage_folder)
    file.save_imageInfo(extract_folder)

# Generate & Store Embeddings

In [4]:
chunks = []
images = []

for pdf_path in pdf_paths:
    file = pdf.PDF(pdf_path)
    file.images = util.create_class_from_json(pdf.Image, f"{extract_folder}/{file.file_name}/{file.file_name}_imagesInfo.json")

    splits = chunking.naive_chunk(chunk_size, chunk_overlap, f"{extract_folder}/{file.file_name}/{file.file_name}_Pages.txt")
    for split in splits:
        chunks.append(chunking.Chunk(split))

    for image in file.images:
        images.append(image)

vs.create_collection(text_collection_name, 1536)
vs.create_collection(caption_collection_name, 1536)
vs.create_collection(image_collection_name, 512)

# for chunk in tqdm(chunks, desc="Adding text cunks to the vectore store."):
#     vs.add_chunk(text_collection_name, chunk)
for image in tqdm(images, desc="Adding captions and images to the vectore store."):
    vs.add_caption(caption_collection_name, image)
    vs.add_image(image_collection_name, image)

Total text chunks created: 2
Total text chunks created: 16
Total text chunks created: 54
Total text chunks created: 14
Deleted old version collection texts_CLIP
Collection texts_CLIP initialized.
Deleted old version collection captions_CLIP
Collection captions_CLIP initialized.
Deleted old version collection images_CLIP
Collection images_CLIP initialized.


Adding captions and images to the vectore store.: 100%|██████████| 98/98 [07:31<00:00,  4.61s/it]


# Respond Text

In [None]:
def respond_text_query(
        query, 
        text_collection_name, 
        caption_collection_name, 
        txt_similarity_topk, 
        image_similarity_topk, 
        cos_filtering_threshold, 
        cos_filtering_topk,
        verbose=1
        ):
    
    dense_embedding_function = OpenAIEmbeddings(api_key=api.OPENAI_KEY, model="text-embedding-3-small")
    sparse_embedding_function = FastEmbedSparse(model_name="prithivida/Splade_PP_en_v1")
    
    collection_text = vs.get_collection(
        text_collection_name, 
        dense_embedding_function,
        sparse_embedding_function
    )

    collection_caption = vs.get_collection(
        caption_collection_name, 
        dense_embedding_function,
        sparse_embedding_function
    )

    text_retriever = retrieval.Retriever(query, collection_text)
    text_retriever.similarity_search(txt_similarity_topk)
    text_retriever.rerank('content')
    text_retriever.cos_filtering(vs.dense_embed, 'content', cos_filtering_threshold, cos_filtering_topk)

    context = " "
    for filtered_context in text_retriever.filtered_contexts:
        context += filtered_context
    
    symptom_explanation = chatbot.symptom_list_response(query, context)

    image_retriever = retrieval.Retriever(symptom_explanation, collection_caption)
    image_retriever.similarity_search(image_similarity_topk)
    image_retriever.rerank('caption')
    retrieved_image = image_retriever.reranked_docs[0]
    image_payload = vs.retrieve_payload(retrieved_image, collection_caption)
    image_path = image_payload["image_path"]

    if verbose:
        print(symptom_explanation)
        util.show_image(image_path)

    return symptom_explanation, image_path, context

In [None]:
query = util.read_query(query).replace('\n', "")
respond_text_query(query, text_collection_name, caption_collection_name, txt_similarity_topk, image_similarity_topk, cos_filtering_threshold, cos_filtering_topk)

# Respond Images

In [8]:
def respond_image_query(
        query
        ):
    
    client = QdrantClient(url=api.QDRANT_URL, api_key=api.QDRANT_API)
    
    model = SentenceTransformer('clip-ViT-B-32', device='cpu')
    image_content = Image.open(query)
    query_embedding = model.encode(image_content)

    found_docs = client.search(
        collection_name=image_collection_name,
        query_vector=query_embedding,
    )

    return found_docs[0].payload['caption']


In [4]:
query = util.read_query('query_image.txt')
respond_image_query(query, 1)

Symptom: Florid cemento-osseous dysplasia (mixed)

Description: Florid cemento-osseous dysplasia is a benign condition involving multiple quadrants of the jaws, often bilaterally and symmetrically. It is associated with the apices of the teeth and is a diffuse form of periapical cemental dysplasia. Radiographically, it undergoes three stages: starting as a radiolucent lesion, progressing to radiopacities within the apical radiolucencies, and finally appearing as a densely radiopaque lesion surrounded by a thin radiolucent line. Adjacent teeth are typically unaffected, stable, and not resorbed.


  found_docs = client.search(


# Evaluation

In [9]:
evaluation_data = util.load_json('evaluation_dataset/evaluation_data.json')
results = []
for entry in evaluation_data:
    query = entry['image_path']
    retrieved_caption = respond_image_query(query)
    entry['caption_retrieved'] = retrieved_caption
    results.append(entry)
util.save_as_json(results, 'evaluation_results/predictions.json')

Loaded content from evaluation_dataset/evaluation_data.json


  found_docs = client.search(


Saved content to evaluation_results/predictions.json


In [11]:
predicts = util.load_json('evaluation_results/predictions.json')
for predict in predicts:
    if predict['caption_truth'] == predict['caption_retrieved']:
        predict['correct'] = 1
    else:
        predict['correct'] = 0
util.save_as_json(predicts, 'evaluation_results/predictions.json')

Loaded content from evaluation_results/predictions.json
Saved content to evaluation_results/predictions.json
