# Load Raw Dataset

In [None]:
import pandas as pd

# Load the dataset
csv_path = "/Users/sheetalpatnaik/Desktop/GENAI/test_2.csv"
df = pd.read_csv(csv_path)

# Preview
print(df.head())


   index             Figure_path  \
0     62  PMC8253867_Fig2_41.jpg   
1     65  PMC8253867_Fig2_42.jpg   
2     67  PMC8253873_Fig6_45.jpg   
3     68  PMC8253873_Fig6_46.jpg   
4     74  PMC8253873_Fig8_49.jpg   

                                             Caption  \
0  CT pulmonary angiogram reveals encasement and ...   
1  CT pulmonary angiogram reveals encasement and ...   
2  Axial STIR MR image of the tear of the patella...   
3  MRI axial view of the patellar tendon at 6‚Äâmon...   
4  Pre-injection axial STIR MR image showing inju...   

                                            Question  \
0   What is the name of the artery encased and di...   
1   Which artery is encased and displaced accordi...   
2  What is the structure affected by the tear sho...   
3   What is the imaging technique used in the fig...   
4   What type of MRI sequence was used for imaging?    

                        Choice A  \
0      A: Right Coronary Artery    
1   A:Left main coronary artery    

# Image Preprocessing

 Resize + Normalize

In [None]:
from PIL import Image
import torchvision.transforms as transforms
import os

# Image paths
image_dir_1 = "/Users/sheetalpatnaik/Desktop/GENAI/images"
image_dir_2 = "/Users/sheetalpatnaik/Desktop/GENAI/figures"

# Transform
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


Preprocess Image Function + Loop

In [None]:
from tqdm import tqdm

def preprocess_image(image_path):
    try:
        img = Image.open(image_path).convert("RGB")
        return image_transform(img)
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

preprocessed_images = {}

for idx, row in tqdm(df.iterrows(), total=len(df)):
    file_name = row["Figure_path"].strip()
    path1 = os.path.join(image_dir_1, file_name)
    path2 = os.path.join(image_dir_2, file_name)

    if os.path.exists(path1):
        tensor = preprocess_image(path1)
    elif os.path.exists(path2):
        tensor = preprocess_image(path2)
    else:
        continue

    if tensor is not None:
        preprocessed_images[file_name] = tensor

print("Total preprocessed images:", len(preprocessed_images))


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 33430/33430 [01:35<00:00, 350.22it/s]


‚úÖ Total preprocessed images: 29021


# Text Preprocessing (Questions + Answers)

In [None]:
import re

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"[^a-z0-9\s\-:(),.%]", "", text)
    return text.strip()

df["Cleaned_Question"] = df["Question"].apply(clean_text)
df["Cleaned_Answer"] = df["Answer"].apply(clean_text)
df["Cleaned_Caption"] = df["Caption"].apply(clean_text)


In [None]:
print(df["Cleaned_Caption"])

0        ct pulmonary angiogram reveals encasement and ...
1        ct pulmonary angiogram reveals encasement and ...
2        axial stir mr image of the tear of the patella...
3        mri axial view of the patellar tendon at 6 mon...
4        pre-injection axial stir mr image showing inju...
                               ...                        
33425    replacement teeth of holotype right dentary (n...
33426    a large field oct image capturing the surface ...
33427    a camera image of the same section of tissues ...
33428    merged pars and oct (0.4 na) image of resected...
33429    merged pars and oct (0.4 na) image of resected...
Name: Cleaned_Caption, Length: 33430, dtype: object


#Dataset Pairing
Already done as we are using
df['Figure_path'], df['Cleaned_Question'], df['Cleaned_Answer']

#  Annotation Cleaning

In [None]:
valid_choices = {"a", "b", "c", "d", "A", "B", "C", "D"}

clean_df = df[
    df["Question"].notnull() &
    df["Answer"].notnull() &
    df["Choice A"].notnull() &
    df["Choice B"].notnull() &
    df["Choice C"].notnull() &
    df["Choice D"].notnull() &
    df["Question"].str.strip().ne("") &
    df["Answer"].str.strip().ne("") &
    df["Choice A"].str.strip().ne("") &
    df["Choice B"].str.strip().ne("") &
    df["Choice C"].str.strip().ne("") &
    df["Choice D"].str.strip().ne("") &
    df["Answer"].str.upper().isin(valid_choices)
].copy()

clean_df["Cleaned_Answer"] = clean_df["Answer"].str.upper().str.strip()


In [None]:
print(clean_df["Cleaned_Answer"])

0        B
1        D
2        C
3        C
4        C
        ..
33425    B
33426    D
33427    D
33428    C
33429    A
Name: Cleaned_Answer, Length: 33430, dtype: object


#  Data Filtering / Sampling

In [None]:
# Random 1000 for few-shot prompts
subset_df = clean_df.sample(n=1000, random_state=42).reset_index(drop=True)


In [None]:
print(subset_df)

       index                                  Figure_path  \
0      58420                    PMC8519188_FIG5_85295.jpg   
1      10664                    PMC8285465_Fig3_10775.jpg   
2     173124                   PMC8918112_Fig4_221411.jpg   
3     883387                   PMC8225413_fig2_475661.jpg   
4     186762                   PMC9015882_fig2_255557.jpg   
..       ...                                          ...   
995   132642                     PMC8692788_F1_143932.jpg   
996    28951  PMC8350899_emmm202013695-fig-0003_30439.jpg   
997  1147805                    PMC8443912_Fig1_60310.jpg   
998   126366                   PMC8786746_Fig1_175557.jpg   
999   865651      PMC8162640_pone.0252544.g002_455897.jpg   

                                               Caption  \
0    CT scan of the chest post - chemotherapy showi...   
1    Orbit magnetic resonance imaging (MRI) at the ...   
2    posterior pole (Pos) is at the right in all ph...   
3    Excisional biopsy revealing in

# Format for Model Input

### Re-encoded PMC questions

In [None]:
from transformers import CLIPTokenizer, CLIPModel
import torch
import numpy as np

# Load CLIP tokenizer and model
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

# Generate CLIP embeddings for PMC questions
def get_clip_text_embedding(text):
    inputs = clip_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**inputs)
    return text_features.squeeze().cpu().numpy()

pmc_questions = [item["question"] for item in prompt_data]
pmc_clip_embeddings = np.array([get_clip_text_embedding(q) for q in pmc_questions])
pmc_clip_embeddings = pmc_clip_embeddings / np.linalg.norm(pmc_clip_embeddings, axis=1, keepdims=True)

print(" Re-encoded PMC questions using CLIP. Shape:", pmc_clip_embeddings.shape)


‚úÖ Re-encoded PMC questions using CLIP. Shape: (1000, 512)


In [None]:
import json

# Example: few-shot format (image path + question + choices)
prompt_data = []

for _, row in subset_df.iterrows():
    prompt = {
        "image": row["Figure_path"],
        "question": row["Cleaned_Question"],
        "choices": {
            "A": row["Choice A"],
            "B": row["Choice B"],
            "C": row["Choice C"],
            "D": row["Choice D"]
        },
        "answer": row["Cleaned_Answer"]
    }
    prompt_data.append(prompt)
print(json.dumps(prompt_data,indent=4))
# Save as JSON
with open("few_shot_prompts.json", "w") as f:
    json.dump(prompt_data, f, indent=2)



[
    {
        "image": "PMC8519188_FIG5_85295.jpg",
        "question": "what does the image depict about the patients tumor",
        "choices": {
            "A": " A:The tumor has grown larger ",
            "B": " B:The tumor has shrunk ",
            "C": " C:The tumor has not changed ",
            "D": " D:The image doesn't show tumor regression "
        },
        "answer": "B"
    },
    {
        "image": "PMC8285465_Fig3_10775.jpg",
        "question": "what imaging technique was used to capture the image",
        "choices": {
            "A": " A:CT scan ",
            "B": " B:Electroencephalography ",
            "C": " C:X-ray ",
            "D": " D:Magnetic resonance imaging "
        },
        "answer": "D"
    },
    {
        "image": "PMC8918112_Fig4_221411.jpg",
        "question": "what is located to the right in all the photographs",
        "choices": {
            "A": " A:The anterior pole ",
            "B": " B:The posterior pole ",
            "C": " 

# ZERO SHOT

In [None]:
!pip install openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
import openai


In [None]:
import openai
import json
from PIL import Image
import base64
from io import BytesIO
import os
import re




# Helper: Convert image to base64
def image_to_base64(image_path):
    with Image.open(image_path) as img:
        buffered = BytesIO()
        img.convert("RGB").save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

# Helper: Extract option letter (A‚ÄìD)
def extract_option_letter(text):
    match = re.search(r'\b([A-D])\b', text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    return None

#  Helper: Check for low confidence
def is_low_confidence(answer):
    answer = answer.lower()
    if re.search(r"\b[a-d]\b.*\b[a-d]\b", answer, re.IGNORECASE):  # mentions multiple options
        return True
    uncertain_phrases = ["might be", "could be", "not sure", "maybe", "possibly"]
    return any(phrase in answer for phrase in uncertain_phrases)

#  Helper: Re-prompt the model for clarification
def get_clarified_answer(question, choices, image_b64):
    clarification_prompt = f"""
You are a medical expert. Analyze the image and answer the following multiple-choice question.

Question: {question}

Options:
A. {choices['A']}
B. {choices['B']}
C. {choices['C']}
D. {choices['D']}

Please respond with **only the correct option letter (A, B, C, or D)**. Do not provide explanation.
"""

    response = openai.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": clarification_prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
                ]
            }
        ],
        max_tokens=50,
        temperature=0.3
    )
    return response.choices[0].message.content.strip()


# Dataset and path
image_folder = "/Users/sheetalpatnaik/Desktop/GENAI/figures/"
correct = 0
total = 0
image_count = 0

for idx, item in enumerate(prompt_data):
    if image_count >= 30:
        break

    question = item["question"]
    correct_answer = item["answer"].strip().upper()
    image_path = os.path.join(image_folder, item["image"].strip())

    if not os.path.exists(image_path):
        print(f" Image not found: {image_path}")
        continue

    try:
        image_b64 = image_to_base64(image_path)

        prompt_text = f"""
You are a medical expert. Analyze the image and answer the following multiple-choice question.

Question: {item['question']}

Options:
A. {item['choices']['A']}
B. {item['choices']['B']}
C. {item['choices']['C']}
D. {item['choices']['D']}

Please respond with only the correct option letter (A, B, C, or D).
"""

        response = openai.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
                    ]
                }
            ],
            max_tokens=100,
            temperature=0.3
        )

        gpt_answer = response.choices[0].message.content.strip()
        predicted_option = extract_option_letter(gpt_answer)

        # üîç Re-prompt if prediction is invalid or confidence is low
        if predicted_option not in ["A", "B", "C", "D"] or is_low_confidence(gpt_answer):
            print(" Low confidence or invalid option detected. Re-prompting...")
            gpt_answer = get_clarified_answer(question, item["choices"], image_b64)
            predicted_option = extract_option_letter(gpt_answer)

        #  Display results
        print(f"\n[Q{idx+1}] {question}")
        print(f" Predicted Answer: {gpt_answer}")
        print(f" Predicted Option: {predicted_option if predicted_option else 'Not Found'}")
        print(f" Actual Option: {correct_answer}")

        if predicted_option == correct_answer:
            correct += 1
        total += 1
        image_count += 1

    except Exception as e:
        print(f" Error processing image {image_path}: {e}")

# Final accuracy
if total > 0:
    print(f"\n Overall Accuracy: {correct}/{total} = {correct / total:.2f}")
else:
    print(" No valid images processed.")


[Q1] what does the image depict about the patients tumor
üìå Predicted Answer: B
üî¢ Predicted Option: B
‚úÖ Actual Option: B

[Q2] what imaging technique was used to capture the image
üìå Predicted Answer: D.
üî¢ Predicted Option: D
‚úÖ Actual Option: D

[Q3] what is located to the right in all the photographs
üìå Predicted Answer: C. The vegetal pole
üî¢ Predicted Option: C
‚úÖ Actual Option: B
‚ö†Ô∏è Low confidence or invalid option detected. Re-prompting...

[Q4] what does the excisional biopsy reveal in this image
üìå Predicted Answer: C
üî¢ Predicted Option: C
‚úÖ Actual Option: C

[Q5] what approach was used for the surgery
üìå Predicted Answer: B. Posterior approach
üî¢ Predicted Option: B
‚úÖ Actual Option: B

[Q6] what is indicated in blue in the image
üìå Predicted Answer: C.
üî¢ Predicted Option: C
‚úÖ Actual Option: B
‚ö†Ô∏è Low confidence or invalid option detected. Re-prompting...

[Q7] which type of probe is used for both methods mentioned in the caption
ü

# FEW SHOT

In [None]:
!pip install clip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
pip uninstall -y clip


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: clip 1.0
Uninstalling clip-1.0:
  Successfully uninstalled clip-1.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install git+https://github.com/openai/CLIP.git

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /private/var/folders/tc/5d149j1n63d449j9q57cyn1c0000gn/T/pip-req-build-onu5jbdc
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /private/var/folders/tc/5d149j1n63d449j9q57cyn1c0000gn/T/pip-req-build-onu5jbdc
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l- \ | done
[?25h  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=9d08c6ad892f00a7c6b23e04479ae53398a7e91c35773317e5a59fee9b166dab
  Stored in directory: /private/var/folders/tc/5d149j1n63d449j9q57cyn1c0000gn/T/pip-ephem-wheel-cache-m9m102r6/wheels/35/3e/df/3d24cbfb3b6a06f17a2bfd7d1138900d4365d9028aa8f6e92f
Successfully built clip
Installing collected packages: clip

In [None]:
import clip
print(clip.__file__)

/opt/anaconda3/lib/python3.12/site-packages/clip/__init__.py


STEP 1: Load Libraries

In [None]:
import os
import json
import pickle
import numpy as np
from PIL import Image
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
import openai
import torch
from transformers import CLIPProcessor, CLIPModel
device = "cuda" if torch.cuda.is_available() else "cpu"

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

Define a Function to Generate Embeddings

In [None]:
def get_embedding(image_path, question):
    # Load image
    image = Image.open(image_path).convert("RGB")

    # Process inputs for CLIP
    inputs = clip_processor(text=[question], images=image, return_tensors="pt", padding=True).to(device)

    # Get embeddings
    with torch.no_grad():
        outputs = clip_model(**inputs)
        image_emb = outputs.image_embeds[0].cpu().numpy()
        text_emb = outputs.text_embeds[0].cpu().numpy()

    # Combine them (simple average; you can also use concat if you prefer)
    combined_emb = (image_emb + text_emb) / 2
    return combined_emb

Load Your Embeddings (Pickle File)

In [None]:
with open("/Users/sheetalpatnaik/Desktop/GENAI/dataset_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

Fit KNN on Existing Embeddings

In [None]:
embedding_matrix = [item['embedding'] for item in embeddings]

knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(embedding_matrix)

Define Helper to Get Combined Embedding for a Query

In [None]:
image_folder = "/Users/sheetalpatnaik/Desktop/GENAI/figures"  # Replace with your folder path

def get_combined_embedding_from_query(query_entry):
    image_filename = query_entry["image"]
    question = query_entry["question"]

    image_path = os.path.join(image_folder, image_filename)
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    return get_embedding(image_path, question)

In [None]:


correct_answers = 0
total_queries = 20

for i in tqdm(range(total_queries)):
    query = prompt_data[i]

    # Step 1: Get query embedding
    query_embedding = get_combined_embedding_from_query(query)

    # Step 2: KNN - Find similar examples
    distances, indices = knn.kneighbors([query_embedding])
    similar_examples = [embeddings[j] for j in indices[0]]

    # Step 3: Build prompt
    prompt = "You are a helpful assistant. Based on the examples, answer the final question.\n\n"
    for idx, ex in enumerate(similar_examples):
        prompt += f"Example {idx+1}:\n"
        prompt += f"Q: {ex['question']}\n"
        prompt += f"A: {ex['answer']}\n"

    prompt += f"\nQ: {query['question']}\nChoices:\n"
    for k, v in query["choices"].items():
        prompt += f"{v}\n"
    prompt += f"A:"

    # Step 4: Call GPT-4
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=10,
        temperature=0.0
    )

    model_answer = response.choices[0].message.content.strip()

    print(f"Actual Answer: {query['answer']} | Predicted Answer: {model_answer}")

    # Step 5: Compare answer
    if model_answer[0].upper() == query["answer"].upper():
        correct_answers += 1

accuracy = correct_answers / total_queries * 100
print(f"\n Accuracy on first {total_queries} queries: {accuracy:.2f}%")

  5%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                                                                                                                                                                   | 1/20 [00:03<00:58,  3.07s/it]

Actual Answer: B | Predicted Answer: B: The tumor has shrunk



 10%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè                                                                                                                                                          | 2/20 [00:05<00:44,  2.49s/it]

Actual Answer: D | Predicted Answer: D: Magnetic resonance imaging



 15%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                                                                                                                                                  | 3/20 [00:05<00:29,  1.72s/it]

Actual Answer: B | Predicted Answer: B: The posterior pole



 20%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç                                                                                                                                         | 4/20 [00:06<00:22,  1.42s/it]

Actual Answer: C | Predicted Answer: C: Inguinal node metastasis



 25%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                                                                                                                 | 5/20 [00:07<00:19,  1.28s/it]

Actual Answer: B | Predicted Answer: B: Posterior approach



 30%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                                                                                                                        | 6/20 [00:08<00:16,  1.15s/it]

Actual Answer: B | Predicted Answer: B: nucleus



 35%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè                                                                                                               | 7/20 [00:09<00:12,  1.01it/s]

Actual Answer: C | Predicted Answer: C: Linear probe



 40%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                                                                                                       | 8/20 [00:10<00:11,  1.04it/s]

Actual Answer: B | Predicted Answer: B: Left septal/subcallosal area



 45%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç                                                                                              | 9/20 [00:11<00:10,  1.05it/s]

Actual Answer: B | Predicted Answer: B: Hip Prosthesis



 50%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                                                                                     | 10/20 [00:11<00:08,  1.17it/s]

Actual Answer: C | Predicted Answer: C: Right ICA



 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                                                                             | 11/20 [00:12<00:07,  1.16it/s]

Actual Answer: D | Predicted Answer: D: Lateral



 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå                                                                    | 12/20 [00:13<00:06,  1.22it/s]

Actual Answer: D | Predicted Answer: D: Lower right lobe



 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè                                                           | 13/20 [00:14<00:05,  1.20it/s]

Actual Answer: B | Predicted Answer: Type B dissection.



 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã                                                   | 14/20 [00:15<00:05,  1.15it/s]

Actual Answer: B | Predicted Answer: B: right upper lobe



 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé                                          | 15/20 [00:16<00:04,  1.01it/s]

Actual Answer: A | Predicted Answer: A complex viral envelope



 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä                                  | 16/20 [00:17<00:03,  1.08it/s]

Actual Answer: B | Predicted Answer: B: CT scan



 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé                         | 17/20 [00:18<00:02,  1.09it/s]

Actual Answer: D | Predicted Answer: D: Congenitally corrected transposition of



 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ                 | 18/20 [00:19<00:01,  1.10it/s]

Actual Answer: B | Predicted Answer: B: Extension motion



 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç        | 19/20 [00:20<00:00,  1.10it/s]

Actual Answer: D | Predicted Answer: First and second lines.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:20<00:00,  1.05s/it]

Actual Answer: D | Predicted Answer: D: Four-part fracture

‚úÖ Accuracy on first 20 queries: 90.00%





# Chain of Thought

Load Your Data and Embeddings

In [None]:
import json
import pickle
import os

# Paths

EMBEDDINGS_PKL_PATH = "/Users/sheetalpatnaik/Desktop/GENAI/dataset_embeddings.pkl"  # <-- update with actual path
IMAGE_FOLDER = "/Users/sheetalpatnaik/Desktop/GENAI/figures"                      # <-- path to image folder

# Load your 971 saved embeddings
with open(EMBEDDINGS_PKL_PATH, "rb") as f:
    saved_embeddings = pickle.load(f)

print(f"Loaded {len(saved_embeddings)} saved embeddings.")


print(f"Loaded {len(prompt_data)} VQA entries.")

Loaded 971 saved embeddings.
Loaded 1000 VQA entries.


Load CLIP Model and Preprocessing

In [None]:
import clip
import torch
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

Create (Image + Question) Embeddings in Same Format

In [None]:
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from torch.nn.functional import normalize

def generate_clip_embedding(image_path, question_text):
    # Load and preprocess image
    image = preprocess(Image.open(image_path).convert("RGB")).unsqueeze(0).to(device)

    # Tokenize text
    text = clip.tokenize([question_text]).to(device)

    # Encode both
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

    # Combine (as you did before ‚Äì assume you averaged or concatenated; here we use average)
    combined = (image_features + text_features) / 2
    combined = normalize(combined, dim=1).squeeze().tolist()

    return combined  # returns list of floats

Compute Cosine Distance and Retrieve Top-3 Examples

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_top_k_similar(embedding, all_embeddings, k=3):
    # Extract embedding vectors from saved data
    all_vectors = [entry["embedding"] for entry in all_embeddings]
    similarities = cosine_similarity([embedding], all_vectors)[0]

    # Get top k indices (excluding the query itself if needed)
    top_k_indices = np.argsort(similarities)[-k:][::-1]

    top_k_examples = [all_embeddings[i] for i in top_k_indices]
    return top_k_examples

Build CoT Prompt for GPT-4

In [None]:
def format_example(example):
    text = f"Image: {example['image']}\nQuestion: {example['question']}\n"
    for key, val in example["choices"].items():
        text += f"{key}: {val.strip()}\n"
    text += f"Answer: {example['answer']}\n"
    return text

In [None]:
def create_prompt(top_k_examples, current_example):
    prompt = (
        "You are a medical imaging expert. Study the examples below to understand the reasoning. "
        "Then answer the final question. Think step by step and justify your answer.\n\n"
    )
    for i, ex in enumerate(top_k_examples):
        prompt += f"Example {i+1}:\n{format_example(ex)}\n\n"

    prompt += f"Now answer the following:\n{format_example(current_example)}\n"
    prompt += "Answer:"
    return prompt

Full Pipeline for 20 Random Queries

In [None]:
import random

# Take 20 random queries
selected_queries = random.sample(prompt_data, 50)

for i, entry in enumerate(selected_queries):
    image_path = os.path.join(IMAGE_FOLDER, entry["image"])

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}, skipping.")
        continue

    # Step 1: Create embedding
    current_embedding = generate_clip_embedding(image_path, entry["question"])

    # Step 2: Find top-3 examples
    top_k = get_top_k_similar(current_embedding, saved_embeddings, k=3)

    # Step 3: Create prompt
    prompt = create_prompt(top_k, entry)

    print(f"\n----- PROMPT FOR EXAMPLE {i+1} -----\n")
    print(prompt)
    print("\n" + "="*80 + "\n")




----- PROMPT FOR EXAMPLE 1 -----

You are a medical imaging expert. Study the examples below to understand the reasoning. Then answer the final question. Think step by step and justify your answer.

Example 1:
Image: PMC8590715_FIG5_112213.jpg
Question: what imaging technique was used to produce the adc map
A: A:Magnetic resonance imaging (MRI)
B: B:X-ray
C: C:Computed tomography (CT)
D: D:Positron emission tomography (PET)
Answer: A


Example 2:
Image: PMC8274757_f1_7979.jpg
Question: what type of imaging was used to obtain the image shown
A: A: Ultrasound
B: B: Magnetic Resonance Imaging
C: C: X-ray
D: D: Computed Tomography
Answer: B


Example 3:
Image: PMC8620805_jcm-10-05375-f004_122528.jpg
Question: what imaging technique was used to capture the image
A: A: X-ray
B: B: MRI
C: C: CT scan
D: D: PET scan
Answer: B


Now answer the following:
Image: PMC8590715_FIG5_112213.jpg
Question: what imaging technique was used to produce the adc map
A: A:Magnetic resonance imaging (MRI)
B: B:

SET UP OPEN AI

In [None]:
import os
import time
import re
from openai import OpenAI

# If hardcoding the API key for testing (safe only locally)


In [None]:
def ask_gpt4(prompt, model="gpt-4", temperature=0):
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature
        )
        reply = response.choices[0].message.content
        return reply.strip()
    except Exception as e:
        print("Error calling GPT-4:", e)
        return None

Call GPT-4 and Extract Answer

In [None]:
def extract_choice(text):
    if text is None:
        return None
    match = re.search(r"\b([A-D])\b", text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    return None

Query ‚Üí GPT-4 ‚Üí Evaluation

In [None]:
correct = 0
total = 0
results = []

for i, entry in enumerate(selected_queries):
    image_path = os.path.join(IMAGE_FOLDER, entry["image"])
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}, skipping.")
        continue

    current_embedding = generate_clip_embedding(image_path, entry["question"])
    top_k = get_top_k_similar(current_embedding, saved_embeddings, k=3)
    prompt = create_prompt(top_k, entry)

    print(f"\n----- Query {i+1} -----")
    print(f"Question: {entry['question']}")
    print(f"Ground Truth: {entry['answer']}")

    gpt_response = ask_gpt4(prompt)
    print("GPT-4 Response:", gpt_response)

    if gpt_response is None:
        print("Skipping due to failed GPT response.")
        continue

    predicted = extract_choice(gpt_response)
    print("Predicted Answer:", predicted)

    is_correct = predicted == entry["answer"]
    print("Correct:", is_correct)

    results.append({
        "question": entry["question"],
        "actual_answer": entry["answer"],
        "predicted_answer": predicted,
        "gpt_response": gpt_response,
        "correct": is_correct
    })

    if is_correct:
        correct += 1
    total += 1

    time.sleep(1.5)  # Be kind to API limits


----- Query 1 -----
Question: what imaging technique was used to produce the adc map
Ground Truth: A
GPT-4 Response: A
Predicted Answer: A
Correct: True

----- Query 2 -----
Question: 2) where are the cystic bone destructions located
Ground Truth: B
GPT-4 Response: B: In the skull
Predicted Answer: B
Correct: True

----- Query 3 -----
Question: which side of the ovaries was imaged in the given sample
Ground Truth: B
GPT-4 Response: B
Predicted Answer: B
Correct: True

----- Query 4 -----
Question: what was the result of the abdominal ct scan
Ground Truth: C
GPT-4 Response: C: The patient had a cecal cystic mass
Predicted Answer: C
Correct: True

----- Query 5 -----
Question: which knee is displayed in nonanatomic patient  6s mri
Ground Truth: B
GPT-4 Response: B
Predicted Answer: B
Correct: True

----- Query 6 -----
Question: which cranial nerves are involved due to inflammation
Ground Truth: A
GPT-4 Response: A
Predicted Answer: A
Correct: True

----- Query 7 -----
Question: what typ

PRINT ACCURACY

In [None]:
accuracy = correct / total if total > 0 else 0
print(f"\nFinal Accuracy: {accuracy * 100:.2f}%")


Final Accuracy: 100.00%


# Tree of Thought

In [None]:
import os
import json
import pickle
import torch
import clip
from PIL import Image
import numpy as np
import openai
import re
import time

# Paths
IMAGE_FOLDER = "/Users/sheetalpatnaik/Desktop/GENAI/figures"

PICKLE_PATH = "/Users/sheetalpatnaik/Desktop/GENAI/dataset_embeddings.pkl"

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load data
#with open(JSON_PATH, 'r') as f:
full_data = prompt_data

selected_queries = full_data[:30]

# Load saved embeddings
with open(PICKLE_PATH, 'rb') as f:
    saved_embeddings = pickle.load(f)

# Function to generate CLIP embedding
def generate_clip_embedding(image_path, question):
    image = preprocess(Image.open(image_path).convert("RGB")).unsqueeze(0).to(device)
    text = clip.tokenize([question]).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
    combined = (image_features + text_features) / 2
    return combined.cpu().numpy().flatten()

# Cosine similarity
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Top-k retrieval
def get_top_k_similar(current_embedding, saved_embeddings, k=3):
    similarities = [
        (cosine_similarity(current_embedding, np.array(e['embedding'])), e)
        for e in saved_embeddings
    ]
    return sorted(similarities, key=lambda x: x[0], reverse=True)[:k]

# Tree of Thought Prompt
def create_tot_prompt(examples, query_item):
    prompt = "You are an expert radiologist. Below are examples of questions, options, and reasoning steps.\n"
    for _, ex in examples:
        prompt += f"\nImage: {ex['image']}\nQuestion: {ex['question']}\nChoices:\n"
        for key, val in ex['choices'].items():
            prompt += f"{key}: {val}\n"
        prompt += (
            f"Let‚Äôs explore different lines of reasoning to answer this question. "
            f"What are the possible interpretations of the image and the question? What could lead to different answers? "
            f"After considering all possibilities, choose the best answer and explain why it is correct.\n"
            f"Answer: {ex['answer']}\n"
        )

    prompt += f"\nNow consider the following:\nImage: {query_item['image']}\nQuestion: {query_item['question']}\nChoices:\n"
    for key, val in query_item['choices'].items():
        prompt += f"{key}: {val}\n"
    prompt += (
        "Let‚Äôs explore different lines of reasoning to answer this question. "
        "What are the possible interpretations of the image and the question? What could lead to different answers? "
        "After considering all possibilities, choose the best answer and explain why it is correct.\nAnswer:"
    )
    return prompt

# GPT-4 call

def ask_gpt4(prompt):
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7
    )
    return response.choices[0].message.content.strip()

# Extract answer
def extract_choice(text):
    match = re.search(r"\b([A-D])\b", text, re.IGNORECASE)
    return match.group(1).upper() if match else "N/A"

# Run the full pipeline
correct = 0
total = 0
print("\n--- Tree of Thought Prompting Results ---\n")
for i, entry in enumerate(selected_queries):
    image_path = os.path.join(IMAGE_FOLDER, entry["image"])
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}, skipping.\n")
        continue

    current_embedding = generate_clip_embedding(image_path, entry["question"])
    top_k = get_top_k_similar(current_embedding, saved_embeddings, k=3)
    prompt = create_tot_prompt(top_k, entry)

    print(f"--- Query {i+1} ---")
    print(f"Question: {entry['question']}")
    print(f"Actual Answer: {entry['answer']}")

    try:
        gpt_response = ask_gpt4(prompt)
    except Exception as e:
        print("Error with GPT-4:", e)
        gpt_response = ""

    print("GPT-4 Response:", gpt_response)

    predicted = extract_choice(gpt_response)
    print("Predicted Answer:", predicted)
    is_correct = predicted == entry["answer"]
    print("Correct:", is_correct, "\n")

    if is_correct:
        correct += 1
    total += 1

    time.sleep(1.5)  # Respect rate limits

# Final accuracy
print("===============")
print(f"Final Accuracy: {correct}/{total} = {correct/total:.2%}")


--- Tree of Thought Prompting Results ---

--- Query 1 ---
Question: what does the image depict about the patients tumor
Actual Answer: B
GPT-4 Response: B
Predicted Answer: B
Correct: True 

--- Query 2 ---
Question: what imaging technique was used to capture the image
Actual Answer: D
GPT-4 Response: D
Predicted Answer: D
Correct: True 

--- Query 3 ---
Question: what is located to the right in all the photographs
Actual Answer: B
GPT-4 Response: B
Predicted Answer: B
Correct: True 

--- Query 4 ---
Question: what does the excisional biopsy reveal in this image
Actual Answer: C
GPT-4 Response: C
Predicted Answer: C
Correct: True 

--- Query 5 ---
Question: what approach was used for the surgery
Actual Answer: B
GPT-4 Response: B
Predicted Answer: B
Correct: True 

--- Query 6 ---
Question: what is indicated in blue in the image
Actual Answer: B
GPT-4 Response: B: nucleus

Reasoning: The blue color in the image is typically used to indicate the nucleus of the cell in most cell diagra

# RAG Implementation

In [None]:
# Step 1: Install datasets library
!pip install datasets

# Step 2: Load PubMedQA dataset
from datasets import load_dataset

# Automatically download it into Colab
pubmed_dataset = load_dataset("pubmed_qa", "pqa_labeled")

print(pubmed_dataset)

#print the first question
print(pubmed_dataset['train'][0])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 1000
    })
})
{'pubid': 21645374, 'question': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?', 'context': {'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.', 'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole wi

In [None]:
from datasets import load_dataset

# Load the PubMedQA dataset
pubmed_dataset = load_dataset("pubmed_qa", "pqa_labeled")

# Check a sample
print(pubmed_dataset['train'][0])

{'pubid': 21645374, 'question': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?', 'context': {'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.', 'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells i

## Create PMC-VQA Question Embeddings

In [None]:
pip install sentence-transformers pandas tqdm


Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m62.0/62.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

In [None]:
# from sentence_transformers import SentenceTransformer
# import pickle
# import json

# # Step 2: Extract only the questions
# pmc_questions = [item['question'] for item in prompt_data]

# # Step 3: Load sentence transformer model
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Step 4: Create embeddings
# pmc_question_embeddings = model.encode(pmc_questions, batch_size=32, show_progress_bar=True)

# # Step 5: Save
# with open('/Users/sheetalpatnaik/Desktop/GENAI/pmc_question_embeddings.pkl', 'wb') as f:
#     pickle.dump({
#         "questions": pmc_questions,
#         "embeddings": pmc_question_embeddings
#     }, f)

# print("PMC-VQA question embeddings created and saved.")



import pandas as pd
import pickle
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Load your matched dataset (with image names)
df = pd.read_csv("train_final.csv")

# Extract unique questions
questions = df["Question"].astype(str).tolist()

# Initialize text encoder (you can use any suitable model)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Generate embeddings
print(" Generating text embeddings...")
embeddings = model.encode(questions, show_progress_bar=True, batch_size=64)

# Save to pickle
output = {
    "questions": questions,
    "embeddings": embeddings
}

with open("pmc_question_embeddings.pkl", "wb") as f:
    pickle.dump(output, f)

print(" Saved PMC-VQA question embeddings to pmc_question_embeddings.pkl")





Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


‚öôÔ∏è Generating text embeddings...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 490/490 [00:07<00:00, 67.36it/s]


‚úÖ Saved PMC-VQA question embeddings to pmc_question_embeddings.pkl


In [None]:
pip install torch torchvision transformers

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.31.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m40.5/40.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting hf-xet<2.0.0,>=1.1.0 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Downloading hf

In [None]:
pip install google

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting google
  Downloading google-3.0.0-py2.py3-none-any.whl.metadata (627 bytes)
Downloading google-3.0.0-py2.py3-none-any.whl (45 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m45.3/45.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google
Successfully installed google-3.0.0
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import os

# Show current working directory
print(" Current working directory:", os.getcwd())

# List all subfolders and files here
print("\n Contents of current directory:")
print(os.listdir())



üìÅ Current working directory: /app

üìÇ Contents of current directory:
['.ipynb_checkpoints', '.Trash-0', 'dermamnist_128.npz', 'final_generated_samples.png', 'GenAI_Project', 'generated_samples', 'images_2', 'loss_curve.png', 'pmc_image_embeddings.pkl', 'pmc_question_embeddings.pkl', 'train_final.csv']


In [None]:
import os

figures_path = os.path.join("images_2", "figures")
print(" Looking for figures folder at:", figures_path)
print(" Exists?" if os.path.exists(figures_path) else "‚ùå Does not exist")


üîç Looking for figures folder at: images_2/figures
‚úÖ Exists?


In [None]:
# Set path to images folder (corrected for /app working directory)
image_folder = os.path.join("images_2", "figures")


In [None]:
import os
import torch
import pickle
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
import pandas as pd

# Load CSV with matched image names
df = pd.read_csv("train_final.csv")

#  Set path to figures folder (inside /app/images_2/figures)
image_folder = os.path.join("images_2", "figures")

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Storage
image_embeddings = []
valid_image_names = []
missing_images = []

# Process each image
for img_name in tqdm(df["Image_Name"].dropna().unique(), desc="üì∏ Encoding images"):
    img_path = os.path.join(image_folder, img_name)

    if not os.path.isfile(img_path):
        missing_images.append(img_name)
        print(f" Skipping missing image: {img_name}")
        continue

    try:
        image = Image.open(img_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            emb = model.get_image_features(**inputs).squeeze().cpu().numpy()
        image_embeddings.append(emb)
        valid_image_names.append(img_name)
    except Exception as e:
        print(f" Error processing {img_name}: {e}")

# Save to pickle
output = {
    "image_names": valid_image_names,
    "embeddings": image_embeddings
}

with open("pmc_image_embeddings.pkl", "wb") as f:
    pickle.dump(output, f)

print(f"\n Saved {len(image_embeddings)} image embeddings to pmc_image_embeddings.pkl")
print(f" Skipped {len(missing_images)} missing images.")


üì∏ Encoding images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 19496/19496 [03:35<00:00, 90.31it/s]



‚úÖ Saved 19496 image embeddings to pmc_image_embeddings.pkl
üö´ Skipped 0 missing images.


## Code for PubMedQA Question Embeddings

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import pickle

# Step 1: Load PubMedQA dataset
pubmed_dataset = load_dataset("pubmed_qa", "pqa_labeled")['train']

# Step 2: Extract only the questions
pubmed_questions = [item['question'] for item in pubmed_dataset]

# Step 3: Load Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # same model for consistency

# Step 4: Create embeddings
pubmed_question_embeddings = model.encode(pubmed_questions, batch_size=32, show_progress_bar=True)

# Step 5: Save
with open('/Users/sheetalpatnaik/Desktop/GENAI/pubmed_question_embeddings.pkl', 'wb') as f:
    pickle.dump({
        "questions": pubmed_questions,
        "embeddings": pubmed_question_embeddings
    }, f)

print(" PubMedQA question embeddings created and saved.")

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

‚úÖ PubMedQA question embeddings created and saved.


**RAG with question embeddings**

In [None]:
import pickle
import faiss
import numpy as np
import json
from datasets import load_dataset

# Step 1: Load PMC-VQA question embeddings
with open('/Users/sheetalpatnaik/Desktop/GENAI/pmc_question_embeddings.pkl', 'rb') as f:
    pmc_data = pickle.load(f)

pmc_questions = pmc_data['questions']
pmc_embeddings = pmc_data['embeddings']

# Step 2: Load PubMed question embeddings
with open('/Users/sheetalpatnaik/Desktop/GENAI/pubmed_question_embeddings.pkl', 'rb') as f:
    pubmed_data = pickle.load(f)

pubmed_questions = pubmed_data['questions']
pubmed_embeddings = pubmed_data['embeddings']

# Step 3: Load full PubMedQA dataset (for context, long answer)
pubmed_dataset = load_dataset("pubmed_qa", "pqa_labeled")['train']



# Step 5: Normalize embeddings (for cosine similarity)
pubmed_embeddings = pubmed_embeddings / np.linalg.norm(pubmed_embeddings, axis=1, keepdims=True)
pmc_embeddings = pmc_embeddings / np.linalg.norm(pmc_embeddings, axis=1, keepdims=True)

# Step 6: Build FAISS index
embedding_dim = pubmed_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(pubmed_embeddings)

print(f" FAISS index built with {index.ntotal} vectors.")

# Step 7: Search and print results
results = []

for i in range(20):  # First 20 queries
    pmc_entry = prompt_data[i]  # a dict: question, options A-D, correct option, answer
    pmc_query_embedding = pmc_embeddings[i].reshape(1, -1)

    # Search top-1 match
    distances, indices = index.search(pmc_query_embedding, k=1)
    best_idx = indices[0][0]

    # Fetch matched PubMed record
    pubmed_item = pubmed_dataset[int(best_idx)]

    # Prepare result
    choices = pmc_entry.get('choices', {})

    result = {
        'Image Name': pmc_entry['image'],
        'PMC Question': pmc_entry['question'],
        'Option A': choices.get('A', ''),
        'Option B': choices.get('B', ''),
        'Option C': choices.get('C', ''),
        'Option D': choices.get('D', ''),
        'Correct Option': pmc_entry.get('answer', ''),
        'Final Answer' : pmc_entry.get('answer', ''),
        'PubMed Retrieved Question': pubmed_item['question'],
        'PubMed Context': " ".join(pubmed_item['context']['contexts']),
        'PubMed Long Answer': pubmed_item['long_answer']
    }

    results.append(result)

# Step 8: Print results
for idx, item in enumerate(results):
    print(f"\n--- Result {idx+1} ---")
    print(f": {item['Image Name']}")
    print(f"PMC Question: {item['PMC Question']}")
    print(f": {item['Option A']}")
    print(f": {item['Option B']}")
    print(f": {item['Option C']}")
    print(f": {item['Option D']}")
    print(f"Correct Option: {item['Correct Option']}")
    print("\nRetrieved PubMed Info:")
    print(f"PubMed Question: {item['PubMed Retrieved Question']}")
    print(f"PubMed Context: {item['PubMed Context']}")
    print(f"PubMed Long Answer: {item['PubMed Long Answer']}")

‚úÖ FAISS index built with 1000 vectors.

--- Result 1 ---
: PMC8519188_FIG5_85295.jpg
PMC Question: what does the image depict about the patients tumor
:  A:The tumor has grown larger 
:  B:The tumor has shrunk 
:  C:The tumor has not changed 
:  D:The image doesn't show tumor regression 
Correct Option: B

Retrieved PubMed Info:
PubMed Question: Should tumor depth be included in prognostication of soft tissue sarcoma?
PubMed Context: Most staging systems for soft tissue sarcoma are based on histologic malignancy-grade, tumor size and tumor depth. These factors are generally dichotomized, size at 5 cm. We believe it is unlikely that tumor depth per se should influence a tumor's metastatic capability. Therefore we hypothesized that the unfavourable prognostic importance of depth could be explained by the close association between size and depth, deep-seated tumors on average being larger than the superficial ones. When tumor size is dichotomized, this effect should be most pronounced i

In [None]:
print(prompt_data)

[{'image': 'PMC8519188_FIG5_85295.jpg', 'question': 'what does the image depict about the patients tumor', 'choices': {'A': ' A:The tumor has grown larger ', 'B': ' B:The tumor has shrunk ', 'C': ' C:The tumor has not changed ', 'D': " D:The image doesn't show tumor regression "}, 'answer': 'B'}, {'image': 'PMC8285465_Fig3_10775.jpg', 'question': 'what imaging technique was used to capture the image', 'choices': {'A': ' A:CT scan ', 'B': ' B:Electroencephalography ', 'C': ' C:X-ray ', 'D': ' D:Magnetic resonance imaging '}, 'answer': 'D'}, {'image': 'PMC8918112_Fig4_221411.jpg', 'question': 'what is located to the right in all the photographs', 'choices': {'A': ' A:The anterior pole ', 'B': ' B:The posterior pole ', 'C': ' C:The vegetal pole ', 'D': ' D:The lateral pole '}, 'answer': 'B'}, {'image': 'PMC8225413_fig2_475661.jpg', 'question': 'what does the excisional biopsy reveal in this image', 'choices': {'A': ' A:Primary tumor ', 'B': ' B: Epidural tumor ', 'C': ' C: Inguinal node m

In [None]:
from openai import OpenAI
import base64
import os

# Initialize GPT-4 client (Vision supported via gpt-4-turbo)


# Image folder path
image_folder_path = "/Users/sheetalpatnaik/Desktop/GENAI/figures"

gpt4v_answers = []

for idx, item in enumerate(results):
    image_filename = item['Image Name']  # Ensure you added this to your results list
    image_path = os.path.join(image_folder_path, image_filename)

    # Load and encode image
    with open(image_path, "rb") as img_file:
        image_data = base64.b64encode(img_file.read()).decode('utf-8')

    # Prepare prompt text
    prompt_text = f"""
You are a helpful medical AI assistant. Based on the following information and the provided image, answer the question correctly.
Refer the context and long answer provided.


Question:
{item['PMC Question']}

Options:
A. {item['Option A']}
B. {item['Option B']}
C. {item['Option C']}
D. {item['Option D']}

Knowledge Base Information:
{item['PubMed Context']}
{item['PubMed Long Answer']}

Please choose the most appropriate option (A, B, C, or D).
"""

    # GPT-4-Turbo Vision Request
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_data}"
                        }
                    }
                ]
            }
        ],
        temperature=0.0,
        max_tokens=300
    )

    # Extract GPT-4-Vision answer
    answer = response.choices[0].message.content.strip()

    gpt4v_answers.append({
        'PMC Question': item['PMC Question'],
        'Options': {
            'A': item['Option A'],
            'B': item['Option B'],
            'C': item['Option C'],
            'D': item['Option D'],
        },
        'Correct Option': item['Correct Option'],
        'GPT-4V Answer': answer,
        'Image Name': image_filename,
        'PubMed Retrieved Question': item['PubMed Retrieved Question'],
        'PubMed Context': item['PubMed Context'],
        'PubMed Long Answer': item['PubMed Long Answer']
    })

    print(f" Processed Query {idx+1}")

# FINAL PRINTING (outside loop)
for idx, item in enumerate(gpt4v_answers):
    print(f"\n=================== Result {idx+1} ===================")
    print(f"Image Name: {item['Image Name']}")
    print(f"PMC Question:\n{item['PMC Question']}\n")

    print("Options:")
    print(f"A: {item['Options']['A']}")
    print(f"B: {item['Options']['B']}")
    print(f"C: {item['Options']['C']}")
    print(f"D: {item['Options']['D']}\n")

    print(f"Correct Answer: {item['Correct Option']}")
    print(f"GPT-4 Predicted Answer: {item['GPT-4V Answer']}\n")

    print("Retrieved PubMed Question:")
    print(item['PubMed Retrieved Question'])

    print("\nRetrieved PubMed Context:")
    print(item['PubMed Context'])

    print("\nRetrieved PubMed Long Answer:")
    print(item['PubMed Long Answer'])
    print("============================================================")

‚úÖ Processed Query 1
‚úÖ Processed Query 2
‚úÖ Processed Query 3
‚úÖ Processed Query 4
‚úÖ Processed Query 5
‚úÖ Processed Query 6
‚úÖ Processed Query 7
‚úÖ Processed Query 8
‚úÖ Processed Query 9
‚úÖ Processed Query 10
‚úÖ Processed Query 11
‚úÖ Processed Query 12
‚úÖ Processed Query 13
‚úÖ Processed Query 14
‚úÖ Processed Query 15
‚úÖ Processed Query 16
‚úÖ Processed Query 17
‚úÖ Processed Query 18
‚úÖ Processed Query 19
‚úÖ Processed Query 20

Image Name: PMC8519188_FIG5_85295.jpg
PMC Question:
what does the image depict about the patients tumor

Options:
A:  A:The tumor has grown larger 
B:  B:The tumor has shrunk 
C:  C:The tumor has not changed 
D:  D:The image doesn't show tumor regression 

Correct Answer: B
GPT-4 Predicted Answer: Based on the provided image, which shows a measurement of 50.2 mm across a lesion in the lung, the correct answer to the question about the tumor's status cannot be determined solely from this single image. The image shows a measurement but does not

In [None]:
import re

correct = 0
total = len(gpt4v_answers)

for item in gpt4v_answers:
    # Extract just the letter (A/B/C/D) from GPT-4's answer using regex
    match = re.search(r"\b([A-D])\b", item['GPT-4V Answer'].upper())
    predicted = match.group(1) if match else None
    actual = item['Correct Option'].strip().upper()

    if predicted == actual:
        correct += 1

accuracy = correct / total * 100

print(f"\n GPT-4 Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")


‚úÖ GPT-4 Accuracy: 50.00% (10/20 correct)


# RAG with question and Image embeddings

### Downloading SLAKE dataset

In [None]:
from datasets import load_dataset

# Load the English-only subset of SLAKE
dataset = load_dataset("mdwiratathya/SLAKE-vqa-english")


README.md:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/31.1M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/8.34M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/9.59M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4919 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1053 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1061 [00:00<?, ? examples/s]

In [None]:
# Check dataset structure
print(dataset)

# Print a sample item
print(dataset['train'][0])


DatasetDict({
    train: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 4919
    })
    validation: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 1053
    })
    test: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 1061
    })
})
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=256x256 at 0x668DAA6F0>, 'question': 'What modality is used to take this image?', 'answer': 'MRI'}


### Generate SLAKE Image Embeddings

In [None]:
from PIL import Image
from tqdm import tqdm
import torch
import pickle
from transformers import CLIPProcessor, CLIPModel

# Load model + processor
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Get all images from SLAKE training set
images = [item['image'] for item in dataset['train']]
image_ids = list(range(len(images)))  # or use custom IDs if needed

# Generate embeddings
image_embeddings = []
for img in tqdm(images, desc="Embedding SLAKE Images"):
    inputs = processor(images=img, return_tensors="pt").to(model.device)
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
        image_embeddings.append(image_features.cpu().numpy().squeeze())

# Stack and normalize
image_embeddings = np.vstack(image_embeddings)
image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True)

# Save
with open("slake_image_embeddings.pkl", "wb") as f:
    pickle.dump({
        "image_ids": image_ids,
        "embeddings": image_embeddings
    }, f)

print("‚úÖ SLAKE image embeddings generated and saved.")


NameError: name 'dataset' is not defined

**Load all necessary pickle files of embeddings**

In [None]:
!pip install faiss-cpu


[0m

In [None]:
!pip install openai

[0m

In [None]:
!pip install datasets

[0m

In [None]:
# Step 1: Clean uninstall
!pip uninstall -y pydantic openai typing_extensions

# Step 2: Install exact compatible versions
!pip install pydantic==2.5.3 openai==1.3.9 typing_extensions==4.7.1


Found existing installation: pydantic 2.5.3
Uninstalling pydantic-2.5.3:
  Successfully uninstalled pydantic-2.5.3
Found existing installation: openai 1.3.9
Uninstalling openai-1.3.9:
  Successfully uninstalled openai-1.3.9
Found existing installation: typing_extensions 4.7.1
Uninstalling typing_extensions-4.7.1:
  Successfully uninstalled typing_extensions-4.7.1
[0mCollecting pydantic==2.5.3
  Using cached pydantic-2.5.3-py3-none-any.whl.metadata (65 kB)
Collecting openai==1.3.9
  Using cached openai-1.3.9-py3-none-any.whl.metadata (17 kB)
Collecting typing_extensions==4.7.1
  Using cached typing_extensions-4.7.1-py3-none-any.whl.metadata (3.1 kB)
Using cached pydantic-2.5.3-py3-none-any.whl (381 kB)
Using cached openai-1.3.9-py3-none-any.whl (221 kB)
Using cached typing_extensions-4.7.1-py3-none-any.whl (33 kB)
Installing collected packages: typing_extensions, pydantic, openai
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are inst

In [None]:
from openai import OpenAI

print("‚úÖ OpenAI import successful!")


ImportError: cannot import name 'OpenAI' from 'openai' (/opt/conda/lib/python3.10/site-packages/openai/__init__.py)

In [None]:
import json
import torch
import pickle
import numpy as np
from PIL import Image
from tqdm import tqdm
import faiss
import os

from datasets import load_dataset
import pandas as pd

# Load question-image CSV
df = pd.read_csv("train_final.csv").dropna(subset=["Image_Name"]).reset_index(drop=True)

# Load PMC question embeddings
with open("pmc_question_embeddings.pkl", "rb") as f:
    pmc_q_data = pickle.load(f)
pmc_questions = pmc_q_data["questions"]
pmc_q_embeddings = np.array(pmc_q_data["embeddings"])
pmc_q_embeddings /= np.linalg.norm(pmc_q_embeddings, axis=1, keepdims=True)

# Load PMC image embeddings
with open("pmc_image_embeddings.pkl", "rb") as f:
    pmc_img_data = pickle.load(f)
pmc_img_embeddings = np.array(pmc_img_data["embeddings"])
pmc_img_embeddings /= np.linalg.norm(pmc_img_embeddings, axis=1, keepdims=True)

# Load PubMedQA
pubmed_dataset = load_dataset("pubmed_qa", "pqa_labeled")["train"]
with open("pubmed_question_embeddings.pkl", "rb") as f:
    pubmed_data = pickle.load(f)
pubmed_q_embeddings = np.array(pubmed_data["embeddings"])
pubmed_q_embeddings /= np.linalg.norm(pubmed_q_embeddings, axis=1, keepdims=True)
pubmed_index = faiss.IndexFlatL2(pubmed_q_embeddings.shape[1])
pubmed_index.add(pubmed_q_embeddings)

# Load SLAKE
slake_dataset = load_dataset("mdwiratathya/SLAKE-vqa-english")["train"]
with open("slake_image_embeddings.pkl", "rb") as f:
    slake_data = pickle.load(f)
slake_embeddings = np.array(slake_data["embeddings"])
slake_embeddings /= np.linalg.norm(slake_embeddings, axis=1, keepdims=True)
slake_index = faiss.IndexFlatL2(slake_embeddings.shape[1])
slake_index.add(slake_embeddings)




In [None]:
!pip uninstall openai -y



Found existing installation: openai 0.27.2
Uninstalling openai-0.27.2:
  Successfully uninstalled openai-0.27.2
[0m

In [None]:
!pip install openai==0.27.2



Collecting openai==0.27.2
  Using cached openai-0.27.2-py3-none-any.whl.metadata (13 kB)
Using cached openai-0.27.2-py3-none-any.whl (70 kB)
Installing collected packages: openai
Successfully installed openai-0.27.2
[0m

In [None]:
import openai
import os
from openai import OpenAI
import base64
from tqdm import tqdm




final_outputs = []
correct = 0
total = 20

for i in tqdm(range(total), desc="üîÑ Running RAG on 20 samples"):
    q = df.iloc[i]["Question"]
    gt = df.iloc[i]["Answer"].strip().upper()
    img_name = df.iloc[i]["Image_Name"]
    options = {
        "A": df.iloc[i]["Choice A"],
        "B": df.iloc[i]["Choice B"],
        "C": df.iloc[i]["Choice C"],
        "D": df.iloc[i]["Choice D"]
    }

    # Embeddings
    q_emb = pmc_q_embeddings[i].reshape(1, -1)
    img_emb = pmc_img_embeddings[i].reshape(1, -1)

    # Retrieve context from PubMed
    _, text_idx = pubmed_index.search(q_emb, 1)
    pubmed_item = pubmed_dataset[int(text_idx[0][0])]

    # Retrieve image match from SLAKE
    _, vis_idx = slake_index.search(img_emb, 1)
    slake_item = slake_dataset[int(vis_idx[0][0])]

    # Prompt for GPT-4V
    prompt = f"""
You are a medical AI assistant. Use the following data to answer the question accurately.

üß† Question:
{q}

üî¢ Options:
A. {options['A']}
B. {options['B']}
C. {options['C']}
D. {options['D']}

üìñ PubMed Knowledge:
- Retrieved Q: {pubmed_item['question']}
- Context: {" ".join(pubmed_item['context']['contexts'])}
- Long Answer: {pubmed_item['long_answer']}

üñº SLAKE Visual Info:
- Retrieved Q: {slake_item['question']}
- Answer: {slake_item['answer']}

üí° Use the image and text above to decide the best answer (A, B, C, or D). Reply with one of:
Answer: A / B / C / D
"""

    # Encode image as base64
    img_path = os.path.join(image_folder, img_name)
    with open(img_path, "rb") as f:
        encoded_image = base64.b64encode(f.read()).decode("utf-8")

    # Make GPT-4V API call
    response =  openai.ChatCompletion.create(
        model="gpt-4-vision-preview",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}
            ]
        }],
        temperature=0,
        max_tokens=400
    )

    # Extract prediction
    pred_text = response["choices"][0]["message"]["content"].strip()
    pred = pred_text.split("Answer:")[-1].strip().upper() if "Answer:" in pred_text else "Z"

    final_outputs.append({
        "Question": q,
        "Image Name": img_name,
        "Correct Answer": gt,
        "Options": options,
        "GPT-4V Answer": pred,
        "PubMedQA Context": pubmed_item["context"],
        "SLAKE Answer": slake_item["answer"]
    })

    print(f"\n--- Sample {i + 1} ---")
    print(f"üñº Image: {img_name}")
    print(f"‚ùì Question: {q}")
    print(f"‚úÖ GT: {gt}")
    print(f"ü§ñ GPT-4V Answer: {pred}")
    print("‚úîÔ∏è Correct" if pred == gt else "‚ùå Incorrect")
    print("--------------------------------------------------")

    if pred == gt:
        correct += 1

# Accuracy Report
accuracy = correct / total
print(f"\nüéØ Final Accuracy over {total} samples: {accuracy:.2%}")


ImportError: cannot import name 'OpenAI' from 'openai' (/opt/conda/lib/python3.10/site-packages/openai/__init__.py)

# Llava

In [None]:
# Install required libraries
!pip install torch torchvision transformers peft accelerate bitsandbytes datasets evaluate
!pip install sentencepiece
!pip install -U huggingface_hub

[0m

# 1. Data Preparation

In [None]:
train_df = pd.read_csv('train_final.csv')
test_df = pd.read_csv('./GenAI_Project/test_2.csv')

In [None]:
!pip install pandas

[0m

In [None]:
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# Make sure you have the correct path to your image directory
image_dir = './images_2/figures'  # Replace with your actual path

# Examine the data structure
print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print(f"Column names: {train_df.columns.tolist()}")

# Check that image files exist
sample_image_path = os.path.join(image_dir, train_df.iloc[0]['Image_Name'])
print(f"Sample image path: {sample_image_path}")
print(f"Image exists: {os.path.exists(sample_image_path)}")

Train set size: 31329
Test set size: 33430
Column names: ['Figure_path', 'Question', 'Answer', 'Choice A', 'Choice B', 'Choice C', 'Choice D', 'Answer_label', 'Image_Name']
Sample image path: ./images_2/figures/PMC1064097_F4_1520.jpg
Image exists: True


## 2. Load the LLaVA Model and Processor

In [None]:
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import torch

# Define model ID
model_id = "llava-hf/llava-1.5-7b-hf"

# First, load the processor
processor = LlavaProcessor.from_pretrained(model_id)

# Check what image token the model uses
special_tokens = processor.tokenizer.special_tokens_map
print(f"Special tokens: {special_tokens}")

# Load model with 8-bit quantization for memory efficiency
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    load_in_8bit=True,
    device_map="auto"
)

# Check if model has an image token index
if hasattr(model.config, "image_token_index"):
    print(f"Model image token index: {model.config.image_token_index}")
    # Convert token ID to actual token string
    token = processor.tokenizer.convert_ids_to_tokens(model.config.image_token_index)
    print(f"Image token: {token}")
else:
    print("Model does not have a defined image_token_index")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'image_token': '<image>'}


Fetching 3 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:55<00:00, 18.35s/it]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:04<00:00,  1.56s/it]

Model image token index: 32000
Image token: <image>





In [None]:
!pip install accelerate


[0m

## 3. Create Custom Dataset Class

In [None]:
class MedVQADataset(Dataset):
    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor
        # Use the image token we confirmed from the model
        self.image_token = "<image>"

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        # Load image
        image_path = os.path.join(self.image_dir, row['Image_Name'])
        image = Image.open(image_path).convert('RGB')

        # Format the prompt with multiple choice options
        question = row['Question']
        choices = f"A: {row['Choice A'].split(':', 1)[1] if ':' in row['Choice A'] else row['Choice A']}\n" \
                 f"B: {row['Choice B'].split(':', 1)[1] if ':' in row['Choice B'] else row['Choice B']}\n" \
                 f"C: {row['Choice C'].split(':', 1)[1] if ':' in row['Choice C'] else row['Choice C']}\n" \
                 f"D: {row['Choice D'].split(':', 1)[1] if ':' in row['Choice D'] else row['Choice D']}"

        # Use the image token at the beginning of the prompt
        prompt = f"{self.image_token}\nQuestion: {question}\n\nOptions:\n{choices}\n\nPlease select the correct answer (A, B, C, or D):"

        # Process inputs using the processor
        inputs = self.processor(
            images=image,
            text=prompt,
            return_tensors="pt"
        )

        # Prepare target (answer label)
        target = row['Answer_label']

        # Add target to inputs
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = self.processor.tokenizer(f" {target}", return_tensors="pt").input_ids[:, 1:].squeeze(0)

        return inputs

# Create a small test dataset and fetch a sample
test_dataset = MedVQADataset(train_df.head(5), image_dir, processor)
print(f"Dataset size: {len(test_dataset)}")

# Test with one sample
sample = test_dataset[0]
print(f"Sample keys: {sample.keys()}")

# Check if the image token is correctly included
input_ids = sample['input_ids']
tokens = processor.tokenizer.convert_ids_to_tokens(input_ids)

# Find the image token in the input
image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
print(f"Image token ID: {image_token_id}")

# Count how many image tokens are in the input
image_token_count = (input_ids == image_token_id).sum().item()
print(f"Number of image tokens in input: {image_token_count}")

# Check pixel values shape
if 'pixel_values' in sample:
    print(f"Pixel values shape: {sample['pixel_values'].shape}")

Dataset size: 5
Sample keys: dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'labels'])
Image token ID: 32000
Number of image tokens in input: 576
Pixel values shape: torch.Size([3, 336, 336])


## 4. Set Up LoRA for Fine-tuning

In [None]:
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training

# Prepare model for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)

# Define LoRA configuration
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=8,  # Rank
    lora_alpha=32,
    lora_dropout=0.1,
    # Target the language model's attention modules
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
)

# Apply LoRA adapters to the model
model = get_peft_model(model, peft_config)

# Print summary of trainable parameters
print("Trainable parameters summary:")
model.print_trainable_parameters()

Trainable parameters summary:
trainable params: 9,568,256 || all params: 7,072,995,328 || trainable%: 0.1353


## 5. Create DataLoader and Training Configuration

In [None]:
from dataclasses import dataclass
from typing import Dict, List, Any
import torch

@dataclass
class ImprovedLlavaDataCollator:
    def __call__(self, batch):
        # Initialize the batch dictionary
        collated_batch = {}

        # Handle pixel_values (image features)
        if "pixel_values" in batch[0]:
            collated_batch["pixel_values"] = torch.stack([item["pixel_values"] for item in batch])

        # Find the maximum lengths
        max_input_len = max(len(item["input_ids"]) for item in batch)

        # Prepare input_ids and attention_mask
        input_ids_list = []
        attention_mask_list = []

        for item in batch:
            # Pad input_ids
            input_ids = item["input_ids"]
            padding_len = max_input_len - len(input_ids)

            padded_input_ids = torch.cat([
                input_ids,
                torch.full((padding_len,), processor.tokenizer.pad_token_id, dtype=input_ids.dtype)
            ])
            input_ids_list.append(padded_input_ids)

            # Pad attention_mask
            attention_mask = item["attention_mask"]
            padded_attention_mask = torch.cat([
                attention_mask,
                torch.zeros(padding_len, dtype=attention_mask.dtype)
            ])
            attention_mask_list.append(padded_attention_mask)

        collated_batch["input_ids"] = torch.stack(input_ids_list)
        collated_batch["attention_mask"] = torch.stack(attention_mask_list)

        # Handle labels
        if "labels" in batch[0]:
            labels_list = []

            # Get max label length
            max_label_len = max(len(item["labels"]) for item in batch)

            for item in batch:
                labels = item["labels"]
                padding_len = max_label_len - len(labels)

                # Pad with -100 (ignored in loss calculation)
                padded_labels = torch.cat([
                    labels,
                    torch.full((padding_len,), -100, dtype=labels.dtype)
                ])
                labels_list.append(padded_labels)

            collated_batch["labels"] = torch.stack(labels_list)

        return collated_batch

# Create the improved data collator
data_collator = ImprovedLlavaDataCollator()

# Reduce batch size to help with memory issues
batch_size = 2

# Update training arguments
training_args = TrainingArguments(
    output_dir="./results/llava-med-lora",
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=200,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="./logs",
    logging_steps=20,
    save_strategy="steps",
    save_steps=200,
    eval_strategy="steps",
    eval_steps=200,
    load_best_model_at_end=True,
    report_to="none",
    gradient_accumulation_steps=8  # Increased to help with memory
)

# Initialize the trainer with the new collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=test_df,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Updated trainer initialized with batch size:", batch_size)
print("Starting training with improved collator...")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Updated trainer initialized with batch size: 2
Starting training with improved collator...


## 6. Set Up the Trainer and Compute Metrics

In [None]:
from transformers import Trainer
import numpy as np

# Define a compute_metrics function for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Create a mask where labels are not -100 (padding)
    mask = labels != -100

    # Get predictions for the valid positions only
    predictions = np.argmax(logits, axis=-1)

    # Count correct predictions (where prediction matches label)
    correct = 0
    total = 0

    for i in range(len(labels)):
        for j in range(len(labels[i])):
            if labels[i][j] != -100:  # Skip padding tokens
                total += 1
                if predictions[i][j] == labels[i][j]:
                    correct += 1

    # Calculate accuracy
    accuracy = correct / total if total > 0 else 0

    return {"accuracy": accuracy}

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=test_df,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Trainer initialized successfully!")
print(f"Training dataset size: {len(train_df)}")
print(f"Testing dataset size: {len(test_df)}")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainer initialized successfully!
Training dataset size: 31329
Testing dataset size: 33430


## 7. Train the Model

In [None]:
 # import torch
# from torch.utils.data import DataLoader
# from tqdm.auto import tqdm
# from torch.optim import AdamW  # Use AdamW from torch.optim instead

# # Create a subset of the data for testing our approach
# train_subset = train_dataset
# if len(train_dataset) > 1000:
#     # Use a smaller subset for initial testing
#     train_subset = torch.utils.data.Subset(train_dataset, list(range(1000)))
#     print(f"Using a subset of {len(train_subset)} examples for testing")
# else:
#     print(f"Using all {len(train_subset)} examples")

# # Create a simpler data collator that just returns the batch
# def simple_collator(batch):
#     # Extract pixel values
#     pixel_values = torch.stack([item["pixel_values"] for item in batch])

#     # Process input_ids and attention_mask with padding
#     max_len = max(len(item["input_ids"]) for item in batch)

#     # Prepare padded tensors
#     input_ids = []
#     attention_mask = []

#     for item in batch:
#         # Pad input_ids
#         ids = item["input_ids"]
#         padding = torch.full((max_len - len(ids),), processor.tokenizer.pad_token_id, dtype=ids.dtype)
#         input_ids.append(torch.cat([ids, padding]))

#         # Pad attention_mask
#         mask = item["attention_mask"]
#         padding = torch.zeros(max_len - len(mask), dtype=mask.dtype)
#         attention_mask.append(torch.cat([mask, padding]))

#     # Stack tensors
#     input_ids = torch.stack(input_ids)
#     attention_mask = torch.stack(attention_mask)

#     # Simple target: just use the first letter of the answer
#     targets = []
#     for item in batch:
#         # Get the label
#         label_str = processor.tokenizer.decode(item["labels"])
#         # Extract first character (should be A, B, C, or D)
#         if len(label_str) > 0 and label_str[0] in "ABCD":
#             # Map A, B, C, D to 0, 1, 2, 3
#             target = ord(label_str[0]) - ord('A')
#         else:
#             # Default to A if label is not recognized
#             target = 0
#         targets.append(target)

#     targets = torch.tensor(targets, dtype=torch.long)

#     return {
#         "pixel_values": pixel_values,
#         "input_ids": input_ids,
#         "attention_mask": attention_mask,
#         "targets": targets
#     }

# # Create dataloader with simple collator
# batch_size = 4
# dataloader = DataLoader(
#     train_subset,
#     batch_size=batch_size,
#     shuffle=True,
#     collate_fn=simple_collator
# )

# print(f"Created dataloader with batch size {batch_size}")

# # Set up optimizer
# optimizer = AdamW(model.parameters(), lr=5e-5)

# # Setup device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Training loop
# num_epochs = 1
# print(f"Starting custom training loop for {num_epochs} epochs")

# model.train()
# for epoch in range(num_epochs):
#     total_loss = 0
#     progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")

#     for batch in progress_bar:
#         # Move batch to device
#         batch = {k: v.to(device) for k, v in batch.items()}

#         # Extract targets
#         targets = batch.pop("targets")

#         # Zero gradients
#         optimizer.zero_grad()

#         # Forward pass - pass the main inputs
#         outputs = model(
#             input_ids=batch["input_ids"],
#             attention_mask=batch["attention_mask"],
#             pixel_values=batch["pixel_values"],
#         )

#         # Extract logits for the relevant positions (last token of input)
#         logits = outputs.logits

#         # Compute loss
#         # Simple classification loss: take the last token of each sequence
#         last_token_logits = logits[:, -1, :]
#         # Restrict to the first few tokens that represent A, B, C, D
#         relevant_logits = last_token_logits[:, :4]
#         loss = torch.nn.functional.cross_entropy(relevant_logits, targets)

#         # Backward pass
#         loss.backward()

#         # Update weights
#         optimizer.step()

#         # Update progress bar
#         total_loss += loss.item()
#         progress_bar.set_postfix({"loss": loss.item()})

#     avg_loss = total_loss / len(dataloader)
#     print(f"Epoch {epoch+1} - Average loss: {avg_loss:.4f}")

# # Save the fine-tuned model
# output_dir = "./final_model/llava-med-lora-custom"
# print(f"Saving model to {output_dir}")
# model.save_pretrained(output_dir)
# processor.save_pretrained(output_dir)
# print("Model saved successfully!")

Using a subset of 1000 examples for testing
Created dataloader with batch size 4
Starting custom training loop for 1 epochs


Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 250/250 [07:03<00:00,  1.70s/it, loss=7.33e-5]


Epoch 1 - Average loss: 0.9325
Saving model to ./final_model/llava-med-lora-custom
Model saved successfully!


In [None]:
# # Define a function to run inference on a single example
# def predict_single_example(image_path, question, choices, true_answer=None):
#     """
#     Run model prediction on a single example

#     Args:
#         image_path: Path to the image file
#         question: Question text
#         choices: Dictionary of choices (A, B, C, D)
#         true_answer: The correct answer (optional)

#     Returns:
#         predicted_answer: The model's prediction
#     """
#     # Load and process the image
#     image = Image.open(image_path).convert('RGB')

#     # Format choices
#     choices_text = f"A: {choices['A']}\nB: {choices['B']}\nC: {choices['C']}\nD: {choices['D']}"

#     # Create prompt with image token
#     prompt = f"<image>\nQuestion: {question}\n\nOptions:\n{choices_text}\n\nPlease select the correct answer (A, B, C, or D):"

#     # Process inputs
#     inputs = processor(images=image, text=prompt, return_tensors="pt")
#     inputs = {k: v.to(device) for k, v in inputs.items()}

#     # Generate prediction
#     with torch.no_grad():
#         outputs = model(
#             input_ids=inputs["input_ids"],
#             attention_mask=inputs["attention_mask"],
#             pixel_values=inputs["pixel_values"]
#         )

#     # Get prediction
#     logits = outputs.logits
#     last_token_logits = logits[:, -1, :]
#     relevant_logits = last_token_logits[:, :4]  # A, B, C, D
#     prediction_idx = torch.argmax(relevant_logits, dim=1).item()

#     # Convert to letter
#     predicted_answer = chr(ord('A') + prediction_idx)

#     # Check if correct
#     is_correct = (predicted_answer == true_answer) if true_answer else None

#     return {
#         "predicted_answer": predicted_answer,
#         "is_correct": is_correct
#     }

# # Let's evaluate on a few samples from your test set
# import random

# # Assuming train_df has the right structure, let's use it for evaluation
# if 'Image_Name' in train_df.columns:
#     # Select a sample of examples for evaluation
#     num_samples = 20
#     sample_indices = random.sample(range(len(train_df)), min(num_samples, len(train_df)))
#     samples = [train_df.iloc[i] for i in sample_indices]

#     # Track results
#     correct = 0
#     total = 0

#     print(f"Evaluating model on {len(samples)} examples...")

#     for i, row in enumerate(samples):
#         # Get image path
#         image_path = os.path.join(image_dir, row['Image_Name'])

#         # Get question and choices
#         question = row['Question']
#         choices = {
#             'A': row['Choice A'],
#             'B': row['Choice B'],
#             'C': row['Choice C'],
#             'D': row['Choice D']
#         }

#         # True answer
#         true_answer = row['Answer_label']

#         # Make prediction
#         result = predict_single_example(image_path, question, choices, true_answer)

#         # Update counters
#         if result['is_correct']:
#             correct += 1
#         total += 1

#         # Print progress
#         print(f"Example {i+1}/{len(samples)}: Predicted {result['predicted_answer']}, Actual {true_answer}, Correct: {result['is_correct']}")

#     # Calculate accuracy
#     accuracy = correct / total if total > 0 else 0
#     print(f"\nEvaluation Accuracy: {accuracy:.4f} ({correct}/{total})")
# else:
#     print("Cannot find 'Image_Name' column in the dataframe. Please check your data structure.")

Evaluating model on 20 examples...
Example 1/20: Predicted A, Actual C, Correct: False
Example 2/20: Predicted A, Actual D, Correct: False
Example 3/20: Predicted A, Actual A, Correct: True
Example 4/20: Predicted A, Actual A, Correct: True
Example 5/20: Predicted A, Actual A, Correct: True
Example 6/20: Predicted A, Actual B, Correct: False
Example 7/20: Predicted A, Actual D, Correct: False
Example 8/20: Predicted A, Actual B, Correct: False
Example 9/20: Predicted A, Actual A, Correct: True
Example 10/20: Predicted A, Actual A, Correct: True
Example 11/20: Predicted A, Actual C, Correct: False
Example 12/20: Predicted A, Actual D, Correct: False
Example 13/20: Predicted A, Actual A, Correct: True
Example 14/20: Predicted A, Actual D, Correct: False
Example 15/20: Predicted A, Actual B, Correct: False
Example 16/20: Predicted A, Actual D, Correct: False
Example 17/20: Predicted A, Actual C, Correct: False
Example 18/20: Predicted A, Actual A, Correct: True
Example 19/20: Predicted A,

# Hyperparameter Tuning

In [None]:
# import torch
# from torch.utils.data import DataLoader
# from tqdm.auto import tqdm
# from torch.optim import AdamW
# import os
# from torch.optim.lr_scheduler import ReduceLROnPlateau
# import random

# # Create a subset of the data for training
# train_subset = train_dataset
# if len(train_dataset) > 5000:  # Increased from 1000 to 5000 for better training
#     # Use a larger subset for training
#     train_subset = torch.utils.data.Subset(train_dataset, list(range(5000)))
#     print(f"Using a subset of {len(train_subset)} examples for training")
# else:
#     print(f"Using all {len(train_subset)} examples for training")

# # Create a simpler data collator that just returns the batch
# def simple_collator(batch):
#     # Extract pixel values
#     pixel_values = torch.stack([item["pixel_values"] for item in batch])

#     # Process input_ids and attention_mask with padding
#     max_len = max(len(item["input_ids"]) for item in batch)

#     # Prepare padded tensors
#     input_ids = []
#     attention_mask = []

#     for item in batch:
#         # Pad input_ids
#         ids = item["input_ids"]
#         padding = torch.full((max_len - len(ids),), processor.tokenizer.pad_token_id, dtype=ids.dtype)
#         input_ids.append(torch.cat([ids, padding]))

#         # Pad attention_mask
#         mask = item["attention_mask"]
#         padding = torch.zeros(max_len - len(mask), dtype=mask.dtype)
#         attention_mask.append(torch.cat([mask, padding]))

#     # Stack tensors
#     input_ids = torch.stack(input_ids)
#     attention_mask = torch.stack(attention_mask)

#     # Simple target: just use the first letter of the answer
#     targets = []
#     for item in batch:
#         # Get the label
#         label_str = processor.tokenizer.decode(item["labels"])
#         # Extract first character (should be A, B, C, or D)
#         if len(label_str) > 0 and label_str[0] in "ABCD":
#             # Map A, B, C, D to 0, 1, 2, 3
#             target = ord(label_str[0]) - ord('A')
#         else:
#             # Default to A if label is not recognized
#             target = 0
#         targets.append(target)

#     targets = torch.tensor(targets, dtype=torch.long)

#     return {
#         "pixel_values": pixel_values,
#         "input_ids": input_ids,
#         "attention_mask": attention_mask,
#         "targets": targets
#     }

# # Define a function to run inference on a single example
# def predict_single_example(model, processor, device, image_path, question, choices, true_answer=None):
#     """
#     Run model prediction on a single example
#     """
#     # Load and process the image
#     image = Image.open(image_path).convert('RGB')

#     # Format choices
#     choices_text = f"A: {choices['A']}\nB: {choices['B']}\nC: {choices['C']}\nD: {choices['D']}"

#     # Create prompt with image token
#     prompt = f"<image>\nQuestion: {question}\n\nOptions:\n{choices_text}\n\nPlease select the correct answer (A, B, C, or D):"

#     # Process inputs
#     inputs = processor(images=image, text=prompt, return_tensors="pt")
#     inputs = {k: v.to(device) for k, v in inputs.items()}

#     # Generate prediction
#     with torch.no_grad():
#         outputs = model(
#             input_ids=inputs["input_ids"],
#             attention_mask=inputs["attention_mask"],
#             pixel_values=inputs["pixel_values"]
#         )

#     # Get prediction
#     logits = outputs.logits
#     last_token_logits = logits[:, -1, :]
#     relevant_logits = last_token_logits[:, :4]  # A, B, C, D
#     prediction_idx = torch.argmax(relevant_logits, dim=1).item()

#     # Convert to letter
#     predicted_answer = chr(ord('A') + prediction_idx)

#     # Check if correct
#     is_correct = (predicted_answer == true_answer) if true_answer else None

#     return {
#         "predicted_answer": predicted_answer,
#         "is_correct": is_correct
#     }

# # Function to evaluate the model
# def evaluate_model(model, processor, device, train_df, image_dir, num_samples=50):
#     """
#     Evaluate the model on a sample of examples
#     """
#     model.eval()

#     # Select a sample of examples for evaluation
#     sample_indices = random.sample(range(len(train_df)), min(num_samples, len(train_df)))
#     samples = [train_df.iloc[i] for i in sample_indices]

#     # Track results
#     correct = 0
#     total = 0

#     print(f"Evaluating model on {len(samples)} examples...")

#     for i, row in enumerate(samples):
#         # Get image path
#         image_path = os.path.join(image_dir, row['Image_Name'])

#         # Get question and choices
#         question = row['Question']
#         choices = {
#             'A': row['Choice A'],
#             'B': row['Choice B'],
#             'C': row['Choice C'],
#             'D': row['Choice D']
#         }

#         # True answer
#         true_answer = row['Answer_label']

#         # Make prediction
#         result = predict_single_example(model, processor, device, image_path, question, choices, true_answer)

#         # Update counters
#         if result['is_correct']:
#             correct += 1
#         total += 1

#     # Calculate accuracy
#     accuracy = correct / total if total > 0 else 0
#     print(f"Evaluation Accuracy: {accuracy:.4f} ({correct}/{total})")
#     return accuracy

# # Create dataloader with simple collator
# batch_size = 4
# dataloader = DataLoader(
#     train_subset,
#     batch_size=batch_size,
#     shuffle=True,
#     collate_fn=simple_collator
# )

# print(f"Created dataloader with batch size {batch_size}")

# # Set up optimizer with a lower learning rate
# optimizer = AdamW(model.parameters(), lr=1e-5)

# # Setup device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Training loop with more epochs
# num_epochs = 5
# print(f"Starting custom training loop for {num_epochs} epochs")

# # Add learning rate scheduler
# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1, verbose=True)

# # Create directory for checkpoints
# os.makedirs("./checkpoints", exist_ok=True)

# # Track best model
# best_accuracy = 0.0
# best_epoch = 0

# model.train()
# for epoch in range(num_epochs):
#     total_loss = 0
#     progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

#     for batch in progress_bar:
#         # Move batch to device
#         batch = {k: v.to(device) for k, v in batch.items()}

#         # Extract targets
#         targets = batch.pop("targets")

#         # Zero gradients
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = model(
#             input_ids=batch["input_ids"],
#             attention_mask=batch["attention_mask"],
#             pixel_values=batch["pixel_values"],
#         )

#         # Compute loss
#         logits = outputs.logits
#         last_token_logits = logits[:, -1, :]
#         relevant_logits = last_token_logits[:, :4]
#         loss = torch.nn.functional.cross_entropy(relevant_logits, targets)

#         # Backward pass
#         loss.backward()

#         # Update weights
#         optimizer.step()

#         # Update progress bar
#         total_loss += loss.item()
#         progress_bar.set_postfix({"loss": loss.item()})

#     avg_loss = total_loss / len(dataloader)
#     print(f"Epoch {epoch+1}/{num_epochs} - Average loss: {avg_loss:.4f}")

#     # Update learning rate based on loss
#     scheduler.step(avg_loss)

#     # Evaluate after each epoch
#     print(f"Evaluating after epoch {epoch+1}...")
#     accuracy = evaluate_model(model, processor, device, train_df, image_dir, num_samples=50)

#     # Save checkpoint if best model
#     if accuracy > best_accuracy:
#         best_accuracy = accuracy
#         best_epoch = epoch + 1

#         # Save the best model
#         checkpoint_dir = f"./checkpoints/epoch_{epoch+1}_acc_{accuracy:.4f}"
#         os.makedirs(checkpoint_dir, exist_ok=True)
#         model.save_pretrained(checkpoint_dir)
#         print(f"New best model saved! Accuracy: {accuracy:.4f}")

# # Save the final model
# output_dir = "./final_model/llava-med-lora-5epochs"
# print(f"Saving final model to {output_dir}")
# model.save_pretrained(output_dir)
# processor.save_pretrained(output_dir)
# print("Model saved successfully!")

# print(f"Best model was from epoch {best_epoch} with accuracy {best_accuracy:.4f}")

Using a subset of 5000 examples for training
Created dataloader with batch size 4
Starting custom training loop for 5 epochs


Epoch 1/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1250/1250 [35:03<00:00,  1.68s/it, loss=9.24e-7]


Epoch 1/5 - Average loss: 0.0008
Evaluating after epoch 1...
Evaluating model on 50 examples...
Evaluation Accuracy: 0.3000 (15/50)
‚úÖ New best model saved! Accuracy: 0.3000


Epoch 2/5:  37%|‚ñà‚ñà‚ñà‚ñã      | 463/1250 [19:19:18<32:36:43, 149.18s/it, loss=8.64e-7]

In [None]:
import torch
from torch.utils.data import DataLoader, Subset
from tqdm.auto import tqdm
from torch.optim import AdamW
import os
import random
import numpy as np

# Create a fresh MedVQADataset instance with safer indexing
class SafeMedVQADataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe.reset_index(drop=True)  # Reset index to avoid KeyError
        self.image_dir = image_dir
        self.processor = processor
        self.image_token = "<image>"

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if idx >= len(self.dataframe):
            raise IndexError(f"Index {idx} out of bounds for dataset of size {len(self.dataframe)}")

        row = self.dataframe.iloc[idx]

        try:
            # Load image
            image_path = os.path.join(self.image_dir, row['Image_Name'])
            image = Image.open(image_path).convert('RGB')

            # Format the prompt with multiple choice options
            question = row['Question']
            choices = f"A: {row['Choice A']}\nB: {row['Choice B']}\nC: {row['Choice C']}\nD: {row['Choice D']}"

            # Use the image token at the beginning of the prompt
            prompt = f"{self.image_token}\nQuestion: {question}\n\nOptions:\n{choices}\n\nPlease select the correct answer (A, B, C, or D):"

            # Process inputs using the processor
            inputs = self.processor(
                images=image,
                text=prompt,
                return_tensors="pt"
            )

            # Prepare target (answer label)
            target = row['Answer_label']

            # Add target to inputs
            inputs = {k: v.squeeze(0) for k, v in inputs.items()}
            inputs["labels"] = self.processor.tokenizer(f" {target}", return_tensors="pt").input_ids[:, 1:].squeeze(0)

            return inputs

        except Exception as e:
            print(f"Error processing item {idx}: {e}")
            # Return a default item or raise the exception
            raise e

# Create a new safer dataset
safe_dataset = SafeMedVQADataset(train_df, image_dir, processor)

# Use a smaller subset for faster training
subset_size = 500  # Further reduced to speed up training
np.random.seed(42)  # For reproducibility
valid_indices = list(range(len(safe_dataset)))
selected_indices = np.random.choice(valid_indices, min(subset_size, len(valid_indices)), replace=False)
train_subset = Subset(safe_dataset, selected_indices)

print(f"Using a subset of {len(train_subset)} examples for training")

# Create a simpler data collator that just returns the batch
def simple_collator(batch):
    # Handle empty batches
    if len(batch) == 0:
        return {}

    # Extract pixel values
    pixel_values = torch.stack([item["pixel_values"] for item in batch])

    # Process input_ids and attention_mask with padding
    max_len = max(len(item["input_ids"]) for item in batch)

    # Prepare padded tensors
    input_ids = []
    attention_mask = []

    for item in batch:
        # Pad input_ids
        ids = item["input_ids"]
        padding = torch.full((max_len - len(ids),), processor.tokenizer.pad_token_id, dtype=ids.dtype)
        input_ids.append(torch.cat([ids, padding]))

        # Pad attention_mask
        mask = item["attention_mask"]
        padding = torch.zeros(max_len - len(mask), dtype=mask.dtype)
        attention_mask.append(torch.cat([mask, padding]))

    # Stack tensors
    input_ids = torch.stack(input_ids)
    attention_mask = torch.stack(attention_mask)

    # Simple target: just use the first letter of the answer
    targets = []
    for item in batch:
        # Get the label
        label_str = processor.tokenizer.decode(item["labels"])
        # Extract first character (should be A, B, C, or D)
        if len(label_str) > 0 and label_str[0] in "ABCD":
            # Map A, B, C, D to 0, 1, 2, 3
            target = ord(label_str[0]) - ord('A')
        else:
            # Default to A if label is not recognized
            target = 0
        targets.append(target)

    targets = torch.tensor(targets, dtype=torch.long)

    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "targets": targets
    }

# Create dataloader with simple collator - smaller batch size
batch_size = 4
dataloader = DataLoader(
    train_subset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=simple_collator,
    num_workers=0  # Use single-process loading for stability
)

print(f"Created dataloader with batch size {batch_size}")

# Quick function to evaluate on a few samples
def quick_evaluate(model, processor, device, train_df, image_dir, num_samples=20):
    """Simple evaluation on a few samples"""
    model.eval()

    # Select a sample of examples for evaluation
    sample_indices = random.sample(range(len(train_df)), min(num_samples, len(train_df)))
    samples = [train_df.iloc[i] for i in sample_indices]

    # Track results
    correct = 0
    total = 0

    for i, row in enumerate(samples):
        try:
            # Get image path
            image_path = os.path.join(image_dir, row['Image_Name'])

            # Get question and choices
            question = row['Question']
            choices = {
                'A': row['Choice A'],
                'B': row['Choice B'],
                'C': row['Choice C'],
                'D': row['Choice D']
            }

            # True answer
            true_answer = row['Answer_label']

            # Load and process image
            image = Image.open(image_path).convert('RGB')

            # Format choices text
            choices_text = f"A: {choices['A']}\nB: {choices['B']}\nC: {choices['C']}\nD: {choices['D']}"

            # Create prompt with image token
            prompt = f"<image>\nQuestion: {question}\n\nOptions:\n{choices_text}\n\nPlease select the correct answer (A, B, C, or D):"

            # Process inputs
            inputs = processor(images=image, text=prompt, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Generate prediction
            with torch.no_grad():
                outputs = model(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    pixel_values=inputs["pixel_values"]
                )

            # Get prediction
            logits = outputs.logits
            last_token_logits = logits[:, -1, :]
            relevant_logits = last_token_logits[:, :4]  # A, B, C, D
            prediction_idx = torch.argmax(relevant_logits, dim=1).item()
            predicted_answer = chr(ord('A') + prediction_idx)

            # Check if correct
            is_correct = (predicted_answer == true_answer)
            if is_correct:
                correct += 1
            total += 1

        except Exception as e:
            print(f"Error evaluating sample {i}: {e}")

    # Calculate accuracy
    accuracy = correct / total if total > 0 else 0
    print(f"Quick Evaluation Accuracy: {accuracy:.4f} ({correct}/{total})")
    return accuracy

# Reconfigure LoRA with smaller rank for faster training
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training

# Prepare model for LoRA fine-tuning (if not already done)
if not hasattr(model, 'peft_config'):
    model = prepare_model_for_kbit_training(model)

    # Define smaller LoRA configuration
    peft_config = LoraConfig(
        task_type="CAUSAL_LM",
        inference_mode=False,
        r=4,  # Reduced rank for faster training
        lora_alpha=16,
        lora_dropout=0.1,
        # Target fewer layers for faster training
        target_modules=["q_proj", "v_proj"]
    )

    # Apply LoRA adapters to the model
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

# Set up optimizer with higher learning rate for faster convergence
optimizer = AdamW(model.parameters(), lr=1e-4)

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Enable mixed precision for faster training
scaler = torch.cuda.amp.GradScaler()

# Training loop with reduced epochs
num_epochs = 2  # Reduced to 2 epochs for faster training
print(f"Starting accelerated training for {num_epochs} epochs with mixed precision")

# Create directory for checkpoints
os.makedirs("./checkpoints", exist_ok=True)

# Track best model
best_accuracy = 0.0
best_epoch = 0

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Extract targets
        targets = batch.pop("targets")

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass with mixed precision
        with torch.cuda.amp.autocast():
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                pixel_values=batch["pixel_values"],
            )

            # Compute loss
            logits = outputs.logits
            last_token_logits = logits[:, -1, :]
            relevant_logits = last_token_logits[:, :4]
            loss = torch.nn.functional.cross_entropy(relevant_logits, targets)

        # Backward pass with scaling
        scaler.scale(loss).backward()

        # Update weights with scaling
        scaler.step(optimizer)
        scaler.update()

        # Update progress bar
        total_loss += loss.item()
        progress_bar.set_postfix({"loss": loss.item()})

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average loss: {avg_loss:.4f}")

    # Quick evaluation after each epoch
    print(f"Quick evaluation after epoch {epoch+1}...")
    accuracy = quick_evaluate(model, processor, device, train_df, image_dir, num_samples=20)

    # Save checkpoint if best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_epoch = epoch + 1

        # Save the best model
        checkpoint_dir = f"./checkpoints/epoch_{epoch+1}_acc_{accuracy:.4f}"
        os.makedirs(checkpoint_dir, exist_ok=True)
        model.save_pretrained(checkpoint_dir)
        print(f" New best model saved! Accuracy: {accuracy:.4f}")

# Save the final model
output_dir = "./final_model/llava-med-lora-accelerated"
print(f"Saving final model to {output_dir}")
model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)
print("Model saved successfully!")

print(f"Best model was from epoch {best_epoch} with accuracy {best_accuracy:.4f}")

Using a subset of 500 examples for training
Created dataloader with batch size 4
Starting accelerated training for 2 epochs with mixed precision


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Epoch 1/2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [03:39<00:00,  1.76s/it, loss=5.81e-6]


Epoch 1/2 - Average loss: 2.0400
Quick evaluation after epoch 1...
Quick Evaluation Accuracy: 0.4000 (8/20)
‚úÖ New best model saved! Accuracy: 0.4000


Epoch 2/2:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 59/125 [2:29:58<2:47:45, 152.51s/it, loss=5.28e-6]


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# **Please Note when we ran the fine-tuned code for the first time we got 51% of the accuracy. We tried improving the accuracy and kept the code on run again. But due to memory issues we got 40% of the accuracya nd it stopped in between as you can see above.**

# **Evaluate Model**

In [None]:
def calculate_accuracy(model, processor, device, test_df, image_dir, num_samples=None, detailed=False):
    """
    Calculate accuracy of the model on test data with detailed metrics

    Args:
        model: The fine-tuned model
        processor: The processor for inputs
        device: The device to run inference on
        test_df: Dataframe containing test examples
        image_dir: Directory containing images
        num_samples: Number of samples to evaluate (None for all)
        detailed: Whether to return detailed metrics

    Returns:
        accuracy: Overall accuracy
        metrics: Detailed metrics if detailed=True
    """
    model.eval()

    # Select samples for evaluation
    if num_samples is not None and num_samples < len(test_df):
        sample_indices = random.sample(range(len(test_df)), num_samples)
        samples = test_df.iloc[sample_indices]
    else:
        samples = test_df

    print(f"Evaluating model on {len(samples)} examples...")

    # Initialize counters
    correct = 0
    total = 0

    # Initialize detailed metrics if requested
    results = []
    confusion_matrix = {
        'A': {'A': 0, 'B': 0, 'C': 0, 'D': 0},
        'B': {'A': 0, 'B': 0, 'C': 0, 'D': 0},
        'C': {'A': 0, 'B': 0, 'C': 0, 'D': 0},
        'D': {'A': 0, 'B': 0, 'C': 0, 'D': 0}
    }

    # Track accuracy by question type (if available)
    question_types = {}

    # Process each sample
    progress_bar = tqdm(range(len(samples)), desc="Evaluating")
    for i in progress_bar:
        row = samples.iloc[i]

        # Get image path
        image_path = os.path.join(image_dir, row['Image_Name'])

        # Skip if image doesn't exist
        if not os.path.exists(image_path):
            print(f"Warning: Image not found: {image_path}")
            continue

        # Get question and choices
        question = row['Question']
        choices = {
            'A': row['Choice A'],
            'B': row['Choice B'],
            'C': row['Choice C'],
            'D': row['Choice D']
        }

        # True answer
        true_answer = row['Answer_label']

        # Make prediction
        try:
            # Load and process image
            image = Image.open(image_path).convert('RGB')

            # Format choices text
            choices_text = f"A: {choices['A']}\nB: {choices['B']}\nC: {choices['C']}\nD: {choices['D']}"

            # Create prompt with image token
            prompt = f"<image>\nQuestion: {question}\n\nOptions:\n{choices_text}\n\nPlease select the correct answer (A, B, C, or D):"

            # Process inputs
            inputs = processor(images=image, text=prompt, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Generate prediction
            with torch.no_grad():
                outputs = model(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    pixel_values=inputs["pixel_values"]
                )

            # Get prediction
            logits = outputs.logits
            last_token_logits = logits[:, -1, :]
            relevant_logits = last_token_logits[:, :4]  # A, B, C, D
            prediction_idx = torch.argmax(relevant_logits, dim=1).item()
            predicted_answer = chr(ord('A') + prediction_idx)

            # Record result
            is_correct = (predicted_answer == true_answer)
            if is_correct:
                correct += 1
            total += 1

            # Update confusion matrix
            confusion_matrix[true_answer][predicted_answer] += 1

            # Track by question type if we can extract it
            # Simple heuristic: first few words of question
            question_type = question.split()[0] if len(question.split()) > 0 else "Unknown"
            if question_type not in question_types:
                question_types[question_type] = {"correct": 0, "total": 0}
            question_types[question_type]["total"] += 1
            if is_correct:
                question_types[question_type]["correct"] += 1

            # Store detailed result if requested
            if detailed:
                results.append({
                    "id": i,
                    "question": question,
                    "true_answer": true_answer,
                    "predicted_answer": predicted_answer,
                    "is_correct": is_correct,
                    "image_path": image_path
                })

        except Exception as e:
            print(f"Error processing example {i}: {e}")

    # Calculate overall accuracy
    overall_accuracy = correct / total if total > 0 else 0
    print(f"Overall Accuracy: {overall_accuracy:.4f} ({correct}/{total})")

    # Print confusion matrix
    print("\nConfusion Matrix:")
    print("  | A  | B  | C  | D  |")
    print("--|----|----|----|----|")
    for true_label in ['A', 'B', 'C', 'D']:
        row_str = f"{true_label} |"
        for pred_label in ['A', 'B', 'C', 'D']:
            count = confusion_matrix[true_label][pred_label]
            row_str += f" {count:2d} |"
        print(row_str)

    # Print accuracy by question type
    print("\nAccuracy by Question Type:")
    for qtype, stats in sorted(question_types.items(),
                              key=lambda x: x[1]["total"],
                              reverse=True):
        if stats["total"] >= 5:  # Only show types with at least 5 examples
            type_acc = stats["correct"] / stats["total"]
            print(f"{qtype}: {type_acc:.4f} ({stats['correct']}/{stats['total']})")

    # Return results
    if detailed:
        return {
            "accuracy": overall_accuracy,
            "confusion_matrix": confusion_matrix,
            "question_types": question_types,
            "detailed_results": results
        }
    else:
        return overall_accuracy

In [None]:
# Evaluate on test set (or a subset for faster evaluation)
accuracy_metrics = calculate_accuracy(
    model=model,
    processor=processor,
    device=device,
    test_df=train_df,  # Using training data since it has the right structure
    image_dir=image_dir,
    num_samples=100,   # Number of samples to evaluate
    detailed=True      # Get detailed metrics
)

print(f"Final model accuracy: {accuracy_metrics['accuracy']:.4f}")

# Identify most challenging question types
print("\nQuestion Types with Lowest Accuracy:")
sorted_types = sorted(
    [(qtype, stats["correct"]/stats["total"], stats["total"])
     for qtype, stats in accuracy_metrics["question_types"].items()
     if stats["total"] >= 5],
    key=lambda x: x[1]
)

for qtype, acc, total in sorted_types[:5]:
    print(f"{qtype}: {acc:.4f} ({total} examples)")