In [1]:
import pandas as pd

In [2]:
# data preview

df = pd.read_csv('../data/fundus/annotations.csv') # has to go up one directory first
df.head()

Unnamed: 0,filepath,split,tasks/disease presence,tasks/disease labels,original_filepath,original_split,disease labels,disease presence,original_image_size
0,images/000000.tiff,train,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,DR,abnormal,"(2144, 1424)"
1,images/000001.tiff,train,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,DR,abnormal,"(2144, 1424)"
2,images/000002.tiff,train,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,DR,abnormal,"(2144, 1424)"
3,images/000003.tiff,train,1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,MH|ODC,abnormal,"(2144, 1424)"
4,images/000004.tiff,train,1,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,DR|LS,abnormal,"(2144, 1424)"


In [3]:
# cols

df.columns

Index(['filepath', 'split', 'tasks/disease presence', 'tasks/disease labels',
       'original_filepath', 'original_split', 'disease labels',
       'disease presence', 'original_image_size'],
      dtype='object')

In [4]:
# find satisfactory subset, w/ 3000 images
# find satisfactory floors (atleast 30%)

# split by abnormal and normal

df_abnormal = df[df['disease presence'] == 'abnormal']
df_normal = df[df['disease presence'] == 'normal']

abnormal_count = len(df_abnormal)
normal_count = len(df_normal)

print('abnormal proportion: ' + str(abnormal_count))
print('normal proportion: ' + str(normal_count))

# generate subset df w/ maximal floors

abnormal_needed = 3000 - normal_count

# get required number of abnormal entries
abnormal_subset = df_abnormal.iloc[:abnormal_needed]

# combine both
subset = pd.concat([df_normal, abnormal_subset], ignore_index=True)

print('subset size:', len(subset))
print('normal proportion:', len(df_normal)/3000)
print('abnormal proportion:', len(abnormal_subset)/3000)

abnormal proportion: 2531
normal proportion: 669
subset size: 3000
normal proportion: 0.223
abnormal proportion: 0.777


#### Note: a lower floor (22.3%) than desired (30%)

In [5]:
# condition matching

conditions_full = [
    "Diabetic Retinopathy",                       # DR
    "Age-Related Macular Degeneration",          # ARMD
    "Macular Hole",                               # MH
    "Diabetic Neuropathy",                        # DN
    "Myopia",                                     # MYA
    "Branch Retinal Vein Occlusion",             # BRVO
    "Tessellated Fundus",                         # TSLN
    "Epiretinal Membrane",                        # ERM
    "Lattice Degeneration",                       # LS
    "Multiple Sclerosis",                         # MS
    "Central Serous Retinopathy",                 # CSR
    "Optic Disc Cupping",                         # ODC
    "Central Retinal Vein Occlusion",             # CRVO
    "Temporal Vein",                               # TV
    "Asteroid Hyalosis",                           # AH
    "Optic Disc Pit",                               # ODP
    "Optic Disc Edema",                              # ODE
    "Stargardt Disease",                                # ST
    "Anterior Ischemic Optic Neuropathy",                  # AION
    "Paracentral Tubular",                                  # PT
    "Retinal Tear",                                          # RT
    "Retinoschisis",                                         # RS
    "Central Retinal Sheath",                                # CRS
    "Exudative Detachment",                                  # EDN
    "Retinal Pigment Epithelial Changes",                    # RPEC
    "Lamellar Macular Hole",                                  # MHL
    "Retinitis Pigmentosa",                                    # RP
    "Cotton Wool Spot",                                        # CWS
    "Choroidal Blood / Bruising",                               # CB
    "Optic Disc Pit Maculopathy",                                # ODPM
    "Pre-retinal Hemorrhage",                                      # PRH
    "Myelinated Nerve Fibers",                                      # MNF
    "Hard Exudates",                                                 # HR
    "Central Retinal Artery Occlusion",                               # CRAO
    "Temporal Disc",                                                  # TD
    "Cystoid Macular Edema",                                           # CME
    "Posterior Cortical Cataract",                                      # PTCR
    "Chorioretinal Fold / Choroidal Fissure",                            # CF
    "Vitreous Hemorrhage",                                               # VH
    "Middle Cerebral Artery",                                             # MCA
    "Visual Sensation",                                                   # VS
    "Branch Retinal Artery Occlusion",                                     # BRAO
    "Plaque",                                                                # PLQ
    "Hyperpigmented Epithelial Defect / Deposit",                             # HPED
    "Collateral Vessel / Chorioretinal Lesion"                                  # CL
]

def map_values_to_conditions(lst):
  exhibited_conditions = []
  for i in range(len(lst)):
    if lst[i] == 1:
      exhibited_conditions.append(conditions_full[i])
  return exhibited_conditions

# # Example input
# lst = [0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1]

# # Example usage:
# mapping = map_values_to_conditions(lst)
# print(mapping)


In [27]:
temp = df[df['disease presence'] == 'abnormal']

# all zero list
lst = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


for r in temp.iterrows():
    # convert tuple to list
    # get 'tasks/disease labels'
    r_list = r[1]['tasks/disease labels']
    r_list = list(r_list)
    print(r_list)
    # for i in range(len(r_list)):
    #     lst[i] += int(r_list[i])
        # r_list[i] = 1 if r_list[i] > 0 else 0
    # r_list = list(r[1]['tasks/disease labels'])
    # print(r_list)
    # r_list = list(r[3])
    # element-wise addition
    # lst = [x + y for x, y in zip(lst, r_list)]
# get the count of each disease

print(lst)

# disease_counts = temp[conditions_full].sum()
# disease_counts

['[', '1', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ']']
['[', '1', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0', ',', ' ', '0'

In [28]:
# control step 1 -> retrieval captions

import random
import ast

'''
two captions:
  1. this image depicts an (normal/abnormal) fundus
  2. this image depicts a (normal/abnormal) fundus with (disease presence)
'''

# fundus_control_df = subset.copy()

def generate_control_retrieval_captions(disease_labels, disease_presence):
    # generate caption 1
    caption_1 = ""
    if disease_presence == 'normal':
        caption_1 = "This image depicts a " + disease_presence + " fundus"
    else:
        caption_1 = "This image depicts an " + disease_presence + " fundus"

    if isinstance(disease_labels, str):
        disease_labels = ast.literal_eval(disease_labels)

    # get diseases
    diseases = map_values_to_conditions(disease_labels)

    # if multiple diseases are present, pick one at random
    if len(diseases) > 1:
        disease_labels = random.choice(diseases)
    elif len(diseases) == 1:
        disease_labels = diseases[0]
    else:
        disease_labels = "no disease"

    # generate caption 2
    caption_2 = caption_1 + " with " + disease_labels
    # caption_2 = "This image depicts a " + disease_presence + " fundus with " + disease_labels

    return [caption_1, caption_2]

    # add as a new col in fundus_control_df

subset['control_retrieval'] = subset.apply(lambda row: generate_control_retrieval_captions(row['tasks/disease labels'], row['disease presence']), axis=1)
subset.head()


Unnamed: 0,filepath,split,tasks/disease presence,tasks/disease labels,original_filepath,original_split,disease labels,disease presence,original_image_size,control_retrieval
0,images/000009.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag..."
1,images/000015.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag..."
2,images/000016.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag..."
3,images/000020.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag..."
4,images/000027.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag..."


In [None]:
# control step 2 -> MCQ

'''
two options:
  1. this image depicts a/an (normal/abnormal) fundus -> true choice
  2. this image depicts a/an (normal/abnormal) fundus -> false choice
'''

def generate_control_mcq_options(disease_presence):
  true = disease_presence
  false = "abnormal" if disease_presence == "normal" else "normal"

  true_question = ""
  false_question = ""

  if true == "normal":
      true_question = "This image depicts a normal fundus"
      false_question = "This image depicts an abnormal fundus"
  else:
      true_question = "This image depicts an abnormal fundus"
      false_question = "This image depicts a normal fundus"

  return [true_question, false_question]

# add a new col in fundus_control_df
# note that the first mcq option is always correct

subset['control_mcq'] = subset['disease presence'].apply(generate_control_mcq_options)
subset.head()


Unnamed: 0,filepath,split,tasks/disease presence,tasks/disease labels,original_filepath,original_split,disease labels,disease presence,original_image_size,control_retrieval,control_mcq
0,images/000009.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag..."
1,images/000015.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag..."
2,images/000016.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag..."
3,images/000020.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag..."
4,images/000027.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag..."


In [34]:
# negation step 1 -> retrieval captions

'''
two captions:
  1. this image depicts a fundus that is NOT (normal/abnormal)
  2. this image depicts a fundus that is NOT (normal/abnormal) with (disease presence)
'''

def generate_negated_retrieval_captions(disease_labels, disease_presence):
  # for each row generate two captions, using complement disease_presence
  if disease_presence == "normal":
    complement = "abnormal"
  else:
    complement = "normal"

  caption_1 = "This image depicts a fundus that is not " + complement

  if isinstance(disease_labels, str):
        disease_labels = ast.literal_eval(disease_labels)

  # get diseases
  diseases = map_values_to_conditions(disease_labels)

  # if multiple diseases are present, pick one at random
  if len(diseases) > 1:
    disease_labels = random.choice(diseases)
  elif len(diseases) == 1:
    disease_labels = diseases[0]
  else:
    disease_labels = "no disease"

  caption_2 = "This image depicts a fundus with " + disease_labels + " that is not " + complement

  return [caption_1, caption_2]

# add as a new col in subset

subset['negated_retrieval'] = subset.apply(lambda row: generate_negated_retrieval_captions(row['tasks/disease labels'], row['disease presence']), axis=1)
subset.head()


Unnamed: 0,filepath,split,tasks/disease presence,tasks/disease labels,original_filepath,original_split,disease labels,disease presence,original_image_size,control_retrieval,control_mcq,negated_retrieval
0,images/000009.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag...",[This image depicts a fundus that is not abnor...
1,images/000015.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag...",[This image depicts a fundus that is not abnor...
2,images/000016.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag...",[This image depicts a fundus that is not abnor...
3,images/000020.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag...",[This image depicts a fundus that is not abnor...
4,images/000027.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag...",[This image depicts a fundus that is not abnor...


In [37]:
# negation step 2 -> MCQ

'''
two options:
  1. this image does not depict a/an (normal/abnormal) fundus -> true choice
  2. this image does not depict a/an (normal/abnormal) fundus
'''

def generate_negated_mcq_options(disease_presence):
    true = disease_presence
    false = "abnormal" if disease_presence == "normal" else "normal"

    true_question, false_question = "", ""

    # if true is normal, false is abnormal and vice versa
    if false == "normal":
        true_question = "This image does not depict a " + false + " fundus"
        false_question = "This image does not depict an " + true + " fundus"
    else:
        true_question = "This image does not depict an " + false + " fundus"
        false_question = "This image does not depict a " + true + " fundus"

    # # generate true question first
    # true_question = "This image does not depict a " + false + " fundus"

    # # generate incorrect question next
    # false_question = "This image does not depict a " + true + " fundus"

    return [true_question, false_question]

# add a new col in fundus_control_df
# note that the first mcq option is always correct

subset['negated_mcq'] = subset['disease presence'].apply(generate_negated_mcq_options)
subset.head()

Unnamed: 0,filepath,split,tasks/disease presence,tasks/disease labels,original_filepath,original_split,disease labels,disease presence,original_image_size,control_retrieval,control_mcq,negated_retrieval,negated_mcq
0,images/000009.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag...",[This image depicts a fundus that is not abnor...,[This image does not depict an abnormal fundus...
1,images/000015.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag...",[This image depicts a fundus that is not abnor...,[This image does not depict an abnormal fundus...
2,images/000016.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag...",[This image depicts a fundus that is not abnor...,[This image does not depict an abnormal fundus...
3,images/000020.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag...",[This image depicts a fundus that is not abnor...,[This image does not depict an abnormal fundus...
4,images/000027.tiff,train,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. RFMiD_All_Classes_Dataset/1. Original Image...,train,,normal,"(2144, 1424)","[This image depicts a normal fundus, This imag...","[This image depicts a normal fundus, This imag...",[This image depicts a fundus that is not abnor...,[This image does not depict an abnormal fundus...


In [None]:
# variation w/ paraphrasing

import sys
sys.path.insert(0, '../')  # Go up one directory level

from groq_paraphrase import paraphrase_captions

subset['control_retrieval'] = subset['control_retrieval'].apply(paraphrase_captions)
subset['control_mcq'] = subset['control_mcq'].apply(paraphrase_captions)
subset['negated_retrieval'] = subset['negated_retrieval'].apply(paraphrase_captions)
subset['negated_mcq'] = subset['negated_mcq'].apply(paraphrase_captions)

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01jk19dm48e0ntktmn79yzfhfv` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 200189, Requested 107. Please try again in 2m8.155999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [None]:
# Alternative import method if the above doesn't work
import sys
sys.path.insert(0, '../')  # Go up one directory level

from groq_paraphrase import paraphrase_captions

In [None]:
# Example: Paraphrase control retrieval captions
# Make sure you have your GROQ_API_KEY environment variable set first

# Get a sample of captions to paraphrase
sample_captions = []
for captions_list in subset['control_retrieval'].head(3):  # Just first 3 for testing
    sample_captions.extend(captions_list)

print("Original captions:")
for i, caption in enumerate(sample_captions):
    print(f"{i+1}. {caption}")

# Paraphrase them
paraphrased = paraphrase_captions(sample_captions)

print("\nParaphrased captions:")
for i, caption in enumerate(paraphrased):
    print(f"{i+1}. {caption}")