In [2]:
import pandas as pd
import random

In [4]:
# Load the CXR annotation table and preview its first rows.
# The path is relative to this notebook, so it goes up one directory first.
annotations_csv = '../data/cxr/annotations.csv'
df = pd.read_csv(annotations_csv)
df.head()

Unnamed: 0,filepath,split,tasks/disease labels,tasks/patient sex,original_filepath,original_split,patient_id,bounding_box,disease labels,finding_labels,follow-up_nb,original_image_size,original_pixel_spacing,patient sex,patient_age,view_position
0,images/000000.tiff,train,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,images/00000001_000.png,train,1,,Cardiomegaly,Cardiomegaly,0,"(2682,2749)","(0.143,0.143)",M,57,PA
1,images/000001.tiff,train,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",1,images/00000001_001.png,train,1,,Cardiomegaly|Emphysema,Cardiomegaly|Emphysema,1,"(2894,2729)","(0.143,0.143)",M,58,PA
2,images/000002.tiff,train,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,images/00000001_002.png,train,1,,Cardiomegaly|Effusion,Cardiomegaly|Effusion,2,"(2500,2048)","(0.168,0.168)",M,58,PA
3,images/000003.tiff,train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,images/00000002_000.png,train,2,,,No Finding,0,"(2500,2048)","(0.171,0.171)",M,80,PA
4,images/000004.tiff,test,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",0,images/00000003_001.png,test,3,,Hernia,Hernia,0,"(2500,2048)","(0.168,0.168)",F,74,PA


In [4]:
# EDA; for most common diseases

# disease_set = {}

# for labels in df['finding_labels']:
#     for label in labels.split('|'):
#         if label != 'No Finding':
#             disease_set[label] = disease_set.get(label, 0) + 1

# print(disease_set)

# # sort
# sorted_disease_set = dict(sorted(disease_set.items(), key=lambda item: item[1], reverse=True))
# print(sorted_disease_set)

In [None]:
# distinct patient ids in the annotation table (30805 in the recorded output)
pd.unique(df['patient_id'])

array([    1,     2,     3, ..., 30803, 30804, 30805], shape=(30805,))

In [None]:
# Build a 3000-image subset: 1000 images for each of the 3 most common diseases.
# find satisfactory floors (at least 30%)

# One boolean mask per disease, computed once from the '|'-separated label string.
finding_labels = df['finding_labels']
has_infiltration = finding_labels.str.contains('Infiltration')
has_effusion = finding_labels.str.contains('Effusion')
has_atelectasis = finding_labels.str.contains('Atelectasis')

# For each disease: keep rows that show it and neither of the other two,
# tag them with the main disease, then draw 1000 rows with a fixed seed.
infiltration = (
    df[has_infiltration & ~(has_effusion | has_atelectasis)]
    .assign(main_disease='Infiltration')
    .sample(n=1000, random_state=42)
)
effusion = (
    df[has_effusion & ~(has_infiltration | has_atelectasis)]
    .assign(main_disease='Effusion')
    .sample(n=1000, random_state=42)
)
atelectasis = (
    df[has_atelectasis & ~(has_infiltration | has_effusion)]
    .assign(main_disease='Atelectasis')
    .sample(n=1000, random_state=42)
)

# Stack the three samples (Infiltration, Effusion, Atelectasis, in that order)
# and give every frame a fresh 0..n-1 index.
subset = pd.concat([infiltration, effusion, atelectasis], ignore_index=True)
infiltration = infiltration.reset_index(drop=True)
effusion = effusion.reset_index(drop=True)
atelectasis = atelectasis.reset_index(drop=True)

In [6]:
# step 1 -> generate retrieval captions

'''
template 1:
1. depicted condition
2. number of diseases
3. not non-depicted condition 1
4. not non-depicted condition 2
5. (hybrid) not non-depicted condition 1 OR 2 and number of conditions

template 2:
1. depicted condition
2. number of diseases
3. not non-depicted condition 1
4. not non-depicted condition 2
5. (hybrid) depicted condition and number of conditions 
'''

def generate_retrieval_captions(main_disease, all_diseases, template, rng=None):
    """Build the five retrieval captions for one CXR image.

    Args:
        main_disease: The disease the image is labelled with; one of
            'Infiltration', 'Effusion' or 'Atelectasis'.
        all_diseases: All findings for the image as a '|'-separated string;
            the number of entries is reported in the captions.
        template: '1' (hybrid caption negates a non-depicted condition) or
            '2' (hybrid caption affirms the depicted condition).
        rng: Optional object with a ``choice`` method (e.g. ``random.Random(seed)``)
            used for the random picks. Defaults to the global ``random`` module.

    Returns:
        A list of five caption strings: depicted condition, disease count,
        two negated non-depicted conditions, and the hybrid caption.

    Raises:
        ValueError: If ``template`` is neither '1' nor '2'.
    """
    if template not in ('1', '2'):
        # Previously an unknown template fell through both branches and
        # failed later with an UnboundLocalError on c5.
        raise ValueError(f"Unknown retrieval template: {template!r} (expected '1' or '2')")

    chooser = random if rng is None else rng

    disease_map = ['Infiltration', 'Effusion', 'Atelectasis']
    disease_count = len(all_diseases.split('|'))
    non_depicted = [d for d in disease_map if d != main_disease]

    # Randomise which non-depicted condition goes into which negation phrasing.
    first_condition = chooser.choice(non_depicted)
    second_condition = chooser.choice([d for d in non_depicted if d != first_condition])

    disease_noun = 'disease' if disease_count == 1 else 'diseases'
    condition_noun = 'condition' if disease_count == 1 else 'conditions'

    c1 = f"This CXR scan shows {main_disease}."
    c2 = f"This CXR scan features {disease_count} {disease_noun}."
    c3 = f"This CXR scan does not show {first_condition}."
    c4 = f"This CXR scan shows no evidence of {second_condition}."

    if template == '1':
        random_condition = chooser.choice(non_depicted)
        c5 = f"This CXR is without {random_condition} disease and has {disease_count} {condition_noun}."
    else:
        c5 = f"This CXR exhibits {main_disease} and {disease_count} {condition_noun}."

    return [c1, c2, c3, c4, c5]

# generate_retrieval_captions draws from Python's global RNG; seed it so the
# captions are identical on every Restart & Run All (same seed as the pandas sampling).
random.seed(42)

# First 1500 rows get template 1, the remaining rows get template 2.
# NOTE(review): subset is stacked as Infiltration, Effusion, Atelectasis (1000 each),
# so this positional split gives every Infiltration image template 1 and every
# Atelectasis image template 2 -- confirm that this imbalance is intended.
s1 = subset.iloc[:1500].copy()
s2 = subset.iloc[1500:].copy()

s1['retrieval_captions'] = s1.apply(lambda row: generate_retrieval_captions(row['main_disease'], row['finding_labels'], '1'), axis=1)
s2['retrieval_captions'] = s2.apply(lambda row: generate_retrieval_captions(row['main_disease'], row['finding_labels'], '2'), axis=1)

cxr_retrieval = pd.concat([s1, s2]).reset_index(drop=True)

In [7]:
# step 2 -> generate mcq questions

'''
1500 images have correct answers not involving negation -> set A
1500 images have correct answers involving negation -> set B

500 images from each disease type are allotted to set A
500 images from each disease type are allotted to set B

correct option is always first
'''

'''
template 1 (non-negated):
1. depicted condition
2. non-depicted condition 1
3. non-depicted condition 2

template 2.1 (negated):
1. not non-depicted condition 1
2. non-depicted condition 2
3. not depicted condition

template 2.2 (negated):
1. not non-depicted condition 2
2. non-depicted condition 1
3. not depicted condition
'''

def generate_mcq(main_disease, template):
    """Build the three multiple-choice options for one CXR image.

    The correct option is always the first element of the returned list.

    Args:
        main_disease: The disease the image is labelled with; one of
            'Infiltration', 'Effusion' or 'Atelectasis'.
        template: '1' (affirmative correct answer), '2.1' (correct answer
            negates the first non-depicted condition) or '2.2' (correct
            answer negates the second non-depicted condition).

    Returns:
        A list of three caption strings ``[correct, distractor, distractor]``.

    Raises:
        ValueError: If ``template`` is not one of '1', '2.1', '2.2'.
    """
    disease_map = ['Infiltration', 'Effusion', 'Atelectasis']
    negative_objects = [d for d in disease_map if d != main_disease]
    negative_obj1, negative_obj2 = negative_objects[0], negative_objects[1]

    if template == '1':
        return [
            f"This CXR scan shows {main_disease}.",
            f"This CXR scan shows {negative_obj1}.",
            f"This CXR scan shows {negative_obj2}.",
        ]

    if template == '2.1':
        return [
            f"This CXR scan does not show {negative_obj1}.",
            f"This CXR scan shows {negative_obj2}.",
            f"This CXR scan does not show {main_disease}.",
        ]

    if template == '2.2':
        return [
            f"This CXR scan does not show {negative_obj2}.",
            f"This CXR scan shows {negative_obj1}.",
            f"This CXR scan does not show {main_disease}.",
        ]

    # Previously an unknown template fell off the end and returned None,
    # which would silently end up as a missing value in the captions column.
    raise ValueError(f"Unknown MCQ template: {template!r} (expected '1', '2.1' or '2.2')")


def _with_mcq(frame, template):
    """Return a copy of `frame` with an 'mcq_captions' column built from `template`."""
    captions = frame['main_disease'].map(lambda disease: generate_mcq(disease, template))
    return frame.assign(mcq_captions=captions)


# Per disease: the first 500 rows get the affirmative template ('1'); the
# remaining rows are split 250/rest between the two negated templates.

infiltration_affirmative = _with_mcq(infiltration.iloc[:500], '1')
infiltration_negated = infiltration.iloc[500:].copy()
infiltration_negated1 = _with_mcq(infiltration_negated.iloc[:250], '2.1')
infiltration_negated2 = _with_mcq(infiltration_negated.iloc[250:], '2.2')

effusion_affirmative = _with_mcq(effusion.iloc[:500], '1')
effusion_negated = effusion.iloc[500:].copy()
effusion_negated1 = _with_mcq(effusion_negated.iloc[:250], '2.1')
effusion_negated2 = _with_mcq(effusion_negated.iloc[250:], '2.2')

atelectasis_affirmative = _with_mcq(atelectasis.iloc[:500], '1')
atelectasis_negated = atelectasis.iloc[500:].copy()
atelectasis_negated1 = _with_mcq(atelectasis_negated.iloc[:250], '2.1')
atelectasis_negated2 = _with_mcq(atelectasis_negated.iloc[250:], '2.2')

# affirmative set: correct answers without negation
affirmative_mcq = pd.concat(
    [infiltration_affirmative, effusion_affirmative, atelectasis_affirmative],
    ignore_index=True,
)

# negated set: correct answers with negation
negated_mcq = pd.concat(
    [
        infiltration_negated1, infiltration_negated2,
        effusion_negated1, effusion_negated2,
        atelectasis_negated1, atelectasis_negated2,
    ],
    ignore_index=True,
)

In [9]:
# step 3 -> merge retrieval and mcq captions into one final dataset

# retrieval captions: already built in cxr_retrieval
# mcq captions: affirmative_mcq + negated_mcq combined
all_mcq = pd.concat([affirmative_mcq, negated_mcq]).reset_index(drop=True)

# The two frames cover the same images but in DIFFERENT row orders:
#   cxr_retrieval -> all Infiltration, then all Effusion, then all Atelectasis
#   all_mcq       -> the affirmative halves of each disease, then the negated halves
# Assigning the column by positional index would therefore attach MCQ captions
# to the wrong images, so join on the image identifier instead.
# 'filepath' is assumed to be unique per image; validate='one_to_one' makes the
# merge fail loudly if it is not.
final_df = cxr_retrieval.merge(
    all_mcq[['filepath', 'mcq_captions']],
    on='filepath',
    how='left',
    validate='one_to_one',
)

# sanity checks: no rows gained or lost, and every image received its mcq captions
assert len(final_df) == len(cxr_retrieval), "merge changed the number of rows"
assert final_df['mcq_captions'].notna().all(), "some images have no mcq captions"

final_df

Unnamed: 0,filepath,split,tasks/disease labels,tasks/patient sex,original_filepath,original_split,patient_id,bounding_box,disease labels,finding_labels,follow-up_nb,original_image_size,original_pixel_spacing,patient sex,patient_age,view_position,main_disease,retrieval_captions,mcq_captions
0,images/021594.tiff,train,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,images/00005746_011.png,train,5746,,Infiltration,Infiltration,11,"(2500,2048)","(0.171,0.171)",F,46,PA,Infiltration,"[This CXR scan shows Infiltration., This CXR s...","[This CXR scan shows Infiltration., This CXR s..."
1,images/009042.tiff,train,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,images/00002371_003.png,train,2371,,Infiltration,Infiltration,0,"(2500,2048)","(0.168,0.168)",M,35,AP,Infiltration,"[This CXR scan shows Infiltration., This CXR s...","[This CXR scan shows Infiltration., This CXR s..."
2,images/062315.tiff,train,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,images/00015414_017.png,train,15414,,Infiltration,Infiltration,17,"(2992,2991)","(0.143,0.143)",M,40,PA,Infiltration,"[This CXR scan shows Infiltration., This CXR s...","[This CXR scan shows Infiltration., This CXR s..."
3,images/064877.tiff,test,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,images/00016014_010.png,test,16014,,Infiltration,Infiltration,10,"(2992,2991)","(0.143,0.143)",M,63,PA,Infiltration,"[This CXR scan shows Infiltration., This CXR s...","[This CXR scan shows Infiltration., This CXR s..."
4,images/048204.tiff,train,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,images/00012219_016.png,train,12219,,Infiltration,Infiltration,16,"(2048,2500)","(0.168,0.168)",F,68,PA,Infiltration,"[This CXR scan shows Infiltration., This CXR s...","[This CXR scan shows Infiltration., This CXR s..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,images/016664.tiff,val,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,images/00004463_000.png,train,4463,,Atelectasis,Atelectasis,0,"(2500,2048)","(0.171,0.171)",F,54,AP,Atelectasis,"[This CXR scan shows Atelectasis., This CXR sc...","[This CXR scan does not show Effusion., This C..."
2996,images/059547.tiff,train,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,images/00014710_012.png,train,14710,,Atelectasis,Atelectasis,12,"(2730,2991)","(0.143,0.143)",F,59,PA,Atelectasis,"[This CXR scan shows Atelectasis., This CXR sc...","[This CXR scan does not show Effusion., This C..."
2997,images/111184.tiff,train,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,images/00030350_000.png,train,30350,,Atelectasis,Atelectasis,0,"(2021,2020)","(0.1943109999999999,0.1943109999999999)",M,24,PA,Atelectasis,"[This CXR scan shows Atelectasis., This CXR sc...","[This CXR scan does not show Effusion., This C..."
2998,images/043254.tiff,train,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,images/00011156_001.png,train,11156,,Atelectasis,Atelectasis,1,"(2990,2991)","(0.143,0.143)",M,61,PA,Atelectasis,"[This CXR scan shows Atelectasis., This CXR sc...","[This CXR scan does not show Effusion., This C..."


In [12]:
# Persist the captioned dataset; the row index is not written to the file.
captions_csv = '../data/cxr_captions.csv'
final_df.to_csv(captions_csv, index=False)