In [1]:
import pandas as pd

# Number of leading rows of the loaded table handed to the pairing demo
# (used as `df_mimic_all.head(MAX_ROWS)`), keeping the preview run small.
MAX_ROWS = 100

In [2]:
# Load the full label table (mimic_all.csv from the medical-diff-vqa-1.0.1
# data directory) and preview its first 10 rows.
df_mimic_all = pd.read_csv('data/medical-diff-vqa-1.0.1/mimic_all.csv')
df_mimic_all.head(10)

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,...,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,dicom_id,view,split,study_date,study_order
0,10000032,50414267,,,,,,,,,...,,,,,,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,postero-anterior,train,21800506,1.0
1,10000032,53189527,,,,,,,,,...,,,,,,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,postero-anterior,train,21800626,2.0
2,10000032,53911762,,,,,,,,,...,,,,,,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,antero-posterior,train,21800723,3.0
3,10000032,56699142,,,,,,,,,...,,,,,,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,antero-posterior,train,21800805,4.0
4,10000764,57375967,,,1.0,,,,,,...,,,-1.0,,,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4,antero-posterior,train,21321015,1.0
5,10000898,50771383,,,,,,,,,...,,,,,,2a280266-c8bae121-54d75383-cac046f4-ca37aa16,postero-anterior,train,21880312,2.0
6,10000898,54205396,,,,,,,,,...,,,,,,b75df1bd-0f22d631-52d73526-2ae7b85a-d843b39d,postero-anterior,train,21880113,1.0
7,10000935,50578979,,,,-1.0,,,,-1.0,...,1.0,,1.0,,,d0b71acc-b5a62046-bbb5f6b8-7b173b85-65cdf738,antero-posterior,train,21871016,6.0
8,10000935,51178377,,,,,,,,1.0,...,,,-1.0,,,9b314ad7-fbcb0422-6db62dfc-732858d0-a5527d8b,antero-posterior,train,21870823,4.0
9,10000935,55697293,,,,,,,,,...,,,,,,c50494f1-90e2bff5-e9189550-1a4562fd-6ab5204c,postero-anterior,train,21870226,2.0


In [4]:
def get_labels_as_string(study_row, label_columns):
    """
    Condense the positive findings of one study into a single string.

    Args:
        study_row: Mapping-like row (e.g. a pandas Series or a dict) keyed by
            label name.
        label_columns: Label names to inspect, in the order they should
            appear in the result.

    Returns:
        The names of all labels whose value equals 1.0, joined with ", ".
        When none is positive, "No Finding" is returned if that entry exists
        and equals 1.0; otherwise the result is an empty string.
    """

    def _is_positive(name):
        # A label counts only if it is present and exactly 1.0; missing
        # entries, NaN, 0.0 and -1.0 are all treated as "not positive".
        return name in study_row and study_row[name] == 1.0

    findings = [name for name in label_columns if _is_positive(name)]
    if findings:
        return ", ".join(findings)

    # Nothing positive among the requested labels: fall back to the
    # dedicated "No Finding" flag when it is set.
    if _is_positive("No Finding"):
        return "No Finding"

    return ""


def create_pairs_for_training(df: pd.DataFrame):
    """
    Creates study pairs with a separate column for each radiologist label,
    suffixed with _1 and _2. Ideal for machine learning.

    Studies are ordered by ``study_date`` within each ``subject_id`` and every
    study is paired with the one that immediately follows it, so a subject
    with n studies contributes n - 1 rows and a subject with a single study
    contributes none.

    Args:
        df: One row per study. Must contain the columns ``subject_id``,
            ``study_id``, ``dicom_id`` and ``study_date``. Label columns that
            are absent from ``df`` are filled with ``None``.

    Returns:
        A DataFrame with ``subject_id``, ``study_id_1``, ``dicom_id_1``,
        ``study_id_2``, ``dicom_id_2`` and, for every label, ``<label>_1`` and
        ``<label>_2``. The full set of columns is present even when no pairs
        could be formed.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    label_columns = [
        "Atelectasis",
        "Cardiomegaly",
        "Consolidation",
        "Edema",
        "Enlarged Cardiomediastinum",
        "Fracture",
        "Lung Lesion",
        "Lung Opacity",
        "No Finding",
        "Pleural Effusion",
        "Pleural Other",
        "Pneumonia",
        "Pneumothorax",
        "Support Devices",
    ]

    # Declared up front so that an input without any pairs still produces a
    # frame with the expected schema instead of a column-less DataFrame.
    output_columns = [
        "subject_id",
        "study_id_1",
        "dicom_id_1",
        "study_id_2",
        "dicom_id_2",
    ] + [f"{label}_{suffix}" for label in label_columns for suffix in (1, 2)]

    df_sorted = df.sort_values(by=["subject_id", "study_date"])
    paired_studies_list = []

    for subject, group in df_sorted.groupby("subject_id"):
        # One conversion per subject instead of one `.iloc` Series per row;
        # this also keeps each value's own column dtype.
        studies = group.to_dict("records")

        # Consecutive studies; yields nothing for single-study subjects.
        for study_1, study_2 in zip(studies, studies[1:]):
            paired_row = {
                "subject_id": subject,
                "study_id_1": study_1["study_id"],
                "dicom_id_1": study_1["dicom_id"],
                "study_id_2": study_2["study_id"],
                "dicom_id_2": study_2["dicom_id"],
            }

            for label in label_columns:
                # `.get` tolerates label columns missing from the input.
                paired_row[f"{label}_1"] = study_1.get(label)
                paired_row[f"{label}_2"] = study_2.get(label)

            paired_studies_list.append(paired_row)

    return pd.DataFrame(paired_studies_list, columns=output_columns)


def create_pairs_readble(df: pd.DataFrame):
    """
    Creates study pairs with radiologist labels condensed into single string
    columns. This makes the dataset human readable.

    Studies are ordered by ``study_date`` within each ``subject_id`` and every
    study is paired with the one that immediately follows it, so a subject
    with n studies contributes n - 1 rows and a subject with a single study
    contributes none.

    Args:
        df: One row per study. Must contain the columns ``subject_id``,
            ``study_id``, ``dicom_id`` and ``study_date`` (parsed with the
            ``%Y%m%d`` format).

    Returns:
        A DataFrame with the columns ``subject_id``, ``days_between_studies``,
        ``study_id_1``, ``dicom_id_1``, ``labels_1``, ``study_id_2``,
        ``dicom_id_2`` and ``labels_2``. The columns are present even when no
        pairs could be formed.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    # "No Finding" is intentionally not listed: get_labels_as_string falls
    # back to it on its own when no other label is positive.
    label_columns = [
        "Atelectasis",
        "Cardiomegaly",
        "Consolidation",
        "Edema",
        "Enlarged Cardiomediastinum",
        "Fracture",
        "Lung Lesion",
        "Lung Opacity",
        "Pleural Effusion",
        "Pleural Other",
        "Pneumonia",
        "Pneumothorax",
        "Support Devices",
    ]

    # Declared up front so that an input without any pairs still produces a
    # frame with the expected schema instead of a column-less DataFrame.
    output_columns = [
        "subject_id",
        "days_between_studies",
        "study_id_1",
        "dicom_id_1",
        "labels_1",
        "study_id_2",
        "dicom_id_2",
        "labels_2",
    ]

    df_sorted = df.sort_values(by=["subject_id", "study_date"])
    paired_studies_list = []

    for subject, group in df_sorted.groupby("subject_id"):
        # One conversion per subject instead of one `.iloc` Series per row.
        studies = group.to_dict("records")

        # Consecutive studies; yields nothing for single-study subjects.
        for study_1, study_2 in zip(studies, studies[1:]):
            date_1 = pd.to_datetime(study_1["study_date"], format="%Y%m%d")
            date_2 = pd.to_datetime(study_2["study_date"], format="%Y%m%d")
            days_diff = (date_2 - date_1).days

            paired_studies_list.append(
                {
                    "subject_id": subject,
                    "days_between_studies": days_diff,
                    "study_id_1": study_1["study_id"],
                    "dicom_id_1": study_1["dicom_id"],
                    # Positive findings condensed into one string per study.
                    "labels_1": get_labels_as_string(study_1, label_columns),
                    "study_id_2": study_2["study_id"],
                    "dicom_id_2": study_2["dicom_id"],
                    "labels_2": get_labels_as_string(study_2, label_columns),
                }
            )

    return pd.DataFrame(paired_studies_list, columns=output_columns)

In [5]:
# Build the human-readable pair table from only the first MAX_ROWS rows of
# the loaded table, then display it.
df_human_readable = create_pairs_readble(df_mimic_all.head(MAX_ROWS))
df_human_readable

Unnamed: 0,subject_id,days_between_studies,study_id_1,dicom_id_1,labels_1,study_id_2,dicom_id_2,labels_2
0,10000032,51,50414267,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,No Finding,53189527,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,No Finding
1,10000032,27,53189527,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,No Finding,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,No Finding
2,10000032,13,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,No Finding,56699142,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,No Finding
3,10000898,59,54205396,b75df1bd-0f22d631-52d73526-2ae7b85a-d843b39d,No Finding,50771383,2a280266-c8bae121-54d75383-cac046f4-ca37aa16,No Finding
4,10000935,211,56522600,f1adcae3-2921c0a8-5d9652f9-4191ecd7-f2a96f35,Fracture,55697293,c50494f1-90e2bff5-e9189550-1a4562fd-6ab5204c,No Finding
...,...,...,...,...,...,...,...,...
77,10002428,3,52460896,54c2ed5c-f4fbc20d-3bf4c783-283c3878-e9eb320d,"Pleural Effusion, Support Devices",56597576,d98d5096-a32483d3-669ed39b-bc81eafb-5e251563,"Lung Opacity, Pleural Effusion, Support Devices"
78,10002428,2,56597576,d98d5096-a32483d3-669ed39b-bc81eafb-5e251563,"Lung Opacity, Pleural Effusion, Support Devices",56836542,471418ab-0bfd6700-6bb770d0-07f7f6a3-2ce2d9c2,"Atelectasis, Edema, Pleural Effusion"
79,10002428,1,56836542,471418ab-0bfd6700-6bb770d0-07f7f6a3-2ce2d9c2,"Atelectasis, Edema, Pleural Effusion",50292543,7f51d06c-dad16fe6-ff21a632-edc5a3e6-ffa387bf,"Atelectasis, Edema, Lung Opacity, Pleural Effu..."
80,10002428,1,50292543,7f51d06c-dad16fe6-ff21a632-edc5a3e6-ffa387bf,"Atelectasis, Edema, Lung Opacity, Pleural Effu...",50444997,85b903ac-bfc4d6cd-491d2a72-18e7c16e-23d2cafa,"Atelectasis, Lung Opacity, Pleural Effusion, S..."
