In [60]:
import os
import pandas as pd
import tqdm
import regex as re

RANDOM_STATE = 123

## Clinical Indication Dataset

In [22]:
clinical_indication_dataset = []
for i in tqdm.tqdm(range(1, 7)):
    clinical_indication_dataset.append(pd.read_parquet(f"clinical_indication_dataset_shard{i}.parquet"))
clinical_indication_dataset = pd.concat(clinical_indication_dataset)

100%|█████████████████████████████████████████████████████████████████████████████| 6/6 [04:12<00:00, 42.09s/it]


In [31]:
len(clinical_indication_dataset)

9745792

In [23]:
clinical_indication_dataset["patientdurablekey"].nunique()

57609

In [24]:
def categorize_body_group(exam_type):
    categories = {
        "Head": ["HEAD", "BRAIN", "STROKE PROTOCOL", "NEURO"],
        "Abdomen/Pelvis": ["ABDOMEN", "PELVIS", "PROSTATE", "RENAL", "MRCP", "UROGRAM"],
        "MSK": ["KNEE", "HIP", "SHOULDER", "EXTREMITY", "SPINE", "LUMBAR", 
                "CERVICAL", "SCOLIOSIS", "JOINT"],
        "Chest": ["CHEST", "CARDIAC", "PULMONARY EMBOLISM", "CTA", "HEART"],
        "Neck": ["FACE", "NECK", "CRANIOFACIAL", "MAXILLOFACIAL", "ORBIT", "TEMPORAL BONE", 
                 "SINUS", "THYROID", "MANDIBLE", "SKULL"]
    }
    
    for category, keywords in categories.items():
        if any(keyword in exam_type for keyword in keywords):
            return category
    return "Other"

def categorize_imaging_modality(exam_type):
    categories = {
        "MRI": ["MR", "MRI"],
        "CT": ["CT", "CTA"],
        "XR": ["XR"],
        "US": ["US"]
    }
    
    for category, keywords in categories.items():
        if any(keyword in exam_type for keyword in keywords):
            return category
    return "Other"

clinical_indication_dataset.loc[clinical_indication_dataset["exam_type"].notna(), "body_group"] = clinical_indication_dataset.loc[clinical_indication_dataset["exam_type"].notna()]["exam_type"].apply(categorize_body_group)
clinical_indication_dataset.loc[clinical_indication_dataset["exam_type"].notna(), "imaging_modality"] = clinical_indication_dataset.loc[clinical_indication_dataset["exam_type"].notna()]["exam_type"].apply(categorize_imaging_modality)

In [30]:
clinical_indication_dataset["body_group"].value_counts() 

body_group
Head              81805
Neck              19426
MSK               16747
Abdomen/Pelvis     8443
Chest              6490
Other              1792
Name: count, dtype: int64

In [25]:
clinical_indication_dataset["body_group"].value_counts(normalize=True) * 100

body_group
Head              60.729902
Neck              14.421357
MSK               12.432537
Abdomen/Pelvis     6.267863
Chest              4.818007
Other              1.330334
Name: proportion, dtype: float64

In [26]:
clinical_indication_dataset["imaging_modality"].value_counts(normalize=True) * 100

imaging_modality
CT       52.489551
MRI      45.736175
Other     0.729754
US        0.637699
XR        0.406821
Name: proportion, dtype: float64

## Clinical Indication Dataset without Redactions

In [27]:
clinical_indication_dataset_redactions = []
for i in tqdm.tqdm(range(1, 7)):
    clinical_indication_dataset_redactions.append(pd.read_parquet(f"clinical_indication_dataset_without_redactions_shard{i}.parquet"))
clinical_indication_dataset_redactions = pd.concat(clinical_indication_dataset_redactions)

100%|█████████████████████████████████████████████████████████████████████████████| 6/6 [02:55<00:00, 29.20s/it]


In [28]:
len(clinical_indication_dataset_redactions)

6391890

In [29]:
clinical_indication_dataset_redactions["patientdurablekey"].nunique()

37216

In [32]:
clinical_indication_dataset_redactions.loc[clinical_indication_dataset_redactions["exam_type"].notna(), "body_group"] = clinical_indication_dataset_redactions.loc[clinical_indication_dataset_redactions["exam_type"].notna()]["exam_type"].apply(categorize_body_group)
clinical_indication_dataset_redactions.loc[clinical_indication_dataset_redactions["exam_type"].notna(), "imaging_modality"] = clinical_indication_dataset_redactions.loc[clinical_indication_dataset_redactions["exam_type"].notna()]["exam_type"].apply(categorize_imaging_modality)

In [33]:
clinical_indication_dataset_redactions["body_group"].value_counts()

body_group
Head              38726
MSK                8737
Neck               8538
Abdomen/Pelvis     4373
Chest              3533
Other               819
Name: count, dtype: int64

In [34]:
clinical_indication_dataset_redactions["body_group"].value_counts(normalize=True)

body_group
Head              0.598307
MSK               0.134984
Neck              0.131910
Abdomen/Pelvis    0.067562
Chest             0.054584
Other             0.012653
Name: proportion, dtype: float64

In [37]:
clinical_indication_dataset_redactions["imaging_modality"].value_counts(normalize=True) * 100

imaging_modality
CT       54.770880
MRI      43.339616
US        0.818836
Other     0.633439
XR        0.437228
Name: proportion, dtype: float64

## Stratified Random Sampling (Reader Study)

In [82]:
radiology_reports = clinical_indication_dataset_redactions[
    (~clinical_indication_dataset_redactions["exam_type"].isna()) & 
    (clinical_indication_dataset_redactions["body_group"] != "Other")
]
reader_study_sampled_reports = radiology_reports.groupby('body_group').sample(n=22, random_state=RANDOM_STATE)

In [91]:
reader_study_total_dataset = []
for i in tqdm.tqdm(range(len(reader_study_sampled_reports))):
    radiology_report = reader_study_sampled_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn].copy()    
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    reader_study_total_dataset.append(radiology_report.to_frame().T)
    reader_study_total_dataset.append(filtered_patient_notes)

reader_study_total_dataset = pd.concat(reader_study_total_dataset)

print("Clinical Reader Study Model Evaluation Dataset")
print("="*20)
print("n_notes", len(reader_study_total_dataset))
print("n_patients", reader_study_total_dataset["patientdurablekey"].nunique())
print("n_radiology reports", len(reader_study_total_dataset[~reader_study_total_dataset["exam_type"].isna()]))

100%|█████████████████████████████████████████████████████████████████████████| 110/110 [00:35<00:00,  3.08it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 21192
n_patients 110
n_radiology reports 110





In [83]:
reader_study_calibration_dataset_reports = reader_study_sampled_reports.groupby('body_group').sample(n=2, random_state=RANDOM_STATE)
reader_study_evaluation_dataset_reports = reader_study_sampled_reports.drop(reader_study_calibration_dataset_reports.index).groupby('body_group').sample(n=20, random_state=RANDOM_STATE)

In [84]:
print(reader_study_calibration_dataset_reports["body_group"].value_counts())
print(reader_study_calibration_dataset_reports["imaging_modality"].value_counts())

body_group
Abdomen/Pelvis    2
Chest             2
Head              2
MSK               2
Neck              2
Name: count, dtype: int64
imaging_modality
CT     7
MRI    3
Name: count, dtype: int64


In [85]:
print(reader_study_evaluation_dataset_reports["body_group"].value_counts())
print(reader_study_evaluation_dataset_reports["imaging_modality"].value_counts())

body_group
Abdomen/Pelvis    20
Chest             20
Head              20
MSK               20
Neck              20
Name: count, dtype: int64
imaging_modality
CT     73
MRI    24
US      2
XR      1
Name: count, dtype: int64


In [86]:
reader_study_evaluation_dataset = []
for i in tqdm.tqdm(range(len(reader_study_evaluation_dataset_reports))):
    radiology_report = reader_study_evaluation_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn].copy()    
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    reader_study_evaluation_dataset.append(radiology_report.to_frame().T)
    reader_study_evaluation_dataset.append(filtered_patient_notes)

reader_study_evaluation_dataset = pd.concat(reader_study_evaluation_dataset)

print("Clinical Reader Study Model Evaluation Dataset")
print("="*20)
print("n_notes", len(reader_study_evaluation_dataset))
print("n_patients", reader_study_evaluation_dataset["patientdurablekey"].nunique())
print("n_radiology reports", len(reader_study_evaluation_dataset[~reader_study_evaluation_dataset["exam_type"].isna()]))

100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:30<00:00,  3.31it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 18645
n_patients 100
n_radiology reports 100





In [87]:
reader_study_calibration_dataset = []
for i in tqdm.tqdm(range(len(reader_study_calibration_dataset_reports))):
    radiology_report = reader_study_calibration_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn].copy()    
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    reader_study_calibration_dataset.append(radiology_report.to_frame().T)
    reader_study_calibration_dataset.append(filtered_patient_notes)

reader_study_calibration_dataset = pd.concat(reader_study_calibration_dataset)

print("Clinical Reader Study Model Evaluation Dataset")
print("="*20)
print("n_notes", len(reader_study_calibration_dataset))
print("n_patients", reader_study_calibration_dataset["patientdurablekey"].nunique())
print("n_radiology reports", len(reader_study_calibration_dataset[~reader_study_calibration_dataset["exam_type"].isna()]))

100%|███████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  3.31it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 2547
n_patients 10
n_radiology reports 10



