In [1]:
import os
import pandas as pd
import tqdm
import regex as re

RANDOM_STATE = 123

  from pandas.core import (


In [2]:
def categorize_body_group(exam_type):
    """Map an exam description to a coarse body-region label.

    Every region owns a tuple of upper-case keywords, and a region matches
    when any of its keywords occurs as a substring of ``exam_type``. Regions
    are tried in the order listed below and the first match wins, so an exam
    naming both "HEAD" and "NECK" is labelled "Head", and any exam containing
    "CTA" is labelled "Chest" unless an earlier region already matched.

    Args:
        exam_type: Exam description string. Matching is case-sensitive.

    Returns:
        "Head", "Abdomen/Pelvis", "MSK", "Chest" or "Neck"; "Other" when no
        keyword is found.
    """
    region_keywords = (
        ("Head", ("HEAD", "BRAIN", "STROKE PROTOCOL", "NEURO")),
        ("Abdomen/Pelvis", ("ABDOMEN", "PELVIS", "PROSTATE", "RENAL", "MRCP", "UROGRAM")),
        ("MSK", ("KNEE", "HIP", "SHOULDER", "EXTREMITY", "SPINE", "LUMBAR",
                 "CERVICAL", "SCOLIOSIS", "JOINT")),
        ("Chest", ("CHEST", "CARDIAC", "PULMONARY EMBOLISM", "CTA", "HEART")),
        ("Neck", ("FACE", "NECK", "CRANIOFACIAL", "MAXILLOFACIAL", "ORBIT", "TEMPORAL BONE",
                  "SINUS", "THYROID", "MANDIBLE", "SKULL")),
    )

    for region, needles in region_keywords:
        for needle in needles:
            if needle in exam_type:
                return region
    return "Other"

def categorize_imaging_modality(exam_type):
    """Map an exam description to an imaging-modality label.

    The description is split into alphabetic words (any non-letter acts as a
    separator) and a modality matches when some word *starts with* one of its
    codes. Plain substring matching is deliberately avoided: ordinary words
    such as "OBSTRUCTION", "INJECTION" or "TRANSRECTAL" contain "CT", and
    "SINUS" or "HUMERUS" contain "US", which would mislabel XR/US exams.
    Prefix matching still accepts the usual variants ("MRA", "MRCP", "CTA").
    A code embedded in the middle of a word no longer matches.

    Modalities are tried in the order MRI, CT, XR, US; the first match wins.

    Args:
        exam_type: Exam description string. Matching is case-sensitive
            against upper-case codes.

    Returns:
        "MRI", "CT", "XR" or "US"; "Other" when no word starts with a
        known modality code.
    """
    categories = {
        "MRI": ["MR", "MRI"],
        "CT": ["CT", "CTA"],
        "XR": ["XR"],
        "US": ["US"]
    }

    # Replace every non-letter with a space so "PET/CT" yields ["PET", "CT"].
    words = "".join(ch if ch.isalpha() else " " for ch in exam_type).split()

    for category, keywords in categories.items():
        if any(word.startswith(keyword) for word in words for keyword in keywords):
            return category
    return "Other"

# LLM Balanced Test Dataset

In [48]:
# Load the balanced LLM test table from its parquet file.
llm_balanced_test_dataset = pd.read_parquet(
    path="llm_balanced_test_dataset.parquet",
)

In [49]:
# Total number of rows in the table.
llm_balanced_test_dataset.shape[0]

2350575

In [50]:
# Number of distinct patient keys (missing keys are not counted).
llm_balanced_test_dataset["patientdurablekey"].nunique(dropna=True)

8695

In [52]:
# Number of rows that carry a non-missing exam_type.
int(llm_balanced_test_dataset["exam_type"].notna().sum())

10000

In [55]:
# Row count per body_group label (rows without a label are excluded).
llm_balanced_test_dataset.loc[:, "body_group"].value_counts()

body_group
Abdomen/Pelvis    2000
Chest             2000
Head              2000
MSK               2000
Neck              2000
Name: count, dtype: int64

In [51]:
# Row count per imaging_modality label (rows without a label are excluded).
llm_balanced_test_dataset.loc[:, "imaging_modality"].value_counts()

imaging_modality
CT     6960
MRI    2853
US      125
XR       62
Name: count, dtype: int64

In [53]:
# Same breakdown expressed as a percentage of labelled rows.
llm_balanced_test_dataset["imaging_modality"].value_counts(normalize=True).mul(100)

imaging_modality
CT     69.60
MRI    28.53
US      1.25
XR      0.62
Name: proportion, dtype: float64

In [None]:
# Load the processed (one row per report) version of the balanced test table.
llm_balanced_test_dataset_processed = pd.read_parquet(
    path="llm_balanced_test_dataset_processed.parquet",
)

In [11]:
# Sum of the lengths of every note_texts_full entry, plus one per row.
llm_balanced_test_dataset_processed["note_texts_full"].map(len).sum() + llm_balanced_test_dataset_processed.shape[0]

98298

# Clinical Indication Dataset without Redactions

In [12]:
# Read the six parquet shards (numbered 1..6) into a separate list so the
# final name is only ever bound to a DataFrame, never to a list.
clinical_indication_shards = [
    pd.read_parquet(f"clinical_indication_dataset_without_redactions_shard{i}.parquet")
    for i in tqdm.tqdm(range(1, 7))
]
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# NOTE(review): assumes the shards' own index labels carry no meaning; if each
# shard was written with its own index, plain concat would repeat labels and
# make later label-based operations (e.g. DataFrame.drop by index) ambiguous.
clinical_indication_dataset_redactions = pd.concat(clinical_indication_shards, ignore_index=True)

100%|█████████████████████████████████████████████████████████████████████████████| 6/6 [02:54<00:00, 29.06s/it]


In [13]:
# Total number of rows across all shards.
clinical_indication_dataset_redactions.shape[0]

6382247

In [14]:
# Number of distinct patient keys (missing keys are not counted).
clinical_indication_dataset_redactions["patientdurablekey"].nunique(dropna=True)

34543

In [16]:
# Number of rows that carry a non-missing exam_type.
int(clinical_indication_dataset_redactions["exam_type"].notna().sum())

61051

In [17]:
# Label only the rows that have an exam_type; all other rows keep a missing
# body_group / imaging_modality.
has_exam_type = clinical_indication_dataset_redactions["exam_type"].notna()
exam_types = clinical_indication_dataset_redactions.loc[has_exam_type, "exam_type"]
clinical_indication_dataset_redactions.loc[has_exam_type, "body_group"] = exam_types.apply(categorize_body_group)
clinical_indication_dataset_redactions.loc[has_exam_type, "imaging_modality"] = exam_types.apply(categorize_imaging_modality)

In [18]:
# Row count per body_group label (unlabelled rows are excluded).
clinical_indication_dataset_redactions.loc[:, "body_group"].value_counts()

body_group
Head              36598
MSK                8312
Neck               7786
Abdomen/Pelvis     4199
Chest              3422
Other               734
Name: count, dtype: int64

In [20]:
# Row count per imaging_modality label (unlabelled rows are excluded).
clinical_indication_dataset_redactions.loc[:, "imaging_modality"].value_counts()

imaging_modality
CT       33067
MRI      26902
US         447
Other      384
XR         251
Name: count, dtype: int64

# Reader Study

In [56]:
NUM_PER_BODY_GROUP = 24

# Candidate reports: rows with an exam_type whose body group is not "Other".
has_exam_type = clinical_indication_dataset_redactions["exam_type"].notna()
is_known_body_group = clinical_indication_dataset_redactions["body_group"].ne("Other")
radiology_reports = clinical_indication_dataset_redactions.loc[has_exam_type & is_known_body_group]

# Draw the same number of reports from every body group, seeded for repeatability.
reader_study_sampled_reports = radiology_reports.groupby("body_group").sample(
    n=NUM_PER_BODY_GROUP,
    random_state=RANDOM_STATE,
)

In [57]:
# For every sampled report, collect the report row followed by that patient's
# non-imaging notes dated strictly before the report, newest first.
reader_study_total_frames = []
for i in tqdm.tqdm(range(len(reader_study_sampled_reports))):
    radiology_report = reader_study_sampled_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn].copy()
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) &
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    reader_study_total_frames.append(radiology_report.to_frame().T)
    reader_study_total_frames.append(filtered_patient_notes)

reader_study_total_dataset = pd.concat(reader_study_total_frames)

# This cell summarises ALL sampled reports, so it is titled as the total
# dataset rather than reusing the evaluation-dataset heading.
print("Clinical Reader Study Total Dataset")
print("="*20)
print("n_notes", len(reader_study_total_dataset))
print("n_patients", reader_study_total_dataset["patientdurablekey"].nunique())
print("n_radiology reports", len(reader_study_total_dataset[~reader_study_total_dataset["exam_type"].isna()]))

100%|█████████████████████████████████████████████████████████████████████████| 120/120 [00:37<00:00,  3.21it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 26404
n_patients 120
n_radiology reports 120





In [69]:
# Per body group, a fixed number of the sampled reports goes to rater
# calibration and every remaining sampled report goes to model evaluation.
NUM_CALIBRATION_PER_BODY_GROUP = 4
NUM_EVALUATION_PER_BODY_GROUP = NUM_PER_BODY_GROUP - NUM_CALIBRATION_PER_BODY_GROUP

reader_study_calibration_dataset_reports = reader_study_sampled_reports.groupby('body_group').sample(
    n=NUM_CALIBRATION_PER_BODY_GROUP, random_state=RANDOM_STATE
)
# Remove the calibration rows by index label, then draw the evaluation reports
# from what is left.
reader_study_evaluation_dataset_reports = (
    reader_study_sampled_reports
    .drop(reader_study_calibration_dataset_reports.index)
    .groupby('body_group')
    .sample(n=NUM_EVALUATION_PER_BODY_GROUP, random_state=RANDOM_STATE)
)

# The two subsets must not overlap and must together use every sampled report.
assert reader_study_calibration_dataset_reports.index.intersection(
    reader_study_evaluation_dataset_reports.index
).empty, "calibration and evaluation reports overlap"
assert (
    len(reader_study_calibration_dataset_reports) + len(reader_study_evaluation_dataset_reports)
    == len(reader_study_sampled_reports)
), "calibration + evaluation reports do not cover the sampled reports"

In [70]:
# Body-group and modality mix of the calibration reports.
for column in ("body_group", "imaging_modality"):
    print(reader_study_calibration_dataset_reports[column].value_counts())

body_group
Abdomen/Pelvis    4
Chest             4
Head              4
MSK               4
Neck              4
Name: count, dtype: int64
imaging_modality
CT     18
MRI     2
Name: count, dtype: int64


In [71]:
# Body-group and modality mix of the evaluation reports.
for column in ("body_group", "imaging_modality"):
    print(reader_study_evaluation_dataset_reports[column].value_counts())

body_group
Abdomen/Pelvis    20
Chest             20
Head              20
MSK               20
Neck              20
Name: count, dtype: int64
imaging_modality
CT     77
MRI    20
XR      2
US      1
Name: count, dtype: int64


In [72]:
# For every evaluation report, collect the report row followed by that
# patient's non-imaging notes dated strictly before the report, newest first.
reader_study_evaluation_dataset = []
n_evaluation_reports = len(reader_study_evaluation_dataset_reports)
for i in tqdm.tqdm(range(n_evaluation_reports)):
    radiology_report = reader_study_evaluation_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]

    is_same_patient = clinical_indication_dataset_redactions["patientdurablekey"].eq(patient_mrn)
    patient_notes = clinical_indication_dataset_redactions.loc[is_same_patient].copy()

    is_before_report = patient_notes["deid_service_date"].lt(radiology_report["deid_service_date"])
    is_not_imaging = patient_notes["note_type"].ne("Imaging")
    filtered_patient_notes = (
        patient_notes.loc[is_before_report & is_not_imaging]
        .sort_values(by=["deid_service_date"], ascending=False)
        .reset_index(drop=True)
    )

    reader_study_evaluation_dataset.extend(
        [radiology_report.to_frame().T, filtered_patient_notes]
    )

reader_study_evaluation_dataset = pd.concat(reader_study_evaluation_dataset)

# Summary: total rows, distinct patients, and rows that are radiology reports
# (identified by a non-missing exam_type).
n_evaluation_radiology_reports = int(reader_study_evaluation_dataset["exam_type"].notna().sum())
print("Clinical Reader Study Model Evaluation Dataset")
print("="*20)
print("n_notes", len(reader_study_evaluation_dataset))
print("n_patients", reader_study_evaluation_dataset["patientdurablekey"].nunique())
print("n_radiology reports", n_evaluation_radiology_reports)

100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:30<00:00,  3.23it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 22233
n_patients 100
n_radiology reports 100





In [73]:
# For every calibration report, collect the report row followed by that
# patient's non-imaging notes dated strictly before the report, newest first.
reader_study_calibration_dataset = []
n_calibration_reports = len(reader_study_calibration_dataset_reports)
for i in tqdm.tqdm(range(n_calibration_reports)):
    radiology_report = reader_study_calibration_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]

    is_same_patient = clinical_indication_dataset_redactions["patientdurablekey"].eq(patient_mrn)
    patient_notes = clinical_indication_dataset_redactions.loc[is_same_patient].copy()

    is_before_report = patient_notes["deid_service_date"].lt(radiology_report["deid_service_date"])
    is_not_imaging = patient_notes["note_type"].ne("Imaging")
    filtered_patient_notes = (
        patient_notes.loc[is_before_report & is_not_imaging]
        .sort_values(by=["deid_service_date"], ascending=False)
        .reset_index(drop=True)
    )

    reader_study_calibration_dataset.extend(
        [radiology_report.to_frame().T, filtered_patient_notes]
    )

reader_study_calibration_dataset = pd.concat(reader_study_calibration_dataset)

# Summary: total rows, distinct patients, and rows that are radiology reports
# (identified by a non-missing exam_type).
n_calibration_radiology_reports = int(reader_study_calibration_dataset["exam_type"].notna().sum())
print("Clinical Reader Study Rater Calibration Dataset")
print("="*20)
print("n_notes", len(reader_study_calibration_dataset))
print("n_patients", reader_study_calibration_dataset["patientdurablekey"].nunique())
print("n_radiology reports", n_calibration_radiology_reports)

100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:06<00:00,  3.29it/s]

Clinical Reader Study Rater Calibration Dataset
n_notes 4171
n_patients 20
n_radiology reports 20





In [74]:
def _has_value(value):
    """Return True when ``value`` is present: not None/NaN/NaT/NA and truthy."""
    if value is None:
        return False
    try:
        if pd.isna(value):
            return False
    except (TypeError, ValueError):
        # pd.isna returns an array for list-likes, which has no single truth
        # value; fall through to plain truthiness in that case.
        pass
    return bool(value)


def generate_note_title(enc_dept_name, note_type, auth_prov_type, deid_service_date):
    """Build a display title for a clinical note from whichever fields are present.

    A field counts as missing when it is None, NaN, NaT, pd.NA or otherwise
    falsy (e.g. an empty string). Plain truthiness is not enough here because
    NaN is truthy and would leak into titles as the text "nan".

    Args:
        enc_dept_name: Encounter department name.
        note_type: Note type.
        auth_prov_type: Author provider type.
        deid_service_date: Service date; formatted with its default ``str``.

    Returns:
        The most detailed of these forms that the present fields allow:
        "<dept> - <type> (<provider>) | <date>", "<dept> - <type> | <date>",
        "<type> | <date>", "Clinical Note | <date>", or "Clinical Note" when
        the date itself is missing.
    """
    # The date is required by every dated form; without it only the generic
    # title is possible.
    if not _has_value(deid_service_date):
        return "Clinical Note"
    if not _has_value(note_type):
        return f"Clinical Note | {deid_service_date}"
    if not _has_value(enc_dept_name):
        return f"{note_type} | {deid_service_date}"
    if _has_value(auth_prov_type):
        return f"{enc_dept_name} - {note_type} ({auth_prov_type}) | {deid_service_date}"
    return f"{enc_dept_name} - {note_type} | {deid_service_date}"

In [75]:
NUM_NOTES = 10  # number of most recent notes kept in the truncated per-report fields

# Collect one plain dict per report and build the DataFrame once at the end,
# instead of growing it row by row with .loc[len(df)].
calibration_records = []
for i in tqdm.tqdm(range(len(reader_study_calibration_dataset_reports))):
    radiology_report = reader_study_calibration_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[
        clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    ].copy()
    # Context notes: same patient, dated strictly before the report, not
    # imaging notes, newest first.
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) &
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    # Built with zip rather than DataFrame.apply(axis=1): on zero rows apply
    # returns an empty DataFrame, which cannot be assigned as a column.
    filtered_patient_notes["note_title"] = [
        generate_note_title(enc_dept_name, note_type, auth_prov_type, deid_service_date)
        for enc_dept_name, note_type, auth_prov_type, deid_service_date in zip(
            filtered_patient_notes["enc_dept_name"],
            filtered_patient_notes["note_type"],
            filtered_patient_notes["auth_prov_type"],
            filtered_patient_notes["deid_service_date"],
        )
    ]

    # Series.tolist() always yields a list. The earlier
    # df[[col]].squeeze().tolist() collapsed to a scalar when a patient had
    # exactly one prior note and then failed on .tolist().
    recent_notes = filtered_patient_notes.head(NUM_NOTES)
    calibration_records.append({
        "patientdurablekey": radiology_report["patientdurablekey"],
        "exam_type": radiology_report["exam_type"],
        "imaging_modality": radiology_report["imaging_modality"],
        "body_group": radiology_report["body_group"],
        "report_text": radiology_report["note_text"],
        "original_indication": radiology_report["original_indication"],
        "radiologist_indication": radiology_report["radiologist_indication"],
        "enc_dept_names": recent_notes["enc_dept_name"].tolist(),
        "note_types": recent_notes["note_type"].tolist(),
        "auth_prov_types": recent_notes["auth_prov_type"].tolist(),
        "deid_service_dates": recent_notes["deid_service_date"].tolist(),
        "note_titles": recent_notes["note_title"].tolist(),
        "note_texts": recent_notes["note_text"].tolist(),
        # Untruncated: every qualifying note, not only the NUM_NOTES most recent.
        "note_texts_full": filtered_patient_notes["note_text"].tolist(),
    })

# Passing columns= fixes the column order and keeps the schema even when
# there are no records.
reader_study_calibration_dataset_processed = pd.DataFrame(
    calibration_records,
    columns=[
        "patientdurablekey",
        "exam_type",
        "imaging_modality",
        "body_group",
        "report_text",
        "original_indication",
        "radiologist_indication",
        "enc_dept_names",
        "note_types",
        "auth_prov_types",
        "deid_service_dates",
        "note_titles",
        "note_texts",
        "note_texts_full",
    ],
)
reader_study_calibration_dataset_processed.to_parquet("reader_study_calibration_dataset_processed.parquet")

100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:06<00:00,  3.08it/s]


In [76]:
# Preview the first five processed calibration rows.
reader_study_calibration_dataset_processed.head(n=5)

Unnamed: 0,patientdurablekey,exam_type,imaging_modality,body_group,report_text,original_indication,radiologist_indication,enc_dept_names,note_types,auth_prov_types,deid_service_dates,note_titles,note_texts,note_texts_full
0,D6180CD2E0466F,CT ABDOMEN/PELVIS WITH CONTRAST,CT,Abdomen/Pelvis,CT ABDOMEN/PELVIS WITH CONTRAST 09/05/2017 ...,"Abdominal pain, left sided, diarrhea and ? fever.",history of smoking,"[EMERGENCY DEPT PARN, ZZZOPHTH CORNEA PARN, ZZ...","[ED Provider Notes, Progress Notes, Assessment...","[Physician, Physician, Physician, Physician, P...","[2017-09-05 15:50:00, 2017-04-11 00:00:00, 201...",[EMERGENCY DEPT PARN - ED Provider Notes (Phys...,[ ED First Attending History Chief ...,[ ED First Attending History Chief ...
1,D467EBAF5821FB,CT ABDOMEN/PELVIS WITH AND WITHOUT CONTRAST QU...,CT,Abdomen/Pelvis,CT ABDOMEN/PELVIS WITH AND WITHOUT CONTRAST QU...,"leukocytosis eval for GI infection, new cirrho...","History of chronic alcohol abuse, and TBI in 2...","[11NE NICU, 11NE NICU, 11NE NICU, 11NE NICU, 1...","[Consults, Progress Notes, Interdisciplinary, ...","[Pharmacist, Physician, Social Worker, Physici...","[2017-11-29 16:46:00, 2017-11-29 15:26:00, 201...",[11NE NICU - Consults (Pharmacist) | 2017-11-2...,[ ANTIMICROBIAL STEWARDSHIP CONSULT Thi...,[ ANTIMICROBIAL STEWARDSHIP CONSULT Thi...
2,DF0FE5DDF1E041,CT ABDOMEN/PELVIS WITH CONTRAST,CT,Abdomen/Pelvis,CT ABDOMEN/PELVIS WITH CONTRAST 01/12/2024 ...,vomiting.,"Atrial fibrillation, renal cell carcinoma stat...","[EMERGENCY DEPT PARN, EMERGENCY DEPT PARN, EME...","[Discharge Instructions, Progress Notes, H&P, ...","[Resident, Resident, Resident, Resident, Regis...","[2024-01-12 10:54:00, 2024-01-12 09:36:00, 202...",[EMERGENCY DEPT PARN - Discharge Instructions ...,[You were seen in the ER for evaluation of nau...,[You were seen in the ER for evaluation of nau...
3,D030EA8C4DC22D,CT ABDOMEN/PELVIS WITH AND WITHOUT CONTRAST QU...,CT,Abdomen/Pelvis,CT ABDOMEN/PELVIS WITH AND WITHOUT CONTRAST QU...,metastatic sigmoid colon ca to liver,Hepatic arterial infusion pump planning.,"[GI SURG ONC MB 4, ZZZGENERIC EXTERNAL DATA DE...","[Progress Notes, H&P, Progress Notes, Progress...","[Nurse Practitioner, None, None, None, None, N...","[2023-11-18 00:00:00, 2023-11-18 00:00:00, 202...",[GI SURG ONC MB 4 - Progress Notes (Nurse Prac...,[GI Surgical Oncology Follow Up Referri...,[GI Surgical Oncology Follow Up Referri...
4,DB5F48897682E7,CT ANGIOGRAM CHEST,CT,Chest,CT ANGIOGRAM CHEST 04/17/2023 8:56 PM CL...,"Chest pain radiating to the back, evaluate for...",PMH of CAD s/p 2-vessel CABG in 2019,"[EMERGENCY DEPT PARN, EMERGENCY DEPT PARN, EME...","[Discharge Instructions, ED Provider Notes, ED...","[Resident, Resident, Physician Assistant, Resi...","[2023-04-17 22:35:00, 2023-04-17 16:50:00, 202...",[EMERGENCY DEPT PARN - Discharge Instructions ...,[You were evaluated in the Emergency Departmen...,[You were evaluated in the Emergency Departmen...


In [77]:
NUM_NOTES = 10  # number of most recent notes kept in the truncated per-report fields

# Collect one plain dict per report and build the DataFrame once at the end,
# instead of growing it row by row with .loc[len(df)].
evaluation_records = []
for i in tqdm.tqdm(range(len(reader_study_evaluation_dataset_reports))):
    radiology_report = reader_study_evaluation_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[
        clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    ].copy()
    # Context notes: same patient, dated strictly before the report, not
    # imaging notes, newest first.
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) &
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    # Built with zip rather than DataFrame.apply(axis=1): on zero rows apply
    # returns an empty DataFrame, which cannot be assigned as a column.
    filtered_patient_notes["note_title"] = [
        generate_note_title(enc_dept_name, note_type, auth_prov_type, deid_service_date)
        for enc_dept_name, note_type, auth_prov_type, deid_service_date in zip(
            filtered_patient_notes["enc_dept_name"],
            filtered_patient_notes["note_type"],
            filtered_patient_notes["auth_prov_type"],
            filtered_patient_notes["deid_service_date"],
        )
    ]

    # Series.tolist() always yields a list. The earlier
    # df[[col]].squeeze().tolist() collapsed to a scalar when a patient had
    # exactly one prior note and then failed on .tolist().
    recent_notes = filtered_patient_notes.head(NUM_NOTES)
    evaluation_records.append({
        "patientdurablekey": radiology_report["patientdurablekey"],
        "exam_type": radiology_report["exam_type"],
        "imaging_modality": radiology_report["imaging_modality"],
        "body_group": radiology_report["body_group"],
        "report_text": radiology_report["note_text"],
        "original_indication": radiology_report["original_indication"],
        "radiologist_indication": radiology_report["radiologist_indication"],
        "enc_dept_names": recent_notes["enc_dept_name"].tolist(),
        "note_types": recent_notes["note_type"].tolist(),
        "auth_prov_types": recent_notes["auth_prov_type"].tolist(),
        "deid_service_dates": recent_notes["deid_service_date"].tolist(),
        "note_titles": recent_notes["note_title"].tolist(),
        "note_texts": recent_notes["note_text"].tolist(),
        # Untruncated: every qualifying note, not only the NUM_NOTES most recent.
        "note_texts_full": filtered_patient_notes["note_text"].tolist(),
    })

# Passing columns= fixes the column order and keeps the schema even when
# there are no records.
reader_study_evaluation_dataset_processed = pd.DataFrame(
    evaluation_records,
    columns=[
        "patientdurablekey",
        "exam_type",
        "imaging_modality",
        "body_group",
        "report_text",
        "original_indication",
        "radiologist_indication",
        "enc_dept_names",
        "note_types",
        "auth_prov_types",
        "deid_service_dates",
        "note_titles",
        "note_texts",
        "note_texts_full",
    ],
)
reader_study_evaluation_dataset_processed.to_parquet("reader_study_evaluation_dataset_processed.parquet")

100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:31<00:00,  3.20it/s]


In [78]:
# Number of context notes across all calibration rows, plus one per row for the report itself.
reader_study_calibration_dataset_processed["note_texts_full"].map(len).sum() + reader_study_calibration_dataset_processed.shape[0]

4171

In [79]:
# Number of context notes across all evaluation rows, plus one per row for the report itself.
reader_study_evaluation_dataset_processed["note_texts_full"].map(len).sum() + reader_study_evaluation_dataset_processed.shape[0]

22233