In [2]:
import os
import pandas as pd
import tqdm
import regex as re

RANDOM_STATE = 123

  from pandas.core import (


In [None]:
def categorize_body_group(exam_type):
    categories = {
        "Head": ["HEAD", "BRAIN", "STROKE PROTOCOL", "NEURO"],
        "Abdomen/Pelvis": ["ABDOMEN", "PELVIS", "PROSTATE", "RENAL", "MRCP", "UROGRAM"],
        "MSK": ["KNEE", "HIP", "SHOULDER", "EXTREMITY", "SPINE", "LUMBAR", 
                "CERVICAL", "SCOLIOSIS", "JOINT"],
        "Chest": ["CHEST", "CARDIAC", "PULMONARY EMBOLISM", "CTA", "HEART"],
        "Neck": ["FACE", "NECK", "CRANIOFACIAL", "MAXILLOFACIAL", "ORBIT", "TEMPORAL BONE", 
                 "SINUS", "THYROID", "MANDIBLE", "SKULL"]
    }
    
    for category, keywords in categories.items():
        if any(keyword in exam_type for keyword in keywords):
            return category
    return "Other"

def categorize_imaging_modality(exam_type):
    categories = {
        "MRI": ["MR", "MRI"],
        "CT": ["CT", "CTA"],
        "XR": ["XR"],
        "US": ["US"]
    }
    
    for category, keywords in categories.items():
        if any(keyword in exam_type for keyword in keywords):
            return category
    return "Other"

## LLM Balanced Test Dataset

In [3]:
llm_balanced_test_dataset_processed = pd.read_parquet("llm_balanced_test_dataset_processed.parquet")
llm_balanced_test_dataset_processed["note_texts"].apply(len).min()

# Clinical Indication Dataset without Redactions

In [27]:
clinical_indication_dataset_redactions = []
for i in tqdm.tqdm(range(1, 7)):
    clinical_indication_dataset_redactions.append(pd.read_parquet(f"clinical_indication_dataset_without_redactions_shard{i}.parquet"))
clinical_indication_dataset_redactions = pd.concat(clinical_indication_dataset_redactions)

100%|█████████████████████████████████████████████████████████████████████████████| 6/6 [02:55<00:00, 29.20s/it]


In [28]:
len(clinical_indication_dataset_redactions)

6391890

In [29]:
clinical_indication_dataset_redactions["patientdurablekey"].nunique()

37216

In [32]:
clinical_indication_dataset_redactions.loc[clinical_indication_dataset_redactions["exam_type"].notna(), "body_group"] = clinical_indication_dataset_redactions.loc[clinical_indication_dataset_redactions["exam_type"].notna()]["exam_type"].apply(categorize_body_group)
clinical_indication_dataset_redactions.loc[clinical_indication_dataset_redactions["exam_type"].notna(), "imaging_modality"] = clinical_indication_dataset_redactions.loc[clinical_indication_dataset_redactions["exam_type"].notna()]["exam_type"].apply(categorize_imaging_modality)

In [33]:
clinical_indication_dataset_redactions["body_group"].value_counts()

body_group
Head              38726
MSK                8737
Neck               8538
Abdomen/Pelvis     4373
Chest              3533
Other               819
Name: count, dtype: int64

In [34]:
clinical_indication_dataset_redactions["body_group"].value_counts(normalize=True)

body_group
Head              0.598307
MSK               0.134984
Neck              0.131910
Abdomen/Pelvis    0.067562
Chest             0.054584
Other             0.012653
Name: proportion, dtype: float64

In [37]:
clinical_indication_dataset_redactions["imaging_modality"].value_counts(normalize=True) * 100

imaging_modality
CT       54.770880
MRI      43.339616
US        0.818836
Other     0.633439
XR        0.437228
Name: proportion, dtype: float64

# Reader Study

In [82]:
radiology_reports = clinical_indication_dataset_redactions[
    (~clinical_indication_dataset_redactions["exam_type"].isna()) & 
    (clinical_indication_dataset_redactions["body_group"] != "Other")
]
reader_study_sampled_reports = radiology_reports.groupby('body_group').sample(n=22, random_state=RANDOM_STATE)

In [91]:
reader_study_total_dataset = []
for i in tqdm.tqdm(range(len(reader_study_sampled_reports))):
    radiology_report = reader_study_sampled_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn].copy()    
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    reader_study_total_dataset.append(radiology_report.to_frame().T)
    reader_study_total_dataset.append(filtered_patient_notes)

reader_study_total_dataset = pd.concat(reader_study_total_dataset)

print("Clinical Reader Study Model Evaluation Dataset")
print("="*20)
print("n_notes", len(reader_study_total_dataset))
print("n_patients", reader_study_total_dataset["patientdurablekey"].nunique())
print("n_radiology reports", len(reader_study_total_dataset[~reader_study_total_dataset["exam_type"].isna()]))

100%|█████████████████████████████████████████████████████████████████████████| 110/110 [00:35<00:00,  3.08it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 21192
n_patients 110
n_radiology reports 110





In [83]:
reader_study_calibration_dataset_reports = reader_study_sampled_reports.groupby('body_group').sample(n=2, random_state=RANDOM_STATE)
reader_study_evaluation_dataset_reports = reader_study_sampled_reports.drop(reader_study_calibration_dataset_reports.index).groupby('body_group').sample(n=20, random_state=RANDOM_STATE)

In [84]:
print(reader_study_calibration_dataset_reports["body_group"].value_counts())
print(reader_study_calibration_dataset_reports["imaging_modality"].value_counts())

body_group
Abdomen/Pelvis    2
Chest             2
Head              2
MSK               2
Neck              2
Name: count, dtype: int64
imaging_modality
CT     7
MRI    3
Name: count, dtype: int64


In [85]:
print(reader_study_evaluation_dataset_reports["body_group"].value_counts())
print(reader_study_evaluation_dataset_reports["imaging_modality"].value_counts())

body_group
Abdomen/Pelvis    20
Chest             20
Head              20
MSK               20
Neck              20
Name: count, dtype: int64
imaging_modality
CT     73
MRI    24
US      2
XR      1
Name: count, dtype: int64


In [86]:
reader_study_evaluation_dataset = []
for i in tqdm.tqdm(range(len(reader_study_evaluation_dataset_reports))):
    radiology_report = reader_study_evaluation_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn].copy()    
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    reader_study_evaluation_dataset.append(radiology_report.to_frame().T)
    reader_study_evaluation_dataset.append(filtered_patient_notes)

reader_study_evaluation_dataset = pd.concat(reader_study_evaluation_dataset)

print("Clinical Reader Study Model Evaluation Dataset")
print("="*20)
print("n_notes", len(reader_study_evaluation_dataset))
print("n_patients", reader_study_evaluation_dataset["patientdurablekey"].nunique())
print("n_radiology reports", len(reader_study_evaluation_dataset[~reader_study_evaluation_dataset["exam_type"].isna()]))

100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:30<00:00,  3.31it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 18645
n_patients 100
n_radiology reports 100





In [87]:
reader_study_calibration_dataset = []
for i in tqdm.tqdm(range(len(reader_study_calibration_dataset_reports))):
    radiology_report = reader_study_calibration_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn].copy()    
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    reader_study_calibration_dataset.append(radiology_report.to_frame().T)
    reader_study_calibration_dataset.append(filtered_patient_notes)

reader_study_calibration_dataset = pd.concat(reader_study_calibration_dataset)

print("Clinical Reader Study Model Evaluation Dataset")
print("="*20)
print("n_notes", len(reader_study_calibration_dataset))
print("n_patients", reader_study_calibration_dataset["patientdurablekey"].nunique())
print("n_radiology reports", len(reader_study_calibration_dataset[~reader_study_calibration_dataset["exam_type"].isna()]))

100%|███████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  3.31it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 2547
n_patients 10
n_radiology reports 10





In [124]:
def generate_note_title(enc_dept_name, note_type, auth_prov_type, deid_service_date):
    if enc_dept_name and note_type and auth_prov_type and deid_service_date:
        return f"{enc_dept_name} - {note_type} ({auth_prov_type}) | {deid_service_date}"
    elif enc_dept_name and note_type and deid_service_date:
        return f"{enc_dept_name} - {note_type} | {deid_service_date}"
    elif note_type and deid_service_date:
        return f"{note_type} | {deid_service_date}"
    elif deid_service_date:
        return f"Clinical Note | {deid_service_date}"
    return f"Clinical Note"

In [128]:
NUM_NOTES = 10
reader_study_calibration_dataset_processed = pd.DataFrame(columns=[
    "exam_type",
    "imaging_modality",
    "body_group",
    "report_text",
    "original_indication",
    "radiologist_indication",
    "enc_dept_names",
    "note_types",
    "auth_prov_types",
    "deid_service_dates",
    "note_titles",
    "note_texts"
])
for i in tqdm.tqdm(range(len(reader_study_calibration_dataset_reports))):
    radiology_report = reader_study_calibration_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn].copy()    
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    filtered_patient_notes['note_title'] = filtered_patient_notes.apply(lambda row: generate_note_title(
        row['enc_dept_name'], row['note_type'], row['auth_prov_type'], row['deid_service_date']
    ), axis=1)
    enc_dept_names = filtered_patient_notes[:NUM_NOTES][["enc_dept_name"]].squeeze().tolist()
    note_types = filtered_patient_notes[:NUM_NOTES][["note_type"]].squeeze().tolist()
    auth_prov_types = filtered_patient_notes[:NUM_NOTES][["auth_prov_type"]].squeeze().tolist()
    deid_service_dates = filtered_patient_notes[:NUM_NOTES][["deid_service_date"]].squeeze().tolist()
    note_titles = filtered_patient_notes[:NUM_NOTES][["note_title"]].squeeze().tolist()
    note_texts = filtered_patient_notes[:NUM_NOTES][["note_text"]].squeeze().tolist()
    row = {
        "exam_type": radiology_report["exam_type"],
        "imaging_modality": radiology_report["imaging_modality"],
        "body_group": radiology_report["body_group"],
        "report_text": radiology_report["note_text"],
        "original_indication": radiology_report["original_indication"],
        "radiologist_indication": radiology_report["radiologist_indication"],
        "enc_dept_names": enc_dept_names,
        "note_types": note_types,
        "auth_prov_types": auth_prov_types,
        "deid_service_dates": deid_service_dates,
        "note_titles": note_titles,
        "note_texts": note_texts
    }
    reader_study_calibration_dataset_processed.loc[
        len(reader_study_calibration_dataset_processed)
    ] = row
reader_study_calibration_dataset_processed.to_parquet("reader_study_calibration_dataset_processed.parquet")

100%|███████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  3.29it/s]


In [142]:
reader_study_calibration_dataset_processed

Unnamed: 0,exam_type,imaging_modality,body_group,report_text,original_indication,radiologist_indication,enc_dept_names,note_types,auth_prov_types,deid_service_dates,note_titles
0,CT ABDOMEN/PELVIS WITH CONTRAST,CT,Abdomen/Pelvis,CT ABDOMEN/PELVIS WITH CONTRAST 10/06/2017 8:...,Concern for SBO,Colon cancer with peritoneal carcinomatosis on...,"[14L MEDICINE, 14L MEDICINE, 14L MEDICINE, ADU...","[RN Note, H&P, ED Provider Notes, RN Note, RN ...","[Registered Nurse, None, Physician, Registered...","[2017-10-07 07:07:00, 2017-10-07 01:12:00, 201...",[14L MEDICINE - RN Note (Registered Nurse) | 2...
1,CT ABDOMEN/PELVIS WITH CONTRAST,CT,Abdomen/Pelvis,CT ABDOMEN/PELVIS WITH CONTRAST 12/18/2023 ...,"erythema, inc warmth over R groin, r/o fat str...",History of metastatic breast cancer.,"[EMERGENCY DEPT PARN, POPULATION HLTH ADMIN, E...","[Progress Notes, Progress Notes, ED Provider N...","[Physician Assistant, Registered Nurse, Physic...","[2023-12-19 00:00:00, 2023-12-19 00:00:00, 202...",[EMERGENCY DEPT PARN - Progress Notes (Physici...
2,CT CHEST WITH CONTRAST,CT,Chest,CT CHEST WITH CONTRAST CLINICAL HISTORY: C...,"Cancer surveillance (no evidence of disease, n...",History of pancreatic head mass complicated by...,"[None, GI SURG ONC MB 4, None, PATH SURGICAL P...","[HNO_MYC_LET_INFO|NOTE_ID, Progress Notes, Pat...","[None, Nurse Practitioner, None, None, Residen...","[2022-02-03 23:19:00, 2022-02-03 00:00:00, 202...",[HNO_MYC_LET_INFO|NOTE_ID | 2022-02-03 23:19:0...
3,CT CHEST WITHOUT CONTRAST,CT,Chest,CT CHEST WITHOUT CONTRAST CLINICAL HISTORY:...,heart transplant evaluation,History of bicuspid aortic valve status post r...,"[None, None, ADVANCED HEART FAIL PARN, 10LS CV...","[Imaging-Vasc, Imaging-Vasc, Progress Notes, P...","[None, None, Registered Nurse, Physician, Phys...","[2022-03-02 05:05:00, 2022-03-02 05:05:00, 202...","[Imaging-Vasc | 2022-03-02 05:05:00, Imaging-V..."
4,MR BRAINNAV WITH AND WITHOUT CONTRAST DTI,MRI,Head,MR BRAINNAV WITH AND WITHOUT CONTRAST DTI: 10...,post-op needed October 11,History of GBM status post resection and TMZ/X...,"[15M NEURO SPECIALTY, 15M NEURO SPECIALTY, 15M...","[Plan of Care, Discharge Summary, Consults, Di...","[Resident, Resident, Physical Therapist, Resid...","[2021-10-13 15:43:00, 2021-10-13 15:43:00, 202...",[15M NEURO SPECIALTY - Plan of Care (Resident)...
5,CT BRAIN WITHOUT CONTRAST,CT,Head,CT BRAIN WITHOUT CONTRAST 03/20/2021 10:48 ...,Altered mental status.,Metastatic lung cancer with intracranial metas...,"[None, None, 14M MS-HI-ACUITY, 14M MS-HI-ACUIT...","[ECG, ECG, H&P, ECG, ED Information Exchange, ...","[None, None, Physician, Physician, None, Resid...","[2021-03-21 03:05:00, 2021-03-21 03:05:00, 202...","[ECG | 2021-03-21 03:05:00, ECG | 2021-03-21 0..."
6,MR LUMBAR SPINE WITH AND WITHOUT CONTRAST,MRI,MSK,MR LUMBAR SPINE WITH AND WITHOUT CONTRAST: 09...,"Low back pain, less than 6 weeks","IgG lambda multiple myeloma, afib, CKD, BPH wh...","[13L GEN SURG, 13L GEN SURG, 13L GEN SURG, TRA...","[Discharge Instructions, ED Provider Notes, ED...","[Resident, Resident, Physician Assistant, Regi...","[2023-09-02 22:06:00, 2023-09-02 18:10:00, 202...",[13L GEN SURG - Discharge Instructions (Reside...
7,MR LUMBAR SPINE WITH AND WITHOUT CONTRAST,MRI,MSK,MR LUMBAR SPINE WITH AND WITHOUT CONTRAST: 06...,? Large disc extrusion left; left leg pain,[The right leg pain with suspected adjacent se...,"[NEURO SPINE PARN 2, NEURO SPINE PARN 2, NEURO...","[Progress Notes, Letter, Progress Notes, Disch...","[Physician Assistant, None, Physician Assistan...","[2023-06-03 00:00:00, 2023-06-03 00:00:00, 202...",[NEURO SPINE PARN 2 - Progress Notes (Physicia...
8,CT MAXILLOFACIAL WITHOUT CONTRAST,CT,Neck,CT MAXILLOFACIAL WITHOUT CONTRAST 05/28/202...,Heart Transplant Eval.,"CAD status post 3-v-CABG (2018), PCI to LCX (A...","[None, None, None, None, 9NE M/S ICU, 9NE M/S ...","[Imaging-Vasc, Imaging-Vasc, Imaging-Vasc, Ima...","[None, None, None, None, Physician, Resident, ...","[2023-05-30 04:16:00, 2023-05-30 04:16:00, 202...","[Imaging-Vasc | 2023-05-30 04:16:00, Imaging-V..."
9,CT MAXILLOFACIAL WITHOUT CONTRAST,CT,Neck,CT MAXILLOFACIAL WITHOUT CONTRAST: 05/09/2022...,Poor dentition. Pre transplant dental eval,Lung transplant eval; history of IPF and COPD,"[10LS CVT, None, 10LS CVT, 10LS CVT, 10LS CVT,...","[Echocardiography, Echocardiography, Progress ...","[Physician, None, Physician, Registered Nurse,...","[2022-05-09 09:21:00, 2022-05-09 08:07:00, 202...",[10LS CVT - Echocardiography (Physician) | 202...


In [143]:
NUM_NOTES = 10
reader_study_evaluation_dataset_processed = pd.DataFrame(columns=[
    "exam_type",
    "imaging_modality",
    "body_group",
    "report_text",
    "original_indication",
    "radiologist_indication",
    "enc_dept_names",
    "note_types",
    "auth_prov_types",
    "deid_service_dates",
    "note_titles",
    "note_texts"
])
for i in tqdm.tqdm(range(len(reader_study_evaluation_dataset_reports))):
    radiology_report = reader_study_evaluation_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn].copy()    
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    filtered_patient_notes['note_title'] = filtered_patient_notes.apply(lambda row: generate_note_title(
        row['enc_dept_name'], row['note_type'], row['auth_prov_type'], row['deid_service_date']
    ), axis=1)
    enc_dept_names = filtered_patient_notes[:NUM_NOTES][["enc_dept_name"]].squeeze().tolist()
    note_types = filtered_patient_notes[:NUM_NOTES][["note_type"]].squeeze().tolist()
    auth_prov_types = filtered_patient_notes[:NUM_NOTES][["auth_prov_type"]].squeeze().tolist()
    deid_service_dates = filtered_patient_notes[:NUM_NOTES][["deid_service_date"]].squeeze().tolist()
    note_titles = filtered_patient_notes[:NUM_NOTES][["note_title"]].squeeze().tolist()
    note_texts = filtered_patient_notes[:NUM_NOTES][["note_text"]].squeeze().tolist()
    row = {
        "exam_type": radiology_report["exam_type"],
        "imaging_modality": radiology_report["imaging_modality"],
        "body_group": radiology_report["body_group"],
        "report_text": radiology_report["note_text"],
        "original_indication": radiology_report["original_indication"],
        "radiologist_indication": radiology_report["radiologist_indication"],
        "enc_dept_names": enc_dept_names,
        "note_types": note_types,
        "auth_prov_types": auth_prov_types,
        "deid_service_dates": deid_service_dates,
        "note_titles": note_titles
    }
    reader_study_evaluation_dataset_processed.loc[
        len(reader_study_evaluation_dataset_processed)
    ] = row
reader_study_evaluation_dataset_processed.to_parquet("reader_study_evaluation_dataset_processed.parquet")

 97%|███████████████████████████████████████████████████████████████████████▊  | 97/100 [00:32<00:00,  3.03it/s]


AttributeError: 'NoneType' object has no attribute 'tolist'