In [1]:
import os
import pandas as pd
import tqdm
import regex as re
import polars as pl
import tiktoken

# Seed passed as `random_state` to the `.sample(...)` calls in the Reader Study
# section so that the sampled reports are reproducible.
RANDOM_STATE = 123

  from pandas.core import (


In [2]:
def categorize_body_group(exam_type):
    """Map an exam-type string to a coarse body-region label.

    The text is scanned for case-sensitive keyword substrings. Groups are
    tried in the fixed order Head, Abdomen/Pelvis, MSK, Chest, Neck and the
    first group with any keyword present wins. Because of that ordering,
    "CTA NECK" resolves to "Chest" (through the "CTA" keyword) before the
    Neck keywords are ever consulted.

    Args:
        exam_type: Exam description string. All keywords are upper-case, so
            lower-case text never matches.

    Returns:
        The name of the first matching group, or "Other" if nothing matches.
    """
    body_group_keywords = (
        ("Head", ("HEAD", "BRAIN", "STROKE PROTOCOL", "NEURO")),
        ("Abdomen/Pelvis", ("ABDOMEN", "PELVIS", "PROSTATE", "RENAL", "MRCP", "UROGRAM")),
        ("MSK", ("KNEE", "HIP", "SHOULDER", "EXTREMITY", "SPINE", "LUMBAR",
                 "CERVICAL", "SCOLIOSIS", "JOINT")),
        ("Chest", ("CHEST", "CARDIAC", "PULMONARY EMBOLISM", "CTA", "HEART")),
        ("Neck", ("FACE", "NECK", "CRANIOFACIAL", "MAXILLOFACIAL", "ORBIT", "TEMPORAL BONE",
                  "SINUS", "THYROID", "MANDIBLE", "SKULL")),
    )

    matching_groups = (
        body_group
        for body_group, keywords in body_group_keywords
        if any(keyword in exam_type for keyword in keywords)
    )
    # First hit in declaration order wins; fall back to "Other".
    return next(matching_groups, "Other")

def categorize_imaging_modality(exam_type):
    """Map an exam-type string to an imaging modality label.

    Modalities are tried in the fixed order MRI, CT, XR, US using
    case-sensitive substring checks; the first modality with a keyword
    present anywhere in the text wins. Matching is not word-based, so a
    keyword embedded in a longer word also counts (e.g. "XR FRACTURE"
    resolves to "CT" because "FRACTURE" contains "CT").

    Args:
        exam_type: Exam description string (keywords are upper-case).

    Returns:
        "MRI", "CT", "XR" or "US", or "Other" if no keyword occurs.
    """
    modality_keywords = [
        ("MRI", ("MR", "MRI")),
        ("CT", ("CT", "CTA")),
        ("XR", ("XR",)),
        ("US", ("US",)),
    ]

    for modality, keywords in modality_keywords:
        for keyword in keywords:
            if keyword in exam_type:
                return modality
    return "Other"

# Clinical Indication Dataset

In [None]:
# Read shards 1 through 6 from the working directory (with a progress bar)
# and stack them into a single DataFrame.
clinical_indication_dataset = pd.concat(
    [
        pd.read_parquet(f"clinical_indication_dataset_shard{shard_number}.parquet")
        for shard_number in tqdm.tqdm(range(1, 7))
    ]
)

# LLM Balanced Test Dataset

In [None]:
# Load the balanced LLM test set from a single parquet file in the working directory.
llm_balanced_test_dataset = pd.read_parquet("llm_balanced_test_dataset.parquet")

In [61]:
# Total number of rows in the balanced test set.
llm_balanced_test_dataset.shape[0]

2332093

In [62]:
# Number of distinct patientdurablekey values (nunique skips missing keys by default).
llm_balanced_test_dataset["patientdurablekey"].nunique()

8678

In [63]:
# Rows that carry an exam_type (the same quantity is printed elsewhere in this
# notebook as "n_radiology reports").
int(llm_balanced_test_dataset["exam_type"].notna().sum())

10000

In [64]:
# Row count per body_group; rows with a missing body_group are not counted.
llm_balanced_test_dataset["body_group"].value_counts()

body_group
Abdomen/Pelvis    2000
Chest             2000
Head              2000
MSK               2000
Neck              2000
Name: count, dtype: int64

In [65]:
# Row count per imaging_modality; rows with a missing modality are not counted.
llm_balanced_test_dataset["imaging_modality"].value_counts()

imaging_modality
CT     6998
MRI    2822
US      122
XR       58
Name: count, dtype: int64

In [66]:
# Share of each imaging modality, expressed in percent.
llm_balanced_test_dataset["imaging_modality"].value_counts(normalize=True).mul(100)

imaging_modality
CT     69.98
MRI    28.22
US      1.22
XR      0.58
Name: proportion, dtype: float64

In [17]:
# Read the processed test set with polars, then convert it to a pandas DataFrame
# for the pandas-based cells that follow.
llm_balanced_test_dataset_processed = pl.read_parquet("llm_balanced_test_dataset_processed.parquet").to_pandas()

In [20]:
# Distinct patients in the processed table; expected to equal the distinct-patient
# count of the unprocessed llm_balanced_test_dataset.
llm_balanced_test_dataset_processed["patientdurablekey"].nunique()

8678

In [19]:
# Consistency check: the number of entries across all note_texts_full lists plus one
# per processed row should reproduce the row count of the unprocessed dataset.
llm_balanced_test_dataset_processed["note_texts_full"].apply(len).sum() + len(llm_balanced_test_dataset_processed)

2332093

# Prompt

In [52]:
def generate_prompt(exam_type, original_indication, clinical_notes, max_note_chars=2000):
    """Build the one-shot prompt asking for a revised clinical indication.

    Each clinical note is truncated to its first ``max_note_chars`` characters
    and the truncated notes are joined with newlines. The joined notes, the
    exam type and the original indication are then substituted into a fixed
    template that contains the instructions and one worked (SAMPLE) example.
    The template text itself is left untouched so that generated prompts stay
    identical to earlier runs.

    Args:
        exam_type: Exam description inserted into the instruction sentence.
        original_indication: Original indication inserted under
            "ORIGINAL INDICATION:".
        clinical_notes: Iterable of note strings. ``None`` is treated as "no
            notes", and individual ``None`` entries are skipped instead of
            raising ``TypeError``.
        max_note_chars: Maximum number of characters kept per note
            (default 2000, the previously hard-coded value).

    Returns:
        The complete prompt string. It ends with "REVISED_INDICATION:"
        followed by a newline and four spaces, exactly as the template does.
    """
    notes = [] if clinical_notes is None else clinical_notes
    clinical_notes_string = "\n".join(
        [note[:max_note_chars] for note in notes if note is not None]
    )
    return f"""Given the following set of CLINICAL NOTES and ORIGINAL INDICATION, generate a REVISED INDICATION
that supplies the relevant clinical history which includes patient's sex and age which may optionally include
biopsies, surgeries, resections, treatments, and relevant symptoms to the radiological exam of {exam_type}. 
If the ORIGINAL INDICATION is for evaluation of oncological or other chronic conditions, make sure to include
surgeries, biopsies, and treatments with dates as applicable, up to 30 words. Otherwise, generate only up to 
15 words or less. Do not add any explanation. 

(SAMPLE) CLINICAL NOTE:
FOLLOW-UP GASTROINTESTINAL MEDICAL ONCOLOGY VISIT 
Patient name ***** ***** 
DOB 08/31/1953 Medical record number ***** Date of service 06/16/2021 Referring Provider: Dr. 
***** ***** ***** ***** Subjective ***** ***** is a 67 y.o. female who presents with the following: 
Interval history and review of systems 

Interval History: -06/30/20: CT CAP ***** Pt remains off of pembrolizumab. She is doing well. 
Only rare loose stools but generally formed stools now . Occasional fatigue. 
Seeing Dr, ***** for labs and followup. Otherwise full 14-point ROS was negative in detail 

Oncologic history ***** ***** is a very pleasant 66 y.o. female who was seen in follow-up for esophageal 
squamous cell carcinoma at our Gastrointestinal Oncology Faculty Practice. The patient had
been noticing ***** dysphagia in ***** ***** *****. An upper endoscopy was done on 
04/23/15 EGD: mass at 25 cm, biopsies demonstrate squamous cell carcinoma in situ. 
09/07/15 EGD: 1. Half circumferential mass, measuring 3 X 2cm in size, was found; 
nodular and sub-mucosal; multiple biopsies were performed 2. Hiatal hernia 3. Nodular mass in the
proximal esophagus consistent with cancer Pathology: Esophagus, mass at 25 cm, biopsy: 
At least squamous cell carcinoma in situ; see comment. 08/19/2015: PET/CT Whole Body (vertex to thighs) 
, 1. Hypermetabolic esophageal mass measuring 2.1 x 1.4 cm, corresponding to the patient's known primary malignancy. 
No evidence of hypermetabolic lymphadenopathy or distant metastatic disease. 
2. Scattered solid pulmonary nodules measuring up to 7 mm without associated hypermetabolism.
Recommend correlation to prior imaging, if available. Otherwise, further follow-up is 
recommended as per oncologic protocol. 

(SAMPLE) ORIGINAL INDICATION:
Restaging.

(SAMPLE) REVISED INDICATION:
Esophageal SCC diagnosed 2015 s/p resection Nov 2015 with negative margins. Follow up FDG PET with 
suspicious superior mediastinal nodes, s/p adjuvant chemoradiation 2017 with radiation to mediastinal 
nodes.

CLINICAL NOTES:
{clinical_notes_string}

ORIGINAL INDICATION:
{original_indication}

REVISED_INDICATION:
    """

# Build one prompt per row from the exam type, the original indication and the
# row's note_texts collection.
llm_balanced_test_dataset_processed["prompt"] = [
    generate_prompt(exam_type, original_indication, note_texts)
    for exam_type, original_indication, note_texts in zip(
        llm_balanced_test_dataset_processed["exam_type"],
        llm_balanced_test_dataset_processed["original_indication"],
        llm_balanced_test_dataset_processed["note_texts"],
    )
]

tokenizer = tiktoken.get_encoding("cl100k_base")


def count_tokens(text):
    """Return how many tokens `tokenizer` produces for `text`."""
    return len(tokenizer.encode(text))


# Input size is measured on the full prompt; output size on the radiologist-written
# indication.
llm_balanced_test_dataset_processed["input_token_count"] = (
    llm_balanced_test_dataset_processed["prompt"].map(count_tokens)
)
llm_balanced_test_dataset_processed["output_token_count"] = (
    llm_balanced_test_dataset_processed["radiologist_indication"].map(count_tokens)
)

In [54]:
# Median prompt (input) and radiologist-indication (output) token counts.
for direction in ("input", "output"):
    median_tokens = llm_balanced_test_dataset_processed[f"{direction}_token_count"].median()
    print(f"{direction} tokens (median)", median_tokens)

input tokens (median) 4270.0
output tokens (median) 31.0


In [55]:
# Largest prompt, in tokens.
max_input_tokens = llm_balanced_test_dataset_processed["input_token_count"].max()
print(f"input tokens (max) {max_input_tokens}")

input tokens (max) 7725


# Clinical Indication Dataset without Redactions

In [3]:
# Read shards 1 through 6 of the "without redactions" dataset (with a progress bar)
# and stack them into a single DataFrame.
clinical_indication_dataset_redactions = pd.concat(
    [
        pd.read_parquet(f"clinical_indication_dataset_without_redactions_shard{shard_number}.parquet")
        for shard_number in tqdm.tqdm(range(1, 7))
    ]
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [03:08<00:00, 31.40s/it]


In [4]:
# Total number of rows across all shards.
clinical_indication_dataset_redactions.shape[0]

6210837

In [5]:
# Number of distinct patientdurablekey values (nunique skips missing keys by default).
clinical_indication_dataset_redactions["patientdurablekey"].nunique()

33611

In [6]:
# Rows that carry an exam_type (printed elsewhere in this notebook as
# "n_radiology reports").
int(clinical_indication_dataset_redactions["exam_type"].notna().sum())

58759

In [7]:
# Label only the rows that have an exam_type; every other row keeps a missing
# body_group / imaging_modality.
has_exam_type = clinical_indication_dataset_redactions["exam_type"].notna()
exam_types = clinical_indication_dataset_redactions.loc[has_exam_type, "exam_type"]

clinical_indication_dataset_redactions.loc[has_exam_type, "body_group"] = (
    exam_types.apply(categorize_body_group)
)
clinical_indication_dataset_redactions.loc[has_exam_type, "imaging_modality"] = (
    exam_types.apply(categorize_imaging_modality)
)

In [8]:
# Row count per body_group; rows without an exam_type have no body_group and are not counted.
clinical_indication_dataset_redactions["body_group"].value_counts()

body_group
Head              35079
MSK                7919
Neck               7466
Abdomen/Pelvis     4171
Chest              3406
Other               718
Name: count, dtype: int64

In [9]:
# Row count per imaging_modality; rows without an exam_type have no modality and are not counted.
clinical_indication_dataset_redactions["imaging_modality"].value_counts()

imaging_modality
CT       32044
MRI      25666
US         442
Other      378
XR         229
Name: count, dtype: int64

# Reader Study

In [157]:
# Number of radiology reports sampled from each body group for the reader study.
NUM_PER_BODY_GROUP = 24

# Case-sensitive alternatives that disqualify an indication when they occur anywhere
# in it. The alternatives are the same as before but written without capture groups,
# which made pandas emit a "has match groups" UserWarning on every call; alternation
# already binds loosest, so the set of matching strings is unchanged.
EXCLUDED_INDICATION_PATTERN = (
    r"COMPARISON|Comparison|TECHNIQUE|Technique|FINDINGS|Findings|IMPRESSION|Impression"
    r"|As above|Not available|No known contributory history|No clinical notes"
    r"|CT|MRI|US|None|none"
)

# Candidate pool: rows that are radiology reports (exam_type present) with a
# recognised body group.
candidate_reports = clinical_indication_dataset_redactions[
    clinical_indication_dataset_redactions["exam_type"].notna() &
    (clinical_indication_dataset_redactions["body_group"] != "Other")
]

# na=True treats a missing indication as excluded and keeps the mask strictly boolean;
# with the default, a single missing value makes the `~` negation fail.
original_indication_excluded = candidate_reports["original_indication"].str.contains(
    EXCLUDED_INDICATION_PATTERN, na=True
)
radiologist_indication_excluded = candidate_reports["radiologist_indication"].str.contains(
    EXCLUDED_INDICATION_PATTERN, na=True
)
radiology_reports = candidate_reports[
    ~original_indication_excluded & ~radiologist_indication_excluded
]

# Seeded, equal-sized sample per body group.
reader_study_sampled_reports = radiology_reports.groupby('body_group').sample(
    n=NUM_PER_BODY_GROUP, random_state=RANDOM_STATE
)

  .str.contains(r"COMPARISON|Comparison|TECHNIQUE|Technique|FINDINGS|Findings|IMPRESSION|Impression|(As above)|(Not available)|(No known contributory history)|(No clinical notes)|CT|MRI|US|None|none")) &
  .str.contains(r"COMPARISON|Comparison|TECHNIQUE|Technique|FINDINGS|Findings|IMPRESSION|Impression|(As above)|(Not available)|(No known contributory history)|(No clinical notes)|CT|MRI|US|None|none"))


In [158]:
# For every sampled report, collect the report row followed by that patient's
# non-imaging notes dated strictly before the report, newest first.
reader_study_total_frames = []
for i in tqdm.tqdm(range(len(reader_study_sampled_reports))):
    radiology_report = reader_study_sampled_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    # Only read from below, so no defensive copy of the slice is needed.
    patient_notes = clinical_indication_dataset_redactions[
        clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    ]
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) &
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    reader_study_total_frames.append(radiology_report.to_frame().T)
    reader_study_total_frames.append(filtered_patient_notes)

reader_study_total_dataset = pd.concat(reader_study_total_frames)

# This cell covers all sampled reports (calibration and evaluation together); the
# title previously repeated the evaluation cell's "Model Evaluation Dataset" label.
print("Clinical Reader Study Total Dataset")
print("="*20)
print("n_notes", len(reader_study_total_dataset))
print("n_patients", reader_study_total_dataset["patientdurablekey"].nunique())
print("n_radiology reports", len(reader_study_total_dataset[~reader_study_total_dataset["exam_type"].isna()]))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [00:38<00:00,  3.14it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 25324
n_patients 120
n_radiology reports 120





In [159]:
# Split sizes per body group. The evaluation size is derived from NUM_PER_BODY_GROUP
# so the two subsets always partition the sampled reports (24 = 4 + 20 today).
NUM_CALIBRATION_PER_BODY_GROUP = 4
NUM_EVALUATION_PER_BODY_GROUP = NUM_PER_BODY_GROUP - NUM_CALIBRATION_PER_BODY_GROUP

reader_study_calibration_dataset_reports = reader_study_sampled_reports.groupby('body_group').sample(
    n=NUM_CALIBRATION_PER_BODY_GROUP, random_state=RANDOM_STATE
)

# The calibration rows are removed by index label below. If a label were shared with
# another sampled row, that row would be dropped as well and the evaluation sample
# would come up short, so fail early with an explicit message instead.
rows_with_calibration_labels = reader_study_sampled_reports.index.isin(
    reader_study_calibration_dataset_reports.index
).sum()
assert rows_with_calibration_labels == len(reader_study_calibration_dataset_reports), (
    "index labels of the sampled reports are not unique; cannot split by label"
)

reader_study_evaluation_dataset_reports = (
    reader_study_sampled_reports
    .drop(reader_study_calibration_dataset_reports.index)
    .groupby('body_group')
    .sample(n=NUM_EVALUATION_PER_BODY_GROUP, random_state=RANDOM_STATE)
)

In [160]:
# Body-group and modality make-up of the calibration reports.
for column in ("body_group", "imaging_modality"):
    print(reader_study_calibration_dataset_reports[column].value_counts())

body_group
Abdomen/Pelvis    4
Chest             4
Head              4
MSK               4
Neck              4
Name: count, dtype: int64
imaging_modality
CT     12
MRI     8
Name: count, dtype: int64


In [161]:
# Body-group and modality make-up of the evaluation reports.
for column in ("body_group", "imaging_modality"):
    print(reader_study_evaluation_dataset_reports[column].value_counts())

body_group
Abdomen/Pelvis    20
Chest             20
Head              20
MSK               20
Neck              20
Name: count, dtype: int64
imaging_modality
CT     63
MRI    33
XR      3
US      1
Name: count, dtype: int64


In [162]:
# For every calibration report, collect the report row followed by that patient's
# non-imaging notes dated strictly before the report, newest first.
calibration_frames = []
for position in tqdm.tqdm(range(len(reader_study_calibration_dataset_reports))):
    radiology_report = reader_study_calibration_dataset_reports.iloc[position]
    patient_mrn = radiology_report["patientdurablekey"]

    same_patient = clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    patient_notes = clinical_indication_dataset_redactions[same_patient].copy()

    before_report = patient_notes["deid_service_date"] < radiology_report["deid_service_date"]
    not_imaging = patient_notes["note_type"] != "Imaging"
    filtered_patient_notes = (
        patient_notes[before_report & not_imaging]
        .sort_values(by=["deid_service_date"], ascending=False)
        .reset_index(drop=True)
    )

    calibration_frames.extend([radiology_report.to_frame().T, filtered_patient_notes])

reader_study_calibration_dataset = pd.concat(calibration_frames)

# Only the report rows carry an exam_type, so counting them counts the reports.
calibration_report_rows = reader_study_calibration_dataset["exam_type"].notna()

print("Clinical Reader Study Rater Calibration Dataset")
print("=" * 20)
print("n_notes", len(reader_study_calibration_dataset))
print("n_patients", reader_study_calibration_dataset["patientdurablekey"].nunique())
print("n_radiology reports", calibration_report_rows.sum())

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:06<00:00,  3.06it/s]

Clinical Reader Study Rater Calibration Dataset
n_notes 5491
n_patients 20
n_radiology reports 20





In [163]:
# For every evaluation report, collect the report row followed by that patient's
# non-imaging notes dated strictly before the report, newest first.
evaluation_frames = []
for position in tqdm.tqdm(range(len(reader_study_evaluation_dataset_reports))):
    radiology_report = reader_study_evaluation_dataset_reports.iloc[position]
    patient_mrn = radiology_report["patientdurablekey"]

    same_patient = clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    patient_notes = clinical_indication_dataset_redactions[same_patient].copy()

    before_report = patient_notes["deid_service_date"] < radiology_report["deid_service_date"]
    not_imaging = patient_notes["note_type"] != "Imaging"
    filtered_patient_notes = (
        patient_notes[before_report & not_imaging]
        .sort_values(by=["deid_service_date"], ascending=False)
        .reset_index(drop=True)
    )

    evaluation_frames.extend([radiology_report.to_frame().T, filtered_patient_notes])

reader_study_evaluation_dataset = pd.concat(evaluation_frames)

# Only the report rows carry an exam_type, so counting them counts the reports.
evaluation_report_rows = reader_study_evaluation_dataset["exam_type"].notna()

print("Clinical Reader Study Model Evaluation Dataset")
print("=" * 20)
print("n_notes", len(reader_study_evaluation_dataset))
print("n_patients", reader_study_evaluation_dataset["patientdurablekey"].nunique())
print("n_radiology reports", evaluation_report_rows.sum())

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:31<00:00,  3.13it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 19833
n_patients 100
n_radiology reports 100





In [164]:
def generate_note_title(enc_dept_name, note_type, auth_prov_type, deid_service_date):
    """Compose a display title for a clinical note from whatever metadata exists.

    A field counts as present only if it is neither missing (None, NaN, NaT,
    pd.NA) nor empty. Plain truthiness was used before, which let NaN through
    and produced titles such as "... (nan) | ...".

    Args:
        enc_dept_name: Encounter department name, or a missing value.
        note_type: Note type, or a missing value.
        auth_prov_type: Author provider type, or a missing value.
        deid_service_date: Service date (any object that formats to text), or
            a missing value.

    Returns:
        The most specific title the present fields allow:
        "<dept> - <type> (<author>) | <date>", "<dept> - <type> | <date>",
        "<type> | <date>", "Clinical Note | <date>", or "Clinical Note" when
        the date is missing.
    """
    def _present(value):
        # pd.isna is checked first so bool() is never evaluated on pd.NA.
        return not pd.isna(value) and bool(value)

    if not _present(deid_service_date):
        return "Clinical Note"
    if not _present(note_type):
        return f"Clinical Note | {deid_service_date}"
    if not _present(enc_dept_name):
        return f"{note_type} | {deid_service_date}"
    if _present(auth_prov_type):
        return f"{enc_dept_name} - {note_type} ({auth_prov_type}) | {deid_service_date}"
    return f"{enc_dept_name} - {note_type} | {deid_service_date}"

In [165]:
# Number of most-recent prior notes kept in the truncated per-report columns.
NUM_NOTES = 10

# One record per calibration report. Records are collected in a list and turned into
# a DataFrame once, instead of growing the frame row by row.
calibration_records = []
for i in tqdm.tqdm(range(len(reader_study_calibration_dataset_reports))):
    radiology_report = reader_study_calibration_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[
        clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    ]
    # Non-imaging notes dated strictly before the report, newest first.
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) &
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    # Built column-wise: a row-wise apply on an empty frame returns a DataFrame and
    # cannot be assigned to a single column (patients without prior notes).
    filtered_patient_notes["note_title"] = [
        generate_note_title(enc_dept_name, note_type, auth_prov_type, deid_service_date)
        for enc_dept_name, note_type, auth_prov_type, deid_service_date in zip(
            filtered_patient_notes["enc_dept_name"],
            filtered_patient_notes["note_type"],
            filtered_patient_notes["auth_prov_type"],
            filtered_patient_notes["deid_service_date"],
        )
    ]
    recent_patient_notes = filtered_patient_notes.head(NUM_NOTES)

    # Series.tolist() always yields a list. The earlier df[[col]].squeeze().tolist()
    # collapsed to a scalar when exactly one prior note existed.
    calibration_records.append({
        "patientdurablekey": radiology_report["patientdurablekey"],
        "exam_type": radiology_report["exam_type"],
        "imaging_modality": radiology_report["imaging_modality"],
        "body_group": radiology_report["body_group"],
        "report_deid_note_key": radiology_report["deid_note_key"],
        "report_text": radiology_report["note_text"],
        "original_indication": radiology_report["original_indication"],
        "radiologist_indication": radiology_report["radiologist_indication"],
        "enc_dept_names": recent_patient_notes["enc_dept_name"].tolist(),
        "note_types": recent_patient_notes["note_type"].tolist(),
        "auth_prov_types": recent_patient_notes["auth_prov_type"].tolist(),
        "deid_service_dates": recent_patient_notes["deid_service_date"].tolist(),
        "note_titles": recent_patient_notes["note_title"].tolist(),
        "note_texts": recent_patient_notes["note_text"].tolist(),
        # Full history, not truncated to NUM_NOTES.
        "note_texts_full": filtered_patient_notes["note_text"].tolist(),
    })

reader_study_calibration_dataset_processed = pd.DataFrame(calibration_records, columns=[
    "patientdurablekey",
    "exam_type",
    "imaging_modality",
    "body_group",
    "report_deid_note_key",
    "report_text",
    "original_indication",
    "radiologist_indication",
    "enc_dept_names",
    "note_types",
    "auth_prov_types",
    "deid_service_dates",
    "note_titles",
    "note_texts",
    "note_texts_full"
])
reader_study_calibration_dataset_processed.to_parquet("reader_study_calibration_dataset_processed.parquet")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:06<00:00,  3.02it/s]


In [166]:
# Preview the first rows of the processed calibration table.
# NOTE(review): the rendered output includes report and note text; confirm it is
# de-identified before sharing the notebook.
reader_study_calibration_dataset_processed.head()

Unnamed: 0,patientdurablekey,exam_type,imaging_modality,body_group,report_deid_note_key,report_text,original_indication,radiologist_indication,enc_dept_names,note_types,auth_prov_types,deid_service_dates,note_titles,note_texts,note_texts_full
0,D16CA190FA80BB,CT ABDOMEN/PELVIS WITH CONTRAST,CT,Abdomen/Pelvis,D942F5BE2758E1,CT ABDOMEN/PELVIS WITH CONTRAST 12/28/2022 1:...,"gastric cancer, restaging for clinical trial",gastric cancer with signet ring features metas...,"[GI MED ONC MB 4, GI MED ONC MB 4, ADULT INFUS...","[Progress Notes, Letter, RN Note, RN Note, Pro...","[Physician, None, Registered Nurse, Registered...","[2022-12-19 00:00:00, 2022-12-19 00:00:00, 202...",[GI MED ONC MB 4 - Progress Notes (Physician) ...,[VIDEO VISIT I performed this consultation us...,[VIDEO VISIT I performed this consultation us...
1,D26B0E8A04E300,CT ABDOMEN/PELVIS WITHOUT CONTRAST,CT,Abdomen/Pelvis,D34C8F64685D55,CT ABDOMEN/PELVIS WITHOUT CONTRAST 08/07/2017...,concern R perinephric abscess,History of metastatic bladder cancer with brai...,"[15L ADULT ACUTE CARE, 15L ADULT ACUTE CARE, 1...","[Plan of Care, Consults, ED Notes, H&P, ED Not...","[Registered Nurse, Pharmacist, Registered Nurs...","[2017-08-08 02:19:00, 2017-08-07 22:29:00, 201...",[15L ADULT ACUTE CARE - Plan of Care (Register...,[Pt new admit to unit. S/p bilateral ankle fra...,[Pt new admit to unit. S/p bilateral ankle fra...
2,DEF9A819A4D6CE,CT ABDOMEN/PELVIS WITHOUT CONTRAST,CT,Abdomen/Pelvis,DF9877482DF456,CT ABDOMEN/PELVIS WITHOUT CONTRAST 10/22/20...,abd pain ? sepsis.,"History of HIV, epigastric pain,","[15L ADULT ACUTE CARE, 15L ADULT ACUTE CARE, 1...","[Consult, Consult, Consults, Progress Notes, E...","[Physician, Physician, Pharmacist, Physician, ...","[2023-10-23 09:05:00, 2023-10-23 09:02:00, 202...",[15L ADULT ACUTE CARE - Consult (Physician) | ...,"[***** *****, RN 10/23/2023 10:58 AM""\n\na...","[***** *****, RN 10/23/2023 10:58 AM""\n\na..."
3,DFD86620184F55,CT ABDOMEN/PELVIS WITHOUT CONTRAST,CT,Abdomen/Pelvis,DD20946E836E7B,CT ABDOMEN/PELVIS WITHOUT CONTRAST 09/26/2016...,"Intense RUQ pain, r/o free air or perforation,...",Admitted to surgery service with initial conce...,"[PERIOP PARN, PERIOP PARN, PERIOP PARN, 13L GE...","[Anesthesia Transfer of Care, Anesthesia Post-...","[Physician, Physician, Physician, Registered N...","[2016-09-26 00:00:00, 2016-09-26 00:00:00, 201...",[PERIOP PARN - Anesthesia Transfer of Care (Ph...,[Anesthesia Case Summary Scheduled date of Op...,[Anesthesia Case Summary Scheduled date of Op...
4,DECE9B13953718,CT CHEST WITHOUT CONTRAST,CT,Chest,DB1900835B31BA,CT CHEST WITHOUT CONTRAST CLINICAL HISTORY:...,S/p lung transplant,History of mixed connective tissue disease / s...,"[12M MED/SURG/ACUTE TCU, 12M MED/SURG/ACUTE TC...","[H&P, ECG, ECG, Progress Notes, Progress Notes...","[Physician, Physician, None, Physician, None, ...","[2019-10-08 02:28:00, 2019-10-07 20:14:00, 201...",[12M MED/SURG/ACUTE TCU - H&P (Physician) | 20...,[ALD SERVICE H&P NOTE - ***** ATTENDING My...,[ALD SERVICE H&P NOTE - ***** ATTENDING My...


In [167]:
# Number of most-recent prior notes kept in the truncated per-report columns.
NUM_NOTES = 10

# One record per evaluation report. Records are collected in a list and turned into
# a DataFrame once, instead of growing the frame row by row.
evaluation_records = []
for i in tqdm.tqdm(range(len(reader_study_evaluation_dataset_reports))):
    radiology_report = reader_study_evaluation_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[
        clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    ]
    # Non-imaging notes dated strictly before the report, newest first.
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) &
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    # Built column-wise: a row-wise apply on an empty frame returns a DataFrame and
    # cannot be assigned to a single column (patients without prior notes).
    filtered_patient_notes["note_title"] = [
        generate_note_title(enc_dept_name, note_type, auth_prov_type, deid_service_date)
        for enc_dept_name, note_type, auth_prov_type, deid_service_date in zip(
            filtered_patient_notes["enc_dept_name"],
            filtered_patient_notes["note_type"],
            filtered_patient_notes["auth_prov_type"],
            filtered_patient_notes["deid_service_date"],
        )
    ]
    recent_patient_notes = filtered_patient_notes.head(NUM_NOTES)

    # Series.tolist() always yields a list. The earlier df[[col]].squeeze().tolist()
    # collapsed to a scalar when exactly one prior note existed.
    evaluation_records.append({
        "patientdurablekey": radiology_report["patientdurablekey"],
        "exam_type": radiology_report["exam_type"],
        "imaging_modality": radiology_report["imaging_modality"],
        "body_group": radiology_report["body_group"],
        "report_deid_note_key": radiology_report["deid_note_key"],
        "report_text": radiology_report["note_text"],
        "original_indication": radiology_report["original_indication"],
        "radiologist_indication": radiology_report["radiologist_indication"],
        "enc_dept_names": recent_patient_notes["enc_dept_name"].tolist(),
        "note_types": recent_patient_notes["note_type"].tolist(),
        "auth_prov_types": recent_patient_notes["auth_prov_type"].tolist(),
        "deid_service_dates": recent_patient_notes["deid_service_date"].tolist(),
        "note_titles": recent_patient_notes["note_title"].tolist(),
        "note_texts": recent_patient_notes["note_text"].tolist(),
        # Full history, not truncated to NUM_NOTES.
        "note_texts_full": filtered_patient_notes["note_text"].tolist(),
    })

reader_study_evaluation_dataset_processed = pd.DataFrame(evaluation_records, columns=[
    "patientdurablekey",
    "exam_type",
    "imaging_modality",
    "body_group",
    "report_deid_note_key",
    "report_text",
    "original_indication",
    "radiologist_indication",
    "enc_dept_names",
    "note_types",
    "auth_prov_types",
    "deid_service_dates",
    "note_titles",
    "note_texts",
    "note_texts_full"
])
reader_study_evaluation_dataset_processed.to_parquet("reader_study_evaluation_dataset_processed.parquet")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:31<00:00,  3.13it/s]


In [168]:
# Consistency check: prior notes across all reports plus one row per report should
# reproduce the "n_notes" figure printed for reader_study_calibration_dataset.
reader_study_calibration_dataset_processed["note_texts_full"].apply(len).sum() + len(reader_study_calibration_dataset_processed)

5491

In [169]:
# Consistency check: prior notes across all reports plus one row per report should
# reproduce the "n_notes" figure printed for reader_study_evaluation_dataset.
reader_study_evaluation_dataset_processed["note_texts_full"].apply(len).sum() + len(reader_study_evaluation_dataset_processed)

19833

In [170]:
# Export only the exam type and the two indication columns, without the index.
reader_study_calibration_dataset_processed.to_csv(
    "reader_calibration_dataset.csv",
    columns=["exam_type", "original_indication", "radiologist_indication"],
    index=False,
)

In [171]:
# Export only the exam type and the two indication columns, without the index.
reader_study_evaluation_dataset_processed.to_csv(
    "reader_evaluation_dataset.csv",
    columns=["exam_type", "original_indication", "radiologist_indication"],
    index=False,
)

In [173]:
# Display the processed calibration table in full.
# NOTE(review): this renders every row, including report and note text; consider
# .head() and confirm the content is de-identified before sharing the notebook.
reader_study_calibration_dataset_processed

Unnamed: 0,patientdurablekey,exam_type,imaging_modality,body_group,report_deid_note_key,report_text,original_indication,radiologist_indication,enc_dept_names,note_types,auth_prov_types,deid_service_dates,note_titles,note_texts,note_texts_full
0,D16CA190FA80BB,CT ABDOMEN/PELVIS WITH CONTRAST,CT,Abdomen/Pelvis,D942F5BE2758E1,CT ABDOMEN/PELVIS WITH CONTRAST 12/28/2022 1:...,"gastric cancer, restaging for clinical trial",gastric cancer with signet ring features metas...,"[GI MED ONC MB 4, GI MED ONC MB 4, ADULT INFUS...","[Progress Notes, Letter, RN Note, RN Note, Pro...","[Physician, None, Registered Nurse, Registered...","[2022-12-19 00:00:00, 2022-12-19 00:00:00, 202...",[GI MED ONC MB 4 - Progress Notes (Physician) ...,[VIDEO VISIT I performed this consultation us...,[VIDEO VISIT I performed this consultation us...
1,D26B0E8A04E300,CT ABDOMEN/PELVIS WITHOUT CONTRAST,CT,Abdomen/Pelvis,D34C8F64685D55,CT ABDOMEN/PELVIS WITHOUT CONTRAST 08/07/2017...,concern R perinephric abscess,History of metastatic bladder cancer with brai...,"[15L ADULT ACUTE CARE, 15L ADULT ACUTE CARE, 1...","[Plan of Care, Consults, ED Notes, H&P, ED Not...","[Registered Nurse, Pharmacist, Registered Nurs...","[2017-08-08 02:19:00, 2017-08-07 22:29:00, 201...",[15L ADULT ACUTE CARE - Plan of Care (Register...,[Pt new admit to unit. S/p bilateral ankle fra...,[Pt new admit to unit. S/p bilateral ankle fra...
2,DEF9A819A4D6CE,CT ABDOMEN/PELVIS WITHOUT CONTRAST,CT,Abdomen/Pelvis,DF9877482DF456,CT ABDOMEN/PELVIS WITHOUT CONTRAST 10/22/20...,abd pain ? sepsis.,"History of HIV, epigastric pain,","[15L ADULT ACUTE CARE, 15L ADULT ACUTE CARE, 1...","[Consult, Consult, Consults, Progress Notes, E...","[Physician, Physician, Pharmacist, Physician, ...","[2023-10-23 09:05:00, 2023-10-23 09:02:00, 202...",[15L ADULT ACUTE CARE - Consult (Physician) | ...,"[***** *****, RN 10/23/2023 10:58 AM""\n\na...","[***** *****, RN 10/23/2023 10:58 AM""\n\na..."
3,DFD86620184F55,CT ABDOMEN/PELVIS WITHOUT CONTRAST,CT,Abdomen/Pelvis,DD20946E836E7B,CT ABDOMEN/PELVIS WITHOUT CONTRAST 09/26/2016...,"Intense RUQ pain, r/o free air or perforation,...",Admitted to surgery service with initial conce...,"[PERIOP PARN, PERIOP PARN, PERIOP PARN, 13L GE...","[Anesthesia Transfer of Care, Anesthesia Post-...","[Physician, Physician, Physician, Registered N...","[2016-09-26 00:00:00, 2016-09-26 00:00:00, 201...",[PERIOP PARN - Anesthesia Transfer of Care (Ph...,[Anesthesia Case Summary Scheduled date of Op...,[Anesthesia Case Summary Scheduled date of Op...
4,DECE9B13953718,CT CHEST WITHOUT CONTRAST,CT,Chest,DB1900835B31BA,CT CHEST WITHOUT CONTRAST CLINICAL HISTORY:...,S/p lung transplant,History of mixed connective tissue disease / s...,"[12M MED/SURG/ACUTE TCU, 12M MED/SURG/ACUTE TC...","[H&P, ECG, ECG, Progress Notes, Progress Notes...","[Physician, Physician, None, Physician, None, ...","[2019-10-08 02:28:00, 2019-10-07 20:14:00, 201...",[12M MED/SURG/ACUTE TCU - H&P (Physician) | 20...,[ALD SERVICE H&P NOTE - ***** ATTENDING My...,[ALD SERVICE H&P NOTE - ***** ATTENDING My...
5,D8DB710446D2E9,CT CHEST WITH CONTRAST,CT,Chest,D6B5618FFE34F1,CT CHEST WITH CONTRAST CLINICAL HISTORY: p...,persistent neutropenic fever,relapsed AML presenting w/ neutropenic fever O...,"[11L MEDSURG-ONC/BMT B, 11L MEDSURG-ONC/BMT B,...","[Plan of Care, Interdisciplinary, Consults, Pr...","[Registered Nurse, Social Worker, Registered N...","[2021-01-07 00:44:00, 2021-01-06 16:25:00, 202...",[11L MEDSURG-ONC/BMT B - Plan of Care (Registe...,[Pt more alert tonight. PRN oxy and ativan giv...,[Pt more alert tonight. PRN oxy and ativan giv...
6,DD86FC8BB5B04E,CT CHEST WITHOUT CONTRAST,CT,Chest,D30B20A758BB53,CT CHEST WITHOUT CONTRAST CLINICAL HISTORY:...,"Hypoxia eval for infection, pneumonitis",Metastatic ocular melanoma with metastases to ...,"[None, 14L MEDICINE, 14L MEDICINE, 14L MEDICIN...","[Sticky Note, ACP (Advance Care Planning), Lab...","[None, Physician, Physician, Physician, Physic...","[2021-12-17 02:05:00, 2021-12-17 01:29:00, 202...","[Sticky Note | 2021-12-17 02:05:00, 14L MEDICI...",[ipilimumab (IPI) ( inhibition of T-cell inact...,[ipilimumab (IPI) ( inhibition of T-cell inact...
7,DFD47DDE3AEF59,CT CHEST WITH CONTRAST,CT,Chest,D5B427AC62951E,CT CHEST WITH CONTRAST 03/07/2023 5:51 PM ...,"Fevers of unknown origin, L pleural effusion. ...",HIV (on ART) and HPV-negative SCC of tonsil w/...,"[14M MS-HI-ACUITY, 14M MS-HI-ACUITY, 14M MS-HI...","[Plan of Care, Consults, Consults, Consults, P...","[Registered Nurse, Registered Nurse, Registere...","[2023-03-08 05:29:00, 2023-03-07 15:30:00, 202...",[14M MS-HI-ACUITY - Plan of Care (Registered N...,[ Problem: Discharge Planning - Adult Goal: ...,[ Problem: Discharge Planning - Adult Goal: ...
8,D6DE10FC3EE718,MR BRAIN WITH AND WITHOUT CONTRAST 10/10/2019...,MRI,Head,DFD12CA4895C48,MR BRAIN WITH AND WITHOUT CONTRAST : 10/10/201...,"Cancer, recurrence suspected Other reason tha...",Resection of scalp lesion with myocutaneous fl...,"[None, RAD ONC MB 1, None, None, HEAD & NECK S...","[HNO_MYC_LET_INFO|NOTE_ID, RN Note, ECG, HNO_M...","[None, Registered Nurse, None, None, Registere...","[2019-10-10 23:42:00, 2019-10-09 11:15:00, 201...",[HNO_MYC_LET_INFO|NOTE_ID | 2019-10-10 23:42:0...,[COVID-19 Screening Result 10/10/19 CLEA...,[COVID-19 Screening Result 10/10/19 CLEA...
9,DABCF6B64FFC0F,MR BRAIN WITH AND WITHOUT CONTRAST,MRI,Head,D1AD2491980CDD,MR BRAIN WITH AND WITHOUT CONTRAST: 11/04/202...,"persistent encephalopathy, r/o stroke","History of decompensated EtOH cirrhosis, chron...","[14L MEDICINE, 14L MEDICINE, 14L MEDICINE, NON...","[Progress Notes, Consults, Consults, Anesthesi...","[Resident, Registered Dietitian, Speech and La...","[2020-11-04 09:29:00, 2020-11-04 09:20:00, 202...",[14L MEDICINE - Progress Notes (Resident) | 20...,[HOSPITAL MEDICINE PROGRESS NOTE Admit d...,[HOSPITAL MEDICINE PROGRESS NOTE Admit d...
