In [44]:
import os
import pandas as pd
import tqdm
import regex as re
import polars as pl
import tiktoken

# Seed handed to the pandas `sample(..., random_state=...)` calls further down so the
# reader-study draws are repeatable.
RANDOM_STATE = 123

In [2]:
def categorize_body_group(exam_type):
    """Map an exam description to a body-group label.

    Matching is a case-sensitive substring test, and the groups are tried in the
    order listed below; the first group with any keyword contained in
    ``exam_type`` wins (so "CTA HEAD" is "Head", not "Chest").

    Args:
        exam_type: Exam description string (keywords are upper-case).

    Returns:
        One of "Head", "Abdomen/Pelvis", "MSK", "Chest", "Neck", or "Other"
        when no keyword is found.
    """
    # Order matters: earlier entries take precedence over later ones.
    keyword_table = (
        ("Head", ("HEAD", "BRAIN", "STROKE PROTOCOL", "NEURO")),
        ("Abdomen/Pelvis", ("ABDOMEN", "PELVIS", "PROSTATE", "RENAL", "MRCP", "UROGRAM")),
        ("MSK", ("KNEE", "HIP", "SHOULDER", "EXTREMITY", "SPINE", "LUMBAR",
                 "CERVICAL", "SCOLIOSIS", "JOINT")),
        ("Chest", ("CHEST", "CARDIAC", "PULMONARY EMBOLISM", "CTA", "HEART")),
        ("Neck", ("FACE", "NECK", "CRANIOFACIAL", "MAXILLOFACIAL", "ORBIT", "TEMPORAL BONE",
                  "SINUS", "THYROID", "MANDIBLE", "SKULL")),
    )

    matching_labels = (
        label
        for label, needles in keyword_table
        if any(needle in exam_type for needle in needles)
    )
    return next(matching_labels, "Other")

def categorize_imaging_modality(exam_type):
    """Map an exam description to an imaging-modality label.

    Matching is a case-sensitive substring test evaluated in the order
    MRI, CT, XR, US; the first modality with a keyword contained anywhere in
    ``exam_type`` is returned. Because the test is substring-based, short codes
    also match inside longer words (e.g. "CT" inside "TRANSRECTAL").

    Args:
        exam_type: Exam description string (keywords are upper-case).

    Returns:
        "MRI", "CT", "XR", "US", or "Other" when nothing matches.
    """
    # Order matters: earlier entries take precedence over later ones.
    modality_keywords = (
        ("MRI", ("MR", "MRI")),
        ("CT", ("CT", "CTA")),
        ("XR", ("XR",)),
        ("US", ("US",)),
    )

    for modality, needles in modality_keywords:
        for needle in needles:
            if needle in exam_type:
                return modality
    return "Other"

# Clinical Indication Dataset

In [None]:
# Read shards 1..6 and stack them into a single frame (row order follows shard order).
clinical_indication_dataset = pd.concat(
    [
        pd.read_parquet(f"clinical_indication_dataset_shard{shard}.parquet")
        for shard in tqdm.tqdm(range(1, 7))
    ]
)

# LLM Balanced Test Dataset

In [60]:
# Load llm_balanced_test_dataset.parquet into a pandas DataFrame.
llm_balanced_test_dataset = pd.read_parquet("llm_balanced_test_dataset.parquet")

In [61]:
# Total number of rows in the frame, regardless of whether exam_type is set.
len(llm_balanced_test_dataset)

2332093

In [62]:
# Number of distinct patientdurablekey values.
llm_balanced_test_dataset["patientdurablekey"].nunique()

8678

In [63]:
# Number of rows that carry an exam_type.
len(llm_balanced_test_dataset[llm_balanced_test_dataset["exam_type"].notna()])

10000

In [64]:
# Rows per body_group; value_counts() leaves out missing values by default.
llm_balanced_test_dataset["body_group"].value_counts()

body_group
Abdomen/Pelvis    2000
Chest             2000
Head              2000
MSK               2000
Neck              2000
Name: count, dtype: int64

In [65]:
# Rows per imaging_modality; value_counts() leaves out missing values by default.
llm_balanced_test_dataset["imaging_modality"].value_counts()

imaging_modality
CT     6998
MRI    2822
US      122
XR       58
Name: count, dtype: int64

In [66]:
# normalize=True returns proportions of the non-missing values; *100 turns them into percentages.
llm_balanced_test_dataset["imaging_modality"].value_counts(normalize=True)*100

imaging_modality
CT     69.98
MRI    28.22
US      1.22
XR      0.58
Name: proportion, dtype: float64

In [17]:
# Read the processed parquet file with polars, then convert to a pandas DataFrame.
llm_balanced_test_dataset_processed = pl.read_parquet("llm_balanced_test_dataset_processed.parquet").to_pandas()

In [20]:
# Number of distinct patientdurablekey values in the processed frame.
llm_balanced_test_dataset_processed["patientdurablekey"].nunique()

8678

In [19]:
# Sum of the per-row lengths of note_texts_full, plus one per row.
# NOTE(review): presumably each row is one report holding its prior notes, so this
# total should equal len(llm_balanced_test_dataset) — confirm.
llm_balanced_test_dataset_processed["note_texts_full"].apply(len).sum() + len(llm_balanced_test_dataset_processed)

2332093

# Prompt

In [52]:
def generate_prompt(exam_type, original_indication, clinical_notes, max_note_chars=2000):
    """Build the one-shot prompt asking for a revised clinical indication.

    Args:
        exam_type: Exam description interpolated into the instruction text.
        original_indication: Original indication text placed under
            "ORIGINAL INDICATION:".
        clinical_notes: Iterable of note strings. A single bare string is
            treated as one note (instead of being iterated character by
            character) and ``None`` is treated as "no notes".
        max_note_chars: Each note is cut to at most this many characters before
            the notes are joined with newlines. Defaults to 2000.

    Returns:
        The full prompt string, ending with the "REVISED_INDICATION:" cue.
    """
    if clinical_notes is None:
        clinical_notes = []
    elif isinstance(clinical_notes, str):
        # Iterating a str yields characters, which would put every character on its own line.
        clinical_notes = [clinical_notes]
    clinical_notes_string = "\n".join([n[:max_note_chars] for n in clinical_notes])
    return f"""Given the following set of CLINICAL NOTES and ORIGINAL INDICATION, generate a REVISED INDICATION
that supplies the relevant clinical history which includes patient's sex and age which may optionally include
biopsies, surgeries, resections, treatments, and relevant symptoms to the radiological exam of {exam_type}. 
If the ORIGINAL INDICATION is for evaluation of oncological or other chronic conditions, make sure to include
surgeries, biopsies, and treatments with dates as applicable, up to 30 words. Otherwise, generate only up to 
15 words or less. Do not add any explanation. 

(SAMPLE) CLINICAL NOTE:
FOLLOW-UP GASTROINTESTINAL MEDICAL ONCOLOGY VISIT 
Patient name ***** ***** 
DOB 08/31/1953 Medical record number ***** Date of service 06/16/2021 Referring Provider: Dr. 
***** ***** ***** ***** Subjective ***** ***** is a 67 y.o. female who presents with the following: 
Interval history and review of systems 

Interval History: -06/30/20: CT CAP ***** Pt remains off of pembrolizumab. She is doing well. 
Only rare loose stools but generally formed stools now . Occasional fatigue. 
Seeing Dr, ***** for labs and followup. Otherwise full 14-point ROS was negative in detail 

Oncologic history ***** ***** is a very pleasant 66 y.o. female who was seen in follow-up for esophageal 
squamous cell carcinoma at our Gastrointestinal Oncology Faculty Practice. The patient had
been noticing ***** dysphagia in ***** ***** *****. An upper endoscopy was done on 
04/23/15 EGD: mass at 25 cm, biopsies demonstrate squamous cell carcinoma in situ. 
09/07/15 EGD: 1. Half circumferential mass, measuring 3 X 2cm in size, was found; 
nodular and sub-mucosal; multiple biopsies were performed 2. Hiatal hernia 3. Nodular mass in the
proximal esophagus consistent with cancer Pathology: Esophagus, mass at 25 cm, biopsy: 
At least squamous cell carcinoma in situ; see comment. 08/19/2015: PET/CT Whole Body (vertex to thighs) 
, 1. Hypermetabolic esophageal mass measuring 2.1 x 1.4 cm, corresponding to the patient's known primary malignancy. 
No evidence of hypermetabolic lymphadenopathy or distant metastatic disease. 
2. Scattered solid pulmonary nodules measuring up to 7 mm without associated hypermetabolism.
Recommend correlation to prior imaging, if available. Otherwise, further follow-up is 
recommended as per oncologic protocol. 

(SAMPLE) ORIGINAL INDICATION:
Restaging.

(SAMPLE) REVISED INDICATION:
Esophageal SCC diagnosed 2015 s/p resection Nov 2015 with negative margins. Follow up FDG PET with 
suspicious superior mediastinal nodes, s/p adjuvant chemoradiation 2017 with radiation to mediastinal 
nodes.

CLINICAL NOTES:
{clinical_notes_string}

ORIGINAL INDICATION:
{original_indication}

REVISED_INDICATION:
    """

tokenizer = tiktoken.get_encoding("cl100k_base")


def _row_to_prompt(row):
    """Build the prompt for one report row from its exam type, indication and notes."""
    return generate_prompt(
        row["exam_type"],
        row["original_indication"],
        row["note_texts"],
    )


def _token_count(text):
    """Number of cl100k_base tokens in `text`."""
    return len(tokenizer.encode(text))


# Prompt per row, then token counts for the prompt (input) and the radiologist's
# indication (output).
llm_balanced_test_dataset_processed["prompt"] = llm_balanced_test_dataset_processed.apply(
    _row_to_prompt, axis=1
)
llm_balanced_test_dataset_processed["input_token_count"] = (
    llm_balanced_test_dataset_processed["prompt"].apply(_token_count)
)
llm_balanced_test_dataset_processed["output_token_count"] = (
    llm_balanced_test_dataset_processed["radiologist_indication"].apply(_token_count)
)

In [54]:
# Median token counts across rows for the prompt and for the radiologist indication.
print("input tokens (median)", llm_balanced_test_dataset_processed["input_token_count"].median())
print("output tokens (median)", llm_balanced_test_dataset_processed["output_token_count"].median())

input tokens (median) 4270.0
output tokens (median) 31.0


# Clinical Indication Dataset without Redactions

In [21]:
# Read shards 1..6 of the unredacted dataset and stack them into a single frame.
clinical_indication_dataset_redactions = pd.concat(
    [
        pd.read_parquet(f"clinical_indication_dataset_without_redactions_shard{shard}.parquet")
        for shard in tqdm.tqdm(range(1, 7))
    ]
)

100%|█████████████████████████████████████████████████████████████████████████████| 6/6 [02:51<00:00, 28.51s/it]


In [22]:
# Total number of rows across all shards.
len(clinical_indication_dataset_redactions)

6210837

In [23]:
# Number of distinct patientdurablekey values.
clinical_indication_dataset_redactions["patientdurablekey"].nunique()

33611

In [24]:
# Number of rows that carry an exam_type.
len(clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["exam_type"].notna()])

58759

In [27]:
# Categorise only the rows that have an exam_type; all other rows keep NaN in both new columns.
has_exam_type = clinical_indication_dataset_redactions["exam_type"].notna()
exam_types = clinical_indication_dataset_redactions.loc[has_exam_type]["exam_type"]
clinical_indication_dataset_redactions.loc[has_exam_type, "body_group"] = exam_types.apply(categorize_body_group)
clinical_indication_dataset_redactions.loc[has_exam_type, "imaging_modality"] = exam_types.apply(categorize_imaging_modality)

In [28]:
# Rows per body_group; value_counts() leaves out missing values by default.
clinical_indication_dataset_redactions["body_group"].value_counts()

body_group
Head              35079
MSK                7919
Neck               7466
Abdomen/Pelvis     4171
Chest              3406
Other               718
Name: count, dtype: int64

In [29]:
# Rows per imaging_modality; value_counts() leaves out missing values by default.
clinical_indication_dataset_redactions["imaging_modality"].value_counts()

imaging_modality
CT       32044
MRI      25666
US         442
Other      378
XR         229
Name: count, dtype: int64

# Reader Study

In [30]:
NUM_PER_BODY_GROUP = 24

# Candidate reports: rows with an exam_type whose body group is not "Other".
has_exam_type = clinical_indication_dataset_redactions["exam_type"].notna()
has_named_body_group = clinical_indication_dataset_redactions["body_group"] != "Other"
radiology_reports = clinical_indication_dataset_redactions[has_exam_type & has_named_body_group]

# Draw the same number of reports from every body group, seeded for repeatability.
reader_study_sampled_reports = (
    radiology_reports
    .groupby("body_group")
    .sample(n=NUM_PER_BODY_GROUP, random_state=RANDOM_STATE)
)

In [31]:
# For every sampled report, collect the report row followed by that patient's
# non-imaging notes dated strictly before the report (newest first).
reader_study_total_frames = []
for i in tqdm.tqdm(range(len(reader_study_sampled_reports))):
    radiology_report = reader_study_sampled_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn].copy()
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) &
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    reader_study_total_frames.append(radiology_report.to_frame().T)
    reader_study_total_frames.append(filtered_patient_notes)

reader_study_total_dataset = pd.concat(reader_study_total_frames)

# This frame covers every sampled report, so it is labelled as the total dataset rather
# than reusing the "Model Evaluation Dataset" header printed for the evaluation subset.
print("Clinical Reader Study Total Dataset")
print("="*20)
print("n_notes", len(reader_study_total_dataset))
print("n_patients", reader_study_total_dataset["patientdurablekey"].nunique())
print("n_radiology reports", len(reader_study_total_dataset[~reader_study_total_dataset["exam_type"].isna()]))

100%|█████████████████████████████████████████████████████████████████████████| 120/120 [00:39<00:00,  3.08it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 27485
n_patients 120
n_radiology reports 120





In [32]:
# Calibration set: 4 reports per body group drawn from the sampled reports.
reader_study_calibration_dataset_reports = (
    reader_study_sampled_reports
    .groupby("body_group")
    .sample(n=4, random_state=RANDOM_STATE)
)

# Evaluation set: 20 reports per body group drawn from what is left after removing
# the calibration rows (matched by index label).
reports_without_calibration = reader_study_sampled_reports.drop(
    reader_study_calibration_dataset_reports.index
)
reader_study_evaluation_dataset_reports = (
    reports_without_calibration
    .groupby("body_group")
    .sample(n=20, random_state=RANDOM_STATE)
)

In [33]:
# Composition of the calibration reports by body group and by imaging modality.
print(reader_study_calibration_dataset_reports["body_group"].value_counts())
print(reader_study_calibration_dataset_reports["imaging_modality"].value_counts())

body_group
Abdomen/Pelvis    4
Chest             4
Head              4
MSK               4
Neck              4
Name: count, dtype: int64
imaging_modality
CT     16
MRI     4
Name: count, dtype: int64


In [34]:
# Composition of the evaluation reports by body group and by imaging modality.
print(reader_study_evaluation_dataset_reports["body_group"].value_counts())
print(reader_study_evaluation_dataset_reports["imaging_modality"].value_counts())

body_group
Abdomen/Pelvis    20
Chest             20
Head              20
MSK               20
Neck              20
Name: count, dtype: int64
imaging_modality
CT     72
MRI    25
US      2
XR      1
Name: count, dtype: int64


In [35]:
# For every calibration report, collect the report row followed by that patient's
# non-imaging notes dated strictly before the report (newest first).
calibration_frames = []
for i in tqdm.tqdm(range(len(reader_study_calibration_dataset_reports))):
    radiology_report = reader_study_calibration_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]

    is_same_patient = clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    patient_notes = clinical_indication_dataset_redactions[is_same_patient].copy()

    is_before_report = patient_notes["deid_service_date"] < radiology_report["deid_service_date"]
    is_not_imaging = patient_notes["note_type"] != "Imaging"
    filtered_patient_notes = (
        patient_notes[is_before_report & is_not_imaging]
        .sort_values(by=["deid_service_date"], ascending=False)
        .reset_index(drop=True)
    )

    calibration_frames.extend([radiology_report.to_frame().T, filtered_patient_notes])

reader_study_calibration_dataset = pd.concat(calibration_frames)

print("Clinical Reader Study Rater Calibration Dataset")
print("=" * 20)
print("n_notes", len(reader_study_calibration_dataset))
print("n_patients", reader_study_calibration_dataset["patientdurablekey"].nunique())
print(
    "n_radiology reports",
    len(reader_study_calibration_dataset[reader_study_calibration_dataset["exam_type"].notna()]),
)

100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:06<00:00,  3.01it/s]

Clinical Reader Study Rater Calibration Dataset
n_notes 5545
n_patients 20
n_radiology reports 20





In [36]:
# For every evaluation report, collect the report row followed by that patient's
# non-imaging notes dated strictly before the report (newest first).
evaluation_frames = []
for i in tqdm.tqdm(range(len(reader_study_evaluation_dataset_reports))):
    radiology_report = reader_study_evaluation_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]

    is_same_patient = clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    patient_notes = clinical_indication_dataset_redactions[is_same_patient].copy()

    is_before_report = patient_notes["deid_service_date"] < radiology_report["deid_service_date"]
    is_not_imaging = patient_notes["note_type"] != "Imaging"
    filtered_patient_notes = (
        patient_notes[is_before_report & is_not_imaging]
        .sort_values(by=["deid_service_date"], ascending=False)
        .reset_index(drop=True)
    )

    evaluation_frames.extend([radiology_report.to_frame().T, filtered_patient_notes])

reader_study_evaluation_dataset = pd.concat(evaluation_frames)

print("Clinical Reader Study Model Evaluation Dataset")
print("=" * 20)
print("n_notes", len(reader_study_evaluation_dataset))
print("n_patients", reader_study_evaluation_dataset["patientdurablekey"].nunique())
print(
    "n_radiology reports",
    len(reader_study_evaluation_dataset[reader_study_evaluation_dataset["exam_type"].notna()]),
)

100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:30<00:00,  3.23it/s]

Clinical Reader Study Model Evaluation Dataset
n_notes 21940
n_patients 100
n_radiology reports 100





In [37]:
def _is_present(value):
    """Return True when `value` is neither a missing marker (None/NaN/NaT/pd.NA) nor otherwise falsy."""
    if value is None:
        return False
    try:
        if pd.isna(value):
            return False
    except (TypeError, ValueError):
        # pd.isna returned an array for a non-scalar input; fall back to plain truthiness.
        pass
    return bool(value)


def generate_note_title(enc_dept_name, note_type, auth_prov_type, deid_service_date):
    """Compose a display title for a clinical note from whichever fields are available.

    Missing fields (None, NaN, NaT, pd.NA, or empty strings) are skipped, so a
    missing value never shows up in the title as text such as "nan".

    Args:
        enc_dept_name: Encounter department name, or a missing value.
        note_type: Note type, or a missing value.
        auth_prov_type: Author provider type, or a missing value.
        deid_service_date: Service date (string or timestamp), or a missing value.

    Returns:
        The most detailed title the present fields allow, in this order of preference:
        "<dept> - <type> (<provider>) | <date>", "<dept> - <type> | <date>",
        "<type> | <date>", "Clinical Note | <date>", and finally "Clinical Note".
    """
    has_dept = _is_present(enc_dept_name)
    has_type = _is_present(note_type)
    has_provider = _is_present(auth_prov_type)
    has_date = _is_present(deid_service_date)

    if not has_date:
        # Every dated format requires the date, so nothing more specific can be built.
        return "Clinical Note"
    if has_dept and has_type and has_provider:
        return f"{enc_dept_name} - {note_type} ({auth_prov_type}) | {deid_service_date}"
    if has_dept and has_type:
        return f"{enc_dept_name} - {note_type} | {deid_service_date}"
    if has_type:
        return f"{note_type} | {deid_service_date}"
    return f"Clinical Note | {deid_service_date}"

In [38]:
# Number of most recent prior notes kept in the truncated per-report list columns.
NUM_NOTES = 10

# One record per calibration report; the DataFrame is built once after the loop
# instead of being enlarged one row at a time.
calibration_records = []
for i in tqdm.tqdm(range(len(reader_study_calibration_dataset_reports))):
    radiology_report = reader_study_calibration_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[
        clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    ]
    # Non-imaging notes dated strictly before the report, newest first.
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) &
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    # Built with a comprehension so the assignment also works when the patient has no
    # earlier notes (DataFrame.apply(axis=1) on an empty frame does not return a Series).
    filtered_patient_notes["note_title"] = [
        generate_note_title(dept_name, note_type, prov_type, service_date)
        for dept_name, note_type, prov_type, service_date in zip(
            filtered_patient_notes["enc_dept_name"],
            filtered_patient_notes["note_type"],
            filtered_patient_notes["auth_prov_type"],
            filtered_patient_notes["deid_service_date"],
        )
    ]
    recent_notes = filtered_patient_notes.head(NUM_NOTES)

    # Series.tolist() always yields a list. Squeezing a one-column frame instead
    # collapses to a bare scalar when exactly one note matches, which breaks .tolist().
    calibration_records.append({
        "patientdurablekey": radiology_report["patientdurablekey"],
        "exam_type": radiology_report["exam_type"],
        "imaging_modality": radiology_report["imaging_modality"],
        "body_group": radiology_report["body_group"],
        "report_deid_note_key": radiology_report["deid_note_key"],
        "report_text": radiology_report["note_text"],
        "original_indication": radiology_report["original_indication"],
        "radiologist_indication": radiology_report["radiologist_indication"],
        "enc_dept_names": recent_notes["enc_dept_name"].tolist(),
        "note_types": recent_notes["note_type"].tolist(),
        "auth_prov_types": recent_notes["auth_prov_type"].tolist(),
        "deid_service_dates": recent_notes["deid_service_date"].tolist(),
        "note_titles": recent_notes["note_title"].tolist(),
        "note_texts": recent_notes["note_text"].tolist(),
        # Unlike the columns above, this one keeps every earlier note, not just the first NUM_NOTES.
        "note_texts_full": filtered_patient_notes["note_text"].tolist(),
    })

# `columns` fixes the column order and keeps the schema even if no records were produced.
reader_study_calibration_dataset_processed = pd.DataFrame(calibration_records, columns=[
    "patientdurablekey",
    "exam_type",
    "imaging_modality",
    "body_group",
    "report_deid_note_key",
    "report_text",
    "original_indication",
    "radiologist_indication",
    "enc_dept_names",
    "note_types",
    "auth_prov_types",
    "deid_service_dates",
    "note_titles",
    "note_texts",
    "note_texts_full"
])
reader_study_calibration_dataset_processed.to_parquet("reader_study_calibration_dataset_processed.parquet")

100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:06<00:00,  3.01it/s]


In [39]:
# Preview the first rows of the processed calibration frame.
reader_study_calibration_dataset_processed.head()

Unnamed: 0,patientdurablekey,exam_type,imaging_modality,body_group,report_deid_note_key,report_text,original_indication,radiologist_indication,enc_dept_names,note_types,auth_prov_types,deid_service_dates,note_titles,note_texts,note_texts_full
0,DB8470625F58D9,CT ABDOMEN/PELVIS WITH CONTRAST,CT,Abdomen/Pelvis,DD7ABE64ECF8D9,CT ABDOMEN/PELVIS WITH CONTRAST 07/10/2021 8:...,abdominal and flank pain.,"UTI for the past week, started abx on Sunday. ...","[EMERGENCY DEPT PARN, EMERGENCY DEPT PARN, EME...","[Discharge Instructions, ED Provider Notes, EC...","[Physician, Resident, Physician, Physician, Ho...","[2021-07-10 22:49:00, 2021-07-10 17:31:00, 202...",[EMERGENCY DEPT PARN - Discharge Instructions ...,[Discharge instructions for ***** ***** You...,[Discharge instructions for ***** ***** You...
1,DCF5D0853DD7F2,CT ABDOMEN/PELVIS WITHOUT CONTRAST,CT,Abdomen/Pelvis,D17CB67DD1BE87,CT ABDOMEN/PELVIS WITHOUT CONTRAST 05/08/20...,Abdominal pain.,Found to have A. fib RVR in the ED,"[None, None, None, 13L GEN SURG, None, 13L GEN...","[ECG, ECG, ECG, H&P, Sticky Note, ED Provider ...","[None, None, None, Resident, None, Resident, N...","[2023-05-08 02:06:00, 2023-05-08 01:05:00, 202...","[ECG | 2023-05-08 02:06:00, ECG | 2023-05-08 0...",[IMPRESSION: Sinus rhythm with Premature atr...,[IMPRESSION: Sinus rhythm with Premature atr...
2,DC634AA2A4EC9B,CT ABDOMEN/PELVIS WITH CONTRAST,CT,Abdomen/Pelvis,DDD766258A0440,CT ABDOMEN/PELVIS WITH CONTRAST 08/14/2020 ...,Tachycardia and fever.,Recent anterior cervical discectomy and fusion...,"[10LS CVT, 10LS CVT, 10LS CVT, 10LS CVT, None,...","[Plan of Care, H&P, ED Provider Notes, ECG, EC...","[Registered Nurse, Resident, Physician Assista...","[2020-08-15 04:07:00, 2020-08-14 20:15:00, 202...",[10LS CVT - Plan of Care (Registered Nurse) | ...,"[ Problem: Pain, Acute / Chronic- Adult Goa...","[ Problem: Pain, Acute / Chronic- Adult Goa..."
3,D5E104EF1812AD,MR PELVIS WITH AND WITHOUT CONTRAST 11/10/2022...,MRI,Abdomen/Pelvis,D64FEB8A951B13,MR PELVIS WITH AND WITHOUT CONTRAST 11/10/2022...,"perianal abscess, anal fistula evaluation",Chronic transsphincteric horseshoe perineal fi...,"[COLORECTAL SURG MB 4, PERIOP MB, PERIOP MB, P...","[Progress Notes, Letter, IP AVS Snapshot, IP A...","[Nurse Practitioner, None, None, None, None, N...","[2022-11-04 00:00:00, 2022-08-30 15:34:00, 202...",[COLORECTAL SURG MB 4 - Progress Notes (Nurse ...,[This is an independent service. The availab...,[This is an independent service. The availab...
4,D34C07E1645953,CT CHEST PULMONARY EMBOLISM (CTPE),CT,Chest,D496F446F36901,CT CHEST PULMONARY EMBOLISM (CTPE) CLINICAL...,Active cancer treatment or palliation Chest p...,alveolar rhabdomyosarcoma,"[EMERGENCY DEPT PARN, EMERGENCY DEPT PARN, EME...","[Consults, Progress Notes, Consult, ED Provide...","[Resident, Physician Assistant, Physician, Res...","[2022-05-11 01:53:00, 2022-05-11 01:41:00, 202...",[EMERGENCY DEPT PARN - Consults (Resident) | 2...,[OPHTHALMOLOGY CONSULTATION Date of initial...,[OPHTHALMOLOGY CONSULTATION Date of initial...


In [40]:
# Number of most recent prior notes kept in the truncated per-report list columns.
NUM_NOTES = 10

# One record per evaluation report; the DataFrame is built once after the loop
# instead of being enlarged one row at a time.
evaluation_records = []
for i in tqdm.tqdm(range(len(reader_study_evaluation_dataset_reports))):
    radiology_report = reader_study_evaluation_dataset_reports.iloc[i]
    patient_mrn = radiology_report["patientdurablekey"]
    patient_notes = clinical_indication_dataset_redactions[
        clinical_indication_dataset_redactions["patientdurablekey"] == patient_mrn
    ]
    # Non-imaging notes dated strictly before the report, newest first.
    filtered_patient_notes = patient_notes[
        (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) &
        (patient_notes["note_type"] != "Imaging")
    ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
    # Built with a comprehension so the assignment also works when the patient has no
    # earlier notes (DataFrame.apply(axis=1) on an empty frame does not return a Series).
    filtered_patient_notes["note_title"] = [
        generate_note_title(dept_name, note_type, prov_type, service_date)
        for dept_name, note_type, prov_type, service_date in zip(
            filtered_patient_notes["enc_dept_name"],
            filtered_patient_notes["note_type"],
            filtered_patient_notes["auth_prov_type"],
            filtered_patient_notes["deid_service_date"],
        )
    ]
    recent_notes = filtered_patient_notes.head(NUM_NOTES)

    # Series.tolist() always yields a list. Squeezing a one-column frame instead
    # collapses to a bare scalar when exactly one note matches, which breaks .tolist().
    evaluation_records.append({
        "patientdurablekey": radiology_report["patientdurablekey"],
        "exam_type": radiology_report["exam_type"],
        "imaging_modality": radiology_report["imaging_modality"],
        "body_group": radiology_report["body_group"],
        "report_deid_note_key": radiology_report["deid_note_key"],
        "report_text": radiology_report["note_text"],
        "original_indication": radiology_report["original_indication"],
        "radiologist_indication": radiology_report["radiologist_indication"],
        "enc_dept_names": recent_notes["enc_dept_name"].tolist(),
        "note_types": recent_notes["note_type"].tolist(),
        "auth_prov_types": recent_notes["auth_prov_type"].tolist(),
        "deid_service_dates": recent_notes["deid_service_date"].tolist(),
        "note_titles": recent_notes["note_title"].tolist(),
        "note_texts": recent_notes["note_text"].tolist(),
        # Unlike the columns above, this one keeps every earlier note, not just the first NUM_NOTES.
        "note_texts_full": filtered_patient_notes["note_text"].tolist(),
    })

# `columns` fixes the column order and keeps the schema even if no records were produced.
reader_study_evaluation_dataset_processed = pd.DataFrame(evaluation_records, columns=[
    "patientdurablekey",
    "exam_type",
    "imaging_modality",
    "body_group",
    "report_deid_note_key",
    "report_text",
    "original_indication",
    "radiologist_indication",
    "enc_dept_names",
    "note_types",
    "auth_prov_types",
    "deid_service_dates",
    "note_titles",
    "note_texts",
    "note_texts_full"
])
reader_study_evaluation_dataset_processed.to_parquet("reader_study_evaluation_dataset_processed.parquet")

100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:33<00:00,  2.96it/s]


In [41]:
# Every earlier note kept per report (lengths of note_texts_full) plus one per report row.
reader_study_calibration_dataset_processed["note_texts_full"].apply(len).sum() + len(reader_study_calibration_dataset_processed)

5545

In [42]:
# Every earlier note kept per report (lengths of note_texts_full) plus one per report row.
reader_study_evaluation_dataset_processed["note_texts_full"].apply(len).sum() + len(reader_study_evaluation_dataset_processed)

21940