In [2]:
import os
import pandas as pd
import tqdm

  from pandas.core import (


In [213]:
# Load a single shard of the additional-history clinical notes and report
# how many note rows and how many distinct patients it contains.
additional_history_file = "/mnt/sohn2022/UCSF_secure_data_info_commons_clinical_notes/additional_history/notes/shard1/additional_history_0.parquet"
data = pd.read_parquet(additional_history_file)
print("n_notes", data.shape[0])
print(
    "n_patients",
    data["patientdurablekey"].nunique(),
)

n_notes 629457
n_patients 1000


## Note Type Exclusion
We remove all clinical notes that are telephone encounters or patient instructions, as well as notes that have a `note_type` of `None`. We also filter out notes that have fewer than 20 words, regardless of type.

In [214]:
# Note-type exclusion.
# A note is kept only if it
#   * has a non-null `note_type`,
#   * is not one of the excluded note types, and
#   * contains at least MIN_NOTE_WORDS whitespace-separated words.
EXCLUDED_NOTE_TYPES = ["Telephone Encounter", "Patient Instructions"]
MIN_NOTE_WORDS = 20

filtered_data_note_type = data[
    data["note_type"].notna()
    & ~data["note_type"].isin(EXCLUDED_NOTE_TYPES)
    # `.str.split().str.len()` counts words without a Python-level lambda.
    # A null `note_text` yields NaN, and `NaN >= MIN_NOTE_WORDS` is False,
    # so such rows are dropped instead of raising AttributeError.
    & (data["note_text"].str.split().str.len() >= MIN_NOTE_WORDS)
]

In [215]:
# Cohort size after the note-type exclusion: note rows and distinct patients.
print("n_notes", filtered_data_note_type.shape[0])
print(
    "n_patients",
    filtered_data_note_type["patientdurablekey"].nunique(),
)

n_notes 421233
n_patients 1000


In [216]:
# How many notes / distinct patients the note-type exclusion removed,
# computed as (raw shard) minus (filtered frame).
print(
    "n_notes (excluded)",
    data.shape[0] - filtered_data_note_type.shape[0],
)
print(
    "n_patients (excluded)",
    data["patientdurablekey"].nunique()
    - filtered_data_note_type["patientdurablekey"].nunique(),
)

n_notes (excluded) 208224
n_patients (excluded) 0


## Radiology Report Exclusion
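We filter out all the radiology reports with an additional history of `None`, and apply other miscellaneous preprocessing steps. For each patient, we then take the latest remaining radiology report and keep only the notes dated strictly before its `deid_service_date`; notes on or after that date are filtered out, and patients with no qualifying radiology report are dropped. We assume that notes written before the radiological exam will not mention the radiology report, which prevents data leakage.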

In [217]:
# Radiology-report exclusion, applied per patient.
#
# For every patient in `filtered_data_note_type`:
#   1. Select the candidate radiology reports: "Imaging" notes with a null
#      `auth_prov_type` that contain an "ADDITIONAL HISTORY" section which is
#      not "None", and that are not preliminary interpretations.
#   2. Derive `exam_type` from the text before the first double space
#      (colons removed, whitespace stripped) and drop reports whose exam type
#      is empty or contains a literal "*".
#   3. Take the most recent remaining report and keep only the patient's
#      notes dated strictly before its `deid_service_date`, newest first.
# Patients with no qualifying report contribute no rows.
patient_mrns = filtered_data_note_type["patientdurablekey"].unique()

# A single groupby replaces re-scanning the full frame once per patient.
# `sort=False` keeps patients in first-appearance order, matching `unique()`.
notes_by_patient = filtered_data_note_type.groupby("patientdurablekey", sort=False)

kept_notes_per_patient = []
for patient_mrn, patient_notes in tqdm.tqdm(notes_by_patient, total=notes_by_patient.ngroups):
    note_text = patient_notes["note_text"]
    # All patterns are literal strings, so `regex=False`; `na=False` makes a
    # null note text count as "no match" instead of breaking the `~` negation.
    radiology_reports = patient_notes[
        (patient_notes["note_type"] == "Imaging")
        & patient_notes["auth_prov_type"].isna()
        & ~note_text.str.contains("RADIOLOGY PRELIMINARY INTERPRETATION", regex=False, na=False)
        & ~note_text.str.contains("ADDITIONAL HISTORY: None", regex=False, na=False)
        & note_text.str.contains("ADDITIONAL HISTORY", regex=False, na=False)
    ].copy()
    if radiology_reports.empty:
        continue

    radiology_reports["exam_type"] = (
        radiology_reports["note_text"]
        .str.split("  ")
        .str[0]
        .str.replace(":", "", regex=False)
        .str.strip()
    )
    radiology_reports = radiology_reports[
        (radiology_reports["exam_type"] != "")
        # Literal asterisk; the previous "\*" was an invalid string escape.
        & ~radiology_reports["exam_type"].str.contains("*", regex=False)
    ].sort_values(by=["deid_service_date"], ascending=False)
    if radiology_reports.empty:
        continue

    # Rows are sorted newest-first, so the first row is the latest report.
    latest_radiology_report = radiology_reports.iloc[0]
    filtered_patient_notes = (
        patient_notes[patient_notes["deid_service_date"] < latest_radiology_report["deid_service_date"]]
        .sort_values(by=["deid_service_date"], ascending=False)
        .reset_index(drop=True)
    )
    kept_notes_per_patient.append(filtered_patient_notes)

if kept_notes_per_patient:
    filtered_data_radiology_report = pd.concat(kept_notes_per_patient)
else:
    # `pd.concat([])` raises ValueError; fall back to an empty frame that
    # still has the expected columns so the following cells keep working.
    filtered_data_radiology_report = filtered_data_note_type.iloc[0:0].copy()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:32<00:00, 30.50it/s]


In [220]:
# Cohort size after the radiology-report exclusion: note rows and distinct patients.
print("n_notes", filtered_data_radiology_report.shape[0])
print(
    "n_patients",
    filtered_data_radiology_report["patientdurablekey"].nunique(),
)

n_notes 296392
n_patients 745


In [221]:
# How many notes / distinct patients the radiology-report exclusion removed,
# computed as (note-type-filtered frame) minus (radiology-filtered frame).
print(
    "n_notes (excluded)",
    filtered_data_note_type.shape[0] - filtered_data_radiology_report.shape[0],
)
print(
    "n_patients (excluded)",
    filtered_data_note_type["patientdurablekey"].nunique()
    - filtered_data_radiology_report["patientdurablekey"].nunique(),
)

n_notes (excluded) 124841
n_patients (excluded) 255


## Radiology Report Datasets