In [281]:
import os
import pandas as pd
import tqdm
import regex as re

In [213]:
# Load one shard of the additional-history clinical notes and report how many
# notes and distinct patients it contains.
additional_history_file = "/mnt/sohn2022/UCSF_secure_data_info_commons_clinical_notes/additional_history/notes/shard1/additional_history_0.parquet"
data = pd.read_parquet(additional_history_file)

n_notes_loaded = data.shape[0]
n_patients_loaded = data["patientdurablekey"].nunique()
print("n_notes", n_notes_loaded)
print("n_patients", n_patients_loaded)

n_notes 629457
n_patients 1000


## Note Type Exclusion
We remove all clinical notes that are telephone encounters or patient instructions, or that have a `note_type` of `None`. We also filter out notes with fewer than 20 words, regardless of type.

In [214]:
# Note-type exclusion: keep notes that have a note_type, are not one of the
# excluded types, and contain at least MIN_NOTE_WORDS whitespace-separated words.
EXCLUDED_NOTE_TYPES = ["Telephone Encounter", "Patient Instructions"]
MIN_NOTE_WORDS = 20

has_note_type = data["note_type"].notna()
is_excluded_type = data["note_type"].isin(EXCLUDED_NOTE_TYPES)
# Vectorised word count. A missing note_text yields NaN here, which compares
# False below, so such notes are dropped instead of raising AttributeError as
# a per-row `s.split()` would.
has_enough_words = data["note_text"].str.split().str.len() >= MIN_NOTE_WORDS

filtered_data_note_type = data[has_note_type & ~is_excluded_type & has_enough_words]

In [215]:
# Cohort size remaining after the note-type exclusion.
note_type_patient_ids = filtered_data_note_type["patientdurablekey"]
print("n_notes", filtered_data_note_type.shape[0])
print("n_patients", note_type_patient_ids.nunique())

n_notes 421233
n_patients 1000


In [216]:
# Number of notes / patients removed by the note-type exclusion.
n_notes_before_note_type = len(data)
n_notes_after_note_type = len(filtered_data_note_type)
n_patients_before_note_type = data["patientdurablekey"].nunique()
n_patients_after_note_type = filtered_data_note_type["patientdurablekey"].nunique()

print("n_notes (excluded)", n_notes_before_note_type - n_notes_after_note_type)
print("n_patients (excluded)", n_patients_before_note_type - n_patients_after_note_type)

n_notes (excluded) 208224
n_patients (excluded) 0


## Radiology Report Exclusion
We filter out all radiology reports with an additional history of `None`, along with other miscellaneous preprocessing steps. For each remaining report, we keep only the non-imaging notes dated strictly before the `deid_service_date` of the radiological exam, i.e. notes written on or after the exam are filtered out. We assume that notes written before the exam will not mention the radiology report, which prevents data leakage.

In [264]:
# Radiology report exclusion.
#
# For every patient:
#   1. Select imaging notes that have no authoring provider type, are not
#      preliminary interpretations, and carry a non-"None" ADDITIONAL HISTORY.
#   2. Derive `exam_type` from the text before the first double space.
#   3. Drop reports with an empty or redacted (contains "*") exam type, keep one
#      report per service date, newest first.
#   4. For each report, collect the patient's non-imaging notes dated strictly
#      before the report's service date.
# The report rows and their prior notes are concatenated and de-duplicated by
# `deid_note_key`.

# groupby(sort=False) yields patients in first-appearance order with rows in
# their original order, without re-scanning the whole frame for every patient.
notes_by_patient = filtered_data_note_type.groupby("patientdurablekey", sort=False)

filtered_data_radiology_report = []
for patient_mrn, patient_notes in tqdm.tqdm(notes_by_patient, total=notes_by_patient.ngroups):
    note_text = patient_notes["note_text"]
    radiology_reports = patient_notes[
        (patient_notes["note_type"] == "Imaging") &
        (patient_notes["auth_prov_type"].isna()) &
        (~note_text.str.contains("RADIOLOGY PRELIMINARY INTERPRETATION", regex=False)) &
        (~note_text.str.contains("ADDITIONAL HISTORY: None", regex=False)) &
        (note_text.str.contains("ADDITIONAL HISTORY", regex=False))
    ].copy()

    # The exam type is the leading segment of the report, up to the first
    # double space, with colons removed.
    radiology_reports["exam_type"] = (
        radiology_reports["note_text"]
        .str.split("  ")
        .str[0]
        .str.replace(":", "", regex=False)
        .str.strip()
    )

    # Literal asterisk match (regex=False) replaces the "\*" pattern, which is
    # an invalid escape sequence in a non-raw string literal.
    radiology_reports = radiology_reports[
        (radiology_reports["exam_type"] != "") &
        (~radiology_reports["exam_type"].str.contains("*", regex=False))
    ].drop_duplicates(subset=["deid_service_date"]).sort_values(by=["deid_service_date"], ascending=False)

    is_not_imaging = patient_notes["note_type"] != "Imaging"  # loop-invariant per patient
    for _, radiology_report in radiology_reports.iterrows():
        # Only notes strictly before the exam are kept, to avoid leakage.
        filtered_patient_notes = patient_notes[
            (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) &
            is_not_imaging
        ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
        filtered_data_radiology_report.append(radiology_report.to_frame().T)
        filtered_data_radiology_report.append(filtered_patient_notes)

# A note can precede several reports of the same patient; keep it once.
filtered_data_radiology_report = pd.concat(filtered_data_radiology_report).drop_duplicates(subset=["deid_note_key"])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:38<00:00, 25.97it/s]


In [288]:
# Count the exam types that mention PET.
# `exam_type` is null for non-report rows. Passing na=False treats those rows as
# non-matching, so the frame does not need to be overwritten with
# fillna("None"); overwriting would make later `exam_type.isna()` filters match
# every row.
is_pet_exam = filtered_data_radiology_report["exam_type"].str.contains("PET", na=False)
filtered_data_radiology_report.loc[is_pet_exam, "exam_type"].value_counts()

exam_type
CT NECK WITH CONTRAST (PETCT)                                    37
CT NECK WITHOUT CONTRAST (PETCT)                                  6
CT BRAIN WITH CONTRAST (PETCT), CT NECK WITH CONTRAST (PETCT)     1
Name: count, dtype: int64

In [265]:
# Cohort size remaining after the radiology-report exclusion.
radiology_patient_ids = filtered_data_radiology_report["patientdurablekey"]
print("n_notes", filtered_data_radiology_report.shape[0])
print("n_patients", radiology_patient_ids.nunique())

n_notes 223745
n_patients 746


In [266]:
# Number of notes / patients removed by the radiology-report exclusion,
# relative to the note-type-filtered cohort.
n_notes_before_radiology = len(filtered_data_note_type)
n_notes_after_radiology = len(filtered_data_radiology_report)
n_patients_before_radiology = filtered_data_note_type["patientdurablekey"].nunique()
n_patients_after_radiology = filtered_data_radiology_report["patientdurablekey"].nunique()

print("n_notes (excluded)", n_notes_before_radiology - n_notes_after_radiology)
print("n_patients (excluded)", n_patients_before_radiology - n_patients_after_radiology)

n_notes (excluded) 197488
n_patients (excluded) 254


## Radiology Report without Redaction of Original Indication and Revised Indication

In [284]:
# Sanity check on a single radiology report: print its text and extract the
# CLINICAL HISTORY and ADDITIONAL HISTORY sections.
rows_with_exam_type = filtered_data_radiology_report[filtered_data_radiology_report["exam_type"].notna()]
report_text = rows_with_exam_type.iloc[0]["note_text"]
print("="*20)
print(report_text)

# Group 1: text between "CLINICAL HISTORY:" and "ADDITIONAL HISTORY:".
# Group 2: text after "ADDITIONAL HISTORY:" up to "COMPARISON:" or end of text.
# DOTALL lets `.` cross line breaks so multi-line sections are still captured.
match = re.search(
    r'CLINICAL HISTORY:\s*(.*?)\s*ADDITIONAL HISTORY:\s*(.*?)(?=\s*COMPARISON:|$)',
    report_text,
    flags=re.DOTALL,
)
if match:
    clinical_history = match.group(1)
    additional_history = match.group(2)
    print("Clinical History:", clinical_history)
    print("Additional History:", additional_history)
else:
    # Make a failed extraction visible instead of printing nothing.
    print("No CLINICAL HISTORY / ADDITIONAL HISTORY sections found")


Clinical History: New diagnosis gallbladder cancer s/p lap chole. Multiphase protocol, localized staging
Additional History: Status post laparoscopic cholecystectomy on 07/04/2023 with pathology showing invasive adenocarcinoma.
CT ABDOMEN/PELVIS WITH AND WITHOUT CONTRAST  07/19/2023 10:14 AM    CLINICAL HISTORY: New diagnosis gallbladder cancer s/p lap chole. Multiphase protocol, localized staging    ADDITIONAL HISTORY: Status post laparoscopic cholecystectomy on 07/04/2023 with pathology showing invasive adenocarcinoma.    COMPARISON:  CT urogram 01/17/2023.    TECHNIQUE: CT of the abdomen and pelvis was performed.    MEDICATIONS:  Iohexol 350 - 145 mL - Intravenous    FINDINGS:    Visualized lung bases:  For chest findings, please see the separately dictated report from the CT of the chest of the same date.    Liver:  Unremarkable    Gallbladder: Interval cholecystectomy with mild ill-defined stranding in the gallbladder fossa, likely postsurgical.    Spleen:  Unremarkable    Pancrea

In [None]:
# Preview the imaging notes in the unfiltered data. The original trailing empty
# subscript `[]` was a SyntaxError; show the first rows instead.
data[data["note_type"] == "Imaging"].head()

In [274]:
# NOTE(review): this cell looks unfinished. It walks over every patient in
# `filtered_data_radiology_report` and selects the rows with a non-null
# `exam_type`, but never appends anything to `filtered_data_report_redactions`,
# so the final `pd.concat` is called on an empty list.
# TODO: implement the per-report processing and append its result inside the loop.
patient_mrns = filtered_data_radiology_report["patientdurablekey"].unique()
filtered_data_report_redactions = []
for i in tqdm.tqdm(range(len(patient_mrns))):
    patient_mrn = patient_mrns[i]
    # All retained rows for this patient.
    patient_notes = filtered_data_radiology_report[filtered_data_radiology_report["patientdurablekey"] == patient_mrn]
    # Rows whose `exam_type` is not null.
    # NOTE(review): if nulls in `exam_type` were filled before this cell runs
    # (e.g. by a frame-wide fillna), this mask selects every row — confirm.
    radiology_reports = patient_notes[
        ~patient_notes["exam_type"].isna()
    ].copy()    
    
    
    
    
# NOTE(review): the commented-out code below appears to be copied from the
# earlier radiology-report exclusion loop; presumably a starting point to adapt
# or delete.
#     radiology_reports = radiology_reports[
#         (radiology_reports["exam_type"] != "") &
#         (~radiology_reports["exam_type"].str.contains("\*"))
#     ].drop_duplicates(subset=["deid_service_date"]).sort_values(by=["deid_service_date"], ascending=False)
#     for j in range(len(radiology_reports)):
#         radiology_report = radiology_reports.iloc[j]
#         filtered_patient_notes = patient_notes[
#             (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
#             (patient_notes["note_type"] != "Imaging")
#         ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
#         filtered_data_radiology_report.append(radiology_report.to_frame().T)
#         filtered_data_radiology_report.append(filtered_patient_notes)
# Concatenate the collected frames and keep one row per `deid_note_key`.
filtered_data_report_redactions = pd.concat(filtered_data_report_redactions).drop_duplicates(subset=["deid_note_key"])

Unnamed: 0,patientepicid,patientdurablekey,deid_note_key,deid_note_id,deid_note_csn_id,procedureorderfactid,accessionnumber,accessionnumber2,accessionnumber3,encounterfactid,...,enc_dept_specialty,employeeepicid,providerepicid,auth_prov_type,prov_specialty,deid_service_date,redact_date,deid_note_key_1,note_text,exam_type
0,D7411E99AA0CAA,D861F459C31638,D8C57A78500A5B,D6C4D458AB4FB7,DD3F2EA6C999CE,,,,,,...,,,,,,2023-07-18 06:13:00,2024-08-04,D8C57A78500A5B,IMPRESSION: UNIVERSITY ***** ***** ***** ***...,
1,D7411E99AA0CAA,D861F459C31638,DD19789B5F9493,D9FB503DF66695,DFFEAB06192C18,,,,,DCD2D3E48F7E84,...,General Surgery,,,,,2023-07-18 00:00:00,2024-08-16,DD19789B5F9493,UCSF General Surgery ***** ***** ***** ***...,
2,D7411E99AA0CAA,D861F459C31638,D1327671977522,D115924A420188,D6672A29812214,,,,,DCD2D3E48F7E84,...,General Surgery,DC6C2F7B48C34F,DECF3A4BA76CCE,Nurse Practitioner,Neurosurgery,2023-07-18 00:00:00,2024-08-16,D1327671977522,I saw ***** ***** ***** ***** today at the Uni...,
3,D7411E99AA0CAA,D861F459C31638,D3CD466F39280B,DD6D35B5D7CF54,D0296BC527EB13,,,,,DEC9BD2B59F8C7,...,General Internal Medicine,D64D7A6C29A75A,D777FD8B2EC704,Physician,Internal Medicine,2023-07-18 00:00:00,2024-08-01,D3CD466F39280B,Subjective ***** ***** ***** ***** is a...,
4,D7411E99AA0CAA,D861F459C31638,D698123E0E469D,D1FF7432EA01DA,D715FB7D7E2841,,,,,DFC08C24A954B1,...,Urology,D339379D5BC950,DEFE09570E7D09,Physician,Urology,2023-07-14 00:00:00,2024-07-22,D698123E0E469D,I am requesting an eConsult for this ***** y.o...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,D3941E3590B943,D212EC98FC02A1,D0F2755A28CF11,D946F8808F5560,D0D5521C7DD016,,,,,D551AEF9AE20C0,...,Emergency Medicine,D900D4EC1606FE,D217C6D0E03C73,Physician,Hospital Medicine,2020-11-05 02:47:00,2024-08-15,D0F2755A28CF11,HOSPITAL MEDICINE ED TRIAGE ATTENDING ***** **...,
2,D3941E3590B943,D212EC98FC02A1,DAD0AB8C8F2641,DD3EE8601124AF,DC579CD26837E4,,,,,D551AEF9AE20C0,...,Emergency Medicine,D947C2768F73BB,DB424CCE0F272C,Physician,Neurology,2020-11-04 21:05:00,2024-08-06,DAD0AB8C8F2641,NEUROLOGY INITIAL CONSULT NOTE Consult req...,
3,D3941E3590B943,D212EC98FC02A1,DB6C40ACD31C1B,D53C396E7B7A34,D7821257F1E26D,,,,,D551AEF9AE20C0,...,Emergency Medicine,DE935AFEFDF1BA,DB4C4D709BBB29,Resident,UCSF,2020-11-04 18:47:00,2024-07-19,DB6C40ACD31C1B,ED Attendings History Chief Compla...,
4,D3941E3590B943,D212EC98FC02A1,DD3C11633F4C9B,,D4F56B110FEA4E,D18D436277897E,DE89F186D9E141,,,D551AEF9AE20C0,...,Emergency Medicine,DC1397DC54DB82,DF537336EC13CF,Physician,Emergency Medicine,2020-11-04 17:01:00,2024-08-09,DD3C11633F4C9B,"Abnormal record""\n\nNormal sinus rhythm""\n\nRi...",
