In [1]:
import os
import pandas as pd
import tqdm
import regex as re

  from pandas.core import (


In [100]:
print("="*20)
print("UCSF Information Commons De-identified Clinical Notes Additional History Subset")
print("="*20)

UCSF Information Commons De-identified Clinical Notes Additional History Subset


In [2]:
additional_history_file = "/mnt/sohn2022/UCSF_secure_data_info_commons_clinical_notes/additional_history/notes/shard1/additional_history_0.parquet"
data = pd.read_parquet(additional_history_file)
print("n_notes", len(data))
print("n_patients", data["patientdurablekey"].nunique())

n_notes 629457
n_patients 1000


## Note Type Exclusion
We remove all clinical notes that are telephone encounters of patient instructions or have a `note_type` of `None`. We also filter out notes that have < 20 words, regardless of type.

In [101]:
print("="*20)
print("Note Type Exclusion")
print("="*20)

Note Type Exclusion


In [3]:
filtered_data_note_type = data[
    ~(data["note_type"].isna()) & 
    ~(data["note_type"].isin(["Telephone Encounter", "Patient Instructions"])) & 
    ~(data["note_type"].isna()) &
    ~(data["note_text"].apply(lambda s: len(s.split()) < 20))
]

In [4]:
print("n_notes", len(filtered_data_note_type))
print("n_patients", filtered_data_note_type["patientdurablekey"].nunique())

n_notes 421233
n_patients 1000


In [97]:
print("-"*20)

--------------------


In [5]:
print("n_notes (excluded)", len(data) - len(filtered_data_note_type))
print("n_patients (excluded)", data["patientdurablekey"].nunique() - filtered_data_note_type["patientdurablekey"].nunique())

n_notes (excluded) 208224
n_patients (excluded) 0


## Radiology Report Exclusion
We filter out all the radiology reports with an additional history of `None` and other miscellaneous preprocessing steps. We also make sure to filter out all notes that occur before the `deid_service_date` of the radiological exam. We assume that all previous notes that will not mention the radiology report to prevent data leakage.

In [102]:
print("="*20)
print("Radiology Report Exclusion")
print("="*20)

Radiology Report Exclusion


In [64]:
patient_mrns = filtered_data_note_type["patientdurablekey"].unique()
filtered_data_radiology_report = []
for i in tqdm.tqdm(range(len(patient_mrns))):
    patient_mrn = patient_mrns[i]
    patient_notes = filtered_data_note_type[filtered_data_note_type["patientdurablekey"] == patient_mrn]
    radiology_reports = patient_notes[
        (patient_notes["note_type"] == "Imaging") &
        (patient_notes["auth_prov_type"].isna()) & 
        (~patient_notes["note_text"].str.contains("RADIOLOGY PRELIMINARY INTERPRETATION")) &
        (~patient_notes["note_text"].str.contains("ADDITIONAL HISTORY: None")) &
        (patient_notes["note_text"].str.contains("ADDITIONAL HISTORY"))  
    ].copy()    
    if len(radiology_reports) > 0:
        radiology_reports.loc[:, "exam_type"] = radiology_reports["note_text"].str.split("  ").apply(lambda l: l[0]).str.replace(":", "", regex=False).str.strip()
        def extract_history(report_text):
            match = re.search(r'(?:CLINICAL HISTORY:|INDICATION \(as provided by referring clinician\):)\s*(.*?)\s*ADDITIONAL HISTORY:\s*(.*?)(?:\s*COMPARISON:|$)', report_text)
            if match:
                clinical_history = match.group(1).strip()
                additional_history = match.group(2).strip()
                return clinical_history, additional_history
            return None, None
        radiology_reports[['original_indication', 'radiologist_indication']] = radiology_reports['note_text'].apply(lambda x: pd.Series(extract_history(x)))
        radiology_reports = radiology_reports[
            (radiology_reports["exam_type"] != "") &
            (~radiology_reports["exam_type"].str.contains("\*")) &
            (~radiology_reports["radiologist_indication"].isna())
        ].drop_duplicates(subset=["deid_service_date"]).sort_values(by=["deid_service_date"], ascending=False)
    for j in range(len(radiology_reports)):
        radiology_report = radiology_reports.iloc[j]
        filtered_patient_notes = patient_notes[
            (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
            (patient_notes["note_type"] != "Imaging")
        ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
        filtered_data_radiology_report.append(radiology_report.to_frame().T)
        filtered_data_radiology_report.append(filtered_patient_notes)
filtered_data_radiology_report = pd.concat(filtered_data_radiology_report).drop_duplicates(subset=["deid_note_key"])

100%|███████████████████████████████████████| 1000/1000 [00:38<00:00, 26.10it/s]


In [89]:
n_filtered_radiology_reports = len(filtered_data_radiology_report[~filtered_data_radiology_report["exam_type"].isna()])

print("n_notes", len(filtered_data_radiology_report))
print("n_patients", filtered_data_radiology_report["patientdurablekey"].nunique())
print("n_radiology reports ", n_filtered_radiology_reports)

n_notes 219052
n_patients 727
n_radiology reports  6135


In [99]:
print("-"*20)

--------------------


In [70]:
print("n_notes (excluded)", len(filtered_data_note_type) - len(filtered_data_radiology_report))
print("n_patients (excluded)", filtered_data_note_type["patientdurablekey"].nunique() - filtered_data_radiology_report["patientdurablekey"].nunique())

n_notes (excluded) 202181
n_patients (excluded) 273


In [108]:
filtered_data_radiology_report.to_parquet("clinical_indication_dataset.parquet")

## Radiology Report without Redaction of Original Indication and Revised Indication

In [103]:
print("="*20)
print("Radiology Report without Redaction of Original Indication and Revised Indication")
print("="*20)

Radiology Report without Redaction of Original Indication and Revised Indication


In [81]:
patient_mrns = filtered_data_radiology_report["patientdurablekey"].unique()
filtered_data_report_redactions = []
for i in tqdm.tqdm(range(len(patient_mrns))):
    patient_mrn = patient_mrns[i]
    patient_notes = filtered_data_radiology_report[filtered_data_radiology_report["patientdurablekey"] == patient_mrn]
    radiology_reports = patient_notes[
        (~patient_notes["exam_type"].isna()) & 
        (~patient_notes["original_indication"].astype(str).str.contains("\*")) &
        (~patient_notes["radiologist_indication"].astype(str).str.contains("\*"))
    ].copy()    
    for j in range(len(radiology_reports)):
        radiology_report = radiology_reports.iloc[j]
        filtered_patient_notes = patient_notes[
            (patient_notes["deid_service_date"] < radiology_report["deid_service_date"]) & 
            (patient_notes["note_type"] != "Imaging")
        ].sort_values(by=["deid_service_date"], ascending=False).reset_index(drop=True)
        filtered_data_report_redactions.append(radiology_report.to_frame().T)
        filtered_data_report_redactions.append(filtered_patient_notes)
filtered_data_report_redactions = pd.concat(filtered_data_report_redactions).drop_duplicates(subset=["deid_note_key"])

100%|█████████████████████████████████████████| 727/727 [00:11<00:00, 61.16it/s]


In [87]:
n_radiology_report_redactions = len(filtered_data_report_redactions[~filtered_data_report_redactions["exam_type"].isna()])

print("n_notes", len(filtered_data_report_redactions))
print("n_patients", filtered_data_report_redactions["patientdurablekey"].nunique())
print("n_radiology reports ", n_radiology_report_redactions)

n_notes 156666
n_patients 536
n_radiology reports  2655


In [98]:
print("-"*20)

--------------------


In [83]:
print("n_notes (excluded)", len(filtered_data_radiology_report) - len(filtered_data_report_redactions))
print("n_patients (excluded)", filtered_data_radiology_report["patientdurablekey"].nunique() - filtered_data_report_redactions["patientdurablekey"].nunique())

n_notes (excluded) 62386
n_patients (excluded) 191


In [109]:
filtered_data_report_redactions.to_parquet("clinical_indication_dataset_without_redactions.parquet")

## Clinical Indication Dataset

In [2]:
clinical_indication_dataset = pd.read_parquet("clinical_indication_dataset_shard6.parquet")

In [18]:
def categorize_body_group(exam_type):
    categories = {
        "Head": ["HEAD", "BRAIN", "STROKE PROTOCOL", "NEURO"],
        "Abdomen/Pelvis": ["ABDOMEN", "PELVIS", "PROSTATE", "RENAL", "MRCP"],
        "MSK": ["KNEE", "HIP", "SHOULDER", "EXTREMITY", "SPINE", "LUMBAR", 
                "UROGRAM", "CERVICAL", "SCOLIOSIS", "JOINT"],
        "Chest": ["CHEST", "CARDIAC", "PULMONARY EMBOLISM", "CTA", "HEART"],
        "Neck": ["FACE", "NECK", "CRANIOFACIAL", "MAXILLOFACIAL", "ORBIT", "TEMPORAL BONE", 
                 "SINUS", "THYROID", "MANDIBLE", "SKULL"]
    }
    
    for category, keywords in categories.items():
        if any(keyword in exam_type for keyword in keywords):
            return category
    return "Other"

def categorize_imaging_modality(exam_type):
    categories = {
        "MRI": ["MR", "MRI"],
        "CT": ["CT", "CTA"],
        "XR": ["XR"],
        "US": ["US"]
    }
    
    for category, keywords in categories.items():
        if any(keyword in exam_type for keyword in keywords):
            return category
    return "Other"

clinical_indication_dataset.loc[clinical_indication_dataset["exam_type"].notna(), "body_group"] = clinical_indication_dataset.loc[clinical_indication_dataset["exam_type"].notna()]["exam_type"].apply(categorize_body_group)
clinical_indication_dataset.loc[clinical_indication_dataset["exam_type"].notna(), "imaging_modality"] = clinical_indication_dataset.loc[clinical_indication_dataset["exam_type"].notna()]["exam_type"].apply(categorize_imaging_modality)

In [21]:
clinical_indication_dataset["body_group"].value_counts(normalize=True) * 100

body_group
Head              49.586777
Neck              26.363636
MSK               10.853994
Abdomen/Pelvis     7.162534
Chest              4.545455
Other              1.487603
Name: proportion, dtype: float64

In [12]:
clinical_indication_dataset["imaging_modality"] == 

imaging_modality
CT      2065
MRI     1489
US        29
XR        29
None      18
Name: count, dtype: int64

In [92]:
clinical_indication_dataset[
    (~clinical_indication_dataset["body_group"].isna())
].iloc[0]["note_text"]

'MR BRAIN WITH AND WITHOUT CONTRAST:  03/18/2023 10:25 AM    INDICATION (as provided by referring clinician): history of craniotomy, has titanium in skull  Diplopia  Ptosis, bilateral    ADDITIONAL HISTORY: History of 2 craniotomies, has titanium in skull.    Additional clinical history: history of frontal bone osteomyelitis and associated meningoencephalitis, s/p craniotomy, ongoing emotional changes and difficulty with school since then, with a FH of his twin brother and paternal uncle with ptosis, diagnosed as MG, who presents with 1 mo of ptosis and diplopia. On exam severe bilateral ptosis and left eye *****, right eye only abduction intact but with nystagmus.     COMPARISON: MR brain 10/16/2019, 09/01/2019    TECHNIQUE: Multiple sequences through the brain were acquired at 3.0 tesla.    MEDICATIONS:  Dotarem - 23.9 mL - Intravenous    FINDINGS:    Prior right frontal craniotomy. Similar extent of right frontal subcortical FLAIR hyperintensity and encephalomalacia and gliosis. ***

In [101]:
clinical_indication_dataset

Unnamed: 0,patientepicid,patientdurablekey,deid_note_key,deid_note_id,deid_note_csn_id,procedureorderfactid,accessionnumber,accessionnumber2,accessionnumber3,encounterfactid,...,prov_specialty,deid_service_date,redact_date,deid_note_key_1,note_text,exam_type,original_indication,radiologist_indication,body_group,imaging_modality
192289,DFC1F318D5C202,DAB2C4F4F8B502,DC01C376258351,DD5E7F03730F9E,D21A6411F90443,,D7A3854B517269,,,,...,,2023-03-21 13:05:00,2024-08-04,DC01C376258351,MR BRAIN WITH AND WITHOUT CONTRAST: 03/18/202...,MR BRAIN WITH AND WITHOUT CONTRAST,"history of craniotomy, has titanium in skull ...","History of 2 craniotomies, has titanium in sku...",Head,MRI
0,DFC1F318D5C202,DAB2C4F4F8B502,D0DD6CE23342F5,DA4FE3E0461CC5,DFA4AA1A0D7023,,,,,DE8E6B862005CC,...,Pediatric Neurology,2023-03-11 00:00:00,2024-07-29,D0DD6CE23342F5,Subjective ***** ***** is a ***** y.o. *...,,,,,
1,DFC1F318D5C202,DAB2C4F4F8B502,DBD4D1EC158E9D,D7DD23E5C6D6BB,D2A607582E48FA,,,,,DE8E6B862005CC,...,,2023-03-11 00:00:00,2024-07-29,DBD4D1EC158E9D,BCH ***** ***** ***** ***** ***** ***** ...,,,,,
2,DFC1F318D5C202,DAB2C4F4F8B502,D6D89DA20440D5,D09224EA347862,D3D16EFD558B25,,,,,DF043595F0DF98,...,,2023-02-25 00:00:00,2024-07-25,D6D89DA20440D5,BCH ***** ***** ***** ***** ***** ***** ...,,,,,
3,DFC1F318D5C202,DAB2C4F4F8B502,DE69EB7BBA1705,,D805CC9A4C4516,,,,,D19C0568557A82,...,Pediatric Neurology,2023-02-11 11:37:00,2024-08-10,DE69EB7BBA1705,"Performed at: 03 - Labcorp ***** *****""\n\n**...",,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,D33976366DCC17,DC7C985786BF03,D14537E2932E56,DEE839BF19A5C4,DE8FA593264131,,,,,D1F63BA504867A,...,,2015-12-31 00:00:00,2024-07-31,D14537E2932E56,12/31/2015 ***** ***** ***** ***** ***** ...,,,,,
89,D33976366DCC17,DC7C985786BF03,D8079347536E6C,DEAB40422A1413,DC1A20B9279C60,,,,,D6CC95F288B853,...,Neurosurgery,2015-09-19 00:00:00,2024-08-02,D8079347536E6C,Thank-you for referring ***** ***** to the Uni...,,,,,
90,D33976366DCC17,DC7C985786BF03,D3B64B8D25932F,DDF85CA0638175,DE6EB01EB963E1,,,,,D6CC95F288B853,...,,2015-09-19 00:00:00,2024-07-30,D3B64B8D25932F,UCSF Medical Center - After Visit Summary ...,,,,,
91,D33976366DCC17,DC7C985786BF03,D8BDF965F4C13C,D47D8C9547DED3,D4BF60272635B2,,,,,D6CC95F288B853,...,,2015-09-19 00:00:00,2024-07-28,D8BDF965F4C13C,09/19/2015 ***** ***** ***** ***** *****...,,,,,


In [100]:
clinical_indication_dataset[
    (~clinical_indication_dataset["body_group"].isna())
]["note_text"].iloc[0]

'MR BRAIN WITH AND WITHOUT CONTRAST:  03/18/2023 10:25 AM    INDICATION (as provided by referring clinician): history of craniotomy, has titanium in skull  Diplopia  Ptosis, bilateral    ADDITIONAL HISTORY: History of 2 craniotomies, has titanium in skull.    Additional clinical history: history of frontal bone osteomyelitis and associated meningoencephalitis, s/p craniotomy, ongoing emotional changes and difficulty with school since then, with a FH of his twin brother and paternal uncle with ptosis, diagnosed as MG, who presents with 1 mo of ptosis and diplopia. On exam severe bilateral ptosis and left eye *****, right eye only abduction intact but with nystagmus.     COMPARISON: MR brain 10/16/2019, 09/01/2019    TECHNIQUE: Multiple sequences through the brain were acquired at 3.0 tesla.    MEDICATIONS:  Dotarem - 23.9 mL - Intravenous    FINDINGS:    Prior right frontal craniotomy. Similar extent of right frontal subcortical FLAIR hyperintensity and encephalomalacia and gliosis. ***

In [79]:
clinical_indication_dataset.loc[clinical_indication_dataset["exam_type"].notna()]

Unnamed: 0,patientepicid,patientdurablekey,deid_note_key,deid_note_id,deid_note_csn_id,procedureorderfactid,accessionnumber,accessionnumber2,accessionnumber3,encounterfactid,...,auth_prov_type,prov_specialty,deid_service_date,redact_date,deid_note_key_1,note_text,exam_type,original_indication,radiologist_indication,body_group
192289,DFC1F318D5C202,DAB2C4F4F8B502,DC01C376258351,DD5E7F03730F9E,D21A6411F90443,,D7A3854B517269,,,,...,,,2023-03-21 13:05:00,2024-08-04,DC01C376258351,MR BRAIN WITH AND WITHOUT CONTRAST: 03/18/202...,MR BRAIN WITH AND WITHOUT CONTRAST,"history of craniotomy, has titanium in skull ...","History of 2 craniotomies, has titanium in sku...",Head
218024,DFC1F318D5C202,DAB2C4F4F8B502,D7A790BBFB05D5,D1ACA9AF9C0C38,DCDDAB24D6D9E8,,D7A3854B517269,,,,...,,,2023-03-18 11:09:00,2024-07-25,D7A790BBFB05D5,MR BRAIN WITH AND WITHOUT CONTRAST: 03/18/202...,MR BRAIN WITH AND WITHOUT CONTRAST,"history of craniotomy, has titanium in skull ...","History of 2 craniotomies, has titanium in sku...",Head
168777,DBF5663B57CAFF,DBCE3F7A3B1347,DF8C4FD2D9B71E,D4292C84EBB0B5,DD19B225593CDC,,D78FEA153219E2,,,,...,,,2023-08-21 12:06:00,2024-07-25,DF8C4FD2D9B71E,"MRI BRAIN, MRA BRAIN, MRA NECK - WITHOUT CONTR...","MRI BRAIN, MRA BRAIN, MRA NECK - WITHOUT CONTR...",r/o stroke,***** on 08/18/23,Head
15321,DBF5663B57CAFF,DBCE3F7A3B1347,D92F83561112A8,DB411297BA01E7,D3C66064EF291F,,D78FEA153219E2,,,,...,,,2023-08-21 11:07:00,2024-08-05,D92F83561112A8,"MRI BRAIN, MRA BRAIN, MRA NECK - WITHOUT CONTR...","MRI BRAIN, MRA BRAIN, MRA NECK - WITHOUT CONTR...",r/o stroke,***** on 08/18/23,Head
64001,DBF5663B57CAFF,DBCE3F7A3B1347,D0F40E73D8616B,D576D98C3FD408,DC5A87FC8362CA,,D78FEA153219E2,,,,...,,,2023-08-21 10:10:00,2024-07-23,D0F40E73D8616B,"MRI BRAIN, MRA BRAIN, MRA NECK - WITHOUT CONTR...","MRI BRAIN, MRA BRAIN, MRA NECK - WITHOUT CONTR...",r/o stroke,***** on 08/18/23,Head
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206543,DD7DC4FAFE58A0,D966487D0EB461,DCB5B30901D9FC,D15FE88CE55F79,D82E262BBAB61D,DC446A4073AB16,DE1451AC2CD52B,,,,...,,,2020-04-05 14:05:00,2024-07-22,DCB5B30901D9FC,MR BRAIN WITH AND WITHOUT CONTRAST : 04/04/202...,MR BRAIN WITH AND WITHOUT CONTRAST 04/04/2020...,6 mo. S/P ICH and resection of cavernoma in th...,,Head
259911,D8378B2FED1830,D71EBF868D31BF,D85A5961F8C6E2,DE673A0BAAB67F,D6CC43905ABAEC,D50354B13B18FA,D3820B223E4C55,,,,...,,,2022-06-08 05:06:00,2024-08-05,D85A5961F8C6E2,MR LUMBAR SPINE WITH AND WITHOUT CONTRAST: 06...,MR LUMBAR SPINE WITH AND WITHOUT CONTRAST,L-spine neoplasm suspected Lumbar radiculopat...,,MSK
171659,DE16A432286C65,DC4B53B3B85BA4,D58C7B33916F37,D52C82F9FF584C,DA04E1D4594E28,D91D875D70EFBC,D2D03D989E4EB4,,,,...,,,2020-04-02 04:05:00,2024-07-28,D58C7B33916F37,US PREGNANCY < 14 WEEKS OB (RADIOLOGY PERFORME...,US PREGNANCY < 14 WEEKS OB (RADIOLOGY PERFORMED),spotting in early pregnancy,Small volume spotting 1 week ago.,
201778,D33976366DCC17,DC7C985786BF03,DC1549DE08530A,DD414998C5DAAB,D0A63EE6E80F1D,DDE6073EFAC92F,DB01813D1C0E18,,,,...,,,2019-09-08 03:05:00,2024-07-21,DC1549DE08530A,MR CERVICAL SPINE WITH AND WITHOUT CONTRAST : ...,MR CERVICAL SPINE WITH AND WITHOUT CONTRAST 0...,Ependymoma No iodinated contrast contraindica...,Treatment History: 1. October 2014: Presented...,MSK


In [75]:
clinical_indication_dataset["exam_type"].value_counts()

exam_type
MR BRAIN WITH AND WITHOUT CONTRAST                                         918
CT BRAIN WITHOUT CONTRAST                                                  789
CT SINUSES WITHOUT CONTRAST                                                245
CT ABDOMEN/PELVIS WITH CONTRAST                                            224
CT NECK WITH CONTRAST                                                      223
                                                                          ... 
CT SINUSES WITHOUT CONTRAST  12/17/2019 403 PM                               1
MRI BRAIN, MRA BRAIN - WITHOUT AND WITH CONTRAST                             1
CT ANGIOGRAM ABDOMEN/PELVIS WITH AND WITHOUT CONTRAST 11/01/2023 800 AM      1
MR THORACIC SPINE WITH AND WITHOUT CONTRAST  03/15/2020 318 PM               1
MR CERVICAL SPINE WITH AND WITHOUT CONTRAST  09/07/2019 714 PM               1
Name: count, Length: 336, dtype: int64

In [62]:
clinical_indication_dataset[clinical_indication_dataset["body_group"] == "None"]["exam_type"].value_counts()

exam_type
1.                                                  195
CT GUIDED EPIDURAL BLOOD PATCH                       40
XR SHUNT EVALUATION                                  15
CT ESOPHAGRAM                                        13
CT GUIDED SYNOVIAL FLUID ASPIRATION OR INJECTION     12
                                                   ... 
XR HUMERUS 2 VIEWS, LEFT, XR ELBOW 2 VIEWS, LEFT      1
MRI Pelvis without and with contrast                  1
MR WRIST WITH AND WITHOUT CONTRAST, RIGHT             1
CT GUIDED FIBRIN GLUE and EPIDURAL BLOOD PATCH        1
MR IAC WITH AND WITHOUT CONTRAST                      1
Name: count, Length: 118, dtype: int64