In [2]:
import pandas as pd
import regex as re
from sklearn.model_selection import train_test_split

In [3]:
UCSF_reports = pd.read_csv('data/raw/secure_UCSF_radreports__CT_MR_MRI_US_PET__01-01-2021__10-22-2022.csv')
UCSF_reports = UCSF_reports[UCSF_reports['Modality'].isin(['CT', 'MRI', 'US'])].reset_index(drop=True)
len(UCSF_reports)

372716

## Exclude Studies 

In [4]:
# Reports with same accessions
UCSF_duplicates = UCSF_reports[UCSF_reports.duplicated(subset='Accession Number', keep=False)]
UCSF_reports = UCSF_reports.drop_duplicates(subset='Accession Number', keep=False).reset_index(drop=True)
print('n =', len(UCSF_duplicates))

n = 6


In [5]:
# Reports that are non-reportable studies 
UCSF_nonreportable_reports = UCSF_reports[UCSF_reports['Report Text'].str.contains('non-reportable study')].reset_index(drop=True)
UCSF_reports = UCSF_reports[~UCSF_reports['Report Text'].str.contains('non-reportable study')].reset_index(drop=True)
print('n =', len(UCSF_nonreportable_reports))

n = 15803


In [6]:
# Reports located in progress notes
OTHER_SOURCES_PATTERNS = [
    'Refer to procedure/progress notes',
    'See clinic note for results'
]

pattern = '|'.join([re.escape(p) for p in OTHER_SOURCES_PATTERNS])

# Filtering based on the pattern
UCSF_relocated_reports = UCSF_reports[UCSF_reports['Report Text'].str.contains(pattern, regex=True)].reset_index(drop=True)
UCSF_reports = UCSF_reports[~UCSF_reports['Report Text'].str.contains(pattern, regex=True)].reset_index(drop=True)
print('n =', len(UCSF_relocated_reports))

n = 715


In [7]:
# Reports with no separate findings and impressions
def extract_impression(report_text):
    pattern = r'SUMMARY(.*)'
    match = re.search(pattern, report_text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip()
    
    pattern = r'SUMMARY:(.*?)END OF IMPRESSION'
    match = re.search(pattern, report_text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip()
    
    pattern = r'Impression:(.*?)END OF IMPRESSION'
    match = re.search(pattern, report_text, re.DOTALL)
    if match:
        return match.group(1).strip()
    
    pattern = r'IMPRESSION(.*?)Report dictated by'
    match = re.search(pattern, report_text, re.DOTALL)
    if match:
        return match.group(1).strip()
    
    sections = report_text.split('\n\n')
    if len(sections) == 2 or len(sections) == 3:
        return sections[1]
    
    sections = report_text.split('\n"\n')
    if len(sections) == 2:
        return sections[1]
    
    sections = report_text.split('\n')
    if len(sections) == 2:
        return sections[1]   
    
    pattern = r'IMPRESSION:(.*?)Report dictated by'
    match = re.search(pattern, report_text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip()
    
    pattern = r'IMPRESSION:(.*?)-+'
    match = re.search(pattern, report_text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(0).strip()
    
    return ''

def clean_text(text):
    if '//ALERT//' in text and len(text.split('\n')) > 1:
        text = text.split('\n')[1]
    text = re.sub(r'Report dictated by:.*$', '', text, flags=re.DOTALL).strip()
    text = re.sub(r'\nThis document is.*$', '', text, flags=re.DOTALL).strip()
    text = re.sub(r'//.*$', '', text, flags=re.DOTALL).strip()
    text = (text.strip()
        .replace('FINDINGS/IMPRESSION', '')
        .replace('Findings/impression', '')
        .replace('IMPRESSION: \n', '')
        .replace(':\n', '')
        .replace(':', '')
        .replace('IMPRESSION:', '')
        .replace('IMPRESSION', '')
        .replace('Impression', ''))
    return text

def extract_source(report_text, impression):
    return report_text.replace(impression, '').strip()

UCSF_reports['Impression'] = UCSF_reports['Report Text'].apply(extract_impression)
UCSF_reports['Processed Impression'] = UCSF_reports['Impression'].apply(clean_text)
UCSF_reports['Processed Source'] = UCSF_reports.apply(
    lambda x: extract_source(x['Report Text'], x['Impression']), axis=1
)

UCSF_no_separate_impression = UCSF_reports[UCSF_reports['Processed Impression'] == '']
UCSF_reports = UCSF_reports[UCSF_reports['Processed Impression'] != ''].reset_index(drop=True)
print('n =', len(UCSF_no_separate_impression))

n = 2912


## Splits

In [8]:
len(UCSF_reports)

353280

In [9]:
patient_mrns = list(set(UCSF_reports['Patient MRN']))
train_mrns, val_test_mrns = train_test_split(patient_mrns, test_size=0.2, random_state=123)
val_mrns, test_mrns = train_test_split(val_test_mrns, test_size=0.5, random_state=123)

UCSF_reports_train = UCSF_reports[UCSF_reports['Patient MRN'].isin(train_mrns)]
UCSF_reports_val = UCSF_reports[UCSF_reports['Patient MRN'].isin(val_mrns)]
UCSF_reports_test = UCSF_reports[UCSF_reports['Patient MRN'].isin(test_mrns)]

UCSF_reports_dict = {
    'train': UCSF_reports_train,
    'val': UCSF_reports_val,
    'test': UCSF_reports_test
}

In [10]:
len(patient_mrns)

127716

In [11]:
for dataset in UCSF_reports_dict:
    print('='*20)
    print(dataset)
    print('-'*20)
    print('n =', len(UCSF_reports_dict[dataset]))
    print(len(set(UCSF_reports_dict[dataset]['Patient MRN'])), 'patients')

train
--------------------
n = 282525
102172 patients
val
--------------------
n = 35631
12772 patients
test
--------------------
n = 35124
12772 patients


In [12]:
print('Age (y)')
for dataset in UCSF_reports_dict:
    print('='*20)
    print(dataset)
    print('-'*20)
    print(UCSF_reports_dict[dataset]['Patient Age'].mean(), '+/-', UCSF_reports_dict[dataset]['Patient Age'].std())

Age (y)
train
--------------------
51.188826581718445 +/- 22.835700204281263
val
--------------------
50.65542168336562 +/- 22.745595264243146
test
--------------------
51.21868266712219 +/- 22.89266057646915


In [13]:
print('Patient Sex')
for dataset in UCSF_reports_dict:
    print('='*20)
    print(dataset)
    print('-'*20)
    UCSF_reports_dict[dataset]['Patient Sex'] = UCSF_reports_dict[dataset]['Patient Sex'].fillna('U')
    print(UCSF_reports_dict[dataset]['Patient Sex'].value_counts(normalize=True)*100)

Patient Sex
train
--------------------
Patient Sex
Female    54.491461
Male      45.388904
U          0.119635
Name: proportion, dtype: float64
val
--------------------
Patient Sex
Female    53.745334
Male      46.150824
U          0.103842
Name: proportion, dtype: float64
test
--------------------
Patient Sex
Female    54.313290
Male      45.578522
U          0.108188
Name: proportion, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  UCSF_reports_dict[dataset]['Patient Sex'] = UCSF_reports_dict[dataset]['Patient Sex'].fillna('U')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  UCSF_reports_dict[dataset]['Patient Sex'] = UCSF_reports_dict[dataset]['Patient Sex'].fillna('U')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  UCSF_repo

In [14]:
print('Modality')
for dataset in UCSF_reports_dict:
    print('='*20)
    print(dataset)
    print('-'*20)
    print(UCSF_reports_dict[dataset]['Modality'].value_counts(normalize=True)*100)

Modality
train
--------------------
Modality
CT     42.332537
MRI    30.064242
US     27.603221
Name: proportion, dtype: float64
val
--------------------
Modality
CT     42.266566
MRI    30.128259
US     27.605175
Name: proportion, dtype: float64
test
--------------------
Modality
CT     42.164901
MRI    30.437877
US     27.397221
Name: proportion, dtype: float64


In [15]:
print('Patient Status')
for dataset in UCSF_reports_dict:
    print('='*20)
    print(dataset)
    print('-'*20)
    UCSF_reports_dict[dataset]['Patient Status'] = UCSF_reports_dict[dataset]['Patient Status'].fillna('Other')
    print(UCSF_reports_dict[dataset]['Patient Status'].value_counts(normalize=True))

Patient Status
train
--------------------
Patient Status
Outpatient    0.647125
Inpatient     0.209288
Emergency     0.120035
Other         0.023552
Name: proportion, dtype: float64
val
--------------------
Patient Status
Outpatient    0.639920
Inpatient     0.213690
Emergency     0.122169
Other         0.024220
Name: proportion, dtype: float64
test
--------------------
Patient Status
Outpatient    0.649328
Inpatient     0.210711
Emergency     0.118267
Other         0.021695
Name: proportion, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  UCSF_reports_dict[dataset]['Patient Status'] = UCSF_reports_dict[dataset]['Patient Status'].fillna('Other')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  UCSF_reports_dict[dataset]['Patient Status'] = UCSF_reports_dict[dataset]['Patient Status'].fillna('Other')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

In [16]:
print('Is Stat')
for dataset in UCSF_reports_dict:
    print('='*20)
    print(dataset)
    print('-'*20)
    print(UCSF_reports_dict[dataset]['Is Stat'].value_counts(normalize=True)*100)

Is Stat
train
--------------------
Is Stat
False    99.957172
True      0.042828
Name: proportion, dtype: float64
val
--------------------
Is Stat
False    99.952289
True      0.047711
Name: proportion, dtype: float64
test
--------------------
Is Stat
False    99.945906
True      0.054094
Name: proportion, dtype: float64


In [17]:
def label_body_part(exam_description):
    if 'ABDOMEN' in exam_description or 'PELVIS' in exam_description:
        return 'ABDOMEN/PELVIS'
    elif 'CHEST' in exam_description:
        return 'CHEST'
    elif 'BRAIN' in exam_description:
        return 'BRAIN'
    elif 'NECK' in exam_description:
        return 'NECK'
    elif 'SPINE' in exam_description or 'SPINAL' in exam_description:
        return 'SPINE'
    elif 'EXTREMITY' in exam_description:
        return 'EXTREMITY'
    elif 'KIDNEY' in exam_description or 'RENAL' in exam_description:
        return 'RENAL/KIDNEY'
    elif 'BREAST' in exam_description:
        return 'BREAST'
    elif 'LIVER' in exam_description:
        return 'LIVER'
    elif 'PROSTATE' in exam_description:
        return 'PROSTATE'  
    elif 'KNEE' in exam_description:
        return 'KNEE'  
    elif 'HEAD' in exam_description:
        return 'HEAD'
    elif 'HIP' in exam_description:
        return 'HIP'
    elif 'HEART' in exam_description:
        return 'HEART'
    
    return 'OTHER'
    
UCSF_reports['Body Part Imaged'] = UCSF_reports['Exam Description'].apply(label_body_part)

print('Body Part Imaged')
for dataset in UCSF_reports_dict:
    print('='*20)
    print(dataset)
    print('-'*20)
    UCSF_reports_dict[dataset]['Body Part Imaged'] = UCSF_reports_dict[dataset]['Exam Description'].apply(label_body_part)
    print(UCSF_reports_dict[dataset]['Body Part Imaged'].value_counts(normalize=True)*100)

Body Part Imaged
train
--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  UCSF_reports_dict[dataset]['Body Part Imaged'] = UCSF_reports_dict[dataset]['Exam Description'].apply(label_body_part)


Body Part Imaged
ABDOMEN/PELVIS    25.584992
BRAIN             18.058225
OTHER             15.274754
CHEST             13.626051
SPINE              8.445624
NECK               4.721706
RENAL/KIDNEY       4.224759
EXTREMITY          4.069020
PROSTATE           1.331918
BREAST             1.159897
KNEE               1.153880
LIVER              0.897974
HIP                0.796744
HEART              0.510397
HEAD               0.144058
Name: proportion, dtype: float64
val
--------------------
Body Part Imaged
ABDOMEN/PELVIS    25.660240
BRAIN             18.175185
OTHER             14.961691
CHEST             13.583677
SPINE              8.607673
NECK               4.745867
RENAL/KIDNEY       4.164913
EXTREMITY          4.027392
PROSTATE           1.389240
KNEE               1.097359
BREAST             1.094552
LIVER              0.898094
HIP                0.839157
HEART              0.541663
HEAD               0.213297
Name: proportion, dtype: float64
test
--------------------
Body Part

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  UCSF_reports_dict[dataset]['Body Part Imaged'] = UCSF_reports_dict[dataset]['Exam Description'].apply(label_body_part)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  UCSF_reports_dict[dataset]['Body Part Imaged'] = UCSF_reports_dict[dataset]['Exam Description'].apply(label_body_part)


In [21]:
for dataset in UCSF_reports_dict:
    UCSF_reports_dict[dataset].to_csv(f'data/processed/UCSF_{dataset}.csv', index=False)