In [21]:
import pandas as pd
import regex as re

In [22]:
# Read the raw export and keep only the CT, MR and US studies.
ZSFG_reports = (
    pd.read_csv('data/raw/secure_ZSFG_radreports__CT_MR_US_NM__01-01-2022__12-29-2022.csv')
    .loc[lambda frame: frame['Modality'].isin(['CT', 'MR', 'US'])]
    .reset_index(drop=True)
)

# Drop the literal '_x000D_' tokens (presumably escaped carriage returns left
# by the export -- confirm) and turn tabs into single spaces.
ZSFG_reports['Report Text'] = (
    ZSFG_reports['Report Text']
    .str.replace('_x000D_', '')
    .str.replace('\t', ' ')
)
len(ZSFG_reports)

60049

## Exclude Studies

In [23]:
# Reports sharing an accession number: every copy is set aside (keep=False),
# so none of the conflicting rows stays in the working cohort.
shares_accession = ZSFG_reports.duplicated(subset='Accession Number', keep=False)
ZSFG_duplicates = ZSFG_reports[shares_accession]
ZSFG_reports = ZSFG_reports[~shares_accession].reset_index(drop=True)
print(f'n = {len(ZSFG_duplicates)}')

n = 2


In [24]:
# Reports whose text flags the exam as a non-reportable study.
# The mask is computed once and used for both the excluded and the kept frame.
is_nonreportable = ZSFG_reports['Report Text'].str.contains('non-reportable study')
ZSFG_nonreportable_reports = ZSFG_reports[is_nonreportable].reset_index(drop=True)
ZSFG_reports = ZSFG_reports[~is_nonreportable].reset_index(drop=True)
print(f'n = {len(ZSFG_nonreportable_reports)}')

n = 0


In [25]:
# Reports with no separate findings and impressions
def extract_impression(report_text):
    """Return the impression section of a report, or '' if it cannot be isolated.

    The impression is the text between the first 'Impression:' / 'IMPRESSION:'
    header and the nearest following end marker ('The above', 'Wet read',
    'Findings are', 'Electronically' or 'REFERENCE:'), with surrounding
    whitespace stripped. Both a header and a later end marker must be present;
    otherwise the empty string is returned.
    """
    found = re.search(
        r'(Impression:|IMPRESSION:|IMPRESSION:\n)(.*?)(The above|Wet read|Findings are|Electronically|REFERENCE:)',
        report_text,
        flags=re.DOTALL,  # let '.' cross newlines: impressions span several lines
    )
    if found is None:
        return ''
    return found.group(2).strip()

def extract_source(report_text):
    """Return everything that precedes the impression header, stripped.

    The text from the start of the report up to (not including) the first
    'Impression:' / 'IMPRESSION:' header is returned with surrounding
    whitespace removed. If the report has no such header, '' is returned.
    """
    found = re.search(
        r'^(.*?)(Impression:|IMPRESSION:|IMPRESSION:\n)',
        report_text,
        flags=re.DOTALL,  # the preceding text normally spans many lines
    )
    return found.group(1).strip() if found else ''

# Split every report into its impression and the text that precedes it.
ZSFG_reports['Processed Impression'] = ZSFG_reports['Report Text'].map(extract_impression)
ZSFG_reports['Processed Source'] = ZSFG_reports['Report Text'].map(extract_source)

# A report is unusable when either half came back empty.
lacks_either_part = (
    ZSFG_reports['Processed Impression'].eq('') | ZSFG_reports['Processed Source'].eq('')
)
ZSFG_no_separate_impression = ZSFG_reports[lacks_either_part]
ZSFG_reports = ZSFG_reports[~lacks_either_part].reset_index(drop=True)
print(f'n = {len(ZSFG_no_separate_impression)}')

n = 124


In [26]:
# Size of the final cohort: number of reports and number of distinct MRNs.
# dropna=False keeps a missing MRN counted as one value, as .unique() does.
print(f'n = {len(ZSFG_reports)}')
print(f"{ZSFG_reports['Patient MRN'].nunique(dropna=False)} patients")

n = 59923
27530 patients


In [27]:
# Mean and standard deviation of 'Patient Age', computed over report rows.
print('Age (y)', '=' * 20, sep='\n')
report_ages = ZSFG_reports['Patient Age']
print(f'{report_ages.mean()} +/- {report_ages.std()}')

Age (y)
52.620025532767045 +/- 19.308107764202852


In [41]:
# Share of reports by patient sex; a missing value becomes its own 'U' category.
print('Patient Sex', '=' * 20, sep='\n')
sex_with_unknown = ZSFG_reports['Patient Sex'].fillna('U')
ZSFG_reports['Patient Sex'] = sex_with_unknown
print(ZSFG_reports['Patient Sex'].value_counts(normalize=True))

Patient Sex
Male      0.536305
Female    0.463261
U         0.000434
Name: Patient Sex, dtype: float64


In [49]:
# Share of reports by imaging modality.
print('Modality')
print('='*20)
# The raw data codes magnetic resonance as 'MR'. It is relabelled to 'MRI' on
# the displayed Series only (the frame is not modified here), so this table
# reads 'MRI' even when the notebook is run top to bottom from a fresh kernel.
print(ZSFG_reports['Modality'].replace('MR', 'MRI').value_counts(normalize=True))

Modality
CT     0.611451
US     0.262086
MRI    0.126462
Name: Modality, dtype: float64


In [45]:
# Percentage of reports by patient status; a missing status is grouped as 'Other'.
print('Patient Status', '=' * 20, sep='\n')
status_with_other = ZSFG_reports['Patient Status'].fillna('Other')
ZSFG_reports['Patient Status'] = status_with_other
status_share = ZSFG_reports['Patient Status'].value_counts(normalize=True)
print(status_share * 100)

Patient Status
Outpatient    40.313402
Emergency     34.694525
Inpatient     24.818517
Other          0.173556
Name: Patient Status, dtype: float64


In [50]:
# Share of reports by the value of the 'Is Stat' column.
print('Is Stat', '=' * 20, sep='\n')
stat_share = ZSFG_reports['Is Stat'].value_counts(normalize=True)
print(stat_share)

Is Stat
True     0.600237
False    0.399763
Name: Is Stat, dtype: float64


In [53]:
def label_body_part(exam_description):
    """Map an exam description to a coarse body-part label.

    The description is scanned for keywords with case-sensitive substring
    matching. Rules are tried in a fixed priority order and the first rule
    with a matching keyword wins, so e.g. a description containing both
    'CHEST' and 'ABDOMEN' is labelled 'ABDOMEN/PELVIS'.

    Parameters
    ----------
    exam_description : str
        Free-text exam description. Any non-string value (for example the
        NaN pandas uses for a missing cell, or None) is labelled 'OTHER'
        instead of raising a TypeError.

    Returns
    -------
    str
        One of the labels in the rule table below, or 'OTHER' when no
        keyword matches.
    """
    # A missing description is not a string and cannot be searched with `in`.
    if not isinstance(exam_description, str):
        return 'OTHER'

    # (keywords, label) pairs in priority order -- do not reorder, the first
    # hit decides the label. Matching is case-sensitive; the mixed-case
    # 'Kidneys' and 'Head' spellings are listed explicitly.
    rules = (
        (('ABDOMEN', 'PELVIS'), 'ABDOMEN/PELVIS'),
        (('CHEST',), 'CHEST'),
        (('BRAIN',), 'BRAIN'),
        (('NECK',), 'NECK'),
        (('SPINE', 'SPINAL'), 'SPINE'),
        (('EXTREMITY',), 'EXTREMITY'),
        (('KIDNEY', 'RENAL', 'Kidneys'), 'RENAL/KIDNEY'),
        (('BREAST',), 'BREAST'),
        (('LIVER',), 'LIVER'),
        (('PROSTATE',), 'PROSTATE'),
        (('KNEE',), 'KNEE'),
        (('HEAD', 'Head'), 'HEAD'),
        (('HIP',), 'HIP'),
        (('HEART',), 'HEART'),
    )
    for keywords, label in rules:
        if any(keyword in exam_description for keyword in keywords):
            return label

    return 'OTHER'
    
# Share of reports by body part, derived from the exam description.
print('Body Part Imaged', '=' * 20, sep='\n')
ZSFG_reports['Body Part Imaged'] = ZSFG_reports['Exam Description'].map(label_body_part)
body_part_share = ZSFG_reports['Body Part Imaged'].value_counts(normalize=True)
print(body_part_share)

Body Part Imaged
ABDOMEN/PELVIS    0.268511
BRAIN             0.204145
OTHER             0.196352
CHEST             0.153230
NECK              0.052017
SPINE             0.037081
RENAL/KIDNEY      0.022863
EXTREMITY         0.020526
HEAD              0.018324
LIVER             0.017973
KNEE              0.006792
HIP               0.002069
BREAST            0.000117
Name: Body Part Imaged, dtype: float64


In [33]:
# Spell the 'MR' modality code out as 'MRI'; every other code is left as is.
modality_relabelled = ZSFG_reports['Modality'].replace({'MR': 'MRI'})
ZSFG_reports['Modality'] = modality_relabelled
ZSFG_reports['Modality'].value_counts()

CT     36640
US     15705
MRI     7578
Name: Modality, dtype: int64

In [35]:
# Write the filtered cohort, with every column currently in the frame, as CSV
# without the index. The path is a plain literal: the original f-string had no
# placeholders.
# NOTE(review): the frame still carries identifier columns such as
# 'Patient MRN' -- confirm the destination is approved for that data.
ZSFG_reports.to_csv('Flan_T5_Finetune/data/ZSFG_test.csv', index=False)