In [1]:
import pandas as pd
import os
import io
import tqdm
import regex as re
from cryptography.fernet import Fernet
from sklearn.model_selection import train_test_split

In [5]:
raw_data = pd.read_csv('data/raw/secure_UCSF_radreports__CT_MR_MRI_US_PET__01-01-2021__10-22-2022.csv')

GENERAL = raw_data[raw_data['Report Finalized Date'].str.split(' ').str[0] < '2022-04-02'][raw_data['Modality'].isin(['CT', 'PET', 'MRI', 'US'])]

date_condition_1 = raw_data['Report Finalized Date'].str.split(' ').str[0] >= '2022-04-02'
date_condition_2 = raw_data['Report Finalized Date'].str.split(' ').str[0] < '2022-07-02' 
date_condition_3 = raw_data['Report Finalized Date'].str.split(' ').str[0] >= '2022-07-02' 

CT = raw_data[date_condition_1 & date_condition_2][raw_data['Modality'] == 'CT']
CT_CHEST = raw_data[date_condition_3 & raw_data['Exam Description'].str.contains('CT CHEST')]

  GENERAL = raw_data[raw_data['Report Finalized Date'].str.split(' ').str[0] < '2022-04-02'][raw_data['Modality'].isin(['CT', 'PET', 'MRI', 'US'])]
  CT = raw_data[date_condition_1 & date_condition_2][raw_data['Modality'] == 'CT']


In [6]:
general_dataset = GENERAL.drop_duplicates('Accession Number').reset_index(drop=True)
len(general_dataset)

261086

In [7]:
specialized_dataset = CT.drop_duplicates('Accession Number').reset_index(drop=True)
len(specialized_dataset)

21425

In [8]:
finegrained_dataset = CT_CHEST.drop_duplicates('Accession Number').reset_index(drop=True)
len(finegrained_dataset)

7863

In [9]:
general_departmental_counts = general_dataset['Modality'].value_counts()
general_departmental_counts.sort_index()

CT     111440
MRI     70876
PET     10781
US      67989
Name: Modality, dtype: int64

In [10]:
general_sex_counts = general_dataset['Patient Sex'].value_counts()
general_sex_counts

Female    140070
Male      120729
U            236
Name: Patient Sex, dtype: int64

In [11]:
specialized_sex_counts = specialized_dataset['Patient Sex'].value_counts()
specialized_sex_counts

Female    10738
Male      10661
U            16
Name: Patient Sex, dtype: int64

In [12]:
finegrained_sex_counts = finegrained_dataset['Patient Sex'].value_counts()
finegrained_sex_counts

Female    3976
Male      3875
U           10
Name: Patient Sex, dtype: int64

In [13]:
general_dataset_raw = general_dataset.dropna(subset=['Exam Code'])
general_dataset_raw = general_dataset_raw[['Accession Number', 'Report Text', 'Exam Description']]
general_dataset_raw = general_dataset_raw.sample(frac=1).reset_index(drop=True)

specialized_dataset_raw = specialized_dataset.dropna(subset=['Exam Code'])
specialized_dataset_raw = specialized_dataset_raw[['Accession Number', 'Report Text', 'Exam Description']]
specialized_dataset_raw = specialized_dataset_raw.sample(frac=1).reset_index(drop=True)

finegrained_dataset_raw = finegrained_dataset.dropna(subset=['Exam Code'])
finegrained_dataset_raw = finegrained_dataset_raw[['Accession Number', 'Report Text', 'Exam Description']]
finegrained_dataset_raw = finegrained_dataset_raw.sample(frac=1).reset_index(drop=True)

In [14]:
def preprocess_findings(exam, report_text):
    text = report_text.split('\n\n')
    findings = re.search(r"((?<=\nFINDINGS:|FINDINGS:\n|Findings:\n)(.|\n)*)", text[0])
    if findings:
        findings = findings.group(0).split('"')[0]
        findings = findings.strip()
        findings = re.sub(r"(?=(RADIATION DOSE))(.|\n)*", "", findings)
    else:
        raise Exception('Invalid')
    comparison = re.search(r"((?=COMPARISON|Comparison)(.)*)", text[0])
    if comparison:
        comparison = comparison.group(0)
        comparison_text = comparison.split(':')
        comparison = ' '.join(comparison_text[-1].split())
    else:
        comparison = 'None'
    clinical_history = re.search(r"((?=CLINICAL HISTORY|Clinical history)(.)*)", text[0])
    if clinical_history:
        clinical_history = clinical_history.group(0)
        clinical_history_text = clinical_history.split(':')
        clinical_history = ' '.join(clinical_history_text[-1].split())
    else:
        clinical_history = 'None'    
    if not comparison:
        comparison = 'None'
    if not clinical_history:
        clinical_history = 'None'
    source = 'EXAM:\n{}'.format(exam) + '\n' + 'CLINICAL HISTORY:\n{}'.format(clinical_history) + '\n' + 'COMPARISON:\n{}'.format(comparison) + '\n' + 'FINDINGS:\n{}'.format(findings) 
    return exam, clinical_history, comparison, findings, source

In [15]:
def preprocess_impression(report_text):
    text = report_text.split('\n\n')
    if len(text) == 1:
        text = report_text.split('\n"')
        regex = re.compile(r'^(^$|IMPRESSION:|Report|Department|Electronically|\/\/).*')
        text = [ele for ele in text if not regex.match(ele)]
        if len(text) > 2:
            raise Exception('Invalid')

    text[-1] = re.sub(r"(?=(Impression discussed|Further impression|Final impression|Attestation|Radiologist|Electronically|This change was))(.|\n)*", "", text[-1])
    text[-1] = re.sub(r"\.\.", ".", text[-1])
    impression_text = text[-1].split('\n')
    regex = re.compile(r'^(^$|^\s*$|This study|FINDINGS|IMPRESSION:|Report|Department|Electronically|\/\/).*')
    filtered_impression = [ele for ele in impression_text if not regex.match(ele)]
    new_impression = []
    for i, sentence in enumerate(filtered_impression, start=1):
        # Considering cases starting with a letter or 
        # an example like << 3 intrahepatic biliary duct stents in place. >>
        if sentence[0].isalpha() or sentence[0].isdigit() and sentence[1] != '.':
            if 'non-reportable' in sentence:
                raise Exception('Invalid')
            new_impression.append('{}. {}'.format(i, " ".join(sentence.split())))
        # Considering cases such as << [Status post bilateral lung transplantation >>
        elif not sentence[0].isalpha() and not sentence[0].isdigit():
            new_impression.append('{}. {}'.format(i, " ".join(sentence[1:].split())))
        else:
            new_impression.append(" ".join(sentence.split()))
    impression = '\n'.join(new_impression)
    return impression

In [24]:
def preprocess(raw_data):
    processed_data = pd.DataFrame(columns=['Accession Number', 'Exam', 'Clinical History', 'Comparison', 'Findings', 'Source', 'Impression'])
    for i in tqdm.tqdm(range(len(raw_data))):
        row = raw_data.iloc[i]
        try:
            exam, clinical_history, comparison, findings, source = preprocess_findings(row['Exam Description'], row['Report Text'])
            impression = preprocess_impression(row['Report Text'])
            processed_data.loc[len(processed_data)] = {
                'Accession Number': row['Accession Number'],
                'Exam': exam,
                'Clinical History': clinical_history,
                'Comparison': comparison,
                'Findings': findings,
                'Source': source,
                'Impression': impression
            }
        except:
            continue
    return processed_data

In [25]:
general_dataset_csv = preprocess(general_dataset_raw)
general_excluded = len(general_dataset.dropna(subset=['Exam Code'])) - len(general_dataset_csv)
general_excluded

100%|█████████████████████████████████| 261086/261086 [1:18:53<00:00, 55.15it/s]


24775

In [26]:
specialized_dataset_csv = preprocess(specialized_dataset_raw)
specialized_excluded = len(specialized_dataset.dropna(subset=['Exam Code'])) - len(specialized_dataset_csv)
specialized_excluded

100%|████████████████████████████████████| 21425/21425 [01:05<00:00, 325.78it/s]


1927

In [27]:
finegrained_dataset_csv = preprocess(finegrained_dataset_raw)
finegrained_excluded = len(finegrained_dataset.dropna(subset=['Exam Code'])) - len(finegrained_dataset_csv)
finegrained_excluded

100%|██████████████████████████████████████| 7863/7863 [00:17<00:00, 437.65it/s]


85

In [28]:
general_train_dataset_csv, general_test_dataset_csv = train_test_split(general_dataset_csv, test_size=0.1)
specialized_train_dataset_csv, specialized_test_dataset_csv = train_test_split(specialized_dataset_csv, test_size=0.1)
finegrained_train_dataset_csv, finegrained_test_dataset_csv = train_test_split(finegrained_dataset_csv, test_size=0.1)

general_train_dataset_csv.to_csv('data/processed/general_train_dataset.csv', index=False)
general_test_dataset_csv.to_csv('data/processed/general_test_dataset.csv', index=False)

specialized_train_dataset_csv.to_csv('data/processed/specialized_train_dataset.csv', index=False)
specialized_test_dataset_csv.to_csv('data/processed/specialized_test_dataset.csv', index=False)

finegrained_train_dataset_csv.to_csv('data/processed/finegrained_train_dataset.csv', index=False)
finegrained_test_dataset_csv.to_csv('data/processed/finegrained_test_dataset.csv', index=False)

In [33]:
len(general_train_dataset_csv), len(general_test_dataset_csv), len(general_train_dataset_csv) + len(general_test_dataset_csv)

(212679, 23632, 236311)

In [34]:
len(specialized_train_dataset_csv), len(specialized_test_dataset_csv), len(specialized_train_dataset_csv) + len(specialized_test_dataset_csv)

(17548, 1950, 19498)

In [35]:
len(finegrained_train_dataset_csv), len(finegrained_test_dataset_csv), len(finegrained_train_dataset_csv) + len(finegrained_test_dataset_csv)

(7000, 778, 7778)

In [None]:
key = Fernet.generate_key()

with open('filekey.key', 'wb') as filekey:
    filekey.write(key)
    
# opening the key
with open('filekey.key', 'rb') as filekey:
    key = filekey.read()

fernet = Fernet(key)
 
# opening the original file to encrypt
with open('data/raw/secure_UCSF_radreports__CT_MR_MRI_US_PET__01-01-2021__10-22-2022.csv', 'rb') as file:
    original = file.read()
     
# encrypting the file
encrypted = fernet.encrypt(original)
 
# opening the file in write mode and
# writing the encrypted data
with open('data/raw/secure_UCSF_radreports__CT_MR_MRI_US_PET__01-01-2021__10-22-2022.csv', 'wb') as encrypted_file:
    encrypted_file.write(encrypted)

In [2]:
# opening the key
with open('filekey.key', 'rb') as filekey:
    key = filekey.read()
 
# using the generated key
fernet = Fernet(key)

# using the key
fernet = Fernet(key)
 
# opening the encrypted file
with open('data/raw/secure_UCSF_radreports__CT_MR_MRI_US_PET__01-01-2021__10-22-2022.csv', 'rb') as enc_file:
    encrypted = enc_file.read()
 
# decrypting the file
decrypted = fernet.decrypt(encrypted)
 
# opening the file in write mode and
# writing the decrypted data
with open('data/raw/secure_UCSF_radreports__CT_MR_MRI_US_PET__01-01-2021__10-22-2022.csv', 'wb') as dec_file:
    dec_file.write(decrypted)

In [3]:
pd.read_csv('data/raw/secure_UCSF_radreports__CT_MR_MRI_US_PET__01-01-2021__10-22-2022.csv')

Unnamed: 0.1,Unnamed: 0,Organization,Point of Care,Source System,Accession Number,Modality,Exam Code,Exam Description,CPT Code,Report Text,...,MP (RVU),Work (Professional) (RVU),PE (Professional) (RVU),MP (Professional) (RVU),Work (Technical) (RVU),PE (Technical) (RVU),MP (Technical) (RVU),Total (RVU),Total (Professional) (RVU),Total (Technical) (RVU)
0,0,CT4,RAD CT PARN,EPIC,10022479688,CT,CABPU,CT ABDOMEN/PELVIS WITHOUT CONTRAST,74176,CT ABDOMEN/PELVIS WITHOUT CONTRAST 9/30/2022 ...,...,0.10,1.74,2.45,0.09,0.0,3.21,0.01,7.50,4.28,3.22
1,1,CT4,RAD CT PARN,EPIC,10022479687,CT,CCHU,CT CHEST WITHOUT CONTRAST,71250,CT CHEST WITHOUT CONTRAST 9/30/2022 11:40 P...,...,0.06,1.08,1.52,0.05,0.0,2.59,0.01,5.25,2.65,2.60
2,2,MBUS2,RAD ULTRASOUND MB,EPIC,10022479715,US,USCRO,US SCROTUM WITH DOPPLER,"76870, 93976",US SCROTUM WITH DOPPLER 9/30/2022 11:36 PM\...,...,0.11,1.44,2.03,0.08,0.0,5.76,0.03,9.34,3.55,5.79
3,3,CT2,RAD CT PARN,EPIC,10022479670,CT,CABPE,CT ABDOMEN/PELVIS WITH CONTRAST,74177,CT ABDOMEN/PELVIS WITH CONTRAST 9/30/2022 11:...,...,0.11,1.82,2.57,0.09,0.0,7.06,0.02,11.56,4.48,7.08
4,4,CT2,RAD CT PARN,EPIC,10022479671,CT,CCHU,CT CHEST WITHOUT CONTRAST,71250,CT CHEST WITHOUT CONTRAST 9/30/2022 11:08 P...,...,0.06,1.08,1.52,0.05,0.0,2.59,0.01,5.25,2.65,2.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388576,14033,CTED,RAD CT PARN,EPIC,10022479797,CT,CABPU,CT ABDOMEN/PELVIS WITHOUT CONTRAST,74176,CT ABDOMEN/PELVIS WITHOUT CONTRAST 10/1/2022 ...,...,0.10,1.74,2.45,0.09,0.0,3.21,0.01,7.50,4.28,3.22
388577,14034,CTED,RAD CT PARN,EPIC,10022479796,CT,CBRAU,CT BRAIN WITHOUT CONTRAST,70450,CT BRAIN WITHOUT CONTRAST: 10/1/2022 1:12 AM\...,...,0.05,0.85,1.19,0.04,0.0,2.08,0.01,4.17,2.08,2.09
388578,14035,CT2,RAD CT PARN,EPIC,10022479675,CT,CTABUE,CT ANGIOGRAM BRAIN,70496,CT ANGIOGRAM BRAIN: 10/1/2022 12:35 AM\nINDIC...,...,0.13,1.75,2.46,0.09,0.0,6.12,0.04,10.46,4.30,6.16
388579,14036,MRN1,RAD MRI PARN,EPIC,10022479631,MRI,MBRUE,MR BRAIN WITH AND WITHOUT CONTRAST,70553,"MR BRAIN WITH AND WITHOUT CONTRAST, MR VENOGRA...",...,0.15,2.29,3.22,0.11,0.0,6.79,0.04,12.45,5.62,6.83
