In [1]:
import pandas as pd
import regex as re
import tqdm

In [2]:
def _number_impression_lines(lines):
    """Format non-empty impression lines as a numbered list.

    Each line is numbered by its 1-based position in ``lines`` unless it
    already starts with ``<digit>.``, in which case it is kept as written.
    Internal runs of whitespace are collapsed to single spaces.

    Raises:
        ValueError: if an un-numbered line contains ``'non-reportable'``.
    """
    numbered = []
    for i, sentence in enumerate(lines, start=1):
        first = sentence[0]
        # Slice instead of indexing: a one-character line such as "3" has no
        # second character, and sentence[1] would raise IndexError.
        unnumbered_digit = first.isdigit() and sentence[1:2] != '.'
        if first.isalpha() or unnumbered_digit:
            # Plain sentence, or one like
            # << 3 intrahepatic biliary duct stents in place. >>
            if 'non-reportable' in sentence:
                raise ValueError('Invalid')
            numbered.append('{}. {}'.format(i, " ".join(sentence.split())))
        elif not first.isalpha() and not first.isdigit():
            # Leading symbol, e.g. << [Status post bilateral lung transplantation >>:
            # drop that first character before numbering.
            numbered.append('{}. {}'.format(i, " ".join(sentence[1:].split())))
        else:
            # Already numbered ("1. ..."): keep the author's own number.
            numbered.append(" ".join(sentence.split()))
    return numbered


def preprocess_text(exam, report_text):
    """Split a raw report into its sections and build the model source text.

    Args:
        exam: exam description; passed through unchanged and embedded in the
            returned ``source`` string.
        report_text: full report text. ``_x000D_`` markers are removed first.

    Returns:
        A tuple ``(exam, indication, comparison, findings, source, impression)``:
        ``indication`` is ``'None'`` when no indication line is found,
        ``impression`` is a newline-joined numbered list, and ``source`` is the
        EXAM / CLINICAL HISTORY / COMPARISON / FINDINGS sections joined by
        newlines.

    Raises:
        ValueError: if ``report_text`` is not a string, if the findings,
            impression or comparison section cannot be located, or if an
            impression line is marked ``non-reportable``.

    Only the header at the start of the findings / impression section is
    stripped. Earlier versions removed every ``FINDINGS`` and every ``:\\n`` in
    the body (gluing ``"Lungs:\\nClear"`` into ``"LungsClear"``), left a stray
    ``": "`` for ``FINDINGS: text`` on one line, and silently lost the first
    impression line for ``IMPRESSION: text`` on one line.
    """
    if not isinstance(report_text, str):
        # e.g. NaN from an empty spreadsheet cell
        raise ValueError('report_text must be a string')
    text = report_text.replace('_x000D_', '')

    # Everything from the first findings header to the end of the report.
    findings_match = re.search(r"(FINDINGS|Findings:|FINDINGS:|FINDINGS:\n\n|FINDINGS:\n)((.|\n)*)", text)
    if findings_match is None:
        raise ValueError('no FINDINGS section found')
    findings_impression = findings_match.group(0)

    # Everything from the first impression header to the end of the report.
    impression_match = re.search(r"(Impression:|IMPRESSION:|IMPRESSION:\n\n)((.|\n)*)", text)
    if impression_match is None:
        raise ValueError('no IMPRESSION section found')
    impression = impression_match.group(0)

    # Findings = the findings-to-end span minus the impression tail, with only
    # the leading header (and optional colon / whitespace) removed.
    findings = findings_impression.replace(impression, '')
    findings = re.sub(r"\A(?:FINDINGS:?|Findings:)\s*", "", findings).strip()

    # Strip the leading header even when the text follows on the same line;
    # otherwise the line filter below would discard that whole first line.
    impression = re.sub(r"\A(?:Impression:|IMPRESSION:)\s*", "", impression).strip()
    # Retained from the original cleanup: repeated headers later in the section.
    impression = impression.replace('IMPRESSION:\n', '').strip()
    impression = impression.replace('Impression:', '').strip()

    # Cut the section at the first sign-off / attestation phrase.
    impression = re.sub(r"(?=(The above findings|Impression discussed|Further impression|Final impression|Attestation|Radiologist|Electronically|This change was))(.|\n)*", "", impression)

    # Drop blank lines and boilerplate lines, then number what is left.
    boilerplate = re.compile(r'^(^$|^\s*$|No wet|Wet read|This study|FINDINGS|IMPRESSION:|Report|Department|Electronically|\/\/).*')
    kept_lines = [line for line in impression.split('\n') if not boilerplate.match(line)]
    impression = '\n'.join(_number_impression_lines(kept_lines))

    # The comparison is the single line starting at COMPARISON / Comparison.
    # NOTE(review): only the 'COMPARISON: ' label is stripped; a 'Comparison:'
    # label stays in the value -- confirm whether that is intended.
    comparison_match = re.search(r"((?=COMPARISON|Comparison)(.)*)", text)
    if comparison_match is None:
        # NOTE(review): unlike the indication, a missing comparison rejects the
        # report (the original failed here too) -- confirm this is wanted.
        raise ValueError('no COMPARISON line found')
    comparison = comparison_match.group(0).replace('COMPARISON: ', '').strip()

    # The indication is optional and falls back to the literal string 'None'.
    # NOTE(review): an 'Indication:' label is matched but not stripped.
    indication_match = re.search(r"((?=Indication:|INDICATION FOR STUDY|INDICATION FOR EXAM)(.)*)", text)
    indication = indication_match.group(0) if indication_match else 'None'
    indication = indication.replace('INDICATION FOR STUDY: ', '').strip()
    indication = indication.replace('INDICATION FOR EXAM: ', '').strip()

    source = '\n'.join([
        'EXAM:\n{}'.format(exam),
        'CLINICAL HISTORY:\n{}'.format(indication),
        'COMPARISON:\n{}'.format(comparison),
        'FINDINGS:\n{}'.format(findings),
    ])
    return exam, indication, comparison, findings, source, impression

def preprocess(raw_data):
    """Parse every report in ``raw_data`` into a tidy DataFrame.

    Args:
        raw_data: DataFrame with at least the columns ``'Accession Number'``,
            ``'Exam Description'`` and ``'Report Text'``.

    Returns:
        A DataFrame with the columns ``Accession Number, Exam, Clinical
        History, Comparison, Findings, Source, Impression``, one row per
        report that ``preprocess_text`` parsed successfully. Reports that
        raise during parsing are skipped (best effort).
    """
    columns = ['Accession Number', 'Exam', 'Clinical History', 'Comparison', 'Findings', 'Source', 'Impression']
    # Collect plain dicts and build the frame once; appending with
    # .loc[len(df)] re-allocates the frame on every row.
    records = []
    for _, row in tqdm.tqdm(raw_data.iterrows(), total=len(raw_data)):
        try:
            exam, clinical_history, comparison, findings, source, impression = preprocess_text(row['Exam Description'], row['Report Text'])
            records.append({
                'Accession Number': row['Accession Number'],
                'Exam': row['Exam Description'],
                'Clinical History': clinical_history,
                'Comparison': comparison,
                'Findings': findings,
                'Source': source,
                'Impression': impression
            })
        except Exception:
            # Unparseable report: leave it out. Catching Exception rather than
            # using a bare except keeps Ctrl-C / SystemExit working.
            continue
    return pd.DataFrame(records, columns=columns)

In [50]:
# Load the "AllCT" export and parse the reports finalized in January 2022.
zsfg_specialized_file = '/run/user/1000/gvfs/smb-share:server=sohn2022.local,share=sohn2022/secure_metadata/secure_ZSFG_AllCT_3k_20220101_20220201.xlsx'
zsfg_specialized_dataset = pd.read_excel(zsfg_specialized_file)

# Date window: 2022-01-01 inclusive up to 2022-02-01 exclusive.
date_condition_1 = zsfg_specialized_dataset['Report Finalized Date'] >= '2022-01-01'
date_condition_2 = zsfg_specialized_dataset['Report Finalized Date'] < '2022-02-01'
zsfg_specialized_dataset = zsfg_specialized_dataset.loc[date_condition_1 & date_condition_2]

# Reports that fail to parse are left out of the result by preprocess().
zsfg_specialized_dataset_csv = preprocess(zsfg_specialized_dataset)

# Excluded count = rows that have an Exam Code minus rows that were parsed.
zsfg_specialized_with_exam_code = len(zsfg_specialized_dataset.dropna(subset=['Exam Code']))
zsfg_specialized_parsed = len(zsfg_specialized_dataset_csv)
zsfg_specialized_excluded = zsfg_specialized_with_exam_code - zsfg_specialized_parsed
zsfg_specialized_excluded

100%|██████████████████████████████████████| 2777/2777 [00:06<00:00, 450.11it/s]


28

In [51]:
# Load the "ChestCT" export and parse the reports finalized in January 2022.
zsfg_finegrained_file = '/run/user/1000/gvfs/smb-share:server=sohn2022.local,share=sohn2022/secure_metadata/secure_ZSFG_ChestCT_20211101_20221111.xlsx'
zsfg_finegrained_dataset = pd.read_excel(zsfg_finegrained_file)

# Date window: 2022-01-01 inclusive up to 2022-02-01 exclusive.
date_condition_1 = zsfg_finegrained_dataset['Report Finalized Date'] >= '2022-01-01'
date_condition_2 = zsfg_finegrained_dataset['Report Finalized Date'] < '2022-02-01'
zsfg_finegrained_dataset = zsfg_finegrained_dataset.loc[date_condition_1 & date_condition_2]

# Reports that fail to parse are left out of the result by preprocess().
zsfg_finegrained_dataset_csv = preprocess(zsfg_finegrained_dataset)

# Excluded count = rows that have an Exam Code minus rows that were parsed.
zsfg_finegrained_with_exam_code = len(zsfg_finegrained_dataset.dropna(subset=['Exam Code']))
zsfg_finegrained_parsed = len(zsfg_finegrained_dataset_csv)
zsfg_finegrained_excluded = zsfg_finegrained_with_exam_code - zsfg_finegrained_parsed
zsfg_finegrained_excluded

100%|████████████████████████████████████████| 372/372 [00:00<00:00, 476.52it/s]


0

In [52]:
# Row count of the parsed "AllCT" reports.
zsfg_specialized_dataset_csv.shape[0]

2749

In [53]:
# Row count of the parsed "ChestCT" reports.
zsfg_finegrained_dataset_csv.shape[0]

372

In [None]:
# Write both parsed datasets to CSV without the DataFrame index.
for parsed_frame, csv_path in (
    (zsfg_specialized_dataset_csv, 'data/processed/zsfg_specialized_test_dataset.csv'),
    (zsfg_finegrained_dataset_csv, 'data/processed/zsfg_finegrained_test_dataset.csv'),
):
    parsed_frame.to_csv(csv_path, index=False)

In [45]:
# Earliest non-null finalized date left in the filtered "AllCT" frame.
zsfg_specialized_dataset['Report Finalized Date'].dropna().sort_values().iloc[0]

Timestamp('2022-01-01 07:40:50')