# This notebook segments the text-based report tables for MC1-T1 (i.e., breaks each report into key statements/words)

In [None]:
# Generic imports
import pandas as pd
from copy import deepcopy
from pathlib import Path

In [None]:
# Custom imports
from src.segmentation_tools import (
    curate_indicator_word_list,
    remove_easy_sections,
    handle_subsection_titles,
    remove_lines_on_other_organs,
    stem_indicator_words,
    remove_stopwords,
    remove_sections_n_duplicate_lines,
    refine_cleaning,
    remove_dictation,
    extract_surroundings_of_risk_factor_and_process
)

In [None]:
# Table display tweaks for easier inspection: every option below is set to
# None, i.e. pandas' default column-width / column-count / width settings
# are overridden.
for _display_option in ('display.max_colwidth', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)

In [None]:
# NOTE(review): looks like a leftover kernel smoke test -- it only prints a
# fixed string and nothing else in this notebook reads its output. Confirm
# whether it can be removed.
print("Hello World!")

This notebook aims to process datasets from MC1-T1 for downstream automated ARDS/control adjudication.  
Specifically, this notebook performs the step highlighted in boldface:  
- File I/O, which depends on a specs file.
- Standard preprocessing, in which column names are standardized.  
- Hospital-specific processing, which is temporarily custom-made.  
- Anonymization. It follows two substeps:  
    + Anonymizing patient/encounter IDs and datetime columns for all tables.
    + Anonymizing text-based tables (chest X-ray reports, attending notes, ECHO reports). 
- **Segmentation of text-based tables.**

## Reading in the tables and converting the text-based ones into lists of dictionaries

In [None]:
# Cohort handled by this notebook. It names the input sub-folder here and is
# reused further down to pick the cohort's entry in `section_order`.
cohort = 'mc1_t1'

# Locations are relative to the parent of the current working directory.
basedir = Path("..")
anonymized_location = basedir.joinpath('Anonymized_data')
path = anonymized_location.joinpath(cohort)

In [None]:
def _read_timed_table(csv_name, *timestamp_columns):
    """Read `csv_name` from `path` and convert the named columns to timedeltas.

    Parameters
    ----------
    csv_name : str
        File name of the CSV inside `path`.
    *timestamp_columns : str
        Columns to pass through `pd.to_timedelta`, in the order given.

    Returns
    -------
    pandas.DataFrame
        The loaded table with the requested columns converted.

    Raises
    ------
    FileNotFoundError
        Propagated from `pd.read_csv` when the file does not exist.
    """
    table = pd.read_csv(path / csv_name)
    for column in timestamp_columns:
        table[column] = pd.to_timedelta(table[column])
    return table


pf = _read_timed_table("pf_ratio.csv", 'pf_ratio_timestamp', 'vent_start_timestamp')

# The peep table is optional: when the file is missing, `peep` is set to None
# so that later cells can test for it.
try:
    peep = _read_timed_table("peep.csv", 'peep_timestamp')
except FileNotFoundError:
    peep = None
    print("This dataset doesn't seem to have peep separately specified.")

cxr = _read_timed_table("cxr.csv", 'cxr_timestamp')

notes = _read_timed_table("attending_notes.csv", 'notes_timestamp')

notes_annot = _read_timed_table("attending_notes_annotated.csv", 'notes_timestamp')

echo = _read_timed_table("echo_reports.csv", 'echo_timestamp')

bnp = _read_timed_table("bnp.csv", 'bnp_timestamp')

## Defining parameters for segmentation functions

In [None]:
# Open question: does listing different sections per cohort actually matter?
# The True/False pattern is the same everywhere; cohorts only add sections.
# A single comprehensive list of tuples covering every cohort might be enough,
# since a section absent from one cohort's CXRs would presumably just not match.
#
# Each entry is a (section title, flag) pair.
# NOTE(review): the flag presumably tells remove_easy_sections whether the
# section's text is kept -- confirm against src.segmentation_tools.

# Entries common to every cohort, in the order they are listed for each of them.
_shared_sections = (
    ('procedure:', False),
    ('indication:', False),
    ('technique:', False),
    ('history:', False),
    ('exam:', False),
    ('comparison:', False),
    ('finding_conclusion:', True),
    ('finding:', True),
    ('impression:', True),
    ('conclusion:', True),
)

# Every cohort gets its own list object, so editing one cohort's list never
# affects another cohort.
section_order = {
    'mc1_t1': [('result:', True), ('study:', False), *_shared_sections],
    'mc1_t2': list(_shared_sections),
    'mc2_t3': list(_shared_sections),
}

In [None]:
# A word in exclusion_set marks a report statement as not addressing the lungs.
#
# Note: Curt and Cathy recommended removing 'hila' and 'hilar' from this list.
#
# They suggested that removing 'venous' may also be appropriate,
# but that requires further consideration.
#
# Entries are listed alphabetically, one initial letter per line.
exclusion_set = {
    'adenopathy', 'aortic', 'artery', 'atria',
    'biliary', 'bones', 'bowel',
    'cabg', 'cardiac', 'cardiomegaly', 'carina', 'catheter', 'chest', 'cirrhosis',
    'devices', 'drain', 'drains',
    'ett',
    'gallbladder',
    'heart', 'hearts', 'hydronephrosis',
    'kidney',
    'line', 'lines', 'liver', 'lymph',
    'mediastinal', 'mediastinum', 'myeloma',
    'picc', 'pneumomediastinum',
    'spine', 'spleen', 'support_devices',
    'tube', 'tubes', 'tubes_devices',
    'vasculature', 'vein', 'vena', 'venous', 'ventric',
    'wire', 'wires',
}

In [None]:
# Hand-picked word -> replacement pairs. The mapping is handed to
# curate_indicator_word_list and stem_indicator_words further down.
targeted_stemming = dict(
    bilaterally='bilateral',
    infiltrates='infiltrate',
    inhalational='inhalation',
    opacities='opacity',
    angles='angle',
    effusions='effusion',
    patches='patch',
    patchy='patch',
    spaces='space',
    traces='trace',
)

In [None]:
# Multi-word stop phrases. The trailing space on the first three entries is
# part of the phrase, so they are spelled out literally.
complex_stopwords = [
    'there is ',
    'there are ',
    'there has been ',
    'at this time',
]

# Single-word stopwords, in the order they are handed to remove_stopwords.
simple_stopwords = (
    'a an are demonstrate demonstrated is noted present shows showed the'
).split()

In [None]:
# Statements handed to refine_cleaning further down, one entry per line.
# The empty string and the single space are intentional entries.
useless_statements = [
    '',
    ' ',
    'clinic',
    'clinical',
    'dx clinical',
    'discussed dr',
    'findings discussed dr',
    'first_name last_name',
    'intubated',
    'intubation',
    'patient rotated',
    'xr chest ap portable',
    'this exam was dictated at',
    '____',
]

In [None]:
# Dictation phrase passed to remove_dictation in the CXR segmentation loop below.
# NOTE(review): 'this_hospital' looks like a placeholder inserted by the
# anonymization step -- confirm it matches the anonymized report text.
dictation = 'this exam was dictated at this_hospital'

In [None]:
# Build the indicator-word list. `filename` and `verbose` stay module-level
# names; `verbose` is set again before the CXR segmentation loop.
# NOTE(review): `filename` carries no directory or extension, so
# curate_indicator_word_list presumably resolves the location itself -- confirm.
filename = 'ards_indicators'
verbose = False

indicator_words = curate_indicator_word_list(filename, targeted_stemming, verbose)

## Code cell performing report/text segmentation: CXRs

In [None]:
# One dict per CXR row, so the segmentation loop below can attach a new
# 'seg_cxr_text' entry to each record.
cxr_list = cxr.to_dict(orient='records')

In [None]:
# Set to True to print the report after every segmentation step.
verbose = False

# Run every CXR record through the segmentation steps, in a fixed order, and
# store the result on the record itself under 'seg_cxr_text'.
for cxr_record in cxr_list:
    # Section-level filtering, driven by this cohort's entry in section_order
    note = remove_easy_sections(cxr_record['cxr_findings'], section_order[cohort.strip('/')])
    if verbose:
        print(note, '\n')

    # Split the remaining text on periods
    new_report = note.split('.')
    if verbose:
        print(f"Note split---- {new_report}---\n")

    new_report = handle_subsection_titles(new_report)
    if verbose:
        print(f"++Split by :---- {new_report}---\n")

    new_report = remove_lines_on_other_organs(new_report, exclusion_set)
    if verbose:
        print(f"++Exclusions---- {new_report}---\n")

    new_report = stem_indicator_words(new_report, targeted_stemming)
    if verbose:
        print(f"++Stem Indic---- {new_report}---\n")

    new_report = remove_stopwords(new_report, complex_stopwords, simple_stopwords)
    if verbose:
        print(f"++Rem Stopw---- {new_report}---\n")

    new_report = remove_sections_n_duplicate_lines(new_report)
    if verbose:
        print(f"++Rem Dupli---- {new_report}---\n")

    new_report = refine_cleaning(new_report, useless_statements)
    if verbose:
        print(f"++Clean---- {new_report}---")

    new_report = remove_dictation(new_report, dictation, verbose)
    if verbose:
        print(f"++Dictation---- {new_report}---")

    # Store an independent copy of the segmented report on the record
    cxr_record['seg_cxr_text'] = deepcopy(new_report)
    if verbose:
        print(cxr_record['seg_cxr_text'])

## Annotated attending physician notes

#### First, extract the text surrounding each mention of a risk factor. It currently extracts a 200-character window around the mention (100 characters before and 100 after). If the note isn't that long, it takes whatever it can from the note (i.e., from beginning to end).

In [None]:
# Turn both tables into lists of dicts (one dict per row): the whole notes
# table and the annotated (SW) notes table are both processed below.
notes_list, notes_annot_list = (
    table.to_dict(orient='records') for table in (notes, notes_annot)
)

In [None]:
# Extract the relevant part of the notes mentioning each risk factor, for all
# notes. The helper is called once per risk factor, in the order listed, and
# its return value is not used.
for risk_factor in (
    'pneumonia',
    'chf',
    'aspiration',
    'sepsis',
    'shock',
    'cardiac_arrest',
):
    extract_surroundings_of_risk_factor_and_process(
        notes_list,
        text_field='notes_text',
        add_column_name=risk_factor
        )

In [None]:
# Extract the relevant part of the notes mentioning each risk factor, this time
# for the training dataset (SW notes). The helper is called once per risk
# factor, in the order listed, and its return value is not used.
for risk_factor in (
    'pneumonia',
    'chf',
    'aspiration',
    'sepsis',
    'shock',
    'cardiac_arrest',
):
    extract_surroundings_of_risk_factor_and_process(
        notes_annot_list,
        text_field='notes_text',
        add_column_name=risk_factor
        )

## Saving segmented files

In [None]:
# CSV files
# Output locations. The per-cohort folder is derived from `cohort` (the same
# variable that selects the input folder), so input and output cannot drift
# apart if the cohort is changed.
analysis_dir = Path.cwd() / basedir / 'Analysis_Data'
savepath = analysis_dir / cohort
ml_savepath = analysis_dir / 'train_ML'

# Rebuild DataFrames from the lists of dicts processed above, so the columns
# added during segmentation / risk-factor extraction are included.
cxr = pd.DataFrame(cxr_list)
notes = pd.DataFrame(notes_list)
notes_annot = pd.DataFrame(notes_annot_list)

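**Warning:** running the cell below overwrites any existing files in the output folders. Comment it out (or skip it) unless you intend to overwrite them.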

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure both output
# directories exist before writing (no-op when they are already there).
for _output_dir in (savepath, ml_savepath):
    _output_dir.mkdir(parents=True, exist_ok=True)

pf.to_csv(savepath / "pf_ratio.csv", index=False)
# `peep` is None when the cohort has no separate peep file
if peep is not None:
    peep.to_csv(savepath / "peep.csv", index=False)

cxr.to_csv(savepath / "cxr.csv", index=False)
notes.to_csv(savepath / "attending_notes.csv", index=False)
# The annotated notes are written twice: next to the cohort's other tables
# and into the train_ML folder.
notes_annot.to_csv(savepath / "attending_notes_annotated.csv", index=False)
notes_annot.to_csv(ml_savepath / "attending_notes_annotated.csv", index=False)
echo.to_csv(savepath / "echo_reports.csv", index=False)
bnp.to_csv(savepath / "bnp.csv", index=False)