In [None]:
# Generic imports
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import timedelta
from sklearn.metrics import (
    confusion_matrix,
    cohen_kappa_score
)

In [None]:
# Custom imports
from src.diagnosis_tools import (
    mark_hypoxemic_episodes,
    mark_abnormal_cxr,
    mark_cxr_within_48h_of_post_vent_hypoxemia,
    mark_note_within_7d,
    mark_notes_with_ml,
    text_match_risk_factors,
    diagnose_or_exclude_encounters,
    flag_echos
)
import src.plots as plots

In [None]:
# set plotting params
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
import matplotlib as mpl
# Reset to matplotlib's built-in defaults first so repeated runs start from the same state.
mpl.rcParams.update(mpl.rcParamsDefault)
plt.style.reload_library()
# Project-specific rcParams returned by src.plots, applied on top of the defaults.
rcparams = plots.stdrcparams1()
mpl.rcParams.update(rcparams)

In [None]:
# Custom display of tables for easier inspection:
# lift pandas' limits on column width, number of columns and total width.
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

While the criteria to identify ARDS include more than these, from the perspective of files these will be the important files/criteria:  
- PF ratios (ARDS if PF <= 300 mmHg and PEEP >= 5 cm H2O anytime during encounter)  
- Chest X-ray reports (ARDS if bilateral pulmonary opacities are identified within a 48 h window of PF ratio <= 300 AND both timestamps happen after intubation).  
- First and second criteria happening within 7 days of a known ARDS risk factor (-1 to 7 days of the latest timestamp of the above combo). Search attending physician notes for this.  
- If no risk factor is found, seek language ruling out cardiac failure in attending physician notes.  
- If no cardiac failure language is found, use objective criteria from echocardiography reports to rule out cardiac failure; otherwise, ARDS is adjudicated (window of entire hospitalization).

## Read in the tables

In [None]:
# Cohort to process; its folder name under the analysis directory.
cohort = 'hospital_a_2013'

# Directory layout, relative to the folder this notebook runs from.
basedir = Path("..")
analysis_location = basedir.joinpath('Analysis_Data')
training_location = analysis_location.joinpath('train_ML')
path = analysis_location.joinpath(cohort)

In [None]:
def _read_table(filename, *timedelta_cols):
    """Read one CSV from the cohort folder and parse the given columns as timedeltas."""
    table = pd.read_csv(path / filename)
    for col in timedelta_cols:
        table[col] = pd.to_timedelta(table[col])
    return table


pf = _read_table("pf_ratio.csv", 'pf_ratio_timestamp', 'vent_start_timestamp')

# The PEEP table is optional: when the file is absent, `peep` is set to None.
try:
    peep = _read_table("peep.csv", 'peep_timestamp')
except FileNotFoundError:
    peep = None
    print("This dataset doesn't seem to have peep separately specified.")

cxr = _read_table("cxr.csv", 'cxr_timestamp')
notes = _read_table("attending_notes.csv", 'notes_timestamp')
echo = _read_table("echo_reports.csv", 'echo_timestamp')
bnp = _read_table("bnp.csv", 'bnp_timestamp')

## Now, diagnosis.

### PF ratio table: Flagging hypoxemic windows (will check for PEEP >= 5 cm H2O if PEEP available)

In [None]:
# Flag hypoxemic PF-ratio records per encounter. `peep` is None when no peep.csv was found.
# Returns the PF table plus `hypox_df`, which later cells use as the set of hypoxemic entries.
# NOTE(review): `pf` is overwritten here, so re-running this cell feeds it its own output.
pf, hypox_df = mark_hypoxemic_episodes(pf, peep, 'encounter_id')

In [None]:
# Summary of the hypoxemia step: distinct encounters and total rows.
n_hypox_encounters = hypox_df['encounter_id'].nunique()
n_hypox_entries = len(hypox_df)

print(f"Encounters with hypoxemia: {n_hypox_encounters}")
print(f"Uniquely-identified hypoxemic entries: {n_hypox_entries}")

### Chest X-ray: Flagging abnormal CXRs and whether they are within 48 h of a hypoxemic record

#### Flagging those CXR that are "abnormal" (bilateral pulmonary opacities consistent with pulmonary edema).

In [None]:
# CSV with the labelled CXR reports used for training.
train_data = training_location / 'cxr_whole_training_dataset.csv'

# Score the cohort's CXR reports. Later cells read a `cxr_score_predicted` column from the
# result and compare it to the existing `cxr_score` label column.
# NOTE(review): presumably `train_col` is [text column, label column] of the training CSV - confirm.
cxr = mark_abnormal_cxr(
    cxr_table=cxr,
    train_data_path=train_data,
    train_col=['segmented_report', 'score'],
    test_label_col='cxr_score',
    thresholding="default"
    )

#### Flagging CXRs that are within 48 h of a hypoxemic entry

In [None]:
# Relate CXR reports to the hypoxemic entries. Later cells read a boolean `within_48h`
# column from the returned `cxr`; `hypox_pred_abn_cxr_48h` is the second returned table.
# NOTE(review): the exact matching rule lives in src.diagnosis_tools - confirm there.
cxr, hypox_pred_abn_cxr_48h = mark_cxr_within_48h_of_post_vent_hypoxemia(
    hypox_df,
    cxr,
    'encounter_id',
    'cxr_timestamp'
    )

In [None]:
# Number of CXR reports flagged abnormal by the label and by the prediction.
n_abnormal_labelled = cxr['cxr_score'].sum()
n_abnormal_predicted = cxr['cxr_score_predicted'].sum()

print(f"Abnormal CXRs - label: {n_abnormal_labelled}")
print(f"Predicted Abnormal CXRs: {n_abnormal_predicted}")

In [None]:
# Distinct encounters with at least one abnormal CXR according to the label column.
labelled_abnormal = cxr['cxr_score'].astype(bool)
n_encounters_labelled = cxr.loc[labelled_abnormal, 'encounter_id'].nunique()
print(f"Encounters with abnormal CXRs - label: {n_encounters_labelled}")

# Same count, using the predicted column directly as the row mask.
predicted_abnormal = cxr['cxr_score_predicted']
n_encounters_predicted = cxr.loc[predicted_abnormal, 'encounter_id'].nunique()
print(f"Encounters with predicted abnormal CXRs: {n_encounters_predicted}")

In [None]:
# CXR rows flagged as within 48 h: distinct encounters, then total rows.
cxr_within_48h = cxr.loc[cxr['within_48h']]
n_encounters_within_48h = cxr_within_48h['encounter_id'].nunique()

print(f"Encounters with hypoxemia and CXRs within 48h: {n_encounters_within_48h}")
print(f"Uniquely-identified entries: {len(cxr_within_48h)}")

In [None]:
# Size of the hypoxemia + predicted-abnormal-CXR table: distinct encounters, then rows.
n_encounters_hypox_abn_cxr = hypox_pred_abn_cxr_48h['encounter_id'].nunique()
n_entries_hypox_abn_cxr = len(hypox_pred_abn_cxr_48h)

print(f"Encounters with hypoxemia and predicted abnormal CXRs within 48h: {n_encounters_hypox_abn_cxr}")
print(f"Uniquely-identified entries: {n_entries_hypox_abn_cxr}")

### Attending physician notes

#### Flag notes within -1 to 7 days of latest of hypoxemia or abnormal CXR report.

In [None]:
# Flag attending notes relative to the hypoxemia / abnormal-CXR timestamps.
# Later cells read a boolean `within_7d` column from the returned table.
# NOTE(review): the -1 to 7 day window is stated in the markdown above; confirm in src.diagnosis_tools.
notes = mark_note_within_7d(
    notes,
    hypox_df,
    hypox_pred_abn_cxr_48h,
    'encounter_id',
    'cxr_timestamp'
    )

In [None]:
# Notes flagged as within the 7-day window: distinct encounters, then total rows.
notes_within_7d = notes.loc[notes['within_7d']]
n_encounters_within_7d = notes_within_7d['encounter_id'].nunique()

print(f"Encounters with notes within 7 days of hypox_abn_cxr: {n_encounters_within_7d}")
print(f"Uniquely-identified entries: {len(notes_within_7d)}")

#### Flag notes mentioning any risk factor. Separately, flag notes with cardiac failure language.

In [None]:
# ML-based flagging of notes, trained on the pneumonia columns named in `train_col`.
# Note that `train_data_path` is the training *directory* here, whereas the CXR step
# passed a specific CSV file.
# NOTE(review): presumably the helper locates its own training file inside that directory - confirm.
notes = mark_notes_with_ml(
    attn_notes=notes,
    train_data_path=training_location,
    train_col=['seg_pneumonia', 'pneumonia_sw'],
    thresholding="default"
    )

In [None]:
# Text-match pass over the notes (risk factors / cardiac failure language, per the heading above).
# NOTE(review): the columns added are defined in src.diagnosis_tools - confirm there.
notes = text_match_risk_factors(notes)

In [None]:
# Split encounters by note content. Returns the notes table plus three tables:
#   diagnosed                - counted below as "PF+CXR+Notes" diagnoses
#   excluded                 - only counted (excl_encntrs) in the visible cells
#   for_objective_assessment - passed on to the BNP / ECHO exclusion steps below
notes, diagnosed, excluded, for_objective_assessment = diagnose_or_exclude_encounters(
    notes,
    hypox_pred_abn_cxr_48h,
    'encounter_id'
    )

In [None]:
# Distinct encounters in, respectively: all notes, diagnosed, excluded, and those
# forwarded to objective assessment (printed comma-separated in that order).
encounter_counts = [
    table['encounter_id'].nunique()
    for table in (notes, diagnosed, excluded, for_objective_assessment)
]
print(", ".join(str(count) for count in encounter_counts))

### BNP and ECHO reports: Objective assessment of cardiac failure

#### First, let's annotate the ECHO reports with the values/statements of interest:  
- lvef < 40%  
- cardiopulmonary bypass  
- left atrial dimension > 4 cm or volume index > 28 mL/m2  
- left ventricular hypertrophy  
- Grade II or III diastolic dysfunction

In [None]:
# These will be dictionaries whose keys will become the column names for the flags
# and the lists will be the regex patterns to search for.
# `echo_prefix` holds, per flag, the alternative phrases that introduce the finding;
# `echo_suffix` holds, per flag, the pattern for what must follow that phrase.
#
# All patterns are raw strings (r'...'): sequences such as \s, \d, \D, \. and \/ are not
# valid Python string escapes, and non-raw literals containing them trigger
# DeprecationWarning / SyntaxWarning on recent Python versions. The resulting pattern
# text is identical to the previous non-raw literals.

# (?i) is to inactivate case-sensitivity
# (?:) is to indicate that contents inside a parenthesis shouldn't be read as a "capturing group"
# Default behavior of () is to consider it a capturing group
echo_prefix = {'lvef': [r'(?i)lv\s+ejection\s+fraction',
                        r'(?i)left\s+ventricular\s+ejection\s+fraction',
                        r'(?i)lvef',
                        r'(?i)left\s+ventricular\s+ef',
                        r'(?i)lvef\s+is',
                        r'(?i)left\s+ventricle\s+ejection\s+fraction\s+is',
                        r'(?i)lv\s+ejection\s+fraction\s+is'],

               # Match "cardiopulmonary bypass" ensuring at least one whitespace character between those words
               'cp_bypass': [r'(?i)cardiopulmonary\s+bypass'],

               'la_dimension': [r'(?i)la\s+diameter',
                                r'(?i)la\s+dimension'],

               'la_volume_index': [r'(?i)la\s+volume',
                                   r'(?i)LA\s+Vol\s+BP\s+A/L\s+Index'],

               'lv_hypertrophy': [r'(?i)(?:left\s+ventricular|lv|lv\s+concentric)\s*hypertrophy',
                                  r'(?i)LVH'],

               'diastolic_dysfunction': [r'(?i)(grade\s*ii)',
                                         r'(?i)(grade\s*iii)']}

echo_suffix = {'lvef': r'\D{0,20}(\d{1,3}|\d{1,2}\s*-\s*\d{1,3})-{0,1}\s*%',  # Sample matches: 45%, 45 %, 45-55%, 45 - 55 %, 45- 100%, 45- %
               'cp_bypass': r'(?!\s*N\/A|\s*Patient\s+was\s+not\s+placed\s+on\s+cardiopulmonary\s+bypass|\s*NA)',  # Don't match if N/A or Patient wasn't placed on CPB
               'la_dimension': r'\D{0,25}(\d\.\s*\d)\s*(?:cm|centimeter)',  # Sample matches: 2.7cm, 2.7 cm, 2.7   centimeter

               # Match anything until "ml" appears once or never, then match anything until the number of interest appears
               # followed by either ml/m or ml per square meter
               'la_volume_index': r'.*?(?:ml)?.*?(\d+\.\s*\d+)\s+(?:(?=ml\/m)|(?=ml\s+per\s+square\s+meter))',
               # No suffix needed: the prefix phrase alone is the finding
               'lv_hypertrophy': '',
               # Matches anything, either never or up to 30 characters, then an arbitrary number of white spaces,
               # as long as "diastolic dysfunction" immediately follows.
               'diastolic_dysfunction': r'.{0,30}\s*?(?=diastolic\s+dysfunction)'}

In [None]:
# Apply the prefix/suffix patterns to the ECHO reports. Later cells read `<key>_value`
# columns from the result (lvef_value, cp_bypass_value, la_dimension_value,
# la_volume_index_value, lv_hypertrophy_value, diastolic_dysfunction_value).
# NOTE(review): how prefix and suffix are combined is defined in src.diagnosis_tools - confirm there.
echo = flag_echos(echo, echo_prefix, echo_suffix)

In [None]:
# Encounters entering objective assessment
n_entering_assessment = for_objective_assessment['encounter_id'].nunique()
text = f"There are {n_entering_assessment} unique encounters entering objective assessment"
print(text)

#### 1. Taking away encounters that have BNP > 100 pg/mL

In [None]:
# Encounters with any BNP measurement above 100 (pg/mL per the heading above).
bnp_above_100 = bnp['bnp_value'].gt(100)
encounters_with_bnp_greater_than_100 = list(bnp.loc[bnp_above_100, 'encounter_id'].unique())

# Keep only the rows whose encounter is not in that list.
has_high_bnp = for_objective_assessment['encounter_id'].isin(encounters_with_bnp_greater_than_100)
remaining_after_bnp = for_objective_assessment[~has_high_bnp]

In [None]:
# Encounters remaining after bnp exclusion
n_high_bnp = len(encounters_with_bnp_greater_than_100)
n_after_bnp = remaining_after_bnp['encounter_id'].nunique()

text1 = f"{n_high_bnp} encounters in BNP table have BNP > 100 pg/mL"
text2 = f"\n{n_after_bnp} encounters remain"

print(text1 + text2)

#### 2. Taking away encounters that have left ventricular ejection fraction < 40%

In [None]:
# Encounters with any ECHO report whose extracted LVEF is below 40 (%).
lvef_below_40 = echo['lvef_value'].lt(40)
encounters_with_lvef_smaller_than_40 = list(echo.loc[lvef_below_40, 'encounter_id'].unique())

# Drop those encounters from what survived the BNP step.
has_low_lvef = remaining_after_bnp['encounter_id'].isin(encounters_with_lvef_smaller_than_40)
remaining_after_lvef = remaining_after_bnp[~has_low_lvef]

In [None]:
# Encounters remaining after lvef exclusion
# The message previously read "in ECHO have with LVEF"; it is reworded to match the
# phrasing of the BNP / bypass / additional-criteria reports ("in ECHO table have ...").
text1 = "{encntrs_with_lvef_smaller_than_40} encounters in ECHO table have LVEF < 40%".format(
    encntrs_with_lvef_smaller_than_40 = len(encounters_with_lvef_smaller_than_40))

text2 = "\n{encntrs_remaining} encounters remain".format(
    encntrs_remaining = remaining_after_lvef['encounter_id'].nunique())

print(text1+text2)

#### 3. Taking away encounters that had cardiopulmonary bypass in the report (as a proxy for having had cardiopulmonary bypass during the ECHO).

In [None]:
# Encounters with any ECHO report where the cardiopulmonary-bypass pattern produced a value.
mentions_cpb = echo['cp_bypass_value'].notna()
encounters_with_cardiopulmonary_bypass = list(echo.loc[mentions_cpb, 'encounter_id'].unique())

# Drop those encounters from what survived the LVEF step.
had_cpb = remaining_after_lvef['encounter_id'].isin(encounters_with_cardiopulmonary_bypass)
remaining_after_cpb = remaining_after_lvef[~had_cpb]

In [None]:
# Encounters remaining after cardiopulmonary bypass exclusion
n_with_cpb = len(encounters_with_cardiopulmonary_bypass)
n_after_cpb = remaining_after_cpb['encounter_id'].nunique()

text1 = f"{n_with_cpb} encounters in ECHO table have cardiopulmonary bypass"
text2 = f"\n{n_after_cpb} encounters remain"

print(text1 + text2)

#### 4. Taking away encounters that have two out of three additional criteria:  
- Left atrial enlargement (either left atrial dimension > 4 cm or left atrial volume index > 28 mL/m^2)  
- Left ventricular hypertrophy  
- Grade II or Grade III diastolic dysfunction

In [None]:
# Scoring the presence of criteria as 0 or 1
# Left atrial enlargement: either extracted measurement exceeds its threshold.
la_dim = echo['la_dimension_value'].gt(4)
la_vol_idx = echo['la_volume_index_value'].gt(28)
echo['la_enlargement_bool'] = (la_dim | la_vol_idx).astype(int)

# The other two criteria count as present whenever their pattern produced a value.
echo['lv_hypertrophy_bool'] = echo['lv_hypertrophy_value'].notna().astype(int)
echo['diastolic_dysfunction_bool'] = echo['diastolic_dysfunction_value'].notna().astype(int)

# Number of criteria met per ECHO report (0 to 3).
criteria_flags = ['la_enlargement_bool', 'lv_hypertrophy_bool', 'diastolic_dysfunction_bool']
echo['additional_criteria_count'] = echo[criteria_flags].sum(axis=1)

In [None]:
# Encounters with any ECHO report meeting at least two of the three additional criteria.
meets_two_criteria = echo['additional_criteria_count'].gt(1)
encounters_with_additional_criteria = list(echo.loc[meets_two_criteria, 'encounter_id'].unique())

# Drop those encounters from what survived the bypass step.
has_additional_criteria = remaining_after_cpb['encounter_id'].isin(encounters_with_additional_criteria)
remaining_after_additional_criteria = remaining_after_cpb[~has_additional_criteria]

In [None]:
# Encounters remaining after additional criteria exclusion
n_with_criteria = len(encounters_with_additional_criteria)
n_after_criteria = remaining_after_additional_criteria['encounter_id'].nunique()

text1 = f"{n_with_criteria} encounters in ECHO table have two out of three additional criteria"
text2 = f"\n{n_after_criteria} encounters remain"

print(text1 + text2)

In [None]:
# Distinct encounter IDs that survived every objective-assessment exclusion.
# NOTE(review): this name is not referenced by any later cell visible here.
encounters_remaining = remaining_after_additional_criteria['encounter_id'].drop_duplicates()

## Diagnosed encounters

In [None]:
# Distinct encounter counts from the notes-based split.
# NOTE(review): `excl_encntrs` is not referenced by any later cell visible here.
excl_encntrs = excluded['encounter_id'].nunique()
# Used in the next cell to compute the total number of diagnosed encounters.
diagnosed_encntrs = diagnosed['encounter_id'].nunique()

In [None]:
# Report diagnoses from the notes route, from the objective-assessment route, and their sum.
n_from_notes = diagnosed['encounter_id'].nunique()
n_from_objective = remaining_after_additional_criteria['encounter_id'].nunique()
# The total uses `diagnosed_encntrs`, which is computed in the previous cell.
n_total = diagnosed_encntrs + n_from_objective

text1 = f"Encounters diagnosed:\n\nPF+CXR+Notes: {n_from_notes}"
text2 = f"\nNo risk factor nor objective evidence of cardiac failure: {n_from_objective}"
text3 = f"\nTotal encounters diagnosed: {n_total}"

print(text1 + text2 + text3)

Taking away encounters without ECHOs or BNP values

In [None]:
# File having encounters diagnosed as ARDS in CCM paper
# The raw-data folder is derived from `cohort` (set in the paths cell) rather than a
# second hard-coded copy of the cohort name, so changing the cohort in one place cannot
# silently compare against another cohort's reference diagnoses.
encounters_diagnosed_in_CCM = pd.read_csv(basedir / "Raw_data" / cohort / 'ards' / 'ARDS_final.csv')
# Harmonise the ID column name with the rest of the tables.
encounters_diagnosed_in_CCM = encounters_diagnosed_in_CCM.rename(
    columns={
        "encounter_ID": "encounter_id"
        }
    )

# Union of the encounters diagnosed through notes and those left after objective assessment.
encounters_diagnosed_by_pipeline = pd.merge(
    diagnosed['encounter_id'].drop_duplicates(),
    remaining_after_additional_criteria['encounter_id'].drop_duplicates(),
    how='outer'
    ).drop_duplicates()

In [None]:
# Anonymizing encounter IDs
# The key file name is derived from `cohort` (set in the paths cell) instead of repeating it.
with open(basedir / "Anonymization_notebooks" / "keys" / f'map_ids_{cohort}.json', 'r') as key_file:
    # JSON object keys are always strings, so only string-typed IDs can match the mapping.
    keys = json.load(key_file)

# Nested-dict form: within column 'encounter_id', replace each key of `keys` by its value.
# The former `method="None"` argument is dropped: it passed the *string* "None" (not None),
# it has no role in a dict-based replacement, and the `method` keyword of
# DataFrame.replace is deprecated in recent pandas releases.
encounters_diagnosed_in_CCM = encounters_diagnosed_in_CCM.replace(
    to_replace={
        'encounter_id': keys
        }
    )

In [None]:
# remove alphanumeric IDs
# Only records whose encounter_id is exactly a Python int are kept; anything else
# (e.g. IDs that are still strings after the replacement) is discarded.
encounters_diagnosed_in_CCM_list = encounters_diagnosed_in_CCM.to_dict(orient='records')
cleaned_CCM_list = [
    record
    for record in encounters_diagnosed_in_CCM_list
    if type(record['encounter_id']) is int
]

cleaned_CCM_encounters = pd.DataFrame(cleaned_CCM_list)

In [None]:
# Creating encounters table
# Outer-merge every hypoxemic encounter with the pipeline-diagnosed ones. With
# indicator=True pandas adds a `_merge` column: "both" = hypoxemic and diagnosed,
# "left_only" = hypoxemic only, "right_only" = diagnosed but absent from hypox_df.
encounter_summary = pd.merge(
    hypox_df['encounter_id'].drop_duplicates(),
    encounters_diagnosed_by_pipeline,
    how='outer',
    indicator=True
    )

In [None]:
# Turn the merge indicator into a Yes/No flag for "diagnosed by the pipeline".
# `_merge` is a categorical column. Relabelling it through DataFrame.replace relies on
# replace() renaming categories, which recent pandas releases deprecate, so the column
# is converted to plain objects first and mapped explicitly. Values without a mapping
# (i.e. "right_only") are passed through unchanged, as before.
merge_to_flag = {
    "left_only": 'No',
    "both": 'Yes'
    }

encounter_summary = encounter_summary.assign(
    _merge=encounter_summary['_merge'].astype(object).map(
        lambda value: merge_to_flag.get(value, value)
        )
    )

encounter_summary = encounter_summary.rename(
    columns={
        '_merge': "pipeline_diagnosed"
        }
    )

In [None]:
# Adding encounters diagnosed in CCM
# No `on=` is given, so pandas joins on every column name the two frames share.
# The outer join keeps CCM encounters that are missing from the summary; those rows get
# `_merge == "right_only"` and NaN in the columns that only exist on the left side.
encounter_summary = pd.merge(
    encounter_summary,
    cleaned_CCM_encounters,
    how='outer',
    indicator=True
    )

In [None]:
# Turn the merge indicator into a Yes/No flag for "diagnosed in the CCM reference file".
# "both" and "right_only" both mean the encounter is present in the CCM file. Previously
# "right_only" was left as a third label, and those rows carried NaN in
# `pipeline_diagnosed`, which cannot be scored as a 2x2 Yes/No comparison.
ccm_to_flag = {
    "left_only": "No",
    "both": "Yes",
    "right_only": "Yes"
    }

# `_merge` is categorical; convert to plain objects before relabelling rather than relying
# on DataFrame.replace renaming categories (deprecated in recent pandas releases).
ccm_flag = encounter_summary['_merge'].astype(object).map(
    lambda value: ccm_to_flag.get(value, value)
    )

# Rows that exist only in the CCM file were never diagnosed by the pipeline.
pipeline_flag = encounter_summary['pipeline_diagnosed'].astype(object)
pipeline_flag = pipeline_flag.where(pipeline_flag.notna(), "No")

encounter_summary = encounter_summary.assign(
    _merge=ccm_flag,
    pipeline_diagnosed=pipeline_flag
    )
encounter_summary = encounter_summary.rename(
    columns={
        '_merge': "CCM_diagnosed"
        }
    )

In [None]:
# Compare physician adjudication (CCM file) with the pipeline's diagnosis.
y_true = encounter_summary['CCM_diagnosed']
y_pred = encounter_summary['pipeline_diagnosed']

# Fix the label order explicitly so the matrix is always 2x2 with "No" first, matching
# the TN/FP/FN/TP layout of `strings` below, even if one class is absent from the data.
class_labels = ['No', 'Yes']

# confusion_matrix silently ignores samples whose label is not listed, so fail loudly
# if anything other than Yes/No (e.g. a leftover merge indicator or NaN) is present.
unexpected = ~(y_true.isin(class_labels) & y_pred.isin(class_labels))
if unexpected.any():
    raise ValueError(
        f"{int(unexpected.sum())} rows have a diagnosis flag other than 'Yes'/'No'"
    )

cf = confusion_matrix(y_true, y_pred, labels=class_labels)

# Rows are true labels, columns are predicted labels.
strings = np.asarray([['True negatives\n', 'False positives\n'],
                      ['False negatives\n', 'True positives\n']])

# One annotation per cell: the cell's name followed by its count.
labels = (np.asarray(["{0} {1:.0f}".format(string, value)
                      for string, value in zip(strings.flatten(),
                                               cf.flatten())])
         ).reshape(2, 2)

fig1, ax1 = plt.subplots(figsize=plots.stdfigsize())
# fmt='' because the annotations are already formatted strings.
sns.heatmap(cf, fmt='', annot=labels, cmap='Blues', cbar=False, ax=ax1)
ax1.set_ylabel("Physician adjudicated")
ax1.set_xlabel("Pipeline adjudicated")
# Tick marks and tick labels are hidden; the cell annotations carry the meaning.
ax1.tick_params(axis='both', bottom=False, left=False,
                labelbottom=False, labelleft=False)

plt.tight_layout()
plt.show()

In [None]:
# Cohen's kappa between the physician (CCM) and pipeline flags defined in the previous
# cell; shown as the cell's output.
cohen_kappa_score(y_true, y_pred)