# This notebook develops the code that pulls relevant numbers and phrases from echocardiography (ECHO) reports.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from pathlib import Path
from sklearn.metrics import confusion_matrix
from scipy.spatial.distance import hamming

import sys
sys.path.append("../")
from src.diagnosis_tools import *
import src.plots as plots

In [None]:
# Lift every pandas display limit so tables are shown in full when inspected
for display_option in ('display.max_rows',
                       'display.max_colwidth',
                       'display.max_columns',
                       'display.width'):
    pd.set_option(display_option, None)

In [None]:
# set plotting params
import matplotlib as mpl
# Start from matplotlib's default rcParams before applying the project settings
mpl.rcParams.update(mpl.rcParamsDefault)
plt.style.reload_library()
# stdrcparams1 lives in src.plots; presumably it returns a mapping of rcParams
# overrides (it is passed straight to rcParams.update) -- confirm in src/plots.py
rcparams = plots.stdrcparams1()
mpl.rcParams.update(rcparams)

In [None]:
# Data locations (all relative to the directory two levels above this notebook)
basedir = Path("../..")
cohort = 'hospital_a_2013'

analysis_location = basedir.joinpath('Analysis_Data')
preprocess_location = basedir.joinpath("Preprocessed_Data")
training_location = analysis_location.joinpath('train_ML')

# Cohort-specific analysis folder and the manually annotated ECHO samples inside it
path = analysis_location.joinpath(cohort)
echo_validation = path.joinpath('ECHO_validation')

# Figures
figure_path = basedir.joinpath("Figures")

In [None]:
# Load the cohort's ECHO reports and parse the timestamp column as a timedelta
echo_reports_file = path / "echo_reports.csv"
echo = pd.read_csv(echo_reports_file)
echo_timestamps = pd.to_timedelta(echo['echo_timestamp'])
echo['echo_timestamp'] = echo_timestamps

In [None]:
# Cell captions for annotated confusion-matrix heatmaps.
# Every matrix in this notebook is built as
#     confusion_matrix(y_true, y_pred).transpose()[::-1, ::-1]
# which turns sklearn's [[TN, FP], [FN, TP]] layout into
#     rows = regex-matched (Yes, No), columns = text-matched (Yes, No)
#     => [[TP, FP], [FN, TN]]
# The captions must follow that flipped layout, otherwise the diagonal
# counts get the wrong caption when the two arrays are zipped together.
strings = np.asarray([['True positives\n', 'False positives\n'],
                      ['False negatives\n', 'True negatives\n']])

In [None]:
# Regex building blocks, keyed by measurement.
# The keys become the column names for the flags; `echo_prefix` holds the list of
# patterns that locate a mention of the measurement and `echo_suffix` holds the
# pattern for what has to follow it (including the capture group for the value,
# where there is one). Both dictionaries are handed to flag_echos.
#
# All patterns are raw strings so that regex escapes such as \s, \d, \D, \/ and \.
# reach the regex engine untouched instead of being treated as (invalid) Python
# string escapes, which raises a DeprecationWarning/SyntaxWarning.
#
# (?i) is to inactivate case-sensitivity
# (?:) is to indicate that contents inside a parenthesis shouldn't be read as a "capturing group"
# Default behavior of () is to consider it a capturing group
echo_prefix = {
    'lvef': [r'(?i)lv\s+ejection\s+fraction',
             r'(?i)left\s+ventricular\s+ejection\s+fraction',
             r'(?i)lvef',
             r'(?i)left\s+ventricular\s+ef',
             r'(?i)lvef\s+is',
             r'(?i)left\s+ventricle\s+ejection\s+fraction\s+is',
             r'(?i)lv\s+ejection\s+fraction\s+is'],

    # Match "cardiopulmonary bypass" ensuring at least one whitespace character between those words
    'cp_bypass': [r'(?i)cardiopulmonary\s+bypass'],

    'la_dimension': [r'(?i)la\s+diameter',
                     r'(?i)la\s+dimension'],

    'la_volume_index': [r'(?i)la\s+volume',
                        r'(?i)LA\s+Vol\s+BP\s+A/L\s+Index'],

    # The negative lookbehind skips mentions preceded by "borderline "
    'lv_hypertrophy': [r'(?i)(?<!borderline )(?:left\s+ventricular|lv|lv\s+concentric)\s*hypertrophy',
                       r'(?i)(?<!borderline )LVH'],

    'diastolic_dysfunction': [r'(?i)(grade\s*ii)',
                              r'(?i)(grade\s*iii)'],
}

echo_suffix = {
    # Sample matches: 45%, 45 %, 45-55%, 45 - 55 %, 45- 100%, 45- %
    'lvef': r'\D{0,20}(\d{1,3}|\d{1,2}\s*-\s*\d{1,3})-{0,1}\s*%',

    # Don't match if N/A or Patient wasn't placed on CPB
    'cp_bypass': r'(?!\s*N\/A|\s*Patient\s+was\s+not\s+placed\s+on\s+cardiopulmonary\s+bypass|\s*NA)',

    # Sample matches: 2.7cm, 2.7 cm, 2.7   centimeter
    'la_dimension': r'\D{0,25}(\d\.\s*\d)\s*(?:cm|centimeter)',

    # Match anything until "ml" appears once or never, then match anything until the number of interest appears
    # followed by either ml/m or ml per square meter
    'la_volume_index': r'.*?(?:ml)?.*?(\d+\.\s*\d+)\s+(?:(?=ml\/m)|(?=ml\s+per\s+square\s+meter))',

    # No suffix: the prefix alone decides the match
    'lv_hypertrophy': '',

    # Matches anything, either never or up to 30 characters, then an arbitrary number of white spaces,
    # as long as "diastolic dysfunction" immediately follows.
    'diastolic_dysfunction': r'.{0,30}\s*?(?=diastolic\s+dysfunction)',
}

In [None]:
# Apply the prefix/suffix regexes to the reports. flag_echos is not defined in this
# notebook (presumably it comes from the star import of src.diagnosis_tools); the
# cells below read a `<key>_flag` and a `<key>_value` column for each dictionary key,
# so it presumably adds those columns -- confirm in src/diagnosis_tools.py.
# NOTE(review): `echo` is overwritten with the result, so re-running this cell on
# its own feeds the already-flagged frame back into flag_echos.
echo = flag_echos(echo, echo_prefix, echo_suffix)

### Evaluating performance by metric

#### Left ventricular ejection fraction

In [None]:
# Columns needed to evaluate the LVEF regex against the pre-existing flag.
# Take an explicit copy: the next code cell writes into this frame with .loc, and
# doing that on a plain column selection of `echo` triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which frame is being modified.
evaluate_lvef = echo[['echo_text', 'ejection_fraction', 'lvef_flag', 'lvef_value']].copy()

Evaluating flags by comparing against the flags already there

In [None]:
# Recode a missing pre-existing flag as 0 so the column can be compared with the regex flag
missing_ef = evaluate_lvef['ejection_fraction'].isnull()
evaluate_lvef.loc[missing_ef, 'ejection_fraction'] = 0

In [None]:
# Pre-existing (text-matched) flag is the reference; the regex flag is the prediction
y_true = evaluate_lvef['ejection_fraction']
y_pred = evaluate_lvef['lvef_flag']

# Transpose and reverse both axes so rows are the regex result and columns the
# text-matched flag, each ordered (Yes, No) -- assuming 0/1 labels
cf_lvef = confusion_matrix(y_true, y_pred).T[::-1, ::-1]

fig, ax = plt.subplots(figsize=plots.stdfigsize(0, layout="single"))
sns.heatmap(cf_lvef, annot=True, fmt='d', cbar=False, cmap='Blues', ax=ax)
ax.set_yticklabels(['Yes', 'No'], rotation=0)
ax.set_xticklabels(['Yes', 'No'])
ax.set(xlabel="Text-matched",
       ylabel="Regex-matched",
       title="Left ventricular ejection fraction")

fig.tight_layout()
# plt.savefig(figure_path / 'SIfig7_lvef_cf.png')
plt.show()

Now comparing captured values against a labeled subset of ECHOs. Specifically, I'll annotate 10% of ECHOs that had a value captured and a flag present, and 10% of ECHOs that had a flag but no value.  
I'll use the Hamming distance, which suits this task because each comparison is binary: either the regex captured the right value or it didn't.

In [None]:
# a = evaluate_lvef['ejection_fraction'] == 1
# b = evaluate_lvef['lvef_value'].isna()
# no_value_lvef = evaluate_lvef.loc[a & b].sample(frac=0.1)

In [None]:
# # Writing the ECHO texts to a txt file for ease of annotating
# no_value_lvef = no_value_lvef.reset_index().drop(columns=['index'])

# # Also, keeping the original file to not have to do double-work
# no_value_lvef.to_csv(echo_validation / "annot_echos_lvef_no_value.csv", index=False)

# for i in range(len(no_value_lvef)):
#     with open(echo_validation / f"annot_echos_lvef_no_value{i+1}.txt", "w") as f:
#         f.write(no_value_lvef.loc[i, 'echo_text'])

In [None]:
# a = evaluate_lvef['ejection_fraction'] == 1
# b = evaluate_lvef['lvef_value'].notnull()
# with_value_lvef = evaluate_lvef.loc[a & b].sample(frac=0.1)

In [None]:
# with_value_lvef = with_value_lvef.reset_index().drop(columns=['index'])

# with_value_lvef.to_csv(echo_validation / "annot_echos_lvef_with_value.csv", index=False)

# for i in range(len(with_value_lvef)):
#     with open(echo_validation / f"annot_echos_lvef_with_value{i+1}.txt", "w") as f:
#         f.write(with_value_lvef.loc[i, 'echo_text'])

In [None]:
# Load the LVEF samples written out by the (now commented-out) cells above.
# The next cell expects them to contain an `annot_lvef_value` column next to the
# regex-captured `lvef_value`.
no_value_lvef = pd.read_csv(echo_validation / "annot_echos_lvef_no_value.csv")
with_value_lvef = pd.read_csv(echo_validation / "annot_echos_lvef_with_value.csv")

In [None]:
# Fraction of sampled reports where the regex value differs from the annotated value.
# NOTE(review): scipy's hamming compares element-wise with `!=`, so a row that is NaN
# in both columns presumably counts as a mismatch (NaN != NaN) -- confirm how "no value"
# was encoded in the annotation files, especially for the no-value sample.
dist_no_value_lvef, dist_with_value_lvef = (
    hamming(sample['lvef_value'], sample['annot_lvef_value'])
    for sample in (no_value_lvef, with_value_lvef)
)

print(dist_no_value_lvef, dist_with_value_lvef)

#### Cardiopulmonary bypass

For this one, only value performance can be assessed since there's no prior flag indicating cardiopulmonary bypass

In [None]:
# Report text plus the regex flag/value for cardiopulmonary bypass.
# NOTE(review): this frame is only consumed by the commented-out export cell below;
# the following active cell overwrites `evaluate_cpb` with the annotated CSV.
evaluate_cpb = echo[['echo_text', 'cp_bypass_flag', 'cp_bypass_value']]

In [None]:
# # Writing the ECHO texts to a txt file for ease of annotating
# evaluate_cpb = evaluate_cpb.reset_index().drop(columns=['index'])

# # Also, keeping the original file to not have to do double-work
# evaluate_cpb.to_csv(echo_validation / "annot_echos_cpb.csv", index=False)

# for i in range(len(evaluate_cpb)):
#     with open(echo_validation / f"annot_echos_cpb{i+1}.txt", "w") as f:
#         f.write(evaluate_cpb.loc[i, 'echo_text'])

In [None]:
# Load the annotated cardiopulmonary-bypass reports; the next cell expects an
# `annot_cp_bypass_value` column alongside `cp_bypass_value`.
evaluate_cpb = pd.read_csv(echo_validation / "annot_echos_cpb.csv")

In [None]:
# Fraction of annotated reports where the regex result differs from the annotation
regex_cpb = evaluate_cpb['cp_bypass_value']
annotated_cpb = evaluate_cpb['annot_cp_bypass_value']
dist_evaluate_cpb = hamming(regex_cpb, annotated_cpb)

print(dist_evaluate_cpb)

#### Left atrial dimension/diameter

Flagging performance

In [None]:
# Columns needed to evaluate the LA dimension regex against the pre-existing column.
# Copy explicitly so the .loc writes below modify this frame only and do not
# raise SettingWithCopyWarning on a column selection of `echo`.
evaluate_lad = echo[['echo_text', 'la_diameter', 'la_dimension_flag', 'la_dimension_value']].copy()

# Binarise the pre-existing column: missing -> 0, anything present -> 1
f = evaluate_lad['la_diameter'].isna()
evaluate_lad.loc[f, 'la_diameter'] = 0
evaluate_lad.loc[~f, 'la_diameter'] = 1

In [None]:
# Pre-existing (text-matched) flag is the reference; the regex flag is the prediction
y_true = evaluate_lad['la_diameter']
y_pred = evaluate_lad['la_dimension_flag']

# Transpose and reverse both axes so rows are the regex result and columns the
# text-matched flag, each ordered (Yes, No) -- assuming 0/1 labels
cf_lad = confusion_matrix(y_true, y_pred).T[::-1, ::-1]

fig1, ax1 = plt.subplots(figsize=plots.stdfigsize(0, layout="single"))
sns.heatmap(cf_lad, annot=True, fmt='d', cbar=False, cmap='Blues', ax=ax1)
ax1.set_yticklabels(['Yes', 'No'], rotation=0)
ax1.set_xticklabels(['Yes', 'No'])
ax1.set(xlabel="Text-matched",
        ylabel="Regex-matched",
        title="Left atrial dimension/diameter")

fig1.tight_layout()
# plt.savefig(figure_path / 'SIfig7_la_dim_cf.png')
plt.show()

Now, value performance

In [None]:
# f = evaluate_lad['la_dimension_flag'] == 1
# g = evaluate_lad['la_dimension_value'].isna()
# no_value_lad = evaluate_lad.loc[f & g].sample(frac=0.1)

In [None]:
# # Writing the ECHO texts to a txt file for ease of annotating
# no_value_lad = no_value_lad.reset_index().drop(columns=['index'])

# # Also, keeping the original file to not have to do double-work
# no_value_lad.to_csv(echo_validation / "annot_echos_lad_no_value.csv", index=False)

# for i in range(len(no_value_lad)):
#     with open(echo_validation / f"annot_echos_lad_no_value{i+1}.txt", "w") as f:
#         f.write(no_value_lad.loc[i, 'echo_text'])

In [None]:
# a = evaluate_lad['la_dimension_flag'] == 1
# b = evaluate_lad['la_dimension_value'].notnull()
# with_value_lad = evaluate_lad.loc[a & b].sample(frac=0.1)

In [None]:
# with_value_lad = with_value_lad.reset_index().drop(columns=['index'])

# with_value_lad.to_csv(echo_validation / "annot_echos_lad_with_value.csv", index=False)

# for i in range(len(with_value_lad)):
#     with open(echo_validation / f"annot_echos_lad_with_value{i+1}.txt", "w") as f:
#         f.write(with_value_lad.loc[i, 'echo_text'])

In [None]:
# Load the LA dimension samples written out by the (now commented-out) cells above.
# The next cell expects an `annot_la_dimension_value` column next to `la_dimension_value`.
no_value_lad = pd.read_csv(echo_validation / "annot_echos_lad_no_value.csv")
with_value_lad = pd.read_csv(echo_validation / "annot_echos_lad_with_value.csv")

In [None]:
# Fraction of sampled reports where the regex value differs from the annotated value.
# NOTE(review): hamming compares with `!=`, so rows that are NaN in both columns
# presumably count as mismatches -- confirm how blanks were annotated.
dist_no_value_lad, dist_with_value_lad = (
    hamming(sample['la_dimension_value'], sample['annot_la_dimension_value'])
    for sample in (no_value_lad, with_value_lad)
)

print(dist_no_value_lad, dist_with_value_lad)

#### Left atrial volume index

In [None]:
# Columns needed to evaluate the LA volume index regex against the pre-existing column.
# Copy explicitly so the .loc writes below modify this frame only and do not
# raise SettingWithCopyWarning on a column selection of `echo`.
evaluate_lav = echo[['echo_text', 'la_volume', 'la_volume_index_flag', 'la_volume_index_value']].copy()

# Binarise the pre-existing column: missing -> 0, anything present -> 1
f = evaluate_lav['la_volume'].isna()
evaluate_lav.loc[f, 'la_volume'] = 0
evaluate_lav.loc[~f, 'la_volume'] = 1

Flagging performance

In [None]:
# Pre-existing (text-matched) flag is the reference; the regex flag is the prediction
y_true = evaluate_lav['la_volume']
y_pred = evaluate_lav['la_volume_index_flag']

# Transpose and reverse both axes so rows are the regex result and columns the
# text-matched flag, each ordered (Yes, No) -- assuming 0/1 labels
cf_lav = confusion_matrix(y_true, y_pred).T[::-1, ::-1]

fig2, ax2 = plt.subplots(figsize=plots.stdfigsize(0, layout="single"))
sns.heatmap(cf_lav, annot=True, fmt='d', cbar=False, cmap='Blues', ax=ax2)
ax2.set_yticklabels(['Yes', 'No'], rotation=0)
ax2.set_xticklabels(['Yes', 'No'])
ax2.set(xlabel="Text-matched",
        ylabel="Regex-matched",
        title="Left atrial volume index")

fig2.tight_layout()
# plt.savefig(figure_path / 'SIfig7_la_vol_cf.png')
plt.show()

Value performance

In [None]:
# f = evaluate_lav['la_volume'] == 1
# g = evaluate_lav['la_volume_index_value'].isna()
# no_value_lav = evaluate_lav.loc[f & g].sample(frac=0.1)

In [None]:
# # Writing the ECHO texts to a txt file for ease of annotating
# no_value_lav = no_value_lav.reset_index().drop(columns=['index'])

# # Also, keeping the original file to not have to do double-work
# no_value_lav.to_csv(echo_validation / "annot_echos_lav_no_value.csv", index=False)

# for i in range(len(no_value_lav)):
#     with open(echo_validation / f"annot_echos_lav_no_value{i+1}.txt", "w") as f:
#         f.write(no_value_lav.loc[i, 'echo_text'])

In [None]:
# a = evaluate_lav['la_volume'] == 1
# b = evaluate_lav['la_volume_index_value'].notnull()
# with_value_lav = evaluate_lav.loc[a & b].sample(frac=0.1)

In [None]:
# with_value_lav = with_value_lav.reset_index().drop(columns=['index'])

# with_value_lav.to_csv(echo_validation / "annot_echos_lav_with_value.csv", index=False)

# for i in range(len(with_value_lav)):
#     with open(echo_validation / f"annot_echos_lav_with_value{i+1}.txt", "w") as f:
#         f.write(with_value_lav.loc[i, 'echo_text'])

In [None]:
# Load the LA volume index samples written out by the (now commented-out) cells above.
# The next cell expects an `annot_la_volume_index_value` column next to `la_volume_index_value`.
no_value_lav = pd.read_csv(echo_validation / "annot_echos_lav_no_value.csv")
with_value_lav = pd.read_csv(echo_validation / "annot_echos_lav_with_value.csv")

In [None]:
# Fraction of sampled reports where the regex value differs from the annotated value.
# NOTE(review): hamming compares with `!=`, so rows that are NaN in both columns
# presumably count as mismatches -- confirm how blanks were annotated.
dist_no_value_lav, dist_with_value_lav = (
    hamming(sample['la_volume_index_value'], sample['annot_la_volume_index_value'])
    for sample in (no_value_lav, with_value_lav)
)

print(dist_no_value_lav, dist_with_value_lav)

#### Left ventricular hypertrophy

In [None]:
# Columns needed to evaluate the LV hypertrophy regex against the pre-existing column.
# Copy explicitly so the .loc writes below modify this frame only and do not
# raise SettingWithCopyWarning on a column selection of `echo`.
evaluate_lv_hyper = echo[['echo_text', 'lv_hypertrophy', 'lv_hypertrophy_flag', 'lv_hypertrophy_value']].copy()

# Binarise the pre-existing column: missing -> 0, anything present -> 1
f = evaluate_lv_hyper['lv_hypertrophy'].isna()
evaluate_lv_hyper.loc[f, 'lv_hypertrophy'] = 0
evaluate_lv_hyper.loc[~f, 'lv_hypertrophy'] = 1

Flagging performance

In [None]:
# Pre-existing (text-matched) flag is the reference; the regex flag is the prediction
y_true = evaluate_lv_hyper['lv_hypertrophy']
y_pred = evaluate_lv_hyper['lv_hypertrophy_flag']

# Transpose and reverse both axes so rows are the regex result and columns the
# text-matched flag, each ordered (Yes, No) -- assuming 0/1 labels
cf_lv_hyper = confusion_matrix(y_true, y_pred).T[::-1, ::-1]

# Caption + count for every cell.
# NOTE(review): `labels` is built but never passed to the heatmap (annot=True draws
# the bare counts). Before using it as `annot=labels, fmt=''`, check that `strings`
# follows the flipped [[TP, FP], [FN, TN]] layout of the matrix.
labels = np.asarray(
    ["{0} {1:.0f}".format(caption, count)
     for caption, count in zip(strings.flatten(), cf_lv_hyper.flatten())]
).reshape(2, 2)

fig3, ax3 = plt.subplots(figsize=plots.stdfigsize(0, layout="single"))
sns.heatmap(cf_lv_hyper, annot=True, fmt='d', cbar=False, cmap='Blues', ax=ax3)
ax3.set_yticklabels(['Yes', 'No'], rotation=0)
ax3.set_xticklabels(['Yes', 'No'])
ax3.set(xlabel="Text-matched",
        ylabel="Regex-matched",
        title="Left ventricular hypertrophy")

fig3.tight_layout()
# plt.savefig(figure_path / 'SIfig7_lv_hyper_cf.png')
plt.show()

Inspection of the 13 supposed "false positives" shows that every one of these ECHO texts mentions "mild concentric lv hypertrophy" or "moderate concentric lv hypertrophy", so it is unclear why they weren't text-matched.

In [None]:
# p = evaluate_lv_hyper['lv_hypertrophy'] == 0
# o = evaluate_lv_hyper['lv_hypertrophy_flag'] == 1

# evaluate_lv_hyper.loc[p&o]

Value performance

In [None]:
# # Since the flag comparison is useless, will use the same approach as if no flag existed.
# evaluate_lv_hyper_sample = evaluate_lv_hyper.sample(frac=0.1)

In [None]:
# # Writing the ECHO texts to a txt file for ease of annotating
# evaluate_lv_hyper_sample = evaluate_lv_hyper_sample.reset_index().drop(columns=['index'])

# # Also, keeping the original file to not have to do double-work
# evaluate_lv_hyper_sample.to_csv(echo_validation / "annot_echos_lv_hyper.csv", index=False)

# for i in range(len(evaluate_lv_hyper_sample)):
#     with open(echo_validation / f"annot_echos_lv_hyper{i+1}.txt", "w") as f:
#         f.write(evaluate_lv_hyper_sample.loc[i, 'echo_text'])

In [None]:
# evaluate_lv_hyper_sample = pd.read_csv(echo_validation / "annot_echos_lv_hyper.csv")

In [None]:
# dist_evaluate_lv_hyper = hamming(evaluate_lv_hyper_sample['lv_hypertrophy_value'],
#                                  evaluate_lv_hyper_sample['annot_lv_hypertrophy_value'])

# print(dist_evaluate_lv_hyper)

#### Grade II or III diastolic dysfunction

While e_e could serve as a flag, the issue is that I wrote this regex to capture the value and the flag at once. So it can't be compared to e_e: e_e is a number whose value stratifies the degrees of diastolic dysfunction, whereas this column is probably just a flag indicating whether the number is in the report or not.

In [None]:
# evaluate_dd = echo[['echo_text','diastolic_dysfunction_value', 'diastolic_dysfunction_flag']].sample(frac=0.1)

In [None]:
# # Writing the ECHO texts to a txt file for ease of annotating
# evaluate_dd = evaluate_dd.reset_index().drop(columns=['index'])

# # Also, keeping the original file to not have to do double-work
# evaluate_dd.to_csv(echo_validation / "annot_echos_dd.csv", index=False)

# for i in range(len(evaluate_dd)):
#     with open(echo_validation / f"annot_echos_dd{i+1}.txt", "w") as f:
#         f.write(evaluate_dd.loc[i, 'echo_text'])

In [None]:
# Load the annotated diastolic-dysfunction sample; the next cell expects an
# `annot_diastolic_dysfunction_flag` column next to `diastolic_dysfunction_flag`.
evaluate_dd = pd.read_csv(echo_validation / "annot_echos_dd.csv")

In [None]:
# NOTE(review): despite the `dist_` prefix this holds 1 - Hamming distance, i.e. the
# fraction of reports where the regex flag and the annotation agree; the other
# dist_* values in this notebook are plain distances (fraction of disagreements).
regex_dd_flag = evaluate_dd['diastolic_dysfunction_flag']
annotated_dd_flag = evaluate_dd['annot_diastolic_dysfunction_flag']
dist_evaluate_dd = 1 - hamming(regex_dd_flag, annotated_dd_flag)

print(dist_evaluate_dd)

### Plots for paper

In [None]:
# Candidate label columns: skip the first three columns of `echo` (presumably
# identifiers/timestamp/text -- TODO confirm against echo_reports.csv) and drop every
# column whose name contains "value", leaving pre-existing labels and regex flags.
all_cols = [i for i in list(echo.columns)[3:] if "value" not in i]

In [None]:
# Split the label columns into those text-matched by EDW (`annotations`) and the
# regex flags produced in this notebook (`annotations_regex`)
annotations_regex = [column for column in all_cols if "flag" in column]
annotations = [column for column in all_cols if "flag" not in column]

# Pre-existing columns that are left out of the comparison
for excluded in ('la_enlargement', 'lvids', 'lvidd', 'bowing', 'e_e', 'lateral'):
    annotations.remove(excluded)

annotations.sort()

In [None]:
# Column sums, used as "number of matched ECHOs" in the bar plot below.
# NOTE(review): the sum is taken on `echo` itself, not on the binarised evaluate_*
# frames, so it only equals a count if these columns hold 1/NaN (or 0/1) values --
# confirm for the pre-existing columns such as la_diameter and la_volume.
counts = list(echo[annotations].sum())
counts_regex = list(echo[annotations_regex].sum())

In [None]:
# Display names, matched to the columns by position: `annotations_short` follows the
# sorted `annotations` list and `annotations_nice` follows `annotations_regex`.
annotations_nice = ['LV ejection fraction', 'CP Bypass',
                    'LA diameter', 'LA volume index',
                    'LV hypertrophy', 'Diastolic dysfunction']

annotations_short = ['LV ejection fraction', 'LA diameter',
                     'LA volume index', 'LV hypertrophy']

# One record per (factor, method) pair: text-matched counts first, then regex counts
agg = [
    {'factor': annotations_short[idx], 'counts': counts[idx], 'method': 'Text-match'}
    for idx in range(len(annotations))
]
agg += [
    {'factor': annotations_nice[idx], 'counts': counts_regex[idx], 'method': 'Regex-match'}
    for idx in range(len(annotations_regex))
]

for_plot = pd.DataFrame(agg).sort_values(by='counts', ascending=False)

In [None]:
# Horizontal bars: matched-report count per factor, split by matching method
fig4, ax4 = plt.subplots(figsize=plots.stdfigsize(0, layout="single"))
sns.barplot(data=for_plot, x='counts', y='factor', hue='method',
            errorbar=None, ax=ax4)

ax4.set(xlabel='Count of matched ECHOs', ylabel='')
# ax4.set_title(f"Total ECHO reports: {len(echo)}")
ax4.grid(axis='x', linestyle=':')
ax4.legend(title=None, loc='lower right', frameon=False)

fig4.tight_layout()
# plt.savefig(figure_path / 'SIfig7_echos.png')
plt.show()

In [None]:
# 2x2 grid of the four confusion matrices; only the bottom row keeps its x ticks/label
# and only the left column keeps its y ticks/label.
fig5, ax5 = plt.subplots(2, 2, figsize=plots.stdfigsize(49, n_rows=2, n_cols=2, layout="double"))

panels = {
    (0, 0): (cf_lvef, "LV ejection fraction"),
    (0, 1): (cf_lad, "LA diameter"),
    (1, 0): (cf_lav, "LA volume index"),
    (1, 1): (cf_lv_hyper, "LV hypertrophy"),
}

for (row, col), (matrix, panel_title) in panels.items():
    panel = ax5[row, col]
    sns.heatmap(matrix, fmt='d', annot=True, cmap='Blues', cbar=False, ax=panel)

    if row == 1:
        # bottom row: labelled x axis
        panel.set_xticklabels(['Yes', 'No'])
        panel.set_xlabel("Text-matched")
    else:
        panel.set_xticklabels([None, None])
        panel.set_xticks([])
        panel.set_xlabel(None)

    if col == 0:
        # left column: labelled y axis
        panel.set_yticklabels(['Yes', 'No'], rotation=0)
        panel.set_ylabel("Regex-matched")
    else:
        panel.set_yticklabels([None, None], rotation=0)
        panel.set_yticks([])
        panel.set_ylabel(None)

    panel.set_title(panel_title, fontweight='bold')

plt.tight_layout()
# fig5.savefig(figure_path / 'SIfig7.png')
plt.show()