# Review model results - Step 2 - Review one sample

# Setup

<div class="alert alert-block alert-warning">
    This notebook assumes
    <ul>
        <li><b>Terra</b> is running custom Docker image <kbd>gcr.io/uk-biobank-sek-data/ml4cvd_terra:20200601_163801</kbd>.</li>
        <li><b>ml4cvd</b> is running custom Docker image <kbd>gcr.io/broad-ml4cvd/deeplearning:tf2-latest-gpu</kbd>.</li>
    </ul>
</div>

<div class="alert alert-block alert-info">
<b>Tip for ml4cvd users:</b> All the functions that work with ECG or MRI data have an optional <kbd>folder</kbd> parameter, so you can direct those methods to read the contents of any mounted persistent disk.
</div>

In [None]:
import altair as alt  # Interactive data visualization for plots.
import ml4cvd.visualization_tools.annotations as annotations  # Custom methods for editing and submitting data annotations.
import ml4cvd.visualization_tools.dicom_interactive_plots as dicom_interactive_plots  # Custom methods to interactively visualize MRIs.
import ml4cvd.visualization_tools.dicom_plots as dicom_plots  # Custom methods to visualize cardiac MRIs.
import ml4cvd.visualization_tools.raw_ecg as raw_ecg  # Custom methods for reshaping this particular ECG data.
import ml4cvd.visualization_tools.static_plots as ecg_plots  # Custom methods for plotting this particular ECG data.
import numpy as np
import os
import pandas as pd
from plotnine import *  # Static data visualization. Provides a ggplot-like interface to matplotlib.
import re

In [None]:
# Render matplotlib-based figures inline in the notebook output.
%matplotlib inline

# Configure for large-data altair plotting.
# https://altair-viz.github.io/user_guide/faq.html#why-does-altair-lead-to-such-extremely-large-notebooks
alt.data_transformers.disable_max_rows()

# Load the BigQuery extension; it provides the %%bigquery cell magic used further down.
%load_ext google.cloud.bigquery

# NOTE(review): presumably GOOGLE_PROJECT is only set in the Terra environment -- confirm.
if 'GOOGLE_PROJECT' in os.environ:
  # If this is Terra, enable a few things not enabled by default.
  alt.renderers.enable('notebook')

# Obtain sample info and model results

<div class="alert alert-block alert-info">
    If you want to change the SQL below, you can view the available tables:
    <ul>
        <li><a href="https://storage.cloud.google.com/uk-biobank-sek-data-us-east1/ukb21481.html">phenotype descriptions</a></li>
        <li><a href="https://bigquery.cloud.google.com/table/uk-biobank-sek-data:raw_phenotypes.ukb9222_no_empty_strings_20181128">phenotype values</a></li>
        <li><a href="https://bigquery.cloud.google.com/dataset/uk-biobank-sek-data:a_ttl_one_week">available ML results</a></li>
    </ul>      
</div>

In [None]:
%%bigquery sample_info

---[ EDIT THIS QUERY IF YOU LIKE ]---
-- The query result is stored in the notebook variable `sample_info`.

SELECT
  sample_id,
  -- Decode the coded field into a label; any other code becomes 'Unknown'.
  CASE u31_0_0
    WHEN 0 THEN 'Female'
    WHEN 1 THEN 'Male'
    ELSE 'Unknown' END AS sex_at_birth,
  u34_0_0 AS year_of_birth,
  u52_0_0 AS month_of_birth,
  u21003_0_0 AS age_at_assessment,
  u21002_0_0 AS weight_kg,
  u50_0_0 AS height_cm,
  u21001_0_0 AS bmi,
  -- Decode the coded field into a label; codes not listed here yield NULL (there is no ELSE branch).
  CASE u1249_0_0
    WHEN 1 THEN 'Smoked on most or all days'
    WHEN 2 THEN 'Smoked occasionally'
    WHEN 3 THEN 'Just tried once or twice'
    WHEN 4 THEN 'I have never smoked'
    WHEN -3 THEN 'Prefer not to answer' END AS past_tobacco_smoking,
  -- Every column of the model results table except the join key, which is already selected above.
  ecg.* EXCEPT(sample_id)
FROM
  `uk-biobank-sek-data.raw_phenotypes.ukb9222_no_empty_strings_20181128`
-- Inner join: only samples present in both the phenotype table and the model results are returned.
INNER JOIN
  `uk-biobank-sek-data.ml_results.inference_ecg_rest_age_sex_autoencode_lvmass` AS ecg
ON
  eid = sample_id

In [None]:
# Show the dimensions of the query result.
sample_info.shape

To facilitate demos showing sample-level information, add sample information for a few fake samples.

In [None]:
# Add sample information for the fake samples.
# Each fake sample gets the median of every numeric column; columns that are not numeric
# are not filled in for the fake rows.
NUM_FAKE_SAMPLES = 4
FAKE_SAMPLE_PREFIX = 'fake_'

# Drop fake rows left over from a previous run of this cell so that re-running it
# does not add the same fake samples a second time.
sample_info = sample_info[~sample_info['sample_id'].astype(str).str.startswith(FAKE_SAMPLE_PREFIX)]

fake_sample_info = {
    col: [sample_info[col].median()] * NUM_FAKE_SAMPLES
    for col in sample_info.select_dtypes(include=np.number).columns
}
# Assigned after the medians so the fake ids replace any median computed for sample_id.
fake_sample_info['sample_id'] = [FAKE_SAMPLE_PREFIX + str(i) for i in range(1, NUM_FAKE_SAMPLES + 1)]

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat performs the same row-wise concatenation.
sample_info = pd.concat([sample_info, pd.DataFrame(fake_sample_info)], sort=False)
sample_info.shape

In [None]:
# Compute the deltas between actual values and predicted value columns.
# For each `<name>_actual` column that has a matching `<name>_prediction` column,
# add `<name>_delta` = actual - prediction.
# The pattern is a raw string: '\w' in a plain string literal is an invalid escape
# sequence, which Python warns about.
actual_regexp = re.compile(r'^(\w+)_actual$')
for actual_col in sample_info.columns:
  if actual_col.endswith('_actual'):
    prediction_col = actual_regexp.sub(r'\1_prediction', actual_col)
    # Actual columns without a matching prediction column are skipped.
    if prediction_col in sample_info.columns:
      delta_col = actual_regexp.sub(r'\1_delta', actual_col)
      print('Adding ' + delta_col)
      # Both operands are cast to float before subtracting.
      sample_info[delta_col] = (sample_info[actual_col].astype('float')
                                - sample_info[prediction_col].astype('float'))

sample_info.shape

# Review one sample

In [None]:
#---[ EDIT THIS VARIABLE VALUE IF YOU LIKE ]---
# Identifier of the sample to review. It is passed as sample_id to the ECG and MRI helpers
# below and is used to look the sample up in sample_info.
SAMPLE_TO_REVIEW = 1655349

In [None]:
# Reshape the resting ECG of the chosen sample; the result feeds the resting ECG plots below.
tidy_resting_ecg_signal = raw_ecg.reshape_resting_ecg_to_tidy(sample_id=SAMPLE_TO_REVIEW)

# Show the dimensions of the reshaped data.
tidy_resting_ecg_signal.shape

In [None]:
# Reshape the exercise ECG of the chosen sample; the helper returns a (trend, signal) pair.
exercise_ecg_trend, exercise_ecg_signal = raw_ecg.reshape_exercise_ecg_to_tidy(
    sample_id=SAMPLE_TO_REVIEW)

# Report the dimensions of each result, trend first.
for exercise_table in (exercise_ecg_trend, exercise_ecg_signal):
  print(exercise_table.shape)

In [None]:
# Limit to our sample_id of interest, handling both string-valued and numeric sample ids.
# Comparing the string forms matches 1655349, '1655349' and 'fake_1' alike, whatever dtype
# the sample_id column has. (Building a query string by concatenation failed for integer
# ids and treated ids such as fake_1 as column names.)
df = sample_info[sample_info['sample_id'].astype(str) == str(SAMPLE_TO_REVIEW)]
# Convert the suffixes to prefixes so that pandas.wide_to_long pivots on the correct portion of the column name.
# regex=True is spelled out because pandas 2.0 changed the default to literal matching.
df.columns = df.columns.str.replace(r'(.*)_(actual|delta|prediction)$', r'\2_value_\1', regex=True)
# Pivot the prediction columns for a more compact display.
# Every column that is not an actual/delta/prediction value is used as an identifier column.
# The group is non-capturing because str.contains warns when a pattern has capture groups.
id_columns = df.columns[~df.columns.str.contains(r'(?:actual|delta|prediction)')].tolist()
df = pd.wide_to_long(df, stubnames=['actual_value', 'prediction_value', 'delta_value'],
                     i=id_columns, j='value', sep='_', suffix='.*')
df

## Retrieve static visualization of the resting ECG

In [None]:
# Display the resting ECG visualization for the chosen sample.
ecg_plots.display_resting_ecg(sample_id=SAMPLE_TO_REVIEW)

## Render alternate static visualization of the resting ECG

In [None]:
# Redisplay the sample's actual/prediction/delta table for reference next to the plot below.
df

In [None]:
# Static resting ECG plot: one panel per lead, where each group of three leads shows its
# own 2.5-wide window of ts_reference.
p = (ggplot(tidy_resting_ecg_signal.query(
    "(lead in ['strip_I','strip_II', 'strip_III'] & ts_reference < 2.5) " +
    "| (lead in ['strip_aVR','strip_aVL', 'strip_aVF'] & ts_reference >= 2.5 & ts_reference < 5)" +
    "| (lead in ['strip_V1','strip_V2', 'strip_V3'] & ts_reference >= 5 & ts_reference < 7.5)" +
    # Inclusive lower bound, like the other windows, so samples at exactly 7.5 are not dropped.
    "| (lead in ['strip_V4','strip_V5', 'strip_V6'] & ts_reference >= 7.5)"),
        aes(x = 'ts_reference', y = 'signal_mV', color = 'filtering')) +
     geom_line() +
     # Free x scales let every panel show only its own time window.
     facet_wrap('~ lead', ncol = 4, nrow = 3, scales = 'free_x') +
     scale_x_continuous(name = 'Elapsed time', breaks = ecg_plots.major_breaks_x_resting_ecg, minor_breaks = 5) +
     scale_y_continuous(name = 'Signal mV', limits = [-1.5, 1.5], breaks = np.arange(-1.5, 2.0, 0.5), minor_breaks = 5) +
     ggtitle('Resting ECG for {}'.format(SAMPLE_TO_REVIEW)) +
     theme(axis_text_x = element_text(angle = 50, hjust = 1), legend_position = 'top', figure_size = (18, 12), panel_spacing = .3))

p

## Resting ECG interactive visualization

In [None]:
# Save the rows whose filtering value is 'raw_mV' as JSON records.
# The interactive chart below reads its data from this file.
DATA_FILE = 'tidy_ecg.json'
raw_signal_rows = tidy_resting_ecg_signal.query("filtering in ['raw_mV']")
raw_signal_rows.to_json(DATA_FILE, orient='records')

In [None]:
# Define the plot components.
# Interval selection along the x axis; it drives the x scale domain of the chart below.
brush = alt.selection(type='interval', encodings=['x'])

# Dropdown offering every lead present in the data; the first lead is selected initially.
lead_dropdown = alt.binding_select(options=list(tidy_resting_ecg_signal.lead.unique()))
lead_select = alt.selection_single(fields=['lead'], bind=lead_dropdown, name='Choose just one to view',
                                   init={'lead': tidy_resting_ecg_signal.lead.unique()[0]})

# Main chart: signal_mV over ts_reference, read from DATA_FILE and filtered to the selected lead.
upper = alt.Chart(DATA_FILE).mark_line().encode(
    alt.X('ts_reference:Q', scale=alt.Scale(domain=brush)),
    y='signal_mV:Q',
    color=alt.Color('lead:N', legend=alt.Legend(orient='top'),
                    title='Lead(s) currently displayed')
).properties(width=900, height=250, title='Resting ECG for {}'.format(SAMPLE_TO_REVIEW)
).add_selection(lead_select
).transform_filter(lead_select)

# Short overview strip on which the time interval is brushed.
# NOTE(review): lower is derived from upper, so it already carries upper's brush-driven x scale,
# lead_select selection and lead filter before they are added again here -- confirm this is intended.
lower = upper.properties(height=50, title='Brush over this subplot to select a time interval.'
).add_selection(brush
).add_selection(lead_select
).transform_filter(lead_select)

# TODO: Apply a Calculate Transform for alternate signal filtering https://altair-viz.github.io/user_guide/transform.html#calculate-transform

<div class="alert alert-block alert-info">
<b>Tip:</b> Copy, paste, and run the 'display plot components' cell however many times you like to view several leads simultaneously.
</div>

In [None]:
# Redisplay the sample's actual/prediction/delta table for reference next to the chart below.
df

In [None]:
## Display plot components.
# `&` concatenates the two charts vertically, upper first.
upper & lower

## Exercise ECG interactive visualization

In [None]:
# Save both exercise ECG results as JSON records; the charts below read them by file name.
TREND_DATA_FILE = 'tidy_exercise_ecg_trend.json'
SIGNAL_DATA_FILE = 'tidy_exercise_ecg_signal.json'
for json_path, exercise_table in ((TREND_DATA_FILE, exercise_ecg_trend),
                                  (SIGNAL_DATA_FILE, exercise_ecg_signal)):
  exercise_table.to_json(json_path, orient='records')

In [None]:
# Define the plot components.
# Single-point selection on the nearest `time` value, initialised to time 200.0.
# NOTE(review): the selection fires on mouseover, while the chart title below says "Click" -- confirm which is intended.
# This rebinds the brush, lead_dropdown and lead_select names used by the resting ECG chart above.
brush = alt.selection_single(on='mouseover', nearest=True, fields=['time'], init={'time': 200.0})

# Dropdown offering every lead present in the exercise signal; the first lead is selected initially.
lead_dropdown = alt.binding_select(options=list(exercise_ecg_signal.lead.unique()))
lead_select = alt.selection_single(fields=['lead'], bind=lead_dropdown, name='Choose just one to view',
                                   init={'lead': exercise_ecg_signal.lead.unique()[0]})

# Trend points over time, coloured by phase name. No y encoding is set here;
# the display cell supplies one per chart.
trend = alt.Chart(TREND_DATA_FILE).mark_point(opacity=0.8, filled=True, size=100).encode(
    x='time:Q',
    color=alt.Color('phasename:N', legend=alt.Legend(orient='top'), title='Phase names'),
    tooltip=['artifact:Q', 'grade:Q', 'heartrate:Q', 'load:Q', 'mets:Q', 'pacecount:Q',
             'phasename:N', 'phasetime:Q', 'time:Q', 'vecount:Q']
).properties(width=900, height=200, title='Click on a point to select a 10 second time interval.'
).add_selection(brush)

# Raw signal for the selected lead, limited to the window around the selected time.
signal = alt.Chart(SIGNAL_DATA_FILE).mark_line().encode(
    alt.X('time:Q', axis=alt.Axis(labelAngle=15)),
    y='raw_mV:Q',
    color=alt.Color('lead:N', legend=alt.Legend(orient='top'), title='Lead names'),
).properties(width=900, height=200, title='Exercise ECG signal for {}'.format(SAMPLE_TO_REVIEW)
).add_selection(lead_select
).transform_filter(lead_select
).transform_filter(
    # Keep only rows whose time lies strictly within 5.0 of the selected time.
    # https://github.com/altair-viz/altair/issues/1960
    '((toNumber({}.time) - 5.0) < datum.time) && (datum.time < toNumber({}.time) + 5.0)'.format(brush.name, brush.name)
)

In [None]:
# Redisplay the sample's actual/prediction/delta table for reference next to the chart below.
df

In [None]:
## Display plot components.
# Stack three charts vertically: the trend chart with heartrate on y, the trend chart with load on y, then the signal.
trend.encode(y='heartrate:Q') & trend.encode(y='load:Q') & signal

## Cardiac MRI visualization

In [None]:
# Redisplay the sample's actual/prediction/delta table for reference next to the MRI viewer below.
df

In [None]:
# Open the cardiac MRI visualization for the chosen sample.
dicom_plots.choose_cardiac_mri(sample_id=SAMPLE_TO_REVIEW)

## Brain and other MRIs interactive visualization

<div class="alert alert-block alert-info">
<b>Tip:</b> Click on the image instance slider and then use the left/right arrows to animate the image.
</div>

In [None]:
# Redisplay the sample's actual/prediction/delta table for reference next to the MRI viewer below.
df

In [None]:
# Open the interactive MRI visualization for the chosen sample.
dicom_interactive_plots.choose_mri(sample_id=SAMPLE_TO_REVIEW)

## Annotate the sample

Run the following cell, then select the relevant key on which you wish to comment, and enter your comment in the text box. Click the button to submit your annotation.

In [None]:
# Show the annotation form for the chosen sample, passing the full sample_info table.
annotations.display_annotation_collector(sample_info, SAMPLE_TO_REVIEW)

Let's take a look at the most recently added annotations.

In [None]:
# Request the 10 most recent annotation submissions.
annotations.view_submissions(count=10)

# Provenance

In [None]:
# Record when this notebook was run (local time, no timezone attached).
import datetime
print(datetime.datetime.now())

In [None]:
%%bash
# List the installed Python packages and their versions.
pip3 freeze

Questions about these particular notebooks? Reach out to Puneet Batra pbatra@broadinstitute.org, Paolo Di Achille pdiachil@broadinstitute.org, and Nicole Deflaux deflaux@verily.com.