### Select patients for CFR model: Combine file names, CFR measurements and views ###

In [1]:
import os
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

from bokeh.io import output_notebook, reset_output, show, output_file, save
from bokeh.plotting import figure
from bokeh.layouts import column, row, gridplot
from bokeh.models import ColumnDataSource, HoverTool, Legend

from bokeh.palettes import Category10

In [2]:
# Root folder of the CFR data tree; every path in this notebook is built from it
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')

# Study-level table holding the CFR measurement matched to each echo study
study_cfr_file = os.path.join(
    cfr_data_root,
    'metadata_200202',
    '210_getStressTest_match365_study_BWH_200202.parquet',
)

study_cfr_df = pd.read_parquet(study_cfr_file)
# Preview the first rows
study_cfr_df.head()

Unnamed: 0,mrn,study,echo_study_date,reportID,cfr_days_after_echo,subjectid,report_number,cfr_study_date,cfr_report_date,cfr
0,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,-64,121,EVS0228610,2010-01-28,2010-01-26,1.133683
1,9241,4b7b463ae45ad0e0_4903a585842f436837ee2e0d9dd5,2010-01-26,100820,2,121,EVS0228610,2010-01-28,2010-01-26,1.133683
2,59691,4b7b463fca25269d_4903a580509f97a83ea0d31ca15c,2015-08-31,141524,17,5313,E1238046,2015-09-17,2015-08-31,2.277429
3,59691,4b7b463fca25269d_4903a5864e60c69dea709396260d,2013-06-03,127002,121,5313,EVS0419689,2013-10-02,2013-06-03,2.012465
5,92742,4b7b4633cc96d8c2_4903a5858427dadde17abcd03be6,2010-09-27,107029,63,1646,EVS0276708,2010-11-29,2010-11-26,1.097118


In [3]:
# Per-file meta data for all echo videos, including the view classification scores
echo_meta_file = os.path.join(cfr_data_root, 'metadata_200202', 'echo_BWH_meta_200202.parquet')

# Read and cast in one step; mrn is forced to int64
# (presumably to match the mrn dtype of the study table used as merge key later -- confirm)
echo_meta_df = pd.read_parquet(echo_meta_file).astype({'mrn': 'int64'})
echo_meta_df.head()

Unnamed: 0,filename,dir,study,mrn,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,a2c,a2c_laocc,a2c_lvocc_s,a3c,a3c_laocc,a3c_lvocc_s,a4c,a4c_far,a4c_laocc,a4c_lvocc_s,a4c_rv,a4c_rv_laocc,a5c,apex,other,plax_far,plax_lac,plax_laz,plax_laz_ao,plax_plax,psax_avz,psax_az,psax_mv,psax_pap,rvinf,subcostal,suprasternal
0,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09010a...,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a,35154368,2017-06-29 13:59:05,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a_...,BWH,iE33,Philips Medical Systems,0.0,59.4,41.0,75.0,0.032639,0.032639,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09010a...,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a,35154368,2017-06-29 13:59:05,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a_...,BWH,iE33,Philips Medical Systems,0.0,50.767,44.0,73.0,0.039452,0.039452,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09010a...,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a,35154368,2017-06-29 13:59:05,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a_...,BWH,iE33,Philips Medical Systems,0.0,33.333,75.0,74.0,0.021885,0.021885,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09010a...,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a,35154368,2017-06-29 13:59:05,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a_...,BWH,iE33,Philips Medical Systems,0.0,53.064,48.0,86.0,0.019907,0.019907,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a_...,/mnt/obi0/phi/echo/npyFiles/BWH/48b0/48b09010a...,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a,35154368,2017-06-29 13:59:05,48b09010a09ba991_4903a582ec77f1640cdeecd8cb4a_...,BWH,iE33,Philips Medical Systems,0.0,33.333,72.0,74.0,0.020193,0.020193,2.521683e-10,0.001678,7.510354e-07,0.000747,8.37475e-08,0.016469,6e-06,5.6e-05,0.000254,0.135937,0.000173,0.001442,0.092005,1e-06,2.7e-05,8.538127e-08,1.3e-05,7.100429e-07,0.023798,9e-06,0.117547,0.609544,3e-06,0.000289,7.983381e-08,1.483031e-07,1.847825e-07


In [4]:
# (message template, column) pairs for the unique-count report, printed in this order
count_report = [('Total number of patients in meta data {}', 'mrn'),
                ('Total number of studies {}', 'study'),
                ('Total number of files in meta data {}', 'filename')]

for template, col in count_report:
    print(template.format(len(echo_meta_df[col].unique())))

# A row counts as "classified" when its a4c score is present
has_views = echo_meta_df.a4c.notnull()

# Number of studies that own at least one row without a view classification
temp = echo_meta_df.loc[~has_views].reset_index(drop = True)
print()
print('Studies without view classification {}'.format(len(temp.study.unique())))

# Remove rows without view classification
echo_meta_all_views = echo_meta_df.loc[has_views].reset_index(drop = True)
print()
print('After removal of rows without view clasification:')
for template, col in count_report:
    print(template.format(len(echo_meta_all_views[col].unique())))

Total number of patients in meta data 11994
Total number of studies 20515
Total number of files in meta data 1028651

Studies without view classification 54

After removal of rows without view clasification:
Total number of patients in meta data 11994
Total number of studies 20515
Total number of files in meta data 1027045


In [5]:
# Map each combined view column to the fine-grained classifier columns it absorbs
view_dict = {'view_a2c': ['a2c', 'a2c_laocc', 'a2c_lvocc_s'],
             'view_a3c': ['a3c', 'a3c_laocc', 'a3c_lvocc_s'],
             'view_a4c': ['a4c', 'a4c_far', 'a4c_laocc', 'a4c_lvocc_s', 'a4c_rv', 'a4c_rv_laocc'],
             'view_plax': ['plax_far', 'plax_lac', 'plax_laz', 'plax_laz_ao', 'plax_plax'],
             'view_psax': ['psax_avz', 'psax_az', 'psax_mv', 'psax_pap'],
             'view_other': ['other', 'a5c', 'apex', 'rvinf', 'subcostal', 'suprasternal']}

# Collapse the fine-grained score columns into one summed column per view,
# dropping the original columns once they have been absorbed
echo_meta_sum_views = echo_meta_all_views.copy()
for view, cols in view_dict.items():
    echo_meta_sum_views[view] = echo_meta_sum_views[cols].sum(axis = 1)
    echo_meta_sum_views = echo_meta_sum_views.drop(columns = cols)

# The combined score columns, one per view
view_scores = echo_meta_sum_views[list(view_dict)]

# sum_views: row-wise total of the combined scores (kept as a consistency check)
# max_view:  name of the combined view column with the highest score in each row
echo_meta_sum_views = echo_meta_sum_views.assign(sum_views = view_scores.sum(axis = 1),
                                                 max_view = view_scores.idxmax(axis = 1))

In [6]:
# Left join with the PET CFR study table on the left: every CFR study row is kept and
# the matching echo file rows (same mrn and study) are attached to it, so echo files
# from studies without a CFR value do not appear in the result.
files_cfr = pd.merge(left = study_cfr_df,
                     right = echo_meta_sum_views,
                     how = 'left',
                     on = ['mrn', 'study'])
files_cfr.head()

Unnamed: 0,mrn,study,echo_study_date,reportID,cfr_days_after_echo,subjectid,report_number,cfr_study_date,cfr_report_date,cfr,filename,dir,datetime,fileid,institution,model,manufacturer,index,frame_time,number_of_frames,heart_rate,deltaX,deltaY,view_a2c,view_a3c,view_a4c,view_plax,view_psax,view_other,sum_views,max_view
0,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,-64,121,EVS0228610,2010-01-28,2010-01-26,1.133683,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b463ae...,2010-04-02 07:53:16,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.089256,56.0,85.0,0.027944,0.027944,0.9762369,0.0002330583,0.003340996,0.0001106907,0.0007710441,0.01930706,1.0,view_a2c
1,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,-64,121,EVS0228610,2010-01-28,2010-01-26,1.133683,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b463ae...,2010-04-02 07:53:16,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.064101,53.0,88.0,0.017671,0.017671,1.080687e-06,2.221534e-07,1.34489e-06,0.9993535,6.972564e-06,0.0006367801,1.0,view_plax
2,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,-64,121,EVS0228610,2010-01-28,2010-01-26,1.133683,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b463ae...,2010-04-02 07:53:16,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,48.333332,46.0,107.0,0.076726,0.076726,2.892548e-11,2.382341e-09,3.241086e-09,1.0,2.436432e-10,4.737633e-10,1.0,view_plax
3,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,-64,121,EVS0228610,2010-01-28,2010-01-26,1.133683,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b463ae...,2010-04-02 07:53:16,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.450451,38.0,84.0,0.043312,0.043312,3.644072e-10,1.350347e-10,1.540048e-07,1.887208e-10,1.837416e-07,0.9999996,1.0,view_other
4,9241,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e,2010-04-02,100820,-64,121,EVS0228610,2010-01-28,2010-01-26,1.133683,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b7b/4b7b463ae...,2010-04-02 07:53:16,4b7b463ae45ad0e0_4903a585842a266f561aee3f563e_...,BWH,Vivid7,GE Vingmed Ultrasound,0.0,40.119048,57.0,85.0,0.020879,0.020879,1.801199e-13,9.366453e-13,6.398218e-12,3.151624e-12,6.481914e-12,1.0,1.0,view_other


In [7]:
# Let's see what we have: for every view, count the patients, echo studies and
# distinct CFR values among the files whose top-scoring view is that view.
# The rows are collected in a list and turned into a DataFrame once, instead of
# growing the frame with pd.concat inside the loop.
stat_columns = ['view', 'patients', 'echo studies', 'unique CFR values']
stat_records = []
for view in view_dict.keys():
    view_files = files_cfr[files_cfr.max_view == view]

    stat_records.append({'view': view,
                         'patients': len(view_files.mrn.unique()),
                         'echo studies': len(view_files.study.unique()),
                         'unique CFR values': len(view_files.cfr.unique())})

df_stat = pd.DataFrame(stat_records, columns = stat_columns)

In [8]:
# Display the per-view summary table built in the previous cell
df_stat

Unnamed: 0,view,patients,echo studies,unique CFR values
0,view_a2c,1627,2473,1759
1,view_a3c,1513,2266,1631
2,view_a4c,1724,2696,1860
3,view_plax,1696,2693,1839
4,view_psax,1728,2744,1868
5,view_other,1786,2904,1932


In [9]:
# Persist the file-level table (CFR studies joined with echo files and view columns).
# NOTE(review): the inputs above are read from 'metadata_200202', while this writes to
# 'metadata_200131' with a '_200131' file name -- confirm that this target is intended.
match_view_filename = '210_getStressTest_match365_files_BWH_200131.parquet'
match_view_file = os.path.join(cfr_data_root, 'metadata_200131', match_view_filename)
files_cfr.to_parquet(match_view_file)

In [58]:
# For the paper: Make a graph with the days of echo before or after CFR
def style(p):
    """Apply the common text styling to a plot and hand the same object back.

    The title is centred, and the title, axis labels and tick labels are all
    set to 11pt; axis labels are additionally made bold.

    Args:
        p: plot object exposing `title`, `xaxis` and `yaxis` (the figure built in make_plot).

    Returns:
        The same object `p`, modified in place.
    """
    font_size = '11pt'

    # Title
    p.title.align = 'center'
    p.title.text_font_size = font_size

    # Axis titles and tick labels get identical settings on both axes
    for axis in (p.xaxis, p.yaxis):
        axis.axis_label_text_font_size = font_size
        axis.axis_label_text_font_style = 'bold'
        axis.major_label_text_font_size = font_size

    return p

def make_dataset(df = files_cfr, range_start = -200, range_end = 200, bin_width = 1):
    """Build per-view histograms of `cfr_days_after_echo`, counted per study.

    For every view in `view_dict`, the rows whose `max_view` equals that view are
    reduced to unique (study, cfr_days_after_echo) pairs and binned.

    Args:
        df: table with columns `max_view`, `study` and `cfr_days_after_echo`.
            The default is the `files_cfr` object that existed when this
            function was defined.
        range_start: left edge of the histogram range.
        range_end: right edge of the histogram range.
        bin_width: bin width; the number of bins is
            int((range_end - range_start)/bin_width).

    Returns:
        DataFrame sorted by view and left edge, with one row per (view, bin) and
        the columns studies (count), freq (count divided by the number of studies
        inside the range), left, right (bin edges), f_freq (freq formatted with
        five decimals), view and color (one Category10 colour per view).
    """
    # Loop-invariant: the same binning is used for every view
    n_bins = int((range_end - range_start)/bin_width)

    arr_df_list = []

    for view_idx, view in enumerate(view_dict.keys()):
        subset = df[df.max_view == view]
        # Count on the study level, not per video: one row per (study, delay) pair
        subset = subset[['study', 'cfr_days_after_echo']].drop_duplicates().reset_index(drop = True)

        # NOTE: np.histogram closes the last bin on the right, so values equal to
        # range_end are counted in the final bin together with that bin's other values.
        days_hist, edges = np.histogram(subset.cfr_days_after_echo,
                                        bins = n_bins,
                                        range = [range_start, range_end])

        # Guard against a view without any study inside the range: report
        # zero proportions instead of dividing by zero (NaN + RuntimeWarning).
        n_in_range = days_hist.sum()
        if n_in_range > 0:
            freq = days_hist/n_in_range
        else:
            freq = np.zeros(len(days_hist))

        arr_df = pd.DataFrame({'studies': days_hist,
                               'freq': freq,
                               'left': edges[:-1], 'right': edges[1:]})

        # Pre-formatted proportion for display
        arr_df['f_freq'] = ['%0.5f' % p for p in arr_df.freq]

        arr_df['view'] = view

        arr_df['color'] = Category10[10][view_idx]

        arr_df_list.append(arr_df)

    hist_df = pd.concat(arr_df_list, ignore_index=True, axis = 0)
    hist_df = hist_df.sort_values(['view', 'left'])

    return hist_df

def make_plot(df):
    """Draw one histogram per view from the table produced by make_dataset.

    Each view becomes a set of quads with a legend entry; only 'view_a4c' is
    visible initially and the other views can be toggled through the legend.

    Args:
        df: table with the columns view, studies, left, right, color and f_freq.

    Returns:
        The styled figure.
    """
    p = figure(title = 'CFR - ECHO Time Delay',
               x_axis_label = 'Time delay for cfr measurment after echo acquisition [days]',
               y_axis_label = 'Echo studies')

    hist_dict = {}
    for view in df.view.unique():

        df_view = df[df.view == view]
        datasource = ColumnDataSource(df_view)
        # Legend shows the short view name, e.g. 'a4c' for 'view_a4c'
        label = view.split('_')[-1]
        hist_dict[view] = p.quad(source = datasource, bottom = 0, top = 'studies',
                                 left = 'left', right = 'right',
                                 color = 'color', fill_alpha = 0.7,
                                 hover_fill_color = 'color', hover_fill_alpha = 1.0,
                                 legend_label = label, line_color = 'black',
                                 name = view, muted_alpha = 0.2)

        # Start with only the a4c histogram shown
        hist_dict[view].visible = bool(view == 'view_a4c')

    # Attach the hover tool to the histogram renderers directly. The `names`
    # argument used previously is no longer accepted by recent Bokeh releases;
    # `renderers` selects the same glyphs.
    hover = HoverTool(tooltips = [('view', '@view'),
                                  ('proportion', '@f_freq')],
                      mode = 'vline',
                      renderers = list(hist_dict.values()))

    p.add_tools(hover)

    p.legend.location = 'top_right'
    p.legend.title = 'View: click to hide'
    p.legend.click_policy = 'hide'
    p = style(p)

    return p

In [59]:
# Histogram of the CFR-after-echo delay restricted to -30..30 days with 1-day bins
hist_df = make_dataset(df = files_cfr, range_start = -30, range_end = 30, bin_width = 1)
hist_plot = make_plot(hist_df)
# Clear any previously configured Bokeh output target, then render inline in the notebook
reset_output()
output_notebook()
show(hist_plot)

In [60]:
# Preview the histogram table (one row per view and bin)
hist_df.head()

Unnamed: 0,studies,freq,left,right,f_freq,view,color
0,3,0.002481,-30.0,-29.0,0.00248,view_a2c,#1f77b4
1,5,0.004136,-29.0,-28.0,0.00414,view_a2c,#1f77b4
2,4,0.003309,-28.0,-27.0,0.00331,view_a2c,#1f77b4
3,2,0.001654,-27.0,-26.0,0.00165,view_a2c,#1f77b4
4,1,0.000827,-26.0,-25.0,0.00083,view_a2c,#1f77b4


In [57]:
# Switch the Bokeh output target to a standalone HTML file and write the figure there.
# NOTE(review): this cell's execution count (57) is lower than that of the cells that
# define and build hist_plot (58, 59) -- confirm the notebook runs top to bottom on a fresh kernel.
reset_output()
output_file(os.path.join(cfr_data_root, 'cfr_echo_time.html'), title = 'cfr_echo_time_delay')
save(hist_plot)

'/mnt/obi0/andreas/data/cfr/cfr_echo_time.html'