In [None]:
# IO
import glob
from pathlib import Path
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

import re

# Utility Libraries
import math
from datetime import datetime
import re
import csv
import itertools
import inflection

# Data Processing
import pandas as pd
import numpy as np
from bcpn_pipeline import data, features, models, consts
import shap

# Viz
%matplotlib inline
import matplotlib as mpl
from matplotlib.dates import DateFormatter
from matplotlib.cbook import boxplot_stats
import matplotlib.dates as mdates
import matplotlib.transforms as mtrans
import seaborn as sns
sns.set_style("whitegrid")

import matplotlib.pyplot as plt
plt.rcParams.update(
    {'figure.autolayout': True, 
    }
)
# plt.rcParams.update({'figure.facecolor': [1.0, 1.0, 1.0, 1.0]})

# configure autoreloading of modules
%load_ext autoreload
%autoreload 2


# Load Results

In [None]:
# Read every prediction CSV and split the frames into two groups:
# files whose stem contains 'final' versus all remaining files.
run_frames, final_frames = [], []
for csv_path in consts.OUTPUT_PATH_PRED.glob('*_pred.csv'):
    frame = pd.read_csv(csv_path)
    target = final_frames if 'final' in csv_path.stem else run_frames
    target.append(frame)

# Stack each group into one frame and discard the index column written by to_csv.
pred_res = (
    pd.concat(run_frames, axis=0)
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'])
)
display(pred_res.head())

pred_res_final = (
    pd.concat(final_frames, axis=0)
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'])
)
display(pred_res_final.head())

In [None]:
# Read every AUC CSV; the third underscore-separated token of the file stem
# is stored as the 'method' column (inserted at column position 4).
auc_frames = []
for auc_path in consts.OUTPUT_PATH_PRED.glob('*_auc.csv'):
    print(auc_path)
    frame = pd.read_csv(auc_path)
    method_name = auc_path.stem.split('_')[2]
    frame.insert(4, 'method', method_name)
    auc_frames.append(frame)

# Stack all files and discard the index column written by to_csv.
auc_res = (
    pd.concat(auc_frames, axis=0)
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'])
)
display(auc_res.head())

In [None]:
# Load the ROC curve data of the final classifiers (files whose stem contains 'final').
# The file stem encodes the metadata: the first two underscore-separated tokens
# form the featureset name and the third token is the method.
roc_frames = []
# sorted(): glob yields files in arbitrary, filesystem-dependent order. That order
# determines the row order here and, downstream, the order of `featuresets`
# (i.e. which featureset is drawn in which ROC panel), so make it deterministic.
for roc_path in sorted(consts.OUTPUT_PATH_PRED.glob('*_roc.csv')):
    if 'final' not in roc_path.stem:
        continue
    stem_parts = roc_path.stem.split('_')
    frame = pd.read_csv(roc_path)
    frame.insert(0, 'featureset', '_'.join(stem_parts[:2]))
    frame.insert(0, 'method', stem_parts[2])
    roc_frames.append(frame)

# Stack all files and discard the index column written by to_csv.
roc_res_final = pd.concat(roc_frames, axis=0).reset_index(drop=True)
roc_res_final = roc_res_final.drop(columns=['Unnamed: 0'])
roc_res_final

In [None]:
# Label the AUC and ROC results as test-set results; 'type' is one of the
# keys used to merge auc_res into the aggregated prediction metrics.
auc_res['type'] = 'test'
# `roc_res` is never defined in this notebook (NameError on a fresh kernel);
# the ROC frame that was actually loaded is `roc_res_final`.
roc_res_final['type'] = 'test'

# Get aggregate results (including mean, std, and variance) across runs, by featureset and method

In [None]:

# Aggregate the per-run prediction metrics (mean, std, var) for every
# featureset / method / feature-selection / tuning / type combination,
# then attach the AUC statistics and save the table.
groupby_cols = ['featureset', 'method', 'features_selected', 'tuned', 'type']
metrics = ['accuracy', 'precision', 'sensitivity', 'specificity', 'f1_score']
agg_funcs = ['mean', 'std', 'var']

# Scale to percentages on a copy: scaling pred_res in place would multiply the
# metrics by 100 again every time this cell is re-run. f1_score is left unscaled.
pred_res_pct = pred_res.copy()
pct_metrics = [metric for metric in metrics if metric != 'f1_score']
pred_res_pct[pct_metrics] = pred_res_pct[pct_metrics] * 100

# One groupby for all metrics; the resulting (metric, func) column pairs are
# flattened to '<metric>_<func>', e.g. 'accuracy_mean'.
pred_res_agg = pred_res_pct.groupby(groupby_cols)[metrics].agg(agg_funcs)
pred_res_agg.columns = [f'{metric}_{func}' for metric, func in pred_res_agg.columns]
pred_res_agg = pred_res_agg.reset_index()

# Outer merge keeps groups that have no AUC entry; their missing values become -1.
pred_res_agg = pred_res_agg.merge(
    auc_res[['auc_mean', 'auc_std'] + groupby_cols],
    on=groupby_cols,
    how='outer'
)
pred_res_agg = pred_res_agg.fillna(-1)
pred_res_agg.to_csv(Path.joinpath(consts.OUTPUT_PATH_PRED, 'pred_agg.csv'))
pred_res_agg

In [None]:
# Format column stats as "mean +- std" strings, one column per metric.
# Note that train AUC was not obtained, so it will be -1 +- (-1), as expected after filling nans

# Raw string: '\p' is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning. The resulting text is unchanged.
mean_std_fmt = r'%0.3f $\pm$ %0.3f'

# Build the condensed table column by column from the '<metric>_mean' and
# '<metric>_std' columns instead of renaming in place and applying row-wise.
pred_res_condensed = pred_res_agg[groupby_cols].copy()
for metric in metrics + ['auc']:
    pred_res_condensed[metric] = [
        mean_std_fmt % (mean, std)
        for mean, std in zip(pred_res_agg[f'{metric}_mean'], pred_res_agg[f'{metric}_std'])
    ]

# pred_res_agg is rebound to the condensed table (same final name as before), so
# the aggregation cell must be re-run before this cell can be executed again.
pred_res_agg = pred_res_condensed
pred_res_agg.to_csv(Path.joinpath(consts.OUTPUT_PATH_PRED, 'pred_agg_condensed.csv'))
pred_res_agg

# Get Results for Final Classifier

In [None]:
# Save the final-classifier results exactly as loaded.
# Disabled step (kept for reference): restrict to test rows and add the AUROC column.
# pred_res_final = pred_res_final[pred_res_final['type']=='test']
# pred_res_final = pred_res_final.merge(roc_res_final[['method', 'featureset', 'auc']], on=['method', 'featureset'], how='left').drop_duplicates().reset_index(drop=True)
final_csv_path = consts.OUTPUT_PATH_PRED / 'pred_res_final.csv'
pred_res_final.to_csv(final_csv_path)
pred_res_final

In [None]:
# Show the test-set rows of the final classifiers (with their params) twice:
# ranked within each featureset by specificity, then by AUC, best first.
final_test_res = pred_res_final[pred_res_final['type'] == 'test']
for ranking_metric in ('specificity', 'auc'):
    display(
        final_test_res.sort_values(
            by=['featureset', ranking_metric],
            ascending=[True, False]
        )
    )

In [None]:
# Inspect the combined ROC data of the final classifiers.
roc_res_final

In [None]:
# Build the ROC legend text for every row: "<method> (AUC = <auc to 3 decimals>)".
roc_res_final['legend_label'] = [
    '%s (AUC = %0.3f)' % (method_name, auc_value)
    for method_name, auc_value in zip(roc_res_final['method'], roc_res_final['auc'])
]
roc_res_final

# ROC Curves

In [None]:
# Distinct featuresets and methods, in order of first appearance in the ROC results.
featuresets = list(roc_res_final['featureset'].unique())
methods = list(roc_res_final['method'].unique())

# Prediction horizons, checked in this order; the first keyword contained in
# the featureset name wins (no match leaves the horizon empty).
horizon_labels = (('day', 'Day'), ('week', 'Week'), ('month', 'Month'))

# Human-readable title per featureset.
fs_titles = {}
for fs in featuresets:
    horizon = next((label for keyword, label in horizon_labels if keyword in fs), '')
    scope = 'Full' if 'all_scores' in fs else 'MEMS-Only'
    fs_titles[fs] = 'Next-' + horizon + ' Prediction w/' + scope + ' Feature Set'
fs_titles

In [None]:
# Show the featureset identifiers found in the ROC results.
featuresets

## Optimized methods

In [None]:
# Draw one ROC panel per featureset (one curve per method plus the chance
# diagonal) and save the figure.
# The grid is sized from the number of featuresets instead of a hard-coded two
# panels, so a third featureset no longer raises IndexError. squeeze=False keeps
# `axes` indexable even when there is only a single panel.
n_panels = len(featuresets)
fig, axes = plt.subplots(
    1, n_panels,
    figsize=(7.5 * n_panels, 7.5),
    sharex=True, sharey=True,
    squeeze=False
)
axes = axes[0]

for i, fs in enumerate(featuresets):
    print(fs)
    ax = axes[i]
    df = roc_res_final[roc_res_final['featureset'] == fs]
    sns.lineplot(x='fpr', y='tpr', hue='legend_label', data=df, ax=ax)
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black',
            label='Chance', alpha=.8)
    ax.legend(title='', fontsize=12)

    # Panels are lettered (A), (B), ...; the per-panel y label is cleared on every
    # panel because the figure-level label below names the axis.
    panel_letter = chr(ord('A') + i)
    ax.set(xlabel=f'({panel_letter})', ylabel='')

# fig.legend(title='Model', loc='upper center')
fig.supxlabel('False Positive Rate', x = (fig.subplotpars.right + fig.subplotpars.left)/1.9)
fig.supylabel('True Positive Rate', y = (fig.subplotpars.top + fig.subplotpars.bottom)/1.65)

plt.savefig(Path.joinpath(consts.OUTPUT_PATH_PRED, 'roc_curves_optimized.png'))
plt.show()

In [None]:
# Show the featureset identifiers that the feature-importance section iterates over.
featuresets

# Feature Importance

In [None]:
# Plot and save a SHAP summary plot for every featureset / method combination
# of the tuned final classifiers.
plt.rcParams.update({'font.size': 20})


def _load_first_pickle(pattern):
    """Unpickle the first file (sorted by name) in OUTPUT_PATH_PRED matching ``pattern``.

    Raises:
        FileNotFoundError: if no file matches the pattern.
    """
    # sorted(): glob order is arbitrary, so pick the match deterministically.
    matches = sorted(consts.OUTPUT_PATH_PRED.glob(pattern))
    if not matches:
        raise FileNotFoundError(
            f'No file matching {pattern!r} in {consts.OUTPUT_PATH_PRED}'
        )
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this pipeline. The context manager closes the handle that the bare
    # pickle.load(open(...)) form used to leak.
    with open(matches[0], 'rb') as fh:
        return pickle.load(fh)


for fs in featuresets:
    for method in methods:
        shap_values = _load_first_pickle(f'shap_values_{fs}_{method}*_tuned_final_clf.pkl')

        if method in ('RF', 'SVM'):  # Get results for positive class only
            shap_values = shap_values[:, :, 1]

        feature_names = _load_first_pickle(f'feats_{fs}_{method}*_tuned_final_clf.pkl')

        # Prettify names for display: capitalise the first letter and turn
        # underscores (after the first character) into spaces.
        feature_names = [x[0].upper() + x[1:].replace('_', ' ') for x in feature_names]

        shap.summary_plot(shap_values, feature_names=feature_names, max_display=20, show=False)
        # NOTE(review): hard-coded relative path, unlike the other outputs which go
        # to consts.OUTPUT_PATH_PRED - confirm whether these should match.
        plt.savefig(f'results/washout/prediction_task/shap_all_{fs}_{method}.png', bbox_inches='tight')
        plt.show()
        # Release the figure so figures do not accumulate across loop iterations.
        plt.close()