In [None]:
# IO
from pathlib import Path
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

# Utility Libraries
import math
from datetime import datetime
import re
import csv
import itertools

# Data Processing
import pandas as pd
import numpy as np

# Predictive Analytics
import statsmodels.stats.api as sms
from sklearn.feature_selection import VarianceThreshold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from bcpn_pipeline import data, features, models, consts
import shap

# Viz
%matplotlib inline
import matplotlib as mpl
from matplotlib.dates import DateFormatter
from matplotlib.cbook import boxplot_stats
import matplotlib.dates as mdates
import matplotlib.transforms as mtrans
import seaborn as sns
sns.set_style("whitegrid")

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.autolayout': True})
# plt.rcParams.update({'figure.facecolor': [1.0, 1.0, 1.0, 1.0]})

# configure autoreloading of modules
%load_ext autoreload
%autoreload 2


In [None]:
# Shared configuration for the rest of the notebook.

# Key columns used to join the ROC-curve table with the AUC table.
merge_cols = [
    'n_lags',
    'featureset',
    'optimized',
    'target',
    'method',
    'run',
    'n_features',
    'n_samples',
]

# Metric names; the prediction results store them with a 'test_' prefix.
metrics = [
    'accuracy',
    'precision',
    'recall',
    'f1_score',
]

# Directory holding the results CSVs (also where derived tables/plots are written).
res_path = 'results/'

# Directory holding the pickled SHAP values / test sets and the saved SHAP plots.
feat_imp_path = 'feature_importance/'

# Load Results

In [None]:
import glob


def _read_results_csv(csv_path, header_col):
    """Load a results CSV, drop repeated header rows and coerce numeric columns.

    The results files contain copies of the header line inside the CSV body
    (see the "remove headers from CSV body" filtering below). Those rows are
    detected by looking for the column name itself inside ``header_col``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    header_col : str
        Name of a column whose own name appears as a value in stray header rows.

    Returns
    -------
    pandas.DataFrame
        An independent frame (not a view) without the stray header rows; every
        column that can be parsed as numeric is converted, the rest are left
        untouched (e.g. string columns such as 'True'/'False' flags).
    """
    frame = pd.read_csv(csv_path)

    # Cast to str first so the check also works when the file has no stray
    # headers and pandas already parsed the column as numeric (a numeric
    # column has no `.str` accessor).
    is_header_row = frame[header_col].astype(str).str.contains(header_col, regex=False)

    # .copy() so the column assignments below act on a real frame rather than
    # on a slice of the raw one (avoids SettingWithCopyWarning).
    frame = frame[~is_header_row].copy()

    for col in frame.columns:
        try:
            frame[col] = pd.to_numeric(frame[col])
        except (ValueError, TypeError):
            # Column is genuinely non-numeric; keep it as loaded.
            pass

    return frame


pred_res = _read_results_csv(res_path + 'pred_results.csv', 'test_accuracy')
pred_res = pred_res.drop(columns=['test_support'])

auc_res = _read_results_csv(res_path + 'auc_results.csv', 'auc_mean')

roc_res = _read_results_csv(res_path + 'roc_curves.csv', 'tpr_mean')

In [None]:
# Display the loaded AUC results table for inspection.
auc_res

In [None]:
# Display the loaded prediction results table for inspection.
pred_res

In [None]:
# Get aggregate results across runs, by featureset and method
groupby_cols = ['featureset', 'method', 'optimized']

# Build one named-aggregation spec covering every metric:
#   '<metric>_mean' / '<metric>_std' / '<metric>_var'  <-  'test_<metric>'
# Named aggregation replaces the dict-renaming form of SeriesGroupBy.agg,
# which raises SpecificationError on pandas >= 1.0. Doing it in a single
# groupby also removes the per-metric merge loop while producing the same
# columns in the same order.
agg_spec = {}
for metric in metrics:
    for stat in ('mean', 'std', 'var'):
        agg_spec['%s_%s' % (metric, stat)] = ('test_' + metric, stat)

agg_res = pred_res.groupby(groupby_cols).agg(**agg_spec).reset_index()

# Attach the AUC summary computed upstream for the same groups.
agg_res = agg_res.merge(auc_res[['auc_mean', 'auc_std'] + groupby_cols], on=groupby_cols)
agg_res.to_csv(res_path + 'agg_res_full.csv')
agg_res

In [None]:
# Condense each metric into a single "mean ± std" string column (LaTeX-style
# '$\pm$' so the table can be pasted into a paper), keeping only the group
# keys and one column per metric.
condensed = agg_res.copy()
for metric in metrics + ['auc']:
    # Raw string: '\p' is not a valid escape sequence and triggers a
    # SyntaxWarning on Python 3.12+; the resulting text is unchanged.
    condensed[metric] = [
        r'%0.4f $\pm$ %0.2f' % (mean, std)
        for mean, std in zip(condensed[metric + '_mean'], condensed[metric + '_std'])
    ]

agg_res = condensed[groupby_cols + metrics + ['auc']]
agg_res.to_csv(res_path + 'agg_res_condensed.csv')
agg_res

In [None]:
# Attach the AUC summary to every ROC-curve row so that each curve can carry
# its AUC in the plot legend.
roc_res = roc_res.merge(auc_res, on=merge_cols)

# Legend text of the form "<method> (AUC = <mean> ± <std>)".
# Raw string: '\p' is not a valid escape sequence and triggers a
# SyntaxWarning on Python 3.12+; the resulting text is unchanged.
roc_res['legend_label'] = [
    r'%s (AUC = %0.2f $\pm$ %0.2f)' % (method_name, auc_mean, auc_std)
    for method_name, auc_mean, auc_std in zip(
        roc_res['method'], roc_res['auc_mean'], roc_res['auc_std']
    )
]
roc_res

# ROC Curves

In [None]:
featuresets = list(roc_res['featureset'].unique())
methods = list(roc_res['method'].unique())

# (keyword, label) pairs checked in order; the first keyword found in the
# featureset name decides the prediction horizon shown in the title.
horizon_labels = (('day', 'Day'), ('week', 'Week'), ('month', 'Month'))

# Human-readable plot title for every featureset.
fs_titles = {}
for fs in featuresets:
    horizon = next((label for keyword, label in horizon_labels if keyword in fs), '')
    scope = 'Full' if 'all_scores' in fs else 'MEMS-Only'
    title = 'Next-' + horizon + ' Prediction w/' + scope + ' Feature Set'
    fs_titles[fs] = title
fs_titles

## Optimized methods

In [None]:
# Plot the mean ROC curve of every optimized method, one figure per featureset,
# and save each figure under res_path.

# Compare as strings so the filter works whether the column was loaded as
# text ('True'/'False') or as real booleans.
is_optimized = roc_res['optimized'].astype(str) == 'True'

for fs in featuresets:
    fig, ax = plt.subplots(figsize=(9, 5))
    df = roc_res[(roc_res['featureset'] == fs) & is_optimized]

    # x/y must be passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    sns.lineplot(x='fpr_mean', y='tpr_mean', hue='legend_label', data=df, ax=ax)
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black',
            label='Chance', alpha=.8)

    ax.set(title='Mean ROC for Optimized methods\n %s' % (fs_titles[fs]),
           xlabel='False Positive Rate (Positive Label: 1)',
           ylabel='True Positive Rate (Positive Label: 1)')

    legend = ax.legend(title='Model', bbox_to_anchor=(1.05, 1), ncol=1)
    # Older seaborn versions add the hue variable name as a pseudo-entry at
    # the top of the legend; blank it out. Matching on the text (instead of
    # blindly clearing entry 0) keeps the first model label intact on
    # versions that do not add that entry.
    for text in legend.get_texts():
        if text.get_text() == 'legend_label':
            text.set_text('')

    fig.savefig(res_path + 'roc_curves_optimized_%s.png' % (fs))
    plt.show()
    plt.close(fig)  # release the figure; one is created per featureset

# Feature Importance

In [None]:
# For every (featureset, method) pair, load the stored SHAP values and test
# set, draw a bar-style SHAP summary plot and save it next to the inputs.
for fs in featuresets:
    print(fs)
    # Number of lags recorded for this featureset (taken from its first row).
    n_lags = pred_res[pred_res['featureset'] == fs]['n_lags'].iloc[0]
    for method in methods:
        suffix = '%s_%s_%d_lags_optimized' % (fs, method, n_lags)

        # NOTE: pickle can execute arbitrary code while loading; only load
        # files produced by this project's own pipeline.
        # `with` guarantees the file handle is closed after loading.
        with open(feat_imp_path + 'shap_' + suffix + '.ob', 'rb') as shap_file:
            shap_scores = pickle.load(shap_file)

        # Get results only for the positive class, for classifiers with multiple outputs
        if method in ('RF', 'SVM'):  # classifiers with multiple outputs
            shap_scores = shap_scores[1]

        X_test = pd.read_pickle(feat_imp_path + 'X_test_' + suffix + '.ob')
        # Prettify feature names for the plot axis, e.g. 'some_name' -> 'Some name'.
        X_test.columns = [x.replace('_', ' ').capitalize() for x in X_test.columns]

        # show=False so the figure can be resized and saved before display.
        shap.summary_plot(shap_scores, X_test, show=False, plot_type='bar')
        fig = plt.gcf()
        fig.set_size_inches(12.5, 8.5)
        fig.savefig(feat_imp_path + 'shap_' + suffix + '.png')
        plt.show()
        plt.close(fig)  # release the figure; one is created per method

In [None]:
# Load the pickled SHAP values and the matching features object for the
# 'study_day' / 'LogisticR' run.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
# `with` guarantees each file handle is closed after loading.
with open('results/prediction_task/shap_study_day_LogisticR_2.ob', 'rb') as shap_file:
    shap_scores = pickle.load(shap_file)
with open('results/prediction_task/features_study_day_LogisticR_2.ob', 'rb') as feats_file:
    shap_feats = pickle.load(feats_file)

In [None]:
# Load the pickled SHAP dataframe for the 'study_day' / 'LogisticR' run.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
# `with` guarantees the file handle is closed after loading.
with open('results/prediction_task/shap_df_study_day_LogisticR_2.ob', 'rb') as shap_df_file:
    shap_df = pickle.load(shap_df_file)

In [None]:
# Bar-style SHAP summary built from the SHAP dataframe: its column labels are
# passed as the feature names and its values as the SHAP matrix.
feats = list(shap_df.columns)
shap_scores = shap_df.to_numpy()
shap.summary_plot(
    shap_scores,
    feats,
    plot_type='bar',
    show=False,
)