In [None]:
# IO
from pathlib import Path
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

# Utility Libraries
import math
from datetime import datetime
import re
import csv
import itertools

# Data Processing
import pandas as pd
import numpy as np

# Predictive Analytics
import statsmodels.stats.api as sms
from sklearn.feature_selection import VarianceThreshold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from bcpn_pipeline import data, features, models, consts
import shap

# Viz
%matplotlib inline
import matplotlib as mpl
from matplotlib.dates import DateFormatter
from matplotlib.cbook import boxplot_stats
import matplotlib.dates as mdates
import matplotlib.transforms as mtrans
import seaborn as sns
# sns.set_style("whitegrid")

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.autolayout': True})
# plt.rcParams.update({'figure.facecolor': [1.0, 1.0, 1.0, 1.0]})

# configure autoreloading of modules
%load_ext autoreload
%autoreload 2


In [None]:
# Output locations. Prediction results and feature-importance artifacts are
# currently read from (and written to) the same directory.
feat_imp_path = res_path = 'results/prediction_task/'
# feat_imp_path = 'feature_importance/'

# Load Results

In [None]:
import glob  # NOTE(review): not referenced in the visible cells; kept in case other cells use it

# Load the prediction, AUC and ROC result tables from `res_path`.
# The 'support' column of pred.csv is dropped right after loading.
pred_res = pd.read_csv(res_path + 'pred.csv').drop(columns=['support'])
auc_res = pd.read_csv(res_path + 'auc.csv')
roc_res = pd.read_csv(res_path + 'roc.csv')

# Best-effort numeric coercion: every column that can be parsed as numbers is
# converted; columns that cannot (e.g. text) are left exactly as loaded.
for df in [pred_res, auc_res, roc_res]:
    for col in df:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Only conversion failures are ignored; anything else (including
            # KeyboardInterrupt) now propagates instead of being swallowed.
            pass

In [None]:
# Display the prediction results loaded from pred.csv.
pred_res

In [None]:
# Display the AUC results loaded from auc.csv.
auc_res

In [None]:
# Display the ROC results loaded from roc.csv.
roc_res

In [None]:
# auc_res['type'] = 'test'
# roc_res['type'] = 'test'

In [None]:
# Get aggregate results across runs, by featureset and method.
# For every metric this computes the mean / std / var within each
# (featureset, method, features_selected, tuned, type) group, joins the
# per-metric tables side by side, attaches the AUC summary, and writes the
# result to '<res_path>pred_agg.csv'.
agg_res = None
pred_res_agg = pd.DataFrame()
groupby_cols = ['featureset', 'method', 'features_selected', 'tuned', 'type']
metrics = ['accuracy', 'precision', 'sensitivity', 'specificity', 'f1_score']

for metric in metrics:
    # Named aggregation (output_name=func). Passing a {name: func} dict to a
    # SeriesGroupBy.agg was removed in pandas 1.0 and raises
    # SpecificationError ("nested renamer is not supported").
    df = (
        pred_res.groupby(groupby_cols)[metric]
        .agg(**{
            metric + '_mean': 'mean',
            metric + '_std': 'std',
            metric + '_var': 'var',
        })
        .reset_index()
    )

    if pred_res_agg.empty:
        pred_res_agg = df
    else:
        pred_res_agg = pred_res_agg.merge(df, on=groupby_cols)

# Outer merge keeps groups that appear in only one of the two tables.
pred_res_agg = pred_res_agg.merge(auc_res[['auc_mean', 'auc_std'] + groupby_cols], on=groupby_cols, how='outer')
# -1 is used as the placeholder for every missing statistic.
pred_res_agg = pred_res_agg.fillna(-1)
pred_res_agg.to_csv(res_path + 'pred_agg.csv')
pred_res_agg

In [None]:

# Collapse each '<metric>_mean' / '<metric>_std' pair into one display string
# of the form 'mean $\pm$ std', keep only the grouping columns plus those
# strings, and write the table to '<res_path>pred_agg_condensed.csv'.
# Raw string: '\p' is not a valid escape sequence in a normal string literal.
pm_format = r'%0.4f $\pm$ %0.2f'

for metric in metrics + ['auc']:
    pred_res_agg = pred_res_agg.rename(columns={metric + '_mean': metric})
    # Pairing the two columns directly (instead of a row-wise apply) also
    # works when the table has no rows.
    pred_res_agg[metric] = [
        pm_format % (mean, std)
        for mean, std in zip(pred_res_agg[metric], pred_res_agg[metric + '_std'])
    ]
pred_res_agg = pred_res_agg[groupby_cols + metrics + ['auc']]
pred_res_agg.to_csv(res_path + 'pred_agg_condensed.csv')
pred_res_agg

In [None]:
# Display the AUC results again for reference.
auc_res

In [None]:
# Columns of roc_res that are not '*_mean' / '*_std' statistics.
# Computed in this cell so that it runs on a fresh kernel: previously the cell
# displayed a name that had not been defined yet in a top-to-bottom run.
merge_cols = [col for col in roc_res.columns if '_mean' not in col and '_std' not in col]
merge_cols

In [None]:
# Attach the AUC summary (auc_mean / auc_std) to the ROC rows, joining on
# every roc_res column that is not a '*_mean' / '*_std' statistic, then build
# the per-row legend text used by the ROC plots.
#
# The merge is skipped when roc_res already carries 'auc_mean', so re-running
# this cell no longer fails (a second merge would try to join on
# 'legend_label', which auc_res does not have).
if 'auc_mean' not in roc_res.columns:
    merge_cols = [col for col in roc_res.columns if '_mean' not in col and '_std' not in col]
    roc_res = roc_res.merge(auc_res, on=merge_cols)

# Raw string: '\p' is not a valid escape sequence in a normal string literal.
roc_res['legend_label'] = roc_res.apply(
    lambda x: r'%s (AUC = %0.2f $\pm$ %0.2f)' % (x['method'], x['auc_mean'], x['auc_std']),
    axis=1
)
roc_res

# ROC Curves

In [None]:
# Distinct feature sets and methods present in the ROC table.
featuresets = list(roc_res['featureset'].unique())
methods = list(roc_res['method'].unique())

# Human-readable plot title per feature set. The horizon word is the first of
# day / week / month found in the feature-set name (none -> no word), and the
# scope is 'Full' when the name contains 'all_scores', otherwise 'MEMS-Only'.
horizon_labels = (('day', 'Day'), ('week', 'Week'), ('month', 'Month'))

fs_titles = {}
for fs in featuresets:
    horizon = next((label for key, label in horizon_labels if key in fs), '')
    scope = 'Full' if 'all_scores' in fs else 'MEMS-Only'
    title = 'Next-' + horizon + ' Prediction w/' + scope + ' Feature Set'
    fs_titles[fs] = title
fs_titles

## Optimized methods

In [None]:
# One figure per feature set: mean ROC curve of every model that is both tuned
# and feature-selected, plus the diagonal chance line. Each figure is saved to
# '<res_path>roc_curves_optimized_<featureset>.png'.
for fs in featuresets:
    print(fs)
    fig, ax = plt.subplots(figsize=(9, 5))
    df = roc_res[(roc_res['featureset'] == fs) & (roc_res['tuned'] ==  True) & (roc_res['features_selected'] == True)]
    # x / y must be keywords: seaborn >= 0.12 rejects them as positional args.
    g = sns.lineplot(x='fpr_mean', y='tpr_mean', hue='legend_label', data=df, ax=ax)
    g.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black',
            label='Chance', alpha=.8)

    g.set(title= 'Mean ROC for Optimized methods\n %s' % (fs_titles[fs]),
          xlabel='False Positive Rate (Positive Label: 1)',
          ylabel='True Positive Rate (Positive Label: 1)')
    # NOTE(review): blanks the first legend text. Presumably this hides the hue
    # variable name that older seaborn versions emit as the first entry; on
    # newer versions it may blank a real model label instead - confirm.
    g.legend(title='Model',
             bbox_to_anchor=(1.05, 1), ncol=1).texts[0].set_text('')
    fig.savefig(res_path  +'roc_curves_optimized_%s.png' % (fs))
    plt.show()
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)

# Feature Importance

In [None]:
# TODO - re-tune LogisticR since its ROC curve is poor
# ensure shap dfs get renamed to "optimized" if selected feats and tuned are true
#
# For every (featureset, method) pair: load the pickled SHAP DataFrame, draw a
# dot-style SHAP summary plot with prettified feature names, and save it next
# to the pickle as '<filename>.png'.
for fs in featuresets:
    print(fs)
    # n_lags is read from the first pred_res row of this feature set.
    n_lags = pred_res[pred_res['featureset'] == fs]['n_lags'].iloc[0]
    for method in methods:
        filename = 'shap_df_%s_%s_%d_lags_tuned' % (fs, method, n_lags)
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were produced by this project.
        # `with` closes the file handle, which the bare open() call leaked.
        with open(feat_imp_path + filename + '.ob', 'rb') as shap_file:
            shap_df = pickle.load(shap_file)
        shap_scores = shap_df.to_numpy()
        feature_names = list(shap_df.columns)
        feature_names = [x.replace('_', ' ').replace('(', ' (').capitalize() for x in feature_names]
        # show=False keeps the figure open so it can be saved below.
        shap.summary_plot(shap_scores, feature_names, show=False, plot_type='dot')
#         fig = plt.gcf()
#         fig.set_size_inches(12.5, 8.5)
        plt.savefig(feat_imp_path + filename + '.png')
        plt.show()
        # Release the current figure so figures do not accumulate in the loop.
        plt.close()

In [None]:
# Bar-style SHAP summary plot using the raw column names as feature labels.
# NOTE: `shap_df` is whatever the SHAP loading loop assigned last, i.e. the
# final featureset/method combination it processed.
shap_scores = shap_df.to_numpy()
feats = list(shap_df.columns)
shap.summary_plot(shap_scores, feats, show=False, plot_type='bar')

In [None]:
# Load one pickled SHAP explainer (file name is hard-coded) and display it.
filename = f'{feat_imp_path}shap_explainer_study_day_LogisticR_2_lags_tuned_run_0_fold_0.pkl'
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this project. `with` closes the handle the bare open() leaked.
with open(filename, 'rb') as explainer_file:
    explainer = pickle.load(explainer_file)
explainer

In [None]:
# Load one pickled SHAP values object (file name is hard-coded) and display it.
filename = f'{feat_imp_path}shap_values_study_day_LogisticR_2_lags_tuned_run_0_fold_0.pkl'
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this project. `with` closes the handle the bare open() leaked.
with open(filename, 'rb') as shap_values_file:
    shap_values = pickle.load(shap_values_file)
shap_values

In [None]:
# Bar plot for the element at index 0 of the loaded shap_values
# (presumably the first explained sample - confirm).
shap.plots.bar(shap_values[0])