In [None]:
# IO
from pathlib import Path
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

# Utility Libraries
import math
from datetime import datetime
import re
import csv
import itertools

# Data Processing
import pandas as pd
import numpy as np

# Predictive Analytics
import statsmodels.stats.api as sms
from sklearn.feature_selection import VarianceThreshold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from bcpn_pipeline import data, features, models, consts
import shap

# Viz
%matplotlib inline
import matplotlib as mpl
from matplotlib.dates import DateFormatter
from matplotlib.cbook import boxplot_stats
import matplotlib.dates as mdates
import matplotlib.transforms as mtrans
import seaborn as sns
sns.set_style("whitegrid")

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.autolayout': True})
# plt.rcParams.update({'figure.facecolor': [1.0, 1.0, 1.0, 1.0]})

# configure autoreloading of modules
%load_ext autoreload
%autoreload 2


# Load Results

In [None]:
import glob


def _drop_repeated_headers(frame, marker_col):
    """Return a copy of ``frame`` without rows that repeat the CSV header.

    A row is treated as a stray header when its ``marker_col`` cell contains
    the column's own name. The column is cast to ``str`` first so the check
    also works when pandas already parsed it as numeric (i.e. the file had no
    repeated headers) or when it contains NaN.
    """
    is_header = frame[marker_col].astype(str).str.contains(marker_col)
    # .copy() so later column assignments act on an independent frame rather
    # than a slice (avoids SettingWithCopyWarning).
    return frame.loc[~is_header].copy()


def _coerce_numeric_columns(frame):
    """Convert, in place, every column that parses cleanly as numeric.

    Columns that cannot be parsed are left untouched. Returns ``frame`` for
    convenience.
    """
    for col in frame.columns:
        try:
            frame[col] = pd.to_numeric(frame[col])
        except (ValueError, TypeError):
            # Column holds non-numeric text; keep it as-is. Any other error
            # is unexpected and is allowed to surface.
            pass
    return frame


pred_res = _drop_repeated_headers(pd.read_csv('results/pred_results.csv'), 'test_accuracy')

auc_res = _drop_repeated_headers(pd.read_csv('results/auc_results.csv'), 'test_tpr')
auc_res['target'] = 'adherent' # Add missing target col

pred_res = _coerce_numeric_columns(pred_res)
auc_res = _coerce_numeric_columns(auc_res)

In [None]:
# Preview the first rows of the prediction results to sanity-check the load.
pred_res.head()

In [None]:
# Rank runs from highest to lowest test accuracy; ties are ordered by
# featureset, also descending (ascending=False applies to both keys).
df = pred_res.sort_values(by=['test_accuracy', 'featureset'], ascending=False)
# Show only the key columns for the top-ranked runs.
df[['test_accuracy', 'train_accuracy', 'method', 'featureset']].head()

In [None]:
# Preview the first rows of the AUC / ROC results.
auc_res.head()

In [None]:
# Attach the prediction metrics in `pred_res` to each row of `auc_res` and build
# the text used for the legend entries.
#
# The guard makes this cell safe to re-run: merging a second time would
# duplicate the metric columns with _x/_y suffixes, and the lookups of
# 'test_mean_auc' / 'test_std_auc' below would then fail.
if 'legend_label' not in auc_res.columns:
    auc_res = auc_res.merge(pred_res, on=['method', 'n_lags', 'featureset', 'target', 'optimized'])
    # Raw string: '\p' is not a valid Python escape sequence, so a plain
    # string literal triggers an invalid-escape warning. The resulting text
    # (including the mathtext "$\pm$") is unchanged.
    auc_res['legend_label'] = auc_res.apply(
        lambda x: r'%s (AUC = %0.2f $\pm$ %0.2f)' % (x['method'], x['test_mean_auc'], x['test_std_auc']),
        axis=1
    )
auc_res

In [None]:
# Spot-check a few rows of the merged frame by position.
# NOTE(review): position 5 is listed twice, so that row is displayed twice —
# confirm whether a different index was intended.
auc_res.iloc[[0, 5, 2, 5]]

In [None]:
# List the distinct lag settings present in the merged results.
auc_res.n_lags.unique()

# ROC Curves

In [None]:
# Distinct feature sets and model names present in the merged AUC results.
# NOTE(review): `models` rebinds the name of the `models` module imported from
# bcpn_pipeline at the top of the notebook; later cells iterate over this list.
featuresets = list(auc_res['featureset'].unique())
models = list(auc_res['method'].unique())

# Map each feature set name to a human-readable plot title.
fs_titles = dict.fromkeys(featuresets)
for fs in fs_titles:
    # Prediction horizon is encoded in the feature set name.
    if 'day' in fs:
        horizon = 'Day'
    elif 'week' in fs:
        horizon = 'Week'
    else:
        horizon = ''

    # Feature sets containing 'all_scores' are labelled as the full set.
    scope = 'Full' if 'all_scores' in fs else 'MEMS-Only'

    title = ''.join(['Next-', horizon, ' Prediction w/', scope, ' Feature Set'])
    fs_titles[fs] = title
fs_titles

## Optimized Models

In [None]:
# Plot the mean ROC curve of every optimized model, one figure per feature set.

# Compare `optimized` as text so the filter works whether the column was
# loaded as the strings 'True'/'False' or as real booleans. The mask does not
# depend on the feature set, so it is computed once.
is_optimized = auc_res['optimized'].astype(str) == 'True'

for fs in featuresets:
    df = auc_res[(auc_res['featureset'] == fs) & is_optimized]
    if df.empty:
        # Nothing to draw; avoid producing a figure that only shows "Chance".
        print('No optimized results for featureset %s; skipping ROC plot.' % fs)
        continue

    plt.figure(figsize=(9,5))
    # x / y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    g = sns.lineplot(x='test_fpr', y='test_tpr', hue='legend_label', data=df)
    g.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black',
            label='Chance', alpha=.8)

    g.set(title= 'Mean ROC for Optimized Models\n %s' % (fs_titles[fs]),
          xlabel='False Positive Rate (Positive Label: 1)',
          ylabel='True Positive Rate (Positive Label: 1)')
    legend = g.legend(title='Model', bbox_to_anchor=(1.05, 1), ncol=1)
    # Some seaborn versions add the hue column name as a pseudo-title entry.
    # Blank only that entry; blanking texts[0] unconditionally would erase a
    # real model label when no such entry exists.
    for text in legend.texts:
        if text.get_text() == 'legend_label':
            text.set_text('')
#     plt.savefig('results/roc_curves_optimized_%s.png' % (fs))
    plt.show()

# Feature Importance

In [None]:
# For every (feature set, model) pair, load the saved SHAP values and test
# features, then draw and save a bar-style SHAP summary plot.
for fs in featuresets:
    print(fs)
    # Lag count taken from the first prediction-results row for this feature set.
    n_lags = pred_res[pred_res['featureset'] == fs]['n_lags'].iloc[0]
    for model in models:
        # Shared part of the three artifact file names for this run.
        stem = '%s_%s_%d_lags_optimized' % (fs, model, n_lags)

        # SECURITY: unpickling executes arbitrary code; only load artifacts
        # produced by a trusted run of this project.
        # The context manager closes the file handle, which the previous
        # pickle.load(open(...)) form never did.
        with open('feature_importance/shap_%s.ob' % stem, 'rb') as shap_file:
            shap_scores = pickle.load(shap_file)

        # Handle special case of RF - get results only for the positive class
        if model == 'RF':
            shap_scores = shap_scores[1]
        X_test = pd.read_pickle('feature_importance/X_test_%s.ob' % stem)
        # Prettify column names for the plot labels.
        X_test.columns = [x.replace('_', ' ').capitalize() for x in X_test.columns]
        shap.summary_plot(shap_scores, X_test, show=False, plot_type='bar')
        fig = plt.gcf()
        fig.set_size_inches(12.5, 8.5)
        plt.savefig('feature_importance/shap_%s.png' % stem)
        plt.show()
        # Release the figure explicitly so figures do not pile up in memory
        # across loop iterations.
        plt.close(fig)

In [None]:
# Size of the second dimension of `shap_scores` as left over from the last
# iteration of the feature-importance loop (presumably the number of
# features — TODO confirm). Depends on that loop having run in this kernel.
shap_scores.shape[1]

In [None]:
# Display `X_test` as left over from the last iteration of the
# feature-importance loop (its columns were renamed there for plotting).
X_test