In [None]:
# IO
from pathlib import Path
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

# Utility Libraries
import math
from datetime import datetime
import re
import csv
import itertools

# Data Processing
import pandas as pd
import numpy as np

# Predictive Analytics
import statsmodels.stats.api as sms
from sklearn.feature_selection import VarianceThreshold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from bcpn_pipeline import data, features, models, consts
import shap

# Viz
%matplotlib inline
import matplotlib as mpl
from matplotlib.dates import DateFormatter
from matplotlib.cbook import boxplot_stats
import matplotlib.dates as mdates
import matplotlib.transforms as mtrans
import seaborn as sns
sns.set_style("whitegrid")

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.autolayout': True})
# plt.rcParams.update({'figure.facecolor': [1.0, 1.0, 1.0, 1.0]})

# configure autoreloading of modules
%load_ext autoreload
%autoreload 2


# Load Results

In [None]:
import glob

# Sort the matches: glob returns paths in arbitrary, filesystem-dependent
# order, and the row order of the concatenated frames follows file order.
all_files = sorted(glob.glob("results/*.csv"))

pred = []  # frames from "final" files WITHOUT "auc" in their path
auc = []   # frames from "final" files WITH "auc" in their path

for filename in all_files:
    # Only "final" result files are used, so skip everything else before
    # paying the cost of parsing it (and before a malformed scratch CSV can
    # abort the whole cell).
    if 'final' not in filename:
        continue
    df = pd.read_csv(filename, index_col=None, header=0)
    if 'auc' in filename:
        auc.append(df)
    else:
        pred.append(df)

# pd.concat raises an opaque "No objects to concatenate" ValueError on an
# empty list; raise the same exception type with a message that says which
# inputs are missing.
if not pred or not auc:
    raise ValueError(
        "Expected both prediction and AUC 'final' CSVs matching results/*.csv; "
        "found %d prediction file(s) and %d AUC file(s)." % (len(pred), len(auc))
    )

pred_res = pd.concat(pred, axis=0, ignore_index=True)
auc_res = pd.concat(auc, axis=0, ignore_index=True)

In [None]:
# Inspect the concatenated frame built from the "final" result files without "auc" in their path.
pred_res

In [None]:
# Order the runs by test accuracy, then by feature set (both descending),
# and show only the columns needed to compare them side by side.
summary_columns = ['test_accuracy', 'train_accuracy', 'method', 'featureset']
df = pred_res.sort_values(by=['test_accuracy', 'featureset'], ascending=False)
df.loc[:, summary_columns]

In [None]:
# Inspect the concatenated frame built from the "final" result files with "auc" in their path.
auc_res

In [None]:
# Attach the matching pred_res columns to every auc_res row; rows are matched
# on the model-configuration keys listed in `on` (default inner join).
# NOTE: this overwrites auc_res, so re-run the loading cell before re-running
# this one -- merging a second time makes pandas suffix the now-duplicated
# columns with _x/_y.
auc_res = auc_res.merge(pred_res, on=['method', 'n_lags', 'featureset', 'target', 'optimized'])

# Legend text per row: "<method> (AUC = <mean> ± <std>)", with the ± rendered
# by matplotlib mathtext. The template is a raw string because '\p' is not a
# valid escape sequence in a normal string literal (DeprecationWarning, and a
# SyntaxWarning from Python 3.12); the resulting text is unchanged.
auc_res['legend_label'] = auc_res.apply(
    lambda x: r'%s (AUC = %0.2f $\pm$ %0.2f)' % (x['method'], x['test_mean_auc'], x['test_std_auc']),
    axis=1
)
auc_res

# ROC Curves

## Optimized Models

In [None]:
# Mean ROC curve for each optimized model on the 'study_day' feature set,
# plus the chance diagonal; the figure is written to results/ and displayed.
fig, ax = plt.subplots(figsize=(9, 5))
df = auc_res[(auc_res['featureset'] == 'study_day') & (auc_res['optimized'] == True)]

# x and y are passed by keyword: positional x/y are deprecated in seaborn 0.11
# and rejected from 0.12 on, where `data` is the only positional argument.
g = sns.lineplot(x='test_mean_fpr', y='test_mean_tpr', hue='legend_label', data=df, ax=ax)
g.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black',
       label='Chance', alpha=.8)
g.set(title='Mean ROC for Optimized Models',
      xlabel='False Positive Rate (Positive Label: 1)',
      ylabel='True Positive Rate (Positive Label: 1)')

# seaborn releases before 0.11 insert the hue variable name as an extra first
# legend entry, while newer ones do not. Blanking texts[0] therefore erases a
# real model label on newer seaborn, so drop that pseudo-entry by label
# instead, which is correct in both cases.
handles, labels = g.get_legend_handles_labels()
keep = [i for i, label in enumerate(labels) if label != 'legend_label']
g.legend([handles[i] for i in keep], [labels[i] for i in keep],
         title='Model', bbox_to_anchor=(1.05, 1), ncol=1)
plt.savefig('results/roc_curves_optimized.png')
plt.show()

# Feature Importance

In [None]:
# XGB x FS4, in paper
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project's own pipeline.
# The context manager closes the file handle, which the bare
# pickle.load(open(...)) form left open.
with open('feature_importance/shap_study_day_XGB_6_lags_optimized.ob', 'rb') as shap_file:
    shap_scores = pickle.load(shap_file)
X_test = pd.read_pickle('feature_importance/X_test_study_day_XGB_6_lags_optimized.ob')
# Readable feature names for the plot: underscores become spaces and only the
# first letter stays upper-case (e.g. 'some_feature' -> 'Some feature').
X_test.columns = [x.replace('_', ' ').capitalize() for x in X_test.columns]
# show=False keeps the figure open so it can be resized before it is rendered.
shap.summary_plot(shap_scores, X_test, show=False)
fig = plt.gcf()
fig.set_size_inches(12.5, 8.5)
# txt="I need the caption to be present a little below X-axis"
# plt.figtext(0.53, -0.01, txt, wrap=True, horizontalalignment='center', fontsize=14)
# plt.savefig('results/pred/feature_importance/anx.png')
plt.show()

In [None]:
# Size of the second axis of the SHAP array -- presumably the number of features; TODO confirm it equals X_test.shape[1].
shap_scores.shape[1]

In [None]:
# Inspect the frame passed to the SHAP plot, now carrying the prettified column names.
X_test