# Model interpretation

In [None]:
# Uncomment the line below to install the imbalanced-learn (imblearn) package
# !pip install imbalanced-learn

In [None]:
# !pip install scikit-optimize
# !pip install lime

In [1]:
import pandas as pd
import numpy as np
import sklearn
import shap
import lime

#statistics
from scipy.stats import chi2_contingency, ttest_ind

# import cudf #gpu-powered DataFrame (Pandas alternative)

#imbalance handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, RepeatedEditedNearestNeighbours
from imblearn.pipeline import Pipeline

#preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

#hyperparameter search
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

#internal validation
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, PredefinedSplit, RandomizedSearchCV


#performance metrices
from sklearn.metrics import precision_recall_curve, make_scorer, confusion_matrix, classification_report, f1_score, balanced_accuracy_score, r2_score, auc, average_precision_score, roc_auc_score, recall_score, roc_curve, accuracy_score

#Models selection
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
# from cuml.svm import SVC #gpu-powered SVM


#save and load trained model
import pickle

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import os

# Fixed seed, intended to be passed as `random_state` to estimators so that runs are repeatable.
random_state = 42

In [20]:
# Load data: a single pickle holding four objects, unpacked in this order:
# grid-search data, cross-validation data, internal evaluation data, external evaluation data.
# NOTE(security): unpickling can execute arbitrary code -- only load files you produced yourself.
# The context manager guarantees the file handle is closed once loading finishes.
with open('PATH-TO-FILE', 'rb') as data_file:
    gridSearchData, crossValData, internalEvaluationData, externalEvaluationData = pickle.load(data_file)


In [None]:
# Candidate features expected to remain after filtering:
#   'sex', 'rhinitis', 'cardiovascular', 'heartfailure',
#   'psoriasis', 'anaphylaxis', 'diabetes', 'ihd', 'anxiety', 'eczema', 'nasalpolyps',
#   'ethnic_group_Asian', 'ethnic_group_Black', 'ethnic_group_Mixed', 'ethnic_group_Other',
#   'ethnic_group_White', 'ethnic_group_not recorded', 'smokingStatus_current',
#   'smokingStatus_former', 'smokingStatus_never', 'DeviceType_BAI', 'DeviceType_DPI',
#   'DeviceType_NEB', 'DeviceType_not recorded', 'DeviceType_pMDI', 'PriorEducation_No',
#   'PriorEducation_Yes', 'age', 'average_daily_dose_ICS', 'prescribed_daily_dose_ICS',
#   'ICS_medication_possesion_ratio', 'numPCS', 'numPCSAsthma', 'numAntibioticsEvents',
#   'numAntibioticswithLRTI', 'numOCSEvents', 'numOCSwithLRTI', 'numAsthmaAttacks',
#   'numAcuteRespEvents', 'numHospEvents', 'numAsthmaManagement', 'numAsthmaReview',
#   'imd_decile', 'CharlsonScore', 'BTS_step', 'BMI_cat', 'PEFStatus', 'EosinophilLevel'

# Identifier columns.
_identifier_cols = ['patid', 'practice_id', 'set']

# Columns dropped in favour of another representation (reasons as noted by the author).
_superseded_cols = [
    'BMI',         # use the categorical version instead
    'ethnicity',   # use ethnic_group instead
    'Spacer',      # all zero
    'asthmaPlan',  # use the continuous one
]

# Outcome variables.
_outcome_cols = [
    'outcome_3months', 'outcome_6months', 'outcome_9months', 'outcome_12months',
    'outcome_15months', 'outcome_18months', 'outcome_21months', 'outcome_24months',
    'outcome_combined_6months', 'outcome_combined_9months', 'outcome_combined_12months',
    'outcome_combined_15months', 'outcome_combined_18months', 'outcome_combined_24months',
    '3months', '6months', '12months', '24months',
]

# Location-related variables; only the IMD decile is kept.
_location_cols = ['postcode_district', 'County', 'LocalAuthority', 'OutputAreaClassification']

# Categorised variants; the continuous variables are used instead.
_categorised_cols = [
    'age_cat', 'ICS_medication_possesion_ratio_cat', 'numOCS_cat', 'numOCSEvents_cat',
    'numOCSwithLRTI_cat', 'numAcuteRespEvents_cat', 'numAntibioticsEvents_cat',
    'numAntibioticswithLRTI_cat', 'numAsthmaAttacks_cat', 'numHospEvents_cat', 'numPCS_cat',
    'numPCSAsthma_cat', 'numAsthmaManagement_cat', 'numAsthmaReview_cat',
    'numAsthmaMedReview_cat', 'numAsthmaReviewRCP_cat', 'average_daily_dose_ICS_cat',
    'prescribed_daily_dose_ICS_cat',
]

# Count variables; the binary indicators are used instead.
_count_cols = [
    'count_rhinitis', 'count_cardiovascular', 'count_heartfailure',
    'count_psoriasis', 'count_anaphylaxis', 'count_diabetes', 'count_ihd',
    'count_anxiety', 'count_eczema', 'count_nasalpolyps',
    'count_paracetamol', 'count_nsaids', 'count_betablocker',
]

# No data in the evaluation set (the last two carry no stated reason in the original list).
_unavailable_cols = [
    'paracetamol', 'nsaids', 'betablocker',
    'numAsthmaMedReview', 'numAsthmaReviewRCP',
]

exclude_columns = (
    _identifier_cols
    + _superseded_cols
    + _outcome_cols
    + _location_cols
    + _categorised_cols
    + _count_cols
    + _unavailable_cols
)
# (A disabled variant additionally excluded every column whose name contains '_count'.)

# Keep every column of the grid-search frame that is not excluded, preserving column order.
_all_columns = gridSearchData.columns.to_list()
features_columns = [col for col in _all_columns if col not in exclude_columns]

print('Features size: ', len(features_columns))
print(features_columns)

In [None]:
# Feature matrices for each data split, restricted to the selected feature columns.
X_cross = crossValData[features_columns]
# The labels must come from the same frame as X_cross so that rows stay aligned;
# they were previously read from externalEvaluationData (copy-paste error).
y_cross = crossValData[['outcome_12months']]
X_grid = gridSearchData[features_columns]
X_internalVal = internalEvaluationData[features_columns]
X_externalVal = externalEvaluationData[features_columns]
y_externalVal = externalEvaluationData[['outcome_12months']]

# Quick sanity check of the split sizes.
print(X_cross.shape)
print(X_grid.shape)
print(X_internalVal.shape)
print(X_externalVal.shape)


# Outcome columns to iterate over.
# Disabled alternative kept from the original: target_outcomes = ['12months']
target_outcomes = ['outcome_3months', 'outcome_6months', 'outcome_9months', 'outcome_12months']

# Model identifiers to iterate over; 'DT' and 'RF' are currently disabled.
model_names = ['LR', 'XGB']

In [34]:
# Helper that scores a fitted classifier on a test set
def summariseResult(testX, testY, model):
    """Return (ROC AUC, average precision) of `model` on a test set.

    Parameters
    ----------
    testX : features accepted by ``model.predict_proba``.
    testY : true binary labels matching the rows of ``testX``.
    model : fitted classifier exposing ``predict_proba``; column 1 of its
        output is used as the score of the positive class.

    Returns
    -------
    tuple
        ``(auc, auprc)``, each rounded to 4 decimal places.
    """
    # Slice the positive-class column directly instead of looping over rows in Python.
    positive_scores = np.asarray(model.predict_proba(testX))[:, 1]
    # The previous roc_curve() call was removed: its outputs were never used.
    aucscore = roc_auc_score(testY, positive_scores)
    auprc = average_precision_score(testY, positive_scores)
    return np.round(aucscore, 4), np.round(auprc, 4)


#Helper for processing the hyperparameters
def process_params(param_items, best_param):
    """Pair hyperparameter names with their values.

    Both arguments are strings containing Python expressions. Each one is
    evaluated with ``eval`` and the two resulting iterables are zipped into a
    dict (names from ``param_items``, values from ``best_param``). Pairing
    stops at the shorter iterable; a repeated name keeps its last value.

    NOTE: ``eval`` executes arbitrary code -- only pass trusted strings.
    """
    names = eval(param_items)
    values = eval(best_param)
    return dict(zip(names, values))

# Train parsimonious model

In [None]:
def _top_n_features(fitted_model, model_name, feature_names, n):
    """Return the names of the `n` highest-ranked features, least to most important.

    'XGB' models are ranked by ``feature_importances_``; 'LR' models by the
    absolute value of their coefficients, so that strongly negative predictors
    are not discarded. `feature_names` must be in the column order the model
    was fitted with.
    """
    if model_name == 'XGB':
        scores = np.asarray(fitted_model.feature_importances_)
    elif model_name == 'LR':
        scores = np.abs(fitted_model.coef_[0])
    else:
        raise ValueError(f'No feature-ranking rule for model {model_name!r}')
    names = np.asarray(feature_names)
    if len(scores) != len(names):
        raise ValueError(
            f'Model has {len(scores)} features but {len(names)} feature names were supplied'
        )
    return names[np.argsort(scores)][-n:].tolist()


def _build_model(model_name, params, y_train):
    """Create an unfitted classifier of the requested kind from tuned hyperparameters."""
    if model_name == 'XGB':
        # Weight the positive class by the negative/positive ratio of the training labels
        # for the outcome being modelled (labels are assumed to be coded 0/1).
        scale_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
        return xgb.XGBClassifier(objective='binary:logistic', tree_method='hist',
                                 n_estimators=params['n_estimators'],
                                 max_depth=params['max_depth'],
                                 learning_rate=params['learning_rate'],
                                 reg_alpha=params['reg_alpha'],
                                 reg_lambda=params['reg_lambda'],
                                 scale_pos_weight=scale_pos_ratio,
                                 device='cuda',  # switch to 'cpu' when no GPU is available
                                 verbosity=3,
                                 random_state=random_state)
    if model_name == 'LR':
        return LogisticRegression(class_weight='balanced', C=params['C'],
                                  max_iter=params['max_iter'], solver=params['solver'],
                                  random_state=random_state)
    raise ValueError(f'No model builder for {model_name!r}')


# Number of top-ranked features incorporated into each parsimonious model.
n_features = [10, 15, 20]

# Tuned hyperparameters: read and parse once instead of on every loop iteration.
# The first 11 characters of `best_param` are dropped before evaluation
# (presumably an 'OrderedDict' prefix of the stored repr -- verify against the CSV).
# NOTE(security): eval executes whatever text the CSV contains; only use a trusted file.
params_dict = pd.read_csv('../../MODELS/BS_result_new.csv')
params_dict['params'] = params_dict['best_param'].map(lambda text: dict(eval(text[11:])))

results = []
for outcome in target_outcomes:
    # Labels for the outcome currently being modelled (1-D, as the estimators expect).
    y_train = crossValData[outcome]
    y_test = externalEvaluationData[outcome]
    for model in model_names:
        # Placeholder path: it must resolve to the full model fitted for this (outcome, model) pair.
        with open('ORIGINAL_MODEL', 'rb') as model_file:
            best_model = pickle.load(model_file)

        # Score the full model against the labels of *this* outcome, not always the 12-month labels.
        AUC, AUPRC = summariseResult(X_externalVal, y_test, best_model)
        results.append([outcome, model, 'original_model', AUC, AUPRC])

        params = params_dict[(params_dict['outcome']==outcome)&(params_dict['model']==model)]['params'].tolist()[0]

        for n in n_features:
            # Select the n most important features of the full model and retrain on that subset.
            topnfeatures = _top_n_features(best_model, model, X_cross.columns, n)
            trained_model = _build_model(model, params, y_train)
            trained_model.fit(crossValData[topnfeatures], y_train)

            AUC, AUPRC = summariseResult(externalEvaluationData[topnfeatures], y_test, trained_model)
            results.append([outcome, model, n, AUC, AUPRC])

            # Placeholder path: the real filename should encode outcome, model and n,
            # otherwise each iteration overwrites the previously saved model.
            with open('SAVED-MODEL-nFEATURES', 'wb') as model_file:
                pickle.dump(trained_model, model_file)


In [37]:
# Tabulate the collected evaluation rows. 'n_features' holds either the string
# 'original_model' (full model) or the number of retained features.
result_columns = ['outcome', 'model', 'n_features', 'AUC', 'AUPRC']
results = pd.DataFrame(results, columns=result_columns)


In [None]:
# Show only the rows belonging to the 12-month outcome.
results.loc[results['outcome'] == 'outcome_12months']