# ANALYSIS

In [None]:
# Uncomment the line below to install the imbalanced-learn (imblearn) package
# !pip install imbalanced-learn

In [2]:
# !pip install scikit-optimize
# !pip install shap

In [1]:
import pandas as pd
import numpy as np
import sklearn

#statistics
from scipy.stats import chi2_contingency, ttest_ind

# import cudf #gpu-powered DataFrame (Pandas alternative)

#imbalance handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, RepeatedEditedNearestNeighbours
from imblearn.pipeline import Pipeline

#preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler

#hyperparameter search
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

#internal validation
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, PredefinedSplit, RandomizedSearchCV


#performance metrices
from sklearn.metrics import make_scorer, confusion_matrix, classification_report, f1_score, balanced_accuracy_score, r2_score, auc, average_precision_score, roc_auc_score, recall_score, roc_curve, accuracy_score

#Models selection
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC #CPU SVM; overridden by the cuml import below when that line is enabled
import xgboost as xgb
# from cuml.svm import SVC #gpu-powered SVM

import cupy as cp

#save and load trained model
import pickle

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import os

random_state = 42

In [None]:
# Data loader
# features = pd.read_csv("../FinalData/cleaned_features_11072023.csv")
# The pickle holds four data splits, unpacked in the order below; the context manager closes
# the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code while deserialising - only load files produced by this project.
with open('../Clean_data/dataset_scaled_2vs1_25102024.sav', 'rb') as dataset_file:
    gridSearchData, crossValData, internalEvaluationData, externalEvaluationData = pickle.load(dataset_file)
outcomes = pd.read_csv("../Clean_data/cleaned_outcomes_24102024.csv")
# features = features[features.columns[1:]]
# outcomes = outcomes[outcomes.columns[1:]]

In [None]:
#check vars: display every column of the grid-search split from position 60 onwards
gridSearchData.iloc[:,60:]

In [None]:
# Print the shape of the grid-search split and of the outcomes table.
print(gridSearchData.shape)
print(outcomes.shape)

In [None]:
# Raise the display limit to 287 rows so the per-column count below is not truncated,
# then show the number of missing values in each column of the grid-search split.
pd.set_option("display.max_rows", 287)
gridSearchData.isna().sum()

In [None]:
#Define feature candidates
# Every column of the grid-search split is a candidate unless it is listed in exclude_columns.

features_columns = gridSearchData.columns.to_list()
exclude_columns = [
    # identifiers
    'patid', 'practice_id', 'set',
    # use the categorical BMI variable instead
    'BMI',
    # use ethnic_group instead
    'ethnicity',
    # all zero
    'Spacer',

    # outcome variables
    # NOTE(review): there is no 'outcome_combined_21months' entry - confirm that column does not
    # exist in the data, otherwise it stays in the feature set.
    'outcome_3months', 'outcome_6months', 'outcome_9months', 'outcome_12months', 'outcome_15months', 'outcome_18months',
    'outcome_21months', 'outcome_24months', 'outcome_combined_6months', 'outcome_combined_9months', 'outcome_combined_12months',
    'outcome_combined_15months', 'outcome_combined_18months', 'outcome_combined_24months', '3months', '6months', '12months', '24months',

    # location related variables, use IMD decile only
    'postcode_district', 'County', 'LocalAuthority', 'OutputAreaClassification',

    # categorised versions, use the continuous variables instead
    'age_cat', 'ICS_medication_possesion_ratio_cat', 'numOCS_cat', 'numOCSEvents_cat', 'numOCSwithLRTI_cat', 'numAcuteRespEvents_cat',
    'numAntibioticsEvents_cat', 'numAntibioticswithLRTI_cat', 'numAsthmaAttacks_cat', 'numHospEvents_cat', 'numPCS_cat', 'numPCSAsthma_cat',
    'numAsthmaManagement_cat', 'numAsthmaReview_cat', 'numAsthmaMedReview_cat', 'numAsthmaReviewRCP_cat', 'average_daily_dose_ICS_cat',
    'prescribed_daily_dose_ICS_cat',

    # count variables, use the binary ones
    'count_rhinitis', 'count_cardiovascular', 'count_heartfailure',
    'count_psoriasis', 'count_anaphylaxis', 'count_diabetes', 'count_ihd',
    'count_anxiety', 'count_eczema', 'count_nasalpolyps',
    'count_paracetamol', 'count_nsaids', 'count_betablocker',

    # no data in evaluation
    'paracetamol', 'nsaids', 'betablocker',
]
# exclude_columns = exclude_columns + [x for x in features_columns if '_count' in x] #filter out comorbidity count variables
excluded_lookup = set(exclude_columns)
features_columns = [column for column in features_columns if column not in excluded_lookup]
print('Features size: ', len(features_columns))
print(features_columns)

# GRID SEARCH

In [None]:
def print_dataframe(filtered_cv_results):
    """Print one line per candidate with its sensitivity, specificity, AUC and parameters.

    Parameters
    ----------
    filtered_cv_results : mapping of column name -> sequence (e.g. a DataFrame)
        Must provide the ``mean_test_*`` / ``std_test_*`` columns for
        ``sensitivity``, ``specificity`` and ``auc``, plus ``params``.

    A blank line is printed after the last candidate (also when there are none).
    """
    wanted_columns = (
        "mean_test_sensitivity",
        "std_test_sensitivity",
        "mean_test_specificity",
        "std_test_specificity",
        "mean_test_auc",
        "std_test_auc",
        "params",
    )
    per_candidate = zip(*(filtered_cv_results[column] for column in wanted_columns))

    for (
        mean_sensitivity,
        std_sensitivity,
        mean_specificity,
        std_specificity,
        mean_auc,
        std_auc,
        params,
    ) in per_candidate:
        summary_line = (
            f"sensitivity: {mean_sensitivity:0.4f} (±{std_sensitivity:0.03f}),"
            f" specificity: {mean_specificity:0.4f} (±{std_specificity:0.03f}),"
            f"auc: {mean_auc:0.4f} (±{std_auc:0.03f}),"
            f" for {params}"
        )
        print(summary_line)
    print()


def refit_strategy(cv_results):
    """Choose which candidate of a multi-metric search should be refitted.

    Selection steps:

    1. Keep the candidates whose mean test sensitivity is above 0.5.
    2. Among those, keep the candidates whose mean test AUC is within one
       standard deviation of the best one (the standard deviation is taken
       over the kept candidates' mean AUCs; ties with the threshold are kept).
    3. If several candidates remain, return the one with the lowest mean
       scoring time; if exactly one remains, return it.
    4. If no candidate passes step 1 (or step 2 leaves nothing), fall back to
       the candidate with the highest mean test AUC overall.

    Parameters
    ----------
    cv_results : dict of numpy (masked) ndarrays
        CV results as returned by the search object (``cv_results_``). It must
        contain the ``sensitivity``, ``specificity`` and ``auc`` test metrics.

    Returns
    -------
    best_index : int
        The index of the selected candidate as it appears in `cv_results`.
    """
    sensitivity_threshold = 0.5

    # print the info about the grid-search for the different scores
    cv_results_ = pd.DataFrame(cv_results)
    print("All grid-search results:")
    print_dataframe(cv_results_)

    # Filter-out all results below the threshold
    high_sensitivity_cv_results = cv_results_[
        cv_results_["mean_test_sensitivity"] > sensitivity_threshold
    ]

    print(f"Models with a sensitivity higher than {sensitivity_threshold}:")
    print_dataframe(high_sensitivity_cv_results)

    if high_sensitivity_cv_results.empty:
        print('no parameter achieve the threshold, so return the default best score')
        return cv_results_["mean_test_auc"].idxmax()

    high_sensitivity_cv_results = high_sensitivity_cv_results[
        [
            "mean_score_time",
            "mean_test_sensitivity",
            "std_test_sensitivity",
            "mean_test_specificity",
            "std_test_specificity",
            "mean_test_auc",
            "std_test_auc",
            "rank_test_sensitivity",
            "rank_test_specificity",
            "rank_test_auc",
            "params",
        ]
    ]

    # Select the most performant models in terms of AUC (within 1 sigma from the best).
    # With a single remaining candidate the sample standard deviation is NaN; treat it as 0
    # so that the candidate itself still passes the threshold.
    best_auc_std = high_sensitivity_cv_results["mean_test_auc"].std()
    if pd.isna(best_auc_std):
        best_auc_std = 0.0
    best_auc = high_sensitivity_cv_results["mean_test_auc"].max()
    best_auc_threshold = best_auc - best_auc_std

    # ">=" keeps the best candidate even when every remaining AUC is identical (std == 0).
    high_auc_cv_results = high_sensitivity_cv_results[
        high_sensitivity_cv_results["mean_test_auc"] >= best_auc_threshold
    ]

    if high_auc_cv_results.empty:
        # Only reachable when the AUC values are not comparable (e.g. all NaN).
        print('no parameter achieve the threshold, so return the default best score')
        return cv_results_["mean_test_auc"].idxmax()

    if high_auc_cv_results.shape[0] == 1:
        # Exactly one candidate satisfies both filters: that is the one to refit.
        only_candidate_index = high_auc_cv_results.index[0]
        print(
            "Only one high sensitivity model is within one standard deviation of the\n"
            "highest auc model, so it is selected:\n\n"
            f"{high_auc_cv_results.loc[only_candidate_index]}"
        )
        return only_candidate_index

    print(
        "Out of the previously selected high sensitivity models, we keep all the\n"
        "the models within one standard deviation of the highest auc model:"
    )

    print(best_auc_threshold)
    print_dataframe(high_auc_cv_results)

    # From the best candidates, select the fastest model to predict
    fastest_top_auc_high_sensitivity_index = high_auc_cv_results[
        "mean_score_time"
    ].idxmin()

    print(
        "\nThe selected final model is the fastest to predict out of the previously\n"
        "selected subset of best models based on sensitivity and auc.\n"
        "Its scoring time is:\n\n"
        f"{high_auc_cv_results.loc[fastest_top_auc_high_sensitivity_index]}"
    )

    return fastest_top_auc_high_sensitivity_index

In [None]:
# LR = ['solver', 'C', 'max_iter']
# Lasso = ['solver', 'C', 'max_iter']
# Elastic= ['l1_ratio', 'max_iter']
# GNB = ['var_smoothing']
# SVM = ['C', 'gamma']
# DT = ['criterion', 'splitter', 'max_depth']
# RF = ['criterion', 'n_estimators', 'max_depth']
# XGB = ['n_estimators', 'learning_rate', 'reg_alpha', 'reg_lambda']


In [None]:
%%time

#Bayesian continuous search
#
# Tunes an RBF-kernel SVM for each target outcome with skopt's gp_minimize. The objective is
# 1 - mean cross-validated ROC AUC, so minimising it maximises AUC.
# NOTE(review): `trainingData` is not defined in the cells visible in this notebook - confirm
# which data split should be used before running this cell.
# NOTE(review): the commented-out model sections in this cell still pass
# make_scorer(roc_auc_score); switch them to scoring='roc_auc' when re-enabling them.

X = trainingData[features_columns]
# Named target_outcomes so that the `outcomes` DataFrame loaded by the data-loader cell is not overwritten.
target_outcomes = [
            # 'outcome_3months', 
            'outcome_combined_6months', 
            # 'outcome_combined_12months', 
            # 'outcome_combined_24months',
           ] 
cv = StratifiedKFold(n_splits=3)
n_calls = 10
n_jobs = 2

output = []

for target_outcome in target_outcomes:
    print('######################################################################################################')
    print(target_outcome)
    y = trainingData[target_outcome]
    # negative/positive class ratio (only used by the commented-out XGB section)
    class_counts = y.value_counts()
    scale_pos_ratio = class_counts[0]/class_counts[1]
    
#     if target_outcome == 'outcome_combined_24months':   
#     ##############################################################################
#         print('#LR')
#         lr_model = LogisticRegression(class_weight='balanced', random_state=1234)
#         lr_params = [Categorical(['liblinear', 'newton-cholesky'], name = 'solver'),
#                      Real(0.1, 10, 'log-uniform', name='C'), 
#                      Integer(50, 200, 'uniform', name='max_iter')]

#         @use_named_args(lr_params)
#         def lr_objective(**params):
#             lr_model.set_params(**params)

#             return 1-np.mean(cross_val_score(lr_model, X, y, cv=cv,
#                                             scoring=make_scorer(roc_auc_score)))

#         res_gp_lr = gp_minimize(lr_objective, lr_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
#         output.append([target_outcome, 'lr', 1-res_gp_lr.fun, res_gp_lr.x])

#     ########################################################################################

#         print('#Lasso')
#         lasso_model = LogisticRegression(class_weight='balanced', penalty='l1', random_state=1234) #only the LIBLINEAR and SAGA (added in v0.19) solvers handle the L1 penalty
#         lasso_params = [Categorical(['saga', 'liblinear'], name = 'solver'),
#                           Real(0.1, 10, 'log-uniform', name='C'),
#                           Integer(50, 200, 'uniform', name='max_iter')]

#         @use_named_args(lasso_params)
#         def lasso_objective(**params):
#             lasso_model.set_params(**params)

#             return 1-np.mean(cross_val_score(lasso_model, X, y, cv=cv,
#                                             scoring=make_scorer(roc_auc_score)))

#         res_gp_lasso = gp_minimize(lasso_objective, lasso_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
#         output.append([target_outcome, 'lasso', 1-res_gp_lasso.fun, res_gp_lasso.x])

#     ########################################################################################

#         print('#Elastic')
#         elastic_model = LogisticRegression(class_weight='balanced', penalty = 'elasticnet', solver = 'saga', random_state=1234)
#         elastic_params = [Real(0.1, 1, 'log-uniform', name='l1_ratio'),
#                           Integer(300, 800, 'uniform', name='max_iter')]

#         @use_named_args(elastic_params)
#         def elastic_objective(**params):
#             elastic_model.set_params(**params)

#             return 1-np.mean(cross_val_score(elastic_model, X, y, cv=cv,
#                                             scoring=make_scorer(roc_auc_score)))

#         res_gp_elastic = gp_minimize(elastic_objective, elastic_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
#         output.append([target_outcome, 'elastic', 1-res_gp_elastic.fun, res_gp_elastic.x])

#     ########################################################################################

#         print('#NB')
#         gnb_model = GaussianNB()
#         gnb_params = [Real(1e-9, 1e-5, 'log-uniform', name='var_smoothing')]

#         @use_named_args(gnb_params)
#         def gnb_objective(**params):
#             gnb_model.set_params(**params)

#             return 1-np.mean(cross_val_score(gnb_model, X, y, cv=cv,
#                                             scoring = make_scorer(roc_auc_score)))

#         res_gp_gnb = gp_minimize(gnb_objective, gnb_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
#         output.append([target_outcome, 'gnb', 1-res_gp_gnb.fun, res_gp_gnb.x])

    ########################################################################################

    print('#SVM')
    svc_model = SVC(class_weight='balanced', kernel='rbf', cache_size=1000, random_state=1234)
    svm_params = [Real(0.1, 100, "log-uniform", name='C'),
                     Real(0.1, 100, "log-uniform", name='gamma')]

    @use_named_args(svm_params)
    def svm_objective(**params):
        """Return 1 - mean cross-validated ROC AUC of the SVM for the sampled C and gamma."""
        svc_model.set_params(**params)

        # The built-in 'roc_auc' scorer ranks the continuous decision scores.
        # make_scorer(roc_auc_score) would score the hard 0/1 predictions instead, which is not a
        # real ROC AUC and disagrees with the scoring='roc_auc' used by the BayesSearchCV cell.
        return 1-np.mean(cross_val_score(svc_model, X, y, cv=cv,
                                        scoring='roc_auc', verbose=3))

    res_gp_svm = gp_minimize(svm_objective, svm_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
    # res_gp_svm.fun is the minimised objective, so 1 - fun is the best mean AUC found
    output.append([target_outcome, 'svm', 1-res_gp_svm.fun, res_gp_svm.x])

    ########################################################################################
#         print('#DT')
#         dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=1234)
#         dt_params = [Categorical(["gini", "entropy", "log_loss"],name='criterion'),
#                      Categorical(['best', 'random'],name='splitter'),
#                      Integer(3, 10, "uniform", name='max_depth'),]

#         @use_named_args(dt_params)
#         def dt_objective(**params):
#             scoring = {
#                 'auc': make_scorer(roc_auc_score)
#                 }
#             dt_model.set_params(**params)

#             return 1 - np.mean(cross_val_score(dt_model, X, y, cv=cv,
#                                             scoring=make_scorer(roc_auc_score)))
#         res_gp_dt = gp_minimize(dt_objective, dt_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
#         output.append([target_outcome, 'dt', 1-res_gp_dt.fun, res_gp_dt.x])
    
# ##########################################################################################

#     print('#RF')
#     rf_model = RandomForestClassifier(class_weight='balanced', random_state=1234)
#     rf_params = [Categorical(["gini", "entropy", "log_loss"],name='criterion'),
#                  Integer(100, 500, "uniform", name='n_estimators'),
#                  Integer(3, 10, "uniform", name='max_depth'),]

#     @use_named_args(rf_params)
#     def rf_objective(**params):
#         scoring = {
#             'auc': make_scorer(roc_auc_score)
#             }
#         rf_model.set_params(**params)

#         return 1 - np.mean(cross_val_score(rf_model, X, y, cv=cv,
#                                         scoring=make_scorer(roc_auc_score)))
#     res_gp_rf = gp_minimize(rf_objective, rf_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
#     output.append([target_outcome, 'rf', 1-res_gp_rf.fun, res_gp_rf.x])
    
# ##########################################################################################

#     print('#XGB')
#     xgb_model = xgb.XGBClassifier(objective ='binary:logistic', tree_method='gpu_hist', gpu_id=0,  verbosity = 0,
#                                          importance_type = 'gain', scale_pos_weight = scale_pos_ratio, random_state=1234)
#     xgb_params = [Integer(100,500,"uniform", name='n_estimators'),
#                     Integer(3, 10, "uniform", name='max_depth'),
#                      Real(1e-5, 1e-1, 'log-uniform', name='learning_rate'),
#                      Real(1e-5, 1e-1, 'log-uniform', name='reg_alpha'),
#                      Real(1e-5, 1e-1, 'log-uniform', name='reg_lambda'),]
#     @use_named_args(xgb_params)
#     def xgb_objective(**params):
#         xgb_model.set_params(**params)

#         return 1 - np.mean(cross_val_score(xgb_model, X, y, cv=cv,
#                                         scoring=make_scorer(roc_auc_score)))
#     res_gp_xgb = gp_minimize(xgb_objective, xgb_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
#     output.append([target_outcome, 'xgb', 1-res_gp_xgb.fun, res_gp_xgb.x])

########################################################################################

In [None]:
# Tabulate one row per tuned model: outcome, model label, best score and best parameters.
pd.DataFrame(output, columns=['outcome', 'model', 'best_score', 'best_param'])
# pd.DataFrame(output, columns=['outcome', 'model', 'best_score', 'best_param']).to_csv('../Models/BS_result_svm6.csv', index = False, index_label=False)

# GRIDSEARCH HERE

In [None]:
%%time

#GRID SEARCH
# Hyper-parameter search with skopt's BayesSearchCV (2-fold CV, ROC AUC) for every model listed
# in models_to_be_trained, repeated for each target outcome. The best score and parameters are
# collected in `output`, and the full cv_results_ of each search is pickled under ../MODELS/BS/.
X = gridSearchData[features_columns]
# X = cudf.DataFrame(X)
# Named target_outcomes so that the `outcomes` DataFrame loaded by the data-loader cell is not overwritten.
target_outcomes = ['outcome_12months',
                   # 'outcome_3months', 
                   # 'outcome_6months', 
                   # 'outcome_9months', 
                  ] 
model_names = ['LR', 'Lasso', 'ElasticNet',  'DT', 'RF', 'XGB']

# Smallest scale_pos_weight offered to the optimiser: the search range is centred on the observed
# class ratio, and without a floor "ratio - 2" reaches zero or below when the ratio is <= 2.
min_scale_pos_weight = 0.1

output = []
for outcome in target_outcomes:
    print(outcome)
    y = gridSearchData[outcome]
    # negative/positive class ratio; assumes the outcome is coded 0/1
    class_counts = y.value_counts()
    scale_pos_ratio = class_counts[0]/class_counts[1]
    
    #MODELS
    lr_model = LogisticRegression(class_weight='balanced', random_state=random_state)
    lasso_model = LogisticRegression(class_weight='balanced', penalty='l1', random_state=random_state) #only the LIBLINEAR and SAGA (added in v0.19) solvers handle the L1 penalty
    elastic_model = LogisticRegression(solver='saga', class_weight='balanced', penalty = 'elasticnet', random_state=random_state)
    dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=random_state)
    rf_model = RandomForestClassifier(class_weight='balanced', random_state=random_state)
    xgb_model = xgb.XGBClassifier(objective ='binary:logistic', tree_method = "hist", device = "cuda", verbosity = 3,
                                     importance_type = 'gain', random_state=random_state)

    #PARAMS
    lr_params = {'solver': ['liblinear', 'newton-cholesky'],
                 'C': [0.1, 1.0, 10.0],
                 'max_iter': [80, 100, 120]}
    
    lasso_params = {'solver': ['saga', 'liblinear'],
                    'C': [0.1, 1, 10],
                    'max_iter': [80, 100, 120]}
    
    elastic_params = {'l1_ratio': Real(0.1, 1, 'uniform'),
                      'max_iter': [80, 100, 120]}
    
    dt_params = {'criterion':["gini", "entropy"],
                 'splitter': ['best', 'random'],
                'max_depth': Integer(2,100,"uniform")}
    
    rf_params = {'criterion':["gini", "entropy"],
                 'n_estimators': Integer(100,1000,"uniform"),
                'max_depth': Integer(2,100,"uniform"),
                'min_samples_split': Integer(2,10,"uniform"),
                'min_samples_leaf': Integer(2,100,"uniform"),
                'max_features': ['sqrt', 'log2'],
                'bootstrap': [True, False]}
    
    scale_pos_weight_low = max(scale_pos_ratio-2, min_scale_pos_weight)
    xgb_params = {'n_estimators': Integer(100,1000,"uniform"),
                'max_depth': Integer(2,100,"uniform"),
                 'learning_rate': Real(1e-3, 3e-1, 'log-uniform'),
                 'reg_alpha': Real(0.1, 10, 'log-uniform'),
                 'reg_lambda': Real(0.1, 10, 'log-uniform'),
                 'subsample': Real(0.5, 1, 'uniform'),
                 'colsample_bytree': Real(0.5, 1, 'uniform'),
                 'scale_pos_weight': Real(scale_pos_weight_low, scale_pos_ratio+2, 'uniform')}

    #Models and params in DICT (uncomment an entry to include that model in the search)
    models_to_be_trained = [
        # {'model_name': 'LR', 'model': lr_model, 'params': lr_params},
        # {'model_name': 'Lasso', 'model': lasso_model, 'params': lasso_params},
        # {'model_name': 'ElasticNet', 'model': elastic_model, 'params': elastic_params},
        # {'model_name': 'DT', 'model': dt_model, 'params': dt_params},
        # {'model_name': 'RF', 'model': rf_model, 'params': rf_params},
        {'model_name': 'XGB', 'model': xgb_model, 'params': xgb_params}
    ]
    
    #scoring
    # The search uses the built-in 'roc_auc' scorer, which ranks predicted scores; a scorer built
    # with make_scorer(roc_auc_score) would rank hard class predictions instead.
    
    for item in models_to_be_trained:
        print(item['model_name'])
        gs = BayesSearchCV(item['model'],
                          search_spaces=item['params'],
                          scoring='roc_auc',
                           n_iter = 50,
                          cv=2,
                          verbose=3, 
                           n_jobs=4,
                           n_points=50,
                            random_state = random_state)
        # if item['model_name']=='XGB':
        #     X_xgb = cp.array(X)
        #     y_xgb = cp.array(y)
        #     gs.fit(X_xgb.get(), y_xgb.get())
        gs.fit(X, y)
        output.append([outcome, item['model_name'], gs.best_score_, gs.best_params_])
        # File name is "<last part of the outcome name>_<model>.sav", e.g. "12months_XGB.sav";
        # the context manager makes sure the file is flushed and closed.
        results_path = '../MODELS/BS/' + outcome.split('_')[-1] + '_' + item['model_name'] + '.sav'
        with open(results_path, 'wb') as results_file:
            pickle.dump(gs.cv_results_, results_file)


In [None]:
# Tabulate one row per searched model: outcome, model label, best score and best parameters.
pd.DataFrame(output, columns=['outcome', 'model', 'best_score', 'best_param'])
# pd.DataFrame(output, columns=['outcome', 'model', 'best_score', 'best_param']).to_csv('../Models/BS_result_svm6.csv', index = False, index_label=False)

# END

In [None]:
%%time

# Select the features and the 12-month combined outcome, then convert X to a cudf DataFrame.
# NOTE(review): `trainingData` is not defined in the cells visible in this notebook, and the
# `cudf` import at the top is commented out - confirm both before running this cell.
X = trainingData[features_columns]
y = trainingData['outcome_combined_12months']
X = cudf.DataFrame(X)
# y = cudf.DataFrame(y)





In [None]:
%%time

# svc_model = SVC(class_weight='balanced', kernel='poly', random_state=1234)
# svm_params={'C': [0.1, 1, 10], 'gamma': [1,10]}
# gs = GridSearchCV(svc_model,
#                   param_grid=svm_params,
#                   scoring=['average_precision', 'balanced_accuracy', 'roc_auc'],
#                   refit='roc_auc',
#                   cv=3,
#                   verbose=3,)
# gs.fit(X, y)

In [None]:
%%time

# Fit one RBF-kernel SVC (C=10, gamma=10, balanced class weights) on the X and y currently in the kernel.
svc_model = SVC(class_weight='balanced', C = 10, gamma= 10, kernel='rbf', cache_size= 2000, random_state=1234, verbose=3)
svc_model.fit(X, y)

In [None]:
# Predict on the same X the model was fitted on, so these are training-set predictions.
preds = svc_model.predict(X)

In [None]:
# Display the predicted labels.
preds

In [None]:
# classification_report returns a multi-line string; print it so the table is rendered with
# real line breaks instead of the escaped string repr a bare expression would display.
print(classification_report(y, preds))

In [None]:
# NOTE(review): the search cell above appends 4-item rows [outcome, model name, best score,
# best params] to `output`, so element [1] is a model-name string and row 5 only exists when
# six models are searched - this looks like a leftover from an earlier `output` layout; confirm.
output[5][1].keys()

In [None]:
# Best cross-validated score of the most recently fitted search object `gs`.
gs.best_score_

In [None]:
# NOTE(review): expects output[5][1] to be a cv_results_-style dict with a
# 'rank_test_balanced_accuracy' entry; the search cell above stores a model-name string in
# position [1] and scores only ROC AUC - presumably a leftover from an earlier run; confirm.
output[5][1]['params'][output[5][1]['rank_test_balanced_accuracy'][0]]

In [None]:
# NOTE(review): expects output[5][1] to be a cv_results_-style dict with a
# 'rank_test_average_precision' entry; the search cell above stores a model-name string in
# position [1] and scores only ROC AUC - presumably a leftover from an earlier run; confirm.
output[5][1]['params'][output[5][1]['rank_test_average_precision'][0]]

In [None]:
# Each row of `output` holds four items (outcome, model name, best score, best parameters), so
# four column names are required; passing only two makes the DataFrame constructor raise.
pd.DataFrame(output, columns=['outcome', 'model', 'best_score', 'best_param']).to_csv('../Models/GS_result.csv', index_label=False, index=False)

In [None]:
# Display the last model/parameter entry processed by the search loop.
item

In [None]:
#Define the number of splits (folds) used by the stratified k-fold cells below

n_splits = 10

In [None]:
#Create X set for model development
# NOTE(review): `trainingData` is not defined in the cells visible in this notebook - confirm its source.

target_outcome = 'outcome_3months'
y = trainingData[[target_outcome]]  # double brackets keep y as a single-column DataFrame
X = trainingData[features_columns]
print('X shape: ', X.shape)
print('y shape: ', y.shape)

#model parameters (passed to build_models by the training cell)
params = dict(xgb_lr=0.6, xgb_maxdepth=7)

In [None]:
%%time

#EXECUTE model training
# Stratified k-fold: build_models trains and saves the models of each fold, then every saved
# model is reloaded and scored on the held-out rows of the fold it was trained in.
# (Scoring all models on the last fold's test rows would evaluate most of them on data they
# were trained on.)

kf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
summary_result1 = []
cols = ['model_name', 'class_ratio', 'acc','spec','sens','auc', 'auprc', 'balance_accuracy', 'f1_score', 'ppv', 'npv']
split_counter = 0
fold_model_frames = []   # one (modelname, class_ratio) frame per fold
held_out_indices = []    # test row positions, repeated once per model built in that fold

#train model
for train_index, test_index in kf.split(X, y):
    #split data
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]

    #Build models; split_counter tells build_models which fold it is training
    models_temp = pd.DataFrame(build_models(X_train, y_train[target_outcome], params, split_counter), columns=['modelname', 'class_ratio'])
    fold_model_frames.append(models_temp)
    held_out_indices.extend([test_index] * len(models_temp))
    split_counter+=1

# concatenate once instead of growing the frame inside the loop
models1 = pd.concat(fold_model_frames, ignore_index=True)

#evaluate model
for (modelname, classratio), test_index in zip(models1.values, held_out_indices):
    print(modelname)
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    # NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
    with open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'rb') as model_file:
        model = pickle.load(model_file)
    summary_result1.append((str(model), classratio, ) + summariseResult (X_test, y_test[target_outcome], model) )


summary_result1 = pd.DataFrame(summary_result1, columns=cols)
summary_result1['model_num'] = summary_result1.index



In [None]:
print(target_outcome)
# Replace each stored model description with the label returned by modelNameFixer
# (helper not defined in the cells visible here), then average the metrics per label
# and sort by AUC, best first.
summary_result1['model_name'] = summary_result1.apply(lambda x: modelNameFixer(x.model_name), axis=1)
summary_result1.groupby('model_name').mean().sort_values(['auc'], ascending=False)

In [None]:
# Save the per-model results to CSV, reload them, and draw one AUC bar per model name
# with the bar height written above each bar.
summary_result1.to_csv("summaryResult_outcome1.csv")
summary_result1 = pd.read_csv("summaryResult_outcome1.csv")

bar = sns.catplot(
    data=summary_result1,
    x="model_name",
    y="auc",
    kind="bar",
    height=5,
    aspect=5/2.5,
    ci=None,
)
ax = bar.facet_axis(0,0)
for p in ax.patches:
    bar_height = p.get_height()
    # label sits slightly right of the bar's left edge and just above its top
    ax.text(
        p.get_x() + 0.01,
        bar_height * 1.01,
        '{0:.4f}'.format(bar_height),
        color='black',
        rotation='horizontal',
        fontsize=11,
    )

# listOf_Yticks = np.arange(0.5, 0.7, 0.05)
ax.set_ylim(0.4, 1)
ax.set_ylabel('AUC Score', fontsize=11)
ax.set_xlabel('Method', fontsize=11)

In [None]:
# kf = StratifiedKFold(n_splits=2, random_state=1234, shuffle=True)
# kf.get_n_splits(X)
# for train_index, test_index in kf.split(X, y):
#     #split data
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#     trymodel = SVC(class_weight='balanced', C = 0.7, degree=2, kernel='poly', random_state=1234, cache_size=2048)
#     trymodel.fit(X_train,y_train)
#     print(summariseResult(X_test, y_test, trymodel))


In [None]:
# Ten largest feature importances of the saved decision tree '0DTModel' for outcome_3months.
# NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
with open('./models/outcome_3months/0DTModel.sav', 'rb') as model_file:
    best_model1 = pickle.load(model_file)

# argsort is ascending, so the last ten positions are the ten most important features
sorted_idx = best_model1.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model1.feature_importances_[sorted_idx][-10:])
plt.xlabel("Decision Tree Feature Importance")
plt.show()

In [None]:
# Ten largest feature importances of the saved random forest '0RFModel' for outcome_3months.
# NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
with open('./models/outcome_3months/0RFModel.sav', 'rb') as model_file:
    best_model1 = pickle.load(model_file)

# argsort is ascending, so the last ten positions are the ten most important features
sorted_idx = best_model1.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model1.feature_importances_[sorted_idx][-10:])
plt.xlabel("Random Forest Feature Importance")
plt.show()

In [None]:
# Ten largest feature importances of the saved XGBoost model '0XGBoostModel' for outcome_3months.
# NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
with open('./models/outcome_3months/0XGBoostModel.sav', 'rb') as model_file:
    best_model1 = pickle.load(model_file)

# argsort is ascending, so the last ten positions are the ten most important features
sorted_idx = best_model1.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model1.feature_importances_[sorted_idx][-10:])
plt.xlabel("XGBoost Feature Importance")
plt.show()

# 6months

In [None]:
# Switch the target to the 6-month combined outcome; X is reused from the earlier setup cell.
# NOTE(review): `trainingData` is not defined in the cells visible in this notebook - confirm its source.
target_outcome = 'outcome_combined_6months'
y = trainingData[[target_outcome]]  # double brackets keep y as a single-column DataFrame

#model parameters (passed to build_models by the training cell)
params = dict(xgb_lr=0.6, xgb_maxdepth=7)

In [None]:
%%time

#EXECUTE model training
# Stratified k-fold: build_models trains and saves the models of each fold, then every saved
# model is reloaded and scored on the held-out rows of the fold it was trained in.
# (Scoring all models on the last fold's test rows would evaluate most of them on data they
# were trained on.)

kf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
summary_result2 = []
cols = ['model_name', 'class_ratio', 'acc','spec','sens','auc', 'auprc', 'balance_accuracy', 'f1_score', 'ppv', 'npv']
split_counter = 0
fold_model_frames = []   # one (modelname, class_ratio) frame per fold
held_out_indices = []    # test row positions, repeated once per model built in that fold

#train model
for train_index, test_index in kf.split(X, y):
    #split data
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]

    #Build models; split_counter tells build_models which fold it is training
    models_temp = pd.DataFrame(build_models(X_train, y_train[target_outcome], params, split_counter), columns=['modelname', 'class_ratio'])
    fold_model_frames.append(models_temp)
    held_out_indices.extend([test_index] * len(models_temp))
    split_counter+=1

# concatenate once instead of growing the frame inside the loop
models2 = pd.concat(fold_model_frames, ignore_index=True)

#evaluate model
for (modelname, classratio), test_index in zip(models2.values, held_out_indices):
    print(modelname)
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    # NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
    with open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'rb') as model_file:
        model = pickle.load(model_file)
    summary_result2.append((str(model), classratio, ) + summariseResult (X_test, y_test[target_outcome], model) )


summary_result2 = pd.DataFrame(summary_result2, columns=cols)
summary_result2['model_num'] = summary_result2.index


In [None]:
print(target_outcome)
# Replace each stored model description with the label returned by modelNameFixer
# (helper not defined in the cells visible here), then average the metrics per label
# and sort by AUC, best first.
summary_result2['model_name'] = summary_result2.apply(lambda x: modelNameFixer(x.model_name), axis=1)
summary_result2.groupby('model_name').mean().sort_values(['auc'], ascending=False)

In [None]:
# Save the per-model results to CSV, reload them, and draw one AUC bar per model name
# with the bar height written above each bar.
summary_result2.to_csv("summaryResult_outcome2.csv")
summary_result2 = pd.read_csv("summaryResult_outcome2.csv")

bar = sns.catplot(
    data=summary_result2,
    x="model_name",
    y="auc",
    kind="bar",
    height=5,
    aspect=5/2.5,
    ci=None,
)
ax = bar.facet_axis(0,0)
for p in ax.patches:
    bar_height = p.get_height()
    # label sits slightly right of the bar's left edge and just above its top
    ax.text(
        p.get_x() + 0.01,
        bar_height * 1.01,
        '{0:.4f}'.format(bar_height),
        color='black',
        rotation='horizontal',
        fontsize=11,
    )

# listOf_Yticks = np.arange(0.5, 0.7, 0.05)
ax.set_ylim(0.4, 1)
ax.set_ylabel('AUC Score', fontsize=11)
ax.set_xlabel('Method', fontsize=11)

In [None]:
# Ten largest feature importances of the saved decision tree '0DTModel' for outcome_combined_6months.
# NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
with open('./models/outcome_combined_6months/0DTModel.sav', 'rb') as model_file:
    best_model2 = pickle.load(model_file)

# argsort is ascending, so the last ten positions are the ten most important features
sorted_idx = best_model2.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model2.feature_importances_[sorted_idx][-10:])
plt.xlabel("Decision Tree Feature Importance")
plt.show()

In [None]:
# Ten largest feature importances of the saved random forest '0RFModel' for outcome_combined_6months.
# NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
with open('./models/outcome_combined_6months/0RFModel.sav', 'rb') as model_file:
    best_model2 = pickle.load(model_file)

# argsort is ascending, so the last ten positions are the ten most important features
sorted_idx = best_model2.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model2.feature_importances_[sorted_idx][-10:])
plt.xlabel("Random Forest Feature Importance")
plt.show()

In [None]:
# Ten largest feature importances of the saved XGBoost model '0XGBoostModel' for outcome_combined_6months.
# NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
with open('./models/outcome_combined_6months/0XGBoostModel.sav', 'rb') as model_file:
    best_model2 = pickle.load(model_file)

# argsort is ascending, so the last ten positions are the ten most important features
sorted_idx = best_model2.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model2.feature_importances_[sorted_idx][-10:])
plt.xlabel("XGBoost Feature Importance")
plt.show()

# 12 months

In [None]:
# Switch the target to the 12-month combined outcome; X is reused from the earlier setup cell.
# NOTE(review): `trainingData` is not defined in the cells visible in this notebook - confirm its source.
target_outcome = 'outcome_combined_12months'
y = trainingData[[target_outcome]]  # double brackets keep y as a single-column DataFrame

#model parameters (passed to build_models by the training cell)
params = dict(xgb_lr=0.6, xgb_maxdepth=10)

In [None]:
%%time

#EXECUTE model training
# Stratified k-fold: build_models trains and saves the models of each fold, then every saved
# model is reloaded and scored on the held-out rows of the fold it was trained in.
# (Scoring all models on the last fold's test rows would evaluate most of them on data they
# were trained on.)

kf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
summary_result3 = []
cols = ['model_name', 'class_ratio', 'acc','spec','sens','auc', 'auprc', 'balance_accuracy', 'f1_score', 'ppv', 'npv']
split_counter = 0
fold_model_frames = []   # one (modelname, class_ratio) frame per fold
held_out_indices = []    # test row positions, repeated once per model built in that fold

#train model
for train_index, test_index in kf.split(X, y):
    #split data
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]

    #Build models; split_counter tells build_models which fold it is training
    models_temp = pd.DataFrame(build_models(X_train, y_train[target_outcome], params, split_counter), columns=['modelname', 'class_ratio'])
    fold_model_frames.append(models_temp)
    held_out_indices.extend([test_index] * len(models_temp))
    split_counter+=1

# concatenate once instead of growing the frame inside the loop
models3 = pd.concat(fold_model_frames, ignore_index=True)

#evaluate model
for (modelname, classratio), test_index in zip(models3.values, held_out_indices):
    print(modelname)
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    # NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
    with open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'rb') as model_file:
        model = pickle.load(model_file)
    summary_result3.append((str(model), classratio, ) + summariseResult (X_test, y_test[target_outcome], model) )


summary_result3 = pd.DataFrame(summary_result3, columns=cols)
summary_result3['model_num'] = summary_result3.index


In [None]:
print(target_outcome)
# Replace each stored model description with the label returned by modelNameFixer
# (helper not defined in the cells visible here), then average the metrics per label
# and sort by AUC, best first.
summary_result3['model_name'] = summary_result3.apply(lambda x: modelNameFixer(x.model_name), axis=1)
summary_result3.groupby('model_name').mean().sort_values(['auc'], ascending=False)

In [None]:
# Save the per-model results to CSV, reload them, and draw one AUC bar per model name
# with the bar height written above each bar.
summary_result3.to_csv("summaryResult_outcome3.csv")
summary_result3 = pd.read_csv("summaryResult_outcome3.csv")

bar = sns.catplot(
    data=summary_result3,
    x="model_name",
    y="auc",
    kind="bar",
    height=5,
    aspect=5/2.5,
    ci=None,
)
ax = bar.facet_axis(0,0)
for p in ax.patches:
    bar_height = p.get_height()
    # label sits slightly right of the bar's left edge and just above its top
    ax.text(
        p.get_x() + 0.01,
        bar_height * 1.01,
        '{0:.4f}'.format(bar_height),
        color='black',
        rotation='horizontal',
        fontsize=11,
    )

# listOf_Yticks = np.arange(0.5, 0.7, 0.05)
ax.set_ylim(0.4, 1)
ax.set_ylabel('AUC Score', fontsize=11)
ax.set_xlabel('Method', fontsize=11)

In [None]:
# Ten largest feature importances of the saved decision tree '0DTModel' for outcome_combined_12months.
# NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
with open('./models/outcome_combined_12months/0DTModel.sav', 'rb') as model_file:
    best_model3 = pickle.load(model_file)

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
# argsort is ascending, so the last ten positions are the ten most important features
sorted_idx = best_model3.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model3.feature_importances_[sorted_idx][-10:])
plt.xlabel("Decision Tree Feature Importance")
plt.show()

In [None]:
# Ten largest feature importances of the saved random forest '0RFModel' for outcome_combined_12months.
# NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
with open('./models/outcome_combined_12months/0RFModel.sav', 'rb') as model_file:
    best_model3 = pickle.load(model_file)

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
# argsort is ascending, so the last ten positions are the ten most important features
sorted_idx = best_model3.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model3.feature_importances_[sorted_idx][-10:])
plt.xlabel("Random Forest Feature Importance")
plt.show()

In [None]:
# Ten largest feature importances of the saved XGBoost model '0XGBoostModel' for outcome_combined_12months.
# NOTE: pickle.load can execute arbitrary code; only load model files written by this project.
with open('./models/outcome_combined_12months/0XGBoostModel.sav', 'rb') as model_file:
    best_model3 = pickle.load(model_file)

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
# argsort is ascending, so the last ten positions are the ten most important features
sorted_idx = best_model3.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model3.feature_importances_[sorted_idx][-10:])
plt.xlabel("XGBoost Feature Importance")
plt.show()