# ANALYSIS

In [None]:
#uncomment this below code to install imblearn package
# !pip install imbalanced-learn

In [None]:
# !pip install scikit-optimize

In [1]:
import pandas as pd
import numpy as np
import sklearn

#statistics
from scipy.stats import chi2_contingency, ttest_ind

import cudf #gpu-powered DataFrame (Pandas alternative)

#imbalance handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, RepeatedEditedNearestNeighbours
from imblearn.pipeline import Pipeline

#preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

#hyperparameter search
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

#internal validation
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, PredefinedSplit, RandomizedSearchCV


#performance metrices
from sklearn.metrics import make_scorer, confusion_matrix, classification_report, f1_score, balanced_accuracy_score, r2_score, auc, average_precision_score, roc_auc_score, recall_score, roc_curve, accuracy_score

#Models selection
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from cuml.svm import SVC #gpu-powered SVM


#save and load trained model
import pickle

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import os

In [2]:
# Data loader
# features = pd.read_csv("../FinalData/cleaned_features_11072023.csv")
trainingData, validationData, internalEvaluationData, evaluationData, evaluationDataWales, evaluationDataScotland = pickle.load(open('../FinalData/dataset_scaled_01122023.sav', 'rb'))
outcomes = pd.read_csv("../FinalData/cleaned_outcomes_01122023.csv")
# features = features[features.columns[1:]]
# outcomes = outcomes[outcomes.columns[1:]]

In [3]:
print(trainingData.shape)
print(outcomes.shape)

(429177, 133)
(696659, 15)


In [4]:
trainingData.head()

Unnamed: 0,patid,practice_id,sex,age,BMI,ethnicity,CharlsonScore,count_rhinitis,count_cardiovascular,count_heartfailure,...,PEFStatus_60-80,PEFStatus_less than 60,PEFStatus_more than 80,PEFStatus_not_recorded,EosinophilLevel_high,EosinophilLevel_normal,EosinophilLevel_unknown,system_EMIS,system_SystemOne,system_Vision
0,48960901,617,0.0,51,21.612812,not_recorded,1.0,0.0,0.0,0.0,...,0,0,0,1,0,0,1,0,1,0
1,47093498,659,1.0,46,24.220227,not_recorded,0.0,0.0,3.0,0.0,...,0,0,0,1,1,0,0,1,0,0
2,62650247,1100,1.0,66,30.865052,White British,0.0,0.0,7.0,0.0,...,0,0,0,1,0,1,0,1,0,0
3,60253483,957,1.0,10,22.956841,not_recorded,1.0,1.0,0.0,0.0,...,0,0,0,1,0,1,0,1,0,0
4,46565933,114,1.0,70,25.20611,not_recorded,4.0,1.0,0.0,0.0,...,0,0,0,1,0,1,0,0,0,1


In [5]:
outcomes.head()

Unnamed: 0,patid,outcome_3months,outcome_6months,outcome_9months,outcome_12months,outcome_15months,outcome_18months,outcome_21months,outcome_24months,outcome_combined_6months,outcome_combined_9months,outcome_combined_12months,outcome_combined_15months,outcome_combined_18months,outcome_combined_24months
0,43231452,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,43206365,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,43203606,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,43117348,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,43105858,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# masterData = trainingData.merge(outcomes, how = 'left', left_on='patid', right_on='patid') #join table
# masterData = masterData.dropna() #NAs from Country
# masterData = masterData.reset_index(drop=True)
exclude_columns = ['weight', 'height', 'id', 'event_date', 'recorded_date', 'visit_id', 'code_id', 'snomed_id',
       'numeric_1', 'numeric_2', 'created_datetime', 'updated_datetime',]
trainingData = trainingData.loc[:,~trainingData.columns.isin(exclude_columns)]
print('original data shape: ', trainingData.shape)

original data shape:  (429177, 133)


In [7]:
trainingData = trainingData[(trainingData.age >=8) & (trainingData.age <=80)]
print('filtered data shape: ', trainingData.shape)

filtered data shape:  (388879, 133)


In [8]:
pd.options.display.max_rows = 287
trainingData.isna().sum()

patid                                         0
practice_id                                   0
sex                                           0
age                                           0
BMI                                           0
ethnicity                                     0
CharlsonScore                                 0
count_rhinitis                            11969
count_cardiovascular                      11969
count_heartfailure                        11969
count_psoriasis                           11969
count_anaphylaxis                         11969
count_diabetes                            11969
count_ihd                                 11969
count_anxiety                             11969
count_eczema                              11969
count_nasalpolyps                         11969
count_paracetamol                         11969
count_nsaids                              11969
count_betablocker                         11969
BTS_step                                

In [9]:
# #Positive vs negative class ratio

# print('3 months -> 1 : ', round(masterData.outcome_3months.value_counts()[0]/masterData.outcome_3months.value_counts()[1],2))
# print('6 months -> 1 : ', round(masterData.outcome_combined_6months.value_counts()[0]/masterData.outcome_combined_6months.value_counts()[1],2))
# print('9 months -> 1 : ', round(masterData.outcome_combined_9months.value_counts()[0]/masterData.outcome_combined_9months.value_counts()[1],2))
# print('12 months -> 1 : ', round(masterData.outcome_combined_12months.value_counts()[0]/masterData.outcome_combined_12months.value_counts()[1],2))
# print('15 months -> 1 : ', round(masterData.outcome_combined_15months.value_counts()[0]/masterData.outcome_combined_15months.value_counts()[1],2))
# print('18 months -> 1 : ', round(masterData.outcome_combined_18months.value_counts()[0]/masterData.outcome_combined_18months.value_counts()[1],2))
# print('24 months -> 1 : ', round(masterData.outcome_combined_24months.value_counts()[0]/masterData.outcome_combined_24months.value_counts()[1],2))


In [10]:
# #Proportion of asthma attack in each outcome

# print('3 months -> ', round(masterData.outcome_3months.value_counts()[1]/len(masterData)*100,2), '%')
# print('6 months -> ', round(masterData.outcome_combined_6months.value_counts()[1]/len(masterData)*100,2), '%')
# print('9 months -> ', round(masterData.outcome_combined_9months.value_counts()[1]/len(masterData)*100,2), '%')
# print('12 months -> ', round(masterData.outcome_combined_12months.value_counts()[1]/len(masterData)*100,2), '%')
# print('15 months -> ', round(masterData.outcome_combined_15months.value_counts()[1]/len(masterData)*100,2), '%')
# print('18 months -> ', round(masterData.outcome_combined_18months.value_counts()[1]/len(masterData)*100,2), '%')
# print('24 months -> ', round(masterData.outcome_combined_24months.value_counts()[1]/len(masterData)*100,2), '%')

In [11]:
# #Data scenario
# # 1: all data without ethnicity variable
# # 2: all data with ethnicity variable (include all missing values in ethnicity as separate group)
# # 3: filter data based on ethnicity (exclude missing values)

# scenario = 1 #change it based on the scenario

# if scenario == 1:
#     #include all data
#     allData = masterData
    
# elif scenario == 2:
#     #Exclude ethnic column
#     allData = masterData.drop('ethnic_group', axis=1)
    
# elif scenario == 3:
#     #exclude missing values for ethnic variable
#     allData = masterData[masterData.ethnic_group!='not_recorded']
    
# allData = allData.reset_index(drop=True)
# print('Data shape for scenario', str(scenario), allData.shape)



In [12]:
# #Split data into training and evaluation set based on the country. Include only 18+ patients.

# trainingData = allData[(allData.Country == 'England') & (allData.age>=18)]
# evaluationData = allData[((allData.Country == 'Scotland') | (allData.Country == 'Wales')) & (allData.age>=18)] #used for internal validation

# #remove country variable
# trainingData = trainingData.drop('Country', axis=1)
# evaluationData = evaluationData.drop('Country', axis=1)

# trainingData = trainingData.reset_index(drop=True)
# evaluationData = evaluationData.reset_index(drop=True)

# print('Training data shape:', trainingData.shape)
# print('Evaluation data shape: ', evaluationData.shape)

In [13]:
# #encode categorical data

# cat_vars = ['PEFStatus','EosinophilLevel']
# onehot_vars = ['ethnic_group','smokingStatus', 'DeviceType', 'cat_BMI', 'imd_decile']
# data_categorical = trainingData[cat_vars]
# data_onehot = trainingData[onehot_vars]

# #ordinal encoder
# encoder = OrdinalEncoder(categories=[['not_recorded','less than 60', '60-80', 'more than 80'], ['unknown', 'normal', 'high']]).set_output(transform="pandas")
# data_encoded = encoder.fit_transform(data_categorical)
# pickle.dump(encoder, open('../Models/cat_encoder.pkl', 'wb'))
    
# #one hot encoder
# onehot_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
# onehot_encoded = onehot_encoder.fit_transform(data_onehot)
# pickle.dump(onehot_encoder, open('../Models/onehot_encoder.pkl', 'wb'))

# trainingData = pd.concat([trainingData.drop(cat_vars, axis=1), data_encoded], axis=1)
# trainingData = pd.concat([trainingData.drop(onehot_vars, axis=1), onehot_encoded], axis=1)

# print('Data shape after encoding: ', trainingData.shape)

In [14]:
# #Data normalisation for continous variable into 0-1 range


# continuous_vars = ['age', 'CharlsonScore', 'average_daily_dose_ICS', 'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio', 
#                    'numOCS', 'numPCS', 'numPCSAsthma', 'numAntibioticsEvents', 'numAntibioticswithLRTI', 'numOCSEvents', 'numOCSwithLRTI', 
#                    'numAsthmaAttacks', 'numAcuteRespEvents', 'numHospEvents']

# # define scaler
# scaler = StandardScaler()
# data_scaled = scaler.fit_transform(trainingData[continuous_vars])
# pickle.dump(scaler, open('../Models/cont_scaler.pkl', 'wb'))


# data_scaled = pd.DataFrame(data_scaled, columns=scaler.get_feature_names_out())
# trainingData = pd.concat([trainingData.drop(continuous_vars, axis=1), data_scaled], axis=1)

# print('Data shape after scaling: ', trainingData.shape)

In [15]:
#Define feature candidates

features_columns = trainingData.columns.to_list()
exclude_columns = ['patid', 'practice_id', #identifier
                   'BMI', #use the categorical instead
                   'ethnicity', #use ethnic_group instead
                   'Spacer',  #all zero
                   
                   'outcome_3months', 'outcome_6months', 'outcome_9months', 'outcome_12months', 'outcome_15months', 'outcome_18months', 
                   'outcome_21months', 'outcome_24months', 'outcome_combined_6months', 'outcome_combined_9months', 'outcome_combined_12months', 
                   'outcome_combined_15months', 'outcome_combined_18months', 'outcome_combined_24months', #outcomes variable
                   
                   'postcode_district', 'County', 'LocalAuthority', 'OutputAreaClassification', #location related variables, use IMD decile only
                   
                   'cat_age', 'cat_average_daily_dose_ICS', 'cat_prescribed_daily_dose_ICS', 'cat_ICS_medication_possesion_ratio', 'cat_numOCS', 'cat_numOCSEvents', 
                   'cat_numOCSwithLRTI', 'cat_numAcuteRespEvents', 'cat_numAntibioticsEvents', 'cat_numAntibioticswithLRTI', 'cat_numAsthmaAttacks', 'cat_numHospEvents', 
                   'cat_numPCS', 'cat_numPCSAsthma', #use continous vars instead
                   
                   'system_EMIS', 'system_SystemOne', 'system_Vision', #practice system
                   
                   'count_rhinitis', 'count_cardiovascular', 'count_heartfailure',
                   'count_psoriasis', 'count_anaphylaxis', 'count_diabetes', 'count_ihd',
                   'count_anxiety', 'count_eczema', 'count_nasalpolyps',
                   'count_paracetamol', 'count_nsaids', 'count_betablocker', #use binary ones
                   
                   'numOCSEvents', #duplicate with numOCS
                  ]
exclude_columns = exclude_columns + [x for x in features_columns if '_count' in x] #filter out commorbid count variables
features_columns = [x for x in features_columns if x not in exclude_columns]
print('Features size: ', len(features_columns))
print(features_columns)

Features size:  79
['sex', 'age', 'CharlsonScore', 'BTS_step', 'average_daily_dose_ICS', 'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio', 'numOCS', 'PriorEducation', 'numPCS', 'numPCSAsthma', 'numAntibioticsEvents', 'numAntibioticswithLRTI', 'numOCSwithLRTI', 'numAsthmaAttacks', 'numAcuteRespEvents', 'numHospEvents', 'month_12', 'month_4', 'month_5', 'month_10', 'month_1', 'month_6', 'month_3', 'month_11', 'month_8', 'month_9', 'month_7', 'month_2', 'rhinitis', 'cardiovascular', 'heartfailure', 'psoriasis', 'anaphylaxis', 'diabetes', 'ihd', 'anxiety', 'eczema', 'nasalpolyps', 'paracetamol', 'nsaids', 'betablocker', 'ethnic_group_Asian - ethnic group', 'ethnic_group_Black - ethnic group', 'ethnic_group_Mixed ethnic census group', 'ethnic_group_Other ethnic group', 'ethnic_group_White - ethnic group', 'ethnic_group_not_recorded', 'smokingStatus_Active Smoker', 'smokingStatus_Former Smoker', 'smokingStatus_Non Smoker', 'DeviceType_BAI', 'DeviceType_DPI', 'DeviceType_NEB', 'D

In [16]:
#ONE HOT encoding for categorical data

# categoricalNonnumericVars = pd.Series(list(set(categoricalNonnumericVars).intersection(set(features_columns)))).tolist() #select only variables within the feature candidate list

# # define one hot encoder
# categoricalEncoder = OneHotEncoder(sparse=False)

# # transform data
# result = categoricalEncoder.fit_transform(trainingData[categoricalNonnumericVars]) 
# result = pd.DataFrame(result, columns=categoricalEncoder.get_feature_names_out())

# #save encoder
# pickle.dump(categoricalEncoder, open('./models/categoricalEncoder.pkl', 'wb'))

# # replace categorical variables in the original data with the one hot version
# trainingData = pd.concat([trainingData.loc[:, ~trainingData.columns.isin(categoricalNonnumericVars)],result], axis=1)
# print('Data shape after one-hot encoding: ', trainingData.shape)

In [17]:
# #ONE HOT encoding for evaluation dataset

# # transform data
# result = categoricalEncoder.transform(evaluationData[categoricalNonnumericVars]) 
# result = pd.DataFrame(result, columns=categoricalEncoder.get_feature_names_out())

# # replace categorical variables in the original data with the one hot version
# evaluationData = pd.concat([evaluationData.loc[:, ~evaluationData.columns.isin(categoricalNonnumericVars)],result], axis=1)
# print('Data shape after one-hot encoding: ', evaluationData.shape)

In [18]:
# #Scaling continous variable into 0-1 range for evaluation dataset


# # transform data
# result = scaler.transform(evaluationData[continuous_vars])
# result = pd.DataFrame(result, columns=scaler.get_feature_names_out())

# evaluationData = pd.concat([evaluationData.loc[:,~evaluationData.columns.isin(continuous_vars)],result], axis=1)

# print('Data shape after scaling: ', evaluationData.shape)

In [19]:
# #Model evaluation function

# def summariseResult (testX, testY, model):
#     preds = model.predict(testX)
#     tn, fp, fn, tp = confusion_matrix(testY, preds).ravel()
#     specificity = tn / (tn+fp)
#     sensitivity = tp / (tp+fn)
#     ppv = 100*tp/(tp+fp)
#     npv = 100*tn/(fn+tn)
#     acc = accuracy_score(testY, preds)
#     f1score = f1_score(testY, preds, average = 'binary')
#     balanceacc = balanced_accuracy_score(testY, preds)
#     fpr, tpr, thresholds = roc_curve(testY, preds, pos_label=1)
#     aucscore = auc(fpr, tpr)
#     # auc = roc_auc_score(testY, preds)
#     auprc = average_precision_score(testY, preds)
#     # plot_confusion_matrix(model, testX, testY, cmap='viridis')  
#     return np.round(acc,4), np.round(specificity,4), np.round(sensitivity,4), np.round(aucscore,4), np.round(auprc,4), np.round(balanceacc,4), np.round(f1score,4), np.round(ppv,4), np.round(npv,4)

In [20]:
# #Fix model name for visualisation

# def modelNameFixer(x):
#     if 'liblinear' in x:
#         return 'Lasso'
#     elif 'GaussianNB' in x:
#         return 'GNB'
#     elif 'SVC' in x:
#         return 'SVC'
#     elif 'RandomForest' in x:
#         return 'RF'
#     elif 'XGB' in x:
#         return 'XGBoost'
#     elif 'DecisionTree' in x:
#         return 'DT'
#     else:
#         return 'LR'

In [21]:
# # instantiate the model (using the default parameters)
# def build_models (X_train, y_train, params, split_counter):
#     models = [] #list to store all the models
#     model_counter = 0
#     print("Building models . . . .")

#     #LR
#     lr_model = LogisticRegression(class_weight='balanced', penalty='l2', random_state=1234)
#     lr_model.fit(X_train,y_train)
#     modelname =str(split_counter) + 'LRModel' 
#     models.append([modelname, y_train.value_counts()[1]/y_train.value_counts()[0]])
#     model_counter+=1
#     pickle.dump(lr_model, open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'wb')) 
#     print("LR done")

#     #Lasso
#     lasso_model = LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear', random_state=1234) #only the LIBLINEAR and SAGA (added in v0.19) solvers handle the L1 penalty
#     lasso_model.fit(X_train, y_train)
#     modelname =str(split_counter) + 'LassoModel' 
#     models.append([modelname, y_train.value_counts()[1]/y_train.value_counts()[0]])
#     model_counter+=1
#     pickle.dump(lasso_model, open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'wb'))
#     print("LR done")

#     #GNB
#     gnb_model = GaussianNB()
#     gnb_model.fit(X_train, y_train)
#     modelname =str(split_counter) + 'GNBModel' 
#     models.append([modelname, y_train.value_counts()[1]/y_train.value_counts()[0]])
#     model_counter+=1
#     pickle.dump(gnb_model, open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'wb'))     
#     print("GNB done")

#     #SVM
#     svc_model = SVC(class_weight='balanced', C = 0.7, degree=2, kernel='poly', random_state=1234, cache_size=2048)
#     svc_model.fit(X_train,y_train)
#     modelname =str(split_counter) + 'SVCModel' 
#     models.append([modelname, y_train.value_counts()[1]/y_train.value_counts()[0]])
#     model_counter+=1
#     pickle.dump(svc_model, open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'wb'))     
#     print("SVM done")

#     #DT
#     dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=1234)
#     dt_model.fit(X_train, y_train)
#     modelname =str(split_counter) + 'DTModel' 
#     models.append([modelname, y_train.value_counts()[1]/y_train.value_counts()[0]])
#     model_counter+=1
#     pickle.dump(dt_model, open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'wb'))     
#     print("DT done")

#     #RF
#     rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=500, random_state=1234)
#     rf_model.fit(X_train, y_train)
#     modelname =str(split_counter) + 'RFModel' 
#     models.append([modelname, y_train.value_counts()[1]/y_train.value_counts()[0]])
#     model_counter+=1
#     pickle.dump(rf_model, open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'wb'))     
#     print("RF done")



#     #XGB
#     scale_pos_ratio = y_train.value_counts()[0]/y_train.value_counts()[1]
#     xgb_model = xgb.XGBClassifier(objective ='binary:logistic', max_depth = params['xgb_maxdepth'], n_estimators = 2000,  tree_method='gpu_hist', gpu_id=0,  verbosity = 0, random_state = 1234,
#                                  importance_type = 'gain', scale_pos_weight = scale_pos_ratio, use_label_encoder=False, learning_rate=params['xgb_lr'])
#     # xgb_model = xgb.XGBClassifier(objective ='binary:logistic', learning_rate = 0.001, tree_method='gpu_hist', gpu_id=0,  verbosity = 0, random_state = 1234)
#     xgb_model.fit(X_train,y_train)
#     #save model
#     modelname = str(split_counter) + 'XGBoostModel'
#     models.append([modelname,  y_train.value_counts()[1]/y_train.value_counts()[0]])
#     pickle.dump(xgb_model, open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'wb')) 
#     model_counter+=1
#     print("XGB done")
    
#     return models
#     # return [xgb_model]

In [16]:
outcomes.columns

Index(['patid', 'outcome_3months', 'outcome_6months', 'outcome_9months',
       'outcome_12months', 'outcome_15months', 'outcome_18months',
       'outcome_21months', 'outcome_24months', 'outcome_combined_6months',
       'outcome_combined_9months', 'outcome_combined_12months',
       'outcome_combined_15months', 'outcome_combined_18months',
       'outcome_combined_24months'],
      dtype='object')

# GRID SEARCH

In [17]:
def print_dataframe(filtered_cv_results):
    """Pretty print for filtered dataframe"""
    for mean_sensitivity, std_sensitivity, mean_specificity, std_specificity, mean_auc, std_auc, params in zip(
        filtered_cv_results["mean_test_sensitivity"],
        filtered_cv_results["std_test_sensitivity"],
        filtered_cv_results["mean_test_specificity"],
        filtered_cv_results["std_test_specificity"],
        filtered_cv_results["mean_test_auc"],
        filtered_cv_results["std_test_auc"],
        filtered_cv_results["params"],
    ):
        print(
            f"sensitivity: {mean_sensitivity:0.4f} (±{std_sensitivity:0.03f}),"
            f" specificity: {mean_specificity:0.4f} (±{std_specificity:0.03f}),"
            f"auc: {mean_auc:0.4f} (±{std_auc:0.03f}),"
            f" for {params}"
        )
    print()


def refit_strategy(cv_results):
    """Define the strategy to select the best estimator.

    The strategy defined here is to filter-out all results below a precision threshold
    of 0.98, rank the remaining by recall and keep all models with one standard
    deviation of the best by recall. Once these models are selected, we can select the
    fastest model to predict.

    Parameters
    ----------
    cv_results : dict of numpy (masked) ndarrays
        CV results as returned by the `GridSearchCV`.

    Returns
    -------
    best_index : int
        The index of the best estimator as it appears in `cv_results`.
    """
    # print the info about the grid-search for the different scores
    sensitivity_threshold = 0.5

    cv_results_ = pd.DataFrame(cv_results)
    print("All grid-search results:")
    print_dataframe(cv_results_)

    # Filter-out all results below the threshold
    high_sensitivity_cv_results = cv_results_[
        cv_results_["mean_test_sensitivity"] > sensitivity_threshold
    ]

    print(f"Models with a sensitivity higher than {sensitivity_threshold}:")
    print_dataframe(high_sensitivity_cv_results)

    high_sensitivity_cv_results = high_sensitivity_cv_results[
        [
            "mean_score_time",
            "mean_test_sensitivity",
            "std_test_sensitivity",
            "mean_test_specificity",
            "std_test_specificity",
            "mean_test_auc",
            "std_test_auc",
            "rank_test_sensitivity",
            "rank_test_specificity",
            "rank_test_auc",
            "params",
        ]
    ]

    # Select the most performant models in terms of sesntivity
    # (within 1 sigma from the best)
    best_auc_std = high_sensitivity_cv_results["mean_test_auc"].std()
    best_auc = high_sensitivity_cv_results["mean_test_auc"].max()
    best_auc_threshold = best_auc - best_auc_std

    high_auc_cv_results = high_sensitivity_cv_results[
        high_sensitivity_cv_results["mean_test_auc"] > best_auc_threshold
    ]
    if high_auc_cv_results.shape[0] > 1:
        print(
            "Out of the previously selected high sensitivity models, we keep all the\n"
            "the models within one standard deviation of the highest auc model:"
        )

        print(best_auc_threshold)
        print_dataframe(high_auc_cv_results)

        # From the best candidates, select the fastest model to predict
        fastest_top_auc_high_sensitivity_index = high_auc_cv_results[
            "mean_score_time"
        ].idxmin()

        print(
            "\nThe selected final model is the fastest to predict out of the previously\n"
            "selected subset of best models based on precision and recall.\n"
            "Its scoring time is:\n\n"
            f"{high_auc_cv_results.loc[fastest_top_auc_high_sensitivity_index]}"
        )

        return fastest_top_auc_high_sensitivity_index
    elif high_auc_cv_results.shape[0] == 1:
        print('no parameter achieve the threshold, so return the default best score')
        return cv_results_["mean_test_auc"].idxmax()
    else:
        print('no parameter achieve the threshold, so return the default best score')
        return cv_results_["mean_test_auc"].idxmax()

In [24]:
# LR = ['solver', 'C', 'max_iter']
# Lasso = ['solver', 'C', 'max_iter']
# Elastic= ['l1_ratio', 'max_iter']
# GNB = ['var_smoothing']
# SVM = ['C', 'gamma']
# DT = ['criterion', 'splitter', 'max_depth']
# RF = ['criterion', 'n_estimators', 'max_depth']
# XGB = ['n_estimators', 'learning_rate', 'reg_alpha', 'reg_lambda']


In [21]:
ignore, use = train_test_split(trainingData, stratify=trainingData['outcome_combined_12months'], test_size=.1)

In [22]:
ignore.outcome_combined_12months.value_counts(normalize=False)

0    307502
1     42489
Name: outcome_combined_12months, dtype: int64

In [23]:
use.outcome_combined_12months.value_counts(normalize=False)

0    34167
1     4721
Name: outcome_combined_12months, dtype: int64

In [24]:
trainingData.outcome_combined_12months.value_counts(normalize=False)

0    341669
1     47210
Name: outcome_combined_12months, dtype: int64

In [28]:
%%time

#Bayesiain continuous search


X = use[features_columns]
outcomes = [
            'outcome_3months', 
            'outcome_combined_6months', 
            'outcome_combined_12months', 
            'outcome_combined_24months',
           ] 
cv = StratifiedKFold(n_splits=3)
n_calls = 10
n_jobs = 10

output = []

for target_outcome in outcomes:
    print('######################################################################################################')
    print(target_outcome)
    y = use[target_outcome]
    scale_pos_ratio = y.value_counts()[0]/y.value_counts()[1]
    
#     if target_outcome == 'outcome_combined_24months':   
#     ##############################################################################
    print('#LR')
    lr_model = LogisticRegression(class_weight='balanced', random_state=1234)
    lr_params = [Categorical(['liblinear', 'newton-cholesky'], name = 'solver'),
                 Real(0.1, 10, 'log-uniform', name='C'), 
                 Integer(50, 200, 'uniform', name='max_iter')]

    @use_named_args(lr_params)
    def lr_objective(**params):
        lr_model.set_params(**params)

        return 1-np.mean(cross_val_score(lr_model, X, y, cv=cv,
                                        scoring=make_scorer(roc_auc_score)))

    res_gp_lr = gp_minimize(lr_objective, lr_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
    output.append([target_outcome, 'lr', 1-res_gp_lr.fun, res_gp_lr.x])

    ########################################################################################

    print('#Lasso')
    lasso_model = LogisticRegression(class_weight='balanced', penalty='l1', random_state=1234) #only the LIBLINEAR and SAGA (added in v0.19) solvers handle the L1 penalty
    lasso_params = [Categorical(['saga', 'liblinear'], name = 'solver'),
                      Real(0.1, 10, 'log-uniform', name='C'),
                      Integer(50, 200, 'uniform', name='max_iter')]

    @use_named_args(lasso_params)
    def lasso_objective(**params):
        lasso_model.set_params(**params)

        return 1-np.mean(cross_val_score(lasso_model, X, y, cv=cv,
                                        scoring=make_scorer(roc_auc_score)))

    res_gp_lasso = gp_minimize(lasso_objective, lasso_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
    output.append([target_outcome, 'lasso', 1-res_gp_lasso.fun, res_gp_lasso.x])

    ########################################################################################

    print('#Elastic')
    elastic_model = LogisticRegression(class_weight='balanced', penalty = 'elasticnet', solver = 'saga', random_state=1234)
    elastic_params = [Real(0.1, 1, 'log-uniform', name='l1_ratio'),
                      Integer(300, 800, 'uniform', name='max_iter')]

    @use_named_args(elastic_params)
    def elastic_objective(**params):
        elastic_model.set_params(**params)

        return 1-np.mean(cross_val_score(elastic_model, X, y, cv=cv,
                                        scoring=make_scorer(roc_auc_score)))

    res_gp_elastic = gp_minimize(elastic_objective, elastic_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
    output.append([target_outcome, 'elastic', 1-res_gp_elastic.fun, res_gp_elastic.x])

    ########################################################################################

    print('#NB')
    gnb_model = GaussianNB()
    gnb_params = [Real(1e-9, 1e-5, 'log-uniform', name='var_smoothing')]

    @use_named_args(gnb_params)
    def gnb_objective(**params):
        gnb_model.set_params(**params)

        return 1-np.mean(cross_val_score(gnb_model, X, y, cv=cv,
                                        scoring = make_scorer(roc_auc_score)))

    res_gp_gnb = gp_minimize(gnb_objective, gnb_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
    output.append([target_outcome, 'gnb', 1-res_gp_gnb.fun, res_gp_gnb.x])

#     ########################################################################################

#     print('#SVM')
#     svc_model = SVC(class_weight='balanced', kernel='rbf', cache_size=1000, random_state=1234)
#     svm_params = [Real(0.1, 100, "log-uniform", name='C'),
#                      Real(0.1, 100, "log-uniform", name='gamma')]

#     @use_named_args(svm_params)
#     def svm_objective(**params):
#         svc_model.set_params(**params)

#         return 1-np.mean(cross_val_score(svc_model, X, y, cv=cv,
#                                         scoring=make_scorer(roc_auc_score), verbose=3))

#     res_gp_svm = gp_minimize(svm_objective, svm_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
#     output.append([target_outcome, 'svm', 1-res_gp_svm.fun, res_gp_svm.x])

    ########################################################################################
    print('#DT')
    dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=1234)
    dt_params = [Categorical(["gini", "entropy", "log_loss"],name='criterion'),
                 Categorical(['best', 'random'],name='splitter'),
                 Integer(3, 10, "uniform", name='max_depth'),]

    @use_named_args(dt_params)
    def dt_objective(**params):
        scoring = {
            'auc': make_scorer(roc_auc_score)
            }
        dt_model.set_params(**params)

        return 1 - np.mean(cross_val_score(dt_model, X, y, cv=cv,
                                        scoring=make_scorer(roc_auc_score)))
    res_gp_dt = gp_minimize(dt_objective, dt_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
    output.append([target_outcome, 'dt', 1-res_gp_dt.fun, res_gp_dt.x])
    
##########################################################################################

    print('#RF')
    rf_model = RandomForestClassifier(class_weight='balanced', random_state=1234)
    rf_params = [Categorical(["gini", "entropy", "log_loss"],name='criterion'),
                 Integer(100, 500, "uniform", name='n_estimators'),
                 Integer(3, 10, "uniform", name='max_depth'),]

    @use_named_args(rf_params)
    def rf_objective(**params):
        scoring = {
            'auc': make_scorer(roc_auc_score)
            }
        rf_model.set_params(**params)

        return 1 - np.mean(cross_val_score(rf_model, X, y, cv=cv,
                                        scoring=make_scorer(roc_auc_score)))
    res_gp_rf = gp_minimize(rf_objective, rf_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
    output.append([target_outcome, 'rf', 1-res_gp_rf.fun, res_gp_rf.x])
    
##########################################################################################

    print('#XGB')
    xgb_model = xgb.XGBClassifier(objective ='binary:logistic', tree_method='gpu_hist', gpu_id=0,  verbosity = 0,
                                         importance_type = 'gain', scale_pos_weight = scale_pos_ratio, random_state=1234)
    xgb_params = [Integer(100,500,"uniform", name='n_estimators'),
                    Integer(3, 10, "uniform", name='max_depth'),
                     Real(1e-5, 1e-1, 'log-uniform', name='learning_rate'),
                     Real(1e-5, 1e-1, 'log-uniform', name='reg_alpha'),
                     Real(1e-5, 1e-1, 'log-uniform', name='reg_lambda'),]
    @use_named_args(xgb_params)
    def xgb_objective(**params):
        xgb_model.set_params(**params)

        return 1 - np.mean(cross_val_score(xgb_model, X, y, cv=cv,
                                        scoring=make_scorer(roc_auc_score)))
    res_gp_xgb = gp_minimize(xgb_objective, xgb_params, n_calls=n_calls, random_state=1234, verbose=3, n_jobs=n_jobs)
    output.append([target_outcome, 'xgb', 1-res_gp_xgb.fun, res_gp_xgb.x])

########################################################################################

######################################################################################################
outcome_3months
#LR
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 2.8702
Function value obtained: 0.2918
Current minimum: 0.2918
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 2.7655
Function value obtained: 0.2923
Current minimum: 0.2918
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 2.7759
Function value obtained: 0.2924
Current minimum: 0.2918
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 1.8627
Function value obtained: 0.2911
Current minimum: 0.2911
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at r



Iteration No: 2 ended. Evaluation done at random point.
Time taken: 8.2120
Function value obtained: 0.3699
Current minimum: 0.2920
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 5.1811
Function value obtained: 0.2920
Current minimum: 0.2920
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 3.5081
Function value obtained: 0.2920
Current minimum: 0.2920
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 9.4767
Function value obtained: 0.2920
Current minimum: 0.2920
Iteration No: 6 started. Evaluating function at random point.
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 5.8170
Function value obtained: 0.2910
Current minimum: 0.2910
Iteration No: 7 started. Evaluating function at random point.
Iteration No: 7 ended. Evaluation d



Iteration No: 8 ended. Evaluation done at random point.
Time taken: 7.0297
Function value obtained: 0.3702
Current minimum: 0.2910
Iteration No: 9 started. Evaluating function at random point.
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 6.9378
Function value obtained: 0.2913
Current minimum: 0.2910
Iteration No: 10 started. Evaluating function at random point.




Iteration No: 10 ended. Evaluation done at random point.
Time taken: 11.6964
Function value obtained: 0.3604
Current minimum: 0.2910
#Elastic
Iteration No: 1 started. Evaluating function at random point.




Iteration No: 1 ended. Evaluation done at random point.
Time taken: 68.2414
Function value obtained: 0.3103
Current minimum: 0.3103
Iteration No: 2 started. Evaluating function at random point.




Iteration No: 2 ended. Evaluation done at random point.
Time taken: 68.3524
Function value obtained: 0.3108
Current minimum: 0.3103
Iteration No: 3 started. Evaluating function at random point.




Iteration No: 3 ended. Evaluation done at random point.
Time taken: 37.0197
Function value obtained: 0.3311
Current minimum: 0.3103
Iteration No: 4 started. Evaluating function at random point.




Iteration No: 4 ended. Evaluation done at random point.
Time taken: 70.7337
Function value obtained: 0.3104
Current minimum: 0.3103
Iteration No: 5 started. Evaluating function at random point.




Iteration No: 5 ended. Evaluation done at random point.
Time taken: 35.1446
Function value obtained: 0.3315
Current minimum: 0.3103
Iteration No: 6 started. Evaluating function at random point.




Iteration No: 6 ended. Evaluation done at random point.
Time taken: 53.7210
Function value obtained: 0.3167
Current minimum: 0.3103
Iteration No: 7 started. Evaluating function at random point.




Iteration No: 7 ended. Evaluation done at random point.
Time taken: 69.7962
Function value obtained: 0.3108
Current minimum: 0.3103
Iteration No: 8 started. Evaluating function at random point.




Iteration No: 8 ended. Evaluation done at random point.
Time taken: 32.4775
Function value obtained: 0.3343
Current minimum: 0.3103
Iteration No: 9 started. Evaluating function at random point.




Iteration No: 9 ended. Evaluation done at random point.
Time taken: 31.7902
Function value obtained: 0.3360
Current minimum: 0.3103
Iteration No: 10 started. Evaluating function at random point.




Iteration No: 10 ended. Evaluation done at random point.
Time taken: 45.4730
Function value obtained: 0.3280
Current minimum: 0.3103
#NB
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.1446
Function value obtained: 0.3260
Current minimum: 0.3260
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1043
Function value obtained: 0.3661
Current minimum: 0.3260
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1061
Function value obtained: 0.3407
Current minimum: 0.3260
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.1049
Function value obtained: 0.3604
Current minimum: 0.3260
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evalua



Iteration No: 2 ended. Evaluation done at random point.
Time taken: 8.5569
Function value obtained: 0.3730
Current minimum: 0.2963
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 3.2437
Function value obtained: 0.2964
Current minimum: 0.2963
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 2.4303
Function value obtained: 0.2980
Current minimum: 0.2963
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 4.3395
Function value obtained: 0.2963
Current minimum: 0.2963
Iteration No: 6 started. Evaluating function at random point.
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 4.6805
Function value obtained: 0.2966
Current minimum: 0.2963
Iteration No: 7 started. Evaluating function at random point.
Iteration No: 7 ended. Evaluation d



Iteration No: 8 ended. Evaluation done at random point.
Time taken: 7.0051
Function value obtained: 0.3746
Current minimum: 0.2963
Iteration No: 9 started. Evaluating function at random point.
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 3.2897
Function value obtained: 0.2967
Current minimum: 0.2963
Iteration No: 10 started. Evaluating function at random point.




Iteration No: 10 ended. Evaluation done at random point.
Time taken: 12.0190
Function value obtained: 0.3651
Current minimum: 0.2963
#Elastic
Iteration No: 1 started. Evaluating function at random point.




Iteration No: 1 ended. Evaluation done at random point.
Time taken: 61.9374
Function value obtained: 0.3150
Current minimum: 0.3150
Iteration No: 2 started. Evaluating function at random point.




Iteration No: 2 ended. Evaluation done at random point.
Time taken: 59.1895
Function value obtained: 0.3154
Current minimum: 0.3150
Iteration No: 3 started. Evaluating function at random point.




Iteration No: 3 ended. Evaluation done at random point.
Time taken: 30.2886
Function value obtained: 0.3367
Current minimum: 0.3150
Iteration No: 4 started. Evaluating function at random point.




Iteration No: 4 ended. Evaluation done at random point.
Time taken: 59.7607
Function value obtained: 0.3149
Current minimum: 0.3149
Iteration No: 5 started. Evaluating function at random point.




Iteration No: 5 ended. Evaluation done at random point.
Time taken: 31.4865
Function value obtained: 0.3384
Current minimum: 0.3149
Iteration No: 6 started. Evaluating function at random point.




Iteration No: 6 ended. Evaluation done at random point.
Time taken: 49.5240
Function value obtained: 0.3238
Current minimum: 0.3149
Iteration No: 7 started. Evaluating function at random point.




Iteration No: 7 ended. Evaluation done at random point.
Time taken: 63.4773
Function value obtained: 0.3153
Current minimum: 0.3149
Iteration No: 8 started. Evaluating function at random point.




Iteration No: 8 ended. Evaluation done at random point.
Time taken: 28.2485
Function value obtained: 0.3392
Current minimum: 0.3149
Iteration No: 9 started. Evaluating function at random point.




Iteration No: 9 ended. Evaluation done at random point.
Time taken: 27.7247
Function value obtained: 0.3398
Current minimum: 0.3149
Iteration No: 10 started. Evaluating function at random point.




Iteration No: 10 ended. Evaluation done at random point.
Time taken: 38.1249
Function value obtained: 0.3339
Current minimum: 0.3149
#NB
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.1396
Function value obtained: 0.3306
Current minimum: 0.3306
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1147
Function value obtained: 0.3661
Current minimum: 0.3306
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1148
Function value obtained: 0.3391
Current minimum: 0.3306
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.1016
Function value obtained: 0.3613
Current minimum: 0.3306
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evalua



Iteration No: 2 ended. Evaluation done at random point.
Time taken: 6.9587
Function value obtained: 0.3815
Current minimum: 0.2990
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 4.6378
Function value obtained: 0.2990
Current minimum: 0.2990
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 1.4413
Function value obtained: 0.3006
Current minimum: 0.2990
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 2.8740
Function value obtained: 0.2990
Current minimum: 0.2990
Iteration No: 6 started. Evaluating function at random point.
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 2.1987
Function value obtained: 0.2992
Current minimum: 0.2990
Iteration No: 7 started. Evaluating function at random point.
Iteration No: 7 ended. Evaluation d



Iteration No: 8 ended. Evaluation done at random point.
Time taken: 6.0032
Function value obtained: 0.3821
Current minimum: 0.2990
Iteration No: 9 started. Evaluating function at random point.
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 2.4939
Function value obtained: 0.2991
Current minimum: 0.2990
Iteration No: 10 started. Evaluating function at random point.




Iteration No: 10 ended. Evaluation done at random point.
Time taken: 10.2266
Function value obtained: 0.3709
Current minimum: 0.2990
#Elastic
Iteration No: 1 started. Evaluating function at random point.




Iteration No: 1 ended. Evaluation done at random point.
Time taken: 61.1939
Function value obtained: 0.3227
Current minimum: 0.3227
Iteration No: 2 started. Evaluating function at random point.




Iteration No: 2 ended. Evaluation done at random point.
Time taken: 56.5241
Function value obtained: 0.3235
Current minimum: 0.3227
Iteration No: 3 started. Evaluating function at random point.




Iteration No: 3 ended. Evaluation done at random point.
Time taken: 30.4653
Function value obtained: 0.3419
Current minimum: 0.3227
Iteration No: 4 started. Evaluating function at random point.




Iteration No: 4 ended. Evaluation done at random point.
Time taken: 58.2305
Function value obtained: 0.3228
Current minimum: 0.3227
Iteration No: 5 started. Evaluating function at random point.




Iteration No: 5 ended. Evaluation done at random point.
Time taken: 28.8462
Function value obtained: 0.3427
Current minimum: 0.3227
Iteration No: 6 started. Evaluating function at random point.




Iteration No: 6 ended. Evaluation done at random point.
Time taken: 44.3609
Function value obtained: 0.3288
Current minimum: 0.3227
Iteration No: 7 started. Evaluating function at random point.




Iteration No: 7 ended. Evaluation done at random point.
Time taken: 60.0118
Function value obtained: 0.3232
Current minimum: 0.3227
Iteration No: 8 started. Evaluating function at random point.




Iteration No: 8 ended. Evaluation done at random point.
Time taken: 27.1134
Function value obtained: 0.3455
Current minimum: 0.3227
Iteration No: 9 started. Evaluating function at random point.




Iteration No: 9 ended. Evaluation done at random point.
Time taken: 25.4188
Function value obtained: 0.3474
Current minimum: 0.3227
Iteration No: 10 started. Evaluating function at random point.




Iteration No: 10 ended. Evaluation done at random point.
Time taken: 37.0483
Function value obtained: 0.3373
Current minimum: 0.3227
#NB
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.1478
Function value obtained: 0.3449
Current minimum: 0.3449
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1051
Function value obtained: 0.3729
Current minimum: 0.3449
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1159
Function value obtained: 0.3511
Current minimum: 0.3449
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.1109
Function value obtained: 0.3700
Current minimum: 0.3449
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evalua



Iteration No: 2 ended. Evaluation done at random point.
Time taken: 6.0250
Function value obtained: 0.3859
Current minimum: 0.2904
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.8212
Function value obtained: 0.2904
Current minimum: 0.2904
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 1.2401
Function value obtained: 0.2927
Current minimum: 0.2904
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 1.7761
Function value obtained: 0.2904
Current minimum: 0.2904
Iteration No: 6 started. Evaluating function at random point.
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 2.3432
Function value obtained: 0.2905
Current minimum: 0.2904
Iteration No: 7 started. Evaluating function at random point.
Iteration No: 7 ended. Evaluation d



Iteration No: 8 ended. Evaluation done at random point.
Time taken: 6.2101
Function value obtained: 0.3876
Current minimum: 0.2904
Iteration No: 9 started. Evaluating function at random point.
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 2.5549
Function value obtained: 0.2904
Current minimum: 0.2904
Iteration No: 10 started. Evaluating function at random point.




Iteration No: 10 ended. Evaluation done at random point.
Time taken: 9.7146
Function value obtained: 0.3761
Current minimum: 0.2904
#Elastic
Iteration No: 1 started. Evaluating function at random point.




Iteration No: 1 ended. Evaluation done at random point.
Time taken: 56.3433
Function value obtained: 0.3177
Current minimum: 0.3177
Iteration No: 2 started. Evaluating function at random point.




Iteration No: 2 ended. Evaluation done at random point.
Time taken: 54.7570
Function value obtained: 0.3190
Current minimum: 0.3177
Iteration No: 3 started. Evaluating function at random point.




Iteration No: 3 ended. Evaluation done at random point.
Time taken: 29.7895
Function value obtained: 0.3369
Current minimum: 0.3177
Iteration No: 4 started. Evaluating function at random point.




Iteration No: 4 ended. Evaluation done at random point.
Time taken: 55.5853
Function value obtained: 0.3177
Current minimum: 0.3177
Iteration No: 5 started. Evaluating function at random point.




Iteration No: 5 ended. Evaluation done at random point.
Time taken: 28.4894
Function value obtained: 0.3383
Current minimum: 0.3177
Iteration No: 6 started. Evaluating function at random point.




Iteration No: 6 ended. Evaluation done at random point.
Time taken: 42.9358
Function value obtained: 0.3254
Current minimum: 0.3177
Iteration No: 7 started. Evaluating function at random point.




Iteration No: 7 ended. Evaluation done at random point.
Time taken: 56.6044
Function value obtained: 0.3180
Current minimum: 0.3177
Iteration No: 8 started. Evaluating function at random point.




Iteration No: 8 ended. Evaluation done at random point.
Time taken: 27.1935
Function value obtained: 0.3406
Current minimum: 0.3177
Iteration No: 9 started. Evaluating function at random point.




Iteration No: 9 ended. Evaluation done at random point.
Time taken: 26.9195
Function value obtained: 0.3430
Current minimum: 0.3177
Iteration No: 10 started. Evaluating function at random point.




Iteration No: 10 ended. Evaluation done at random point.
Time taken: 38.8262
Function value obtained: 0.3323
Current minimum: 0.3177
#NB
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.1642
Function value obtained: 0.3544
Current minimum: 0.3544
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1144
Function value obtained: 0.3711
Current minimum: 0.3544
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1061
Function value obtained: 0.3558
Current minimum: 0.3544
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.1100
Function value obtained: 0.3672
Current minimum: 0.3544
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evalua

In [26]:
pd.DataFrame(output, columns=['outcome', 'model', 'best_score', 'best_param']).to_csv('../Models/BS_new.csv', index = False, index_label=False)

In [92]:
pd.DataFrame(output, columns=['outcome', 'model', 'best_score', 'best_param'])

Unnamed: 0,outcome,model,best_score,best_param
0,outcome_combined_12months,lr,0.666627,"[liblinear, 0.10612435463932542, 123]"
1,outcome_combined_12months,lasso,0.67766,"[liblinear, 0.10612435463932542, 123]"
2,outcome_combined_12months,elastic,0.517827,"[0.3145311526395745, 709]"
3,outcome_combined_12months,gnb,0.648173,[4.004495983697586e-09]
4,outcome_combined_12months,dt,0.647501,"[log_loss, random, 4]"
5,outcome_combined_12months,rf,0.655746,"[log_loss, 286, 6]"
6,outcome_combined_12months,xgb,0.674931,"[160, 4, 0.018224336938187292, 4.3177889716795..."


In [None]:
%%time

#GRID SEARCH
X = trainingData[features_columns]
# X = cudf.DataFrame(X)
outcomes = [
            'outcome_3months', 
            'outcome_combined_6months', 
            'outcome_combined_12months', 
            'outcome_combined_24months',
           ] 
model_names = ['LR', 'Lasso', 'ElasticNet', 'NB', 'SVM', 'DT', 'RF', 'XGB']


output = []
for outcome in outcomes:
    print(outcome)
    y = trainingData[outcome]
    scale_pos_ratio = y.value_counts()[0]/y.value_counts()[1]
    
    #MODELS
    lr_model = LogisticRegression(class_weight='balanced', random_state=1234)
    lasso_model = LogisticRegression(class_weight='balanced', penalty='l1', random_state=1234) #only the LIBLINEAR and SAGA (added in v0.19) solvers handle the L1 penalty
    elastic_model = LogisticRegression(class_weight='balanced', penalty = 'elasticnet', random_state=1234)
    gnb_model = GaussianNB()
    svc_model = SVC(class_weight='balanced', gamma = 10, kernel='rbf', cache_size=2000, random_state=1234)
    dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=1234)
    rf_model = RandomForestClassifier(class_weight='balanced', random_state=1234)
    xgb_model = xgb.XGBClassifier(objective ='binary:logistic', tree_method='gpu_hist', gpu_id=0,  verbosity = 0,
                                     importance_type = 'gain', scale_pos_weight = scale_pos_ratio, random_state=1234)

    #PARAMS
    lr_params = {'solver': ['liblinear', 'newton-cholesky'],
                 'C': [0.1, 1.0, 10.0],
                 'max_iter': [80.0, 100.0, 120.0]}
    lasso_params = {'solver': ['saga', 'liblinear'],
                    'C': [0.1, 1, 10],
                    'max_iter': [80, 100, 120]},
    elastic_params = {'solver': ['saga', 'liblinear'],
                      'l1_ratio': [0.3, 0.5, 0.7],
                      'max_iter': [80, 100, 120]},
    gnb_params = {'var_smoothing': [1e-8, 5e-8, 1e-9, 5e-9]}
    svm_params={'C': [1.0, 10.0]}
    dt_params = {'criterion':["gini", "entropy", "log_loss"],
                 'splitter': ['best', 'random'],
                'max_depth': [3,5,7,9]}
    rf_params = {'criterion':["gini", "entropy", "log_loss"],
                 'n_estimators': [100, 200, 300],
                'max_depth': [3.0,5.0,7.0,9.0]}
    xgb_params = {'n_estimators': [100, 200, 300],
                'max_depth': [3.0,5.0,7.0,9.0],
                 'learning_rate': [3e-1, 3e-2, 3e-3],
                 'reg_alpha': [0.3, 0.5, 0.7],
                 'reg_lambda': [0.3, 0.5, 0.7],}

    #Models and params in DICT
    models_to_be_trained = [
        {'model_name': 'LR', 'model': lr_model, 'params': lr_params},
        {'model_name': 'Lasso', 'model': lasso_model, 'params': lasso_params},
        {'model_name': 'ElasticNet', 'model': elastic_model, 'params': elastic_params},
        {'model_name': 'NB', 'model': gnb_model, 'params': gnb_params},
        # {'model_name': 'SVM', 'model': svc_model, 'params': svm_params},
        {'model_name': 'DT', 'model': dt_model, 'params': dt_params},
        {'model_name': 'RF', 'model': rf_model, 'params': rf_params},
        {'model_name': 'XGB', 'model': xgb_model, 'params': xgb_params}
    ]
    
    #scoring
    # scoring = {
    #     'accuracy': make_scorer(balanced_accuracy_score),
    #     'sensitivity': make_scorer(recall_score),
    #     'specificity': make_scorer(recall_score,pos_label=0),
    #     'auc': make_scorer(roc_auc_score)
    #     }
    scoring = {
        'auc': make_scorer(roc_auc_score)
        }
    
    for item in models_to_be_trained:
        print(item['model_name'])
        gs = BayesSearchCV(item['model'],
                          search_spaces=item['params'],
                          scoring=make_scorer(roc_auc_score),
                           n_iter = 20,
                          cv=2,
                          verbose=3, 
                           n_jobs=5,
                           n_points=10,
                            random_state = 1234)
        gs.fit(X, y)
        output.append([outcome, item['model_name'], gs.best_params_])
        pickle.dump(gs.cv_results_, open('../Models/gs/' + outcome.split('_')[-1] + '_' + item['model_name'] + '.sav', 'wb'))


In [None]:
%%time

X = trainingData[features_columns]
y = trainingData['outcome_combined_12months']
X = cudf.DataFrame(X)
# y = cudf.DataFrame(y)





In [None]:
%%time

# svc_model = SVC(class_weight='balanced', kernel='poly', random_state=1234)
# svm_params={'C': [0.1, 1, 10], 'gamma': [1,10]}
# gs = GridSearchCV(svc_model,
#                   param_grid=svm_params,
#                   scoring=['average_precision', 'balanced_accuracy', 'roc_auc'],
#                   refit='roc_auc',
#                   cv=3,
#                   verbose=3,)
# gs.fit(X, y)

In [None]:
%%time

svc_model = SVC(class_weight='balanced', C = 10, gamma= 10, kernel='rbf', cache_size= 2000, random_state=1234, verbose=3)
svc_model.fit(X, y)

In [None]:
preds = svc_model.predict(X)

In [None]:
preds

In [None]:
classification_report(y, preds)

In [None]:
output[5][1].keys()

In [None]:
gs.best_score_

In [None]:
output[5][1]['params'][output[5][1]['rank_test_balanced_accuracy'][0]]

In [None]:
output[5][1]['params'][output[5][1]['rank_test_average_precision'][0]]

In [None]:
pd.DataFrame(output, columns=['model', 'GS_result']).to_csv('../Models/GS_result.csv', index_label=False, index=False)

In [None]:
item

In [None]:
#Define number of split in k-fold

n_splits = 10

In [None]:
#Create X set for model development

target_outcome = 'outcome_3months'
X = trainingData[features_columns]
y = trainingData[[target_outcome]]
print('X shape: ', X.shape)
print('y shape: ', y.shape)

#model parameters
params = {'xgb_lr': 0.6,
         'xgb_maxdepth': 7}

In [None]:
%%time

#EXECUTE model training

kf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
kf.get_n_splits(X)
models1 = pd.DataFrame(columns=['modelname', 'class_ratio'])
summary_result1 = []
cols = ['model_name', 'class_ratio', 'acc','spec','sens','auc', 'auprc', 'balance_accuracy', 'f1_score', 'ppv', 'npv']
split_counter = 0

#train model
for train_index, test_index in kf.split(X, y):
    #split data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #Build models -> it can be commented if the models have been trained
    models_temp = pd.DataFrame(build_models(X_train, y_train[target_outcome], params, split_counter), columns=['modelname', 'class_ratio'])
    models1 = pd.concat([models1,models_temp]).reset_index(drop=True)
    split_counter+=1
        
#evaluate model
for modelname, classratio in models1.values:
    # print('======================================================================')
    print(modelname)
    model = pickle.load(open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'rb'))
    summary_result1.append((str(model), classratio, ) + summariseResult (X_test, y_test[target_outcome], model) )       


summary_result1 = pd.DataFrame(summary_result1, columns=cols)
summary_result1['model_num'] = summary_result1.index



In [None]:
print(target_outcome)
summary_result1['model_name'] = summary_result1.apply(lambda x: modelNameFixer(x.model_name), axis=1)
summary_result1.groupby('model_name').mean().sort_values(['auc'], ascending=False)

In [None]:
summary_result1.to_csv("summaryResult_outcome1.csv")
summary_result1 = pd.read_csv("summaryResult_outcome1.csv")

bar = sns.catplot(x = "model_name",       # x variable name
            y = "auc",       # y variable name            
            data = summary_result1,     # dataframe to plot
            kind = "bar",
            height=5,
            aspect=5/2.5,
            ci = None)
ax = bar.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x() + 0.01, 
            p.get_height() * 1.01, 
            '{0:.4f}'.format(p.get_height()), 
            color='black', rotation='horizontal', fontsize=11)
    
# listOf_Yticks = np.arange(0.5, 0.7, 0.05)
ax.set_ylim(0.4, 1)
ax.set_ylabel('AUC Score', fontsize=11)
ax.set_xlabel('Method', fontsize=11)

In [None]:
# kf = StratifiedKFold(n_splits=2, random_state=1234, shuffle=True)
# kf.get_n_splits(X)
# for train_index, test_index in kf.split(X, y):
#     #split data
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#     trymodel = SVC(class_weight='balanced', C = 0.7, degree=2, kernel='poly', random_state=1234, cache_size=2048)
#     trymodel.fit(X_train,y_train)
#     print(summariseResult(X_test, y_test, trymodel))


In [None]:
best_model1 = pickle.load(open('./models/outcome_3months/0DTModel.sav', 'rb'))

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
sorted_idx = best_model1.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model1.feature_importances_[sorted_idx][-10:])
plt.xlabel("Decision Tree Feature Importance")
plt.show()

In [None]:
best_model1 = pickle.load(open('./models/outcome_3months/0RFModel.sav', 'rb'))

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
sorted_idx = best_model1.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model1.feature_importances_[sorted_idx][-10:])
plt.xlabel("Random Forest Feature Importance")
plt.show()

In [None]:
best_model1 = pickle.load(open('./models/outcome_3months/0XGBoostModel.sav', 'rb'))

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
sorted_idx = best_model1.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model1.feature_importances_[sorted_idx][-10:])
plt.xlabel("XGBoost Feature Importance")
plt.show()

# 6months

In [None]:
target_outcome = 'outcome_combined_6months'
y = trainingData[[target_outcome]]

#model parameters
params = {'xgb_lr': 0.6,
         'xgb_maxdepth': 7}

In [None]:
%%time

#EXECUTE model training

kf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
kf.get_n_splits(X)
models2 = pd.DataFrame(columns=['modelname', 'class_ratio'])
summary_result2 = []
cols = ['model_name', 'class_ratio', 'acc','spec','sens','auc', 'auprc', 'balance_accuracy', 'f1_score', 'ppv', 'npv']
split_counter = 0

#train model
for train_index, test_index in kf.split(X, y):
    #split data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #Build models -> it can be commented if the models have been trained
    models_temp = pd.DataFrame(build_models(X_train, y_train[target_outcome], params, split_counter), columns=['modelname', 'class_ratio'])
    models2 = pd.concat([models2,models_temp]).reset_index(drop=True)
    split_counter+=1
        
#evaluate model
for modelname, classratio in models2.values:
    # print('======================================================================')
    print(modelname)
    model = pickle.load(open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'rb'))
    summary_result2.append((str(model), classratio, ) + summariseResult (X_test, y_test[target_outcome], model) )       


summary_result2 = pd.DataFrame(summary_result2, columns=cols)
summary_result2['model_num'] = summary_result2.index
# summary_result1['method_name'] = summary_result1.apply(lambda x: 'LR' if x.model_num%2 == 0 else 'XGBoost', axis=1)


In [None]:
print(target_outcome)
summary_result2['model_name'] = summary_result2.apply(lambda x: modelNameFixer(x.model_name), axis=1)
summary_result2.groupby('model_name').mean().sort_values(['auc'], ascending=False)

In [None]:
summary_result2.to_csv("summaryResult_outcome2.csv")
summary_result2 = pd.read_csv("summaryResult_outcome2.csv")

bar = sns.catplot(x = "model_name",       # x variable name
            y = "auc",       # y variable name            
            data = summary_result2,     # dataframe to plot
            kind = "bar",
            height=5,
            aspect=5/2.5,
            ci = None)
ax = bar.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x() + 0.01, 
            p.get_height() * 1.01, 
            '{0:.4f}'.format(p.get_height()), 
            color='black', rotation='horizontal', fontsize=11)
    
# listOf_Yticks = np.arange(0.5, 0.7, 0.05)
ax.set_ylim(0.4, 1)
ax.set_ylabel('AUC Score', fontsize=11)
ax.set_xlabel('Method', fontsize=11)

In [None]:
best_model2 = pickle.load(open('./models/outcome_combined_6months/0DTModel.sav', 'rb'))

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
sorted_idx = best_model2.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model2.feature_importances_[sorted_idx][-10:])
plt.xlabel("Decision Tree Feature Importance")
plt.show()

In [None]:
best_model2 = pickle.load(open('./models/outcome_combined_6months/0RFModel.sav', 'rb'))

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
sorted_idx = best_model2.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model2.feature_importances_[sorted_idx][-10:])
plt.xlabel("Random Forest Feature Importance")
plt.show()

In [None]:
best_model2 = pickle.load(open('./models/outcome_combined_6months/0XGBoostModel.sav', 'rb'))

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
sorted_idx = best_model2.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model2.feature_importances_[sorted_idx][-10:])
plt.xlabel("XGBoost Feature Importance")
plt.show()

# 12 months

In [None]:
target_outcome = 'outcome_combined_12months'
y = trainingData[[target_outcome]]

#model parameters
params = {'xgb_lr': 0.6,
         'xgb_maxdepth': 10}

In [None]:
%%time

#EXECUTE model training

kf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
kf.get_n_splits(X)
models3 = pd.DataFrame(columns=['modelname', 'class_ratio'])
summary_result3 = []
cols = ['model_name', 'class_ratio', 'acc','spec','sens','auc', 'auprc', 'balance_accuracy', 'f1_score', 'ppv', 'npv']
split_counter = 0

#train model
for train_index, test_index in kf.split(X, y):
    #split data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #Build models -> it can be commented if the models have been trained
    models_temp = pd.DataFrame(build_models(X_train, y_train[target_outcome], params, split_counter), columns=['modelname', 'class_ratio'])
    models3 = pd.concat([models3,models_temp]).reset_index(drop=True)
    split_counter+=1
        
#evaluate model
for modelname, classratio in models3.values:
    # print('======================================================================')
    print(modelname)
    model = pickle.load(open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'rb'))
    summary_result3.append((str(model), classratio, ) + summariseResult (X_test, y_test[target_outcome], model) )       


summary_result3 = pd.DataFrame(summary_result3, columns=cols)
summary_result3['model_num'] = summary_result3.index
# summary_result1['method_name'] = summary_result1.apply(lambda x: 'LR' if x.model_num%2 == 0 else 'XGBoost', axis=1)


In [None]:
print(target_outcome)
summary_result3['model_name'] = summary_result3.apply(lambda x: modelNameFixer(x.model_name), axis=1)
summary_result3.groupby('model_name').mean().sort_values(['auc'], ascending=False)

In [None]:
summary_result3.to_csv("summaryResult_outcome3.csv")
summary_result3 = pd.read_csv("summaryResult_outcome3.csv")

bar = sns.catplot(x = "model_name",       # x variable name
            y = "auc",       # y variable name            
            data = summary_result3,     # dataframe to plot
            kind = "bar",
            height=5,
            aspect=5/2.5,
            ci = None)
ax = bar.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x() + 0.01, 
            p.get_height() * 1.01, 
            '{0:.4f}'.format(p.get_height()), 
            color='black', rotation='horizontal', fontsize=11)
    
# listOf_Yticks = np.arange(0.5, 0.7, 0.05)
ax.set_ylim(0.4, 1)
ax.set_ylabel('AUC Score', fontsize=11)
ax.set_xlabel('Method', fontsize=11)

In [None]:
best_model3 = pickle.load(open('./models/outcome_combined_12months/0DTModel.sav', 'rb'))

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
sorted_idx = best_model3.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model3.feature_importances_[sorted_idx][-10:])
plt.xlabel("Decision Tree Feature Importance")
plt.show()

In [None]:
best_model3 = pickle.load(open('./models/outcome_combined_12months/0RFModel.sav', 'rb'))

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
sorted_idx = best_model3.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model3.feature_importances_[sorted_idx][-10:])
plt.xlabel("Random Forest Feature Importance")
plt.show()

In [None]:
best_model3 = pickle.load(open('./models/outcome_combined_12months/0XGBoostModel.sav', 'rb'))

# pd.DataFrame([best_model3.feature_importances_], columns=X.columns).T.sort_values(0, ascending=False)
sorted_idx = best_model3.feature_importances_.argsort()
plt.figure(figsize=(5,7))
plt.barh(X.columns[sorted_idx][-10:], best_model3.feature_importances_[sorted_idx][-10:])
plt.xlabel("XGBoost Feature Importance")
plt.show()