# ANALYSIS

In [None]:
# Uncomment the line below to install the imbalanced-learn (imblearn) package
# !pip install imbalanced-learn

In [1]:
import pandas as pd
import numpy as np
import sklearn

#statistics
from scipy.stats import chi2_contingency, ttest_ind

import cudf #gpu-powered DataFrame (Pandas alternative)

#imbalance handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, RepeatedEditedNearestNeighbours
from imblearn.pipeline import Pipeline

#preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler

#internal validation
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, PredefinedSplit

#performance metrics
from sklearn.metrics import confusion_matrix, classification_report, f1_score, balanced_accuracy_score, matthews_corrcoef, auc, average_precision_score, roc_auc_score, balanced_accuracy_score, roc_curve, accuracy_score

#Models selection
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from cuml.svm import SVC #gpu-powered SVM



#save and load trained model
import pickle

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter


In [2]:
# Data loader: read the cleaned feature and outcome tables.
# The first column of the features file is discarded; outcomes keeps every column.
features = pd.read_csv("../FinalData/cleaned_features_11072023.csv").iloc[:, 1:]
outcomes = pd.read_csv("../FinalData/cleaned_outcomes_11072023.csv")

  features = pd.read_csv("../FinalData/cleaned_features_11072023.csv")


In [3]:
# Report (rows, columns) of each loaded table, features first.
for loaded_table in (features, outcomes):
    print(loaded_table.shape)

(577962, 74)
(577962, 15)


In [4]:
# Display the column names of the features table.
features.columns

Index(['patid', 'practice_id', 'sex', 'age', 'BMI', 'weight', 'height',
       'ethnicity', 'ethnic_group', 'smokingStatus', 'CharlsonScore',
       'count_rhinitis', 'count_cardiovascular', 'count_heartfailure',
       'count_psoriasis', 'count_anaphylaxis', 'count_diabetes', 'count_ihd',
       'count_anxiety', 'count_eczema', 'count_nasalpolyps',
       'count_paracetamol', 'count_nsaids', 'count_betablocker', 'id',
       'event_date', 'recorded_date', 'visit_id', 'code_id', 'snomed_id',
       'numeric_1', 'numeric_2', 'created_datetime', 'updated_datetime',
       'PEFStatus', 'EosinophilLevel', 'BTS_step', 'average_daily_dose_ICS',
       'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio',
       'DeviceType', 'Spacer', 'numOCS', 'PriorEducation', 'numPCS',
       'numPCSAsthma', 'numAntibioticsEvents', 'numAntibioticswithLRTI',
       'numOCSEvents', 'numOCSwithLRTI', 'numAsthmaAttacks',
       'numAcuteRespEvents', 'numHospEvents', 'postcode_district',
       'imd_d

In [5]:
# Preview the first rows of the features table.
features.head()

Unnamed: 0,patid,practice_id,sex,age,BMI,weight,height,ethnicity,ethnic_group,smokingStatus,...,cat_numOCS,cat_numOCSEvents,cat_numOCSwithLRTI,cat_numAcuteRespEvents,cat_numAntibioticsEvents,cat_numAntibioticswithLRTI,cat_numAsthmaAttacks,cat_numHospEvents,cat_numPCS,cat_numPCSAsthma
0,43231452,39,0.0,48,26.609713,76.0,1.69,not_recorded,not_recorded,Active Smoker,...,0,0,0,0,0,0,0,0,>=3,1
1,43206365,39,1.0,58,23.94636,72.5,1.74,not_recorded,not_recorded,Former Smoker,...,0,0,0,0,0,0,0,0,1,1
2,43203606,559,0.0,51,17.104513,39.0,1.51,not_recorded,not_recorded,Active Smoker,...,1,1,1,1,1,1,1,0,1,1
3,43117348,502,0.0,69,35.303241,74.0,1.4478,not_recorded,not_recorded,Former Smoker,...,0,0,0,0,1,0,0,0,>=3,1
4,43289035,593,0.0,49,31.217482,75.0,1.55,White British,White - ethnic group,Active Smoker,...,0,0,0,0,0,0,0,0,>=3,1


In [6]:
# Preview the first rows of the outcomes table.
outcomes.head()

Unnamed: 0,patid,outcome_3months,outcome_6months,outcome_9months,outcome_12months,outcome_15months,outcome_18months,outcome_21months,outcome_24months,outcome_combined_6months,outcome_combined_9months,outcome_combined_12months,outcome_combined_15months,outcome_combined_18months,outcome_combined_24months
0,43231452,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,43206365,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,43203606,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,43117348,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,43289035,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Join the outcomes onto the features by patient id, keeping every feature row.
masterData = features.merge(outcomes, on='patid', how='left').reset_index(drop=True)

# Columns removed from the merged table before any further processing.
exclude_columns = [
    'weight', 'height', 'id', 'event_date', 'recorded_date', 'visit_id',
    'code_id', 'snomed_id', 'numeric_1', 'numeric_2',
    'created_datetime', 'updated_datetime',
]
masterData = masterData.drop(columns=masterData.columns.intersection(exclude_columns))
print('original data shape: ', masterData.shape)

original data shape:  (577962, 76)


In [8]:
# Raise the display row limit, then show the number of missing values per column.
pd.set_option('display.max_rows', 287)
masterData.isnull().sum()

patid                                 0
practice_id                           0
sex                                   0
age                                   0
BMI                                   0
ethnicity                             0
ethnic_group                          0
smokingStatus                         0
CharlsonScore                         0
count_rhinitis                        0
count_cardiovascular                  0
count_heartfailure                    0
count_psoriasis                       0
count_anaphylaxis                     0
count_diabetes                        0
count_ihd                             0
count_anxiety                         0
count_eczema                          0
count_nasalpolyps                     0
count_paracetamol                     0
count_nsaids                          0
count_betablocker                     0
PEFStatus                             0
EosinophilLevel                       0
BTS_step                              0


In [9]:
#Positive vs negative class ratio
#For every horizon, print (count of label 0) / (count of label 1), rounded to 2 decimals.

ratio_targets = [
    ('3', 'outcome_3months'),
    ('6', 'outcome_combined_6months'),
    ('9', 'outcome_combined_9months'),
    ('12', 'outcome_combined_12months'),
    ('15', 'outcome_combined_15months'),
    ('18', 'outcome_combined_18months'),
    ('24', 'outcome_combined_24months'),
]
for horizon, outcome_col in ratio_targets:
    label_counts = masterData[outcome_col].value_counts()
    print(horizon + ' months -> 1 : ', round(label_counts[0]/label_counts[1],2))


3 months -> 1 :  35.63
6 months -> 1 :  23.46
9 months -> 1 :  18.06
12 months -> 1 :  13.49
15 months -> 1 :  10.92
18 months -> 1 :  9.91
24 months -> 1 :  8.85


In [10]:
#Proportion of asthma attack in each outcome
#For every horizon, print the share of rows labelled 1 as a percentage (2 decimals).

proportion_targets = [
    ('3', 'outcome_3months'),
    ('6', 'outcome_combined_6months'),
    ('9', 'outcome_combined_9months'),
    ('12', 'outcome_combined_12months'),
    ('15', 'outcome_combined_15months'),
    ('18', 'outcome_combined_18months'),
    ('24', 'outcome_combined_24months'),
]
total_rows = len(masterData)
for horizon, outcome_col in proportion_targets:
    positive_rows = masterData[outcome_col].value_counts()[1]
    print(horizon + ' months -> ', round(positive_rows/total_rows*100,2), '%')

3 months ->  2.73 %
6 months ->  4.09 %
9 months ->  5.25 %
12 months ->  6.9 %
15 months ->  8.39 %
18 months ->  9.16 %
24 months ->  10.15 %


In [11]:
#Data scenario (descriptions match what each branch below actually does)
# 1: all data, all columns kept (ethnic_group included; unrecorded ethnicity stays as the 'not_recorded' group)
# 2: all data without the ethnic_group column
# 3: only rows with a recorded ethnicity (rows where ethnic_group == 'not_recorded' are removed)

scenario = 1 #change it based on the scenario

if scenario == 1:
    #include all data
    allData = masterData

elif scenario == 2:
    #Exclude ethnic column
    allData = masterData.drop('ethnic_group', axis=1)

elif scenario == 3:
    #exclude missing values for ethnic variable
    allData = masterData[masterData.ethnic_group!='not_recorded']

else:
    #fail loudly instead of silently reusing a stale (or undefined) allData
    raise ValueError('Unknown scenario: ' + str(scenario) + ' (expected 1, 2 or 3)')

#reset_index returns a new frame, so allData never aliases masterData after this line
allData = allData.reset_index(drop=True)
print('Data shape for scenario', str(scenario), allData.shape)



Data shape for scenario 1 (577962, 76)


In [12]:
#change sex column to binary numeric, flag intersex as NAs

def sexConverter (x):
    """Map a sex label to its numeric code.

    'Female' -> 0, 'Male' -> 1, 'Intersex' -> None (treated as missing).
    Any other value is returned unchanged.
    """
    label_codes = (('Female', 0), ('Male', 1), ('Intersex', None))
    for label, code in label_codes:
        if x == label:
            return code
    return x

#Convert the sex column element-wise; Series.map avoids building a row object for every record
allData['sex'] = allData['sex'].map(sexConverter)

#Intersex is mapped to a missing value by sexConverter, so the share of nulls is the intersex proportion
print('Intersex proportion: ', allData['sex'].isnull().sum()/allData.shape[0]*100, '%')
allData = allData.dropna(subset=['sex']) #exclude missing values (intersex)
allData = allData.reset_index(drop=True)
print('Data shape after excluding missing values in sex variable: ', allData.shape)


Intersex proportion:  0.0 %
Data shape after excluding missing values in sex variable:  (577962, 76)


In [13]:
#Split data into training and evaluation set based on the country. Only patients with age > 18 are kept.
#NOTE(review): the strict '>' drops patients aged exactly 18 although the intent was stated as "18+" - confirm whether '>=' was meant.

is_adult = allData.age > 18
from_england = allData.Country == 'England'
from_scotland_or_wales = allData.Country.isin(['Scotland', 'Wales'])

#Country is dropped from both subsets once the split is done
trainingData = allData[from_england & is_adult].drop(columns='Country').reset_index(drop=True)
evaluationData = allData[from_scotland_or_wales & is_adult].drop(columns='Country').reset_index(drop=True) #used for internal validation

print('Training data shape:', trainingData.shape)
print('Evaluation data shape: ', evaluationData.shape)

Training data shape: (546363, 75)
Evaluation data shape:  (21813, 75)


In [14]:
#Identify categorical and continuous variables from the dataset for preprocessing purpose

#describe() gives one row per numeric column after transposing; the 'max' statistic drives the grouping
summaryData = trainingData.describe().transpose()
column_max = summaryData['max']
excludeVars = column_max[column_max == 0].index.to_list() #maximum of 0 (all-zero for non-negative variables)
binaryVars = column_max[column_max == 1].index.to_list() #maximum of 1
#object-typed columns plus BTS_step, which is numeric but categorical
categoricalNonnumericVars = trainingData.select_dtypes(include=['object']).columns.to_list() + ['BTS_step']

In [26]:
#Define feature candidates

identifier_columns = ['patid', 'practice_id'] #identifier
replaced_columns = ['BMI', #use the categorical instead
                    'ethnicity', #use ethnic_group instead
                    'Spacer'] #all zero
#outcomes variable: 3..24 months in steps of 3, plus the combined horizons
outcome_columns = (['outcome_' + str(m) + 'months' for m in range(3, 25, 3)]
                   + ['outcome_combined_' + str(m) + 'months' for m in (6, 9, 12, 15, 18, 24)])
location_columns = ['postcode_district', 'County', 'LocalAuthority', 'OutputAreaClassification'] #location related variables, use IMD decile only
binned_columns = ['cat_age', 'cat_average_daily_dose_ICS', 'cat_prescribed_daily_dose_ICS',
                  'cat_ICS_medication_possesion_ratio', 'cat_numOCS', 'cat_numOCSEvents',
                  'cat_numOCSwithLRTI', 'cat_numAcuteRespEvents', 'cat_numAntibioticsEvents',
                  'cat_numAntibioticswithLRTI', 'cat_numAsthmaAttacks', 'cat_numHospEvents',
                  'cat_numPCS', 'cat_numPCSAsthma'] #use continous vars instead

all_columns = trainingData.columns.to_list()
#filter out commorbid count variables
#NOTE(review): this matches names containing '_count'; the comorbidity columns visible in this notebook
#are named 'count_<condition>', which this pattern does not match - confirm the intended pattern.
count_columns = [name for name in all_columns if '_count' in name]

exclude_columns = (identifier_columns + replaced_columns + outcome_columns
                   + location_columns + binned_columns + count_columns)
excluded_lookup = set(exclude_columns)
features_columns = [name for name in all_columns if name not in excluded_lookup]
print('Features size: ', len(features_columns))
print(features_columns)

Features size:  38
['sex', 'age', 'ethnic_group', 'smokingStatus', 'CharlsonScore', 'count_rhinitis', 'count_cardiovascular', 'count_heartfailure', 'count_psoriasis', 'count_anaphylaxis', 'count_diabetes', 'count_ihd', 'count_anxiety', 'count_eczema', 'count_nasalpolyps', 'count_paracetamol', 'count_nsaids', 'count_betablocker', 'PEFStatus', 'EosinophilLevel', 'BTS_step', 'average_daily_dose_ICS', 'prescribed_daily_dose_ICS', 'ICS_medication_possesion_ratio', 'DeviceType', 'numOCS', 'PriorEducation', 'numPCS', 'numPCSAsthma', 'numAntibioticsEvents', 'numAntibioticswithLRTI', 'numOCSEvents', 'numOCSwithLRTI', 'numAsthmaAttacks', 'numAcuteRespEvents', 'numHospEvents', 'imd_decile', 'cat_BMI']


In [None]:
#ONE HOT encoding for categorical data

# categoricalNonnumericVars = pd.Series(list(set(categoricalNonnumericVars).intersection(set(features_columns)))).tolist() #select only variables within the feature candidate list

# # define one hot encoder
# categoricalEncoder = OneHotEncoder(sparse=False)

# # transform data
# result = categoricalEncoder.fit_transform(trainingData[categoricalNonnumericVars]) 
# result = pd.DataFrame(result, columns=categoricalEncoder.get_feature_names_out())

# #save encoder
# pickle.dump(categoricalEncoder, open('./models/categoricalEncoder.pkl', 'wb'))

# # replace categorical variables in the original data with the one hot version
# trainingData = pd.concat([trainingData.loc[:, ~trainingData.columns.isin(categoricalNonnumericVars)],result], axis=1)
# print('Data shape after one-hot encoding: ', trainingData.shape)

In [None]:
# #Scaling continous variable into 0-1 range

# # summaryData = allData.describe().T
# continuous_vars = summaryData[summaryData['max'] >5].index.to_list() + ['numHospEvents'] #Num hospital events is continuous with 
# continuous_vars = pd.Series(list(set(continuous_vars).intersection(set(features_columns)))).tolist() #select only variables within the feature candidate list

# # define scaler
# scaler = MinMaxScaler()

# #save scaler
# pickle.dump(scaler, open('./models/scaler.pkl', 'wb'))


# # transform data
# result = scaler.fit_transform(trainingData[continuous_vars])
# result = pd.DataFrame(result, columns=scaler.get_feature_names_out())

# allData = pd.concat([trainingData.loc[:,~trainingData.columns.isin(continuous_vars)],result], axis=1)

# print('Data shape after scaling: ', trainingData.shape)

In [None]:
#ONE HOT encoding for evaluation dataset

# The encoder is fitted and pickled by the training-set encoding cell. When that cell has
# not been run in this session, load the pickled copy instead of failing with a NameError.
# NOTE: only load pickles produced by this project; unpickling untrusted files can run arbitrary code.
try:
    categoricalEncoder
except NameError:
    with open('./models/categoricalEncoder.pkl', 'rb') as encoder_file:
        categoricalEncoder = pickle.load(encoder_file)

# Encode exactly the columns the encoder was fitted on, so the evaluation set cannot drift
# from the training-time column selection.
encodedVars = list(categoricalEncoder.feature_names_in_)

# transform data
result = categoricalEncoder.transform(evaluationData[encodedVars])
# reuse the evaluation index so the column-wise concat below aligns row for row
result = pd.DataFrame(result, columns=categoricalEncoder.get_feature_names_out(), index=evaluationData.index)

# replace categorical variables in the original data with the one hot version
evaluationData = pd.concat([evaluationData.loc[:, ~evaluationData.columns.isin(encodedVars)],result], axis=1)
print('Data shape after one-hot encoding: ', evaluationData.shape)

In [None]:
# #Scaling continous variable into 0-1 range for evaluation dataset


# # transform data
# result = scaler.transform(evaluationData[continuous_vars])
# result = pd.DataFrame(result, columns=scaler.get_feature_names_out())

# evaluationData = pd.concat([evaluationData.loc[:,~evaluationData.columns.isin(continuous_vars)],result], axis=1)

# print('Data shape after scaling: ', evaluationData.shape)

# UTILS

In [None]:
#Model evaluation function

def _positive_class_scores(model, testX, preds):
    """Return continuous positive-class scores for ranking metrics.

    Tries ``predict_proba`` first, then ``decision_function``; if the model offers
    neither (or refuses to compute them), the hard predictions are returned.
    """
    for method_name in ('predict_proba', 'decision_function'):
        scorer = getattr(model, method_name, None)
        if scorer is None:
            continue
        try:
            scores = np.asarray(scorer(testX))
        except (AttributeError, NotImplementedError):
            # e.g. an SVM that was not fitted with probability estimates enabled
            continue
        # two-column probability output: the last column belongs to the positive class
        return scores[:, -1] if scores.ndim == 2 else scores
    return preds


def summariseResult (testX, testY, model):
    """Evaluate a fitted binary classifier on a held-out set.

    Parameters
    ----------
    testX : feature matrix accepted by ``model.predict``.
    testY : true binary labels (0 = negative, 1 = positive).
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    tuple of 9 values rounded to 4 decimals, in this order:
    accuracy, specificity, sensitivity, ROC AUC, average precision (AUPRC),
    balanced accuracy, F1, PPV, NPV.
    PPV and NPV are percentages (0-100); every other value is a fraction (0-1).
    """
    preds = model.predict(testX)
    # ROC AUC / AUPRC are ranking metrics: computing them from hard 0/1 labels collapses
    # the ROC curve to a single point (AUC then equals balanced accuracy).
    scores = _positive_class_scores(model, testX, preds)

    # labels=[0, 1] keeps the matrix 2x2 even if only one class shows up in this fold
    tn, fp, fn, tp = confusion_matrix(testY, preds, labels=[0, 1]).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp+fn)
    ppv = 100*tp/(tp+fp)
    npv = 100*tn/(fn+tn)
    acc = accuracy_score(testY, preds)
    f1score = f1_score(testY, preds, average = 'binary')
    balanceacc = balanced_accuracy_score(testY, preds)
    fpr, tpr, thresholds = roc_curve(testY, scores, pos_label=1)
    aucscore = auc(fpr, tpr)
    auprc = average_precision_score(testY, scores)
    return np.round(acc,4), np.round(specificity,4), np.round(sensitivity,4), np.round(aucscore,4), np.round(auprc,4), np.round(balanceacc,4), np.round(f1score,4), np.round(ppv,4), np.round(npv,4)

In [None]:
#Fix model name for visualisation

def modelNameFixer(x):
    """Translate an estimator's string representation into a short display name.

    Parameters
    ----------
    x : str
        Either ``str(model)`` of a fitted estimator or an already shortened name.

    Returns
    -------
    str
        One of 'Lasso', 'GNB', 'SVC', 'RF', 'XGBoost', 'DT' or 'LR'.
        Anything unrecognised is reported as 'LR' (plain logistic regression).
    """
    # Already-shortened names pass through unchanged, so re-running the renaming cell
    # does not turn 'Lasso', 'GNB', 'RF' or 'DT' into 'LR'.
    if x in ('Lasso', 'GNB', 'SVC', 'RF', 'XGBoost', 'DT', 'LR'):
        return x

    # Order matters: the L1 logistic regression is recognised by its 'liblinear' solver.
    markers = (
        ('liblinear', 'Lasso'),
        ('GaussianNB', 'GNB'),
        ('SVC', 'SVC'),
        ('RandomForest', 'RF'),
        ('XGB', 'XGBoost'),
        ('DecisionTree', 'DT'),
    )
    for marker, short_name in markers:
        if marker in x:
            return short_name
    return 'LR'

In [None]:
#Define number of split in k-fold
#Passed as n_splits to StratifiedKFold in every training cell below.

n_splits = 10

In [None]:
# instantiate the model (using the default parameters)
def build_models (X_train, y_train, params, split_counter):
    """Fit every candidate classifier on one training fold and pickle each of them.

    Parameters
    ----------
    X_train : training features for this fold.
    y_train : pandas Series of binary labels (0/1) for this fold.
    params : dict providing 'xgb_maxdepth' and 'xgb_lr' for the XGBoost model.
    split_counter : fold number, used as the prefix of every saved model name.

    Returns
    -------
    list of [modelname, class_ratio] pairs in training order, where class_ratio is
    (count of label 1) / (count of label 0) in ``y_train``.

    Notes
    -----
    Reads the module-level ``target_outcome`` to choose the output folder
    ``./models/<target_outcome>/`` (created if missing). Each model is written to
    ``<split_counter><Name>Model.sav`` inside it.
    """
    import os

    print("Building models . . . .")

    # class counts are needed for the reported ratio and for XGBoost's scale_pos_weight
    class_counts = y_train.value_counts()
    class_ratio = class_counts[1]/class_counts[0]
    scale_pos_ratio = class_counts[0]/class_counts[1]

    output_dir = './models/'+ target_outcome + '/'
    os.makedirs(output_dir, exist_ok=True)

    # (file-name suffix, progress label, unfitted estimator) in training order
    candidates = [
        ('LRModel', 'LR',
         LogisticRegression(class_weight='balanced', penalty='l2', random_state=1234)),
        # only the LIBLINEAR and SAGA (added in v0.19) solvers handle the L1 penalty
        ('LassoModel', 'Lasso',
         LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear', random_state=1234)),
        ('GNBModel', 'GNB',
         GaussianNB()),
        ('SVCModel', 'SVM',
         SVC(class_weight='balanced', C = 0.7, degree=2, kernel='poly', random_state=1234, cache_size=2048)),
        ('DTModel', 'DT',
         DecisionTreeClassifier(class_weight='balanced', random_state=1234)),
        ('RFModel', 'RF',
         RandomForestClassifier(class_weight='balanced', n_estimators=500, random_state=1234)),
        ('XGBoostModel', 'XGB',
         xgb.XGBClassifier(objective ='binary:logistic', max_depth = params['xgb_maxdepth'], n_estimators = 2000,
                           tree_method='gpu_hist', gpu_id=0,  verbosity = 0, random_state = 1234,
                           importance_type = 'gain', scale_pos_weight = scale_pos_ratio,
                           use_label_encoder=False, learning_rate=params['xgb_lr'])),
    ]

    models = [] #list to store all the models
    for suffix, label, estimator in candidates:
        estimator.fit(X_train, y_train)
        modelname = str(split_counter) + suffix
        # the with-block closes the file even if pickling fails
        with open(output_dir + modelname + '.sav', 'wb') as model_file:
            pickle.dump(estimator, model_file)
        models.append([modelname, class_ratio])
        print(label + " done")

    return models
    # return [xgb_model]

# 3 months

In [None]:
#Create X set for model development

target_outcome = 'outcome_3months'
X = trainingData.loc[:, features_columns]
y = trainingData.loc[:, [target_outcome]] #single-column DataFrame
for set_name, frame in (('X', X), ('y', y)):
    print(set_name + ' shape: ', frame.shape)

#model parameters (XGBoost learning rate and tree depth read by build_models)
params = dict(xgb_lr=0.6, xgb_maxdepth=7)

In [None]:
%%time

#EXECUTE model training
#Each model is scored on the held-out fold of the split it was trained on. Scoring every
#fold's models on the last fold's test rows would leak training rows into the evaluation.

kf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
cols = ['model_name', 'class_ratio', 'acc','spec','sens','auc', 'auprc', 'balance_accuracy', 'f1_score', 'ppv', 'npv']
fold_model_tables = [] #one (modelname, class_ratio) table per fold
summary_rows = [] #one metrics tuple per (fold, model)

for split_counter, (train_index, test_index) in enumerate(kf.split(X, y)):
    #split data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    #Build models -> it can be commented if the models have been trained
    models_temp = pd.DataFrame(build_models(X_train, y_train[target_outcome], params, split_counter), columns=['modelname', 'class_ratio'])
    fold_model_tables.append(models_temp)

    #evaluate this fold's models on this fold's held-out rows
    for modelname, classratio in models_temp.values:
        print(modelname)
        with open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'rb') as model_file:
            model = pickle.load(model_file)
        summary_rows.append((str(model), classratio, ) + summariseResult (X_test, y_test[target_outcome], model) )

#assemble the per-fold tables once instead of growing a DataFrame inside the loop
models1 = pd.concat(fold_model_tables).reset_index(drop=True)
summary_result1 = pd.DataFrame(summary_rows, columns=cols)
summary_result1['model_num'] = summary_result1.index



In [None]:
#Shorten each estimator repr to its display name, then rank methods by their mean AUC
print(target_outcome)
summary_result1['model_name'] = summary_result1['model_name'].map(modelNameFixer)
summary_result1.groupby('model_name').mean().sort_values(by='auc', ascending=False)

In [None]:
#Persist the per-model metrics, then reload them from disk for plotting
results_csv = "summaryResult_outcome1.csv"
summary_result1.to_csv(results_csv)
summary_result1 = pd.read_csv(results_csv)

#One bar per method showing its AUC, without error bars
bar = sns.catplot(data=summary_result1, x="model_name", y="auc", kind="bar",
                  height=5, aspect=5/2.5, ci=None)
ax = bar.facet_axis(0,0)

#Write each bar's value just above the bar
label_style = dict(color='black', rotation='horizontal', fontsize=11)
for patch in ax.patches:
    bar_height = patch.get_height()
    ax.text(patch.get_x() + 0.01, bar_height * 1.01, f'{bar_height:.4f}', **label_style)

ax.set_ylim(0.4, 1)
ax.set_ylabel('AUC Score', fontsize=11)
ax.set_xlabel('Method', fontsize=11)

In [None]:
# kf = StratifiedKFold(n_splits=2, random_state=1234, shuffle=True)
# kf.get_n_splits(X)
# for train_index, test_index in kf.split(X, y):
#     #split data
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#     trymodel = SVC(class_weight='balanced', C = 0.7, degree=2, kernel='poly', random_state=1234, cache_size=2048)
#     trymodel.fit(X_train,y_train)
#     print(summariseResult(X_test, y_test, trymodel))


In [None]:
#Load the fold-0 decision tree saved for the 3-month outcome; the with-block closes the file handle
with open('./models/outcome_3months/0DTModel.sav', 'rb') as model_file:
    best_model1 = pickle.load(model_file)

#Plot the ten largest feature importances
importances = best_model1.feature_importances_
sorted_idx = importances.argsort()
top_idx = sorted_idx[-10:]
plt.figure(figsize=(5,7))
plt.barh(X.columns[top_idx], importances[top_idx])
plt.xlabel("Decision Tree Feature Importance")
plt.show()

In [None]:
#Load the fold-0 random forest saved for the 3-month outcome; the with-block closes the file handle
with open('./models/outcome_3months/0RFModel.sav', 'rb') as model_file:
    best_model1 = pickle.load(model_file)

#Plot the ten largest feature importances
importances = best_model1.feature_importances_
sorted_idx = importances.argsort()
top_idx = sorted_idx[-10:]
plt.figure(figsize=(5,7))
plt.barh(X.columns[top_idx], importances[top_idx])
plt.xlabel("Random Forest Feature Importance")
plt.show()

In [None]:
#Load the fold-0 XGBoost model saved for the 3-month outcome; the with-block closes the file handle
with open('./models/outcome_3months/0XGBoostModel.sav', 'rb') as model_file:
    best_model1 = pickle.load(model_file)

#Plot the ten largest feature importances
importances = best_model1.feature_importances_
sorted_idx = importances.argsort()
top_idx = sorted_idx[-10:]
plt.figure(figsize=(5,7))
plt.barh(X.columns[top_idx], importances[top_idx])
plt.xlabel("XGBoost Feature Importance")
plt.show()

# 6 months

In [None]:
#Switch the prediction target; this cell does not reassign X
target_outcome = 'outcome_combined_6months'
y = trainingData.loc[:, [target_outcome]] #single-column DataFrame

#model parameters (XGBoost learning rate and tree depth read by build_models)
params = dict(xgb_lr=0.6, xgb_maxdepth=7)

In [None]:
%%time

#EXECUTE model training
#Each model is scored on the held-out fold of the split it was trained on. Scoring every
#fold's models on the last fold's test rows would leak training rows into the evaluation.

kf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
cols = ['model_name', 'class_ratio', 'acc','spec','sens','auc', 'auprc', 'balance_accuracy', 'f1_score', 'ppv', 'npv']
fold_model_tables = [] #one (modelname, class_ratio) table per fold
summary_rows = [] #one metrics tuple per (fold, model)

for split_counter, (train_index, test_index) in enumerate(kf.split(X, y)):
    #split data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    #Build models -> it can be commented if the models have been trained
    models_temp = pd.DataFrame(build_models(X_train, y_train[target_outcome], params, split_counter), columns=['modelname', 'class_ratio'])
    fold_model_tables.append(models_temp)

    #evaluate this fold's models on this fold's held-out rows
    for modelname, classratio in models_temp.values:
        print(modelname)
        with open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'rb') as model_file:
            model = pickle.load(model_file)
        summary_rows.append((str(model), classratio, ) + summariseResult (X_test, y_test[target_outcome], model) )

#assemble the per-fold tables once instead of growing a DataFrame inside the loop
models2 = pd.concat(fold_model_tables).reset_index(drop=True)
summary_result2 = pd.DataFrame(summary_rows, columns=cols)
summary_result2['model_num'] = summary_result2.index
# summary_result1['method_name'] = summary_result1.apply(lambda x: 'LR' if x.model_num%2 == 0 else 'XGBoost', axis=1)


In [None]:
#Shorten each estimator repr to its display name, then rank methods by their mean AUC
print(target_outcome)
summary_result2['model_name'] = summary_result2['model_name'].map(modelNameFixer)
summary_result2.groupby('model_name').mean().sort_values(by='auc', ascending=False)

In [None]:
#Persist the per-model metrics, then reload them from disk for plotting
results_csv = "summaryResult_outcome2.csv"
summary_result2.to_csv(results_csv)
summary_result2 = pd.read_csv(results_csv)

#One bar per method showing its AUC, without error bars
bar = sns.catplot(data=summary_result2, x="model_name", y="auc", kind="bar",
                  height=5, aspect=5/2.5, ci=None)
ax = bar.facet_axis(0,0)

#Write each bar's value just above the bar
label_style = dict(color='black', rotation='horizontal', fontsize=11)
for patch in ax.patches:
    bar_height = patch.get_height()
    ax.text(patch.get_x() + 0.01, bar_height * 1.01, f'{bar_height:.4f}', **label_style)

ax.set_ylim(0.4, 1)
ax.set_ylabel('AUC Score', fontsize=11)
ax.set_xlabel('Method', fontsize=11)

In [None]:
#Load the fold-0 decision tree saved for the 6-month outcome; the with-block closes the file handle
with open('./models/outcome_combined_6months/0DTModel.sav', 'rb') as model_file:
    best_model2 = pickle.load(model_file)

#Plot the ten largest feature importances
importances = best_model2.feature_importances_
sorted_idx = importances.argsort()
top_idx = sorted_idx[-10:]
plt.figure(figsize=(5,7))
plt.barh(X.columns[top_idx], importances[top_idx])
plt.xlabel("Decision Tree Feature Importance")
plt.show()

In [None]:
#Load the fold-0 random forest saved for the 6-month outcome; the with-block closes the file handle
with open('./models/outcome_combined_6months/0RFModel.sav', 'rb') as model_file:
    best_model2 = pickle.load(model_file)

#Plot the ten largest feature importances
importances = best_model2.feature_importances_
sorted_idx = importances.argsort()
top_idx = sorted_idx[-10:]
plt.figure(figsize=(5,7))
plt.barh(X.columns[top_idx], importances[top_idx])
plt.xlabel("Random Forest Feature Importance")
plt.show()

In [None]:
#Load the fold-0 XGBoost model saved for the 6-month outcome; the with-block closes the file handle
with open('./models/outcome_combined_6months/0XGBoostModel.sav', 'rb') as model_file:
    best_model2 = pickle.load(model_file)

#Plot the ten largest feature importances
importances = best_model2.feature_importances_
sorted_idx = importances.argsort()
top_idx = sorted_idx[-10:]
plt.figure(figsize=(5,7))
plt.barh(X.columns[top_idx], importances[top_idx])
plt.xlabel("XGBoost Feature Importance")
plt.show()

# 12 months

In [None]:
#Switch the prediction target; this cell does not reassign X
target_outcome = 'outcome_combined_12months'
y = trainingData.loc[:, [target_outcome]] #single-column DataFrame

#model parameters (XGBoost learning rate and tree depth read by build_models)
params = dict(xgb_lr=0.6, xgb_maxdepth=10)

In [None]:
%%time

#EXECUTE model training
#Each model is scored on the held-out fold of the split it was trained on. Scoring every
#fold's models on the last fold's test rows would leak training rows into the evaluation.

kf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
cols = ['model_name', 'class_ratio', 'acc','spec','sens','auc', 'auprc', 'balance_accuracy', 'f1_score', 'ppv', 'npv']
fold_model_tables = [] #one (modelname, class_ratio) table per fold
summary_rows = [] #one metrics tuple per (fold, model)

for split_counter, (train_index, test_index) in enumerate(kf.split(X, y)):
    #split data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    #Build models -> it can be commented if the models have been trained
    models_temp = pd.DataFrame(build_models(X_train, y_train[target_outcome], params, split_counter), columns=['modelname', 'class_ratio'])
    fold_model_tables.append(models_temp)

    #evaluate this fold's models on this fold's held-out rows
    for modelname, classratio in models_temp.values:
        print(modelname)
        with open('./models/'+ target_outcome + '/'+ modelname + '.sav', 'rb') as model_file:
            model = pickle.load(model_file)
        summary_rows.append((str(model), classratio, ) + summariseResult (X_test, y_test[target_outcome], model) )

#assemble the per-fold tables once instead of growing a DataFrame inside the loop
models3 = pd.concat(fold_model_tables).reset_index(drop=True)
summary_result3 = pd.DataFrame(summary_rows, columns=cols)
summary_result3['model_num'] = summary_result3.index
# summary_result1['method_name'] = summary_result1.apply(lambda x: 'LR' if x.model_num%2 == 0 else 'XGBoost', axis=1)


In [None]:
#Shorten each estimator repr to its display name, then rank methods by their mean AUC
print(target_outcome)
summary_result3['model_name'] = summary_result3['model_name'].map(modelNameFixer)
summary_result3.groupby('model_name').mean().sort_values(by='auc', ascending=False)

In [None]:
#Persist the per-model metrics, then reload them from disk for plotting
results_csv = "summaryResult_outcome3.csv"
summary_result3.to_csv(results_csv)
summary_result3 = pd.read_csv(results_csv)

#One bar per method showing its AUC, without error bars
bar = sns.catplot(data=summary_result3, x="model_name", y="auc", kind="bar",
                  height=5, aspect=5/2.5, ci=None)
ax = bar.facet_axis(0,0)

#Write each bar's value just above the bar
label_style = dict(color='black', rotation='horizontal', fontsize=11)
for patch in ax.patches:
    bar_height = patch.get_height()
    ax.text(patch.get_x() + 0.01, bar_height * 1.01, f'{bar_height:.4f}', **label_style)

ax.set_ylim(0.4, 1)
ax.set_ylabel('AUC Score', fontsize=11)
ax.set_xlabel('Method', fontsize=11)

In [None]:
#Load the fold-0 decision tree saved for the 12-month outcome; the with-block closes the file handle
with open('./models/outcome_combined_12months/0DTModel.sav', 'rb') as model_file:
    best_model3 = pickle.load(model_file)

#Plot the ten largest feature importances
importances = best_model3.feature_importances_
sorted_idx = importances.argsort()
top_idx = sorted_idx[-10:]
plt.figure(figsize=(5,7))
plt.barh(X.columns[top_idx], importances[top_idx])
plt.xlabel("Decision Tree Feature Importance")
plt.show()

In [None]:
#Load the fold-0 random forest saved for the 12-month outcome; the with-block closes the file handle
with open('./models/outcome_combined_12months/0RFModel.sav', 'rb') as model_file:
    best_model3 = pickle.load(model_file)

#Plot the ten largest feature importances
importances = best_model3.feature_importances_
sorted_idx = importances.argsort()
top_idx = sorted_idx[-10:]
plt.figure(figsize=(5,7))
plt.barh(X.columns[top_idx], importances[top_idx])
plt.xlabel("Random Forest Feature Importance")
plt.show()

In [None]:
#Load the fold-0 XGBoost model saved for the 12-month outcome; the with-block closes the file handle
with open('./models/outcome_combined_12months/0XGBoostModel.sav', 'rb') as model_file:
    best_model3 = pickle.load(model_file)

#Plot the ten largest feature importances
importances = best_model3.feature_importances_
sorted_idx = importances.argsort()
top_idx = sorted_idx[-10:]
plt.figure(figsize=(5,7))
plt.barh(X.columns[top_idx], importances[top_idx])
plt.xlabel("XGBoost Feature Importance")
plt.show()