## Load libraries

In [1]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import joblib
import os
import shutil

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import StratifiedKFold

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model

from xgboost import XGBClassifier

from keras.layers import Dropout
from scikeras.wrappers import KerasClassifier




In [2]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [3]:
import sys
import os

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

In [16]:
from src import utils
from src.config import CONFIG

In [5]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Generate data 

In [21]:
# Check/Create path
for target in  CONFIG.outcomes:
    if target != "G_HADSscore":
        for n_features in CONFIG.n_features:
            path_cv_data = CONFIG.path_results+"/cv_data/"+target+"/" +  str(n_features) + "/"
            if os.path.exists(path_cv_data):
                shutil.rmtree(path_cv_data)
                os.makedirs(path_cv_data)
            else:
                os.makedirs(path_cv_data)

In [22]:
# ########### Load data ############
print('Loadind data ...')
df_data, df_codebook  = utils.read_data(CONFIG.path_dataset, CONFIG.path_codebook)
df_data = utils.data_preprocessing(df_data,df_codebook,CONFIG.non_relevant_vars, CONFIG.outcomes)

############ Set target ############
print('Target discretization ...')
X,y_4 = utils.set_target(df_data,CONFIG.outcomes,CONFIG.thr_disc_1,CONFIG.thr_disc_2)
y_3 = y_4.drop('G_HADSscore',axis=1)

print('Number of features =',len(X.columns))
print('--------------------------------------------------------')

print('Number of samples =',X.shape[0])
print('--------------------------------------------------------')

############ Data encoding ############
X, num_vars, categ_vars = utils.data_encoding(X,CONFIG.additional_categ_var)

print('Number of categorical features =',len(categ_vars))
print('--------------------------------------------------------')
print('Number of numerical features =',len(num_vars))
print('--------------------------------------------------------')

missing_data=X.isna().sum()
missing_data = missing_data[missing_data != 0]

print('Number of features with missing values =', len(missing_data))
print('Percentage of missing data = ', round(100*missing_data.sum()/(X.shape[0]*X.shape[1]),2))
print('--------------------------------------------------------')

cv_collection = []
index_scores = []

# Set outer cross-validation
for i in range(CONFIG.outer_cv):
    cv_t = StratifiedKFold(n_splits=CONFIG.inner_cv, shuffle=True)
    cv_collection.append(cv_t)

for count in range(len(cv_collection)):
    for cv_i, (train_index, test_index) in enumerate(cv_collection[count].split(X, y_3[y_3.columns[0]])):

        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train_3, y_test_3 = y_3.iloc[train_index,:], y_3.iloc[test_index,:]

        index_scores.append('BATCH: {} | CV: {} '.format(count + 1,cv_i+1))

        print('BATCH: {} | CV: {}'.format(count + 1,cv_i+1))
        print('--------------------------------------------------------')

        ############ Imputation of missing values ############
        print('Imputing missing values ...')
        X_train, X_test = utils.data_imputation(X_train, X_test,categ_vars)

        ############ One-hot-encoding ############
        print('One-hot-encoding ...')
        X_train, X_test = utils.data_one_hot_encoding(X_train, X_test, CONFIG.categ_nominal_var)

        ############ Scale data ############
        print('Scaling data ...')
        X_train_scaled, X_test_scaled = utils.data_scale(X_train, X_test)

        for target in y_3.columns:
            
            print('--------------------------------------------------------')
            print('target =', target)
            print('--------------------------------------------------------')

            y_train = pd.DataFrame(y_train_3[target])
            y_test = pd.DataFrame(y_test_3[target])

            # Save full data
            data_train_full = pd.concat([y_train,X_train], axis = 1)
            data_test_full = pd.concat([y_test,X_test], axis = 1)

            data_train_full_scaled = pd.concat([y_train,X_train_scaled], axis = 1)
            data_test_full_scaled = pd.concat([y_test,X_test_scaled], axis = 1)

            data = pd.concat([data_train_full,data_test_full],keys=['train','test'])
            data.to_csv(CONFIG.path_results+"/cv_data/"+target+ "/data_batch_"+str(count+1)+"_cv_"+str(cv_i+1)+".csv")

            data_scaled = pd.concat([data_train_full_scaled,data_test_full_scaled],keys=['train','test'])
            data_scaled.to_csv(CONFIG.path_results+"/cv_data/"+target + "/data_scaled_batch_"+str(count+1)+"_cv_"+str(cv_i+1)+".csv")

Loadind data ...
Target discretization ...
Number of features = 161
--------------------------------------------------------
Number of samples = 9291
--------------------------------------------------------
Number of categorical features = 114
--------------------------------------------------------
Number of numerical features = 47
--------------------------------------------------------
Number of features with missing values = 31
Percentage of missing data =  0.34
--------------------------------------------------------
BATCH: 1 | CV: 1
--------------------------------------------------------
Imputing missing values ...
One-hot-encoding ...
Scaling data ...
--------------------------------------------------------
target = G_depressionscore
--------------------------------------------------------
--------------------------------------------------------
target = G_anxietyscore
--------------------------------------------------------
-----------------------------------------------------

# Model training

In [18]:
# Check/Create path
for target in  CONFIG.outcomes:
    if target != "G_HADSscore":
        for n_features in CONFIG.n_features:
            path_cv_data = CONFIG.path_results+"/cv_data/"+target+"/" +  str(n_features) + "/"
            if os.path.exists(path_cv_data):
                pass
            else:
                os.makedirs(path_cv_data)

In [None]:
# Check/Create path
for out in  CONFIG.outcomes:
    if out != 'G_HADSscore':
        for n_features in  CONFIG.n_features:
            if os.path.exists(CONFIG.path_results+"models/"+out+"/"+model_name+"/"+str(n_features)+"/"):
                pass
            else:
                os.makedirs(CONFIG.path_results+"models/"+out+"/"+model_name+"/"+str(n_features)+"/")

In [19]:
# ML model
model_name = CONFIG.clf_name 
print("Model: " + model_name)

Model: XGBoost


In [13]:
'''
# Check/Create path
for out in  CONFIG.outcomes:
    if out != 'G_HADSscore':
        
        if os.path.exists(CONFIG.path_results+"results/"+out+"/"+model_name+"/"):
            shutil.rmtree(CONFIG.path_results+"results/"+out+"/"+model_name+"/")
            os.makedirs(CONFIG.path_results+"results/"+out+"/"+model_name+"/")
        else:
            os.makedirs(CONFIG.path_results+"results/"+out+"/"+model_name+"/")

        for n_features in  CONFIG.n_features:
            if os.path.exists(CONFIG.path_results+"models/"+out+"/"+model_name+"/"+str(n_features)+"/"):
                shutil.rmtree(CONFIG.path_results+"models/"+out+"/"+model_name+"/"+str(n_features)+"/")
                os.makedirs(CONFIG.path_results+"models/"+out+"/"+model_name+"/"+str(n_features)+"/")
            else:
                os.makedirs(CONFIG.path_results+"models/"+out+"/"+model_name+"/"+str(n_features)+"/")
'''

In [20]:
# MLP model keras implementation
def create_model(n_layers, learning_rate, l1, l2, act, dropout, n_features):                      
    '''This is a model generating function so that we can search over neural net 
    parameters and architecture'''
    
    opt = keras.optimizers.Adam(learning_rate=learning_rate)
    reg = keras.regularizers.l1_l2(l1=l1, l2=l2)
                                                    
    model = Sequential()
    
    # for the firt layer we need to specify the input dimensions
    first=True

    n_neurons = np.random.choice([25,50,100], size=n_layers)

    for i in range(n_layers):
        if first:
            model.add(Dense(n_neurons[i], input_dim=n_features, activation=act, kernel_regularizer=reg))
            first=False
        else: 
            model.add(Dense(n_neurons[i], activation=act, kernel_regularizer=reg))
        if dropout!=0:
            model.add(Dropout(dropout))     

    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=opt)
    
    return model

In [21]:
for batch in range(CONFIG.outer_cv):
    for cv in range(CONFIG.inner_cv):
        
        for target in CONFIG.outcomes:
            
            if target != 'G_HADSscore':
                
                print('********************************************************')
                print('********************************************************')
                print('                  target =', target)
                print('********************************************************')
                print('********************************************************')

                path_data_scaled = CONFIG.path_results+"/cv_data/"+target + "/data_scaled_batch_"+str(batch+1)+"_cv_"+str(cv+1)+".csv"
                data_scaled = pd.read_csv(path_data_scaled, index_col=[0,1])

                data_train_scaled = data_scaled.loc['train']
                y_train = data_train_scaled[target]
                X_train_scaled = data_train_scaled.drop([target],axis=1)

                data_test_scaled = data_scaled.loc['test']
                y_test = data_test_scaled[target]
                X_test_scaled = data_test_scaled.drop([target],axis=1)

                path_data = CONFIG.path_results+"/cv_data/"+target + "/data_batch_"+str(batch+1)+"_cv_"+str(cv+1)+".csv"
                data = pd.read_csv(path_data, index_col=[0,1])

                data_train = data.loc['train']
                y_train = data_train[target]
                X_train = data_train.drop([target],axis=1)

                data_test = data.loc['test']
                y_test = data_test[target]
                X_test = data_test.drop([target],axis=1)

                # Set inner CV
                cv_collection2 = []
                cv_t2 = StratifiedKFold(n_splits=5, shuffle=True)
                cv_collection2.append(cv_t2)
                cv_iter = []

                for count2 in range(len(cv_collection2)):
                    for cv_i2, (train_index, test_index) in enumerate(cv_collection2[count2].split(X_train_scaled, y_train)):
                        cv_iter.append((train_index, test_index))
                
                test_scores = []
                    
                for n_features in CONFIG.n_features:
                    print('[batch, cv, n_features] = [' + str(batch+1) + ", " + str(cv+1) + ", " + str(n_features) + "]")
                    print('Selecting features...')
                    X_train_red, X_train_scaled_red, X_test_red, X_test_scaled_red = utils.data_feature_selection(X_train, X_train_scaled, X_test, X_test_scaled, y_train, y_test, CONFIG.algorithm, n_features)

                    # Save full data
                    data_train_full = pd.concat([y_train,X_train_red], axis = 1)
                    data_test_full = pd.concat([y_test,X_test_red], axis = 1)

                    data_train_full_scaled = pd.concat([y_train,X_train_scaled_red], axis = 1)
                    data_test_full_scaled = pd.concat([y_test,X_test_scaled_red], axis = 1)

                    data = pd.concat([data_train_full,data_test_full],keys=['train','test'])
                    data.to_csv(CONFIG.path_results+"/cv_data/"+target+"/" + str(n_features) + "/data_batch_"+str(batch+1)+"_cv_"+str(cv+1)+".csv")

                    data_scaled = pd.concat([data_train_full_scaled,data_test_full_scaled],keys=['train','test'])
                    data_scaled.to_csv(CONFIG.path_results+"/cv_data/"+target+"/" + str(n_features) + "/data_scaled_batch_"+str(batch+1)+"_cv_"+str(cv+1)+".csv")

                    ############ Optimize model ############
                    print('Optimizing model... ')

                    # Choose model
                    if CONFIG.clf_name == "XGBoost":
                        clf = XGBClassifier(objective='multi:softprob',num_class=3,n_estimators = 50, learning_rate = 0.25, booster = 'gbtree', max_depth = 3)
                    elif CONFIG.clf_name == "Multi-Layer Perceptron":        
                        clf  = KerasClassifier(build_fn=create_model, n_features = 25, epochs=25, batch_size=20, verbose=0, n_layers=1 , learning_rate=0.01, l1=0.01, l2=0.01, act = 'relu', dropout=0)
                    elif CONFIG.clf_name == "Random Forest":
                        clf = RandomForestClassifier(n_estimators = 100)
                    elif CONFIG.clf_name == "Support Vector Machines":
                        clf = SVC(kernel="rbf", probability=True, max_iter = 5000)
                    elif CONFIG.clf_name == "Naive Bayes":
                        clf = GaussianNB()
                    elif CONFIG.clf_name == "Logistic Regression":
                        clf = linear_model.LogisticRegression(multi_class='ovr', solver='liblinear')
                    
                    best_model, best_params, best_score = utils.grid_search(X_train_red, X_train_scaled_red, y_train, clf, CONFIG.clf_name, target,CONFIG.n_iter,cv_iter,CONFIG.n_features)

                    # Save model
                    if CONFIG.clf_name == "Multi-Layer Perceptron":
                        best_model.model_.save(CONFIG.path_results+"models/"+target+"/"+model_name+"/"+str(n_features)+ "/model_batch_"+str(batch+1)+"_cv_"+str(cv+1)+".keras")
                    else:
                        joblib.dump(best_model, CONFIG.path_results+"models/"+target+"/"+model_name+"/"+str(n_features)+ "/model_batch_"+str(batch+1)+"_cv_"+str(cv+1)+".sav")
      
                    scores = []
                    au = np.where(best_score['rank_test_score']==1)[0]
                    for i in range(CONFIG.inner_cv):
                        split_name = 'split'+ str(i) + '_test_score'
                        au2 = best_score[split_name]
                        scores.append(np.abs(au2[au])[0])

                    test_scores.append(scores)
                    cv_se = np.std(scores)/np.sqrt(5) 

                    print("mean score train: ", np.mean(scores)," --- one-standard-error: ",cv_se)

                    if CONFIG.clf_name == "Multi-Layer Perceptron":
                        y_prob = best_model.predict(X_test_scaled_red)
                        y_pred = np.argmax(y_prob,axis=1)
                    else:
                        y_pred = best_model.predict(X_test_scaled_red)
                        y_prob = best_model.predict_proba(X_test_scaled_red)

                    # Print metrics
                    value2 = roc_auc_score(y_test, y_prob,multi_class="ovo",average = "macro")
                    #value2 = balanced_accuracy_score(y_test, y_pred)

                    
                    print("Score test: ", value2)
                    print('--------------------------------------------------------')
            
                # save inner-cv scores
                column_names = ['1','2','3','4','5']
                performance_report = pd.DataFrame(test_scores, columns= column_names, index=CONFIG.n_features)
                performance_report.to_csv(CONFIG.path_results +"results/"+target + "/" + model_name + "/scores"+str(batch+1)+"_cv_"+str(cv+1)+".csv")

********************************************************
********************************************************
                  target = G_depressionscore
********************************************************
********************************************************
[batch, cv, n_features] = [1, 1, 100]
Selecting features...
Optimizing model... 


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/gvillanueva/Desktop/Technician_2022_2023/Projects/Mental_health_COVID19/models/G_depressionscore/XGBoost/100/model_batch_1_cv_1.sav'