<a href="https://colab.research.google.com/github/brunofmf/Datasets4SocialGood/blob/master/Personality_Processes_GradientBoostedTrees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 18 16:21:35 2020

@author: brunofmf
"""

import json
import os
import time

import joblib
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from xgboost import XGBRegressor, XGBClassifier

In [0]:
'''
#####################################################
#### Random seeds definition ########################
#####################################################
'''

#seed NumPy's global random generator so that repeated runs draw the same random numbers
#NOTE(review): presumably this is what makes the shuffled KFold splits and the randomized
#search (both created without an explicit random_state) reproducible - confirm
np.random.seed(91190530)
#alternative seed (disabled)
#np.random.seed(95191227)

In [0]:
'''
#####################################################
#### File System Interaction ########################
#####################################################
'''

def enable_save_to_drive():
    """Mount Google Drive at '/content/gdrive' so files can be written to it."""
    #function-scope import; presumably because google.colab only exists on a Colab runtime
    from google.colab import drive as gdrive
    mount_point = '/content/gdrive'
    gdrive.mount(mount_point)
        
def read_dataset(with_da, colab):
    """
    Load the personality dataset into a DataFrame.

    On Colab, Google Drive is mounted first (so results can be saved to it later)
    and the CSV comes from the interactive upload dialog; with_da is not used in
    that case, the uploaded file is read as-is. Otherwise one of the two prepared
    CSVs under 'prepared_datasets/' is read from the local file system.

    Parameters:
        with_da: if true, read the data-augmented CSV (local mode only)
        colab: if true, mount Drive and ask the user to upload the CSV

    Returns:
        pandas DataFrame with the contents of the CSV

    Raises:
        ValueError: if the Colab upload dialog returned no file
    """
    if not colab:
        file_name = ('prepared_datasets/personality_final_WithDa_20200321.csv' if with_da else 'prepared_datasets/personality_final_NoDa_20200321.csv')
        return pd.read_csv(file_name)
    #give permission to save to drive
    enable_save_to_drive()
    #load dataset
    from google.colab import files
    import io
    uploaded = files.upload()
    #without this guard an empty upload failed later with an unrelated unbound-variable error
    if not uploaded:
        raise ValueError('No file was uploaded; cannot load the dataset')
    for fn, content in uploaded.items():
        print('Uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(content)))
    #if several files were uploaded, the last one is the one that gets read
    return pd.read_csv(io.BytesIO(uploaded[fn]))

def save_file(name, results, trial, iteration, architecture, with_da, colab, testing=False):
    """
    Write results as JSON to a timestamped .txt file in the Experiments folder.

    The file name is '<name>_Architecture<architecture>_<with_da>_<timestamp>.txt'
    when testing is true, otherwise the trial and iteration numbers are inserted
    before the timestamp ('..._Trial<trial>_Iteration<iteration>_<timestamp>.txt').

    Parameters:
        name: prefix of the file name
        results: JSON-serialisable object to store
        trial, iteration: numbers embedded in the file name (ignored when testing)
        architecture: architecture identifier embedded in the file name
        with_da: data-augmentation tag embedded in the file name
        colab: if true, save under '/content/gdrive/My Drive/Experiments/',
            otherwise under the relative 'Experiments/' folder
        testing: selects the shorter file-name pattern
    """
    stamp = time.strftime("%Y%m%d%H%M")
    prefix = f'{name}_Architecture{architecture}_{with_da}'
    if testing:
        filename = f'{prefix}_{stamp}.txt'
    else:
        filename = f'{prefix}_Trial{trial}_Iteration{iteration}_{stamp}.txt'
    folder = '/content/gdrive/My Drive/Experiments' if colab else 'Experiments'
    #create the output folder on first use instead of failing after a long run;
    #only the leaf is created, so a missing parent (e.g. Drive not mounted) still fails loudly
    try:
        os.mkdir(folder)
    except FileExistsError:
        pass
    filepath = folder + '/' + filename
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(results, f)
        
def save_best_estimator(best_estimator, architecture, with_da, colab):
    """
    Persist the best estimator found using nested-CV with joblib.

    The file is named 'BestModel_Architecture<architecture>_<with_da>_<timestamp>.pkl'.

    Parameters:
        best_estimator: object to dump
        architecture: architecture identifier embedded in the file name
        with_da: data-augmentation tag embedded in the file name
        colab: if true, save under '/content/gdrive/My Drive/Experiments/',
            otherwise under the relative 'Experiments/' folder
    """
    stamp = time.strftime("%Y%m%d%H%M")
    filename = f'BestModel_Architecture{architecture}_{with_da}_{stamp}.pkl'
    folder = '/content/gdrive/My Drive/Experiments' if colab else 'Experiments'
    #create the output folder on first use so the trained model is not lost to a missing directory;
    #only the leaf is created, so a missing parent (e.g. Drive not mounted) still fails loudly
    try:
        os.mkdir(folder)
    except FileExistsError:
        pass
    joblib.dump(best_estimator, folder + '/' + filename)
      
def save_csv(df, architecture, with_da, colab, save_index=False):
    """
    Save a DataFrame as CSV (in particular, the features' importance table).

    The file is named
    'FeaturesImportance_Architecture<architecture>_<with_da>_<timestamp>.csv'.

    Parameters:
        df: DataFrame to write
        architecture: architecture identifier embedded in the file name
        with_da: data-augmentation tag embedded in the file name
        colab: if true, save under '/content/gdrive/My Drive/Experiments/',
            otherwise under the relative 'Experiments/' folder
        save_index: whether the DataFrame index is written as a column
    """
    stamp = time.strftime("%Y%m%d%H%M")
    filename = f'FeaturesImportance_Architecture{architecture}_{with_da}_{stamp}.csv'
    folder = '/content/gdrive/My Drive/Experiments' if colab else 'Experiments'
    #create the output folder on first use instead of failing at the very end of a run;
    #only the leaf is created, so a missing parent (e.g. Drive not mounted) still fails loudly
    try:
        os.mkdir(folder)
    except FileExistsError:
        pass
    df.to_csv(folder + '/' + filename, index=save_index)

In [0]:
'''
#####################################################
#### Data Definition ################################
#####################################################
'''

def tuning_dictionary():
    """
    Return the hyperparameter search space.

    Every key carries the 'estimator__' prefix, i.e. it addresses a parameter of
    the estimator wrapped by the multi-output model rather than the wrapper itself.
    """
    search_space = {
        'n_estimators': [400, 500, 600],
        'max_depth': [4, 8, 12],
        'eta': [0.02, 0.05, 0.1], #aka learning rate
        'gamma': [0.02, 0.04, 0.08],
        'min_child_weight': [4, 6, 8],
        'colsample_bytree': [0.3, 0.4],
    }
    return {'estimator__' + param: candidates for param, candidates in search_space.items()}

def split_x_y(df, architecture, testing=False):
    """
    Split a dataframe into features X and targets y for the given architecture.

    Architecture 1 (regression) keeps the '*_recalculated' trait columns as targets
    and discards the '*_binned' ones; any other architecture (classification) does
    the opposite. The discarded group is the label slice running from the
    'extraversion_*' column to the 'openess_*' column. Every remaining column whose
    name contains neither 'recalculated' nor 'binned' is a feature.

    Parameters:
        df: input DataFrame; it is not modified
        architecture: 1 for regression targets, anything else for binned targets
        testing: if true, only the last 50 rows are returned

    Returns:
        (X, y)
    """
    discarded = 'binned' if architecture == 1 else 'recalculated'
    discarded_cols = df.loc[:, 'extraversion_' + discarded:'openess_' + discarded].columns
    frame = df.drop(columns=discarded_cols)
    #True for the feature columns, False for the remaining trait (target) columns
    is_feature = np.array(['recalculated' not in name and 'binned' not in name for name in frame.columns])
    #when testing, the best estimator is tried on the last 50 rows only
    rows = frame.iloc[-50:] if testing else frame
    return rows.loc[:, is_feature], rows.loc[:, ~is_feature]

In [0]:
'''
#####################################################
#### Modelling and Fit ##############################
#####################################################
'''

def build_model(architecture):
    """
    Dynamically build the gradient boosted multi-output estimator.

    Parameters:
        architecture: 1 builds an XGBRegressor wrapped in MultiOutputRegressor;
            anything else builds a 3-class XGBClassifier wrapped in MultiOutputClassifier

    Returns:
        the multi-output wrapper, which fits one regressor/classifier per target column
    """
    #'verbosity' is the XGBoost parameter name; the former 'verbose' keyword is not a
    #recognised parameter and was only forwarded as an unused extra argument
    if architecture == 1:
        estimator = XGBRegressor(
            booster='gbtree',
            objective='reg:squarederror',
            eval_metric='rmse',
            #tree_method='gpu_hist',
            verbosity=1)
        return MultiOutputRegressor(estimator)
    estimator = XGBClassifier(
        booster='gbtree',
        objective='multi:softmax',
        num_class=3,
        eval_metric='auc', #auc not gpu supported (https://xgboost.readthedocs.io/en/latest/gpu/index.html)
        #tree_method='gpu_hist',
        verbosity=1)
    return MultiOutputClassifier(estimator)

def find_best_model(model, X, y, param_grid, num_trials=2, outer_k_folds=2, inner_k_folds=3, num_iter=200, scoring='neg_root_mean_squared_error', architecture=1, with_da='no_da'):
    """
    Fit the multi-output regressor/classifier using nested cross-validation.

    For each of num_trials trials, X/y are split with a shuffled outer KFold. On
    every outer training split a RandomizedSearchCV (num_iter sampled settings,
    shuffled inner KFold) is fitted and then scored on the outer test split.
    After each trial two files are written through save_file: 'Personality' with
    the per-fold summary and 'CV' with the stringified cv_results_. Once all trials
    are done, the feature importances of the overall best estimator are written
    through save_csv (one column per target, one row per feature).

    The module-level COLAB flag decides where those files are saved.

    Parameters:
        model: multi-output estimator to tune (must expose estimators_ once fitted)
        X, y: feature and target DataFrames
        param_grid: parameter distributions handed to RandomizedSearchCV
        num_trials: how many times the whole nested CV is repeated
        outer_k_folds, inner_k_folds: number of folds of the outer/inner KFold
        num_iter: number of parameter settings sampled per search
        scoring: scoring passed to RandomizedSearchCV
        architecture: when 1, stored scores are sign-flipped (negated error -> error)
        with_da: data-augmentation tag used in the output file names

    Returns:
        (best_metric, best_estimator): the highest inner-CV best_score_ over all
        searches, exactly as reported by the search (so a negative number for the
        default negated-RMSE scoring), and the corresponding best estimator

    Raises:
        ValueError: if no search produced a usable score
    """
    #-inf instead of an arbitrary floor, so the first comparable score always wins
    best_metric = -np.inf
    best_estimator = None
    #the saved scores are sign-flipped for architecture 1 (negated error -> error)
    is_regression = architecture == 1
    #cv folds
    outer_cv = KFold(n_splits=outer_k_folds, shuffle=True)
    inner_cv = KFold(n_splits=inner_k_folds, shuffle=True)
    #loop for each trial
    for trial in range(1, num_trials+1):
        #store results per trial
        results_dict = dict()
        cv_results_dict = dict()
        for i, (train, test) in enumerate(outer_cv.split(X), start=1):
            #KFold yields positional indices, so select rows by position (iloc);
            #label-based loc only worked while the index was the default 0..n-1
            X_train, y_train = X.iloc[train, :], y.iloc[train, :]
            X_test, y_test = X.iloc[test, :], y.iloc[test, :]
            #to count time it took
            start = time.time()
            #performing inner cross validation here when looking for the best parameters
            random_search = RandomizedSearchCV(estimator=model,
                                param_distributions=param_grid, verbose=1, scoring=scoring,
                                n_iter=num_iter, cv=inner_cv, n_jobs=-1)
            #fitting training data
            random_search.fit(X_train, y_train)
            run_time = time.time()-start
            #evaluating on test data
            evaluation_score = random_search.score(X_test, y_test)
            #saving results
            results_dict['Experiment_'+str(i)] = {
                'Best_Score': (-random_search.best_score_ if is_regression else random_search.best_score_),
                'Evaluation_Score': (-evaluation_score if is_regression else evaluation_score),
                'Scorer': (scoring if is_regression else str(random_search.scorer_)),
                'Best_Params': random_search.best_params_,
                'Run_Time': run_time,
            }
            #store full CV results in case it is needed
            cv_results_dict['Experiment_'+str(i)] = str(random_search.cv_results_)
            #storing the best estimator (selected on the inner-CV score, not on the outer evaluation)
            if random_search.best_score_ > best_metric:
                best_metric = random_search.best_score_
                best_estimator = random_search.best_estimator_
            #finishing fold
            print('Outer iteration %d took %.3f s' %(i, run_time))
        #save the best scores after completing one trial
        save_file('Personality', results_dict, trial, outer_k_folds, architecture, with_da, COLAB)
        save_file('CV', cv_results_dict, trial, outer_k_folds, architecture, with_da, COLAB)
    if best_estimator is None:
        raise ValueError('No estimator was selected: no search produced a comparable score')
    #store features importance for the best estimator (https://scikit-learn.org/stable/modules/ensemble.html)
    #and https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#sphx-glr-auto-examples-ensemble-plot-forest-importances-py
    importances_arr = [clf.feature_importances_ for clf in best_estimator.estimators_]
    df_features_importances = pd.DataFrame(data=importances_arr, columns=X.columns.values, index=y.columns.values).transpose()
    df_features_importances.index.name = 'adjective'
    df_features_importances = df_features_importances.reset_index()
    #store to csv
    save_csv(df_features_importances, architecture, with_da, COLAB)
    #return data
    return best_metric, best_estimator

In [0]:
'''
#####################################################
#### Model Testing (not used to evaluate the model) #
#####################################################
'''

def model_testing(estimator, X, y, architecture=1, with_da='no_da', colab=False, store_file=True):
    """
    Predict for some input values and compute metrics regarding the predictions.

    Regression metrics (architecture 1): MAE, MSE and RMSE, both averaged over all
    targets and as one value per target.
    Classification metrics (any other architecture): fraction of wrongly predicted
    labels ('MEAN_ERROR'), micro- and macro-averaged F1/precision/recall per target,
    and the mean over targets of the micro-averaged values ('F1', 'PRECISION', 'RECALL').
    The raw predictions are included in both cases.

    Parameters:
        estimator: fitted multi-output estimator exposing predict
        X, y: features and true targets (y is a DataFrame)
        architecture: 1 for regression metrics, anything else for classification metrics
        with_da: data-augmentation tag used in the output file name
        colab: forwarded to save_file to choose the destination folder
        store_file: if true, the metrics are also written through save_file

    Returns:
        the metrics dictionary (previously it was only available through the saved file)
    """
    #make predictions
    predictions = estimator.predict(X)
    #depending on the architecture, compute metrics
    if architecture == 1:
        #individual metrics (one value per trait)
        mae_per_trait = mean_absolute_error(y, predictions, multioutput='raw_values')
        mse_per_trait = mean_squared_error(y, predictions, multioutput='raw_values')
        #global metrics (for all traits together)
        mse_global = mean_squared_error(y, predictions, multioutput='uniform_average')
        testing_results = {
            'MAE': mean_absolute_error(y, predictions, multioutput='uniform_average'),
            'MSE': mse_global,
            'RMSE': np.sqrt(mse_global),
            'MAE_LIST': mae_per_trait.tolist(),
            'MSE_LIST': mse_per_trait.tolist(),
            'RMSE_LIST': np.sqrt(mse_per_trait).tolist(),
            'predictions': predictions.tolist()
        }
    else:
        truth = y.values
        #share of (sample, trait) cells whose predicted class is wrong
        mean_error = float(np.mean(predictions != truth))
        #one (true column, predicted column) pair per trait
        trait_pairs = list(zip(truth.T, np.asarray(predictions).T))

        def per_trait(metric, average):
            return [metric(true_col, pred_col, average=average) for true_col, pred_col in trait_pairs]

        #for micro-averaging in a multiclass setting with all labels included
        #it will produce equal precision, recall and F
        #https://simonhessner.de/why-are-precision-recall-and-f1-score-equal-when-using-micro-averaging-in-a-multi-class-problem/
        f1_micro = per_trait(f1_score, 'micro')
        precision_micro = per_trait(precision_score, 'micro')
        recall_micro = per_trait(recall_score, 'micro')
        testing_results = {
            'MEAN_ERROR': mean_error,
            'F1': np.mean(f1_micro),
            'PRECISION': np.mean(precision_micro),
            'RECALL': np.mean(recall_micro),
            'F1_MICRO': f1_micro,
            'PRECISION_MICRO': precision_micro,
            'RECALL_MICRO': recall_micro,
            'F1_MACRO': per_trait(f1_score, 'macro'),
            'PRECISION_MACRO': per_trait(precision_score, 'macro'),
            'RECALL_MACRO': per_trait(recall_score, 'macro'),
            'predictions': predictions.tolist()
        }
    #is to save a results file
    if store_file:
        save_file('Testing', testing_results, 0, 0, architecture, with_da, colab, testing=True)
    return testing_results

In [0]:
'''
#####################################################
#### Architecture Composing #########################
#####################################################
'''

def run_architecture(df, architecture=1):
    """
    Run the full pipeline for one architecture: tune, report, test and persist.

    Splits df into X/y, runs the nested-CV search (find_best_model), prints the
    best score, computes the testing metrics on the testing subset returned by
    split_x_y(..., testing=True) and finally stores the best estimator.
    Note that the testing subset is taken from the same df that was used for the
    search, so it is not an independent evaluation.

    Reads the module-level settings NUM_TRIALS, OUTER_K_FOLDS, INNER_K_FOLDS,
    RANDOM_ITERATIONS, DA_DESC and COLAB.

    Parameters:
        df: full dataset
        architecture: 1 for regression, anything else for classification
    """
    is_regression = architecture == 1
    #split into X and y
    X, y = split_x_y(df, architecture, testing=False)
    #the multi-output model
    param_grid = tuning_dictionary()
    multi_estimators = build_model(architecture)
    #train and save best model
    scoring = ('neg_root_mean_squared_error' if is_regression else None)
    best_metric, best_estimator = find_best_model(multi_estimators, X, y, param_grid,
                                                num_trials=NUM_TRIALS, outer_k_folds=OUTER_K_FOLDS,
                                                inner_k_folds=INNER_K_FOLDS, num_iter=RANDOM_ITERATIONS,
                                                scoring=scoring,
                                                architecture=architecture, with_da=DA_DESC)
    #the search reports a negated RMSE for regression, so flip the sign for display;
    #with scoring=None the classifier is scored by its own score method (accuracy), not AUC
    if is_regression:
        metric_name, metric_value = 'RMSE', -best_metric
    else:
        metric_name, metric_value = 'accuracy', best_metric
    print('Best overall %s for architecture %d and %s is %.4f' %(metric_name, architecture, DA_DESC, metric_value))
    #testing the best model
    X, y = split_x_y(df, architecture, testing=True)
    model_testing(best_estimator, X, y, architecture=architecture, with_da=DA_DESC, colab=COLAB, store_file=True)
    #store best model
    save_best_estimator(best_estimator, architecture, DA_DESC, COLAB)

In [0]:
'''
#####################################################
#### Main Execution #################################
#####################################################
'''

#Global Vars (read as module-level globals by find_best_model and run_architecture)
#COLAB: if True, the CSV is uploaded interactively and outputs go to the mounted Google Drive
COLAB = True
#ARCHITECTURE: 1 -> regression on the *_recalculated traits, anything else -> classification on the *_binned traits
ARCHITECTURE = 2
#WITH_DA: use the data-augmented dataset (only selects the file name when COLAB is False)
WITH_DA = True
#DA_DESC: tag embedded in every output file name
DA_DESC = ('WithDa' if WITH_DA else 'NoDa')
#NUM_TRIALS: how many times the whole nested cross-validation is repeated
NUM_TRIALS = 2
#OUTER_K_FOLDS / INNER_K_FOLDS: folds of the outer (evaluation) and inner (parameter search) loops
OUTER_K_FOLDS = 3
INNER_K_FOLDS = 4
#RANDOM_ITERATIONS: number of parameter settings sampled by each randomized search
RANDOM_ITERATIONS = 200

#Read dataset
df = read_dataset(WITH_DA, COLAB)
#Tune, test and persist the best model for the selected architecture
run_architecture(df, architecture=ARCHITECTURE)