# Ensemble learning
This notebook explores the performance improvement obtained by building custom-made ensemble learners. The workflow is as follows:
1. Build-up of the individual learners and performance evaluation
- LightGBM
- XGBoost
- Random forest
- (Maybe) lasso regression
This part also covers resampling of the data (upsampling + downsampling).

2. Study on how to ensemble them together for performance optimization
- Hard voting (including predictive threshold performance optimization for all of them)
- Soft voting (followed by predictive threshold optimization)
- Stacking



In [3]:
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier, plot_importance
from tqdm import tqdm
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

from auxFuns.EDA import *
from auxFuns.modelling import *


In [6]:
# Dataset folders, resolved relative to the current working directory.
# os.path.join is used instead of string concatenation so separators are handled by the OS.
raw_datasets_path = os.path.join(os.getcwd(), 'datasets', 'raw')
processed_datasets_path = os.path.join(os.getcwd(), 'datasets', 'processed')

# Load the processed predictors table and convert its columns with the project helper.
rsv_predictors_df_v2 = pd.read_csv(
    os.path.join(processed_datasets_path, 'rsv_predictors_phase1_daysDedup_seasons_prevTest.csv'),
    low_memory=False,
)
rsv_predictors_df_v2 = make_it_categorical_v2(rsv_predictors_df_v2)

# A bare expression is only displayed when it is the last statement of a cell,
# so the shape is printed explicitly.
print(f'Full dataset shape: {rsv_predictors_df_v2.shape}')

# summary_function_rsv(rsv_predictors_df_v2)

# Extract a reduced sample of the data for modelling.
# The sample is capped at the table size so a smaller dataset does not raise.
sample_size = 80000
sample_v2_df = rsv_predictors_df_v2.sample(
    n=min(sample_size, len(rsv_predictors_df_v2)),
    random_state=42,
)

In [5]:
# Columns used for modelling, grouped by theme. The order of the groups (and of
# the names inside each group) defines the column order of the modelling frame.
# To reproduce the earlier feature set without seasonality, drop the ['season'] group.
selected_features = (
    # Demographics
    ['sex', 'marital_status', 'race', 'patient_regional_location', 'age_group']
    # Symptoms and acute diagnoses
    + ['Acute_upper_respiratory_infection', 'Influenza', 'Pneumonia', 'Bronchitis',
       'Symptoms_and_signs__digestive_system_and_abdomen', 'General_symptoms_and_signs',
       'any_symptom']
    # Chronic conditions and comorbidity index
    + ['COPD', 'AIDS', 'Asthma_chronic', 'CCI']
    # Calendar encoding
    + ['sine', 'cosine', 'calendar_year']
    # Healthcare usage
    + ['healthcare_seeking', 'influenza_vaccine']
    # Derived counts and testing history
    + ['n_symptoms', 'prev_positive_rsv', 'previous_test_daydiff', 'n_immunodeficiencies']
    # Tumor indicators
    + ['tumor_indicator', 'tumor_last_year']
    # Seasonality
    + ['season']
)

# The target column goes last so it travels with the predictors.
selected_features += ['RSV_test_result']

In [7]:
# Restrict the reduced sample to the selected modelling columns
df1 = sample_v2_df[selected_features]

# Test-set fraction and seed handed to the preprocessing helper
input_test_size, random_seed = 0.2, 42

# Split, preprocess and resample through the project helper; the 'under'
# technique is requested here (the helper reports "Undersampling" when run).
X_train, y_train, X_test, y_test, preprocessor_rsv = preprocess_and_resample_rsv(
    df1,
    input_test_size=input_test_size,
    random_seed=random_seed,
    resampling_technique='under',
    ratio_maj_min=0.8,
)

Resampling method chosen:

Undersampling


# 0. Study resampling techniques (WIP)

# 1. Build-up of the models

### 1.1. XGBoost
- Train it using the previous approach (GridSearchCV)
- Train it using Bayesian parameter optimization

##### Approach 1: GridSearch CV

In [None]:
# Approach 1: GridSearch CV
# make_scorer is not imported explicitly anywhere else in this notebook, so it is
# imported here rather than relying on a star-import happening to expose it.
from sklearn.metrics import make_scorer

random_seed = 42
cost_sensitive = True

# With cost-sensitive learning the positive class is up-weighted through
# XGBoost's scale_pos_weight (ratio of the positive to the negative weight).
if cost_sensitive:
    weight_dict = {"Negative": 1,
                   "Positive": 50}
    scale_pos_weight = weight_dict["Positive"] / weight_dict["Negative"]
    model_class = XGBClassifier(scale_pos_weight=scale_pos_weight,
                                random_state=random_seed)
else:
    model_class = XGBClassifier(random_state=random_seed)

# NOTE(review): this grid holds 13 * 25 * 10 * 10 * 4 = 130,000 combinations;
# confirm that train_model_rsv does not enumerate it exhaustively with 5 folds.
param_grid = {
    'n_estimators': range(20, 205, 15),
    'max_depth': range(5, 30, 1),
    'learning_rate': np.arange(0.01, 0.51, 0.05),
    'min_child_weight': np.arange(1, 11, 1),
    'gamma': np.arange(0.1, 0.5, 0.1)
}

# Model selection criterion: macro-averaged F1
target_scorer = make_scorer(f1_score, average='macro')
n_cv_folds = 5

# XGBoost needs labels in numeric format
y_train_numeric = [1 if label == "Positive" else 0 for label in y_train]

model1_xgb = train_model_rsv(model=model_class, param_grid=param_grid,
                             target_scorer=target_scorer, n_cv_folds=n_cv_folds,
                             X_train=X_train, y_train=y_train_numeric)

# Choose the decision threshold, then evaluate on the test set; only the last
# returned metric (F1) is kept.
optimal_threshold = find_optimal_moving_threshold(model=model1_xgb, X_test=X_test, y_test=y_test)
__, __, __, __, __, __, f1 = calculate_performance_metrics_rsv(trained_model=model1_xgb,
                                                               X_test=X_test, y_test=y_test,
                                                               threshold=optimal_threshold,
                                                               print_roc=False)

##### Approach 2: Bayesian hyperparameter optimization

In [15]:
# Approach 2: train the model using bayesian hyperparameter optimization

# Objective metrics to optimise, one tuned classifier per metric.
# Wider list kept for reference:
# Scorings = ['accuracy', 'balanced_accuracy', 'f1', 'f1_micro', 'f1_macro', 'f1_weighted', 'precision', 'recall', 'roc_auc']
Scorings = ['f1', 'f1_micro', 'f1_macro', 'f1_weighted', 'recall', 'roc_auc']

# Numeric labels for XGBoost: 1 for 'Positive', 0 for everything else
y_train_numeric = [int(label == 'Positive') for label in y_train]

# Maps each objective metric to its tuned model and stored scores
best_models = {}

for score in tqdm(Scorings):
    classifier = XGBoostClassifier_custom(scoring=score, max_evals=12)
    classifier.train(X_train, y_train_numeric)
    classifier.predict(X_test, y_test)
    best_models[score] = dict(
        model=classifier.model,
        score_f1=classifier.score_f1,
        score_auc=classifier.score_auc,
    )
    

  0%|          | 0/6 [00:00<?, ?it/s]

--------------------------------------------------------------------
Training XGBoost classifier with objective metric: f1
Tuning Hyperparameters ...


In [14]:
class XGBoostClassifier_custom:
    """XGBoost classifier whose hyperparameters are tuned with hyperopt (TPE).

    Workflow: ``train`` runs the search and then fits the final model through
    ``fit_model``; ``predict`` evaluates that model with the project helpers.

    Parameters
    ----------
    scoring : str
        scikit-learn scoring name optimised during the search (the loss
        minimised by hyperopt is ``1 - mean cross-validated score``).
    max_evals : int
        Number of hyperopt evaluations.
    cv : int, default 10
        Number of cross-validation folds used to score each candidate.
    validation_size : float, default 0.2
        Fraction of the training data held out as the early-stopping
        evaluation set when the final model is fitted.
    random_state : int, default 42
        Seed of the early-stopping hold-out split.
    verbose : bool or int, default True
        Passed to ``XGBClassifier.fit`` for the final model.

    Attributes
    ----------
    best : dict or None
        Best hyperparameter *values* found by the search.
    model : XGBClassifier or None
        Final fitted model.
    score_f1 : object or None
        Last metric returned by ``calculate_performance_metrics_rsv`` in
        ``predict``.
    score_auc : object or None
        Never populated by this class.
    """

    def __init__(self, scoring, max_evals, cv=10, validation_size=0.2,
                 random_state=42, verbose=True):
        self.scoring = scoring
        self.max_evals = max_evals
        self.cv = cv
        self.validation_size = validation_size
        self.random_state = random_state
        self.verbose = verbose
        self.best = None
        self.model = None
        self.score_f1 = None
        self.score_auc = None

    def objective(self, space):
        """Return the hyperopt loss (1 - mean CV score) for one candidate ``space``."""
        classifier = XGBClassifier(n_estimators=int(space['n_estimators']),
                                   max_depth=int(space['max_depth']),
                                   learning_rate=space['learning_rate'],
                                   gamma=space['gamma'],
                                   min_child_weight=space['min_child_weight'],
                                   subsample=space['subsample'],
                                   colsample_bytree=space['colsample_bytree'],
                                   )
        # cross_val_score clones and fits the estimator on every fold, so no
        # separate fit on the full training set is needed beforehand.
        scores = cross_val_score(estimator=classifier, X=self.X_train, y=self.y_train,
                                 cv=self.cv, scoring=self.scoring)
        return {'loss': 1 - scores.mean(), 'status': STATUS_OK}

    def train(self, X_train, y_train):
        """Tune the hyperparameters on the given data, then fit the final model.

        ``y_train`` must already be numeric (the caller converts the labels).
        """
        # Imported locally so this cell does not depend on the import cell
        # exposing space_eval.
        from hyperopt import space_eval

        print('--------------------------------------------------------------------')
        print(f'Training XGBoost classifier with objective metric: {self.scoring}')
        self.X_train = X_train
        self.y_train = y_train

        space = {
            'max_depth': hp.choice('max_depth', [6, 8, 12]),
            'learning_rate': hp.choice('learning_rate', [0.001, 0.01]),
            'n_estimators': hp.choice('n_estimators', [1000, 5000]),
            'gamma': hp.quniform('gamma', 0, 0.50, 0.1),
            'min_child_weight': hp.quniform('min_child_weight', 1, 10, 2),
            'subsample': hp.quniform('subsample', 0.6, 1, 0.1),
            # Upper bound is 1.0: colsample_bytree must lie in (0, 1], and an
            # upper bound of 1.1 lets quniform emit the invalid value 1.1.
            'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 1.0, 0.1),
            # Not used by the objective; only applied when the final model is fitted.
            'early_stopping_rounds': hp.choice('early_stopping_rounds', [50, 100])
        }

        trials = Trials()
        print("Tuning Hyperparameters ...")
        best_raw = fmin(fn=self.objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=self.max_evals,
                        trials=trials)
        # fmin reports hp.choice parameters as the *index* of the chosen
        # option; space_eval maps the result back to the actual values.
        self.best = space_eval(space, best_raw)
        print("Best Hyperparameters: ", self.best)
        self.fit_model()

    def fit_model(self):
        """Fit the final model with the best hyperparameters found by ``train``.

        Part of the training data is held out as the evaluation set required
        by early stopping.

        Raises
        ------
        RuntimeError
            If no hyperparameters are available yet (``train`` was not run).
        """
        if self.best is None:
            raise RuntimeError("No tuned hyperparameters available; call train() first.")

        # Imported locally so this cell is self-contained.
        from sklearn.model_selection import train_test_split

        # Early stopping needs a validation set to monitor, so one is carved
        # out of the training data (stratified on the labels).
        X_fit, X_val, y_fit, y_val = train_test_split(
            self.X_train, self.y_train,
            test_size=self.validation_size,
            stratify=self.y_train,
            random_state=self.random_state,
        )

        self.model = XGBClassifier(n_estimators=int(self.best['n_estimators']),
                                   max_depth=int(self.best['max_depth']),
                                   learning_rate=self.best['learning_rate'],
                                   gamma=self.best['gamma'],
                                   min_child_weight=self.best['min_child_weight'],
                                   subsample=self.best['subsample'],
                                   colsample_bytree=self.best['colsample_bytree'],
                                   early_stopping_rounds=int(self.best['early_stopping_rounds']),
                                   )
        # verbose belongs to fit(), not to the constructor.
        self.model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=self.verbose)

        print('XGBoostClassifier Performance:')

    def predict(self, X_test, y_test):
        """Evaluate the fitted model on the test set and store the F1 score.

        The decision threshold comes from ``find_optimal_moving_threshold``
        and the metrics from ``calculate_performance_metrics_rsv``.
        """
        # NOTE(review): the threshold is selected on the same data that is then
        # used for evaluation; confirm this is intended.
        optimal_threshold = find_optimal_moving_threshold(model=self.model, X_test=X_test, y_test=y_test)
        # The helper returns seven values; the last one is the F1 score (the
        # GridSearch cell unpacks it under that name).
        __, __, __, __, __, __, f1 = calculate_performance_metrics_rsv(trained_model=self.model,
                                                                       X_test=X_test, y_test=y_test,
                                                                       threshold=optimal_threshold,
                                                                       print_roc=False)
        self.score_f1 = f1
        # score_auc is left untouched: which returned value (if any) is the AUC
        # cannot be determined from this file.