In [25]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

In [26]:
# This notebook builds an ensemble of base models with a hill-climbing search that
# maximises the ensemble's accuracy on out-of-fold (OOF) predictions.

### 1. Initialization and Data Preparation
# - `PATH`: Directory containing the out-of-fold (OOF) predictions of the models.
PATH = './output/'
# - `FILES`: Names of all entries in that directory.
FILES = os.listdir(PATH)
# - `OOF`: Sorted array of the OOF prediction file names (those containing 'val_preds').
#   The position of a file in this array is the model index used everywhere below.
OOF = np.sort([f for f in FILES if 'val_preds' in f])
print("OOF files: ", OOF)
# Fail fast with a clear message: without any OOF file the later `OOF_CSV[0]` lookups
# would only surface as an unrelated IndexError.
if len(OOF) == 0:
    raise FileNotFoundError(f"No OOF prediction files containing 'val_preds' found in {PATH!r}")
# - `OOF_CSV`: List of DataFrames, one per OOF file, in the same order as `OOF`.
#   os.path.join keeps this working whether or not PATH ends with a path separator.
OOF_CSV = [pd.read_csv(os.path.join(PATH, k)) for k in OOF]
# - `OOF_PREDS_COLS`: Column names expected in every OOF prediction file
#   (kept for reference; not used by the visible cells).
OOF_PREDS_COLS = ['kfold', 'Target', 'oof_preds', 'oof_preds_proba_0', 'oof_preds_proba_1', 'oof_preds_proba_2']

OOF files:  ['df_val_preds_CatBoost1.csv' 'df_val_preds_CatBoost2.csv'
 'df_val_preds_LightGBM1.csv' 'df_val_preds_LightGBM2.csv'
 'df_val_preds_LightGBM3.csv' 'df_val_preds_LightGBM_fs.csv'
 'df_val_preds_LogisticRegression1.csv' 'df_val_preds_RandomForest1.csv'
 'df_val_preds_XGBoost1.csv' 'df_val_preds_XGBoost2.csv']


In [27]:
### 2. Extract Predictions and True Labels

# Probability columns, one per class; their order defines the class axis of `oof_preds_proba_all`.
proba_cols = ['oof_preds_proba_0', 'oof_preds_proba_1', 'oof_preds_proba_2']

# - `target`: True target values, taken from the first OOF file.
target = OOF_CSV[0].Target.values

# - `oof_preds_all`: (n_rows, n_models) matrix; column k holds the hard predictions of model OOF[k].
oof_preds_all = np.zeros((len(target), len(OOF)))
# - `oof_preds_proba_all`: (n_rows, n_models, n_classes) array of the predicted class probabilities.
oof_preds_proba_all = np.zeros((len(target), len(OOF), len(proba_cols)))
for k, oof_df in enumerate(OOF_CSV):
    # Every model is scored against `target`, so each file must list the same rows in the
    # same order. Verify it instead of assuming it: a mismatch would silently corrupt all scores.
    if not np.array_equal(oof_df['Target'].values, target):
        raise ValueError(f"'Target' column of {OOF[k]} does not match that of {OOF[0]}")
    oof_preds_all[:, k] = oof_df['oof_preds'].values
    oof_preds_proba_all[:, k, :] = oof_df[proba_cols].values

### 3. Evaluate Initial accuracy for Each Model
# - `all_model_metrics`: OOF accuracy of each single model, indexed like `OOF`.
all_model_metrics = []
# - Score each model's hard predictions against the true labels and report the result.
for model_index in range(oof_preds_all.shape[1]):
    accuracy = accuracy_score(target, oof_preds_all[:, model_index])
    all_model_metrics.append(accuracy)
    print(f'Model {OOF[model_index]} has OOF accuracy = {accuracy:.4f}')

Model df_val_preds_CatBoost1.csv has OOF accuracy = 0.8300
Model df_val_preds_CatBoost2.csv has OOF accuracy = 0.8313
Model df_val_preds_LightGBM1.csv has OOF accuracy = 0.8336
Model df_val_preds_LightGBM2.csv has OOF accuracy = 0.8332
Model df_val_preds_LightGBM3.csv has OOF accuracy = 0.8336
Model df_val_preds_LightGBM_fs.csv has OOF accuracy = 0.8262
Model df_val_preds_LogisticRegression1.csv has OOF accuracy = 0.8273
Model df_val_preds_RandomForest1.csv has OOF accuracy = 0.8261
Model df_val_preds_XGBoost1.csv has OOF accuracy = 0.8308
Model df_val_preds_XGBoost2.csv has OOF accuracy = 0.8318


In [33]:
### 4. Initialize the Ensemble with the Best Single Model

# The search starts from the single model with the highest OOF accuracy.
# - `best_score`: Best accuracy reached so far (initially that of the best single model).
best_score = np.max(all_model_metrics)
# - `included_model_index`: Indices (into `OOF`) of the models in the ensemble, in the order they were added.
included_model_index = [np.argmax(all_model_metrics)]
# - `model_weights`: Blend weight of every model added after the first one (empty for now).
model_weights = list()

### 5. Define Search Parameters
# - `RES`: Number of steps in the weight grid; candidate weights are multiples of 1 / RES.
RES = 200
# - `PATIENCE`: Number of non-improving weights tolerated before a weight search is cut short.
PATIENCE = 10
# - `TOL`: An added model must raise the accuracy by more than this for the search to continue.
TOL = 3e-06
# - `DUPLICATES`: When False, a model already in the ensemble is never added again.
DUPLICATES = False

### 6. Hill Climbing to Optimize Ensemble

print(f'Ensemble accuracy = {best_score:.4f} by beginning with model {OOF[included_model_index[0]]}')

Ensemble accuracy = 0.8336 by beginning with model df_val_preds_LightGBM1.csv


In [None]:
### 6. Hill Climbing to Optimize Ensemble (search loop)
#
# Greedy forward selection over the OOF class probabilities. For each round (at most one per model):
#     - **Build Current Ensemble**: start from the first model in `included_model_index` and fold in every
#       later model with its weight from `model_weights`, as a chain of pairwise blends.
#     - **Search for Best Model to Add**: every model not already in the ensemble (unless `DUPLICATES`
#       is True) is tried as the next addition.
#     - **Evaluate Each Weight**: for each candidate, scan the blend weights 0, 1/RES, ..., (RES - 1)/RES and
#       keep the one with the highest accuracy. `PATIENCE` cuts the scan short when weights stop improving.
#     - **Check for Improvement**: if the best accuracy gain (`inc`) is not greater than `TOL`, stop.
#     - **Update Ensemble**: otherwise record the new model and its weight, then print the updated
#       accuracy and the gain.
#
# NOTE: this cell appends to `included_model_index` / `model_weights` and overwrites `best_score`.
# Re-run the cell that initialises them before re-running this one; otherwise the search continues
# from the state left by the previous run instead of starting from the best single model.

# Set to True to print the accuracy of every weight tried (very noisy: up to RES lines per candidate).
TRACE_WEIGHT_SEARCH = False


def _search_blend_weight(candidate_proba, ensemble_proba, y_true, res, patience, verbose=False):
    """Scan blend weights for one candidate model and return the best one found.

    For ``j = 0 .. res - 1`` the blend ``w * candidate_proba + (1 - w) * ensemble_proba`` with
    ``w = j / res`` is scored by accuracy; a weight of exactly 1 is never tried. The predicted
    label is the argmax over axis 1 of the blended probabilities.
    # assumes the labels in y_true are the class positions 0..n_classes-1 of the proba columns — TODO confirm

    Args:
        candidate_proba: class probabilities of the candidate model, one row per sample.
        ensemble_proba: class probabilities of the current ensemble, same shape.
        y_true: true labels, one per sample.
        res: number of steps in the weight grid.
        patience: the scan stops once more than this many weights (in total, the counter is
            not reset after an improvement) have failed to strictly beat the best accuracy so far.
        verbose: when True, print the accuracy of every weight and the early-stopping step.

    Returns:
        Tuple ``(best_weight, best_accuracy)``; both stay 0 if no weight scores above 0.
    """
    best_weight = 0
    best_accuracy = 0
    stale_steps = 0
    for j in range(res):
        weight = j / res
        blended = weight * candidate_proba + (1 - weight) * ensemble_proba
        accuracy = accuracy_score(y_true, np.argmax(blended, axis=1))
        if verbose:
            print(f"accuracy = {accuracy}")
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_weight = weight
        else:
            # NOTE(review): accuracy moves in discrete steps, so neighbouring weights may tie;
            # ties count as non-improving here and can end the scan after only a few weights.
            stale_steps += 1
        if stale_steps > patience:
            if verbose:
                print(f"Early stopping at j = {j}")
            break
    return best_weight, best_accuracy


# One round per model at most; the round counter itself is not needed.
for _ in range(len(OOF)):
    # Rebuild the current ensemble probabilities from scratch: start with the first included
    # model, then blend in each later model with the weight it was added with.
    current_ensemble_preds_proba = oof_preds_proba_all[:, included_model_index[0], :]
    for weight, k in zip(model_weights, included_model_index[1:]):
        current_ensemble_preds_proba = weight * oof_preds_proba_all[:, k, :] + (1 - weight) * current_ensemble_preds_proba

    # Best candidate of this round: its accuracy, its index into OOF and its blend weight.
    max_score = 0
    best_model_index = 0
    best_model_weight = 0
    print('Searching for best model to add... ')

    for candidate_index in range(oof_preds_all.shape[1]):
        # Progress output: file name of the model being visited (printed for skipped models too).
        print(OOF[candidate_index], ', ', end='')
        # Skip models that are already in the ensemble.
        if not DUPLICATES and (candidate_index in included_model_index):
            continue

        candidate_weight, candidate_score = _search_blend_weight(
            oof_preds_proba_all[:, candidate_index, :],
            current_ensemble_preds_proba,
            target,
            RES,
            PATIENCE,
            verbose=TRACE_WEIGHT_SEARCH,
        )
        # Keep the candidate if it beats every candidate seen so far in this round.
        if candidate_score > max_score:
            max_score = candidate_score
            best_model_index = candidate_index
            best_model_weight = candidate_weight

    # Terminate the line of model names printed above.
    print()

    # Gain of this round's best candidate over the best ensemble score so far.
    # If every model was skipped, max_score is still 0, so inc is negative and the search stops.
    inc = max_score - best_score
    print(f"inc = {inc}")
    if inc <= TOL:
        print()
        print('No increase in score. Stopping.')
        break

    # Update ensemble with the new model and weight.
    print()
    print(f'Ensemble accuracy = {max_score:.4f} after adding model {OOF[best_model_index]} with weight {best_model_weight:.3f}. Increase of {inc:.4f}')
    print()

    best_score = max_score
    included_model_index.append(best_model_index)
    model_weights.append(best_model_weight)