In [149]:
import time

import numpy as np 

import pandas as pd
from tqdm.notebook import tqdm # progress bar

# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier


In [150]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_colwidth", None)

In [151]:
# Load data
input_df = pd.read_csv("input_df.csv", index_col=0)
input_df_pca = pd.read_csv("input_df_pca.csv", index_col=0)
output_df_class = pd.read_csv("output_df_class.csv", index_col=0)
output_df_score = pd.read_csv("output_df_score.csv", index_col=0)

In [152]:
# Drop rows with missing values in the output for supervised learning

supervised_learning_mask = output_df_score.notna().all(axis=1)

input_df = input_df[supervised_learning_mask]
input_df_pca = input_df_pca[supervised_learning_mask]
output_df_class = output_df_class[supervised_learning_mask]
output_df_score = output_df_score[supervised_learning_mask]

In [153]:
# Prepare input data
X_base = input_df.values
X_pca = input_df_pca.values
y = output_df_class.values

# Set a random seed for reproducibility
random_state = 123

# Define cross-validation strategy
kf = StratifiedKFold(n_splits=10, random_state=random_state, shuffle=True)

# Define a scoring metric
scoring = ["accuracy", "f1_micro", "f1_macro", "precision_macro", "recall_macro"]


from sklearn.metrics import precision_score, make_scorer

# Custom precision scorer to handle cases where a label has never been seen py the predictor (division by tp + fp = 0)
precision_macro_scorer = make_scorer(precision_score, zero_division=0, average="macro") 
custom_scoring = {"accuracy": "accuracy", 
                  "f1_micro": "f1_micro",
                  "f1_macro": "f1_macro",
                  "precision_macro": precision_macro_scorer,
                  "recall_macro": "recall_macro"}

In [154]:
# Define models and hyperparameters for tuning
models = {
    "K-Nearest Neighbors Classifier (uniform)": {
        "model": KNeighborsClassifier(),
        "params": {"k-nearestneighborsclassifier(uniform)__n_neighbors": [2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 35, 40, 50],
                   "k-nearestneighborsclassifier(uniform)__weights": ["uniform"]},
    },
    "K-Nearest Neighbors Classifier (distance)": {
        "model": KNeighborsClassifier(),
        "params": {"k-nearestneighborsclassifier(distance)__n_neighbors": [2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 35, 40, 50],
                   "k-nearestneighborsclassifier(distance)__weights": ["distance"],
                   "k-nearestneighborsclassifier(distance)__p": [1, 2, 3]},
    },
    "Support Vector Classifier (linear)": {
        "model": SVC(),
        "params": {
            "supportvectorclassifier(linear)__C": [0.1, 1.0, 10.0, 100.0],
            "supportvectorclassifier(linear)__kernel": ["linear"]
        },
    },
    "Support Vector Classifier (poly)": {
        "model": SVC(),
        "params": {
            "supportvectorclassifier(poly)__C": [0.1, 1.0, 10.0, 100.0],
            "supportvectorclassifier(poly)__kernel": ["poly"],
            "supportvectorclassifier(poly)__gamma": [0.01, 0.1, 0.5],
            "supportvectorclassifier(poly)__degree": [2, 3, 4, 5],
        },
    },
    "Support Vector Classifier (rbf)": {
        "model": SVC(),
        "params": {
            "supportvectorclassifier(rbf)__C": [0.1, 1.0, 10.0, 100.0],
            "supportvectorclassifier(rbf)__kernel": ["rbf"],
            "supportvectorclassifier(rbf)__gamma": [0.01, 0.1, 0.5],
        },
    },
    "Decision Tree Classifier": {
        "model": DecisionTreeClassifier(),
        "params": {
            "decisiontreeclassifier__max_depth": [5, 10, 15],
            "decisiontreeclassifier__min_samples_split": [2, 5, 10]
        },
    },
    "Bagging Classifier": {
        "model": BaggingClassifier(random_state=42),
        "params": {
            "baggingclassifier__n_estimators": [10, 50, 100, 150],
            "baggingclassifier__max_samples": [0.5, 0.8, 1.0],
            "baggingclassifier__max_features": [0.5, 0.8, 1.0]
        },
    },
    "Random Forest Classifier": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "randomforestclassifier__n_estimators": [10, 50, 100],
            "randomforestclassifier__max_depth": [5, 10],
            "randomforestclassifier__min_samples_split": [2, 5],
        },
    }
}

In [155]:
# Function to clean the best params dictionary
def clean_params(params):
    return ", ".join(f"{key.split('__')[1]}: {value}" for key, value in params.items())

# Function to convert the one hot encoding back to multiclass ecoding
def one_hot_encoding_to_multiclass(y):
    return np.einsum('ij,j->i',y,np.arange(1,y.shape[1]+1))

In [156]:
# 1 dimension multiclass label
y_formated = one_hot_encoding_to_multiclass(y)

# Create an empty list to store results
results_list_base_input, model_list_base_input = [], []
results_list_pca_input, model_list_pca_input = [], []

input_setups = {"Base input": (X_base, results_list_base_input, model_list_base_input),
                "PCA input": (X_pca, results_list_pca_input, model_list_pca_input)}

# Test models on base input and pca-preprocessed input
for input_name, input_config in input_setups.items():
    
    X, results_list, model_list = input_config
    print(f"Starting GridSearch with [{input_name}] ...")
    
    # Loop over each model
    for model_name, config in tqdm(models.items()):
        print(f"Training and tuning {model_name}...")

        # Track start time
        start_time = time.time()

        # Create a pipeline with StandardScaler (necessary for some models)    
        pipeline = Pipeline([
                (
                    "scaler",
                    StandardScaler(),
                ),
                (model_name.lower().replace(" ", ""), config["model"]),
            ])
        
        # Set up GridSearchCV
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=config["params"],
            cv=kf,
            scoring=custom_scoring,
            refit="f1_macro",
            return_train_score=False,
            verbose=1,
        )

        # Fit the grid search
        grid_search.fit(X, y_formated)

        # Make predictions and calculate metrics
        
        # y_pred = grid_search.predict(X)
        # acc = accuracy_score(y_formated, y_pred)
        # f1_score_micro = f1_score(y_formated, y_pred, average="micro")
        # f1_score_macro = f1_score(y_formated, y_pred, average="macro")
        # precision_macro = precision_score(y_formated, y_pred, average="macro")
        # recal_macro = recall_score(y_formated, y_pred, average="macro")
        
        # Index of best estimator (on average score metric over splits)
        best_predictor_index = grid_search.best_index_
        
        # print(grid_search.cv_results_) # DEBUG
        
        # Append results to list
        results_list.append(
            {
                "Model Name": model_name,
                "Best Params": clean_params(grid_search.best_params_),
                "Accuracy": grid_search.cv_results_["mean_test_accuracy"][best_predictor_index],
                "F1-score (micro)": grid_search.cv_results_["mean_test_f1_micro"][best_predictor_index],
                "F1-score (macro)": grid_search.cv_results_["mean_test_f1_macro"][best_predictor_index],
                "Precision (macro)": grid_search.cv_results_["mean_test_precision_macro"][best_predictor_index],
                "Recal (macro)": grid_search.cv_results_["mean_test_recall_macro"][best_predictor_index]
            }
        )
        
        # Append results to list
        model_list.append(
            {
                "Model Name": model_name,
                "gc": grid_search
            }
        )

    # Print duration
    print(f"Done in {time.time() - start_time:.2f} seconds\n")


Starting GridSearch with [Base input] ...


  0%|          | 0/8 [00:00<?, ?it/s]

Training and tuning K-Nearest Neighbors Classifier (uniform)...
Fitting 10 folds for each of 13 candidates, totalling 130 fits
Training and tuning K-Nearest Neighbors Classifier (distance)...
Fitting 10 folds for each of 39 candidates, totalling 390 fits
Training and tuning Support Vector Classifier (linear)...
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Training and tuning Support Vector Classifier (poly)...
Fitting 10 folds for each of 48 candidates, totalling 480 fits
Training and tuning Support Vector Classifier (rbf)...
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Training and tuning Decision Tree Classifier...
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Training and tuning Bagging Classifier...
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Training and tuning Random Forest Classifier...
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Done in 12.08 seconds

Starting GridSearch with [PCA input] ...


  0%|          | 0/8 [00:00<?, ?it/s]

Training and tuning K-Nearest Neighbors Classifier (uniform)...
Fitting 10 folds for each of 13 candidates, totalling 130 fits
Training and tuning K-Nearest Neighbors Classifier (distance)...
Fitting 10 folds for each of 39 candidates, totalling 390 fits
Training and tuning Support Vector Classifier (linear)...
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Training and tuning Support Vector Classifier (poly)...
Fitting 10 folds for each of 48 candidates, totalling 480 fits
Training and tuning Support Vector Classifier (rbf)...
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Training and tuning Decision Tree Classifier...
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Training and tuning Bagging Classifier...
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Training and tuning Random Forest Classifier...
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Done in 11.85 seconds



In [157]:
pd.DataFrame(results_list_base_input).sort_values("F1-score (macro)", ascending=False, ignore_index=True)

Unnamed: 0,Model Name,Best Params,Accuracy,F1-score (micro),F1-score (macro),Precision (macro),Recal (macro)
0,Bagging Classifier,"max_features: 0.8, max_samples: 0.8, n_estimators: 100",0.769519,0.769519,0.757198,0.799585,0.738401
1,K-Nearest Neighbors Classifier (distance),"n_neighbors: 3, p: 1, weights: distance",0.743341,0.743341,0.732731,0.749915,0.723028
2,Random Forest Classifier,"max_depth: 10, min_samples_split: 5, n_estimators: 50",0.746418,0.746418,0.724125,0.791495,0.698752
3,Support Vector Classifier (rbf),"C: 10.0, gamma: 0.1, kernel: rbf",0.721755,0.721755,0.708987,0.726472,0.701521
4,Support Vector Classifier (poly),"C: 10.0, degree: 5, gamma: 0.1, kernel: poly",0.709567,0.709567,0.694654,0.706422,0.691594
5,Decision Tree Classifier,"max_depth: 10, min_samples_split: 5",0.690817,0.690817,0.674802,0.689288,0.669569
6,K-Nearest Neighbors Classifier (uniform),"n_neighbors: 2, weights: uniform",0.666106,0.666106,0.662631,0.707222,0.662714
7,Support Vector Classifier (linear),"C: 1.0, kernel: linear",0.649062,0.649062,0.630458,0.651749,0.621869


In [158]:
pd.DataFrame(results_list_pca_input).sort_values("F1-score (macro)", ascending=False, ignore_index=True)

Unnamed: 0,Model Name,Best Params,Accuracy,F1-score (micro),F1-score (macro),Precision (macro),Recal (macro)
0,Bagging Classifier,"max_features: 1.0, max_samples: 0.5, n_estimators: 150",0.618101,0.618101,0.580064,0.614673,0.571041
1,Random Forest Classifier,"max_depth: 10, min_samples_split: 2, n_estimators: 50",0.619663,0.619663,0.576365,0.629515,0.563987
2,K-Nearest Neighbors Classifier (distance),"n_neighbors: 10, p: 2, weights: distance",0.618149,0.618149,0.57157,0.605507,0.56255
3,Support Vector Classifier (rbf),"C: 10.0, gamma: 0.5, kernel: rbf",0.601298,0.601298,0.567054,0.590912,0.559189
4,K-Nearest Neighbors Classifier (uniform),"n_neighbors: 2, weights: uniform",0.570361,0.570361,0.553353,0.595856,0.562873
5,Support Vector Classifier (poly),"C: 10.0, degree: 4, gamma: 0.5, kernel: poly",0.560889,0.560889,0.538358,0.549119,0.537444
6,Decision Tree Classifier,"max_depth: 5, min_samples_split: 10",0.56113,0.56113,0.507374,0.545996,0.503766
7,Support Vector Classifier (linear),"C: 100.0, kernel: linear",0.558005,0.558005,0.445707,0.520561,0.465217
