# Import Data

In [None]:
import pickle as pkl

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code embedded in the file;
    # only load artifacts that were produced by a trusted pipeline.
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Train/test inputs and targets, read from the current working directory.
# (Presumably pre-computed feature matrices and encoded labels - confirm
# against the notebook that wrote these files.)
X_train_feat = _load_pickle('X_train_feat')
X_test_feat = _load_pickle('X_test_feat')
y_train_encoded = _load_pickle('y_train_encoded')
y_test_encoded = _load_pickle('y_test_encoded')


# Modeling

## Utility Functions

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion_matrix(model, X, y, normalize=True):
    """Plot a confusion-matrix heatmap for ``model`` evaluated on ``(X, y)``.

    Both prediction styles are supported:

    * 2-D outputs (per-class scores / one-hot rows) are reduced to class
      indices with ``argmax`` along axis 1;
    * 1-D outputs (plain class labels, as returned by scikit-learn style
      classifiers) are used as they are.

    The same rule is applied independently to ``y``.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X: Inputs forwarded unchanged to ``model.predict``.
        y: Ground-truth targets, one-hot encoded (2-D) or class labels (1-D).
        normalize: When True (default) every row of the matrix is normalised
            over the true class and cells are shown with two decimals;
            when False raw integer counts are shown.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Predict on the supplied samples
    y_pred = np.asarray(model.predict(X))
    print(f"Number of predictions: {y_pred.shape}")

    # Collapse score / one-hot rows to class indices; 1-D label vectors
    # would make argmax(axis=1) raise, so they are passed through untouched.
    y_pred_classes = np.argmax(y_pred, axis=1) if y_pred.ndim > 1 else y_pred
    y_arr = np.asarray(y)
    y_true = np.argmax(y_arr, axis=1) if y_arr.ndim > 1 else y_arr

    # Compute the (optionally row-normalised) confusion matrix
    confusion_mtx = confusion_matrix(y_true, y_pred_classes, normalize='true' if normalize else None)

    # Plot the confusion matrix; the annotation format must match the cell dtype
    plt.figure(figsize=(10,8))
    sns.heatmap(confusion_mtx, annot=True, fmt='.2f' if normalize else 'd', cmap='Blues')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.title('Confusion Matrix')
    plt.show()

from sklearn.metrics import classification_report

def compute_metrics(model, X, y):
    """Print scikit-learn's classification report for ``model`` on ``(X, y)``.

    2-D predictions or targets (per-class scores / one-hot rows) are reduced
    to class indices with ``argmax`` along axis 1; 1-D label vectors, as
    returned by scikit-learn style classifiers, are used unchanged.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X: Inputs forwarded unchanged to ``model.predict``.
        y: Ground-truth targets, one-hot encoded (2-D) or class labels (1-D).

    Returns:
        None. The report is written to stdout.
    """
    # Predict on the supplied samples
    y_pred = np.asarray(model.predict(X))
    print(f"Number of predictions: {y_pred.shape}")

    # Collapse score / one-hot rows to class indices; 1-D label vectors
    # would make argmax(axis=1) raise, so they are passed through untouched.
    y_pred_classes = np.argmax(y_pred, axis=1) if y_pred.ndim > 1 else y_pred
    y_arr = np.asarray(y)
    y_true = np.argmax(y_arr, axis=1) if y_arr.ndim > 1 else y_arr

    # Compute and print the classification report
    print(classification_report(y_true, y_pred_classes))

from sklearn.metrics import ConfusionMatrixDisplay

def plot_confusionMatrix(test_labels, test_predicted, clf):
    """Display a row-normalised confusion matrix for already-computed predictions.

    Args:
        test_labels: True class labels.
        test_predicted: Predicted class labels, aligned with ``test_labels``.
        clf: Fitted classifier; its ``classes_`` attribute fixes the row/column
            order of the matrix.

    The axes are always labelled 'N', 'S', 'V', so ``clf.classes_`` is expected
    to hold exactly those three classes in that order.
    """
    normalized_cm = confusion_matrix(
        test_labels,
        test_predicted,
        labels=clf.classes_,
        normalize='true',
    )
    ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm,
        display_labels=['N', 'S', 'V'],
    ).plot()
    plt.show()

## Custom Metric

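Our metric is designed for model selection: it favours the model that is most capable of identifying the S and V beats, so that it can be used as a diagnostic tool. The score is the mean of the S and V recalls. If the recall on N beats falls below a threshold (0.9 by default), that mean is multiplied by a penalty (the N recall itself unless another value is supplied), so a model cannot score well by sacrificing the normal beats.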

In [None]:
from sklearn.metrics import recall_score

def penalized_sensitivity_threeclass_metric(y_true, y_pred, threshold=0.9, penalty=None):
    """Mean recall of classes 1 and 2, penalised when class-0 recall is too low.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        threshold: Minimum class-0 recall required to avoid the penalty.
        penalty: Multiplicative factor applied when class-0 recall is below
            ``threshold``; ``None`` means "use the class-0 recall itself".

    Returns:
        The mean of the class-1 and class-2 recalls rounded to two decimals,
        multiplied by the penalty (and rounded again) when class-0 recall is
        below ``threshold``.

    NOTE(review): indices 0/1/2 are assumed to be the N/S/V beats, following
    the sorted label order returned by ``recall_score`` - confirm against the
    label encoder used upstream.
    """
    per_class_recall = recall_score(y_true, y_pred, average=None)

    # Average sensitivity over the two classes of interest (indices 1 and 2)
    sv_mean_recall = round((per_class_recall[1] + per_class_recall[2]) / 2, 2)

    # Class-0 recall is good enough: report the plain average
    if not (per_class_recall[0] < threshold):
        return sv_mean_recall

    # Otherwise scale the average down, by the class-0 recall unless told otherwise
    factor = per_class_recall[0] if penalty is None else penalty
    return round(sv_mean_recall * factor, 2)

In [None]:
from sklearn.metrics import make_scorer

# Wrap the penalised S/V sensitivity metric as a scikit-learn scorer so it can be
# passed as `scoring=` to the hyper-parameter searches below.
# greater_is_better=True: the searches maximise this score.
custom_scorer = make_scorer(penalized_sensitivity_threeclass_metric, greater_is_better=True)

# Bayesian Search on XGBoost

In [None]:
import xgboost as xgb
from skopt.space import Real, Integer
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold

# Compatibility shim: some scikit-optimize releases still reference the
# ``np.int`` alias that NumPy removed in 1.24 and fail with AttributeError inside
# BayesSearchCV.fit. This is the first search of the notebook, so the alias must
# exist before this fit for a fresh top-to-bottom run to succeed.
if not hasattr(np, "int"):
    np.int = int

# Define the parameter space
param_space = {
    "gamma": Real(0, 0.5),              # minimum loss reduction needed to split a node
    "learning_rate": Real(0.01, 0.3),
    "max_depth": Integer(2, 6),
    "n_estimators": Integer(100, 500),
    "subsample": Real(0.5, 1.0),        # fraction of rows sampled per tree
    "lambda": Real(0.5, 1.5)            # L2 regularisation (xgboost alias of reg_lambda)
}

# Initialize the XGBClassifier (seeded so row subsampling is repeatable)
clf_xgb = xgb.XGBClassifier(random_state=42)

# Define the cross-validation strategy (no shuffling: folds keep the sample order)
cv = StratifiedKFold(n_splits=5)

# Initialize the BayesSearchCV.
# random_state fixes the sequence of sampled configurations so that
# re-running the notebook reproduces the same search and the same best model.
opt_xgb = BayesSearchCV(
    clf_xgb,
    param_space,
    cv=cv,
    n_jobs=-1,
    scoring=custom_scorer,
    return_train_score=True,
    refit=True,
    random_state=42
)

# Run Bayesian optimization over the train data
opt_xgb.fit(X_train_feat, y_train_encoded)

# Get the best parameters
best_params_xgb_custom = opt_xgb.best_params_
print(best_params_xgb_custom)

In [None]:
# The search was built with refit=True, so best_estimator_ has already been
# retrained on the full training data passed to fit() with the best parameters found.
model_xgb_custom = opt_xgb.best_estimator_

In [None]:
# Predict on the training features (baseline to compare against the test-set results)
y_pred_feat_train_xgb_custom = model_xgb_custom.predict(X_train_feat)

In [None]:
# Training-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_train_encoded, y_pred_feat_train_xgb_custom, model_xgb_custom)
print(classification_report(y_train_encoded, y_pred_feat_train_xgb_custom))

In [None]:
# Predict on the held-out test features
y_pred_feat_xgb_custom = model_xgb_custom.predict(X_test_feat)

In [None]:
# Test-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_test_encoded, y_pred_feat_xgb_custom, model_xgb_custom)
print(classification_report(y_test_encoded, y_pred_feat_xgb_custom))

# Bayesian Search on Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Define the parameter search space.
# (low, high) tuples are searched as ranges; single-item lists pin the value.
param_space = {
    'n_estimators': (100, 500),
    'max_depth': (3,8),
    'min_samples_split': (4, 10),
    'min_samples_leaf': (4, 8),
    'max_features': (4,10),
    'bootstrap': [True],
    'class_weight':['balanced']
}

# Create a StratifiedKFold object
cv = StratifiedKFold(n_splits=5)

# Compatibility shim: some scikit-optimize releases still reference the
# ``np.int`` alias that NumPy removed in 1.24. Only patch it when it is missing.
if not hasattr(np, "int"):
    np.int = int

# Create the BayesSearchCV object.
# scoring=custom_scorer: select the forest with the same penalised S/V
# sensitivity metric as the other searches (without it the search would
# optimise plain accuracy and the models would not be comparable).
# random_state on both the forest and the optimiser makes the search repeatable.
opt_rf = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    param_space,
    cv=cv,
    n_iter=50,
    n_jobs=-1,
    scoring=custom_scorer,
    return_train_score=True,
    refit=True,
    random_state=42
)

# Perform the search
opt_rf.fit(X_train_feat, y_train_encoded)

# Print the best parameters and score (mean cross-validated custom metric)
print("Best parameters found: ", opt_rf.best_params_)
print("Best score found: ", opt_rf.best_score_)

In [None]:
# BayesSearchCV refits the best configuration on the full training data
# (refit defaults to True), so best_estimator_ is ready to predict.
model_rf_custom = opt_rf.best_estimator_

In [None]:
# Predict on the training features (baseline to compare against the test-set results)
y_pred_feat_train_rf_custom = model_rf_custom.predict(X_train_feat)

In [None]:
# Training-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_train_encoded, y_pred_feat_train_rf_custom, model_rf_custom)
print(classification_report(y_train_encoded, y_pred_feat_train_rf_custom))

In [None]:
# Predict on the held-out test features
y_pred_feat_rf_custom = model_rf_custom.predict(X_test_feat)

In [None]:
# Test-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_test_encoded, y_pred_feat_rf_custom, model_rf_custom)
print(classification_report(y_test_encoded, y_pred_feat_rf_custom))

# Bayesian Search on AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Define the parameter space.
# The double-underscore key tunes the depth of the wrapped decision tree.
param_space = {
    "base_estimator__max_depth": Integer(1, 3),
    "n_estimators": Integer(100, 500),
    "learning_rate": Real(0.01, 1.0)
}

# Initialize the weak learner (class-balanced shallow tree)
weak_learner = DecisionTreeClassifier(class_weight='balanced')

# Initialize the AdaBoostClassifier.
# random_state seeds the boosting procedure so repeated runs build the same ensemble.
# NOTE(review): `base_estimator` was renamed `estimator` in newer scikit-learn
# releases; keep the keyword and the param_space key in sync if upgrading.
ada = AdaBoostClassifier(base_estimator=weak_learner, random_state=42)

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5)

# Initialize the BayesSearchCV.
# random_state fixes the sequence of sampled configurations so the search
# (and therefore the selected model) is reproducible.
opt_ada = BayesSearchCV(
    ada,
    param_space,
    cv=cv,
    n_jobs=-1,
    scoring=custom_scorer,
    refit=True,
    random_state=42
)

# Run Bayesian optimization over the train data
opt_ada.fit(X_train_feat, y_train_encoded)

# Get the best parameters
best_params_ada_custom = opt_ada.best_params_
print(best_params_ada_custom)

In [None]:
# Keep a handle on the best estimator, already refit on the full training data
# (refit=True). Nothing is written to disk here.
model_bayes_ada = opt_ada.best_estimator_

In [None]:
# Predict on the training features (baseline to compare against the test-set results)
y_pred_bayes_train_ada = model_bayes_ada.predict(X_train_feat)
# Training-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_train_encoded, y_pred_bayes_train_ada, model_bayes_ada)
print(classification_report(y_train_encoded, y_pred_bayes_train_ada))

In [None]:
# Predict on the held-out test features
y_pred_bayes_ada = model_bayes_ada.predict(X_test_feat)
# Test-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_test_encoded, y_pred_bayes_ada, model_bayes_ada)
print(classification_report(y_test_encoded, y_pred_bayes_ada))

# Bayesian Search on LightGBM

In [None]:
from lightgbm import LGBMClassifier
from skopt.space import Categorical

# Define the parameter space
param_space = {
    "learning_rate": Real(0.01, 1.0),
    "n_estimators": Integer(50, 300),
    "max_depth": Integer(1, 10),
    "num_leaves": Integer(2, 50),
    "min_child_samples": Integer(20, 500),
    "subsample": Real(0.5, 1.0),            # row subsampling ratio (needs subsample_freq > 0)
    "colsample_bytree": Real(0.5, 1.0),     # feature subsampling ratio per tree
    "class_weight": Categorical(['balanced'])
}

# Initialize the LGBMClassifier.
# subsample_freq=1: LightGBM ignores `subsample` while subsample_freq is 0 (its
# default), which would turn that search dimension into a no-op.
# random_state makes the row/feature subsampling repeatable.
lgbm = LGBMClassifier(subsample_freq=1, random_state=42)

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5)

# Initialize the BayesSearchCV.
# random_state fixes the sequence of sampled configurations so the search
# (and therefore the selected model) is reproducible.
opt_lgbm = BayesSearchCV(
    lgbm,
    param_space,
    cv=cv,
    n_jobs=-1,
    scoring=custom_scorer,
    refit=True,
    random_state=42
)

# Run Bayesian optimization over the train data
opt_lgbm.fit(X_train_feat, y_train_encoded)

# Get the best parameters
best_params_lgbm = opt_lgbm.best_params_
print(best_params_lgbm)

In [None]:
# Keep a handle on the best estimator, already refit on the full training data
# (refit=True). Nothing is written to disk here.
model_bayes_lgbm = opt_lgbm.best_estimator_

In [None]:
# Predict on the training features (baseline to compare against the test-set results)
y_pred_bayes_train_lgbm = model_bayes_lgbm.predict(X_train_feat)
# Training-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_train_encoded, y_pred_bayes_train_lgbm, model_bayes_lgbm)
print(classification_report(y_train_encoded, y_pred_bayes_train_lgbm))

In [None]:
# Predict on the held-out test features
y_pred_bayes_lgbm = model_bayes_lgbm.predict(X_test_feat)
# Test-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_test_encoded, y_pred_bayes_lgbm, model_bayes_lgbm)
print(classification_report(y_test_encoded, y_pred_bayes_lgbm))

# Bayesian Search on SVM

In [None]:
from sklearn.svm import SVC

# Define the parameter space.
# C spans four orders of magnitude, so it is sampled log-uniformly; a uniform
# prior on [0.01, 100] would put almost every sample above 1 and barely probe
# strong regularisation.
# `degree` only affects the 'poly' kernel; other kernels ignore it.
param_space = {
    "C": Real(1e-2, 1e+2, prior='log-uniform'),
    "gamma": Categorical(['scale', 'auto']),
    "degree": Integer(2,4),
    "kernel": Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
    "class_weight": Categorical(['balanced'])
}

# Initialize the SVC
svc = SVC()

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5)

# Initialize the BayesSearchCV.
# random_state fixes the sequence of sampled configurations so the search
# (and therefore the selected model) is reproducible.
opt_svc = BayesSearchCV(
    svc,
    param_space,
    cv=cv,
    n_jobs=-1,
    scoring=custom_scorer,
    refit=True,
    random_state=42
)

# Run Bayesian optimization over the train data
opt_svc.fit(X_train_feat, y_train_encoded)

# Get the best parameters
best_params_svc = opt_svc.best_params_
print(best_params_svc)

In [None]:
# Keep a handle on the best estimator, already refit on the full training data
# (refit=True). Nothing is written to disk here.
model_bayes_svc = opt_svc.best_estimator_

In [None]:
# Predict on the training features (baseline to compare against the test-set results)
y_pred_bayes_train_svc = model_bayes_svc.predict(X_train_feat)
# Training-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_train_encoded, y_pred_bayes_train_svc, model_bayes_svc)
print(classification_report(y_train_encoded, y_pred_bayes_train_svc))

In [None]:
# Predict on the held-out test features
y_pred_bayes_svc = model_bayes_svc.predict(X_test_feat)
# Test-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_test_encoded, y_pred_bayes_svc, model_bayes_svc)
print(classification_report(y_test_encoded, y_pred_bayes_svc))

# Comparison: custom metric vs. macro sensitivity

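This comparison is performed on the best model (LightGBM): the same hyper-parameter search is repeated with macro-averaged recall as the scoring function instead of the custom metric.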

In [None]:
from sklearn.metrics import make_scorer, recall_score

# Baseline scorer for the comparison: macro-averaged recall (unweighted mean
# of the per-class sensitivities)
macro_sensitivity = make_scorer(recall_score, average='macro')


# Define the parameter space (same ranges as the LightGBM search driven by the
# custom metric, so only the scoring function differs)
param_space = {
    "learning_rate": Real(0.01, 1.0),
    "n_estimators": Integer(50, 300),
    "max_depth": Integer(1, 10),
    "num_leaves": Integer(2, 50),
    "min_child_samples": Integer(20, 500),
    "subsample": Real(0.5, 1.0),            # row subsampling ratio (needs subsample_freq > 0)
    "colsample_bytree": Real(0.5, 1.0),     # feature subsampling ratio per tree
    "class_weight": Categorical(['balanced'])
}

# Define the cross-validation strategy here instead of relying on the `cv`
# variable left over from an earlier cell
cv = StratifiedKFold(n_splits=5)

# Initialize the BayesSearchCV with the macro-sensitivity scorer.
# subsample_freq=1: LightGBM ignores `subsample` while subsample_freq is 0 (its default).
# random_state (estimator and optimiser) makes the search repeatable, so that
# differences from the custom-metric run come from the metric, not from chance.
opt_lgbm_comparison = BayesSearchCV(
    LGBMClassifier(subsample_freq=1, random_state=42),
    param_space,
    cv=cv,
    n_jobs=-1,
    scoring=macro_sensitivity,
    refit=True,
    random_state=42
)

# Run Bayesian optimization over the train data
opt_lgbm_comparison.fit(X_train_feat, y_train_encoded)

# Get the best parameters
best_params_lgbm_comparison = opt_lgbm_comparison.best_params_
print(best_params_lgbm_comparison)

In [None]:
# Keep a handle on the best estimator, already refit on the full training data
# (refit=True). Nothing is written to disk here.
model_bayes_lgbm_comparison = opt_lgbm_comparison.best_estimator_

In [None]:
# Predict on the training features (baseline to compare against the test-set results)
y_pred_bayes_train_lgbm_comparison = model_bayes_lgbm_comparison.predict(X_train_feat)
# Training-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_train_encoded, y_pred_bayes_train_lgbm_comparison, model_bayes_lgbm_comparison)
print(classification_report(y_train_encoded, y_pred_bayes_train_lgbm_comparison))

In [None]:
# Predict on the held-out test features
y_pred_bayes_lgbm_comparison = model_bayes_lgbm_comparison.predict(X_test_feat)
# Test-set performance: row-normalised confusion matrix, then the
# per-class classification report
plot_confusionMatrix(y_test_encoded, y_pred_bayes_lgbm_comparison, model_bayes_lgbm_comparison)
print(classification_report(y_test_encoded, y_pred_bayes_lgbm_comparison))