In [None]:
!pip install lightgbm



In [None]:
!pip install -U scikit-optimize



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the already-separated train/test feature files, then pull the
# 'EPSBeats' target column out of the matching label files.

X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
y_train, y_test = (
    pd.read_csv(csv_path)['EPSBeats']
    for csv_path in ('../data/train_labels.csv', '../data/test_labels.csv')
)

In [None]:
# One-line summary of the dimensions of each feature matrix and label vector.
shape_summary = f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}'
print(shape_summary)

X_train: (783, 30), X_test: (196, 30), y_train: (783,), y_test: (196,)


In [None]:
from skopt.space import Integer, Real

# Iteration-0 search space: only the number of boosting rounds is tuned,
# as an integer between 100 and 500.
param_test0 = dict(n_estimators=Integer(100, 500))

In [None]:
# Iteration 0: Optimize n estimators for learning rate=0.1
#
# Bayesian search over the space in param_test0 with every other
# hyperparameter held at a hand-picked starting value. Candidates are ranked
# by accuracy over 10 cross-validation folds of the training data, and the
# best one is refit on the full training set (refit=True).

from skopt import BayesSearchCV
from lightgbm import LGBMClassifier

lgb_clf0 = LGBMClassifier(
    boosting_type='gbdt', objective='binary', random_state=42,
    learning_rate=0.1, num_leaves=32, max_depth=5, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1
)

lgb_bsearch0 = BayesSearchCV(
    estimator=lgb_clf0,
    search_spaces=param_test0,
    n_iter=100,
    scoring='accuracy',
    cv=10,
    refit=True,
    verbose=0,
    random_state=42,
    return_train_score=True
)

# Fit on the training data only. The held-out test set must not be used as an
# early-stopping eval_set inside the search: test labels would then decide how
# many trees each candidate keeps (biasing the test metrics computed later in
# this notebook), and any candidate larger than the early-stopped round would
# be truncated to the same model, which defeats a search over n_estimators.
# Dropping those fit keywords also avoids early_stopping_rounds/verbose, which
# newer LightGBM releases no longer accept in fit().
lgb_bsearch0.fit(X_train, y_train)
lgb_bsearch0

In [None]:
# Display the hyperparameter value(s) selected by the iteration-0 search.
lgb_bsearch0.best_params_

OrderedDict([('n_estimators', 264)])

In [None]:
# Iteration 1: Optimize num leaves, max depth & min child weight for best n estimators

# Tree-shape search space; min_child_weight is sampled on a log scale.
param_test1 = {
    'num_leaves': Integer(6, 50),
    'max_depth': Integer(3, 20),
    'min_child_weight': Real(1e-5, 10, prior='log-uniform'),
}

# n_estimators is read from the iteration-0 search result instead of being
# copied by hand, so this cell stays in sync whenever iteration 0 is re-run.
lgb_clf1 = LGBMClassifier(
    boosting_type='gbdt', objective='binary', random_state=42,
    learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1,
    **lgb_bsearch0.best_params_
)

lgb_bsearch1 = BayesSearchCV(
    estimator=lgb_clf1,
    search_spaces=param_test1,
    n_iter=100,
    scoring='accuracy',
    cv=10,
    refit=True,
    verbose=0,
    random_state=42,
    return_train_score=True
)

# Fit on the training data only: using the held-out test set as an
# early-stopping eval_set during the search would leak test labels into model
# selection and bias the test metrics computed later in this notebook. This
# also avoids the early_stopping_rounds/verbose fit keywords, which newer
# LightGBM releases no longer accept.
lgb_bsearch1.fit(X_train, y_train)
lgb_bsearch1

In [None]:
# Display the hyperparameter values selected by the iteration-1 search.
lgb_bsearch1.best_params_

OrderedDict([('max_depth', 19),
             ('min_child_weight', 0.003697984310772264),
             ('num_leaves', 19)])

In [None]:
# Iteration 2: Optimize subsample, colsample by tree for best params in bsearch1

# Row- and column-sampling fractions, both searched uniformly in [0.4, 1.0].
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is > 0, and this
# notebook never sets it -- confirm whether bagging was meant to be enabled;
# as written the subsample dimension likely has no effect on the model.
param_test2 = {
    'subsample': Real(0.4, 1.0),
    'colsample_bytree': Real(0.4, 1.0),
}

# Previously tuned values are read from the earlier search objects rather than
# pasted in as literals, so they cannot drift from what those searches found.
lgb_clf2 = LGBMClassifier(
    boosting_type='gbdt', objective='binary', random_state=42,
    learning_rate=0.1, scale_pos_weight=1,
    **lgb_bsearch0.best_params_,  # n_estimators
    **lgb_bsearch1.best_params_   # num_leaves, max_depth, min_child_weight
)

lgb_bsearch2 = BayesSearchCV(
    estimator=lgb_clf2,
    search_spaces=param_test2,
    n_iter=100,
    scoring='accuracy',
    cv=10,
    refit=True,
    verbose=0,
    random_state=42,
    return_train_score=True
)

# Fit on the training data only: using the held-out test set as an
# early-stopping eval_set during the search would leak test labels into model
# selection and bias the test metrics computed later in this notebook. This
# also avoids the early_stopping_rounds/verbose fit keywords, which newer
# LightGBM releases no longer accept.
lgb_bsearch2.fit(X_train, y_train)
lgb_bsearch2

In [None]:
# Display the hyperparameter values selected by the iteration-2 search.
lgb_bsearch2.best_params_

OrderedDict([('colsample_bytree', 0.6171315437034796),
             ('subsample', 0.8545648106110963)])

In [None]:
# Iteration 3: Try scale pos weight for class imbalance for best params in bsearch2

# Only the positive-class weight is searched, over the integers 1 and 2.
param_test3 = {
    'scale_pos_weight': Integer(1, 2)
}

# Previously tuned values are read from the earlier search objects rather than
# pasted in as literals, so they cannot drift from what those searches found.
lgb_clf3 = LGBMClassifier(
    boosting_type='gbdt', objective='binary', random_state=42, learning_rate=0.1,
    **lgb_bsearch0.best_params_,  # n_estimators
    **lgb_bsearch1.best_params_,  # num_leaves, max_depth, min_child_weight
    **lgb_bsearch2.best_params_   # subsample, colsample_bytree
)

lgb_bsearch3 = BayesSearchCV(
    estimator=lgb_clf3,
    search_spaces=param_test3,
    n_iter=100,
    scoring='accuracy',
    cv=10,
    refit=True,
    verbose=0,
    random_state=42,
    return_train_score=True
)

# Fit on the training data only: using the held-out test set as an
# early-stopping eval_set during the search would leak test labels into model
# selection and bias the test metrics computed later in this notebook. This
# also avoids the early_stopping_rounds/verbose fit keywords, which newer
# LightGBM releases no longer accept.
lgb_bsearch3.fit(X_train, y_train)
lgb_bsearch3

In [None]:
# Display the hyperparameter value(s) selected by the iteration-3 search.
lgb_bsearch3.best_params_

OrderedDict([('scale_pos_weight', 1)])

In [None]:
# Iteration 4: Optimize L1 & L2 regularization terms for best params in bsearch3

# L1 (reg_alpha) and L2 (reg_lambda) penalties, both searched uniformly in [0, 100].
param_test4 = {
    'reg_alpha': Real(0, 100),
    'reg_lambda': Real(0, 100)
}

# Previously tuned values are read from the earlier search objects rather than
# pasted in as literals, so they cannot drift from what those searches found.
lgb_clf4 = LGBMClassifier(
    boosting_type='gbdt', objective='binary', random_state=42, learning_rate=0.1,
    **lgb_bsearch0.best_params_,  # n_estimators
    **lgb_bsearch1.best_params_,  # num_leaves, max_depth, min_child_weight
    **lgb_bsearch2.best_params_,  # subsample, colsample_bytree
    **lgb_bsearch3.best_params_   # scale_pos_weight
)

lgb_bsearch4 = BayesSearchCV(
    estimator=lgb_clf4,
    search_spaces=param_test4,
    n_iter=100,
    scoring='accuracy',
    cv=10,
    refit=True,
    verbose=0,
    random_state=42,
    return_train_score=True
)

# Fit on the training data only: using the held-out test set as an
# early-stopping eval_set during the search would leak test labels into model
# selection and bias the test metrics computed later in this notebook. This
# also avoids the early_stopping_rounds/verbose fit keywords, which newer
# LightGBM releases no longer accept.
lgb_bsearch4.fit(X_train, y_train)
lgb_bsearch4

In [None]:
# Display the hyperparameter values selected by the iteration-4 search.
lgb_bsearch4.best_params_

OrderedDict([('reg_alpha', 13.123186146742686), ('reg_lambda', 0.0)])

In [None]:
# Iteration 5: Optimize n estimators for learning rate=0.01 and best params in bsearch4

# With the learning rate lowered to 0.01, the number of boosting rounds is
# searched again over the same 100-500 range used in iteration 0.
param_test5 = {
    'n_estimators': Integer(100, 500)
}

# Previously tuned values are read from the earlier search objects rather than
# pasted in as literals, so they cannot drift from what those searches found.
# lgb_bsearch0 is deliberately left out: n_estimators is what is tuned here.
lgb_clf5 = LGBMClassifier(
    boosting_type='gbdt', objective='binary', random_state=42, learning_rate=0.01,
    **lgb_bsearch1.best_params_,  # num_leaves, max_depth, min_child_weight
    **lgb_bsearch2.best_params_,  # subsample, colsample_bytree
    **lgb_bsearch3.best_params_,  # scale_pos_weight
    **lgb_bsearch4.best_params_   # reg_alpha, reg_lambda
)

lgb_bsearch5 = BayesSearchCV(
    estimator=lgb_clf5,
    search_spaces=param_test5,
    n_iter=100,
    scoring='accuracy',
    cv=10,
    refit=True,
    verbose=0,
    random_state=42,
    return_train_score=True
)

# Fit on the training data only. The held-out test set must not be used as an
# early-stopping eval_set inside the search: test labels would then decide how
# many trees each candidate keeps (biasing the test metrics computed later in
# this notebook), and any candidate larger than the early-stopped round would
# be truncated to the same model, which defeats a search over n_estimators.
# Dropping those fit keywords also avoids early_stopping_rounds/verbose, which
# newer LightGBM releases no longer accept in fit().
lgb_bsearch5.fit(X_train, y_train)
lgb_bsearch5

In [None]:
# Display the cross-validation score (scoring='accuracy') of the best
# candidate found by the iteration-5 search.
lgb_bsearch5.best_score_

0.6910094125283999

In [None]:
# Display the hyperparameter value(s) selected by the iteration-5 search.
lgb_bsearch5.best_params_

OrderedDict([('n_estimators', 347)])

In [None]:
# Final model: copy the full parameter set of the best estimator found by
# bsearch5 into a fresh LightGBM classifier and fit it on the training data.
# The test set is supplied as eval_set with the 'logloss' and 'auc' metrics;
# no early stopping is requested in this call.

best_search_model = lgb_bsearch5.best_estimator_
lgb_params = best_search_model.get_params()

lgb_final = LGBMClassifier(**lgb_params)
lgb_final.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    eval_metric=['logloss', 'auc'],
)
lgb_final

In [None]:
# Score the final classifier on the training set.

from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score

# Predicted class labels, plus the second predict_proba column as the score
# (presumably the positive-class probability).
y_pred_train = lgb_final.predict(X_train)
y_prob_train = lgb_final.predict_proba(X_train)[:, 1]

# Label-based metrics use the predicted classes; the two AUC metrics use the scores.
train_accuracy = accuracy_score(y_train, y_pred_train)
train_f1 = f1_score(y_train, y_pred_train)
train_pr_auc = average_precision_score(y_train, y_prob_train)
train_precision = precision_score(y_train, y_pred_train)
train_recall = recall_score(y_train, y_pred_train)
train_roc_auc = roc_auc_score(y_train, y_prob_train)

print(f'Accuracy Score: {train_accuracy}')
print(f'F1 Score: {train_f1}')
print(f'PR-AUC Score: {train_pr_auc}')
print(f'Precision Score: {train_precision}')
print(f'Recall Score: {train_recall}')
print(f'ROC-AUC Score: {train_roc_auc}')

Accuracy Score: 0.7547892720306514
F1 Score: 0.8358974358974358
PR-AUC Score: 0.9054344583540064
Precision Score: 0.7616822429906542
Recall Score: 0.9261363636363636
ROC-AUC Score: 0.8230392156862745


In [None]:
# Score the final classifier on the test set.

from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score

# Predicted class labels, plus the second predict_proba column as the score
# (presumably the positive-class probability).
y_pred_test = lgb_final.predict(X_test)
y_prob_test = lgb_final.predict_proba(X_test)[:, 1]

# Label-based metrics use the predicted classes; the two AUC metrics use the scores.
test_accuracy = accuracy_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test)
test_pr_auc = average_precision_score(y_test, y_prob_test)
test_precision = precision_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)
test_roc_auc = roc_auc_score(y_test, y_prob_test)

print(f'Accuracy Score: {test_accuracy}')
print(f'F1 Score: {test_f1}')
print(f'PR-AUC Score: {test_pr_auc}')
print(f'Precision Score: {test_precision}')
print(f'Recall Score: {test_recall}')
print(f'ROC-AUC Score: {test_roc_auc}')

Accuracy Score: 0.7346938775510204
F1 Score: 0.8266666666666667
PR-AUC Score: 0.8596380967845247
Precision Score: 0.7380952380952381
Recall Score: 0.9393939393939394
ROC-AUC Score: 0.7675189393939394
