In [1]:
# try usual models and select the best one
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score

from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
    VotingClassifier,
    BaggingClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier,
)
from sklearn.linear_model import (
    LogisticRegression,
    RidgeClassifier,
    SGDClassifier,
    PassiveAggressiveClassifier,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Any
import os
import sys
import time
import warnings

warnings.filterwarnings("ignore")

current_dir = os.getcwd()
print(current_dir)
sys.path.append(current_dir)
sys.path.append(current_dir + "/../")
sys.path.append(current_dir + "/../FeaturePreprocessing/")

from utils import *
from FeaturePreprocessing.process import *


current_dir = os.getcwd()
print(current_dir)

/Users/zhaoyiming/Desktop/ETHZurich/1. Y1S1/2. AML/Project/task-2-ECG-Signals-Classification/Models
/Users/zhaoyiming/Desktop/ETHZurich/1. Y1S1/2. AML/Project/task-2-ECG-Signals-Classification/FeaturePreprocessing
/Users/zhaoyiming/Desktop/ETHZurich/1. Y1S1/2. AML/Project/task-2-ECG-Signals-Classification/Models


In [2]:
models = {
    "GaussianProcess": GaussianProcessClassifier(kernel=RBF(), n_jobs=-1),
    "LGBM": LGBMClassifier(n_jobs=-1, verbose=-1),
    "XGB": XGBClassifier(n_jobs=-1),
    "CatBoost": CatBoostClassifier(verbose=0),
    "GradientBoosting": GradientBoostingClassifier(verbose=0),  # too slow
    "HistGradientBoosting": HistGradientBoostingClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "ExtraTrees": ExtraTreesClassifier(),
    "Bagging": BaggingClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "Ridge": RidgeClassifier(),
    "SGD": SGDClassifier(),
    "PassiveAggressive": PassiveAggressiveClassifier(),
    "SVC": SVC(),
    "DecisionTree": DecisionTreeClassifier(),
    "KNeighbors": KNeighborsClassifier(),
    "GaussianNB": GaussianNB(),
    "MLP": MLPClassifier(),
}

X_train, X_test, y_train = load_final_data()
print(X_train.shape, X_test.shape, y_train.shape)
y_train = y_train.values.ravel()

(5117, 308) (3411, 308) (5117, 1)


In [3]:
with open(os.path.join(current_dir, "selected_models.txt"), "r") as f:
    selected_models = f.readlines()

print(selected_models)
selected_models = [model.split("|")[0] for model in selected_models]
print(selected_models)
selected_models.append("HistGradientBoosting")
print(selected_models)


['LGBM|0.8711\n', 'XGB|0.873\n', 'CatBoost|0.876\n', 'GradientBoosting|0.8701\n', 'ExtraTrees|0.8799\n', 'Bagging|0.8633\n', 'RandomForest|0.873\n']
['LGBM', 'XGB', 'CatBoost', 'GradientBoosting', 'ExtraTrees', 'Bagging', 'RandomForest']
['LGBM', 'XGB', 'CatBoost', 'GradientBoosting', 'ExtraTrees', 'Bagging', 'RandomForest', 'HistGradientBoosting']


In [13]:
# ['LGBM', 'XGB', 'CatBoost', 'GradientBoosting', 'ExtraTrees', 'Bagging', 'RandomForest', 'HistGradientBoosting']
# fine tune each model
LGBM_params = {
    # "num_leaves": [10, 20, 30, 40, 50],
    "max_depth": [5],
    "learning_rate": [0.07],
    "n_estimators": [1000],
    # "min_child_samples": [10, 20, 30, 40, 50],
    "subsample": [0.9],
    # "colsample_bytree": [0.1, 0.3, 0.5, 0.7, 0.9],
}

lgbm_best = get_best_parameters(
    X_train=X_train,
    y_train=y_train,
    estimator=models["LGBM"],
    parameters=LGBM_params,
    verbose=2,
)

Searching the best parameters:   0%|          | 0/1 [00:00<?, ?it/s]

Searching the best parameters: 100%|██████████| 1/1 [00:52<00:00, 52.88s/it]

No: 0 params: (5, 0.07, 1000, 0.9) score: 0.8761 best: 0
Best params: {'max_depth': 5, 'learning_rate': 0.07, 'n_estimators': 1000, 'subsample': 0.9}
score: 0.8760979884530793
best: LGBMClassifier(learning_rate=0.07, max_depth=5, n_estimators=1000, n_jobs=-1,
               subsample=0.9, verbose=-1)





In [None]:
# fine tune XGB
XGB_params = {
    # "max_depth": [5],
    "learning_rate": np.linspace(0.01, 0.1, 10),
    "n_estimators": range(100, 1000, 100),
    # "subsample": [0.9],
    # "colsample_bytree": [0.9],
}

xgb_best = get_best_parameters(
    X_train=X_train,
    y_train=y_train,
    estimator=models["XGB"],
    parameters=XGB_params,
    verbose=2,
)


In [None]:
# fine tune CatBoost

CatBoost_params = {
    "learning_rate": np.linspace(0.01, 0.1, 10),
    "n_estimators": range(100, 1000, 100),
    "verbose": [0],
}

catboost_best = get_best_parameters(
    X_train=X_train,
    y_train=y_train,
    estimator=models["CatBoost"],
    parameters=CatBoost_params,
    verbose=2,
)


In [None]:
lgbm = LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.11,
    num_leaves=16,
    random_state=0,
    num_threads=128,
)
xgboost = XGBClassifier(
    n_estimators=2000, random_state=0, learning_rate=0.11, max_depth=16, alpha=0.2
)
gradient = HistGradientBoostingClassifier(
    random_state=0, learning_rate=0.15, max_iter=400, max_leaf_nodes=31
)
forest = RandomForestClassifier(n_estimators=2000, random_state=0, n_jobs=-1)
cat_class = CatBoostClassifier()
gradient_class = GradientBoostingClassifier()

models = {
    "LGBM": lgbm,
    "XGB": xgboost,
    "GradientBoosting": gradient,
    "RandomForest": forest,
    "CatBoost": cat_class,
    "GradientBoosting": gradient_class,
}

def evaluate(model):
    cv_score = cross_val_score(
        model, X_train, y_train, cv=5, scoring=make_scorer(f1_score, average="micro")
    )
    return cv_score.mean()

evaluate(cat_class)

Learning rate set to 0.085023
0:	learn: 1.2283687	total: 90.3ms	remaining: 1m 30s
1:	learn: 1.1121770	total: 143ms	remaining: 1m 11s
2:	learn: 1.0167791	total: 205ms	remaining: 1m 7s
3:	learn: 0.9393208	total: 280ms	remaining: 1m 9s
4:	learn: 0.8744305	total: 335ms	remaining: 1m 6s
5:	learn: 0.8210161	total: 388ms	remaining: 1m 4s
6:	learn: 0.7732232	total: 448ms	remaining: 1m 3s
7:	learn: 0.7295465	total: 504ms	remaining: 1m 2s
8:	learn: 0.6929412	total: 611ms	remaining: 1m 7s
9:	learn: 0.6601604	total: 721ms	remaining: 1m 11s
10:	learn: 0.6315727	total: 791ms	remaining: 1m 11s
11:	learn: 0.6064801	total: 851ms	remaining: 1m 10s
12:	learn: 0.5834665	total: 913ms	remaining: 1m 9s
13:	learn: 0.5618460	total: 981ms	remaining: 1m 9s
14:	learn: 0.5449601	total: 1.03s	remaining: 1m 7s
15:	learn: 0.5268599	total: 1.11s	remaining: 1m 8s
16:	learn: 0.5096917	total: 1.2s	remaining: 1m 9s
17:	learn: 0.4945148	total: 1.28s	remaining: 1m 10s
18:	learn: 0.4829485	total: 1.34s	remaining: 1m 8s
19:	l

nan