In [1]:

# try usual models and select the best one
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score

from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
    VotingClassifier,
    BaggingClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier,
)
from sklearn.linear_model import (
    LogisticRegression,
    RidgeClassifier,
    SGDClassifier,
    PassiveAggressiveClassifier,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Any
import os
import sys
import time
import warnings

# NOTE(review): this suppresses every warning category for the rest of the
# session (deprecations included) — confirm that a blanket filter is intended.
warnings.filterwarnings("ignore")

# Record the working directory and put it on sys.path before the MODELS
# imports that follow.
current_dir = os.getcwd()
print(current_dir)
sys.path.append(current_dir)
# `models` is indexed by estimator name in the tuning cells (e.g. models["LGBM"]).
from MODELS import finetuned_models as models
from MODELS import vanilla_models as vanilla_models
# Add the parent directory and its FeaturePreprocessing folder to sys.path
# ahead of the `utils` and `FeaturePreprocessing.process` imports.
sys.path.append(current_dir + "/../")
sys.path.append(current_dir + "/../FeaturePreprocessing/")

# NOTE(review): star imports hide where names come from. load_final_data,
# get_best_parameters and evaluate_model are used later but not defined in
# this notebook, so they presumably come from one of these modules — confirm
# and consider importing them explicitly.
from utils import *
from FeaturePreprocessing.process import *


# Re-read and print the working directory once the imports above have run.
current_dir = os.getcwd()
print(current_dir)

/Users/zhaoyiming/Desktop/ETHZurich/1. Y1S1/2. AML/Project/task-2-ECG-Signals-Classification/Models
/Users/zhaoyiming/Desktop/ETHZurich/1. Y1S1/2. AML/Project/task-2-ECG-Signals-Classification/FeaturePreprocessing
/Users/zhaoyiming/Desktop/ETHZurich/1. Y1S1/2. AML/Project/task-2-ECG-Signals-Classification/Models


In [2]:
# Load the train/test feature matrices and the training labels, then print
# their shapes on one line as a quick sanity check.
X_train, X_test, y_train = load_final_data()
print(*(part.shape for part in (X_train, X_test, y_train)))

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%Time taken by load_final_data: 0.43 seconds%%%%%%%%%%%%%%%%%%%%%%%%%%%%
(5117, 308) (3411, 308) (5117,)


In [3]:
# Read the shortlist written by the model-selection step. Each non-empty line
# is expected to look like "<model name>|<score>".
with open(os.path.join(current_dir, "selected_models.txt"), "r") as f:
    selected_model_lines = f.readlines()

# Echo the raw lines so the recorded scores stay visible in the output.
print(selected_model_lines)

# Keep only the model name. Blank lines are skipped and surrounding whitespace
# is stripped, so a trailing empty line or a line without "|" cannot yield a
# bogus name such as "" or "LGBM\n".
selected_models = [
    line.split("|", 1)[0].strip()
    for line in selected_model_lines
    if line.strip()
]
print(selected_models)


['LGBM|0.8760979884530793\n', 'XGB|0.8741431451612904\n', 'CatBoost|0.875\n', 'GradientBoosting|0.8682\n', 'HistGradientBoosting|0.8721\n', 'ExtraTrees|0.8672\n', 'RandomForest|0.8662\n']
['LGBM', 'XGB', 'CatBoost', 'GradientBoosting', 'HistGradientBoosting', 'ExtraTrees', 'RandomForest']


In [37]:
# Show how many training samples carry each class label. The cell's value is
# the resulting counts, rendered by the notebook.
pd.DataFrame(y_train).value_counts()

0
0    3030
2    1474
1     443
3     170
Name: count, dtype: int64

In [None]:
# Tune the LGBM entry of `models`.
# Candidate list noted while working through the models:
# ['LGBM', 'XGB', 'CatBoost', 'GradientBoosting', 'ExtraTrees', 'Bagging', 'RandomForest', 'HistGradientBoosting']

# Hand-picked per-class weights, searched alongside "balanced" and None.
# NOTE(review): class 0 is the most frequent label in y_train yet receives the
# largest weight here — confirm this is intentional.
weight = {0: 2, 1: 1, 2: 1.5, 3: 1}

# One list of candidate values per hyper-parameter.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0;
# check that models["LGBM"] sets it, otherwise the 0.9 below has no effect.
LGBM_params = dict(
    # num_leaves=[10, 20, 30, 40, 50],
    max_depth=[5],
    learning_rate=[0.07],
    n_estimators=[1000],
    subsample=[0.9],
    class_weight=["balanced", None, weight],
)

lgbm_best = get_best_parameters(
    estimator=models["LGBM"],
    parameters=LGBM_params,
    X_train=X_train,
    y_train=y_train,
    verbose=2,
)
# evaluate_model(X_train, y_train, models["LGBM"], cv=True)

In [None]:
# fine tune XGB
# The parameter search below is disabled (kept commented out for reference);
# the only statement this cell executes is the evaluate_model call at the end.
# XGB_params = {
#     # "max_depth": [5],
#     "learning_rate": np.linspace(0.01, 0.1, 10),
#     "n_estimators": range(100, 1000, 100),
#     # "subsample": [0.9],
#     # "colsample_bytree": [0.9],
# }

# xgb_best = get_best_parameters(
#     X_train=X_train,
#     y_train=y_train,
#     estimator=models["XGB"],
#     parameters=XGB_params,
#     verbose=2,
# )

# Score the XGB entry of `models` with its current settings. cv=True is passed
# straight to evaluate_model (presumably enabling cross-validation — confirm
# in the helper's definition).
evaluate_model(X_train, y_train, models["XGB"], cv=True)

0.8741431451612904

In [None]:
# Scratch cell: constructs a CatBoostClassifier with
# auto_class_weights="Balanced" and echoes its repr. The instance is not bound
# to a name, so nothing later can use it.
CatBoostClassifier(auto_class_weights="Balanced")

<catboost.core.CatBoostClassifier at 0x1313b2510>

In [None]:
# Tune CatBoost on a freshly constructed classifier rather than an entry of
# `models`.
# NOTE(review): CatBoostClassifier is already imported at the top of the
# notebook, so this mid-notebook module import is redundant.
import catboost

# Candidate values per hyper-parameter (learning-rate / n_estimators sweeps
# are currently disabled).
CatBoost_params = dict(
    # learning_rate=np.linspace(0.01, 0.1, 10),
    # n_estimators=range(100, 1000, 100),
    verbose=[0],
    auto_class_weights=["SqrtBalanced"],
)

# NOTE(review): the base estimator is built with "Balanced" while the grid
# only offers "SqrtBalanced"; presumably the grid value replaces it during the
# search — confirm in get_best_parameters.
clf = catboost.CatBoostClassifier(auto_class_weights="Balanced")

catboost_best = get_best_parameters(
    estimator=clf,
    parameters=CatBoost_params,
    X_train=X_train,
    y_train=y_train,
    verbose=2,
)


In [None]:
# Tune the HistGradientBoosting entry of `models`.
# NOTE(review): the recorded output of this cell lists two class_weight
# candidates ('balanced' and None), so it came from a wider grid than the one
# defined here; re-run to refresh it.
HistGDBT_params = dict(
    # learning_rate=np.linspace(0.01, 0.1, 10),
    # max_iter=range(100, 1000, 100),
    verbose=[0],
    class_weight=["balanced"],
)

histgdbt_best = get_best_parameters(
    estimator=models["HistGradientBoosting"],
    parameters=HistGDBT_params,
    X_train=X_train,
    y_train=y_train,
    verbose=2,
    cv=True,
)

Searching the best parameters:  50%|█████     | 1/2 [01:59<01:59, 119.98s/it]

No: 0 params: (0, 'balanced') score: 0.8749 best: 0


Searching the best parameters: 100%|██████████| 2/2 [04:15<00:00, 127.58s/it]

No: 1 params: (0, None) score: 0.8734 best: 0.8749
Best params: {'verbose': 0, 'class_weight': 'balanced'}
score: 0.8749257316104595
best: HistGradientBoostingClassifier(learning_rate=0.01, max_iter=500)





In [None]:
# Tune the ExtraTrees entry of `models` over a single fixed configuration.
# NOTE(review): the recorded output of this cell shows three class_weight
# candidates, and its final `best:` repr carries no class_weight even though
# `Best params` reports 'balanced_subsample' — verify which configuration
# get_best_parameters actually returns.
ExtraTrees_params = dict(
    n_estimators=[500],
    max_depth=[15],
    criterion=["log_loss"],
    class_weight=["balanced_subsample"],
)

extratrees_best = get_best_parameters(
    estimator=models["ExtraTrees"],
    parameters=ExtraTrees_params,
    X_train=X_train,
    y_train=y_train,
    verbose=2,
)

Searching the best parameters:  33%|███▎      | 1/3 [00:20<00:41, 20.76s/it]

No: 0 params: (500, 15, 'log_loss', 'balanced') score: 0.8724 best: 0


Searching the best parameters:  67%|██████▋   | 2/3 [00:41<00:20, 20.74s/it]

No: 1 params: (500, 15, 'log_loss', 'balanced_subsample') score: 0.8734 best: 0.8724


Searching the best parameters: 100%|██████████| 3/3 [01:01<00:00, 20.44s/it]

No: 2 params: (500, 15, 'log_loss', None) score: 0.8722 best: 0.8734
Best params: {'n_estimators': 500, 'max_depth': 15, 'criterion': 'log_loss', 'class_weight': 'balanced_subsample'}
score: 0.8733622770039101
best: ExtraTreesClassifier(criterion='log_loss', max_depth=15, n_estimators=500)





In [None]:
# Tune the Bagging entry of `models`; only the ensemble size is searched for
# now, the other candidate axes are left disabled.
Bagging_params = dict(
    n_estimators=[10, 100, 200, 500],
    # max_samples=[0.9, 1.0],
    # max_features=[0.9, 1.0],
    # bootstrap=[True, False],
)

bagging_best = get_best_parameters(
    estimator=models["Bagging"],
    parameters=Bagging_params,
    X_train=X_train,
    y_train=y_train,
    verbose=2,
)

In [5]:
# Tune the RandomForest entry of `models`: fixed forest size, two choices for
# max_features.
# NOTE(review): in the recorded output `Best params` reports
# max_features='sqrt', but the `best:` estimator printed afterwards has
# max_features='log2' (the last candidate tried). It looks like the helper may
# hand back the last-evaluated configuration rather than the winner — confirm
# what get_best_parameters returns before reusing randomforest_best.
RandomForest_params = dict(
    n_estimators=[700],
    # max_depth=[5],
    # min_samples_split=[2],
    # min_samples_leaf=[1],
    max_features=["sqrt", "log2"],
    # class_weight=["balanced", "balanced_subsample"],
)

randomforest_best = get_best_parameters(
    estimator=models["RandomForest"],
    parameters=RandomForest_params,
    X_train=X_train,
    y_train=y_train,
    verbose=2,
)

Searching the best parameters:  50%|█████     | 1/2 [00:57<00:57, 57.71s/it]

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%Time taken by evaluate_model: 57.71 seconds%%%%%%%%%%%%%%%%%%%%%%%%%%%%
No: 0 params: (700, 'sqrt') score: 0.8708 best: 0


Searching the best parameters: 100%|██████████| 2/2 [01:30<00:00, 45.09s/it]

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%Time taken by evaluate_model: 32.48 seconds%%%%%%%%%%%%%%%%%%%%%%%%%%%%
No: 1 params: (700, 'log2') score: 0.8681 best: 0.8708
Best params: {'n_estimators': 700, 'max_features': 'sqrt'}
score: 0.8708228326612903
best: RandomForestClassifier(class_weight='balanced_subsample', max_features='log2',
                       n_estimators=700, n_jobs=-1, random_state=42)
%%%%%%%%%%%%%%%%%%%%%%%%%%Time taken by get_best_parameters: 90.19 seconds%%%%%%%%%%%%%%%%%%%%%%%%%%



