In [1]:
import pandas as pd
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

In [2]:
import time

import numpy as np
import pandas as pd

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold
from src.utils import OffsetScaler, get_fps_offset
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import seaborn as sns


# scikit-learn `scoring=` identifiers. Error metrics are exposed negated
# ("neg_*") so that a larger value always means a better model.
mae = 'neg_mean_absolute_error'
mse = 'neg_mean_squared_error'
rmse = 'neg_root_mean_squared_error'
# ROC AUC is already "greater is better", so scikit-learn registers it as
# plain 'roc_auc'; 'neg_roc_auc_score' is not a known scorer name.
roc_auc = 'roc_auc'
# Thread count handed to the estimators that accept one.
N_JOBS = 12
# Single seed shared by the CV splitters and the seeded estimators.
RANDOM_SEED = 42

In [20]:
def get_objectives(X, y):
    """Build one Optuna objective per classifier family, all bound to ``X`` and ``y``.

    Every objective samples hyper-parameters from the ``trial`` it receives,
    scores the resulting classifier with shuffled 5-fold cross-validation
    (ROC AUC) and returns ``mean - std`` of the fold scores, so a
    configuration whose quality fluctuates between folds is penalised.
    Higher return values are better.

    Parameters
    ----------
    X :
        Feature matrix, forwarded unchanged to ``cross_val_score``.
    y :
        Target labels, forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    dict
        Maps the model keys ``'LR'``, ``'KNN'``, ``'SVC'``, ``'RF'``,
        ``'XGB'``, ``'CB'`` and ``'LGB'`` (in that order) to a callable
        ``objective(trial) -> float``.
    """

    def penalised_cv_auc(clf, cv_jobs):
        """Return mean minus std of the 5-fold ROC AUC of ``clf`` on ``X``/``y``."""
        # The splitter is rebuilt with the same seed on every call, so all
        # trials and all models are compared on identical folds.
        kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
        fold_auc = cross_val_score(clf, X, y, n_jobs=cv_jobs, cv=kfold, scoring='roc_auc')
        return fold_auc.mean() - fold_auc.std()

    # Logistic Regression
    def logistic_regression_objective(trial):
        """Tune C, penalty, solver and max_iter of a LogisticRegression."""
        params = {
            'C': trial.suggest_float('C', 1e-4, 1e2, log=True),
            'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
            'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
            'max_iter': trial.suggest_int('max_iter', 100, 500)
        }
        # Both candidate solvers shuffle the data internally; seed them so a
        # trial's score can be reproduced, like the other seeded models here.
        clf = LogisticRegression(random_state=RANDOM_SEED, **params)
        return penalised_cv_auc(clf, cv_jobs=-1)

    # KNN
    def knn_objective(trial):
        """Tune neighbourhood size, leaf size, Minkowski power and search algorithm."""
        params = {
            'n_neighbors': trial.suggest_int('n_neighbors', 1, 50),
            'leaf_size': trial.suggest_int('leaf_size', 10, 50),
            'p': trial.suggest_categorical('p', [1, 2]),
            'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
        }
        # NOTE(review): the estimator and the CV loop both request parallel
        # workers here, unlike the tree models below - confirm this is intended.
        clf = KNeighborsClassifier(n_jobs=N_JOBS, **params)
        return penalised_cv_auc(clf, cv_jobs=-1)

    # SVC
    def svc_objective(trial):
        """Tune C, kernel and gamma of an SVC."""
        params = {
            'C': trial.suggest_float('C', 1e-2, 1e2, log=True),
            'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
            'gamma': trial.suggest_categorical('gamma', ['scale', 'auto'])
        }
        clf = SVC(**params)
        return penalised_cv_auc(clf, cv_jobs=-1)

    # Random Forest
    def random_forest_objective(trial):
        """Tune forest size, depth and the split/leaf minimum sample counts."""
        params = {
            'n_estimators': trial.suggest_categorical('n_estimators', [100, 200, 300, 400, 500, 800, 1400, 2000]),
            'max_depth': trial.suggest_int('max_depth', 2, 40),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10)
        }
        clf = RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, **params)
        # The estimator is already multi-threaded, so folds run sequentially.
        return penalised_cv_auc(clf, cv_jobs=1)

    # XGBoost
    def xgb_objective(trial):
        """Tune boosting rounds, learning rate, depth and row/column subsampling."""
        params = {
            'n_estimators': trial.suggest_categorical('n_estimators', [50, 100, 200, 400, 800, 1000, 2000, 5000]),
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1.0, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
        }
        clf = xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbosity=0, **params)
        return penalised_cv_auc(clf, cv_jobs=1)

    # CatBoost
    def catboost_objective(trial):
        """Tune iterations, learning rate, L2 regularisation, subsampling and depth."""
        params = {
            # Recorded in the study under the name 'n_estimators' although it
            # is passed to CatBoost as `iterations`; the name is kept so that
            # existing studies stay comparable.
            'iterations': trial.suggest_categorical('n_estimators', [50, 100, 200, 400, 800, 1000, 2000, 5000]),
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1.0, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10.0, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
            'depth': trial.suggest_int('depth', 3, 10)
        }
        clf = cb.CatBoostClassifier(random_seed=RANDOM_SEED, thread_count=N_JOBS, verbose=False, **params)
        return penalised_cv_auc(clf, cv_jobs=1)

    # LightGBM
    def lgbm_objective(trial):
        """Tune boosting rounds, learning rate, depth and leaf count."""
        params = {
            'n_estimators': trial.suggest_categorical('n_estimators', [50, 100, 200, 400, 800, 1000, 2000, 5000]),
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1.0, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'num_leaves': trial.suggest_int('num_leaves', 20, 150)
        }
        clf = lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbose=0, **params)
        return penalised_cv_auc(clf, cv_jobs=1)

    # Key order is preserved from the original registration order.
    return {
        'LR': logistic_regression_objective,
        'KNN': knn_objective,
        'SVC': svc_objective,
        'RF': random_forest_objective,
        'XGB': xgb_objective,
        'CB': catboost_objective,
        'LGB': lgbm_objective,
    }


In [21]:
# Load the three pre-processed training sets as {'X': features, 'y': target}
# records. Each pair is read features first, then target, in set order.
ds = [
    {'X': pd.read_pickle(features_path), 'y': pd.read_pickle(target_path)}
    for features_path, target_path in (
        ('../data/processed/X_train_1.pkl', '../data/processed/y_train_1.pkl'),
        ('../data/processed/X_train_2.pkl', '../data/processed/y_train_2.pkl'),
        ('../data/processed/X_train_3.pkl', '../data/processed/y_train_3.pkl'),
    )
]

In [15]:
# Sanity check: (rows, columns) of the second training feature table, read
# straight from disk. The recorded run displayed (1424, 3059).
pd.read_pickle('../data/processed/X_train_2.pkl').shape

(1424, 3059)

In [24]:
import optuna

# Hyper-parameter search. Only the second dataset and the random forest are
# tuned here; widen the slice / the name list to sweep more combinations
# (get_objectives also provides 'LR', 'KNN', 'SVC', 'XGB', 'CB' and 'LGB').
for dataset in ds[1:2]:
    X = dataset['X']
    y = dataset['y']

    # get_fps_offset / OffsetScaler come from src.utils.
    # NOTE(review): the scaler is fitted on the full training matrix before
    # cross-validation; if OffsetScaler learns statistics from the data, the
    # validation folds leak into the fit - confirm.
    FPS_OFFSET = get_fps_offset(X.columns)
    scaler = OffsetScaler(FPS_OFFSET)
    X_scaled = scaler.fit_transform(X.values)

    objectives = get_objectives(X_scaled, y)
    for name in ['RF']:
        obj = objectives[name]
        # TPE is Optuna's default sampler; it is created explicitly only so
        # that it can be seeded and the search is repeatable.
        sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
        study = optuna.create_study(direction='maximize', sampler=sampler)
        # TODO: nothing here persists `study`, yet a later cell loads
        # '../data/tuning/RF.pkl' - the saving step lives elsewhere or is missing.
        study.optimize(obj, n_trials=200)

[I 2024-09-05 21:01:19,574] A new study created in memory with name: no-name-70c6bba0-3df9-4dca-9df0-95d780b381e4
[I 2024-09-05 21:01:26,404] Trial 0 finished with value: 0.7557082081952248 and parameters: {'n_estimators': 400, 'max_depth': 26, 'min_samples_split': 16, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7557082081952248.
[I 2024-09-05 21:01:31,459] Trial 1 finished with value: 0.7494737364983413 and parameters: {'n_estimators': 300, 'max_depth': 24, 'min_samples_split': 15, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.7557082081952248.
[I 2024-09-05 21:01:38,055] Trial 2 finished with value: 0.7507431200828442 and parameters: {'n_estimators': 400, 'max_depth': 21, 'min_samples_split': 6, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.7557082081952248.
[I 2024-09-05 21:01:51,245] Trial 3 finished with value: 0.7606189023035441 and parameters: {'n_estimators': 800, 'max_depth': 28, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 3 with val

KeyboardInterrupt: 

In [2]:
import pickle

# NOTE(security): unpickling executes arbitrary code stored in the file, so
# only load tuning results that were produced by this project.
with open('../data/tuning/RF.pkl', 'rb') as f:
    study = pickle.load(f)

# The recorded output of this cell is a hyper-parameter dict, which needs a
# trailing expression to be displayed.
# NOTE(review): assumes the pickle holds an optuna Study - verify.
study.best_params
    


{'n_estimators': 1400,
 'max_depth': 25,
 'min_samples_split': 14,
 'min_samples_leaf': 10}