In [10]:
%load_ext autoreload
%autoreload 2

import sys

# Put the parent directory on the import path so that project-local packages
# (presumably `src`, which this notebook imports) can be resolved.
# Guarded so that re-running the cell does not append a duplicate entry.
if '..' not in sys.path:
    sys.path.append('..')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence scikit-learn's ConvergenceWarning for the whole session.
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

In [12]:
import time

import numpy as np
import pandas as pd

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from src.utils import eval_model
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import seaborn as sns


# scikit-learn `scoring` identifiers.
# Error metrics are registered in negated form ("neg_*") so that a greater
# value is always better.
mae = 'neg_mean_absolute_error'
mse = 'neg_mean_squared_error'
rmse = 'neg_root_mean_squared_error'
# ROC AUC is already "greater is better", so its scorer name carries no
# "neg_" prefix; 'neg_roc_auc_score' is not a registered scorer name.
roc_auc = 'roc_auc'

# Parallelism handed to the estimators that accept n_jobs / thread_count.
N_JOBS = 24
# Shared seed for every estimator that accepts one.
RANDOM_SEED = 42

# Registry of candidate classifiers, keyed by a short label.
# Insertion order is preserved, so this is also the order in which the
# evaluation cells iterate over the models.
# NOTE(review): `tree_method='gpu_hist'`, `predictor` and `gpu_id` are the
# pre-2.0 XGBoost way of selecting a GPU — confirm against the installed version.
models = {
    'LR': LogisticRegression(),
    'Ridge': RidgeClassifier(),
    'DT': DecisionTreeClassifier(random_state=RANDOM_SEED),
    'KNN': KNeighborsClassifier(n_jobs=N_JOBS),
    'SVC': SVC(),
    'RF': RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS),
    'HistGB': HistGradientBoostingClassifier(random_state=RANDOM_SEED),
    'XGB': xgb.XGBClassifier(
        random_state=RANDOM_SEED,
        n_jobs=N_JOBS,
        verbosity=0,
    ),
    'XGB_GPU': xgb.XGBClassifier(
        random_state=RANDOM_SEED,
        n_jobs=N_JOBS,
        verbosity=0,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        gpu_id=1,
    ),
    'CB': cb.CatBoostClassifier(
        iterations=100,
        random_seed=RANDOM_SEED,
        thread_count=N_JOBS,
        verbose=False,
    ),
    'CB_GPU': cb.CatBoostClassifier(
        iterations=100,
        random_seed=RANDOM_SEED,
        thread_count=N_JOBS,
        verbose=False,
        task_type="GPU",
    ),
    'LGB': lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbose=-1),
}

In [None]:
# Load features and targets of training set 1.
X = pd.read_pickle('../data/processed/X_train_1.pkl.zip')
y = pd.read_pickle('../data/processed/y_train_1.pkl')

# Evaluate every registered model; `eval_model` is a project helper
# (presumably returns per-fold scores — verify against src.utils).
# Collected under a separate name so `results` is only ever a DataFrame.
scores = {}

for name, model in models.items():
    scores[name] = eval_model(name, model, X, y)

# One column per model.
results = pd.DataFrame(scores)
# Pass the frame as `data=` explicitly: the meaning of the first positional
# argument of `boxplot` differs between seaborn versions.
sns.boxplot(data=results).set_title('Model comparison on X_train_1');

     LR: 0.844    (0.852 ± 0.008)    14.9s
  Ridge: 0.777    (0.787 ± 0.009)    6.8s
     DT: 0.726    (0.730 ± 0.004)    26.3s
    KNN: 0.845    (0.856 ± 0.011)    1.6s


In [None]:
# Load features and targets of training set 2.
X = pd.read_pickle('../data/processed/X_train_2.pkl.zip')
y = pd.read_pickle('../data/processed/y_train_2.pkl')

# Evaluate every registered model; `eval_model` is a project helper
# (presumably returns per-fold scores — verify against src.utils).
# Collected under a separate name so `results` is only ever a DataFrame.
scores = {}

for name, model in models.items():
    scores[name] = eval_model(name, model, X, y)

# One column per model.
results = pd.DataFrame(scores)
# Pass the frame as `data=` explicitly: the meaning of the first positional
# argument of `boxplot` differs between seaborn versions.
sns.boxplot(data=results).set_title('Model comparison on X_train_2');

In [None]:
# Load features and targets of training set 3.
X = pd.read_pickle('../data/processed/X_train_3.pkl.zip')
y = pd.read_pickle('../data/processed/y_train_3.pkl')

# Evaluate every registered model; `eval_model` is a project helper
# (presumably returns per-fold scores — verify against src.utils).
# Collected under a separate name so `results` is only ever a DataFrame.
scores = {}

for name, model in models.items():
    scores[name] = eval_model(name, model, X, y)

# One column per model.
results = pd.DataFrame(scores)
# Pass the frame as `data=` explicitly: the meaning of the first positional
# argument of `boxplot` differs between seaborn versions.
sns.boxplot(data=results).set_title('Model comparison on X_train_3');