In [31]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import pandas as pd

def _load_split(idx):
    """Read the processed train/test pickles belonging to dataset number `idx`.

    Returns a dict with the keys 'X_train', 'y_train' and 'X_test'.
    """
    base = '../data/processed'
    return {
        'X_train': pd.read_pickle(f'{base}/X_train_{idx}.pkl.zip'),
        'y_train': pd.read_pickle(f'{base}/y_train_{idx}.pkl'),
        'X_test': pd.read_pickle(f'{base}/X_test_{idx}.pkl.zip'),
    }


# One entry per dataset, keyed 'dataset_1' .. 'dataset_3', loaded in that order.
ds = {f'dataset_{idx}': _load_split(idx) for idx in (1, 2, 3)}

In [33]:
import time
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import  KFold
from src.utils import scale, eval_model
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import seaborn as sns


# scikit-learn scorer names, usable as the `scoring=` argument of its CV helpers.
mae = 'neg_mean_absolute_error'
mse = 'neg_mean_squared_error'
rmse = 'neg_root_mean_squared_error'
# ROC AUC is registered as plain 'roc_auc'; there is no 'neg_roc_auc_score' scorer,
# so the previous value would have been rejected as soon as it was used.
roc_auc = 'roc_auc'
N_JOBS = 24
RANDOM_SEED = 42

# prepare models
# Registry of name -> unfitted classifier. Every entry must expose predict_proba,
# because get_predictions() calls it on each fitted estimator.
models = {}

# models['LR'] = LogisticRegression()
# models['Ridge'] = RidgeClassifier()
# models['DT'] = DecisionTreeClassifier(random_state=RANDOM_SEED)
# models['Lasso'] = Lass()
# models['KNN'] = KNeighborsClassifier(n_jobs=N_JOBS)
# probability=True is required for SVC.predict_proba to exist; it makes fitting slower
# because SVC runs an internal cross-validation, which random_state makes repeatable.
models['SVC'] = SVC(probability=True, random_state=RANDOM_SEED)
models['RF'] = RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS)
models['XGB'] = xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbosity=0)
models['XGB GPU'] = xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbosity=0,
                                  tree_method='gpu_hist', predictor='gpu_predictor', gpu_id=1)
models['CB'] = cb.CatBoostClassifier(iterations=400, random_seed=RANDOM_SEED, thread_count=N_JOBS, verbose=False)
models['CB GPU'] = cb.CatBoostClassifier(iterations=400, random_seed=RANDOM_SEED, thread_count=N_JOBS, verbose=False, task_type="GPU")
models['LGB'] = lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbose=-1)

In [34]:
from sklearn.model_selection import cross_validate

def get_fitted_models(model, X, y):
    """Cross-validate `model` on (X, y) and return the estimators fitted on each fold.

    The features are passed through `scale` (imported from src.utils) once,
    before a shuffled 5-fold split seeded with RANDOM_SEED. ROC AUC is computed
    per fold, but only the fitted estimators are returned.

    NOTE(review): the prediction helpers in this notebook feed X_test to these
    estimators without calling `scale`; what `scale` does is not visible here,
    so confirm train and test features end up on the same footing.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    X_scaled = scale(X)
    results = cross_validate(
        model,
        X_scaled,
        y,
        cv=splitter,
        scoring='roc_auc',
        return_estimator=True,
    )
    return results['estimator']
    

In [35]:
# fitted_models = {}

In [36]:
# X = ds['dataset_2']['X_train']
# y = ds['dataset_2']['y_train']
# 
# for name, model in tqdm(models.items()):
#     fitted_models[name] = get_fitted_models(model, X, y)

In [37]:
def get_predictions(clf_list, X_test):
    """Average column 1 of `predict_proba(X_test)` over all classifiers in `clf_list`.

    Returns a 1-D array holding, for every row of X_test, the element-wise mean
    of the per-classifier probabilities.
    """
    fold_probs = [clf.predict_proba(X_test)[:, 1] for clf in clf_list]
    return np.stack(fold_probs).mean(axis=0)
    
# get_predictions(fitted_models['XGB GPU'], ds['dataset_2']['X_test'])

In [38]:
# fitted_models['XGB GPU'][4].predict_proba(ds['dataset_2']['X_test'])[:, 1]

In [39]:
def get_avg_predictions(fitted_models_dict, dataset, weights: dict = None):
    """Blend the per-model predictions on the test split into one weighted average.

    Parameters
    ----------
    fitted_models_dict : dict
        Maps a model name to the list of fitted estimators for that model.
    dataset : dict
        Must contain 'X_test'; its first dimension sets the output length.
    weights : dict, optional
        Maps a model name to its blending weight. When omitted, every model in
        `fitted_models_dict` gets weight 1.0. Entries for names that are not in
        `fitted_models_dict` are ignored.

    Returns
    -------
    numpy.ndarray
        1-D array with the weighted mean of each model's `get_predictions` output.

    Raises
    ------
    KeyError
        If `weights` is given but has no entry for one of the fitted models.
    """
    X_test = dataset['X_test']
    preds = np.zeros(X_test.shape[0])

    if weights is None:
        weights = {k: 1.0 for k in fitted_models_dict.keys()}

    # Normalise by the weights that were actually applied. Dividing by
    # sum(weights.values()) would shrink the result whenever `weights` also
    # carries entries for models that are absent from `fitted_models_dict`.
    total_weight = 0.0
    for name, clf_list in tqdm(fitted_models_dict.items(), desc='Predicting     '):
        model_weight = weights[name]
        preds += model_weight * get_predictions(clf_list, X_test)
        total_weight += model_weight

    preds /= total_weight
    return preds

# get_avg_predictions(fitted_models, ds['dataset_2'])

In [40]:
# get_avg_predictions(fitted_models, ds['dataset_2'], weights={'XGB GPU': 0.1, 'RF': 0.5})

In [41]:
def fit_models(models, dataset):
    """Fit every model on the dataset's training split via cross-validation.

    Parameters
    ----------
    models : dict
        Maps a model name to an unfitted estimator.
    dataset : dict
        Must contain 'X_train' and 'y_train'.

    Returns
    -------
    dict
        Maps each model name to whatever `get_fitted_models` returns for it.
    """
    fitted_models = {}
    # The context manager closes the bar even when a fit raises. A bar that is
    # never closed stays registered with tqdm and disturbs the rendering of the
    # progress bars created afterwards.
    with tqdm(total=len(models)) as pb:
        for name, model in models.items():
            pb.set_description("Fitting %7s" % name)
            fitted_models[name] = get_fitted_models(model, dataset['X_train'], dataset['y_train'])
            pb.update()
    return fitted_models


# res = fit_models(models, ds['dataset_2'])

In [42]:
# res

In [43]:
# y_pred_2 = get_avg_predictions(res, ds['dataset_2'])

In [44]:
def fit_predict(models: dict, dataset: dict):
    """Fit all `models` on the dataset and return their blended test predictions.

    No weights are passed on, so `get_avg_predictions` applies its default weighting.
    """
    return get_avg_predictions(fit_models(models, dataset), dataset)

In [45]:
# Fit and predict each dataset independently, collecting one prediction array per dataset.
y_pred = []
for name, dataset in ds.items():
    print('dataset:', name)
    y_pred.append(fit_predict(models, dataset))
    
# Join the per-dataset arrays in the iteration order of `ds`; displayed as the cell output.
np.concatenate(y_pred)

dataset: dataset_1



  0%|          | 0/2 [00:00<?, ?it/s][A
Fitting      RF:   0%|          | 0/2 [00:00<?, ?it/s][A
Fitting      RF:  50%|█████     | 1/2 [00:02<00:02,  2.73s/it][A
Fitting XGB GPU:   0%|          | 0/1 [00:23<?, ?it/s].73s/it][A

Fitting XGB GPU: 100%|██████████| 2/2 [00:17<00:00,  8.81s/it][A
Predicting     : 100%|██████████| 2/2 [00:01<00:00,  1.03it/s]


dataset: dataset_2


Fitting XGB GPU: 100%|██████████| 2/2 [00:14<00:00,  7.22s/it]
Predicting     : 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]


dataset: dataset_3


Fitting XGB GPU: 100%|██████████| 2/2 [00:13<00:00,  6.86s/it]
Predicting     : 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


array([0.27566983, 0.54945222, 0.4675001 , ..., 0.67151947, 0.57171144,
       0.69367435])

In [None]:
# Shape of the joined prediction array, i.e. the total number of predictions over all datasets.
np.concatenate(y_pred).shape

In [None]:
# test = pd.read_csv('../data/processed/test.csv', index_col=0)
# pd.Series(np.concatenate(y_pred), index=test.index, name='Y').to_csv('../data/preds/060924_avg_rf_3_boosting_gpu.csv')

# Some parameter explorations

In [None]:
# Training split of dataset_2; the experiment cells further down reuse these X and y.
X = ds['dataset_2']['X_train']
y = ds['dataset_2']['y_train']

# eval_model is imported from src.utils; what it computes or prints is not visible in this notebook.
for name, model in models.items():
    eval_model(name, model, X, y)

In [None]:
# Keyword arguments shared by the CatBoostClassifier constructors that unpack **params.
params = {
    'random_seed': RANDOM_SEED,
    'thread_count': N_JOBS,
    'verbose': False,
}

### Baseline 0.758 - 0.754

In [None]:
# No `iterations` argument is passed, so both variants run with CatBoost's own default.
models['CB']     = cb.CatBoostClassifier(**params)
models['CB GPU'] = cb.CatBoostClassifier(task_type='GPU', **params)

# The loop evaluates every entry in `models`, not only the two replaced above.
for name, model in models.items():
    eval_model(name, model, X, y)

### Num iters 800, 400, 200, 100

In [None]:
# Replace both CatBoost entries with 800-iteration variants, then re-evaluate all of `models`.
models['CB']     = cb.CatBoostClassifier(iterations=800, **params)
models['CB GPU'] = cb.CatBoostClassifier(iterations=800, task_type='GPU', **params)

for name, model in models.items():
    eval_model(name, model, X, y)

In [None]:
# Replace both CatBoost entries with 400-iteration variants, then re-evaluate all of `models`.
models['CB']     = cb.CatBoostClassifier(iterations=400, **params)
models['CB GPU'] = cb.CatBoostClassifier(iterations=400, task_type='GPU', **params)

for name, model in models.items():
    eval_model(name, model, X, y)

In [None]:
# Replace both CatBoost entries with 200-iteration variants, then re-evaluate all of `models`.
models['CB']     = cb.CatBoostClassifier(iterations=200, **params)
models['CB GPU'] = cb.CatBoostClassifier(iterations=200, task_type='GPU', **params)

for name, model in models.items():
    eval_model(name, model, X, y)

In [None]:
# Replace both CatBoost entries with 100-iteration variants, then re-evaluate all of `models`.
models['CB']     = cb.CatBoostClassifier(iterations=100,  **params)
models['CB GPU'] = cb.CatBoostClassifier(iterations=100, task_type='GPU', **params)

for name, model in models.items():
    eval_model(name, model, X, y)

### Stability at 100 iterations across random seeds
`random_seed=42` is an outlier

In [None]:
# Seed 0 at 100 iterations. `params` is not unpacked here, so thread_count is not set
# explicitly, unlike the cells that pass **params.
models['CB']     = cb.CatBoostClassifier(iterations=100, random_seed=0, verbose=False)
models['CB GPU'] = cb.CatBoostClassifier(iterations=100, task_type='GPU', random_seed=0, verbose=False)

for name, model in models.items():
    eval_model(name, model, X, y)

In [None]:
# Seed 1 at 100 iterations; thread_count is not set explicitly in this cell.
models['CB']     = cb.CatBoostClassifier(iterations=100, random_seed=1, verbose=False)
models['CB GPU'] = cb.CatBoostClassifier(iterations=100, task_type='GPU', random_seed=1, verbose=False)

for name, model in models.items():
    eval_model(name, model, X, y)

In [None]:
# Seed 2 at 100 iterations; thread_count is not set explicitly in this cell.
models['CB']     = cb.CatBoostClassifier(iterations=100, random_seed=2, verbose=False)
models['CB GPU'] = cb.CatBoostClassifier(iterations=100, task_type='GPU', random_seed=2, verbose=False)

for name, model in models.items():
    eval_model(name, model, X, y)

In [None]:
# Seed 42 (the same value as RANDOM_SEED) at 100 iterations; compared with the
# **params-based 100-iteration cell, the only difference is that thread_count is not passed.
models['CB']     = cb.CatBoostClassifier(iterations=100, random_seed=42, verbose=False)
models['CB GPU'] = cb.CatBoostClassifier(iterations=100, task_type='GPU', random_seed=42, verbose=False)

for name, model in models.items():
    eval_model(name, model, X, y)

### other speedup options

In [None]:
# 100 iterations with bootstrap_type='Bernoulli' for both the CPU and GPU variants.
models['CB']     = cb.CatBoostClassifier(iterations=100, bootstrap_type='Bernoulli', **params)
models['CB GPU'] = cb.CatBoostClassifier(iterations=100, bootstrap_type='Bernoulli', task_type='GPU', **params)

for name, model in models.items():
    eval_model(name, model, X, y)

In [None]:
# 100 iterations with sampling_frequency='PerTree' for both variants.
# NOTE(review): confirm CatBoost accepts sampling_frequency together with task_type='GPU'.
models['CB']     = cb.CatBoostClassifier(iterations=100, sampling_frequency='PerTree', **params)
models['CB GPU'] = cb.CatBoostClassifier(iterations=100, sampling_frequency='PerTree', task_type='GPU', **params)

for name, model in models.items():
    eval_model(name, model, X, y)

In [None]:
# 100 iterations with grow_policy='SymmetricTree' for both variants.
# NOTE(review): SymmetricTree may already be CatBoost's default grow policy, in which case
# this repeats the plain 100-iteration run — confirm against the CatBoost docs.
models['CB']     = cb.CatBoostClassifier(iterations=100, grow_policy='SymmetricTree', **params)
models['CB GPU'] = cb.CatBoostClassifier(iterations=100, grow_policy='SymmetricTree', task_type='GPU', **params)

for name, model in models.items():
    eval_model(name, model, X, y)