In [3]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
import time
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import  KFold
from src.utils import scale, eval_model
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import seaborn as sns
from src.corr import non_corr_features


# scikit-learn scorer names, kept as constants for reuse in `scoring=` arguments.
mae = 'neg_mean_absolute_error'
mse = 'neg_mean_squared_error'
rmse = 'neg_root_mean_squared_error'
# ROC AUC is a "greater is better" metric, so scikit-learn registers it as plain
# 'roc_auc'; there is no 'neg_roc_auc_score' scorer and passing that name to a
# `scoring=` argument is rejected.
roc_auc = 'roc_auc'
N_JOBS = 24  # CPU threads handed to every model that accepts a thread count
RANDOM_SEED = 42  # shared seed so model comparisons are reproducible

# prepare models
# Registry of candidate classifiers keyed by the short label used in the
# evaluation output.
models = {}

models['RF'] = RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS)

models['HistGB'] = HistGradientBoostingClassifier(random_state=RANDOM_SEED)

models['XGB'] = xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbosity=0,)

# NOTE(review): `tree_method='gpu_hist'`, `predictor` and `gpu_id` are deprecated
# since XGBoost 2.0 in favour of `device='cuda:1'` with `tree_method='hist'`.
# Left unchanged because the installed XGBoost version is not visible here.
models['XGB_GPU'] = xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbosity=0,
                                  tree_method='gpu_hist', predictor='gpu_predictor', gpu_id=1)

models['CB'] = cb.CatBoostClassifier(iterations=400, random_seed=RANDOM_SEED, thread_count=N_JOBS, verbose=False)

models['CB_GPU'] = cb.CatBoostClassifier(iterations=400, random_seed=RANDOM_SEED, thread_count=N_JOBS, verbose=False, task_type="GPU")

models['LGB'] = lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbose=-1)

In [51]:
# Load the processed train/test feature frames and the training labels.
# NOTE: pickle files execute arbitrary code on load; only read trusted files.
X_train = pd.read_pickle('../data/processed/X_train.pkl.zip')
X_test = pd.read_pickle('../data/processed/X_test.pkl.zip')

y_train = pd.read_pickle('../data/processed/y_train.pkl')
# Print a compact summary (row/column counts, dtypes, memory usage).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7939 entries, 0 to 7938
Columns: 3682 entries, 0 to prop_3
dtypes: float64(3682)
memory usage: 223.1 MB


In [52]:
from src.utils import get_fps_cols, scale, OffsetScaler

# Split the columns into groups. `get_fps_cols` is a project helper; presumably
# it returns the fingerprint columns -- TODO confirm against src.utils.
fps_cols = get_fps_cols(X_train.columns)
# Everything that is not a fingerprint column.
feats = [c for c in X_train.columns if c not in fps_cols]
# Groups are selected by substring match (`in`), not by prefix.
# Only `fps_cols` is used again in the cells visible here.
rd_cols = [c for c in feats if 'rd_' in c]
md_cols = [c for c in feats if 'md_' in c]
gin_cols = [c for c in feats if 'gin_' in c]

In [53]:
# Fit the project's OffsetScaler on the training frame only, then apply the
# same fitted scaler to the test frame.
# NOTE(review): presumably the argument is the number of leading (fingerprint)
# columns to leave unscaled -- confirm against src.utils.OffsetScaler.
scaler = OffsetScaler(len(fps_cols))
# X_train / X_test are rebound to the transformed data, so this cell is not
# idempotent: re-running it without reloading would scale already-scaled data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [54]:
# Evaluate every registered model on the training data with the project's
# `eval_model` helper (judging by the output below it prints a score, a
# mean +/- std and the elapsed time per model -- confirm in src.utils).
for name, model in models.items():
    eval_model(name, model, X_train, y_train)

     RF: 0.8532    (0.859 ± 0.006)    6.8s
 HistGB: 0.8531    (0.863 ± 0.010)    92.0s
    XGB: 0.8449    (0.856 ± 0.011)    67.4s
XGB_GPU: 0.8447    (0.856 ± 0.011)    33.6s
     CB: 0.8505    (0.861 ± 0.010)    141.1s
 CB_GPU: 0.8548    (0.865 ± 0.010)    38.2s
    LGB: 0.8525    (0.862 ± 0.009)    24.3s


In [59]:
# Display (rows, columns) of the current training frame.
X_train.shape

(7939, 3679)

In [60]:
# Feature selection with the project's `non_corr_features` helper
# (presumably drops correlated features -- TODO confirm in src.corr).
# X_train is rebound, so re-running this cell operates on the reduced frame.
X_train = non_corr_features(X_train, y_train)
# Keep exactly the same columns, in the same order, in the test frame.
X_test = X_test[X_train.columns]
# Display the shape after selection.
X_train.shape

________________________________________________________________________________
[Memory] Calling src.corr.get_corr...
get_corr(        0    1    2    3    4    5    6    7    8    9  ...   gin_291  \
0     0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  ...  1.009682   
1     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.745189   
2     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ... -0.655922   
3     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ... -0.674330   
4     0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.124397   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...       ...   
7934  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.289294   
7935  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.375500   
7936  0.0  1.0  1.0  0.0  0.0  0.0  0.0  0....)
_______________________________________________________get_corr - 144.9s, 2.4min


(7939, 3108)

In [61]:
# Re-run the same evaluation loop on whatever X_train currently holds
# (the feature-selected frame when cells are run top to bottom).
for name, model in models.items():
    eval_model(name, model, X_train, y_train)

     RF: 0.8554    (0.861 ± 0.006)    4.7s
 HistGB: 0.8555    (0.864 ± 0.009)    74.0s
    XGB: 0.8446    (0.855 ± 0.010)    50.2s
XGB_GPU: 0.8465    (0.855 ± 0.009)    28.2s
     CB: 0.8494    (0.859 ± 0.010)    99.1s
 CB_GPU: 0.8533    (0.863 ± 0.010)    33.2s
    LGB: 0.8566    (0.863 ± 0.006)    18.2s


In [55]:
# Fit the GPU XGBoost model on the full training frame.
# NOTE(review): this cell's execution count (55) is lower than the
# feature-selection cell's (60), so it may have run on the pre-selection
# X_train -- confirm with Restart & Run All.
models['XGB_GPU'].fit(X_train, y_train)

In [56]:
# Class-probability predictions for the test frame (one column per class).
y_pred = models['XGB_GPU'].predict_proba(X_test)

In [57]:
from src.utils import arr_to_submit

# Write column 1 of the probability matrix (presumably the positive class --
# confirm label order) through the project's `arr_to_submit` helper to CSV.
arr_to_submit(y_pred[:, 1]).to_csv("../submits/fps_rd_md_gin_descs_xgb_gpu.csv")
# pd.Series(y_pred[:, 1], name='Y').to_csv("../submits/fps_rd_md_descs_hist_gb.csv")

In [132]:
from sklearn.model_selection import cross_validate

def get_fitted_models(model, X, y):
    """Cross-validate ``model`` and keep the estimator fitted on each fold.

    Runs shuffled 5-fold cross-validation (seeded with ``RANDOM_SEED``) on
    ``scale(X)`` using ROC AUC as the metric.

    Args:
        model: Unfitted scikit-learn compatible classifier.
        X: Training features.
        y: Training labels.

    Returns:
        tuple: ``(estimators, score)``. ``estimators`` is the ``'estimator'``
        entry returned by ``cross_validate`` (one fitted estimator per fold);
        ``score`` is the mean fold ROC AUC minus its standard deviation.
    """
    # NOTE(review): the estimators are fitted on scale(X). `scale` is a project
    # helper whose behaviour is not visible here -- confirm that data passed to
    # the returned estimators at prediction time is transformed the same way.
    splitter = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    cv_output = cross_validate(
        model,
        scale(X),
        y,
        cv=splitter,
        scoring='roc_auc',
        return_estimator=True,
    )
    fold_scores = cv_output['test_score']
    # Penalise unstable models: mean minus one standard deviation.
    pessimistic_score = fold_scores.mean() - fold_scores.std()
    return cv_output['estimator'], pessimistic_score
    

In [133]:
# fitted_models = {}

In [134]:
# X = ds['dataset_2']['X_train']
# y = ds['dataset_2']['y_train']
# 
# for name, model in tqdm(models.items()):
#     fitted_models[name] = get_fitted_models(model, X, y)

In [135]:
# Bundle the train/test frames and labels for the ensemble helpers below.
# The frames are re-read from disk, so they do NOT carry the OffsetScaler /
# non_corr_features transformations applied to the module-level
# X_train / X_test in earlier cells.
dataset = {
    'X_train': pd.read_pickle('../data/processed/X_train.pkl.zip'),
    'X_test': pd.read_pickle('../data/processed/X_test.pkl.zip'),
    'y_train': pd.read_pickle('../data/processed/y_train.pkl'),
}

In [136]:
def get_predictions(clf_list, X_test):
    """Average column 1 of ``predict_proba`` over several fitted classifiers.

    Args:
        clf_list: Iterable of fitted classifiers exposing ``predict_proba``.
        X_test: Feature matrix passed unchanged to every classifier.

    Returns:
        numpy.ndarray: Element-wise mean of each classifier's
        ``predict_proba(X_test)[:, 1]``.

    Raises:
        ValueError: If ``clf_list`` is empty (``np.stack`` needs at least one
            array).
    """
    per_clf_probs = [clf.predict_proba(X_test)[:, 1] for clf in clf_list]
    return np.mean(np.stack(per_clf_probs), axis=0)
    
# get_predictions(fitted_models['XGB GPU'], ds['dataset_2']['X_test'])

In [137]:
# fitted_models['XGB GPU'][4].predict_proba(ds['dataset_2']['X_test'])[:, 1]

In [138]:
def get_avg_predictions(fitted_models_dict, dataset, weights: dict = None):
    """Blend per-model test predictions into one weighted average.

    Args:
        fitted_models_dict: Mapping of model name -> list of fitted
            classifiers (as produced by ``fit_models``).
        dataset: Mapping that provides the test features under ``'X_test'``.
        weights: Optional mapping of model name -> weight. When ``None`` every
            model gets weight ``1.0``. May contain extra names; those are
            ignored.

    Returns:
        numpy.ndarray: One blended prediction per row of ``dataset['X_test']``.

    Raises:
        KeyError: If ``weights`` is given but lacks an entry for a model in
            ``fitted_models_dict``.
    """
    X_test = dataset['X_test']
    preds = np.zeros(X_test.shape[0])

    if weights is None:
        weights = {k: 1.0 for k in fitted_models_dict.keys()}

    # Sum only the weights that are actually applied. Normalising by
    # sum(weights.values()) would shrink the result whenever `weights` holds
    # entries for models that are not in `fitted_models_dict`.
    total_weight = 0.0
    for name, fold_models in tqdm(fitted_models_dict.items(), desc='Predicting     '):
        weight = weights[name]
        preds += weight * get_predictions(fold_models, X_test)
        total_weight += weight

    preds /= total_weight
    return preds

# get_avg_predictions(fitted_models, ds['dataset_2'])

In [139]:
# get_avg_predictions(fitted_models, ds['dataset_2'], weights={'XGB GPU': 0.1, 'RF': 0.5})

In [140]:
def fit_models(models, dataset):
    """Cross-validate every model and collect its per-fold fitted estimators.

    For each entry the model name is printed, followed by the rounded score
    returned by ``get_fitted_models``.

    Args:
        models: Mapping of model name -> unfitted classifier.
        dataset: Mapping that provides ``'X_train'`` and ``'y_train'``.

    Returns:
        dict: Model name -> fitted estimators returned by
        ``get_fitted_models``.
    """
    fitted_by_name = {}
    for label, estimator in models.items():
        # Print the label first so it is visible while the model trains.
        print(label, end=': ')
        fold_estimators, cv_score = get_fitted_models(
            estimator, dataset['X_train'], dataset['y_train']
        )
        fitted_by_name[label] = fold_estimators
        print(round(cv_score, 3))
    return fitted_by_name

# res = fit_models(models, ds['dataset_2'])

In [141]:
# res

In [142]:
# y_pred_2 = get_avg_predictions(res, ds['dataset_2'])

In [143]:
def fit_predict(models: dict, dataset: dict, weights: dict = None):
    """Fit all models with cross-validation and blend their test predictions.

    Args:
        models: Mapping of model name -> unfitted classifier.
        dataset: Mapping with ``'X_train'``, ``'y_train'`` and ``'X_test'``.
        weights: Optional mapping of model name -> blending weight, forwarded
            to ``get_avg_predictions``.

    Returns:
        The weighted-average predictions from ``get_avg_predictions``.
    """
    fitted_by_name = fit_models(models, dataset)
    return get_avg_predictions(fitted_by_name, dataset, weights)

In [144]:
# Hard-coded per-model scores used to derive the blending weights.
# NOTE(review): the provenance of these numbers is not shown in this notebook
# (they differ from the eval_model outputs above) -- record where they come from.
scores = {
     'RF': 0.868,
 'HistGB': 0.895,
    'XGB': 0.893,
'XGB_GPU': 0.892,
     'CB': 0.883,
 'CB_GPU': 0.881,
    'LGB': 0.897,
}

# One row per model, best score first.
df = pd.DataFrame(pd.Series(scores, name='score')).sort_values(by=['score'], ascending=False)
# Min-max normalise to [0, 1], then add 0.05 so the worst model keeps a small
# non-zero weight. Divides by zero if all scores are equal.
df['norm'] = (df.score - df.score.min()) / (df.score.max() - df.score.min()) + 0.05 
df.norm

LGB        1.050000
HistGB     0.981034
XGB        0.912069
XGB_GPU    0.877586
CB         0.567241
CB_GPU     0.498276
RF         0.050000
Name: norm, dtype: float64

In [145]:
# Model name -> normalised weight, in the form get_avg_predictions expects.
weights = df.norm.to_dict()

In [146]:
# NOTE(review): this repeats a `fit_predict` definition that already appears
# earlier in the notebook; the later definition silently shadows the earlier one.
def fit_predict(models: dict, dataset: dict, weights: dict = None):
    """Cross-validate every model, then return the weighted blend of test predictions.

    Args:
        models: Mapping of model name -> unfitted classifier.
        dataset: Mapping with ``'X_train'``, ``'y_train'`` and ``'X_test'``.
        weights: Optional mapping of model name -> blending weight.

    Returns:
        The result of ``get_avg_predictions`` for the fitted models.
    """
    fitted_by_name = fit_models(models, dataset)
    return get_avg_predictions(fitted_by_name, dataset, weights)

In [147]:
# Train every model with cross-validation and blend the test predictions using
# the score-derived weights. This rebinds `y_pred`, which an earlier cell used
# for the single-model probability matrix.
y_pred = fit_predict(models, dataset, weights)


  0%|          | 0/7 [00:00<?, ?it/s][A
Fitting      RF:   0%|          | 0/7 [00:00<?, ?it/s][A
Fitting      RF:  14%|█▍        | 1/7 [00:07<00:44,  7.33s/it][A
Fitting  HistGB:  14%|█▍        | 1/7 [00:07<00:44,  7.33s/it][A
Fitting  HistGB:  29%|██▊       | 2/7 [01:48<05:11, 62.25s/it][A
Fitting     XGB:  29%|██▊       | 2/7 [01:48<05:11, 62.25s/it][A
Fitting     XGB:  43%|████▎     | 3/7 [03:00<04:26, 66.72s/it][A
Fitting XGB_GPU:  43%|████▎     | 3/7 [03:00<04:26, 66.72s/it][A
Fitting XGB_GPU:  57%|█████▋    | 4/7 [03:36<02:44, 54.80s/it][A
Fitting      CB:  57%|█████▋    | 4/7 [03:36<02:44, 54.80s/it][A
Fitting      CB:  71%|███████▏  | 5/7 [06:07<02:58, 89.39s/it][A
Fitting  CB_GPU:  71%|███████▏  | 5/7 [06:07<02:58, 89.39s/it][A
Fitting  CB_GPU:  86%|████████▌ | 6/7 [06:57<01:16, 76.09s/it][A
Fitting     LGB:  86%|████████▌ | 6/7 [06:57<01:16, 76.09s/it][A
Fitting     LGB: 100%|██████████| 7/7 [07:39<00:00, 65.65s/it][A

Predicting     :   0%|          | 0/7 [00

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220],
      dtype='int64', length=1221)

In [156]:
# Write the blended predictions as a CSV with an 'id' index column and a 'Y'
# value column.
res = pd.Series(y_pred, name='Y')
res.index.name = 'id'
res.to_csv('../submits/fps_gnn_ensemble_boost.csv')