In [9]:
import sys
sys.path.insert(0, '..')
import config as cfg
from metrics import *
from helper import *

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pickle

In [10]:
import catboost as cat
from sklearn.feature_selection import RFECV, RFE
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import cross_val_predict

In [11]:
train = pd.read_pickle(cfg.PREPARED_TRAIN_DATA_PATH)
test = pd.read_pickle(cfg.PREPARED_TEST_DATA_PATH)

In [12]:
RANDOM_STATE = 77
N_SPLITS = 6

In [13]:
scorer = get_weird_pred_proba_score()
cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

In [14]:
X, y = train.drop(cfg.TARGETS, axis=1), train[cfg.TARGETS]

In [15]:
clf = cat.CatBoostClassifier(
    n_estimators=100,
    cat_features=cfg.CAT_UNORDERED_COLS, 
    objective='MultiLogloss',
    silent=True)

In [16]:
usecols = X.columns.tolist()
cat_cols = [c for c in usecols if c in cfg.CAT_UNORDERED_COLS]

n_cols = len(usecols)
best_score = float('-inf')
best_features = []


for i in tqdm(range(1, n_cols), total=n_cols-2):
    importances = np.zeros(shape=len(usecols))
    oof = pd.DataFrame(data=np.zeros_like(y), index=y.index, columns=y.columns)

    for train_idx, val_idx in tqdm(cv.split(X, y), total=N_SPLITS):
        clf = cat.CatBoostClassifier(
            loss_function='MultiLogloss',
            iterations=2000,
            silent=True,
            depth=4,
            l2_leaf_reg=2.0,
            learning_rate=0.003,
            early_stopping_rounds=300,
            bagging_temperature=1,
            cat_features=cat_cols
            )
        
        clf.fit(
            X=X[usecols].iloc[train_idx], 
            y=y.iloc[train_idx], 
            eval_set=[(X[usecols].iloc[val_idx], y.iloc[val_idx])])
        importances += clf.feature_importances_
        oof.iloc[val_idx, :] = clf.predict_proba(X[usecols].iloc[val_idx])

    score = compute_weird_pred_proba_score(y.values, oof.values, sub_std=True)
    if score > best_score:
        best_features = clf.feature_names_
        best_score = score
    print('n_features', len(usecols), 'score', score, 'best_score', best_score)

    importances /= N_SPLITS
    importances = pd.Series(data=importances, index=clf.feature_names_).sort_values()

    usecols = importances.iloc[i:].index.tolist()
    cat_cols = [c for c in usecols if c in cfg.CAT_UNORDERED_COLS]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

[0.6783808692277843, 0.6884268608612284, 0.612719779484572, 0.6803383679351017, 0.7289506374056225]
0.6777633029828618 0.0373403989496109
n_features 38 score 0.6404229040332509 best_score 0.6404229040332509


  0%|          | 0/6 [00:00<?, ?it/s]

[0.67534924330617, 0.6884268608612284, 0.6033933684802097, 0.6883839462027006, 0.733018668452166]
0.6777144174604949 0.04199135109818368
n_features 37 score 0.6357230663623112 best_score 0.6404229040332509


  0%|          | 0/6 [00:00<?, ?it/s]

[0.6771864086146682, 0.6977643147094221, 0.6030588487167822, 0.6889977050755189, 0.7317588342569181]
0.679753222274662 0.0424336165284048
n_features 35 score 0.6373196057462571 best_score 0.6404229040332509


  0%|          | 0/6 [00:00<?, ?it/s]

[0.6851232052774544, 0.6987638455418885, 0.6142050472341906, 0.6889309921545606, 0.7307765159857982]
0.6835599212387785 0.038215067556342734
n_features 32 score 0.6453448536824358 best_score 0.6453448536824358


  0%|          | 0/6 [00:00<?, ?it/s]

[0.6804666278618549, 0.6960865308120678, 0.6055744373377578, 0.6841676895981215, 0.7299329556767424]
0.6792456482573089 0.04075777977064921
n_features 28 score 0.6384878684866597 best_score 0.6453448536824358


  0%|          | 0/6 [00:00<?, ?it/s]

[0.6863782984090028, 0.6920068131285315, 0.6154762223352156, 0.6889309921545606, 0.7280410018765362]
0.6821666655807693 0.03663984830673844
n_features 23 score 0.6455268172740309 best_score 0.6455268172740309


  0%|          | 0/6 [00:00<?, ?it/s]

[0.6886762708575864, 0.6922158986598128, 0.6032729413653759, 0.695722367508139, 0.7292391658664223]
0.6818253288514674 0.04187260811314267
n_features 17 score 0.6399527207383248 best_score 0.6455268172740309


  0%|          | 0/6 [00:00<?, ?it/s]

[0.6805272603802872, 0.7044397527691084, 0.6094481761982498, 0.6825265517425415, 0.7239729708299929]
0.680182942384036 0.038768486296013
n_features 10 score 0.6414144560880229 best_score 0.6455268172740309


  0%|          | 0/6 [00:00<?, ?it/s]

[0.6504171517268141, 0.6675795034983579, 0.5986364974442691, 0.6796312109729412, 0.7173191961729233]
0.6627167119630611 0.03885545360676854
n_features 2 score 0.6238612583562926 best_score 0.6455268172740309


  0%|          | 0/6 [00:00<?, ?it/s]

CatBoostError: Input data must have at least one feature

In [18]:
best_features

['alcohol_age',
 'smoking_status_ordinal',
 'sleep_time_ordinal',
 'smoking_age',
 'alcohol_ordinal',
 'wake_up_time_ordinal',
 'education_ordinal',
 'diabetes',
 'nationality',
 'ethnos',
 'employed',
 'religion',
 'passive_smoking_frequency',
 'smoking_status',
 'wake_up_time',
 'retired',
 'sleep_time',
 'alcohol',
 'family',
 'sex',
 'education',
 'profession',
 'regular_medication_intake']

In [17]:
path = os.path.join(cfg.MODELS_PATH, 'catboost', 'selected_features', 'selected_features.pkl')
with open(path, 'wb') as f:
    pickle.dump(best_features, f)