In [1]:
import pickle

import numpy as np
import pandas as pd

# The star import stays ahead of the explicit sklearn import so that, as before,
# the explicitly imported names win over anything it exports.
from BayDS.lib.training import *
from sklearn.model_selection import GroupKFold, KFold

In [23]:
# Read the pickled object stored under temp/ into `data`; the cells below use it as a DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load files produced by this project.
pickled_data_path = "temp/data.pkl"
data = pd.read_pickle(pickled_data_path)

In [4]:
# Columns that are never used as model inputs: the indicator_goal21..25 and
# goal21..25 columns, followed by 'goal1' (the training target below) and 'userid'.
exclude_features_from_training = [
    f'{prefix}{number}'
    for prefix in ('indicator_goal', 'goal')
    for number in range(21, 26)
] + ['goal1', 'userid']

In [30]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# Base estimator whose feature importances drive the elimination.
lgbm_settings = dict(learning_rate=0.1, max_depth=20, num_leaves=50, n_estimators=200)
clf = lgb.LGBMClassifier(**lgbm_settings)

# Recursive feature elimination with cross-validation: 10 features are removed
# per step, and each candidate subset is scored by ROC AUC on 5 stratified folds.
rfecv = RFECV(
    estimator=clf,
    step=10,
    cv=StratifiedKFold(5),
    verbose=1,
    scoring='roc_auc',
)

In [24]:
# Replace non-finite values in every float64/float32 column with the sentinel -999.
# +/-inf are mapped to NaN first so that a single fillna covers both cases.
# The dtypes are read once from data.dtypes instead of pulling each column out
# just to inspect it, and every selected column is rewritten in one assignment.
# Columns are still processed one at a time to keep peak memory low on wide frames.
float_columns = [
    name
    for name, dtype in data.dtypes.items()
    if dtype in ('float64', 'float32')
]
for col in float_columns:
    data[col] = data[col].replace([np.inf, -np.inf], np.nan).fillna(-999)

In [25]:
# Split the combined frame by the sign of 'goal1':
#   goal1 >= 0 -> rows used for training (features X, target y)
#   goal1 <  0 -> rows kept as the test set
labelled_rows = data['goal1'] >= 0
labelled = data.loc[labelled_rows]

X = labelled.drop(exclude_features_from_training + ['goal1'], axis=1)
y = labelled['goal1']
# Take an explicit copy: a 'proba' column is written into `test` later on, and
# writing into a bare slice of `data` triggers SettingWithCopyWarning and leaves
# it ambiguous whether `data` itself is modified.
test = data.loc[data['goal1'] < 0].copy()
train_features = [f for f in data.columns if f not in exclude_features_from_training]

In [None]:
# Run the cross-validated recursive feature elimination on the training rows.
# This is the expensive step: the estimator is refit for every elimination step in every fold.
rfecv.fit(X=X, y=y)

Fitting estimator with 646 features.
Fitting estimator with 636 features.
Fitting estimator with 626 features.
Fitting estimator with 616 features.
Fitting estimator with 606 features.
Fitting estimator with 596 features.
Fitting estimator with 586 features.
Fitting estimator with 576 features.
Fitting estimator with 566 features.
Fitting estimator with 556 features.
Fitting estimator with 546 features.
Fitting estimator with 536 features.
Fitting estimator with 526 features.
Fitting estimator with 516 features.
Fitting estimator with 506 features.
Fitting estimator with 496 features.
Fitting estimator with 486 features.
Fitting estimator with 476 features.


In [10]:
# Collects one training result per max_depth value; filled by the search loop below.
results = dict()

In [None]:
import datetime

NFOLDS = 5
folds = GroupKFold(n_splits=NFOLDS)

# Group labels for the folds. 'userid' is listed in exclude_features_from_training
# and is therefore dropped when X is built, so X['userid'] raises KeyError on a
# fresh run. Take the labels from `data` with the same row filter that produced X.
train_groups = data.loc[data['goal1'] >= 0, 'userid']
assert len(train_groups) == len(X), "group labels must line up with the rows of X"

# Read the submission template once: it does not change between iterations, and a
# missing file is reported before any training time is spent.
sub_template = pd.read_csv('data/onetwotrip_challenge_sub1.csv')

for max_depth in range(2,20,2):
    print(f"**********\n Max_depth={max_depth}\n")
    params = {
        'objective': 'binary',
        "metric": 'auc',
        "verbosity": -1,
        'random_state': 0,
        'max_depth': max_depth,
    }
    # Only model_type, early_stopping_rounds, n_estimators and averaging are read
    # from this dict below; the remaining entries are not forwarded in this cell.
    train_options = {
        "model_type": 'lgb',
        "params": params,
        "eval_metric": 'auc',
        'early_stopping_rounds': 100,
        'n_estimators': 500,
        'averaging': 'rank',
        'use_groups': False,
        'fold_name': folds.__class__.__name__,
        'n_splits': NFOLDS,
    }

    result_dict = train_model_classification_vb(
        X=X,
        X_test=test,
        columns=train_features,
        y=y,
        params=params,
        folds=folds,
        model_type=train_options['model_type'],
        plot_feature_importance=True,
        verbose=100,
        early_stopping_rounds=train_options['early_stopping_rounds'],
        n_estimators=train_options['n_estimators'],
        averaging=train_options['averaging'],
        groups=train_groups,
        n_jobs=-1,
    )
    results[max_depth] = result_dict

    # Checkpoint everything collected so far after each depth; the context manager
    # guarantees the file handle is flushed and closed.
    with open('temp/max_depth.pkl', 'wb') as checkpoint_file:
        pickle.dump(results, checkpoint_file)

    t = datetime.datetime.now().strftime("%m-%d-%H-%M")
    test['proba'] = result_dict['prediction']
    # Work on a fresh copy of the template so every submission starts from the same frame.
    sub = sub_template.copy()
    # Scale predictions so the largest value is 1.
    # NOTE(review): this assignment aligns on index labels, so it is only correct if
    # `test` carries the same index as the freshly read CSV - confirm, or assign positionally.
    sub['proba'] = test['proba'] / test['proba'].max()
    sub.reset_index(drop=True).set_index('orderid').to_csv(f'temp/submission_depth={max_depth}-{t}.csv')

**********
 Max_depth=2

Fold 1 started at Tue Dec 17 17:27:09 2019
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.732959	training's auc: 0.732959	valid_1's auc: 0.676344	valid_1's auc: 0.676344
[200]	training's auc: 0.75345	training's auc: 0.75345	valid_1's auc: 0.673913	valid_1's auc: 0.673913
Early stopping, best iteration is:
[114]	training's auc: 0.736893	training's auc: 0.736893	valid_1's auc: 0.67697	valid_1's auc: 0.67697
Fold 2 started at Tue Dec 17 17:28:03 2019
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.721551	training's auc: 0.721551	valid_1's auc: 0.718717	valid_1's auc: 0.718718
[200]	training's auc: 0.743946	training's auc: 0.743946	valid_1's auc: 0.719524	valid_1's auc: 0.719524
[300]	training's auc: 0.75951	training's auc: 0.75951	valid_1's auc: 0.719195	valid_1's auc: 0.719195
Early stopping, best iteration is:
[227]	training's auc: 0.748058	training's auc: 0.748058	valid_1's auc: 0.7214

CV mean score: 0.6923, std: 0.0172.
**********
 Max_depth=10

Fold 1 started at Tue Dec 17 17:46:02 2019
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.89605	training's auc: 0.89605	valid_1's auc: 0.669423	valid_1's auc: 0.669423
Early stopping, best iteration is:
[41]	training's auc: 0.831316	training's auc: 0.831316	valid_1's auc: 0.675621	valid_1's auc: 0.675621
Fold 2 started at Tue Dec 17 17:46:54 2019
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.895597	training's auc: 0.895597	valid_1's auc: 0.712549	valid_1's auc: 0.71255
Early stopping, best iteration is:
[39]	training's auc: 0.822271	training's auc: 0.822271	valid_1's auc: 0.720132	valid_1's auc: 0.720131
Fold 3 started at Tue Dec 17 17:47:46 2019
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.897868	training's auc: 0.897868	valid_1's auc: 0.692066	valid_1's auc: 0.692066
Early stopping, best iteration is:
[29

In [12]:
CV mean score: 0.6925, std: 0.0178.
# Re-export the most recent submission frame. This relies on `sub`, `max_depth`
# and `t` still holding the values left behind by the search loop.
submission_path = f'temp/submission_depth={max_depth}-{t}.csv'
sub.reset_index(drop=True).set_index('orderid').to_csv(submission_path)

In [21]:
# Print the per-fold scores stored under each key of `results`.
for depth_key in results:
    print(depth_key, results[depth_key]['scores'])

0 [0.6764080531918433, 0.7253302297489943, 0.7011452480307342, 0.6805495522257086, 0.6833038734198107]
1 [0.6764080531918433, 0.7253302297489943, 0.7011452480307342, 0.6805495522257086, 0.6833038734198107]
2 [0.6764080531918433, 0.7253302297489943, 0.7011452480307342, 0.6805495522257086, 0.6833038734198107]
3 [0.6764080531918433, 0.7253302297489943, 0.7011452480307342, 0.6805495522257086, 0.6833038734198107]
4 [0.6764080531918433, 0.7253302297489943, 0.7011452480307342, 0.6805495522257086, 0.6833038734198107]
5 [0.6764080531918433, 0.7253302297489943, 0.7011452480307342, 0.6805495522257086, 0.6833038734198107]
6 [0.6764080531918433, 0.7253302297489943, 0.7011452480307342, 0.6805495522257086, 0.6833038734198107]
7 [0.6764080531918433, 0.7253302297489943, 0.7011452480307342, 0.6805495522257086, 0.6833038734198107]
8 [0.6764080531918433, 0.7253302297489943, 0.7011452480307342, 0.6805495522257086, 0.6833038734198107]
9 [0.6764080531918433, 0.7253302297489943, 0.7011452480307342, 0.68054955