In [1]:
import pandas as pd
import numpy as np
from functools import reduce

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

import lightgbm as lgb

from tqdm import tqdm, trange

import numpy as np
np.random.seed(0)

import matplotlib.pyplot as plt

from sklearn.calibration import calibration_curve
from sklearn.calibration import IsotonicRegression

import xgboost as xgb
from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import StratifiedKFold

import mlflow
from utils import log_mlflow, create_submission, apply_isotonic_regression


In [2]:
experiment_name = "criteo_privacy_preserving_competition"
mlflow.set_experiment(experiment_name)

In [3]:
X_train = pd.read_csv('output/X_train_full.csv')
X_test = pd.read_csv('output/X_test_full.csv')

DATA_DIR = 'data/'
Y_TRAIN_PATH = DATA_DIR + 'y_train.csv.gz'
y_train = pd.read_csv(Y_TRAIN_PATH, compression='gzip')

In [9]:
class DatasetHandler:
    
    def __init__(self, X_train, X_valid, y_train, y_valid, X_test):
        self.X_train = X_train
        self.X_valid = X_valid
        self.y_train = y_train
        self.y_valid = y_valid
        self.X_test = X_test
                
    @staticmethod
    def smoothed_stat(df, smoothed_factor, prefix):
        df[f'{prefix}_ctr'] = (smoothed_factor['ctr'] * 100 + df[f'{prefix}_clicks']) / (100 + df[f'{prefix}_count'])
        df[f'{prefix}_cr'] = (smoothed_factor['cr'] * 100 + df[f'{prefix}_sales']) / (100 + df[f'{prefix}_count'])
        df[f'{prefix}_ccr'] = (smoothed_factor['ccr'] * 100 + df[f'{prefix}_sales']) /(100 + df[f'{prefix}_clicks'])
        return df
    
    def get_smoothed_factor(self):
        smoothed_factor = {
            'count': 0, 
            'clicks': 0, 
            'sales': 0,
            'freq': 0,
            'ctr': np.mean(self.y_train['click']),
            'cr': np.mean(self.y_train['sale']),
            'ccr': np.mean(self.y_train[self.y_train['click'] == 1]['sale']) 
        } 
        return smoothed_factor
                
    def preprocess_dataframes(self):

        hash_cols = {f'hash_{i}' for i in range(19)}
        train_cols = list(set(self.X_train.columns) - hash_cols)
        smoothed_factor = self.get_smoothed_factor()

        for dataset in (self.X_train, self.X_valid, self.X_test):
            for i in trange(19):
                dataset = self.smoothed_stat(dataset, smoothed_factor, f'feature{i}')
                for j in trange(19):
                    if i < j:
                        dataset = self.smoothed_stat(dataset, smoothed_factor, f'feature{i}_feature{j}')

            for col in train_cols:
                stat = col.split('_')[-1]
                if stat in smoothed_factor:
                    dataset[col] = dataset[col].replace([np.inf, -np.inf], np.nan).fillna(smoothed_factor[stat])

        dataset_dict = {
            'X_train': self.X_train[train_cols], 
            'X_valid': self.X_valid[train_cols], 
            'y_train': self.y_train['click'], 
            'y_valid': self.y_valid['click'], 
            'X_test':  self.X_test[train_cols],
        }
        return dataset_dict
        

In [17]:
class ModelStacking:
    
    def __init__(self, models, models_params):
        self.models = models
        self.models_params = models_params
        
        self.scores = {model: [] for model in self.models}
        self.val_preds = {model: {} for model in self.models}
        self.test_preds = {model: pd.DataFrame() for model in self.models}
        self.calibrated_test_preds = {model: pd.DataFrame() for model in self.models}
        
    def set_dataset(self, X_train, y_train, X_valid, y_valid, X_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_valid = X_valid
        self.y_valid = y_valid
        self.X_test = X_test

    def fit_model(self, model_name):
        if model_name == 'lightgbm':
            tr = lgb.Dataset(np.array(self.X_train), np.array(self.y_train))
            te = lgb.Dataset(np.array(self.X_valid), np.array(self.y_valid), reference=tr)
            model = lgb.train(
                self.models_params[model_name], 
                tr, 
                num_boost_round=10000, 
                valid_sets=te, 
                early_stopping_rounds=int(5 / self.models_params[model_name]['learning_rate']), 
                verbose_eval=100,
            )
            self.scores[model_name].append(model.best_score['valid_0']['binary_logloss'])
            self.val_preds[model_name][str(i)] = model.predict(self.X_valid)
            self.test_preds[model_name][str(i)] = model.predict(self.X_test)
        
        elif model_name == 'xgboost':
            model = xgb.XGBClassifier(**models_params[model_name])
            model.fit(
                self.X_train, 
                self.y_train, 
                early_stopping_rounds=int(5 / models_params[model_name]['learning_rate']), 
                eval_metric="logloss", 
                eval_set=[(self.X_valid, self.y_valid)], 
                verbose=100,
            )
            self.scores[model_name].append(model.best_score)
            self.val_preds[model_name][str(i)] = model.predict_proba(self.X_valid)[:, 1]
            self.test_preds[model_name][str(i)] = model.predict_proba(self.X_test)[:, 1]
        
        elif model_name == 'catboost':
            model = CatBoostClassifier(**models_params[model_name])
            model.fit(
                self.X_train, 
                self.y_train, 
                eval_set=Pool(self.X_valid, self.y_valid), 
                verbose=100,
            )
            self.scores[model_name].append(model.best_score_['validation']['Logloss'])
            self.val_preds[model_name][str(i)] = model.predict_proba(self.X_valid)[:, 1]
            self.test_preds[model_name][str(i)] = model.predict_proba(self.X_test)[:, 1]

    def apply_isotonic_regression(self, model_name):
        self.calibrated_test_preds[model_name][str(i)] = apply_isotonic_regression(
            self.val_preds[model_name][str(i)], 
            self.y_valid, 
            self.test_preds[model_name][str(i)],
        )
                        
    def predict(self):
        predictions = reduce(lambda x, y: x+y, [model_stacking.calibrated_test_preds[model_name].mean(axis=1) for model_name in model_stacking.models]) / len(model_stacking.models)
        return predictions


In [11]:
models = {'lightgbm', 'catboost', 'xgboost'}

models_params = {
    'lightgbm': {
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'boost': 'gbdt',
        'feature_fraction': 0.8,
        'learning_rate': 0.01,
        'metric':'binary',
        'num_leaves': 31,
        'num_threads': 8,
        'objective': 'binary',
        'seed': 42,
    }, 
    'catboost': {        
        'iterations': 10000,
        'learning_rate': 0.01,
        'loss_function': 'Logloss',
        'early_stopping_rounds': 500,
        'random_seed': 42,
    }, 
    'xgboost': {
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'learning_rate': 0.01,
        'max_leaves': 31,
        'objective': 'binary:logistic',
        'n_estimators': 10000,
        'use_label_encoder': False,
        'seed': 42,
    },
}


In [19]:
model_stacking = ModelStacking(models, models_params)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for i , (train_index, test_index) in enumerate(kf.split(X_train, y_train['click'])):
    
    data_handler = DatasetHandler(
        X_train.loc[train_index], 
        X_train.loc[test_index], 
        y_train.loc[train_index], 
        y_train.loc[test_index], 
        X_test,
    )
    dataset_dict = data_handler.preprocess_dataframes()
    model_stacking.set_dataset(**dataset_dict)
    
    for model_name in models:
        model_stacking.fit_model(model_name)
        model_stacking.apply_isotonic_regression(model_name)

metrics = {model: np.mean(val_score) for model, val_score in model_stacking.scores.items()}

  0%|          | 0/19 [00:00<?, ?it/s]
100%|██████████| 19/19 [00:00<00:00, 253.56it/s]

  
  if sys.path[0] == '':
  del sys.path[0]
100%|██████████| 19/19 [00:00<00:00, 253.31it/s]
 11%|█         | 2/19 [00:00<00:01, 12.18it/s]
100%|██████████| 19/19 [00:00<00:00, 277.01it/s]

100%|██████████| 19/19 [00:00<00:00, 296.19it/s]
 21%|██        | 4/19 [00:00<00:01, 12.93it/s]
100%|██████████| 19/19 [00:00<00:00, 322.40it/s]

100%|██████████| 19/19 [00:00<00:00, 330.35it/s]
 32%|███▏      | 6/19 [00:00<00:00, 13.89it/s]
100%|██████████| 19/19 [00:00<00:00, 354.63it/s]

100%|██████████| 19/19 [00:00<00:00, 370.38it/s]
 42%|████▏     | 8/19 [00:00<00:00, 14.87it/s]
100%|██████████| 19/19 [00:00<00:00, 430.33it/s]

100%|██████████| 19/19 [00:00<00:00, 476.31it/s]

100%|██████████| 19/19 [00:00<00:00, 538.00it/s]
 58%|█████▊    | 11/19 [00:00<00:00, 17.12it/s]
100%|██████████| 19/19 [00:00<00:00, 612.11it/s]

100%|██████████| 19/19 [00:00<00:00, 708.05it/s]

100%|██████████| 19/19 [00:00<00:00

[LightGBM] [Info] Number of positive: 9170, number of negative: 83020
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123135
[LightGBM] [Info] Number of data points in the train set: 92190, number of used features: 760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099468 -> initscore=-2.203144
[LightGBM] [Info] Start training from score -2.203144
Training until validation scores don't improve for 500 rounds
[100]	valid_0's binary_logloss: 0.27063
[200]	valid_0's binary_logloss: 0.254259
[300]	valid_0's binary_logloss: 0.246207
[400]	valid_0's binary_logloss: 0.242198
[500]	valid_0's binary_logloss: 0.240119
[600]	valid_0's binary_logloss: 0.239087
[700]	valid_0's binary_logloss: 0.238469
[800]	valid_0's binary_logloss: 0.238037
[900]	valid_0's binary_logloss: 0.237623
[1000]	valid_0's binary_logloss: 0.237357
[1100]	valid_0's binary_logloss: 0.237204
[1200]	valid_0's binary_logloss: 0.236946
[1300]	valid_0's binary_logloss: 0.236897
[1400]	val

  0%|          | 0/19 [00:00<?, ?it/s]
100%|██████████| 19/19 [00:00<00:00, 216.67it/s]
  5%|▌         | 1/19 [00:00<00:02,  8.16it/s]
  
  if sys.path[0] == '':
  del sys.path[0]
100%|██████████| 19/19 [00:00<00:00, 229.55it/s]

100%|██████████| 19/19 [00:00<00:00, 261.53it/s]
 16%|█▌        | 3/19 [00:00<00:01, 10.50it/s]
100%|██████████| 19/19 [00:00<00:00, 256.55it/s]

100%|██████████| 19/19 [00:00<00:00, 269.68it/s]
 26%|██▋       | 5/19 [00:00<00:01, 11.36it/s]
100%|██████████| 19/19 [00:00<00:00, 295.50it/s]

100%|██████████| 19/19 [00:00<00:00, 326.59it/s]
 37%|███▋      | 7/19 [00:00<00:00, 12.45it/s]
100%|██████████| 19/19 [00:00<00:00, 343.38it/s]

100%|██████████| 19/19 [00:00<00:00, 399.03it/s]
 47%|████▋     | 9/19 [00:00<00:00, 13.76it/s]
100%|██████████| 19/19 [00:00<00:00, 415.72it/s]

100%|██████████| 19/19 [00:00<00:00, 497.31it/s]
 58%|█████▊    | 11/19 [00:00<00:00, 15.40it/s]
100%|██████████| 19/19 [00:00<00:00, 595.19it/s]

100%|██████████| 19/19 [00:00<00:00, 62

[LightGBM] [Info] Number of positive: 9170, number of negative: 83020
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123132
[LightGBM] [Info] Number of data points in the train set: 92190, number of used features: 760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099468 -> initscore=-2.203144
[LightGBM] [Info] Start training from score -2.203144
Training until validation scores don't improve for 500 rounds
[100]	valid_0's binary_logloss: 0.270579
[200]	valid_0's binary_logloss: 0.253587
[300]	valid_0's binary_logloss: 0.244989
[400]	valid_0's binary_logloss: 0.240462
[500]	valid_0's binary_logloss: 0.23796
[600]	valid_0's binary_logloss: 0.236258
[700]	valid_0's binary_logloss: 0.235207
[800]	valid_0's binary_logloss: 0.234597
[900]	valid_0's binary_logloss: 0.234162
[1000]	valid_0's binary_logloss: 0.233784
[1100]	valid_0's binary_logloss: 0.233565
[1200]	valid_0's binary_logloss: 0.233382
[1300]	valid_0's binary_logloss: 0.233115
[1400]	val

5900:	learn: 0.1842265	test: 0.2314586	best: 0.2314501 (5876)	total: 5m 43s	remaining: 3m 58s
6000:	learn: 0.1834364	test: 0.2314318	best: 0.2314296 (5998)	total: 5m 49s	remaining: 3m 52s
6100:	learn: 0.1826765	test: 0.2314094	best: 0.2314059 (6097)	total: 5m 55s	remaining: 3m 47s
6200:	learn: 0.1819289	test: 0.2313894	best: 0.2313810 (6195)	total: 6m 1s	remaining: 3m 41s
6300:	learn: 0.1811727	test: 0.2314355	best: 0.2313810 (6195)	total: 6m 8s	remaining: 3m 36s
6400:	learn: 0.1803973	test: 0.2314704	best: 0.2313810 (6195)	total: 6m 14s	remaining: 3m 30s
6500:	learn: 0.1796284	test: 0.2314874	best: 0.2313810 (6195)	total: 6m 20s	remaining: 3m 24s
6600:	learn: 0.1789090	test: 0.2315075	best: 0.2313810 (6195)	total: 6m 26s	remaining: 3m 18s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.2313810198
bestIteration = 6195

Shrink model to first 6196 iterations.


  0%|          | 0/19 [00:00<?, ?it/s]
  0%|          | 0/19 [00:00<?, ?it/s][A
100%|██████████| 19/19 [00:00<00:00, 171.59it/s][A
  5%|▌         | 1/19 [00:00<00:02,  6.86it/s]
  
  if sys.path[0] == '':
  del sys.path[0]
100%|██████████| 19/19 [00:00<00:00, 208.53it/s]
 11%|█         | 2/19 [00:00<00:02,  8.40it/s]
100%|██████████| 19/19 [00:00<00:00, 228.14it/s]

  0%|          | 0/19 [00:00<?, ?it/s][A
100%|██████████| 19/19 [00:00<00:00, 179.00it/s][A
 21%|██        | 4/19 [00:00<00:01,  9.13it/s]
  0%|          | 0/19 [00:00<?, ?it/s][A
100%|██████████| 19/19 [00:00<00:00, 23.17it/s][A
 26%|██▋       | 5/19 [00:01<00:04,  3.00it/s]
100%|██████████| 19/19 [00:00<00:00, 269.42it/s]

100%|██████████| 19/19 [00:00<00:00, 294.79it/s]
 37%|███▋      | 7/19 [00:01<00:02,  4.70it/s]
100%|██████████| 19/19 [00:00<00:00, 322.29it/s]

100%|██████████| 19/19 [00:00<00:00, 322.01it/s]
 47%|████▋     | 9/19 [00:01<00:01,  6.47it/s]
100%|██████████| 19/19 [00:00<00:00, 352.49it/s]

100%|█

[LightGBM] [Info] Number of positive: 9170, number of negative: 83020
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123121
[LightGBM] [Info] Number of data points in the train set: 92190, number of used features: 760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099468 -> initscore=-2.203144
[LightGBM] [Info] Start training from score -2.203144
Training until validation scores don't improve for 500 rounds
[100]	valid_0's binary_logloss: 0.273423
[200]	valid_0's binary_logloss: 0.257604
[300]	valid_0's binary_logloss: 0.250179
[400]	valid_0's binary_logloss: 0.2463
[500]	valid_0's binary_logloss: 0.2443
[600]	valid_0's binary_logloss: 0.243018
[700]	valid_0's binary_logloss: 0.242219
[800]	valid_0's binary_logloss: 0.241526
[900]	valid_0's binary_logloss: 0.24115
[1000]	valid_0's binary_logloss: 0.240844
[1100]	valid_0's binary_logloss: 0.240611
[1200]	valid_0's binary_logloss: 0.240451
[1300]	valid_0's binary_logloss: 0.240358
[1400]	valid_0

5800:	learn: 0.1847449	test: 0.2394063	best: 0.2393926 (5747)	total: 4m 24s	remaining: 3m 11s
5900:	learn: 0.1839780	test: 0.2393818	best: 0.2393811 (5858)	total: 4m 28s	remaining: 3m 6s
6000:	learn: 0.1831974	test: 0.2393618	best: 0.2393549 (5972)	total: 4m 32s	remaining: 3m 1s
6100:	learn: 0.1824349	test: 0.2393445	best: 0.2393236 (6080)	total: 4m 36s	remaining: 2m 56s
6200:	learn: 0.1817188	test: 0.2393218	best: 0.2393218 (6200)	total: 4m 40s	remaining: 2m 51s
6300:	learn: 0.1809732	test: 0.2393017	best: 0.2393017 (6300)	total: 4m 44s	remaining: 2m 47s
6400:	learn: 0.1802186	test: 0.2392955	best: 0.2392699 (6342)	total: 4m 49s	remaining: 2m 42s
6500:	learn: 0.1794506	test: 0.2393242	best: 0.2392699 (6342)	total: 4m 54s	remaining: 2m 38s
6600:	learn: 0.1787001	test: 0.2393194	best: 0.2392699 (6342)	total: 4m 58s	remaining: 2m 33s
6700:	learn: 0.1779438	test: 0.2393521	best: 0.2392699 (6342)	total: 5m 2s	remaining: 2m 29s
6800:	learn: 0.1772161	test: 0.2393400	best: 0.2392699 (6342)	t

  0%|          | 0/19 [00:00<?, ?it/s]
100%|██████████| 19/19 [00:00<00:00, 234.68it/s]
  5%|▌         | 1/19 [00:00<00:02,  8.62it/s]
  
  if sys.path[0] == '':
  del sys.path[0]
100%|██████████| 19/19 [00:00<00:00, 233.69it/s]

100%|██████████| 19/19 [00:00<00:00, 281.59it/s]
 16%|█▌        | 3/19 [00:00<00:01, 11.00it/s]
100%|██████████| 19/19 [00:00<00:00, 292.43it/s]

100%|██████████| 19/19 [00:00<00:00, 305.07it/s]
 26%|██▋       | 5/19 [00:00<00:01, 12.41it/s]
100%|██████████| 19/19 [00:00<00:00, 335.80it/s]

100%|██████████| 19/19 [00:00<00:00, 360.21it/s]
 37%|███▋      | 7/19 [00:00<00:00, 13.73it/s]
100%|██████████| 19/19 [00:00<00:00, 382.52it/s]

100%|██████████| 19/19 [00:00<00:00, 409.68it/s]
 47%|████▋     | 9/19 [00:00<00:00, 15.11it/s]
100%|██████████| 19/19 [00:00<00:00, 469.09it/s]

100%|██████████| 19/19 [00:00<00:00, 527.98it/s]

100%|██████████| 19/19 [00:00<00:00, 606.39it/s]
 63%|██████▎   | 12/19 [00:00<00:00, 17.81it/s]
100%|██████████| 19/19 [00:00<00:00, 72

[LightGBM] [Info] Number of positive: 9170, number of negative: 83020
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123116
[LightGBM] [Info] Number of data points in the train set: 92190, number of used features: 760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099468 -> initscore=-2.203144
[LightGBM] [Info] Start training from score -2.203144
Training until validation scores don't improve for 500 rounds
[100]	valid_0's binary_logloss: 0.271173
[200]	valid_0's binary_logloss: 0.254802
[300]	valid_0's binary_logloss: 0.247045
[400]	valid_0's binary_logloss: 0.243215
[500]	valid_0's binary_logloss: 0.241003
[600]	valid_0's binary_logloss: 0.23974
[700]	valid_0's binary_logloss: 0.238975
[800]	valid_0's binary_logloss: 0.238397
[900]	valid_0's binary_logloss: 0.238014
[1000]	valid_0's binary_logloss: 0.237801
[1100]	valid_0's binary_logloss: 0.237653
[1200]	valid_0's binary_logloss: 0.23754
[1300]	valid_0's binary_logloss: 0.237446
[1400]	vali

  0%|          | 0/19 [00:00<?, ?it/s]
100%|██████████| 19/19 [00:00<00:00, 233.59it/s]

  
  if sys.path[0] == '':
  del sys.path[0]
100%|██████████| 19/19 [00:00<00:00, 253.60it/s]
 11%|█         | 2/19 [00:00<00:01, 11.62it/s]
100%|██████████| 19/19 [00:00<00:00, 252.40it/s]

100%|██████████| 19/19 [00:00<00:00, 289.18it/s]
 21%|██        | 4/19 [00:00<00:01, 12.22it/s]
100%|██████████| 19/19 [00:00<00:00, 318.80it/s]

100%|██████████| 19/19 [00:00<00:00, 332.01it/s]
 32%|███▏      | 6/19 [00:00<00:00, 13.42it/s]
100%|██████████| 19/19 [00:00<00:00, 358.58it/s]

100%|██████████| 19/19 [00:00<00:00, 379.71it/s]
 42%|████▏     | 8/19 [00:00<00:00, 14.61it/s]
100%|██████████| 19/19 [00:00<00:00, 428.87it/s]

100%|██████████| 19/19 [00:00<00:00, 472.79it/s]

100%|██████████| 19/19 [00:00<00:00, 533.21it/s]
 58%|█████▊    | 11/19 [00:00<00:00, 16.98it/s]
100%|██████████| 19/19 [00:00<00:00, 602.92it/s]

100%|██████████| 19/19 [00:00<00:00, 684.83it/s]

100%|██████████| 19/19 [00:00<00:00

[LightGBM] [Info] Number of positive: 9171, number of negative: 83020
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123188
[LightGBM] [Info] Number of data points in the train set: 92191, number of used features: 760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099478 -> initscore=-2.203035
[LightGBM] [Info] Start training from score -2.203035
Training until validation scores don't improve for 500 rounds
[100]	valid_0's binary_logloss: 0.268726
[200]	valid_0's binary_logloss: 0.25163
[300]	valid_0's binary_logloss: 0.243445
[400]	valid_0's binary_logloss: 0.239081
[500]	valid_0's binary_logloss: 0.236623
[600]	valid_0's binary_logloss: 0.235123
[700]	valid_0's binary_logloss: 0.234237
[800]	valid_0's binary_logloss: 0.233783
[900]	valid_0's binary_logloss: 0.233391
[1000]	valid_0's binary_logloss: 0.232967
[1100]	valid_0's binary_logloss: 0.232755
[1200]	valid_0's binary_logloss: 0.232583
[1300]	valid_0's binary_logloss: 0.232405
[1400]	val

  0%|          | 0/19 [00:00<?, ?it/s]
100%|██████████| 19/19 [00:00<00:00, 191.32it/s]
  5%|▌         | 1/19 [00:00<00:01,  9.24it/s]
  
  if sys.path[0] == '':
  del sys.path[0]
100%|██████████| 19/19 [00:00<00:00, 203.78it/s]
 11%|█         | 2/19 [00:00<00:01,  9.54it/s]
100%|██████████| 19/19 [00:00<00:00, 225.56it/s]

100%|██████████| 19/19 [00:00<00:00, 249.01it/s]
 21%|██        | 4/19 [00:00<00:01, 10.51it/s]
100%|██████████| 19/19 [00:00<00:00, 275.51it/s]

100%|██████████| 19/19 [00:00<00:00, 325.17it/s]
 32%|███▏      | 6/19 [00:00<00:01, 11.88it/s]
100%|██████████| 19/19 [00:00<00:00, 343.71it/s]

100%|██████████| 19/19 [00:00<00:00, 389.20it/s]
 42%|████▏     | 8/19 [00:00<00:00, 13.47it/s]
100%|██████████| 19/19 [00:00<00:00, 406.41it/s]

100%|██████████| 19/19 [00:00<00:00, 466.46it/s]
 53%|█████▎    | 10/19 [00:00<00:00, 15.17it/s]
100%|██████████| 19/19 [00:00<00:00, 515.16it/s]

100%|██████████| 19/19 [00:00<00:00, 552.07it/s]

100%|██████████| 19/19 [00:00<00:00, 63

[LightGBM] [Info] Number of positive: 9170, number of negative: 83021
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123118
[LightGBM] [Info] Number of data points in the train set: 92191, number of used features: 760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099467 -> initscore=-2.203156
[LightGBM] [Info] Start training from score -2.203156
Training until validation scores don't improve for 500 rounds
[100]	valid_0's binary_logloss: 0.271278
[200]	valid_0's binary_logloss: 0.255214
[300]	valid_0's binary_logloss: 0.248081
[400]	valid_0's binary_logloss: 0.244576
[500]	valid_0's binary_logloss: 0.242815
[600]	valid_0's binary_logloss: 0.24196
[700]	valid_0's binary_logloss: 0.241478
[800]	valid_0's binary_logloss: 0.241093
[900]	valid_0's binary_logloss: 0.240911
[1000]	valid_0's binary_logloss: 0.24075
[1100]	valid_0's binary_logloss: 0.240569
[1200]	valid_0's binary_logloss: 0.240522
[1300]	valid_0's binary_logloss: 0.240525
[1400]	vali

  0%|          | 0/19 [00:00<?, ?it/s]
  0%|          | 0/19 [00:00<?, ?it/s][A
100%|██████████| 19/19 [00:00<00:00, 182.78it/s][A
  5%|▌         | 1/19 [00:00<00:02,  8.37it/s]
  
  if sys.path[0] == '':
  del sys.path[0]
100%|██████████| 19/19 [00:00<00:00, 221.18it/s]

100%|██████████| 19/19 [00:00<00:00, 238.91it/s]
 16%|█▌        | 3/19 [00:00<00:01, 10.00it/s]
100%|██████████| 19/19 [00:00<00:00, 261.59it/s]

100%|██████████| 19/19 [00:00<00:00, 260.03it/s]
 26%|██▋       | 5/19 [00:00<00:01, 10.95it/s]
100%|██████████| 19/19 [00:00<00:00, 275.95it/s]

100%|██████████| 19/19 [00:00<00:00, 213.98it/s]
 37%|███▋      | 7/19 [00:00<00:01, 11.14it/s]
100%|██████████| 19/19 [00:00<00:00, 371.37it/s]

100%|██████████| 19/19 [00:00<00:00, 406.24it/s]
 47%|████▋     | 9/19 [00:00<00:00, 12.93it/s]
100%|██████████| 19/19 [00:00<00:00, 459.07it/s]

100%|██████████| 19/19 [00:00<00:00, 514.67it/s]

100%|██████████| 19/19 [00:00<00:00, 582.65it/s]
 63%|██████▎   | 12/19 [00:00<00:00, 15.87

[LightGBM] [Info] Number of positive: 9170, number of negative: 83021
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123130
[LightGBM] [Info] Number of data points in the train set: 92191, number of used features: 760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099467 -> initscore=-2.203156
[LightGBM] [Info] Start training from score -2.203156
Training until validation scores don't improve for 500 rounds
[100]	valid_0's binary_logloss: 0.270191
[200]	valid_0's binary_logloss: 0.253988
[300]	valid_0's binary_logloss: 0.24648
[400]	valid_0's binary_logloss: 0.242557
[500]	valid_0's binary_logloss: 0.240612
[600]	valid_0's binary_logloss: 0.239551
[700]	valid_0's binary_logloss: 0.238918
[800]	valid_0's binary_logloss: 0.238465
[900]	valid_0's binary_logloss: 0.238184
[1000]	valid_0's binary_logloss: 0.238071
[1100]	valid_0's binary_logloss: 0.237975
[1200]	valid_0's binary_logloss: 0.237965
[1300]	valid_0's binary_logloss: 0.237983
[1400]	val

  0%|          | 0/19 [00:00<?, ?it/s]
100%|██████████| 19/19 [00:00<00:00, 189.83it/s]
  5%|▌         | 1/19 [00:00<00:01,  9.04it/s]
  
  if sys.path[0] == '':
  del sys.path[0]
100%|██████████| 19/19 [00:00<00:00, 193.51it/s]
 11%|█         | 2/19 [00:00<00:01,  9.14it/s]
100%|██████████| 19/19 [00:00<00:00, 250.30it/s]

100%|██████████| 19/19 [00:00<00:00, 256.40it/s]
 21%|██        | 4/19 [00:00<00:01, 10.74it/s]
100%|██████████| 19/19 [00:00<00:00, 264.47it/s]

100%|██████████| 19/19 [00:00<00:00, 261.92it/s]
 32%|███▏      | 6/19 [00:00<00:01, 11.44it/s]
100%|██████████| 19/19 [00:00<00:00, 279.11it/s]

100%|██████████| 19/19 [00:00<00:00, 234.83it/s]
 42%|████▏     | 8/19 [00:00<00:00, 11.49it/s]
100%|██████████| 19/19 [00:00<00:00, 311.73it/s]

100%|██████████| 19/19 [00:00<00:00, 436.18it/s]
 53%|█████▎    | 10/19 [00:00<00:00, 12.79it/s]
100%|██████████| 19/19 [00:00<00:00, 497.52it/s]

100%|██████████| 19/19 [00:00<00:00, 570.78it/s]

100%|██████████| 19/19 [00:00<00:00, 62

[LightGBM] [Info] Number of positive: 9170, number of negative: 83021
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123122
[LightGBM] [Info] Number of data points in the train set: 92191, number of used features: 760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099467 -> initscore=-2.203156
[LightGBM] [Info] Start training from score -2.203156
Training until validation scores don't improve for 500 rounds
[100]	valid_0's binary_logloss: 0.269618
[200]	valid_0's binary_logloss: 0.252524
[300]	valid_0's binary_logloss: 0.244077
[400]	valid_0's binary_logloss: 0.239614
[500]	valid_0's binary_logloss: 0.237223
[600]	valid_0's binary_logloss: 0.235655
[700]	valid_0's binary_logloss: 0.234737
[800]	valid_0's binary_logloss: 0.234135
[900]	valid_0's binary_logloss: 0.233809
[1000]	valid_0's binary_logloss: 0.233586
[1100]	valid_0's binary_logloss: 0.233292
[1200]	valid_0's binary_logloss: 0.233199
[1300]	valid_0's binary_logloss: 0.232979
[1400]	va

  0%|          | 0/19 [00:00<?, ?it/s]
  0%|          | 0/19 [00:00<?, ?it/s][A
 26%|██▋       | 5/19 [00:00<00:00, 47.51it/s][A
 53%|█████▎    | 10/19 [00:00<00:00, 41.90it/s][A
100%|██████████| 19/19 [00:00<00:00, 40.53it/s][A
  5%|▌         | 1/19 [00:00<00:08,  2.01it/s]
  0%|          | 0/19 [00:00<?, ?it/s][A
 32%|███▏      | 6/19 [00:00<00:00, 58.10it/s][A
 63%|██████▎   | 12/19 [00:00<00:00, 43.12it/s][A
100%|██████████| 19/19 [00:00<00:00, 42.07it/s][A
 11%|█         | 2/19 [00:00<00:08,  2.05it/s]
  0%|          | 0/19 [00:00<?, ?it/s][A
 37%|███▋      | 7/19 [00:00<00:00, 67.97it/s][A
 74%|███████▎  | 14/19 [00:00<00:00, 44.34it/s][A
100%|██████████| 19/19 [00:00<00:00, 44.07it/s][A
 16%|█▌        | 3/19 [00:01<00:07,  2.11it/s]
  0%|          | 0/19 [00:00<?, ?it/s][A
 42%|████▏     | 8/19 [00:00<00:00, 77.82it/s][A
100%|██████████| 19/19 [00:00<00:00, 48.79it/s][A
 21%|██        | 4/19 [00:01<00:06,  2.21it/s]
  0%|          | 0/19 [00:00<?, ?it/s][A
 47%|█

[LightGBM] [Info] Number of positive: 9170, number of negative: 83021
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123172
[LightGBM] [Info] Number of data points in the train set: 92191, number of used features: 760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099467 -> initscore=-2.203156
[LightGBM] [Info] Start training from score -2.203156
Training until validation scores don't improve for 500 rounds
[100]	valid_0's binary_logloss: 0.271283
[200]	valid_0's binary_logloss: 0.25516
[300]	valid_0's binary_logloss: 0.247683
[400]	valid_0's binary_logloss: 0.243841
[500]	valid_0's binary_logloss: 0.241364
[600]	valid_0's binary_logloss: 0.240012
[700]	valid_0's binary_logloss: 0.239214
[800]	valid_0's binary_logloss: 0.238623
[900]	valid_0's binary_logloss: 0.23813
[1000]	valid_0's binary_logloss: 0.237836
[1100]	valid_0's binary_logloss: 0.237556
[1200]	valid_0's binary_logloss: 0.237389
[1300]	valid_0's binary_logloss: 0.237181
[1400]	vali

  0%|          | 0/19 [00:00<?, ?it/s]
100%|██████████| 19/19 [00:00<00:00, 250.20it/s]

  
  if sys.path[0] == '':
  del sys.path[0]
100%|██████████| 19/19 [00:00<00:00, 258.51it/s]
 11%|█         | 2/19 [00:00<00:01, 12.15it/s]
100%|██████████| 19/19 [00:00<00:00, 272.84it/s]

100%|██████████| 19/19 [00:00<00:00, 290.05it/s]
 21%|██        | 4/19 [00:00<00:01, 12.80it/s]
100%|██████████| 19/19 [00:00<00:00, 308.66it/s]

100%|██████████| 19/19 [00:00<00:00, 328.99it/s]
 32%|███▏      | 6/19 [00:00<00:00, 13.68it/s]
100%|██████████| 19/19 [00:00<00:00, 363.41it/s]

100%|██████████| 19/19 [00:00<00:00, 398.29it/s]
 42%|████▏     | 8/19 [00:00<00:00, 14.93it/s]
100%|██████████| 19/19 [00:00<00:00, 431.36it/s]

100%|██████████| 19/19 [00:00<00:00, 461.12it/s]
 53%|█████▎    | 10/19 [00:00<00:00, 16.42it/s]
100%|██████████| 19/19 [00:00<00:00, 543.28it/s]

100%|██████████| 19/19 [00:00<00:00, 622.14it/s]

100%|██████████| 19/19 [00:00<00:00, 711.85it/s]
 68%|██████▊   | 13/19 [00:00<00:00,

[LightGBM] [Info] Number of positive: 9170, number of negative: 83021
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123098
[LightGBM] [Info] Number of data points in the train set: 92191, number of used features: 760
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099467 -> initscore=-2.203156
[LightGBM] [Info] Start training from score -2.203156
Training until validation scores don't improve for 500 rounds
[100]	valid_0's binary_logloss: 0.269985
[200]	valid_0's binary_logloss: 0.25272
[300]	valid_0's binary_logloss: 0.244562
[400]	valid_0's binary_logloss: 0.240401
[500]	valid_0's binary_logloss: 0.238084
[600]	valid_0's binary_logloss: 0.23678
[700]	valid_0's binary_logloss: 0.23596
[800]	valid_0's binary_logloss: 0.235516
[900]	valid_0's binary_logloss: 0.235108
[1000]	valid_0's binary_logloss: 0.234853
[1100]	valid_0's binary_logloss: 0.234595
[1200]	valid_0's binary_logloss: 0.234491
[1300]	valid_0's binary_logloss: 0.234283
[1400]	valid

In [20]:
y_hat_click = model_stacking.predict()

# create zip file to upload
assert X_test.shape[0] == y_hat_click.shape[0], \
    "invalid prediction shape: %s expected %s" % (X_test.shape[0], y_hat_click.shape[0])
submission_name = create_submission(y_hat_click, description='lightgbm submission')


wrote submissions/submission-2021-10-20_18-27-12.599036.zip


In [21]:
tags = {
    'model': 'lightgbm',
    'experiment_name': 'stacking',
    'submission_name': submission_name
}

features_path = mlflow.get_artifact_uri() + 'features.csv'
X_train.columns.to_series().to_csv(features_path)
            
log_mlflow(metrics=metrics, tags=tags, parameters=models_params, artifacts=mlflow.get_artifact_uri())

None {'model': 'lightgbm', 'experiment_name': 'stacking', 'submission_name': 'submissions/submission-2021-10-20_18-27-12.599036.zip'} {'lightgbm': {'bagging_fraction': 0.8, 'bagging_freq': 1, 'boost': 'gbdt', 'feature_fraction': 0.8, 'learning_rate': 0.01, 'metric': 'binary', 'num_leaves': 31, 'num_threads': 8, 'objective': 'binary', 'seed': 42}, 'catboost': {'iterations': 10000, 'learning_rate': 0.01, 'loss_function': 'Logloss', 'early_stopping_rounds': 500, 'random_seed': 42}, 'xgboost': {'subsample': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_leaves': 31, 'objective': 'binary:logistic', 'n_estimators': 10000, 'use_label_encoder': False, 'seed': 42}} {'lightgbm': 0.235953855612544, 'xgboost': 0.2358611, 'catboost': 0.23579339555420065}
