### A simple stacking example

Steps
- Build two level 1 models using cross-validation (CV). Each model generates out-of-fold (OOF) predictions on the training data as well as predictions on the test data.
- Build the data set for level 2
    - Combine the OOF predictions to form a train DataFrame (l1_train_df). With two models in level 1, this dataset will have two feature columns. Each column will be the OOF prediction from one of the models from level 1. The target variable from the raw data serves as the label (in the code below it is kept separately as `train_Y` rather than stored as a third column).
    - Combine the test predictions to form a test DataFrame (l1_test_df). With two models in level 1, this dataset will have two columns. Each column will be the predictions on test data from one of the models from level 1.
- Build a level 2 model on l1_train_df with CV and predict on l1_test_df

In [1]:
import os
import sys
import gc

import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import RidgeClassifier

pd.options.display.max_rows = 1000

In [2]:
sys.path.insert(0, "/opt/vssexclude/personal/kaggle/kaggle_tab_mar/src")

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import munging.process_data_util as process_data
import common.com_util as common
import config.constants as constants
import modeling.train_util as model

In [5]:
# Seed handed to common.set_seed, the LightGBM params and RidgeClassifier below.
SEED = 42
# Name of the label column in the training DataFrame.
TARGET = 'target'

# One logger instance is shared by every cell in this notebook.
LOGGER_NAME = 'main'
logger = common.get_logger(LOGGER_NAME)
# Project helper; presumably seeds the global RNGs — confirm in common.com_util.
common.set_seed(SEED)

In [6]:
def __evaluate_and_log(
    logger,
    run_id,
    train_Y,
    y_oof,
    y_predicted,
    n_folds,
    result_dict,
    cv_scores,
    best_iterations,
):
    """Summarise a finished cross-validation run and record it in ``result_dict``.

    Args:
        logger: Logger used to report the summary scores.
        run_id: Accepted for interface compatibility; not used here.
        train_Y: True labels for the whole training set.
        y_oof: Out-of-fold predictions aligned with ``train_Y``.
        y_predicted: Sum of the per-fold test predictions. It is divided
            by ``n_folds`` in place, so the caller's array is modified.
        n_folds: Number of CV folds that contributed to ``y_predicted``.
        result_dict: Dictionary that receives the results.
        cv_scores: Per-fold validation scores.
        best_iterations: Accepted for interface compatibility; not used here.

    Returns:
        ``result_dict`` with the keys ``y_oof``, ``prediction``,
        ``oof_score``, ``cv_scores``, ``avg_cv_scores`` and ``std_cv_scores``.
    """
    # Turn the accumulated fold predictions into their mean (in place).
    y_predicted /= n_folds

    oof_score = round(__calculate_perf_metric(train_Y, y_oof), 5)
    avg_cv_scores = round(sum(cv_scores) / len(cv_scores), 5)
    fold_scores = np.array(cv_scores)
    std_cv_scores = round(fold_scores.std(), 5)

    summary_lines = (
        f"Combined OOF score : {oof_score}",
        f"Average of {n_folds} folds OOF score {avg_cv_scores}",
        f"std of {n_folds} folds OOF score {std_cv_scores}",
    )
    for line in summary_lines:
        logger.info(line)

    entries = (
        ("y_oof", y_oof),
        ("prediction", y_predicted),
        ("oof_score", oof_score),
        ("cv_scores", cv_scores),
        ("avg_cv_scores", avg_cv_scores),
        ("std_cv_scores", std_cv_scores),
    )
    for key, value in entries:
        result_dict[key] = value

    return result_dict


def __get_X_Y_from_CV(train_X, train_Y, train_index, validation_index):
    """Slice features and labels into the train / validation parts of one fold.

    Args:
        train_X: Feature frame, indexed positionally via ``.iloc``.
        train_Y: Labels, indexed positionally via ``.iloc``.
        train_index: Positional indices of the rows used for fitting.
        validation_index: Positional indices of the held-out rows.

    Returns:
        Tuple ``(X_train, X_validation, y_train, y_validation)`` holding the
        ``.values`` of each slice.
    """
    fit_rows = train_X.iloc[train_index]
    holdout_rows = train_X.iloc[validation_index]
    fit_labels = train_Y.iloc[train_index]
    holdout_labels = train_Y.iloc[validation_index]
    return (
        fit_rows.values,
        holdout_rows.values,
        fit_labels.values,
        holdout_labels.values,
    )


def __calculate_perf_metric(y, y_hat):
    """Return the performance metric used throughout this notebook.

       The metric is the area under the ROC curve, computed with
       ``sklearn.metrics.roc_auc_score``.

       Args:
           y: True labels
           y_hat: Predicted scores / probabilities

       Returns:
           ROC AUC computed from ``y`` and ``y_hat``
    """
    return roc_auc_score(y, y_hat)


def lgb_train_validate_on_cv(
    logger,
    run_id,
    train_X,
    train_Y,
    test_X,
    kf,
    features,
    params=None,
    n_estimators=1000,
    early_stopping_rounds=100,
    cat_features="auto",
    verbose_eval=100,
    is_test=False,
):
    """Train one LightGBM model per CV fold and collect OOF / test predictions.

    For every split produced by ``kf`` a booster is trained on the training
    part with early stopping on the validation part. The validation rows are
    predicted to build the out-of-fold (OOF) vector. If ``test_X`` is given,
    each fold's booster also predicts it and the fold predictions are
    averaged. No model is refit on the full training data.

    Args:
        logger: Logger used for progress and score messages.
        run_id: Forwarded to ``__evaluate_and_log``.
        train_X: Training features (DataFrame, indexed positionally).
        train_Y: Training labels (Series, indexed positionally).
        test_X: Test features, or ``None`` to skip test prediction.
        kf: Splitter exposing ``get_n_splits()`` and ``split(X=..., y=...)``.
        features: Feature names passed to ``lgb.train``.
        params: LightGBM parameter dict; ``None`` means an empty dict.
        n_estimators: Maximum number of boosting rounds.
        early_stopping_rounds: Patience passed to ``lgb.train``.
        cat_features: Value passed as ``categorical_feature``.
        verbose_eval: Logging period passed to ``lgb.train``.
        is_test: Unused; kept for interface compatibility.

    Returns:
        dict with the keys filled in by ``__evaluate_and_log`` (``y_oof``,
        ``prediction``, ``oof_score``, ``cv_scores``, ``avg_cv_scores``,
        ``std_cv_scores``) plus ``best_iterations`` (one entry per fold).
        ``prediction`` is an empty array when ``test_X`` is ``None``.
    """
    # Avoid a mutable default argument shared between calls.
    params = {} if params is None else params

    has_test = test_X is not None
    y_oof = np.zeros(len(train_X))
    # With no test set, keep an empty array so the averaging step downstream
    # still works instead of crashing on len(None).
    y_predicted = np.zeros(len(test_X)) if has_test else np.zeros(0)
    # Extract the test matrix once instead of once per fold.
    test_values = test_X.values if has_test else None

    cv_scores = []
    best_iterations = []
    result_dict = {}

    n_folds = kf.get_n_splits()
    splits = kf.split(X=train_X, y=train_Y)
    for fold, (train_index, validation_index) in enumerate(splits, start=1):
        logger.info(f"fold {fold} of {n_folds}")

        X_train, X_validation, y_train, y_validation = __get_X_Y_from_CV(
            train_X, train_Y, train_index, validation_index
        )

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(
            X_validation, y_validation, reference=lgb_train)
        # NOTE: `verbose_eval` / `early_stopping_rounds` are keyword arguments
        # of older LightGBM releases; LightGBM >= 4.0 expects callbacks
        # instead. Kept as-is to match the version this notebook was run with.
        booster = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=verbose_eval,
            early_stopping_rounds=early_stopping_rounds,
            num_boost_round=n_estimators,
            feature_name=features,
            categorical_feature=cat_features,
        )
        best_iterations.append(booster.best_iteration)

        # Release the fold's training data before predicting.
        del lgb_train, lgb_eval, X_train, y_train
        gc.collect()

        y_oof[validation_index] = booster.predict(
            X_validation, num_iteration=booster.best_iteration
        )

        if has_test:
            # Accumulate; the sum is divided by n_folds in __evaluate_and_log.
            y_predicted += booster.predict(
                test_values, num_iteration=booster.best_iteration
            )

        cv_oof_score = __calculate_perf_metric(y_validation, y_oof[validation_index])
        cv_scores.append(cv_oof_score)
        logger.info(f"CV OOF Score for fold {fold} is {cv_oof_score}")

        del X_validation, y_validation
        gc.collect()

    result_dict = __evaluate_and_log(
        logger,
        run_id,
        train_Y,
        y_oof,
        y_predicted,
        n_folds,
        result_dict,
        cv_scores,
        best_iterations,
    )
    result_dict["best_iterations"] = best_iterations

    logger.info("Training/Prediction completed!")
    return result_dict

In [7]:
# Load the pre-processed train / test / sample-submission frames via the
# project helper; the shapes are reported through `logger` (see output below).
train_df, test_df, sample_submission_df = process_data.read_processed_data(
    logger, constants.PROCESSED_DATA_DIR, train=True, test=True, sample_submission=True)

[INFO]2021-03-23 10:06:45,454:main:Reading Data from /opt/vssexclude/personal/kaggle/kaggle_tab_mar/data/processed...
[INFO]2021-03-23 10:06:45,576:main:Shape of train_df : (300000, 31)
[INFO]2021-03-23 10:06:45,616:main:Shape of test_df : (200000, 30)
[INFO]2021-03-23 10:06:45,624:main:Shape of sample_submission_df : (200000, 1)


### LightGBM with LabelEncoding

In [8]:
# Stack train (without the label) on top of test so each encoder sees every
# category value that occurs in either set.
combined_df = pd.concat([train_df.drop('target', axis=1), test_df])
target = train_df[TARGET]

# Every column whose name contains "cat" is treated as categorical.
cat_fetaures = [name for name in train_df.columns if "cat" in name]

logger.info("Label Encoding the categorcal features")
for name in cat_fetaures:
    # A fresh encoder per column, fitted on the combined train+test values.
    lb = LabelEncoder()
    combined_df[name] = lb.fit_transform(combined_df[name])

# Split the combined frame back into train and test by index label.
# NOTE(review): relies on train and test index labels not overlapping — confirm.
train_df = combined_df.loc[train_df.index]
train_df[TARGET] = target

test_df = combined_df.loc[test_df.index]

# Inputs for the first level 1 model.
train_X = train_df.drop([TARGET], axis=1)
train_Y = train_df[TARGET]
test_X = test_df

[INFO]2021-03-23 10:06:46,708:main:Label Encoding the categorcal features


In [9]:
# Hyper-parameters for the label-encoded LightGBM model.
# MODEL_TYPE and VERBOSE are not referenced anywhere in the visible code.
MODEL_TYPE = "lgb"
OBJECTIVE = "binary"
BOOSTING_TYPE = "gbdt"
METRIC = "auc"
VERBOSE = 100
N_THREADS = -1
NUM_LEAVES = 31
MAX_DEPTH = -1
# Upper bound on boosting rounds; early stopping ends training well before
# this in the logged runs below.
N_ESTIMATORS = 10000
LEARNING_RATE = 0.1
EARLY_STOPPING_ROUNDS = 100


lgb_params = {
                'objective': OBJECTIVE,
                'boosting_type': BOOSTING_TYPE,
                'learning_rate': LEARNING_RATE,
                'num_leaves': NUM_LEAVES,
                'tree_learner': 'serial',
                'n_jobs': N_THREADS,
                'seed': SEED,
                'max_depth': MAX_DEPTH,
                'max_bin': 255,
                'metric': METRIC,
                'verbose': -1,
                }

predictors = list(train_X.columns)
logger.info(f"List of predictors {predictors}")

# shuffle=False: the folds depend only on row order and the labels, so every
# cell that builds this splitter on the same train_Y gets the same folds.
sk = StratifiedKFold(n_splits=10, shuffle=False)

# Label-encoded columns are passed to LightGBM as categorical features.
results_dict = lgb_train_validate_on_cv(
    logger,
    run_id=1234,
    train_X=train_X,
    train_Y=train_Y,
    test_X=test_X,
    kf=sk,
    features=predictors,
    params=lgb_params,
    n_estimators=N_ESTIMATORS,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    cat_features=cat_fetaures,
    is_test=False,
    verbose_eval=100,
)

[INFO]2021-03-23 10:06:48,232:main:List of predictors ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10']
[INFO]2021-03-23 10:06:48,269:main:fold 1 of 10


New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.904474	valid_1's auc: 0.892688
[200]	training's auc: 0.914308	valid_1's auc: 0.893376
[300]	training's auc: 0.920909	valid_1's auc: 0.89343
Early stopping, best iteration is:
[242]	training's auc: 0.917376	valid_1's auc: 0.893562
[INFO]2021-03-23 10:06:57,295:main:CV OOF Score for fold 1 is 0.8935621455069291
[INFO]2021-03-23 10:06:57,333:main:fold 2 of 10


New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.904693	valid_1's auc: 0.892532
[200]	training's auc: 0.914114	valid_1's auc: 0.893137
[300]	training's auc: 0.921219	valid_1's auc: 0.893238
Early stopping, best iteration is:
[288]	training's auc: 0.92041	valid_1's auc: 0.893268
[INFO]2021-03-23 10:07:07,238:main:CV OOF Score for fold 2 is 0.8932684415487866
[INFO]2021-03-23 10:07:07,284:main:fold 3 of 10


New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.90439	valid_1's auc: 0.893218
[200]	training's auc: 0.914146	valid_1's auc: 0.894507
[300]	training's auc: 0.921166	valid_1's auc: 0.894784
Early stopping, best iteration is:
[267]	training's auc: 0.918997	valid_1's auc: 0.894837
[INFO]2021-03-23 10:07:16,308:main:CV OOF Score for fold 3 is 0.8948369264823768
[INFO]2021-03-23 10:07:16,348:main:fold 4 of 10


New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.904569	valid_1's auc: 0.891442
[200]	training's auc: 0.913978	valid_1's auc: 0.892328
[300]	training's auc: 0.921102	valid_1's auc: 0.892389
Early stopping, best iteration is:
[228]	training's auc: 0.91608	valid_1's auc: 0.892522
[INFO]2021-03-23 10:07:24,858:main:CV OOF Score for fold 4 is 0.8925215961343859
[INFO]2021-03-23 10:07:24,897:main:fold 5 of 10


New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.904619	valid_1's auc: 0.892951
[200]	training's auc: 0.913791	valid_1's auc: 0.893959
[300]	training's auc: 0.920615	valid_1's auc: 0.893815
Early stopping, best iteration is:
[230]	training's auc: 0.915981	valid_1's auc: 0.894065
[INFO]2021-03-23 10:07:33,241:main:CV OOF Score for fold 5 is 0.8940646361215158
[INFO]2021-03-23 10:07:33,282:main:fold 6 of 10


New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.904383	valid_1's auc: 0.891602
[200]	training's auc: 0.913907	valid_1's auc: 0.892497
[300]	training's auc: 0.920975	valid_1's auc: 0.892624
Early stopping, best iteration is:
[274]	training's auc: 0.91915	valid_1's auc: 0.892695
[INFO]2021-03-23 10:07:42,756:main:CV OOF Score for fold 6 is 0.8926953681706283
[INFO]2021-03-23 10:07:42,794:main:fold 7 of 10


New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.904779	valid_1's auc: 0.887345
[200]	training's auc: 0.914073	valid_1's auc: 0.888388
[300]	training's auc: 0.920976	valid_1's auc: 0.888543
Early stopping, best iteration is:
[287]	training's auc: 0.920124	valid_1's auc: 0.888592
[INFO]2021-03-23 10:07:52,628:main:CV OOF Score for fold 7 is 0.8885918213105781
[INFO]2021-03-23 10:07:52,671:main:fold 8 of 10


New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.904475	valid_1's auc: 0.890753
[200]	training's auc: 0.913768	valid_1's auc: 0.891295
[300]	training's auc: 0.921011	valid_1's auc: 0.891071
Early stopping, best iteration is:
[215]	training's auc: 0.915027	valid_1's auc: 0.891323
[INFO]2021-03-23 10:08:00,540:main:CV OOF Score for fold 8 is 0.8913228304385516
[INFO]2021-03-23 10:08:00,579:main:fold 9 of 10


New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.904635	valid_1's auc: 0.893019
[200]	training's auc: 0.91435	valid_1's auc: 0.893963
Early stopping, best iteration is:
[195]	training's auc: 0.913939	valid_1's auc: 0.894012
[INFO]2021-03-23 10:08:07,810:main:CV OOF Score for fold 9 is 0.8940121598426086
[INFO]2021-03-23 10:08:07,852:main:fold 10 of 10


New categorical_feature is ['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.904875	valid_1's auc: 0.88927
[200]	training's auc: 0.914355	valid_1's auc: 0.890166
Early stopping, best iteration is:
[192]	training's auc: 0.913711	valid_1's auc: 0.890207
[INFO]2021-03-23 10:08:15,067:main:CV OOF Score for fold 10 is 0.8902072121789532
[INFO]2021-03-23 10:08:15,213:main:Combined OOF score : 0.8925
[INFO]2021-03-23 10:08:15,214:main:Average of 10 folds OOF score 0.89251
[INFO]2021-03-23 10:08:15,214:main:std of 10 folds OOF score 0.00184
[INFO]2021-03-23 10:08:15,285:main:Training/Prediction completed!


In [12]:
# Sanity check: number of OOF predictions and number of test predictions.
len(results_dict['y_oof']), len(results_dict['prediction'])

(300000, 200000)

In [13]:
# Keep the label-encoded model's outputs; `results_dict` is reassigned by the
# next model run.
lgb_le_oof = results_dict['y_oof']
lgb_le_test_pred = results_dict['prediction']

## LightGBM with OHE

In [14]:
# NOTE: when the cells are run in order, train_df / test_df already hold the
# label-encoded columns from the LabelEncoding cell, so the dummy columns
# created here are named after integer codes (e.g. cat0_0, cat0_1).
combined_df = pd.concat([train_df.drop('target', axis=1), test_df])
target = train_df[TARGET]

cat_fetaures = [name for name in train_df.columns if "cat" in name]

logger.info("OH Encoding the categorcal features")

# Encode on the combined frame so train and test end up with the same columns.
combined_df = pd.get_dummies(data=combined_df, columns=cat_fetaures)

# Split the combined frame back into train and test by index label.
# NOTE(review): relies on train and test index labels not overlapping — confirm.
train_df = combined_df.loc[train_df.index]
train_df[TARGET] = target

test_df = combined_df.loc[test_df.index]

# Inputs for the second level 1 model.
train_X = train_df.drop([TARGET], axis=1)
train_Y = train_df[TARGET]
test_X = test_df

[INFO]2021-03-23 10:08:23,925:main:OH Encoding the categorcal features


In [15]:
# Hyper-parameters for the one-hot-encoded LightGBM model; the values are the
# same as for the label-encoded run.
# MODEL_TYPE and VERBOSE are not referenced anywhere in the visible code.
MODEL_TYPE = "lgb"
OBJECTIVE = "binary"
BOOSTING_TYPE = "gbdt"
METRIC = "auc"
VERBOSE = 100
N_THREADS = -1
NUM_LEAVES = 31
MAX_DEPTH = -1
N_ESTIMATORS = 10000
LEARNING_RATE = 0.1
EARLY_STOPPING_ROUNDS = 100


lgb_params = {
                'objective': OBJECTIVE,
                'boosting_type': BOOSTING_TYPE,
                'learning_rate': LEARNING_RATE,
                'num_leaves': NUM_LEAVES,
                'tree_learner': 'serial',
                'n_jobs': N_THREADS,
                'seed': SEED,
                'max_depth': MAX_DEPTH,
                'max_bin': 255,
                'metric': METRIC,
                'verbose': -1,
                }

predictors = list(train_X.columns)
logger.info(f"List of predictors {predictors}")

# Same splitter settings as the label-encoded run.
sk = StratifiedKFold(n_splits=10, shuffle=False)

# The dummy columns are not declared as categorical: an empty string is passed
# and the logged output shows LightGBM reporting an empty categorical list.
# NOTE(review): an empty list may be the more conventional value — confirm
# against the LightGBM version in use.
results_dict = lgb_train_validate_on_cv(
    logger,
    run_id=1234,
    train_X=train_X,
    train_Y=train_Y,
    test_X=test_X,
    kf=sk,
    features=predictors,
    params=lgb_params,
    n_estimators=N_ESTIMATORS,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    cat_features="",
    is_test=False,
    verbose_eval=100,
)

[INFO]2021-03-23 10:08:27,383:main:List of predictors ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cat0_0', 'cat0_1', 'cat1_0', 'cat1_1', 'cat1_2', 'cat1_3', 'cat1_4', 'cat1_5', 'cat1_6', 'cat1_7', 'cat1_8', 'cat1_9', 'cat1_10', 'cat1_11', 'cat1_12', 'cat1_13', 'cat1_14', 'cat2_0', 'cat2_1', 'cat2_2', 'cat2_3', 'cat2_4', 'cat2_5', 'cat2_6', 'cat2_7', 'cat2_8', 'cat2_9', 'cat2_10', 'cat2_11', 'cat2_12', 'cat2_13', 'cat2_14', 'cat2_15', 'cat2_16', 'cat2_17', 'cat2_18', 'cat3_0', 'cat3_1', 'cat3_2', 'cat3_3', 'cat3_4', 'cat3_5', 'cat3_6', 'cat3_7', 'cat3_8', 'cat3_9', 'cat3_10', 'cat3_11', 'cat3_12', 'cat4_0', 'cat4_1', 'cat4_2', 'cat4_3', 'cat4_4', 'cat4_5', 'cat4_6', 'cat4_7', 'cat4_8', 'cat4_9', 'cat4_10', 'cat4_11', 'cat4_12', 'cat4_13', 'cat4_14', 'cat4_15', 'cat4_16', 'cat4_17', 'cat4_18', 'cat4_19', 'cat5_0', 'cat5_1', 'cat5_2', 'cat5_3', 'cat5_4', 'cat5_5', 'cat5_6', 'cat5_7', 'cat5_8', 'cat5_9', 'cat5_10', 'cat5_11', 'cat5_

New categorical_feature is []


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.895103	valid_1's auc: 0.891617
[200]	training's auc: 0.901922	valid_1's auc: 0.892701
[300]	training's auc: 0.907385	valid_1's auc: 0.892991
[400]	training's auc: 0.912249	valid_1's auc: 0.89328
[500]	training's auc: 0.916989	valid_1's auc: 0.893652
Early stopping, best iteration is:
[498]	training's auc: 0.916914	valid_1's auc: 0.893667
[INFO]2021-03-23 10:08:43,484:main:CV OOF Score for fold 1 is 0.8936672007803833
[INFO]2021-03-23 10:08:43,536:main:fold 2 of 10


New categorical_feature is []


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.89522	valid_1's auc: 0.890598
[200]	training's auc: 0.90215	valid_1's auc: 0.8916
[300]	training's auc: 0.907581	valid_1's auc: 0.892256
[400]	training's auc: 0.912493	valid_1's auc: 0.892464
[500]	training's auc: 0.91719	valid_1's auc: 0.892736
Early stopping, best iteration is:
[498]	training's auc: 0.917108	valid_1's auc: 0.892753
[INFO]2021-03-23 10:08:59,132:main:CV OOF Score for fold 2 is 0.8927527690938045
[INFO]2021-03-23 10:08:59,182:main:fold 3 of 10


New categorical_feature is []


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.895247	valid_1's auc: 0.891921
[200]	training's auc: 0.902314	valid_1's auc: 0.893347
[300]	training's auc: 0.907788	valid_1's auc: 0.893929
[400]	training's auc: 0.912868	valid_1's auc: 0.894029
[500]	training's auc: 0.917223	valid_1's auc: 0.894257
[600]	training's auc: 0.921457	valid_1's auc: 0.894451
[700]	training's auc: 0.925696	valid_1's auc: 0.894547
Early stopping, best iteration is:
[649]	training's auc: 0.923647	valid_1's auc: 0.894566
[INFO]2021-03-23 10:09:19,174:main:CV OOF Score for fold 3 is 0.894565877029156
[INFO]2021-03-23 10:09:19,217:main:fold 4 of 10


New categorical_feature is []


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.895473	valid_1's auc: 0.889164
[200]	training's auc: 0.902254	valid_1's auc: 0.890321
[300]	training's auc: 0.907803	valid_1's auc: 0.890645
[400]	training's auc: 0.91256	valid_1's auc: 0.890799
Early stopping, best iteration is:
[336]	training's auc: 0.909675	valid_1's auc: 0.890804
[INFO]2021-03-23 10:09:35,836:main:CV OOF Score for fold 4 is 0.8908038311381365
[INFO]2021-03-23 10:09:35,885:main:fold 5 of 10


New categorical_feature is []


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.895064	valid_1's auc: 0.892148
[200]	training's auc: 0.901964	valid_1's auc: 0.893124
[300]	training's auc: 0.907514	valid_1's auc: 0.893669
[400]	training's auc: 0.912435	valid_1's auc: 0.894044
[500]	training's auc: 0.916829	valid_1's auc: 0.894238
[600]	training's auc: 0.920816	valid_1's auc: 0.894255
Early stopping, best iteration is:
[531]	training's auc: 0.918074	valid_1's auc: 0.894292
[INFO]2021-03-23 10:10:03,085:main:CV OOF Score for fold 5 is 0.8942923338684665
[INFO]2021-03-23 10:10:03,128:main:fold 6 of 10


New categorical_feature is []


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.895426	valid_1's auc: 0.889921
[200]	training's auc: 0.902541	valid_1's auc: 0.891332
[300]	training's auc: 0.908193	valid_1's auc: 0.891591
[400]	training's auc: 0.91309	valid_1's auc: 0.891445
Early stopping, best iteration is:
[324]	training's auc: 0.909416	valid_1's auc: 0.891673
[INFO]2021-03-23 10:10:18,875:main:CV OOF Score for fold 6 is 0.8916733247324583
[INFO]2021-03-23 10:10:18,913:main:fold 7 of 10


New categorical_feature is []


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.895603	valid_1's auc: 0.88672
[200]	training's auc: 0.902543	valid_1's auc: 0.888312
[300]	training's auc: 0.90806	valid_1's auc: 0.889061
[400]	training's auc: 0.913077	valid_1's auc: 0.889152
Early stopping, best iteration is:
[332]	training's auc: 0.909699	valid_1's auc: 0.88929
[INFO]2021-03-23 10:10:30,797:main:CV OOF Score for fold 7 is 0.8892899281540623
[INFO]2021-03-23 10:10:30,837:main:fold 8 of 10


New categorical_feature is []


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.895473	valid_1's auc: 0.889304
[200]	training's auc: 0.902475	valid_1's auc: 0.890489
[300]	training's auc: 0.907959	valid_1's auc: 0.89076
[400]	training's auc: 0.912511	valid_1's auc: 0.890865
[500]	training's auc: 0.91713	valid_1's auc: 0.89098
[600]	training's auc: 0.921327	valid_1's auc: 0.891229
[700]	training's auc: 0.925187	valid_1's auc: 0.891208
Early stopping, best iteration is:
[610]	training's auc: 0.921865	valid_1's auc: 0.891339
[INFO]2021-03-23 10:10:49,163:main:CV OOF Score for fold 8 is 0.8913391108674036
[INFO]2021-03-23 10:10:49,207:main:fold 9 of 10


New categorical_feature is []


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.895051	valid_1's auc: 0.892356
[200]	training's auc: 0.902144	valid_1's auc: 0.893382
[300]	training's auc: 0.907822	valid_1's auc: 0.893868
[400]	training's auc: 0.91281	valid_1's auc: 0.894109
[500]	training's auc: 0.917491	valid_1's auc: 0.894277
[600]	training's auc: 0.921441	valid_1's auc: 0.894414
[700]	training's auc: 0.925612	valid_1's auc: 0.894558
[800]	training's auc: 0.929224	valid_1's auc: 0.894464
Early stopping, best iteration is:
[742]	training's auc: 0.927154	valid_1's auc: 0.894593
[INFO]2021-03-23 10:11:10,361:main:CV OOF Score for fold 9 is 0.8945932393342191
[INFO]2021-03-23 10:11:10,402:main:fold 10 of 10


New categorical_feature is []


Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.895665	valid_1's auc: 0.887131
[200]	training's auc: 0.902573	valid_1's auc: 0.888066
[300]	training's auc: 0.907868	valid_1's auc: 0.888574
[400]	training's auc: 0.912799	valid_1's auc: 0.888646
[500]	training's auc: 0.917276	valid_1's auc: 0.888833
[600]	training's auc: 0.921515	valid_1's auc: 0.888991
Early stopping, best iteration is:
[543]	training's auc: 0.919362	valid_1's auc: 0.889129
[INFO]2021-03-23 10:11:27,160:main:CV OOF Score for fold 10 is 0.8891286193057756
[INFO]2021-03-23 10:11:27,306:main:Combined OOF score : 0.89221
[INFO]2021-03-23 10:11:27,307:main:Average of 10 folds OOF score 0.89221
[INFO]2021-03-23 10:11:27,307:main:std of 10 folds OOF score 0.00198
[INFO]2021-03-23 10:11:27,351:main:Training/Prediction completed!


In [17]:
# Keep the one-hot model's OOF and averaged test predictions for level 2.
lgb_ohe_oof = results_dict['y_oof']
lgb_ohe_test_pred = results_dict['prediction']

# Sanity check: both vectors should match the train / test row counts.
len(lgb_ohe_oof), len(lgb_ohe_test_pred)

(300000, 200000)

# Create the dataset for Level 2

In [18]:
# Level 2 training features: one column of OOF predictions per level 1 model.
l1_train_df = pd.DataFrame(data={
    "lgb_le": lgb_le_oof,
    "lgb_ohe_oof": lgb_ohe_oof
}, index=train_df.index)


# Level 2 test features: the fold-averaged test predictions of each model.
# The column names are identical to l1_train_df, so "lgb_ohe_oof" holds test
# predictions here despite its name.
l1_test_df = pd.DataFrame(data={
    "lgb_le": lgb_le_test_pred,
    "lgb_ohe_oof": lgb_ohe_test_pred
}, index=test_df.index)

In [19]:
# Preview the level 2 training features.
l1_train_df.head()

Unnamed: 0_level_0,lgb_le,lgb_ohe_oof
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.135424,0.079853
1,0.255094,0.427562
2,0.04378,0.052041
3,0.064838,0.061243
4,0.18704,0.2439


In [20]:
# Preview the level 2 test features.
l1_test_df.head()

Unnamed: 0_level_0,lgb_le,lgb_ohe_oof
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5,0.128911,0.160895
6,0.326869,0.444791
8,0.01906,0.023641
9,0.216979,0.201414
11,0.109366,0.118352


In [21]:
# Rebind the generic input names to the level 2 data; the label is still the
# original target column.
train_X = l1_train_df
train_Y = train_df[TARGET]
test_X = l1_test_df

In [22]:
def CCCV_train_validate_on_cv(
    logger,
    run_id,
    train_X,
    train_Y,
    test_X,
    kf,
    is_test=False,
):
    """Fit a calibrated RidgeClassifier per CV fold and collect predictions.

    For every split produced by ``kf`` a
    ``CalibratedClassifierCV(RidgeClassifier, cv=5)`` is fitted on the
    training part. The validation rows are predicted to build the
    out-of-fold (OOF) vector. If ``test_X`` is given, each fold's model also
    predicts it and the fold predictions are averaged.

    Args:
        logger: Logger used for progress and score messages.
        run_id: Forwarded to ``__evaluate_and_log``.
        train_X: Training features (DataFrame, indexed positionally).
        train_Y: Training labels (Series, indexed positionally).
        test_X: Test features, or ``None`` to skip test prediction.
        kf: Splitter exposing ``get_n_splits()`` and ``split(X=..., y=...)``.
        is_test: Unused; kept for interface compatibility.

    Returns:
        dict with the keys filled in by ``__evaluate_and_log`` (``y_oof``,
        ``prediction``, ``oof_score``, ``cv_scores``, ``avg_cv_scores``,
        ``std_cv_scores``). ``prediction`` is an empty array when ``test_X``
        is ``None``.
    """
    has_test = test_X is not None
    y_oof = np.zeros(len(train_X))
    # With no test set, keep an empty array so the averaging step downstream
    # still works instead of crashing on len(None).
    y_predicted = np.zeros(len(test_X)) if has_test else np.zeros(0)
    # Extract the test matrix once instead of once per fold.
    test_values = test_X.values if has_test else None

    cv_scores = []
    result_dict = {}

    n_folds = kf.get_n_splits()
    splits = kf.split(X=train_X, y=train_Y)
    for fold, (train_index, validation_index) in enumerate(splits, start=1):
        logger.info(f"fold {fold} of {n_folds}")

        X_train, X_validation, y_train, y_validation = __get_X_Y_from_CV(
            train_X, train_Y, train_index, validation_index
        )

        # RidgeClassifier has no predict_proba; the calibration wrapper adds it.
        clf = CalibratedClassifierCV(RidgeClassifier(random_state=SEED), cv=5)
        clf.fit(X_train, y_train)

        # The last predict_proba column is the probability of the last class.
        y_oof[validation_index] = clf.predict_proba(X_validation)[:, -1]
        if has_test:
            # Accumulate; the sum is divided by n_folds in __evaluate_and_log.
            y_predicted += clf.predict_proba(test_values)[:, -1]

        cv_oof_score = __calculate_perf_metric(y_validation, y_oof[validation_index])
        cv_scores.append(cv_oof_score)
        logger.info(f"CV OOF Score for fold {fold} is {cv_oof_score}")

        del X_validation, y_validation
        gc.collect()

    result_dict = __evaluate_and_log(
        logger,
        run_id,
        train_Y,
        y_oof,
        y_predicted,
        n_folds,
        result_dict,
        cv_scores,
        best_iterations=0
    )

    logger.info("Training/Prediction completed!")
    return result_dict

In [23]:
predictors = list(train_X.columns)
logger.info(f"List of predictors {predictors}")

# Same splitter settings as the level 1 runs.
sk = StratifiedKFold(n_splits=10, shuffle=False)

# Level 2 model on the stacked level 1 predictions.
# `is_test` is accepted by CCCV_train_validate_on_cv but never read by it.
results_dict = CCCV_train_validate_on_cv(
    logger,
    run_id=1234,
    train_X=train_X,
    train_Y=train_Y,
    test_X=test_X,
    kf=sk,
    is_test=True,
)

[INFO]2021-03-23 10:12:13,991:main:List of predictors ['lgb_le', 'lgb_ohe_oof']
[INFO]2021-03-23 10:12:14,033:main:fold 1 of 10
[INFO]2021-03-23 10:12:14,755:main:CV OOF Score for fold 1 is 0.8953651816031907
[INFO]2021-03-23 10:12:14,818:main:fold 2 of 10
[INFO]2021-03-23 10:12:15,498:main:CV OOF Score for fold 2 is 0.8947129487055673
[INFO]2021-03-23 10:12:15,558:main:fold 3 of 10
[INFO]2021-03-23 10:12:16,637:main:CV OOF Score for fold 3 is 0.8964197117155474
[INFO]2021-03-23 10:12:16,695:main:fold 4 of 10
[INFO]2021-03-23 10:12:17,396:main:CV OOF Score for fold 4 is 0.8933180988540336
[INFO]2021-03-23 10:12:17,449:main:fold 5 of 10
[INFO]2021-03-23 10:12:18,290:main:CV OOF Score for fold 5 is 0.8958314649548733
[INFO]2021-03-23 10:12:18,343:main:fold 6 of 10
[INFO]2021-03-23 10:12:19,204:main:CV OOF Score for fold 6 is 0.8938522829498133
[INFO]2021-03-23 10:12:19,253:main:fold 7 of 10
[INFO]2021-03-23 10:12:19,951:main:CV OOF Score for fold 7 is 0.8908003673385176
[INFO]2021-03-23 

In [24]:
# Final stacked prediction for the test set (mean over the level 2 folds).
results_dict['prediction']

array([0.10928265, 0.33402923, 0.05623773, ..., 0.66344367, 0.0817689 ,
       0.53300568])