In [36]:
import time

from contextlib import contextmanager
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
import lightgbm

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]

    return df, new_columns

In [3]:
df = pd.read_csv('../data/processed_data_3.8.csv', compression='zip')

In [4]:
train = df[df['TARGET'].notnull()]
train = train[['SK_ID_CURR', 'TARGET']]

In [5]:
lgbm_val_df = pd.read_csv('../data/lightgbm-fast-val.csv')
four_model_hold_df =  pd.read_csv('../predictions/hold_blend_nn.csv')
four_model_val_df = pd.read_csv('../predictions/va_blend_nn.csv')

In [6]:
lgbm_val_df = lgbm_val_df.rename(columns = {'TARGET':'lgbm'})

In [7]:
train_df = pd.merge(four_model_val_df, lgbm_val_df, on="SK_ID_CURR", how = 'left')

In [8]:
train_df = pd.merge(train_df, train, on="SK_ID_CURR", how = 'left')

In [9]:
train_df = train_df.drop('Unnamed: 0', axis = 1)

In [11]:
hold_df = pd.merge(four_model_hold_df, lgbm_val_df, on = 'SK_ID_CURR', how = 'left')
hold_df = pd.merge(hold_df, train, on="SK_ID_CURR", how = 'left')

In [13]:
hold_df = hold_df.drop('Unnamed: 0', axis = 1)

In [14]:
train_df = train_df.append(hold_df).reset_index()

KeyError: "labels ['Unnamed: 0'] not contained in axis"

In [22]:
train_df = train_df.drop('index', axis = 1)

In [25]:
train_df.to_csv('../data/stack_train.csv')

In [17]:
test = pd.read_csv('../predictions/lightgbm_pred_3.7.csv')
test = test.rename(columns = {'TARGET':'lgbm'})
test_df = pd.read_csv('../predictions/tst_blend_nn.csv')
test_df = pd.merge(test_df, test, on="SK_ID_CURR", how = 'left')
test_df = test_df.drop('Unnamed: 0', axis = 1)

In [28]:
test_df.to_csv('../data/stack_test.csv')

In [34]:
test_df.head()

Unnamed: 0,SK_ID_CURR,ridge,f10_dnn,lgbm
0,100001,0.078939,0.078008,0.027875
1,100005,0.126457,0.169328,0.131281
2,100013,0.037151,0.039333,0.015879
3,100028,0.000366,0.052468,0.047909
4,100038,0.156629,0.172443,0.143755


In [51]:
print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
sub_df = test_df[['SK_ID_CURR']].copy()
sub_df['TARGET'] = 0

folds = KFold(n_splits= 5, shuffle=True, random_state=1001)
# folds = StratifiedKFold(n_splits= 5, shuffle=True, random_state=1001)
# train_df = pd.read_csv('../data/stack_train.csv')
# test_df = pd.read_csv('../data/stack_test.csv')

# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
    dtrain = lightgbm.Dataset(data=train_df[feats].iloc[train_idx], 
                            label=train_df['TARGET'].iloc[train_idx], 
                            free_raw_data=False, silent=True)
    dvalid = lightgbm.Dataset(data=train_df[feats].iloc[valid_idx], 
                            label=train_df['TARGET'].iloc[valid_idx], 
                            free_raw_data=False, silent=True)

    # LightGBM parameters found by Bayesian optimization
    params = {
        'objective': 'binary',
        'boosting_type': 'goss', # 'gbdt'
        'nthread': 4,
        'learning_rate': 0.08,  # 02,
        'num_leaves': 3,
#         'colsample_bytree': 0.9497036,
#         'subsample': 0.8715623,
#         'subsample_freq': 1,
        'max_depth': 30,
#         'reg_alpha': 0.01,
#         'reg_lambda': 0.01,
        'min_split_gain': 0.05,
#         'min_child_weight': 60, #39.3259775
        'seed': 0,
        'verbose': -1,
        'metric': 'auc',
    }

    clf = lightgbm.train(
        params=params,
        train_set=dtrain,
        num_boost_round=10000,
        valid_sets=[dtrain, dvalid],
        early_stopping_rounds= 200,
        verbose_eval=100
    )

    oof_preds[valid_idx] = clf.predict(dvalid.data)
    sub_preds += clf.predict(test_df[feats]) / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(dvalid.label, oof_preds[valid_idx])))

print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))        
sub_df['TARGET'] = sub_preds
sub_df[['SK_ID_CURR', 'TARGET']].to_csv('../predictions/stack_preds.csv', index= False)

Starting LightGBM. Train shape: (307511, 5), test shape: (48744, 4)
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.793189	valid_1's auc: 0.797739
[200]	training's auc: 0.793386	valid_1's auc: 0.797764
[300]	training's auc: 0.793569	valid_1's auc: 0.797589
Early stopping, best iteration is:
[142]	training's auc: 0.79329	valid_1's auc: 0.797867
Fold  1 AUC : 0.797867
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.795729	valid_1's auc: 0.787142
[200]	training's auc: 0.795961	valid_1's auc: 0.787001
[300]	training's auc: 0.796152	valid_1's auc: 0.786965
Early stopping, best iteration is:
[112]	training's auc: 0.795777	valid_1's auc: 0.787215
Fold  2 AUC : 0.787215
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.793717	valid_1's auc: 0.795306
[200]	training's auc: 0.793954	valid_1's auc: 0.795355
Early stopping, best iteration is:
[59]	training's auc: 0.793511	valid_1's auc