In [188]:
import time

from contextlib import contextmanager
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
import lightgbm

@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Args:
        title: Label placed at the start of the printed message.

    Yields:
        None. The body of the ``with`` statement runs at the yield point.

    The message is printed even when the body raises, so a failing step still
    reports its elapsed time; the exception then propagates unchanged.
    """
    t0 = time.time()
    try:
        yield
    finally:
        # Runs on both normal exit and exception so the timing is never lost.
        print("{} - done in {:.0f}s".format(title, time.time() - t0))

def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df``.

    Args:
        df: Input DataFrame. It is not modified; an encoded copy is returned.
        nan_as_category: Forwarded to ``pd.get_dummies`` as ``dummy_na``; when
            True an extra indicator column is emitted for missing values.

    Returns:
        A ``(encoded_df, new_columns)`` tuple where ``new_columns`` lists, in
        output order, the columns that were not present in the input.
    """
    columns_before = set(df.columns)

    # Only columns whose dtype compares equal to 'object' are encoded.
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]

    return encoded, added_columns

In [113]:
# Load the processed feature table; the file is a zip-compressed CSV despite its .csv suffix.
df = pd.read_csv(
    '../data/processed_data_3.8.csv',
    compression='zip',
)

In [133]:
# Keep only labelled rows (TARGET present) and just the id/label pair,
# which is all the stacking frames need from the full feature table.
train = df.loc[df['TARGET'].notnull(), ['SK_ID_CURR', 'TARGET']]

In [134]:
# Row/column count of the labelled subset (SK_ID_CURR and TARGET only).
train.shape

(307511, 2)

In [135]:
# Base-model predictions that feed the stacker:
#   - LightGBM scores keyed by SK_ID_CURR
#   - ridge / DNN scores for the hold-out split
#   - ridge / DNN scores for the validation split
lgbm_val_df = pd.read_csv(
    '../data/lightgbm-fast-val.csv'
)
four_model_hold_df = pd.read_csv(
    '../predictions/hold_blend_nn.csv'
)
four_model_val_df = pd.read_csv(
    '../predictions/va_blend_nn.csv'
)

In [136]:
# The LightGBM file stores its score under 'TARGET'; rename it so it cannot
# collide with the true label column when the frames are merged.
lgbm_val_df = lgbm_val_df.rename({'TARGET': 'lgbm'}, axis='columns')

In [137]:
# Preview the LightGBM predictions after the score column was renamed to 'lgbm'.
lgbm_val_df.head()

Unnamed: 0,SK_ID_CURR,lgbm
0,100002,0.293967
1,100003,0.014617
2,100004,0.027693
3,100006,0.026419
4,100007,0.089296


In [138]:
# Preview the hold-out base-model predictions loaded from hold_blend_nn.csv.
four_model_hold_df.head()

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,ridge,f10_dnn
0,0,384575,0.223925,0.269615
1,1,214010,0.005147,0.031345
2,2,142232,0.122121,0.143304
3,3,389171,0.087019,0.060104
4,4,283617,0.136457,0.084204


In [139]:
# Preview the validation base-model predictions loaded from va_blend_nn.csv.
four_model_val_df.head()

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,ridge,f10_dnn
0,0,252724,0.02163,0.040466
1,1,372834,0.0,0.0
2,2,326336,0.078165,0.086967
3,3,382390,0.111067,0.127865
4,4,330511,0.104096,0.084409


In [142]:
# Start the stacking training frame: validation-split predictions plus the
# LightGBM score for the same applicant (left join keeps every validation row).
train_df = four_model_val_df.merge(lgbm_val_df, how='left', on='SK_ID_CURR')

In [143]:
# Preview train_df with the 'lgbm' score column attached.
train_df.head()

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,ridge,f10_dnn,lgbm
0,0,252724,0.02163,0.040466,0.02347
1,1,372834,0.0,0.0,0.103268
2,2,326336,0.078165,0.086967,0.050801
3,3,382390,0.111067,0.127865,0.029729
4,4,330511,0.104096,0.084409,0.086421


In [144]:
# Shape check after the left merge on SK_ID_CURR; the row count only grows
# if a key repeats on the right-hand side.
train_df.shape

(276759, 5)

In [145]:
# Attach the true label to each stacked row via SK_ID_CURR.
train_df = train_df.merge(train, how='left', on='SK_ID_CURR')

In [146]:
# Re-check the shape after attaching TARGET; a changed row count would point
# to duplicate SK_ID_CURR keys in `train`.
train_df.shape

(276759, 6)

In [123]:
# Preview train_df with the TARGET label attached.
train_df.head()

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,ridge,f10_dnn,lgbm,TARGET
0,0.0,252724,0.02163,0.040466,0.02347,0.0
1,1.0,372834,0.0,0.0,0.103268,0.0
2,2.0,326336,0.078165,0.086967,0.050801,1.0
3,3.0,382390,0.111067,0.127865,0.029729,0.0
4,4.0,330511,0.104096,0.084409,0.086421,0.0


In [124]:
# 'Unnamed: 0' is the row index that was saved into the predictions CSV; it is not a feature.
train_df = train_df.drop(columns=['Unnamed: 0'])

In [147]:
# Build the hold-out stacking frame the same way as the training frame:
# base-model predictions -> LightGBM score -> true label, all joined on SK_ID_CURR.
# NOTE(review): the LightGBM scores come from the "val" file; presumably it also
# covers the hold-out ids - confirm there are no missing 'lgbm' values.
test_df = (
    four_model_hold_df
    .merge(lgbm_val_df, how='left', on='SK_ID_CURR')
    .merge(train, how='left', on='SK_ID_CURR')
)

In [148]:
# Shape of the hold-out stacking frame after both merges.
test_df.shape

(30752, 6)

In [149]:
# Preview the hold-out stacking frame (predictions plus TARGET).
test_df.head()

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,ridge,f10_dnn,lgbm,TARGET
0,0,384575,0.223925,0.269615,0.167486,0.0
1,1,214010,0.005147,0.031345,0.035745,0.0
2,2,142232,0.122121,0.143304,0.083213,0.0
3,3,389171,0.087019,0.060104,0.040297,0.0
4,4,283617,0.136457,0.084204,0.079696,0.0


In [128]:
# Preview train_df to confirm which columns remain after dropping 'Unnamed: 0'.
train_df.head()

Unnamed: 0,SK_ID_CURR,ridge,f10_dnn,lgbm,TARGET
0,252724,0.02163,0.040466,0.02347,0.0
1,372834,0.0,0.0,0.103268,0.0
2,326336,0.078165,0.086967,0.050801,1.0
3,382390,0.111067,0.127865,0.029729,0.0
4,330511,0.104096,0.084409,0.086421,0.0


In [158]:
# Remove the leftover saved-index column if it is still present.
# errors='ignore' makes this cell idempotent: the same column is already dropped
# by an earlier cell, so an unconditional drop raises KeyError when the notebook
# is run top to bottom (or when this cell is re-run).
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [159]:
# Persist the stacking training frame.
# index=False keeps the row index out of the file; otherwise it is read back as an
# 'Unnamed: 0' column, which the stacking cells' feature filter (everything except
# TARGET / id columns) would silently treat as a model feature.
train_df.to_csv('../data/stack_train_val.csv', index=False)

In [151]:
# 'Unnamed: 0' is the row index that was saved into the hold-out predictions CSV; it is not a feature.
test_df = test_df.drop(columns=['Unnamed: 0'])

In [152]:
# Persist the hold-out stacking frame.
# index=False keeps the row index out of the file; otherwise it is read back as an
# 'Unnamed: 0' column, which the stacking cells' feature filter (everything except
# TARGET / id columns) would silently treat as a model feature.
test_df.to_csv('../data/stack_hold.csv', index=False)

In [153]:
# Preview test_df after the 'Unnamed: 0' column was dropped.
test_df.head()

Unnamed: 0,SK_ID_CURR,ridge,f10_dnn,lgbm,TARGET
0,384575,0.223925,0.269615,0.167486,0.0
1,214010,0.005147,0.031345,0.035745,0.0
2,142232,0.122121,0.143304,0.083213,0.0
3,389171,0.087019,0.060104,0.040297,0.0
4,283617,0.136457,0.084204,0.079696,0.0


In [157]:
# Last look at train_df before it is passed to the stacking models below.
train_df.head()

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,ridge,f10_dnn,lgbm,TARGET
0,0,252724,0.02163,0.040466,0.02347,0.0
1,1,372834,0.0,0.0,0.103268,0.0
2,2,326336,0.078165,0.086967,0.050801,1.0
3,3,382390,0.111067,0.127865,0.029729,0.0
4,4,330511,0.104096,0.084409,0.086421,0.0


In [187]:
# Level-2 (stacking) LightGBM: 5-fold CV on the base-model predictions in train_df,
# scored out-of-fold on train_df and fold-averaged on the hold-out frame test_df.
print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

# Fixed seed so the fold assignment is reproducible between runs.
folds = KFold(n_splits= 5, shuffle=True, random_state=1001)
# To resume from the persisted frames instead of the in-memory ones:
# train_df = pd.read_csv('../data/stack_train_val.csv')
# test_df = pd.read_csv('../data/stack_hold.csv')

# oof_preds: one out-of-fold prediction per train_df row.
# sub_preds: hold-out prediction, accumulated as the mean over the fold models.
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
# Every column that is not the label or an id/index column is used as a feature.
feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

# Parameters are identical for every fold, so they are built once outside the loop.
# NOTE(review): the original comment says these came from Bayesian optimization, but
# most tuned values are disabled below - confirm which settings are intended.
params = {
    'objective': 'binary',
    'boosting_type': 'goss', # 'gbdt'
    'nthread': 4,
    'learning_rate': 0.08,  # 02,
    'num_leaves': 3,
#     'colsample_bytree': 0.9497036,
#     'subsample': 0.8715623,
#     'subsample_freq': 1,
#     'max_depth': 8,
#     'reg_alpha': 0.041545473,
#     'reg_lambda': 0.0735294,
    'min_split_gain': 0.05,
#     'min_child_weight': 60, #39.3259775
    'seed': 0,
    'verbose': -1,
    'metric': 'auc',
}

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
    # free_raw_data=False keeps .data / .label accessible after training,
    # which the prediction and scoring lines below rely on.
    dtrain = lightgbm.Dataset(data=train_df[feats].iloc[train_idx],
                            label=train_df['TARGET'].iloc[train_idx],
                            free_raw_data=False, silent=True)
    dvalid = lightgbm.Dataset(data=train_df[feats].iloc[valid_idx],
                            label=train_df['TARGET'].iloc[valid_idx],
                            free_raw_data=False, silent=True)

    # NOTE(review): `silent`, `early_stopping_rounds` and `verbose_eval` belong to the
    # older LightGBM API this notebook was run with; newer releases use callbacks instead.
    clf = lightgbm.train(
        params=params,  # fixed: a stray `c` after this comma made the cell a SyntaxError
        train_set=dtrain,
        num_boost_round=10000,
        valid_sets=[dtrain, dvalid],
        early_stopping_rounds = 100,
        verbose_eval=50
    )

    # Score the held-out fold and add this fold's share of the hold-out average.
    oof_preds[valid_idx] = clf.predict(dvalid.data)
    sub_preds += clf.predict(test_df[feats]) / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(dvalid.label, oof_preds[valid_idx])))

print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
print('Holdout score %.6f' % roc_auc_score(test_df['TARGET'], sub_preds))

Starting LightGBM. Train shape: (276759, 5), test shape: (30752, 5)
Training until validation scores don't improve for 100 rounds.
[50]	training's auc: 0.794026	valid_1's auc: 0.793469
[100]	training's auc: 0.794273	valid_1's auc: 0.793466
[150]	training's auc: 0.794505	valid_1's auc: 0.79336
Early stopping, best iteration is:
[67]	training's auc: 0.794103	valid_1's auc: 0.79366
Fold  1 AUC : 0.793660
Training until validation scores don't improve for 100 rounds.
[50]	training's auc: 0.794521	valid_1's auc: 0.791534
[100]	training's auc: 0.794795	valid_1's auc: 0.791717
[150]	training's auc: 0.794921	valid_1's auc: 0.791651
[200]	training's auc: 0.795012	valid_1's auc: 0.791758
[250]	training's auc: 0.795064	valid_1's auc: 0.791672
Early stopping, best iteration is:
[190]	training's auc: 0.794974	valid_1's auc: 0.791809
Fold  2 AUC : 0.791809
Training until validation scores don't improve for 100 rounds.
[50]	training's auc: 0.793418	valid_1's auc: 0.795924
[100]	training's auc: 0.7936

In [191]:
# Preview the hold-out frame whose TARGET column the stackers are scored against.
test_df.head()

Unnamed: 0,SK_ID_CURR,ridge,f10_dnn,lgbm,TARGET
0,384575,0.223925,0.269615,0.167486,0.0
1,214010,0.005147,0.031345,0.035745,0.0
2,142232,0.122121,0.143304,0.083213,0.0
3,389171,0.087019,0.060104,0.040297,0.0
4,283617,0.136457,0.084204,0.079696,0.0


In [199]:
# Level-2 (stacking) logistic regression: fit once on all of train_df and
# score on the hold-out frame. No cross-validation is performed here.
print("Starting logistic regression. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

# train_df = pd.read_csv('../data/stack_train_val.csv')
# test_df = pd.read_csv('../data/stack_hold.csv')

# Reset the shared result buffers; oof_preds stays all-zero because this cell
# produces no out-of-fold predictions.
oof_preds = np.zeros(len(train_df))
sub_preds = np.zeros(len(test_df))

# Features are every column except the label and id/index columns.
feats = [
    column
    for column in train_df.columns
    if column not in {'TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index'}
]

# Default-configured model; fit() returns the estimator itself.
log_clf = LogisticRegression().fit(train_df[feats], train_df['TARGET'])

# predict_proba yields one column per class, so sub_preds becomes 2-D here.
sub_preds = log_clf.predict_proba(test_df[feats])


# print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
# AUC is computed on column 1 of the probability matrix.
print('Holdout score %.6f' % roc_auc_score(test_df['TARGET'], sub_preds[:, 1]))

Starting logistic regression. Train shape: (276759, 5), test shape: (30752, 5)
Holdout score 0.792292


In [195]:
# After the logistic-regression cell, sub_preds holds the predict_proba output
# (one column per class), so it is 2-D; column 1 is what the hold-out AUC uses.
sub_preds.shape

(30752, 2)