In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Load the competition data from the shared input directory.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# Every training column except the row identifier and the label is a model input.
non_feature_cols = ('ID_code', 'target')
features = [col for col in train_df.columns if col not in non_feature_cols]  # basic features
target = train_df['target']

# LightGBM settings; the same dict is handed to lgb.train() for every CV fold.
param = {
    # Row / column subsampling
    'bagging_freq': 5,
    'bagging_fraction': 0.38,
    'boost_from_average': 'false',
    'boost': 'gbdt',
    'feature_fraction': 0.04,
    'learning_rate': 0.0085,
    # Tree shape and leaf constraints
    'max_depth': -1,
    'metric': 'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    # Execution and objective
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    # Regularisation and logging
    'reg_alpha': 0.1302650970728192,
    'reg_lambda': 0.3603427518866501,
    'verbosity': 1,
}
# Stratified 12-fold CV over the rows in file order (no shuffling).
# random_state is intentionally not passed: it has no effect when shuffle=False,
# and scikit-learn >= 0.24 raises ValueError for that combination. The resulting
# splits are the same as before.
folds = StratifiedKFold(n_splits=12, shuffle=False)

# Select the feature columns once instead of re-slicing the frames on every fold.
X_train = train_df[features]
X_test = test_df[features]

oof = np.zeros(len(train_df))          # out-of-fold prediction for each training row
predictions = np.zeros(len(test_df))   # test prediction, averaged over the folds
# NOTE(review): train_preds averages predictions on ALL training rows from every
# fold model, so each row is scored by 11 models that were trained on it. It is
# therefore optimistic; `oof` holds the leak-free predictions. Confirm which one
# downstream consumers actually need.
train_preds = np.zeros(len(train_df))

# split() only needs the number of rows from its first argument, so the feature
# frame is passed directly rather than materialising train_df.values (which would
# build a large object array because of the string ID column).
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, target.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(X_train.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(X_train.iloc[val_idx], label=target.iloc[val_idx])
    # Train for at most 1,000,000 rounds, logging every 5000 and stopping once the
    # validation score has not improved for 2000 rounds.
    # NOTE: the verbose_eval / early_stopping_rounds keywords were removed in
    # LightGBM 4.0 (replaced by the lgb.log_evaluation / lgb.early_stopping
    # callbacks); the call is kept as-is for the LightGBM version this notebook
    # was run with.
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 2000)
    # Score the held-out rows with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(X_train.iloc[val_idx], num_iteration=clf.best_iteration)
    # Each fold contributes an equal 1/n_splits share to the averaged predictions.
    predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    train_preds += clf.predict(X_train, num_iteration=clf.best_iteration) / folds.n_splits
# Overall CV quality: AUC of the out-of-fold predictions against the true labels.
cv_auc = roc_auc_score(target, oof)
print("CV score: {:<8.5f}".format(cv_auc))

# Fold-averaged test predictions, written to two locations with identical content.
sub = pd.DataFrame({"ID_code": test_df["ID_code"].values, "target": predictions})
for out_path in ("submission_lgbm_base1.csv", "../input/output_test_lgbm_base1.csv"):
    sub.to_csv(out_path, index=False)

# Fold-averaged predictions on the training rows.
# NOTE(review): unlike the two writes above, this one keeps the default index
# column in the CSV (no index=False) - confirm downstream readers expect that.
sub_lgb = pd.DataFrame({"ID_code": train_df["ID_code"].values, "output": train_preds})
sub_lgb.to_csv("../input/output_lgbm_base1.csv")


Fold 0
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.925151	valid_1's auc: 0.898164
[10000]	training's auc: 0.940839	valid_1's auc: 0.901261
Early stopping, best iteration is:
[10313]	training's auc: 0.941697	valid_1's auc: 0.901416
Fold 1
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.925388	valid_1's auc: 0.897582
[10000]	training's auc: 0.941018	valid_1's auc: 0.899296
[15000]	training's auc: 0.953868	valid_1's auc: 0.899497
Early stopping, best iteration is:
[13782]	training's auc: 0.950925	valid_1's auc: 0.899681
Fold 2
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.925878	valid_1's auc: 0.889935
[10000]	training's auc: 0.941418	valid_1's auc: 0.892076
Early stopping, best iteration is:
[11513]	training's auc: 0.945485	valid_1's auc: 0.892175
Fold 3
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.924659	valid_1's au