In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
features = [c for c in train_df.columns if c not in ['ID_code', 'target']] #basic features
target = train_df['target']

random_state = 42
np.random.seed(random_state)

params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : random_state,
    "verbosity" : 1,
    "seed": random_state
}
folds = StratifiedKFold(n_splits=12, shuffle=False, random_state=99999)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
train_preds = np.zeros(len(train_df))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
    clf = lgb.train(params, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 2000)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
    train_preds += clf.predict(train_df[features], num_iteration=clf.best_iteration) / folds.n_splits
print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
sub = pd.DataFrame({"ID_code": test_df.ID_code.values})
sub["target"] = predictions
sub.to_csv("submission_lgbm_base3.csv", index=False)

sub_lgb = pd.DataFrame({"ID_code": train_df.ID_code.values})
sub_lgb["output"] = train_preds
sub_lgb.to_csv("../input/output_lgbm_base3.csv")


Fold 0
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.929514	valid_1's auc: 0.897949
[10000]	training's auc: 0.949096	valid_1's auc: 0.900497
Early stopping, best iteration is:
[12556]	training's auc: 0.957406	valid_1's auc: 0.90088
Fold 1
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.929658	valid_1's auc: 0.896968
[10000]	training's auc: 0.949257	valid_1's auc: 0.898639
Early stopping, best iteration is:
[9824]	training's auc: 0.948637	valid_1's auc: 0.898687
Fold 2
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.930358	valid_1's auc: 0.890278
[10000]	training's auc: 0.949729	valid_1's auc: 0.89187
Early stopping, best iteration is:
[8415]	training's auc: 0.944191	valid_1's auc: 0.892147
Fold 3
Training until validation scores don't improve for 2000 rounds.
[5000]	training's auc: 0.928991	valid_1's auc: 0.905146
[10000]	training's auc: 0.948754	valid_1's auc: 0