In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold,RepeatedKFold
import warnings
from six.moves import urllib
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline
# NOTE(review): the bare 'seaborn' style name was deprecated in newer
# matplotlib releases (renamed to 'seaborn-v0_8') — confirm against the
# installed version before re-running.
plt.style.use('seaborn')
from scipy.stats import norm, skew

In [2]:
# Read the train and test tables from the current working directory
# (train first, then test).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Model inputs: every training column except the row identifier and the label.
features = [col for col in train.columns if col not in ('ID_code', 'target')]

In [3]:
# LightGBM training parameters, passed unchanged to `lgb.train`.
# Key order and values are kept exactly as originally written; note that
# `boost_from_average` is supplied as the string 'false', not a Python bool.
param = dict(
    # Subsampling settings.
    bagging_freq=5,
    bagging_fraction=0.335,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.041,
    # Step size per boosting round.
    learning_rate=0.0083,
    # Tree-shape constraints (-1 leaves depth unconstrained).
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    # Execution settings.
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=-1,
)

In [4]:
# Cross-validated LightGBM training.
#
# For each stratified fold a model is trained on the in-fold rows and
# early-stopped on the held-out rows. Two arrays are produced:
#   oof         - out-of-fold prediction for every training row
#   predictions - test-set prediction averaged over all folds

# Split the label off and keep only the model inputs in `train`.
# The guard keeps the cell re-runnable: after the first run `train` no longer
# has a 'target' column and `target` is already bound in the kernel.
if 'target' in train.columns:
    target = train['target']
    train = train.drop(["ID_code", "target"], axis=1)

num_folds = 11
features = [c for c in train.columns if c not in ['ID_code', 'target']]

# `random_state` has no effect when shuffle=False, and scikit-learn >= 0.24
# rejects that combination with a ValueError. Dropping it leaves the fold
# assignment unchanged (still unshuffled).
folds = StratifiedKFold(n_splits=num_folds, shuffle=False)
oof = np.zeros(len(train))
# One slot per *test* row: this buffer accumulates predictions on `test`, so
# sizing it from the training labels only works when both tables happen to
# have the same number of rows.
predictions = np.zeros(len(test))

print('Light GBM Model')
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])

    # The round cap is effectively "unbounded"; training is expected to end via
    # early stopping once the validation metric stalls for 4000 rounds.
    # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keywords were
    # replaced by callbacks in newer LightGBM releases — confirm the pinned
    # version before upgrading.
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 4000)
    # Score the held-out rows with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    # Each fold contributes an equal share to the averaged test prediction.
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

# AUC of the out-of-fold predictions over the whole training set.
print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Light GBM Model
Fold idx:1
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.925038	valid_1's auc: 0.898056
[10000]	training's auc: 0.940795	valid_1's auc: 0.900623
[15000]	training's auc: 0.953527	valid_1's auc: 0.901141
Early stopping, best iteration is:
[14826]	training's auc: 0.953108	valid_1's auc: 0.901212
Fold idx:2
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.925337	valid_1's auc: 0.896843
[10000]	training's auc: 0.94097	valid_1's auc: 0.89824
[15000]	training's auc: 0.953692	valid_1's auc: 0.897862
Early stopping, best iteration is:
[12398]	training's auc: 0.947305	valid_1's auc: 0.89828
Fold idx:3
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.925795	valid_1's auc: 0.890566
[10000]	training's auc: 0.941327	valid_1's auc: 0.89323
[15000]	training's auc: 0.954066	valid_1's auc: 0.892995
Early stopping, best iteration is:
[12375]	training's auc: 0.947641	v

In [5]:
# Write the fold-averaged test predictions to a numbered submission CSV.
num_sub = 19  # suffix used in the output file name
print('Saving the Submission File')

# Two columns, in this order: the test row identifier and its predicted score.
sub = pd.DataFrame({"ID_code": test["ID_code"].values}).assign(target=predictions)

submission_path = 'sub{}.csv'.format(num_sub)
sub.to_csv(submission_path, index=False)

Saving the Submission File
