In [3]:
import numpy                  as np
import pandas                 as pd
import lightgbm               as lgb
from sklearn.metrics          import mean_squared_error
from sklearn.metrics          import roc_auc_score
from sklearn.model_selection  import StratifiedKFold
import utils
import warnings
warnings.filterwarnings('ignore')

In [4]:
train_path   = '../data/input/input_pkl/train/'
df_train     = utils.read_pickles(train_path)

100%|██████████| 5/5 [00:00<00:00, 11.64it/s]


In [5]:
test_path   = '../data/input/input_pkl/test/'
df_test     = utils.read_pickles(test_path)

100%|██████████| 5/5 [00:00<00:00, 24.50it/s]


In [6]:
features = [c for c in df_train.columns if c not in ['ID_code', 'target']]
target = df_train['target']

In [7]:
param = {
    'bagging_freq': 5,          'bagging_fraction': 0.335,   'boost_from_average':'false',   'boost': 'gbdt',
    'feature_fraction': 0.041,   'learning_rate': 0.0083,     'max_depth': -1,                'metric':'auc',
    'min_data_in_leaf': 80,     'min_sum_hessian_in_leaf': 10.0,'num_leaves': 13,           'num_threads': 8,
    'tree_learner': 'serial',   'objective': 'binary',      'verbosity': 1
}

In [9]:
folds       = StratifiedKFold(n_splits=12, shuffle=False, random_state=44000)
oof         = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))

In [12]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print("Fold {}".format(fold_))
    trn_data     = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data     = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])
    clf          = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 4000)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions += clf.predict(df_train[features], num_iteration=clf.best_iteration) / folds.n_splits

Fold 0
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.924808	valid_1's auc: 0.897552
[10000]	training's auc: 0.940315	valid_1's auc: 0.900713
[15000]	training's auc: 0.952888	valid_1's auc: 0.900801
Early stopping, best iteration is:
[14060]	training's auc: 0.950667	valid_1's auc: 0.900968
Fold 1
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.924739	valid_1's auc: 0.89741
[10000]	training's auc: 0.940336	valid_1's auc: 0.899156
[15000]	training's auc: 0.953011	valid_1's auc: 0.899456
Early stopping, best iteration is:
[15365]	training's auc: 0.953879	valid_1's auc: 0.899525
Fold 2
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.92554	valid_1's auc: 0.88936
[10000]	training's auc: 0.940936	valid_1's auc: 0.891233
[15000]	training's auc: 0.953506	valid_1's auc: 0.890813
Early stopping, best iteration is:
[11142]	training's auc: 0.943941	valid_1's auc: 0.89133
Fold 

In [20]:
utils.result_append('200_Model',roc_auc_score(target, oof),param,0.9,12,None)

In [23]:
sub = pd.DataFrame({"ID_code": df_test.ID_code.values})
sub["target"] = predictions
sub.to_csv("../result/submission.csv", index=False)