In [31]:
import os
import gc
import numpy as np
import pandas as pd

In [39]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
gender_submission = pd.read_csv("../input/submission.csv")

In [40]:
data = pd.concat([train, test], sort=False)

In [41]:
data['Sex'].replace(['male','female'], [0, 1], inplace=True)
#data['Embarked'].fillna(('S'), inplace=True)
data['Embarked'] = data['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} )
#data['Fare'].fillna(np.mean(data['Fare']), inplace=True)

#age_avg = data['Age'].mean()
#age_std = data['Age'].std()

#data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)

delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

train = data[:len(train)]
test = data[len(train):]

y = train['Survived']
X = train.drop('Survived', axis = 1)
test = test.drop('Survived', axis = 1)

In [42]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna

In [43]:
# optunaで最適化のために一連の処理を関数化
def objectives(trial):
    drop_rate = trial.suggest_uniform('drop_rate', 0, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0, 1.0)
    learning_rate = trial.suggest_uniform('learning_rate', 0, 1.0)
    subsample = trial.suggest_uniform('subsample', 0.8, 1.0)
    num_leaves = trial.suggest_int('num_leaves', 5, 1000)
    verbosity = trial.suggest_int('verbosity', -1, 1)
    num_boost_round = trial.suggest_int('num_boost_round', 10, 100000)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 10, 100000)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 500)
    min_child_weight = trial.suggest_int('min_child_weight', 5, 500)

    params = {"objective": "binary",
              "boosting_type": "gbdt",
              "learning_rate": learning_rate,
              "num_leaves": num_leaves,
              "max_bin": 256,
              "feature_fraction": feature_fraction,
              "verbosity": verbosity,
              "drop_rate": drop_rate,
              "is_unbalance": False,
              "max_drop": 50,
              "min_child_samples": min_child_samples,
              "min_child_weight": min_child_weight,
              "min_split_gain": 0,
              "min_data_in_leaf": min_data_in_leaf,
              "subsample": subsample,
              "metric": "auc"
              }
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val)
    
    clf = lgb.train(params, 
                    dtrain,
                    1000, 
                    valid_sets = [dtrain, dval], 
                    verbose_eval=200, 
                    early_stopping_rounds=100)
    
    y_pred_val = clf.predict(X_val)
    y_pred_val = (y_pred_val > 0.5).astype(int)
    score = accuracy_score(y_pred_val, y_val)
    print(score)
    return score

# optunaによる最適化呼び出し
opt = optuna.create_study(direction='maximize')# scoreの最大化を目指す。最小化の場合はminimize
opt.optimize(objectives, n_trials=30)
# 実行結果表示
print('最終トライアル回数: {}'.format(len(opt.trials)))
print('ベストトライアル:')
trial = opt.best_trial
print('値: {}'.format(trial.value))
print('パラメータ:')
for key, value in trial.params.items():
    print('{}: {}'.format(key, value))

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:05,003] Finished trial#0 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:05,127] Finished trial#1 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:05,247] Finished trial#2 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:05,363] Finished trial#3 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:05,476] Finished trial#4 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:05,596] Finished trial#5 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:05,718] Finished trial#6 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:05,828] Finished trial#7 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:05,936] Finished trial#8 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:06,048] Finished trial#9 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:06,193] Finished trial#10 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:06,341] Finished trial#11 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:06,476] Finished trial#12 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:06,608] Finished trial#13 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:06,743] Finished trial#14 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:06,899] Finished trial#15 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:07,036] Finished trial#16 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:07,173] Finished trial#17 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:07,296] Finished trial#18 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:07,425] Finished trial#19 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:07,559] Finished trial#20 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:07,683] Finished trial#21 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:07,813] Finished trial#22 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:07,954] Finished trial#23 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:08,080] Finished trial#24 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:08,214] Finished trial#25 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:08,346] Finished trial#26 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:08,470] Finished trial#27 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:08,607] Finished trial#28 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
0.5921787709497207


[I 2020-02-13 03:46:08,755] Finished trial#29 resulted in value: 0.5921787709497207. Current best value is 0.5921787709497207 with parameters: {'drop_rate': 0.9754676518035469, 'feature_fraction': 0.1004995968580088, 'learning_rate': 0.43803383148043296, 'subsample': 0.8948938528455712, 'num_leaves': 824, 'verbosity': 0, 'num_boost_round': 9726, 'min_data_in_leaf': 53645, 'min_child_samples': 232, 'min_child_weight': 462}.


最終トライアル回数: 30
ベストトライアル:
値: 0.5921787709497207
パラメータ:
drop_rate: 0.9754676518035469
feature_fraction: 0.1004995968580088
learning_rate: 0.43803383148043296
subsample: 0.8948938528455712
num_leaves: 824
verbosity: 0
num_boost_round: 9726
min_data_in_leaf: 53645
min_child_samples: 232
min_child_weight: 462


In [44]:
trial.params

{'drop_rate': 0.9754676518035469,
 'feature_fraction': 0.1004995968580088,
 'learning_rate': 0.43803383148043296,
 'subsample': 0.8948938528455712,
 'num_leaves': 824,
 'verbosity': 0,
 'num_boost_round': 9726,
 'min_data_in_leaf': 53645,
 'min_child_samples': 232,
 'min_child_weight': 462}

In [45]:
params = {'drop_rate': 0.8128936953440956,
          'feature_fraction': 0.48426736268694337,
          'learning_rate': 0.3355716162961806,
          'subsample': 0.8786519586713651,
          'num_leaves': 387,
          'verbosity': 0,
          'num_boost_round': 34035,
          'min_data_in_leaf': 34715,
          'min_child_samples': 278,
          'min_child_weight': 106,
          'objective': "binary",
          'boosting_type': "gbdt",
          'max_bin': 256,
          'metric': "auc"
         }

In [46]:
results = oof(X, y, test, params)
print(f"\nMean AUC = {results['score']}")



Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
Fold 1 | AUC: 0.5




Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
Fold 2 | AUC: 0.5




Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
Fold 3 | AUC: 0.5




Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
Fold 4 | AUC: 0.5




Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_1's auc: 0.5
Fold 5 | AUC: 0.5

Mean AUC = 0.5


In [21]:
def oof(X, y, test, params, NFOLDS = 5):
    
    columns = X.columns
    
    folds = KFold(n_splits=NFOLDS)
    splits = folds.split(X, y)
    
    y_preds = np.zeros(test.shape[0])
    y_oof = np.zeros(X.shape[0])
    score = 0
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = columns
    
    for fold_n, (train_index, valid_index) in enumerate(splits):
        X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
        dtrain = lgb.Dataset(X_train, label=y_train)
        dvalid = lgb.Dataset(X_valid, label=y_valid)
        
        clf = lgb.train(params, 
                        dtrain, 
                        1000, 
                        valid_sets = [dtrain, dvalid], 
                        verbose_eval=200, 
                        early_stopping_rounds=100)
        
        #===== feature importances =====
        feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
        
        #===== out-of-fold =====
        y_pred_valid = clf.predict(X_valid)
        y_oof[valid_index] = y_pred_valid
        
        #===== score =====
        score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
        
        #===== test =====
        y_preds += clf.predict(test) / NFOLDS
        
        print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
        
        del X_train, X_valid, y_train, y_valid
        gc.collect()
    
    return {'y_oof': y_oof,
            'y_preds': y_preds,
            'score': score, 
            'feature_importances': feature_importances}