In [8]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score

import xgboost as xgb
import optuna

import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# read the data from /input/train.csv
data = pd.read_csv("merged_data.csv")

In [5]:
X = data.drop(['Class'], axis=1) #'id'
y = data['Class']

# Train, Val, Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.1, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(
#     X_train, y_train, test_size=0.11, random_state=42)

In [17]:
skf = StratifiedKFold(n_splits=10)

val_losses = []

params = {'max_depth': 4,
          'learning_rate': 0.06,
          'colsample_bytree': 0.67,
          'n_jobs': -1,
          'objective': 'binary:logistic',
          'early_stopping_rounds': 150,
          'verbosity': 0,
          'eval_metric': 'logloss'}

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    print("Fold:", fold)
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    fit_set = xgb.DMatrix(X_train, label=y_train)
    val_set = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(params=params, dtrain=fit_set, evals=[(fit_set, 'train'), (val_set, 'val')], num_boost_round=1000, early_stopping_rounds=10, verbose_eval=10)

    val_preds = model.predict(val_set)
    val_score = log_loss(y_val, val_preds)
    val_losses.append(val_score)

Fold: 0
[0]	train-logloss:0.63714	val-logloss:0.63704
[10]	train-logloss:0.31112	val-logloss:0.31038
[20]	train-logloss:0.17331	val-logloss:0.17264
[30]	train-logloss:0.10694	val-logloss:0.10625
[40]	train-logloss:0.07282	val-logloss:0.07227
[50]	train-logloss:0.05536	val-logloss:0.05497
[60]	train-logloss:0.04589	val-logloss:0.04568
[70]	train-logloss:0.04078	val-logloss:0.04073
[80]	train-logloss:0.03807	val-logloss:0.03826
[90]	train-logloss:0.03651	val-logloss:0.03690
[100]	train-logloss:0.03554	val-logloss:0.03612
[110]	train-logloss:0.03482	val-logloss:0.03559
[120]	train-logloss:0.03435	val-logloss:0.03539
[130]	train-logloss:0.03397	val-logloss:0.03527
[140]	train-logloss:0.03366	val-logloss:0.03515
[150]	train-logloss:0.03330	val-logloss:0.03500
[160]	train-logloss:0.03300	val-logloss:0.03485
[170]	train-logloss:0.03270	val-logloss:0.03483
[180]	train-logloss:0.03247	val-logloss:0.03477
[190]	train-logloss:0.03218	val-logloss:0.03467
[200]	train-logloss:0.03193	val-logloss:0.0

In [15]:
sum(val_losses)/10

0.03660307593828035

# Parameter search with Optuna

In [13]:
def objective(trial): 
    params = {
    'task': 'train', 
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':'binary_logloss', 
    'verbose':-10000000,
    'seed':42,
    'max_bin':128,
    'n_estimators':128,
    # 'n_estimators':trial.suggest_int("n_estimators", 800, 1200),
    'learning_rate':0.08,
    'feature_fraction':1.0,
    'bagging_fraction':1.0,
   # 'bagging_freq':trial.suggest_categorical("bagging_freq", [1]),
    'max_depth':trial.suggest_int("max_depth", 5, 12), # decrease this in the next round
    'num_leaves':trial.suggest_int("num_leaves",8, 32),
    'min_data_in_leaf':200,
    'min_gain_to_split':1.0,
}
    
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100)
    y_val_pred = model.predict_proba(X_val)
    logloss = log_loss(y_val, y_val_pred)
    return logloss

In [None]:
study = optuna.create_study(direction='minimize');
study.optimize(objective, n_trials=10);
#optuna.visualization.plot_optimization_history(study)


In [18]:
study.best_trial.params

{'max_depth': 11, 'num_leaves': 12}

In [19]:
params = {
    'task': 'train', 
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':'binary_logloss', 
    'verbose':-10000000,
    'seed':42,
    'max_bin':128,
    'n_estimators':128,
    # 'n_estimators':trial.suggest_int("n_estimators", 800, 1200),
    'learning_rate':0.08,
    'feature_fraction':1.0,
    'bagging_fraction':1.0,
   # 'bagging_freq':trial.suggest_categorical("bagging_freq", [1]),
    'min_data_in_leaf':200,
    'min_gain_to_split':1.0,
}
    

In [20]:
params.update(study.best_trial.params)

In [None]:
tuned_model = lgb.LGBMClassifier(**params)
tuned_model.fit(X_train, y_train, 
        eval_set=[(X_val, y_val)],
        eval_metric=['binary_logloss'],
        early_stopping_rounds=100) 

In [23]:
preds = tuned_model.predict_proba(X_test)
testloss = log_loss(y_test, preds)
testloss

0.030570310091813074

In [24]:
train_preds = tuned_model.predict_proba(X_train)
trainloss = log_loss(y_train, train_preds)
trainloss

0.026942563847874208

In [25]:
val_preds = tuned_model.predict_proba(X_val)
valloss = log_loss(y_val, val_preds)
valloss

0.03386998075382093

# Submission Pipeline

In [None]:
# read the real test data from test.csv
test_data = pd.read_csv("test.csv")

In [None]:
X_testtest = test_data.drop(['id'], axis=1)
ID_test = test_data.id

In [None]:
X_testtest = poly.fit_transform(X_testtest)

In [None]:
preds = tuned_model.predict_proba(X_testtest)

In [None]:
preds

array([[9.99515200e-01, 4.84800360e-04],
       [9.99230283e-01, 7.69716583e-04],
       [9.99826850e-01, 1.73149779e-04],
       ...,
       [9.99832612e-01, 1.67387506e-04],
       [9.35309601e-01, 6.46903993e-02],
       [1.61411435e-02, 9.83858856e-01]])

In [None]:
preds=  preds[:,1]

In [None]:
preds

array([4.84800360e-04, 7.69716583e-04, 1.73149779e-04, ...,
       1.67387506e-04, 6.46903993e-02, 9.83858856e-01])

In [None]:
preds.shape

(78377,)

In [None]:
# Save test predictions to file
submission = pd.DataFrame({'id': ID_test,
                       'Class': preds})
submission.to_csv('submission_optuna_lgbm.csv', index=False)