In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score
import optuna
import lightgbm as lgb
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# read the data from /input/train.csv
data = pd.read_csv("train.csv")

In [6]:
X = data.drop(['Class', 'id'], axis=1)
y = data['Class']

In [7]:
X.head()

Unnamed: 0,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve
0,133.171875,59.716081,0.043133,-0.703383,54.917224,70.084438,0.749798,-0.649512
1,87.09375,36.257973,0.435469,2.266057,3.417224,21.865069,7.03933,52.686251
2,112.640625,39.818393,0.379639,0.922306,2.730769,15.68969,8.193471,85.649785
3,120.679688,45.918448,-0.09849,0.011775,2.696488,20.954662,8.183874,70.332899
4,134.070312,57.720107,-0.107772,-0.573335,1.10786,11.255051,16.107748,308.753765


# Preprocessing with PolynomialFeatures!!!

In [8]:
# preprocess the data by adding interaction features
#from sklearn.preprocessing import PolynomialFeatures
#poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
#X = poly.fit_transform(X)

In [9]:
type(X)

pandas.core.frame.DataFrame

# Train, Val, Test Split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.11, random_state=42)

In [11]:
X

Unnamed: 0,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve
0,133.171875,59.716081,0.043133,-0.703383,54.917224,70.084438,0.749798,-0.649512
1,87.093750,36.257973,0.435469,2.266057,3.417224,21.865069,7.039330,52.686251
2,112.640625,39.818393,0.379639,0.922306,2.730769,15.689690,8.193471,85.649785
3,120.679688,45.918448,-0.098490,0.011775,2.696488,20.954662,8.183874,70.332899
4,134.070312,57.720107,-0.107772,-0.573335,1.107860,11.255051,16.107748,308.753765
...,...,...,...,...,...,...,...,...
117559,132.843750,56.748838,-0.060070,-0.554084,4.054348,27.844144,6.564423,44.442664
117560,112.578125,52.539271,0.179580,-0.306961,1.637960,15.331913,11.384718,142.535470
117561,119.757812,49.980013,0.064402,-0.270822,3.877926,19.788559,6.959740,56.367789
117562,105.789062,46.986595,0.441426,0.372466,2.097826,17.170612,9.442445,99.074539


In [12]:
X_train.shape

(94168, 8)

# Parameter search with Optuna

In [13]:
def objective(trial): 
    params = {
    'task': 'train', 
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':'binary_logloss', 
    'verbose':-10000000,
    'seed':42,
    'max_bin':128,
    'n_estimators':128,
    # 'n_estimators':trial.suggest_int("n_estimators", 800, 1200),
    'learning_rate':0.08,
    'feature_fraction':1.0,
    'bagging_fraction':1.0,
   # 'bagging_freq':trial.suggest_categorical("bagging_freq", [1]),
    'max_depth':trial.suggest_int("max_depth", 5, 12), # decrease this in the next round
    'num_leaves':trial.suggest_int("num_leaves",8, 32),
    'min_data_in_leaf':200,
    'min_gain_to_split':1.0,
}
    
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100)
    y_val_pred = model.predict_proba(X_val)
    logloss = log_loss(y_val, y_val_pred)
    return logloss

In [17]:
study = optuna.create_study(direction='minimize');
study.optimize(objective, n_trials=10);
#optuna.visualization.plot_optimization_history(study)


[32m[I 2023-03-13 15:51:49,935][0m A new study created in memory with name: no-name-68545e48-9bd2-4a69-aa6f-e55aad46b501[0m


[1]	valid_0's binary_logloss: 0.246449
[2]	valid_0's binary_logloss: 0.212494
[3]	valid_0's binary_logloss: 0.18836
[4]	valid_0's binary_logloss: 0.169523
[5]	valid_0's binary_logloss: 0.154069
[6]	valid_0's binary_logloss: 0.141098
[7]	valid_0's binary_logloss: 0.129997
[8]	valid_0's binary_logloss: 0.120353
[9]	valid_0's binary_logloss: 0.111919
[10]	valid_0's binary_logloss: 0.10439
[11]	valid_0's binary_logloss: 0.0976797
[12]	valid_0's binary_logloss: 0.0917019
[13]	valid_0's binary_logloss: 0.0863629
[14]	valid_0's binary_logloss: 0.0815257
[15]	valid_0's binary_logloss: 0.0772091
[16]	valid_0's binary_logloss: 0.0733193
[17]	valid_0's binary_logloss: 0.0697887
[18]	valid_0's binary_logloss: 0.0666331
[19]	valid_0's binary_logloss: 0.0637304
[20]	valid_0's binary_logloss: 0.0611072
[21]	valid_0's binary_logloss: 0.0587462
[22]	valid_0's binary_logloss: 0.0565292
[23]	valid_0's binary_logloss: 0.0545757
[24]	valid_0's binary_logloss: 0.0527582
[25]	valid_0's binary_logloss: 0.0511

[32m[I 2023-03-13 15:51:50,408][0m Trial 0 finished with value: 0.03427285365910371 and parameters: {'max_depth': 11, 'num_leaves': 21}. Best is trial 0 with value: 0.03427285365910371.[0m



[93]	valid_0's binary_logloss: 0.0343708
[94]	valid_0's binary_logloss: 0.034314
[95]	valid_0's binary_logloss: 0.0343208
[96]	valid_0's binary_logloss: 0.0342865
[97]	valid_0's binary_logloss: 0.0342729
[98]	valid_0's binary_logloss: 0.0342973
[99]	valid_0's binary_logloss: 0.0343109
[100]	valid_0's binary_logloss: 0.0343007
[101]	valid_0's binary_logloss: 0.0343036
[102]	valid_0's binary_logloss: 0.0343011
[103]	valid_0's binary_logloss: 0.0342869
[104]	valid_0's binary_logloss: 0.03431
[105]	valid_0's binary_logloss: 0.0343158
[106]	valid_0's binary_logloss: 0.0343065
[107]	valid_0's binary_logloss: 0.0343196
[108]	valid_0's binary_logloss: 0.0343319
[109]	valid_0's binary_logloss: 0.0343323
[110]	valid_0's binary_logloss: 0.0343323
[111]	valid_0's binary_logloss: 0.0343323
[112]	valid_0's binary_logloss: 0.0343323
[113]	valid_0's binary_logloss: 0.0343323
[114]	valid_0's binary_logloss: 0.0343323
[115]	valid_0's binary_logloss: 0.0343323
[116]	valid_0's binary_logloss: 0.0343323
[




[2]	valid_0's binary_logloss: 0.212586
[3]	valid_0's binary_logloss: 0.188484
[4]	valid_0's binary_logloss: 0.169664
[5]	valid_0's binary_logloss: 0.154295
[6]	valid_0's binary_logloss: 0.141353
[7]	valid_0's binary_logloss: 0.130239
[8]	valid_0's binary_logloss: 0.12061
[9]	valid_0's binary_logloss: 0.112134
[10]	valid_0's binary_logloss: 0.104625
[11]	valid_0's binary_logloss: 0.0979358
[12]	valid_0's binary_logloss: 0.09198
[13]	valid_0's binary_logloss: 0.086669
[14]	valid_0's binary_logloss: 0.0818472
[15]	valid_0's binary_logloss: 0.0775802
[16]	valid_0's binary_logloss: 0.0737042
[17]	valid_0's binary_logloss: 0.0701966
[18]	valid_0's binary_logloss: 0.067032
[19]	valid_0's binary_logloss: 0.0641329
[20]	valid_0's binary_logloss: 0.0615132
[21]	valid_0's binary_logloss: 0.0591508
[22]	valid_0's binary_logloss: 0.0570212
[23]	valid_0's binary_logloss: 0.0550377
[24]	valid_0's binary_logloss: 0.0532273
[25]	valid_0's binary_logloss: 0.0515978
[26]	valid_0's binary_logloss: 0.0501

[32m[I 2023-03-13 15:51:50,891][0m Trial 1 finished with value: 0.03386998075382093 and parameters: {'max_depth': 11, 'num_leaves': 12}. Best is trial 1 with value: 0.03386998075382093.[0m


[1]	valid_0's binary_logloss: 0.246432
[2]	valid_0's binary_logloss: 0.212473
[3]	valid_0's binary_logloss: 0.188335
[4]	valid_0's binary_logloss: 0.169487
[5]	valid_0's binary_logloss: 0.154026
[6]	valid_0's binary_logloss: 0.141056
[7]	valid_0's binary_logloss: 0.129948
[8]	valid_0's binary_logloss: 0.120302
[9]	valid_0's binary_logloss: 0.111866
[10]	valid_0's binary_logloss: 0.10432
[11]	valid_0's binary_logloss: 0.0976137
[12]	valid_0's binary_logloss: 0.0916305
[13]	valid_0's binary_logloss: 0.0862944
[14]	valid_0's binary_logloss: 0.0815024
[15]	valid_0's binary_logloss: 0.0771059
[16]	valid_0's binary_logloss: 0.0732028
[17]	valid_0's binary_logloss: 0.0696264
[18]	valid_0's binary_logloss: 0.0664535
[19]	valid_0's binary_logloss: 0.0635868
[20]	valid_0's binary_logloss: 0.0609343
[21]	valid_0's binary_logloss: 0.0585421
[22]	valid_0's binary_logloss: 0.056332
[23]	valid_0's binary_logloss: 0.0543479
[24]	valid_0's binary_logloss: 0.0525217
[25]	valid_0's binary_logloss: 0.0508

[32m[I 2023-03-13 15:51:51,387][0m Trial 2 finished with value: 0.034194972548673465 and parameters: {'max_depth': 9, 'num_leaves': 29}. Best is trial 1 with value: 0.03386998075382093.[0m



[88]	valid_0's binary_logloss: 0.0342229
[89]	valid_0's binary_logloss: 0.0342079
[90]	valid_0's binary_logloss: 0.034195
[91]	valid_0's binary_logloss: 0.0342075
[92]	valid_0's binary_logloss: 0.0342172
[93]	valid_0's binary_logloss: 0.0342815
[94]	valid_0's binary_logloss: 0.0342507
[95]	valid_0's binary_logloss: 0.0342511
[96]	valid_0's binary_logloss: 0.0342798
[97]	valid_0's binary_logloss: 0.0342625
[98]	valid_0's binary_logloss: 0.0342859
[99]	valid_0's binary_logloss: 0.0343001
[100]	valid_0's binary_logloss: 0.0343082
[101]	valid_0's binary_logloss: 0.0343291
[102]	valid_0's binary_logloss: 0.0343269
[103]	valid_0's binary_logloss: 0.0343269
[104]	valid_0's binary_logloss: 0.0343269
[105]	valid_0's binary_logloss: 0.0343269
[106]	valid_0's binary_logloss: 0.0343269
[107]	valid_0's binary_logloss: 0.0343269
[108]	valid_0's binary_logloss: 0.0343269
[109]	valid_0's binary_logloss: 0.0343269
[110]	valid_0's binary_logloss: 0.0343269
[111]	valid_0's binary_logloss: 0.0343269
[112



[1]	valid_0's binary_logloss: 0.246461
[2]	valid_0's binary_logloss: 0.212509
[3]	valid_0's binary_logloss: 0.18838
[4]	valid_0's binary_logloss: 0.16954
[5]	valid_0's binary_logloss: 0.154082
[6]	valid_0's binary_logloss: 0.141112
[7]	valid_0's binary_logloss: 0.130019
[8]	valid_0's binary_logloss: 0.120338
[9]	valid_0's binary_logloss: 0.111905
[10]	valid_0's binary_logloss: 0.104426
[11]	valid_0's binary_logloss: 0.097759
[12]	valid_0's binary_logloss: 0.0917623
[13]	valid_0's binary_logloss: 0.0864202
[14]	valid_0's binary_logloss: 0.0816525
[15]	valid_0's binary_logloss: 0.0773188
[16]	valid_0's binary_logloss: 0.0734169
[17]	valid_0's binary_logloss: 0.06992
[18]	valid_0's binary_logloss: 0.0667295
[19]	valid_0's binary_logloss: 0.0638
[20]	valid_0's binary_logloss: 0.061159
[21]	valid_0's binary_logloss: 0.0587818
[22]	valid_0's binary_logloss: 0.0565973
[23]	valid_0's binary_logloss: 0.0546514
[24]	valid_0's binary_logloss: 0.0528495
[25]	valid_0's binary_logloss: 0.051203
[26]

[32m[I 2023-03-13 15:51:51,880][0m Trial 3 finished with value: 0.03446520415473591 and parameters: {'max_depth': 12, 'num_leaves': 19}. Best is trial 1 with value: 0.03386998075382093.[0m


[1]	valid_0's binary_logloss: 0.24648
[2]	valid_0's binary_logloss: 0.212512
[3]	valid_0's binary_logloss: 0.188387
[4]	valid_0's binary_logloss: 0.169553
[5]	valid_0's binary_logloss: 0.154103
[6]	valid_0's binary_logloss: 0.141135
[7]	valid_0's binary_logloss: 0.130004
[8]	valid_0's binary_logloss: 0.120366
[9]	valid_0's binary_logloss: 0.111906
[10]	valid_0's binary_logloss: 0.104363
[11]	valid_0's binary_logloss: 0.0976506
[12]	valid_0's binary_logloss: 0.0916862
[13]	valid_0's binary_logloss: 0.0863692
[14]	valid_0's binary_logloss: 0.0815501
[15]	valid_0's binary_logloss: 0.077182
[16]	valid_0's binary_logloss: 0.0732746
[17]	valid_0's binary_logloss: 0.0697306
[18]	valid_0's binary_logloss: 0.0665465
[19]	valid_0's binary_logloss: 0.0636801
[20]	valid_0's binary_logloss: 0.0610341
[21]	valid_0's binary_logloss: 0.0586183
[22]	valid_0's binary_logloss: 0.0564081
[23]	valid_0's binary_logloss: 0.0544441
[24]	valid_0's binary_logloss: 0.0526455
[25]	valid_0's binary_logloss: 0.0509

[32m[I 2023-03-13 15:51:52,360][0m Trial 4 finished with value: 0.03444125140461969 and parameters: {'max_depth': 6, 'num_leaves': 26}. Best is trial 1 with value: 0.03386998075382093.[0m



[95]	valid_0's binary_logloss: 0.0344766
[96]	valid_0's binary_logloss: 0.0344717
[97]	valid_0's binary_logloss: 0.0344817
[98]	valid_0's binary_logloss: 0.0344884
[99]	valid_0's binary_logloss: 0.0344874
[100]	valid_0's binary_logloss: 0.0344903
[101]	valid_0's binary_logloss: 0.0345237
[102]	valid_0's binary_logloss: 0.0345259
[103]	valid_0's binary_logloss: 0.0345314
[104]	valid_0's binary_logloss: 0.0345521
[105]	valid_0's binary_logloss: 0.0345812
[106]	valid_0's binary_logloss: 0.0345619
[107]	valid_0's binary_logloss: 0.0345619
[108]	valid_0's binary_logloss: 0.0345619
[109]	valid_0's binary_logloss: 0.0345619
[110]	valid_0's binary_logloss: 0.0345619
[111]	valid_0's binary_logloss: 0.0345619
[112]	valid_0's binary_logloss: 0.0345619
[113]	valid_0's binary_logloss: 0.0345619
[114]	valid_0's binary_logloss: 0.0345619
[115]	valid_0's binary_logloss: 0.0345619
[116]	valid_0's binary_logloss: 0.0345619
[117]	valid_0's binary_logloss: 0.0345619
[118]	valid_0's binary_logloss: 0.0345




[2]	valid_0's binary_logloss: 0.212478
[3]	valid_0's binary_logloss: 0.188342
[4]	valid_0's binary_logloss: 0.169495
[5]	valid_0's binary_logloss: 0.154034
[6]	valid_0's binary_logloss: 0.141064
[7]	valid_0's binary_logloss: 0.129957
[8]	valid_0's binary_logloss: 0.120309
[9]	valid_0's binary_logloss: 0.111876
[10]	valid_0's binary_logloss: 0.104338
[11]	valid_0's binary_logloss: 0.0976297
[12]	valid_0's binary_logloss: 0.0916472
[13]	valid_0's binary_logloss: 0.0863158
[14]	valid_0's binary_logloss: 0.0815252
[15]	valid_0's binary_logloss: 0.0771333
[16]	valid_0's binary_logloss: 0.0732537
[17]	valid_0's binary_logloss: 0.0696972
[18]	valid_0's binary_logloss: 0.0665194
[19]	valid_0's binary_logloss: 0.0636538
[20]	valid_0's binary_logloss: 0.06102
[21]	valid_0's binary_logloss: 0.0586318
[22]	valid_0's binary_logloss: 0.056413
[23]	valid_0's binary_logloss: 0.0544474
[24]	valid_0's binary_logloss: 0.0526486
[25]	valid_0's binary_logloss: 0.0510582
[26]	valid_0's binary_logloss: 0.04

[32m[I 2023-03-13 15:51:52,892][0m Trial 5 finished with value: 0.03416133077275566 and parameters: {'max_depth': 9, 'num_leaves': 25}. Best is trial 1 with value: 0.03386998075382093.[0m



[114]	valid_0's binary_logloss: 0.0342963
[115]	valid_0's binary_logloss: 0.0342963
[116]	valid_0's binary_logloss: 0.0342963
[117]	valid_0's binary_logloss: 0.0342963
[118]	valid_0's binary_logloss: 0.0342963
[119]	valid_0's binary_logloss: 0.0342963
[120]	valid_0's binary_logloss: 0.0342963
[121]	valid_0's binary_logloss: 0.0342963
[122]	valid_0's binary_logloss: 0.0342963
[123]	valid_0's binary_logloss: 0.0342963
[124]	valid_0's binary_logloss: 0.0342963
[125]	valid_0's binary_logloss: 0.0342963
[126]	valid_0's binary_logloss: 0.0342963
[127]	valid_0's binary_logloss: 0.0342963
[128]	valid_0's binary_logloss: 0.0342963
[1]	valid_0's binary_logloss: 0.246506
[2]	valid_0's binary_logloss: 0.212559
[3]	valid_0's binary_logloss: 0.188445
[4]	valid_0's binary_logloss: 0.169629
[5]	valid_0's binary_logloss: 0.154189
[6]	valid_0's binary_logloss: 0.141231




[7]	valid_0's binary_logloss: 0.130123
[8]	valid_0's binary_logloss: 0.120485
[9]	valid_0's binary_logloss: 0.112044
[10]	valid_0's binary_logloss: 0.104529
[11]	valid_0's binary_logloss: 0.0978531
[12]	valid_0's binary_logloss: 0.0918767
[13]	valid_0's binary_logloss: 0.0865289
[14]	valid_0's binary_logloss: 0.0817477
[15]	valid_0's binary_logloss: 0.0774062
[16]	valid_0's binary_logloss: 0.0735144
[17]	valid_0's binary_logloss: 0.0700067
[18]	valid_0's binary_logloss: 0.0668208
[19]	valid_0's binary_logloss: 0.0639323
[20]	valid_0's binary_logloss: 0.0613275
[21]	valid_0's binary_logloss: 0.0589589
[22]	valid_0's binary_logloss: 0.0567534
[23]	valid_0's binary_logloss: 0.0547733
[24]	valid_0's binary_logloss: 0.0529693
[25]	valid_0's binary_logloss: 0.0513412
[26]	valid_0's binary_logloss: 0.0498225
[27]	valid_0's binary_logloss: 0.0485016
[28]	valid_0's binary_logloss: 0.0472655
[29]	valid_0's binary_logloss: 0.0461208
[30]	valid_0's binary_logloss: 0.045083
[31]	valid_0's binary_l

[32m[I 2023-03-13 15:51:53,381][0m Trial 6 finished with value: 0.03390044143109382 and parameters: {'max_depth': 6, 'num_leaves': 16}. Best is trial 1 with value: 0.03386998075382093.[0m



[68]	valid_0's binary_logloss: 0.0344627
[69]	valid_0's binary_logloss: 0.0344482
[70]	valid_0's binary_logloss: 0.0343923
[71]	valid_0's binary_logloss: 0.0343417
[72]	valid_0's binary_logloss: 0.0343227
[73]	valid_0's binary_logloss: 0.0342474
[74]	valid_0's binary_logloss: 0.0342246
[75]	valid_0's binary_logloss: 0.0341583
[76]	valid_0's binary_logloss: 0.0341703
[77]	valid_0's binary_logloss: 0.0341484
[78]	valid_0's binary_logloss: 0.0341519
[79]	valid_0's binary_logloss: 0.0341364
[80]	valid_0's binary_logloss: 0.0341268
[81]	valid_0's binary_logloss: 0.0341313
[82]	valid_0's binary_logloss: 0.034135
[83]	valid_0's binary_logloss: 0.034146
[84]	valid_0's binary_logloss: 0.0341109
[85]	valid_0's binary_logloss: 0.0340596
[86]	valid_0's binary_logloss: 0.0340704
[87]	valid_0's binary_logloss: 0.0340777
[88]	valid_0's binary_logloss: 0.0340556
[89]	valid_0's binary_logloss: 0.0340961
[90]	valid_0's binary_logloss: 0.0340926
[91]	valid_0's binary_logloss: 0.0340561
[92]	valid_0's bi



[1]	valid_0's binary_logloss: 0.246461
[2]	valid_0's binary_logloss: 0.212509
[3]	valid_0's binary_logloss: 0.18838
[4]	valid_0's binary_logloss: 0.16954
[5]	valid_0's binary_logloss: 0.154082
[6]	valid_0's binary_logloss: 0.141112
[7]	valid_0's binary_logloss: 0.130019
[8]	valid_0's binary_logloss: 0.120338
[9]	valid_0's binary_logloss: 0.111905
[10]	valid_0's binary_logloss: 0.104426
[11]	valid_0's binary_logloss: 0.097759
[12]	valid_0's binary_logloss: 0.0917623
[13]	valid_0's binary_logloss: 0.0864202
[14]	valid_0's binary_logloss: 0.0816525
[15]	valid_0's binary_logloss: 0.0773188
[16]	valid_0's binary_logloss: 0.0734169
[17]	valid_0's binary_logloss: 0.06992
[18]	valid_0's binary_logloss: 0.0667295
[19]	valid_0's binary_logloss: 0.0638
[20]	valid_0's binary_logloss: 0.061159
[21]	valid_0's binary_logloss: 0.0587818
[22]	valid_0's binary_logloss: 0.0565973
[23]	valid_0's binary_logloss: 0.0546514
[24]	valid_0's binary_logloss: 0.0528495
[25]	valid_0's binary_logloss: 0.051203
[26]

[32m[I 2023-03-13 15:51:53,872][0m Trial 7 finished with value: 0.034322739056157583 and parameters: {'max_depth': 10, 'num_leaves': 19}. Best is trial 1 with value: 0.03386998075382093.[0m



[96]	valid_0's binary_logloss: 0.0343802
[97]	valid_0's binary_logloss: 0.0343698
[98]	valid_0's binary_logloss: 0.0343713
[99]	valid_0's binary_logloss: 0.0343807
[100]	valid_0's binary_logloss: 0.0343954
[101]	valid_0's binary_logloss: 0.0344103
[102]	valid_0's binary_logloss: 0.0343945
[103]	valid_0's binary_logloss: 0.034349
[104]	valid_0's binary_logloss: 0.0343426
[105]	valid_0's binary_logloss: 0.0343227
[106]	valid_0's binary_logloss: 0.0343227
[107]	valid_0's binary_logloss: 0.0343227
[108]	valid_0's binary_logloss: 0.0343227
[109]	valid_0's binary_logloss: 0.0343227
[110]	valid_0's binary_logloss: 0.0343227
[111]	valid_0's binary_logloss: 0.0343227
[112]	valid_0's binary_logloss: 0.0343227
[113]	valid_0's binary_logloss: 0.0343227
[114]	valid_0's binary_logloss: 0.0343227
[115]	valid_0's binary_logloss: 0.0343227
[116]	valid_0's binary_logloss: 0.0343227
[117]	valid_0's binary_logloss: 0.0343227
[118]	valid_0's binary_logloss: 0.0343227
[119]	valid_0's binary_logloss: 0.0343



[1]	valid_0's binary_logloss: 0.246449
[2]	valid_0's binary_logloss: 0.212494
[3]	valid_0's binary_logloss: 0.18836
[4]	valid_0's binary_logloss: 0.169523
[5]	valid_0's binary_logloss: 0.154069
[6]	valid_0's binary_logloss: 0.141098
[7]	valid_0's binary_logloss: 0.129997
[8]	valid_0's binary_logloss: 0.120353
[9]	valid_0's binary_logloss: 0.111919
[10]	valid_0's binary_logloss: 0.10439
[11]	valid_0's binary_logloss: 0.0976797
[12]	valid_0's binary_logloss: 0.0917019
[13]	valid_0's binary_logloss: 0.0863629
[14]	valid_0's binary_logloss: 0.0815257
[15]	valid_0's binary_logloss: 0.0772091
[16]	valid_0's binary_logloss: 0.0733193
[17]	valid_0's binary_logloss: 0.0697887
[18]	valid_0's binary_logloss: 0.0666331
[19]	valid_0's binary_logloss: 0.0637304
[20]	valid_0's binary_logloss: 0.0611072
[21]	valid_0's binary_logloss: 0.0587462
[22]	valid_0's binary_logloss: 0.0565292
[23]	valid_0's binary_logloss: 0.0545757
[24]	valid_0's binary_logloss: 0.0527582
[25]	valid_0's binary_logloss: 0.0511

[32m[I 2023-03-13 15:51:54,360][0m Trial 8 finished with value: 0.03422394065196606 and parameters: {'max_depth': 12, 'num_leaves': 21}. Best is trial 1 with value: 0.03386998075382093.[0m


[1]	valid_0's binary_logloss: 0.24647
[2]	valid_0's binary_logloss: 0.212519
[3]	valid_0's binary_logloss: 0.18839
[4]	valid_0's binary_logloss: 0.169556
[5]	valid_0's binary_logloss: 0.154103
[6]	valid_0's binary_logloss: 0.141138
[7]	valid_0's binary_logloss: 0.130052
[8]	valid_0's binary_logloss: 0.12036
[9]	valid_0's binary_logloss: 0.111933
[10]	valid_0's binary_logloss: 0.104404
[11]	valid_0's binary_logloss: 0.0977014
[12]	valid_0's binary_logloss: 0.0917499
[13]	valid_0's binary_logloss: 0.0864016
[14]	valid_0's binary_logloss: 0.0815941
[15]	valid_0's binary_logloss: 0.0772686
[16]	valid_0's binary_logloss: 0.0733971
[17]	valid_0's binary_logloss: 0.0698695
[18]	valid_0's binary_logloss: 0.0667016
[19]	valid_0's binary_logloss: 0.0638279
[20]	valid_0's binary_logloss: 0.0611768
[21]	valid_0's binary_logloss: 0.0587766
[22]	valid_0's binary_logloss: 0.056613
[23]	valid_0's binary_logloss: 0.0546123
[24]	valid_0's binary_logloss: 0.0528353
[25]	valid_0's binary_logloss: 0.051210

[32m[I 2023-03-13 15:51:54,862][0m Trial 9 finished with value: 0.03396449453089007 and parameters: {'max_depth': 11, 'num_leaves': 18}. Best is trial 1 with value: 0.03386998075382093.[0m



[96]	valid_0's binary_logloss: 0.0340596
[97]	valid_0's binary_logloss: 0.0340545
[98]	valid_0's binary_logloss: 0.0340721
[99]	valid_0's binary_logloss: 0.0340617
[100]	valid_0's binary_logloss: 0.034062
[101]	valid_0's binary_logloss: 0.0340406
[102]	valid_0's binary_logloss: 0.0340027
[103]	valid_0's binary_logloss: 0.0339866
[104]	valid_0's binary_logloss: 0.0340068
[105]	valid_0's binary_logloss: 0.0340092
[106]	valid_0's binary_logloss: 0.0340308
[107]	valid_0's binary_logloss: 0.0340298
[108]	valid_0's binary_logloss: 0.0340334
[109]	valid_0's binary_logloss: 0.0340276
[110]	valid_0's binary_logloss: 0.034031
[111]	valid_0's binary_logloss: 0.0340202
[112]	valid_0's binary_logloss: 0.033977
[113]	valid_0's binary_logloss: 0.0339796
[114]	valid_0's binary_logloss: 0.0339645
[115]	valid_0's binary_logloss: 0.0339645
[116]	valid_0's binary_logloss: 0.0339645
[117]	valid_0's binary_logloss: 0.0339645
[118]	valid_0's binary_logloss: 0.0339645
[119]	valid_0's binary_logloss: 0.033964

In [18]:
study.best_trial.params

{'max_depth': 11, 'num_leaves': 12}

In [19]:
params = {
    'task': 'train', 
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':'binary_logloss', 
    'verbose':-10000000,
    'seed':42,
    'max_bin':128,
    'n_estimators':128,
    # 'n_estimators':trial.suggest_int("n_estimators", 800, 1200),
    'learning_rate':0.08,
    'feature_fraction':1.0,
    'bagging_fraction':1.0,
   # 'bagging_freq':trial.suggest_categorical("bagging_freq", [1]),
    'min_data_in_leaf':200,
    'min_gain_to_split':1.0,
}
    

In [20]:
params.update(study.best_trial.params)

In [22]:
tuned_model = lgb.LGBMClassifier(**params)
tuned_model.fit(X_train, y_train, 
        eval_set=[(X_val, y_val)],
        eval_metric=['binary_logloss'],
        early_stopping_rounds=100) 



[1]	valid_0's binary_logloss: 0.246505
[2]	valid_0's binary_logloss: 0.212586
[3]	valid_0's binary_logloss: 0.188484
[4]	valid_0's binary_logloss: 0.169664
[5]	valid_0's binary_logloss: 0.154295
[6]	valid_0's binary_logloss: 0.141353
[7]	valid_0's binary_logloss: 0.130239
[8]	valid_0's binary_logloss: 0.12061
[9]	valid_0's binary_logloss: 0.112134
[10]	valid_0's binary_logloss: 0.104625
[11]	valid_0's binary_logloss: 0.0979358
[12]	valid_0's binary_logloss: 0.09198
[13]	valid_0's binary_logloss: 0.086669
[14]	valid_0's binary_logloss: 0.0818472
[15]	valid_0's binary_logloss: 0.0775802
[16]	valid_0's binary_logloss: 0.0737042
[17]	valid_0's binary_logloss: 0.0701966
[18]	valid_0's binary_logloss: 0.067032
[19]	valid_0's binary_logloss: 0.0641329
[20]	valid_0's binary_logloss: 0.0615132
[21]	valid_0's binary_logloss: 0.0591508
[22]	valid_0's binary_logloss: 0.0570212
[23]	valid_0's binary_logloss: 0.0550377
[24]	valid_0's binary_logloss: 0.0532273
[25]	valid_0's binary_logloss: 0.0515978

LGBMClassifier(bagging_fraction=1.0, feature_fraction=1.0, learning_rate=0.08,
               max_bin=128, max_depth=11, metric='binary_logloss',
               min_data_in_leaf=200, min_gain_to_split=1.0, n_estimators=128,
               num_leaves=12, objective='binary', seed=42, task='train',
               verbose=-10000000)

In [None]:
X_test.shape

(11757, 36)

In [23]:
preds = tuned_model.predict_proba(X_test)
testloss = log_loss(y_test, preds)
testloss

0.030570310091813074

In [24]:
train_preds = tuned_model.predict_proba(X_train)
trainloss = log_loss(y_train, train_preds)
trainloss

0.026942563847874208

In [25]:
val_preds = tuned_model.predict_proba(X_val)
valloss = log_loss(y_val, val_preds)
valloss

0.03386998075382093

# Submission Pipeline

In [None]:
# read the real test data from test.csv
test_data = pd.read_csv("test.csv")

In [None]:
X_testtest = test_data.drop(['id'], axis=1)
ID_test = test_data.id

In [None]:
X_testtest = poly.fit_transform(X_testtest)

In [None]:
preds = tuned_model.predict_proba(X_testtest)

In [None]:
preds

array([[9.99515200e-01, 4.84800360e-04],
       [9.99230283e-01, 7.69716583e-04],
       [9.99826850e-01, 1.73149779e-04],
       ...,
       [9.99832612e-01, 1.67387506e-04],
       [9.35309601e-01, 6.46903993e-02],
       [1.61411435e-02, 9.83858856e-01]])

In [None]:
preds=  preds[:,1]

In [None]:
preds

array([4.84800360e-04, 7.69716583e-04, 1.73149779e-04, ...,
       1.67387506e-04, 6.46903993e-02, 9.83858856e-01])

In [None]:
preds.shape

(78377,)

In [None]:
# Save test predictions to file
submission = pd.DataFrame({'id': ID_test,
                       'Class': preds})
submission.to_csv('submission_optuna_lgbm.csv', index=False)