In [1]:
import numpy as np
import pandas as  pd

In [2]:
!ls ./input

gender_submission.csv  test.csv  train.csv


In [3]:
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")
gender_submission = pd.read_csv("./input/gender_submission.csv")

In [4]:
data = pd.concat([train, test], sort=False)

In [5]:
data['Sex'].replace(['male', 'female'], [0,1], inplace=True)

In [6]:
data['Embarked'].fillna(('S'), inplace=True)
data['Embarked'] = data['Embarked'].map({'S':0,'C':1,'Q':2}).astype(int)

In [7]:
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)

In [8]:
data['Age'].fillna(data['Age'].median(), inplace=True)

In [9]:
data['FamilySize'] = data['Parch'] + data['SibSp'] + 1
data['IsAlone'] = 0
data.loc[data['FamilySize'] == 1, 'IsAlone'] = 1

In [10]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,2,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,2,0
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,1,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,2,0
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,1,1


In [11]:
delete_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

train = data[:len(train)]
test = data[len(train):]

y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)

In [12]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,3,0,22.0,1,0,7.25,0,2,0
1,1,1,38.0,1,0,71.2833,1,2,0
2,3,1,26.0,0,0,7.925,0,1,1
3,1,1,35.0,1,0,53.1,0,2,0
4,3,0,35.0,0,0,8.05,0,1,1


# 機械学習アルゴリズム

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

In [14]:
categorical_features = ['Embarked', 'Pclass', 'Sex']

In [24]:
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

params = {
    'objective': 'binary',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40
}

model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(10)]
)

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iter



In [25]:
y_pred[:10]

array([0.03935242, 0.52380086, 0.11487096, 0.07538967, 0.39761111,
       0.41886672, 0.7023978 , 0.13190461, 0.74563405, 0.0485806 ])

In [26]:
y_pred = (y_pred > 0.5).astype(int)
y_pred[:10]

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0])

In [27]:
sub = gender_submission

In [28]:
sub['Survived'] = y_pred
sub.to_csv("submission_lightgbm.csv", index=False)

sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


In [32]:
import optuna
from sklearn.metrics import log_loss

def objective(trial):
    params = {
        'objective': 'binary',
        'max_bin': trial.suggest_int('max_bin', 255, 500),
        'learning_rate': 0.05,
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
    }
    
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

    model = lgb.train(
        params, lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10,
        num_boost_round=1000,
        early_stopping_rounds=10
    )

    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    score = log_loss(y_valid, y_pred_valid)
    return score

In [33]:
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
study.optimize(objective, n_trials=40)
study.best_params

[32m[I 2023-02-21 21:45:25,715][0m A new study created in memory with name: no-name-772bd743-6576-4d83-9fa3-b2610bbba7bb[0m
[32m[I 2023-02-21 21:45:25,841][0m Trial 0 finished with value: 0.43594874611252654 and parameters: {'max_bin': 390, 'num_leaves': 101}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:25,900][0m Trial 1 finished with value: 0.43594874611252654 and parameters: {'max_bin': 403, 'num_leaves': 84}. Best is trial 0 with value: 0.43594874611252654.[0m


[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iter

[32m[I 2023-02-21 21:45:25,993][0m Trial 2 finished with value: 0.43594874611252654 and parameters: {'max_bin': 359, 'num_leaves': 94}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:26,057][0m Trial 3 finished with value: 0.43594874611252654 and parameters: {'max_bin': 362, 'num_leaves': 118}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:26,117][0m Trial 4 finished with value: 0.43594874611252654 and parameters: {'max_bin': 492, 'num_leaves': 69}. Best is trial 0 with value: 0.43594874611252654.[0m


[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0

[32m[I 2023-02-21 21:45:26,193][0m Trial 5 finished with value: 0.43594874611252654 and parameters: {'max_bin': 449, 'num_leaves': 83}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:26,251][0m Trial 6 finished with value: 0.43594874611252654 and parameters: {'max_bin': 394, 'num_leaves': 121}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:26,308][0m Trial 7 finished with value: 0.43594874611252654 and parameters: {'max_bin': 272, 'num_leaves': 40}. Best is trial 0 with value: 0.43594874611252654.[0m


[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0

[32m[I 2023-02-21 21:45:26,372][0m Trial 8 finished with value: 0.43594874611252654 and parameters: {'max_bin': 259, 'num_leaves': 112}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:26,438][0m Trial 9 finished with value: 0.43594874611252654 and parameters: {'max_bin': 446, 'num_leaves': 116}. Best is trial 0 with value: 0.43594874611252654.[0m


[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0

[32m[I 2023-02-21 21:45:26,617][0m Trial 10 finished with value: 0.43594874611252654 and parameters: {'max_bin': 495, 'num_leaves': 109}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:26,688][0m Trial 11 finished with value: 0.43594874611252654 and parameters: {'max_bin': 368, 'num_leaves': 107}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:26,743][0m Trial 12 finished with value: 0.43594874611252654 and parameters: {'max_bin': 284, 'num_leaves': 94}. Best is trial 0 with value: 0.43594874611252654.[0m


[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10

[32m[I 2023-02-21 21:45:26,823][0m Trial 13 finished with value: 0.43594874611252654 and parameters: {'max_bin': 290, 'num_leaves': 123}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:26,879][0m Trial 14 finished with value: 0.43594874611252654 and parameters: {'max_bin': 383, 'num_leaves': 72}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:26,942][0m Trial 15 finished with value: 0.43594874611252654 and parameters: {'max_bin': 320, 'num_leaves': 107}. Best is trial 0 with value: 0.43594874611252654.[0m


[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10

[32m[I 2023-02-21 21:45:27,017][0m Trial 16 finished with value: 0.43594874611252654 and parameters: {'max_bin': 367, 'num_leaves': 87}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:27,100][0m Trial 17 finished with value: 0.43594874611252654 and parameters: {'max_bin': 259, 'num_leaves': 91}. Best is trial 0 with value: 0.43594874611252654.[0m


[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]

[32m[I 2023-02-21 21:45:27,181][0m Trial 18 finished with value: 0.43594874611252654 and parameters: {'max_bin': 405, 'num_leaves': 91}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:27,250][0m Trial 19 finished with value: 0.43594874611252654 and parameters: {'max_bin': 487, 'num_leaves': 98}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:27,310][0m Trial 20 finished with value: 0.43594874611252654 and parameters: {'max_bin': 343, 'num_leaves': 74}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:27,367][0m Trial 21 finished with value: 0.43594874611252654 and parameters: {'max_bin': 426, 'num_leaves': 37}. Best is trial 0 with value: 0.43594874611252654.[0m


[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iter

[32m[I 2023-02-21 21:45:27,451][0m Trial 22 finished with value: 0.43594874611252654 and parameters: {'max_bin': 419, 'num_leaves': 97}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:27,520][0m Trial 23 finished with value: 0.43594874611252654 and parameters: {'max_bin': 306, 'num_leaves': 44}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:27,578][0m Trial 24 finished with value: 0.43594874611252654 and parameters: {'max_bin': 332, 'num_leaves': 67}. Best is trial 0 with value: 0.43594874611252654.[0m


[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10

[32m[I 2023-02-21 21:45:27,653][0m Trial 25 finished with value: 0.43594874611252654 and parameters: {'max_bin': 395, 'num_leaves': 74}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:27,716][0m Trial 26 finished with value: 0.43594874611252654 and parameters: {'max_bin': 498, 'num_leaves': 41}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:27,776][0m Trial 27 finished with value: 0.43594874611252654 and parameters: {'max_bin': 306, 'num_leaves': 47}. Best is trial 0 with value: 0.43594874611252654.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.

[32m[I 2023-02-21 21:45:27,846][0m Trial 28 finished with value: 0.43594874611252654 and parameters: {'max_bin': 415, 'num_leaves': 56}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:27,900][0m Trial 29 finished with value: 0.43594874611252654 and parameters: {'max_bin': 369, 'num_leaves': 55}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:27,974][0m Trial 30 finished with value: 0.43594874611252654 and parameters: {'max_bin': 294, 'num_leaves': 42}. Best is trial 0 with value: 0.43594874611252654.[0m


[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg

[32m[I 2023-02-21 21:45:28,057][0m Trial 31 finished with value: 0.43594874611252654 and parameters: {'max_bin': 416, 'num_leaves': 45}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:28,165][0m Trial 32 finished with value: 0.43594874611252654 and parameters: {'max_bin': 303, 'num_leaves': 67}. Best is trial 0 with value: 0.43594874611252654.[0m


[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start trainin

[32m[I 2023-02-21 21:45:28,266][0m Trial 33 finished with value: 0.43594874611252654 and parameters: {'max_bin': 456, 'num_leaves': 41}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:28,321][0m Trial 34 finished with value: 0.43594874611252654 and parameters: {'max_bin': 461, 'num_leaves': 41}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:28,378][0m Trial 35 finished with value: 0.43594874611252654 and parameters: {'max_bin': 495, 'num_leaves': 77}. Best is trial 0 with value: 0.43594874611252654.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.

[32m[I 2023-02-21 21:45:28,473][0m Trial 36 finished with value: 0.43594874611252654 and parameters: {'max_bin': 495, 'num_leaves': 90}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:28,533][0m Trial 37 finished with value: 0.43594874611252654 and parameters: {'max_bin': 436, 'num_leaves': 35}. Best is trial 0 with value: 0.43594874611252654.[0m
[32m[I 2023-02-21 21:45:28,589][0m Trial 38 finished with value: 0.43594874611252654 and parameters: {'max_bin': 324, 'num_leaves': 43}. Best is trial 0 with value: 0.43594874611252654.[0m


[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949
[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0

[32m[I 2023-02-21 21:45:28,661][0m Trial 39 finished with value: 0.43594874611252654 and parameters: {'max_bin': 327, 'num_leaves': 43}. Best is trial 0 with value: 0.43594874611252654.[0m


[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949


{'max_bin': 390, 'num_leaves': 101}

In [34]:
params = {
    'objective': 'binary',
    'max_bin': study.best_params['max_bin'],
    'learning_rate': 0.05,
    'num_leaves': study.best_params['num_leaves']
}

lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)

y_pred = model.predict(X_test, num_iteration=model.best_iteration)



[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179




Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.505696	valid_1's binary_logloss: 0.532008
[20]	training's binary_logloss: 0.422967	valid_1's binary_logloss: 0.478904
[30]	training's binary_logloss: 0.373496	valid_1's binary_logloss: 0.448045
[40]	training's binary_logloss: 0.342723	valid_1's binary_logloss: 0.439465
[50]	training's binary_logloss: 0.318708	valid_1's binary_logloss: 0.438633
[60]	training's binary_logloss: 0.299108	valid_1's binary_logloss: 0.436609
[70]	training's binary_logloss: 0.282689	valid_1's binary_logloss: 0.437435
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.2972	valid_1's binary_logloss: 0.435949


In [35]:
y_pred = (y_pred > 0.5).astype(int)

sub['Survived'] = y_pred
sub.to_csv("submission_lightgbm_optuna.csv", index = False)

sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


# Validation

In [38]:
from sklearn.model_selection import KFold


y_preds = []
models = []
oof_train = np.zeros((len(X_train),))
cv = KFold(n_splits=5, shuffle=True, random_state=0)

categorical_features = ['Embarked', 'Pclass', 'Sex']

params = {
    'objective': 'binary',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40
}

for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train)):
    X_tr = X_train.loc[train_index, :]
    X_val = X_train.loc[valid_index, :]
    y_tr = y_train[train_index]
    y_val = y_train[valid_index]

    lgb_train = lgb.Dataset(X_tr, y_tr,
                            categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val,
                           reference=lgb_train,
                           categorical_feature=categorical_features)

    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      verbose_eval=10,
                      num_boost_round=1000,
                      early_stopping_rounds=10)

    oof_train[valid_index] = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    y_preds.append(y_pred)
    models.append(model)

KeyError: '[0, 9, 18, 23, 24, 36, 42, 47, 57, 59, 72, 73, 82, 83, 86, 89, 96, 99, 113, 114, 116, 121, 126, 128, 130, 132, 135, 137, 143, 146, 147, 148, 152, 153, 174, 180, 182, 188, 191, 192, 199, 207, 209, 215, 218, 221, 227, 228, 232, 256, 260, 267, 268, 276, 277, 280, 282, 283, 284, 285, 288, 293, 302, 303, 304, 306, 309, 310, 314, 316, 318, 319, 324, 327, 328, 330, 331, 336, 340, 342, 345, 347, 371, 372, 376, 381, 390, 391, 394, 400, 412, 416, 419, 420, 421, 425, 427, 429, 434, 436, 437, 438, 440, 442, 445, 448, 449, 452, 458, 461, 464, 469, 471, 475, 478, 479, 480, 483, 487, 500, 510, 512, 513, 525, 528, 529, 533, 536, 543, 548, 551, 556, 562, 569, 572, 575, 580, 589, 592, 593, 594, 598, 599, 604, 606, 607, 610, 611, 612, 613, 620] not in index'