In [14]:
import pandas as pd
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping, log_evaluation
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

# pd.set_option('display.max_rows', None)

In [None]:
data = pd.read_csv('dataset/train.csv')

In [None]:
data.head()

In [None]:
# 欠損値の確認
data.isnull().sum()

In [None]:
# Survivedのvalue_counts
data['Survived'].value_counts()

In [None]:
pclass_survived = data.groupby(['Survived', 'Pclass']).size()
pclass_survived

In [None]:
# Ageのvalue_counts
age_survived = data.groupby(['Survived', 'Age'], dropna=False).size()
pd.DataFrame(age_survived)

In [None]:
data[data['Age'].isnull()]

In [None]:
data[data['Age'].isnull()].groupby(['Survived', 'Pclass', 'Embarked']).size()

In [15]:
def label_encode(data):
    label_encoders = {}
    categorical_columns = ['Sex', 'Embarked', 'Cabin', 'Ticket', 'Name']
    for col in categorical_columns:
        label_encoders[col] = LabelEncoder()
        data[col] = label_encoders[col].fit_transform(data[col].astype(str))

    return data

In [16]:
def feture_engineering(train_data, test_data):
    # fill in missing data
    train_data.fillna(0, inplace=True)
    test_data.fillna(0, inplace=True)

    # label encoding
    train_data = label_encode(train_data)
    test_data = label_encode(test_data)

    return train_data, test_data

In [17]:
def train_data_setting(train_df, target_col, exclude_cols):
    # カラムの設定
    feture_cols = []
    for col in train_df.columns:
        if col not in exclude_cols:
            feture_cols.append(col)

    # 説明変数と目的変数にデータを分割
    x = train_df[feture_cols]
    y = train_df[target_col]

    return x, y

In [21]:
def model_fit(model, x, y):
    if model == 'lgbm':
        param_grid = {
            'num_leaves': [31, 50, 100, 200],
            'learning_rate': [0.01, 0.05, 0.1, 0.5],
            'n_estimators': [100, 200, 300, 400],
            'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        }

        model = GridSearchCV(
            estimator=LGBMClassifier(),
            param_grid=param_grid,
            scoring='roc_auc',
            cv=5
        )

        model.fit(x, y)
    
    return model

In [22]:
# read data
train_data = pd.read_csv('dataset/train.csv')
test_data = pd.read_csv('dataset/test.csv')

# feture engineering
train_data, test_data = feture_engineering(train_data, test_data)

# train_data setting
target_col = 'Survived'
exclude_cols = [target_col]
x, y = train_data_setting(train_data, target_col, exclude_cols)

# model fitting
model = 'lgbm'
model = model_fit(model, x, y)

'''
# predict test_data
predict = model.predict(test_data)
result = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predict
})

# create csv file
result.to_csv('dataset/submission.csv', index=False)

'''

[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 965
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 963
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371
[LightGBM] [Info] Number o

"\n# predict test_data\npredict = model.predict(test_data)\nresult = pd.DataFrame({\n    'PassengerId': test_data['PassengerId'],\n    'Survived': predict\n})\n\n# create csv file\nresult.to_csv('dataset/submission.csv', index=False)\n\n"

In [23]:
print("Best parameters:", model.best_params_)
print("Best AUC score:", model.best_score_)

Best parameters: {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}
Best AUC score: 0.8744411832252936
