In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import display

from category_encoders import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold

import lightgbm as lgb

In [2]:
# Load the data
train = pd.read_csv('input/train.csv')

# Simple feature engineering
train['CabinRank'] = train['Cabin'].str[0]  # first character of the Cabin string
train['FamilySize'] = train['SibSp'] + train['Parch']
train = train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Convert string columns to integer labels
enc = OrdinalEncoder()
X_train = enc.fit_transform(train.drop(columns='Survived')).astype(float)
y_train = train['Survived'].values

# Fill missing ages with the mean age.
# The filled column is assigned back instead of calling
# `X_train.Age.fillna(..., inplace=True)`: an in-place call on a column
# selected from the frame acts on a temporary object under pandas
# Copy-on-Write (and raises a FutureWarning in pandas 2.x), which would
# leave the NaNs in X_train.
# NOTE(review): the mean is computed over the full training frame before any
# CV split, so every fold shares this imputation statistic.
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].mean())

In [3]:
# Peek at five reproducibly sampled rows of the encoded features, in index order.
(
    X_train
    .sample(n=5, random_state=0)
    .sort_index()
)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,FamilySize,Sex,Embarked,CabinRank
31,1.0,29.699118,1.0,0.0,146.5208,1.0,2.0,2.0,7.0
255,3.0,29.0,0.0,2.0,15.2458,2.0,2.0,2.0,1.0
278,3.0,7.0,4.0,1.0,29.125,5.0,1.0,3.0,1.0
495,3.0,29.699118,0.0,0.0,14.4583,0.0,1.0,2.0,1.0
648,3.0,29.699118,0.0,0.0,7.55,0.0,1.0,1.0,1.0


## パラメータ

In [4]:
# Keyword arguments passed to both RandomForestClassifier and LGBMClassifier
# below: 100 trees / boosting rounds and a fixed seed.
params = dict(n_estimators=100, random_state=0)

# Shuffled 5-fold splitter with a fixed seed; the same object is reused for
# every cross_val_score call in the comparisons below.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)

## RandomForest

In [5]:
# Compare per-fold CV scores of a random forest under different feature scalings.
# Each key becomes a column of result_df; None means "no scaler in front of the model".
scalers = {
    'initial': None,
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler(),
}

result_df = pd.DataFrame()
for label, scaler in scalers.items():
    forest = RandomForestClassifier(**params)
    # Scalers are placed in a pipeline, so cross_val_score fits them per fold.
    model = forest if scaler is None else make_pipeline(scaler, forest)
    result_df[label] = cross_val_score(model, X_train, y_train, cv=cv)

# log1p is applied to the whole feature frame before cross-validation,
# not inside a pipeline.
model = RandomForestClassifier(**params)
result_df['log1p'] = cross_val_score(model, np.log1p(X_train), y_train, cv=cv)

# One row per preprocessing variant, one column per fold.
display(result_df.T)

Unnamed: 0,0,1,2,3,4
initial,0.837989,0.803371,0.842697,0.792135,0.786517
StandardScaler,0.837989,0.803371,0.842697,0.792135,0.786517
MinMaxScaler,0.837989,0.803371,0.842697,0.792135,0.786517
RobustScaler,0.837989,0.803371,0.842697,0.792135,0.786517
log1p,0.837989,0.792135,0.848315,0.792135,0.792135


## LightGBM

In [6]:
# Compare per-fold CV scores of a LightGBM classifier under different feature scalings.
# Each (label, estimator) pair yields one column of result_df, in this order.
candidates = [
    ('initial', lgb.LGBMClassifier(**params)),
    ('StandardScaler', make_pipeline(StandardScaler(), lgb.LGBMClassifier(**params))),
    ('MinMaxScaler', make_pipeline(MinMaxScaler(), lgb.LGBMClassifier(**params))),
    ('RobustScaler', make_pipeline(RobustScaler(), lgb.LGBMClassifier(**params))),
]

result_df = pd.DataFrame()
for column, model in candidates:
    result_df[column] = cross_val_score(model, X_train, y_train, cv=cv)

# log1p is applied to the whole feature frame before cross-validation,
# not inside a pipeline.
model = lgb.LGBMClassifier(**params)
result_df['log1p'] = cross_val_score(model, np.log1p(X_train), y_train, cv=cv)

# One row per preprocessing variant, one column per fold.
display(result_df.T)

Unnamed: 0,0,1,2,3,4
initial,0.860335,0.792135,0.814607,0.786517,0.797753
StandardScaler,0.860335,0.792135,0.814607,0.786517,0.797753
MinMaxScaler,0.860335,0.792135,0.814607,0.786517,0.797753
RobustScaler,0.860335,0.792135,0.814607,0.786517,0.797753
log1p,0.860335,0.792135,0.814607,0.786517,0.797753
