In [259]:
import warnings

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Directory holding the input CSV files. The trailing slash matters because
# file names are appended to it directly.
path = "../data/"

# Read the training and test tables.
train = pd.read_csv(f"{path}train.csv")
test = pd.read_csv(f"{path}test.csv")

# Stack both tables into a single frame so feature engineering can be written
# once. Row labels are kept as they were in each table (no re-indexing).
df = pd.concat((train, test), sort=False)

In [260]:
# Sanity check: (rows, columns) of the raw test table.
test.shape

(418, 11)

### Title

In [261]:
def _title_from_name(name):
    """Return the text between the first ',' and the next '.' of a name, stripped."""
    after_comma = name.split(',')[1]
    return after_comma.split('.')[0].strip()


# Titles that are merged into a single 'Rare Title' bucket.
# NOTE(review): a later cell drops a 'Title_the Countess' column, which suggests
# the extracted value is 'the Countess' and is not matched by 'Countess' - confirm.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

# Spelling variants folded into the common titles, plus the rare bucket.
_title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
_title_aliases.update({rare: 'Rare Title' for rare in rare_titles})

# Apply the same extraction and consolidation to the combined and test frames.
for _frame in (df, test):
    _frame['Title'] = _frame['Name'].apply(_title_from_name).replace(_title_aliases)

### FamilySize

In [262]:
# FamilySize = SibSp + Parch + 1 (the passenger themself).
# IsAlone starts at 1 and is set to 0 wherever FamilySize exceeds 1.
#
# The flag is written with one .loc[row_mask, column] call. The previous
# frame['IsAlone'].loc[mask] = 0 form is chained assignment: pandas may write
# to a temporary object, and under copy-on-write the frame is never updated.
for _frame in (df, test):
    _frame['FamilySize'] = _frame['SibSp'] + _frame['Parch'] + 1
    _frame['IsAlone'] = 1
    _frame.loc[_frame['FamilySize'] > 1, 'IsAlone'] = 0


### 補完

In [263]:
# --- Age -------------------------------------------------------------------
# Fill missing ages with the median age of rows sharing the same Title.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on frame['Age'] is chained assignment and is not guaranteed to modify the
# frame (it never does under pandas copy-on-write).
df['Age'] = df['Age'].fillna(df.groupby('Title')['Age'].transform('median'))
test['Age'] = test['Age'].fillna(test.groupby('Title')['Age'].transform('median'))

# Fixed age bands: (0, 10], (10, 20], (20, 40], (40, 60], (60, 120].
age_edges = [0, 10, 20, 40, 60, 120]
age_labels = ['Child', 'Teen', 'Young Adult', 'Adult', 'Senior']
df['AgeBin'] = pd.cut(df['Age'], bins=age_edges, labels=age_labels)
test['AgeBin'] = pd.cut(test['Age'], bins=age_edges, labels=age_labels)

# --- Fare ------------------------------------------------------------------
# Fill missing fares with the median fare of rows in the same Pclass.
df['Fare'] = df['Fare'].fillna(df.groupby('Pclass')['Fare'].transform('median'))
test['Fare'] = test['Fare'].fillna(test.groupby('Pclass')['Fare'].transform('median'))

# Fare terciles are computed once on the combined frame and the resulting bin
# edges are reused for the test frame. Running qcut separately on `test` would
# place the Low/Mid/High boundaries at different fares than the ones the
# training rows were binned with, so the same label would mean different things.
fare_labels = ['Low', 'Mid', 'High']
df['FareBin'], fare_edges = pd.qcut(df['Fare'], 3, labels=fare_labels, retbins=True)
test['FareBin'] = pd.cut(test['Fare'], bins=fare_edges, labels=fare_labels, include_lowest=True)

# --- Embarked --------------------------------------------------------------
# Missing embarkation ports are filled with "S" (described by the original
# author as the most frequent value).
df['Embarked'] = df['Embarked'].fillna("S")
test['Embarked'] = test['Embarked'].fillna("S")


In [264]:
# Report the number of missing values per column before columns are removed.
print(df.isnull().sum())

# Raw columns that are not passed on to the models, including the identifier.
_raw_columns = ['Cabin', 'Ticket', 'Age', 'Fare', 'SibSp', 'Parch', 'Name', 'FamilySize', 'PassengerId']
# Numeric encoding of Sex: male -> 0, female -> 1.
_sex_codes = {'Sex': {'male': 0, 'female': 1}}

df = df.drop(columns=_raw_columns).replace(_sex_codes)
test = test.drop(columns=_raw_columns).replace(_sex_codes)

PassengerId       0
Perished        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          0
Title             0
FamilySize        0
IsAlone           0
AgeBin            0
FareBin           0
dtype: int64


In [265]:
# Preview the first ten rows of the combined frame after the column drops.
df.head(10)



Unnamed: 0,Perished,Pclass,Sex,Embarked,Title,IsAlone,AgeBin,FareBin
0,1.0,3,0,S,Mr,0,Young Adult,Low
1,0.0,1,1,C,Mrs,0,Young Adult,High
2,0.0,3,1,S,Miss,1,Young Adult,Low
3,0.0,1,1,S,Mrs,0,Young Adult,High
4,1.0,3,0,S,Mr,1,Young Adult,Low
5,1.0,3,0,Q,Mr,1,Young Adult,Low
6,1.0,1,0,S,Mr,1,Adult,High
7,1.0,3,0,S,Master,0,Child,Mid
8,0.0,3,1,S,Mrs,0,Young Adult,Mid
9,0.0,2,1,C,Mrs,0,Teen,High


### One-hot

In [267]:
# Preview the first ten rows of the test frame.
# NOTE(review): in top-to-bottom order this cell runs before the one-hot
# encoding cell that follows, yet the recorded output already shows the
# encoded columns - it was evidently executed out of order.
test.head(10)

Unnamed: 0,Sex,IsAlone,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare Title,...,AgeBin_Teen,AgeBin_Young Adult,AgeBin_Adult,AgeBin_Senior,FareBin_Low,FareBin_Mid,FareBin_High,Pclass_1,Pclass_2,Pclass_3
0,0,1,False,True,False,False,False,True,False,False,...,False,True,False,False,True,False,False,False,False,True
1,1,0,False,False,True,False,False,False,True,False,...,False,False,True,False,True,False,False,False,False,True
2,0,1,False,True,False,False,False,True,False,False,...,False,False,False,True,False,True,False,False,True,False
3,0,1,False,False,True,False,False,True,False,False,...,False,True,False,False,False,True,False,False,False,True
4,1,0,False,False,True,False,False,False,True,False,...,False,True,False,False,False,True,False,False,False,True
5,0,1,False,False,True,False,False,True,False,False,...,True,False,False,False,False,True,False,False,False,True
6,1,1,False,True,False,False,True,False,False,False,...,False,True,False,False,True,False,False,False,False,True
7,0,0,False,False,True,False,False,True,False,False,...,False,True,False,False,False,False,True,False,True,False
8,1,1,True,False,False,False,False,False,True,False,...,True,False,False,False,True,False,False,False,False,True
9,0,0,False,False,True,False,False,True,False,False,...,False,True,False,False,False,True,False,False,False,True


In [266]:
# Categorical columns to expand into indicator (one-hot) columns, listed in the
# order in which their indicator groups appear in the resulting frames.
one_hot_columns = ['Embarked', 'Title', 'AgeBin', 'FareBin', 'Pclass']

# get_dummies(columns=...) removes each listed column and appends its
# '<column>_<value>' indicators after the remaining columns. This replaces the
# get_dummies / concat / drop sequence that was copy-pasted for every column
# and for both frames.
df = pd.get_dummies(df, columns=one_hot_columns)
test = pd.get_dummies(test, columns=one_hot_columns)

# The 'the Countess' title is not merged into 'Rare Title' upstream; its
# indicator is discarded here, as before. errors='ignore' keeps the cell
# working if that title is ever consolidated earlier in the notebook.
df = df.drop(columns='Title_the Countess', errors='ignore')

# Give the test frame exactly the feature columns of the combined frame, in
# the same order. A category that never occurs in the test data becomes an
# all-zero indicator instead of silently shifting every later column.
feature_columns = [col for col in df.columns if col != 'Perished']
test = test.reindex(columns=feature_columns, fill_value=0)

In [268]:
# (rows, columns) of the combined frame after one-hot encoding.
df.shape

(1309, 22)

### モデル作成

In [285]:
# Split the combined frame back into its training part. `df` was built as
# concat([train, test]), so its first len(train) rows are the training rows.
# Without this slice the test rows (whose 'Perished' label is missing) would
# end up in X / y and model fitting would fail on the NaN labels. A new name is
# used instead of overwriting `df`, so the cell can be re-run safely.
df_train = df.iloc[:len(train)]
df_test = test

# Features: every column except the target. Target: 'Perished'.
X = df_train.drop(columns='Perished').to_numpy(dtype=float)
y = df_train['Perished'].to_numpy()

# Unscaled test feature matrix.
X_test = df_test.to_numpy(dtype=float)

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

print(X_train_scaled.shape)
print(X_valid_scaled.shape)
print(X_test.shape)
print(y_train.shape)




(623, 21)
(268, 21)
(418, 21)
(623,)


In [284]:
# Dump the training feature array for a quick visual check.
print(X)

[0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0.
 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1.
 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1.
 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1.
 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1.
 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1.
 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1.
 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1.
 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0.
 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1.

In [272]:
# Show both shapes. Only the last expression of a cell is displayed, so the
# two are combined into one tuple instead of discarding the first.
df.shape, df_test.shape

(418, 21)

### ランダムフォレスト

In [286]:
# Random forest with fixed hyper-parameters and a fixed seed.
_rf_params = dict(
    n_estimators=100,
    max_depth=8,
    min_samples_leaf=7,
    min_samples_split=2,
    n_jobs=-1,
    random_state=42,
)
rfc_gs = RandomForestClassifier(**_rf_params)
rfc_gs.fit(X_train_scaled, y_train)

In [238]:
# Accuracy of the fitted forest on the training and validation splits.
# The forest is bound to `rfc_gs`; the name `rfc` is never defined in this
# notebook, so referring to it fails on a fresh kernel.
print('Train Score: {}'.format(round(rfc_gs.score(X_train_scaled, y_train), 3)))
print(' Test Score: {}'.format(round(rfc_gs.score(X_valid_scaled, y_valid), 3)))

Train Score: 0.799
 Test Score: 0.813


### クロスバリデーション

In [273]:
# Candidate values tried by the grid search.
param_grid = dict(
    max_depth=[7, 8, 9, 10],
    min_samples_leaf=[6, 7, 8],
)

# 5-fold cross-validated search over the grid, using a forest with a fixed
# number of trees and a fixed seed as the base estimator.
_base_forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rfc_gs = GridSearchCV(_base_forest, param_grid, cv=5)
rfc_gs.fit(X_train_scaled, y_train)

print('Best Parameters: {}'.format(rfc_gs.best_params_))
print('CV Score: {}'.format(round(rfc_gs.best_score_, 3)))

Best Parameters: {'max_depth': 8, 'min_samples_leaf': 7}
CV Score: 0.828


In [240]:
# Forest fitted with fixed hyper-parameters; fit() returns the estimator
# itself, so it can be bound directly and displayed as the cell result.
rfc_fe = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_leaf=7,
    random_state=42,
    n_jobs=-1,
).fit(X_train_scaled, y_train)
rfc_fe

### ロジスティック回帰

In [274]:
# LogisticRegression is imported in the notebook's import cell; the repeated
# import and the unused StandardScaler import were removed from this cell.

# Hyper-parameter candidates: inverse regularisation strength C and solver.
# An earlier, coarser set of C values was 0.01, 0.09, 0.1, 0.11, 1, 10, 100.
# No `penalty` argument is passed, so all three solvers are fitted with the
# estimator's default penalty; they differ in the optimisation algorithm, not
# in the kind of regularisation applied.
param_grid = {'C': [0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13],
              'solver': ['liblinear', 'lbfgs', 'saga']}

# 5-fold cross-validated grid search scored by accuracy, on all cores.
lr = LogisticRegression(random_state=42)
lr_gs = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Run the search on the scaled training split.
lr_gs.fit(X_train_scaled, y_train)

# Report the selected parameters and the accuracy on the training and
# validation splits (the latter is printed under the label 'Test Score').
print('Best Parameters: {}'.format(lr_gs.best_params_))
print('Train Score: {}'.format(round(lr_gs.score(X_train_scaled, y_train), 3)))
print('Test Score: {}'.format(round(lr_gs.score(X_valid_scaled, y_valid), 3)))

Best Parameters: {'C': 0.08, 'solver': 'lbfgs'}
Train Score: 0.835
Test Score: 0.821


### 多層パーセプトロン

In [275]:
# MLPClassifier is imported in the notebook's import cell; the repeated import
# and the unused StandardScaler import were removed from this cell.

# Multilayer perceptron trained with the Adam solver.
mlpc = MLPClassifier(hidden_layer_sizes=(100, 100, 90),  # three hidden layers
                     alpha=0.001,  # regularisation strength
                     learning_rate_init=0.01,  # explicit initial learning rate
                     max_iter=300,  # upper bound on training iterations
                     random_state=42,
                     early_stopping=True,
                     solver='adam')

# Fit on the scaled training split.
mlpc.fit(X_train_scaled, y_train)

# Report accuracy on the training and validation splits (the latter is
# printed under the label 'Test Score').
print('Multilayer Perceptron \n')
print('Train Score: {}'.format(round(mlpc.score(X_train_scaled, y_train), 3)))
print('Test Score: {}'.format(round(mlpc.score(X_valid_scaled, y_valid), 3)))

Multilayer Perceptron 

Train Score: 0.833
Test Score: 0.821


In [287]:

# All three models were fitted on standardised features, so the test matrix
# has to pass through the same fitted scaler before prediction. Predicting on
# the raw X_test feeds the models values on a different scale than they saw
# during training.
X_test_scaled = scaler.transform(X_test)

# Class-probability estimates from each model.
rfc_pred = rfc_gs.predict_proba(X_test_scaled)
lr_pred = lr_gs.predict_proba(X_test_scaled)
mlpc_pred = mlpc.predict_proba(X_test_scaled)

# Soft voting: average the probabilities and take the most likely column.
# Assumes the probability columns correspond to labels 0 and 1 in that order,
# so the column index can be used directly as the predicted label.
pred_proba = (rfc_pred + lr_pred + mlpc_pred) / 3
pred = pred_proba.argmax(axis=1)

In [288]:
# Not displayed: only a cell's final expression is echoed, and print() follows.
df.shape

# Row counts of the training and test tables.
print(len(train))
print(len(test))

891
418


In [289]:
# Load the sample submission file, which supplies the PassengerId column and
# the expected output layout, then display it.
path = '../data/'
submission = pd.read_csv(f'{path}gender_submission.csv')
submission



Unnamed: 0,PassengerId,Perished
0,892,1
1,893,0
2,894,1
3,895,1
4,896,0
...,...,...
413,1305,1
414,1306,0
415,1307,1
416,1308,1


In [290]:
# Replace the template's labels with the ensemble predictions and export the
# result without the row index.
submission = submission.assign(Perished=pred)
submission.to_csv('../out/banban.csv', index=False)
