# 課題："Titanic: Machine Learning from Disaster" from Kaggle

Webサイト：https://www.kaggle.com/c/titanic

In [1]:
% matplotlib inline

from __future__ import print_function

import copy

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error

## 1.データ前処理
### カラムの整理

In [2]:
#データ読み込み
train_data = pd.read_csv("train_mod.csv")
test_data = pd.read_csv("test_mod.csv")

In [3]:
#使わない行を削除
del train_data['Name']
del train_data['Cabin']
del train_data['Ticket']
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Age_category,SibSp,Parch,Fare,Fare_category,Embarked
0,804,1,3,male,0.42,0,0,1,8.5167,0,C
1,206,0,3,female,2.0,0,0,1,10.4625,1,S
2,173,1,3,female,1.0,0,1,1,11.1333,1,S
3,870,1,3,male,4.0,0,1,1,11.1333,1,S
4,480,1,3,female,2.0,0,0,1,12.2875,1,S


In [4]:
del test_data['Name']
del test_data['Cabin']
del test_data['Ticket']
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Age_category,SibSp,Parch,Fare,Fare_category,Embarked
0,913,3,male,9.0,0,0,1,3.1708,0,S
1,1199,3,male,0.83,0,0,1,9.35,0,S
2,1155,3,female,1.0,0,1,1,12.1833,1,S
3,1173,3,male,0.75,0,1,1,13.775,1,S
4,1301,3,female,3.0,0,1,1,13.775,1,S


In [5]:
#カテゴリカル変数をダミー変数に変換
train_data_2 = pd.get_dummies(train_data,dummy_na=True)
test_data_2 = pd.get_dummies(test_data,dummy_na=True)

In [6]:
train_data_2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Age_category,SibSp,Parch,Fare,Fare_category,Sex_female,Sex_male,Sex_nan,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,804,1,3,0.42,0,0,1,8.5167,0,0,1,0,1,0,0,0
1,206,0,3,2.0,0,0,1,10.4625,1,1,0,0,0,0,1,0
2,173,1,3,1.0,0,1,1,11.1333,1,1,0,0,0,0,1,0
3,870,1,3,4.0,0,1,1,11.1333,1,0,1,0,0,0,1,0
4,480,1,3,2.0,0,0,1,12.2875,1,1,0,0,0,0,1,0


In [7]:
test_data_2.head()

Unnamed: 0,PassengerId,Pclass,Age,Age_category,SibSp,Parch,Fare,Fare_category,Sex_female,Sex_male,Sex_nan,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,913,3,9.0,0,0,1,3.1708,0,0,1,0,0,0,1,0
1,1199,3,0.83,0,0,1,9.35,0,0,1,0,0,0,1,0
2,1155,3,1.0,0,1,1,12.1833,1,1,0,0,0,0,1,0
3,1173,3,0.75,0,1,1,13.775,1,0,1,0,0,0,1,0
4,1301,3,3.0,0,1,1,13.775,1,1,0,0,0,0,1,0


In [8]:
#Age、Fareは幅広い値となっているためカテゴリを作成し、AgeCategoryとFareCategoryを使う
#Age_categoryは年代
#Fare_categoryは0:0-10未満,1:10-50未満,2:50-100未満,3:100以上
train_data_2['Fare_category'].value_counts()

1    394
0    336
2    108
3     53
Name: Fare_category, dtype: int64

In [9]:
train_data_2['Age_category'].value_counts()

3    344
2    220
1    102
4     89
0     62
5     48
6     19
7      6
8      1
Name: Age_category, dtype: int64

In [10]:
#使用列整理
#Age、Fareは幅広い値となっているためカテゴリを作成し、AgeCategoryとFareCategoryを使う
target_col = 'Survived'
exclude_cols = ['PassengerId','Survived','Age','Fare']
feature_cols = [col for col in train_data_2.columns if col not in exclude_cols]

## 2.モデリング

In [11]:
#trainデータを、さらにtrain:test=7:3に分ける

y = np.array(train_data_2[target_col])
X = np.array(train_data_2[feature_cols])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1234)

In [12]:
#ランダムフォレストでモデル作成
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=150,oob_score=True)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=150, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

In [13]:
#accuracyを計算
ypred = rf.predict(X_test)

from sklearn import metrics
print('Accuracy:\n', metrics.accuracy_score(y_test, ypred))

Accuracy:
 0.798507462687


In [14]:
#テストデータに適用
X_test_test = np.array(test_data_2[feature_cols])
ypred_test = rf.predict(X_test_test)

In [15]:
ypred_test_df = pd.DataFrame(ypred_test)
result = pd.concat([test_data_2.PassengerId,ypred_test_df],axis=1)
result.rename(columns = {"0":"Survived"},inplace=True)

In [16]:
#CSVファイルに書き込み
result.to_csv('titanic_result_3',index=None)