In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import pandas_profiling as pp


warnings.filterwarnings('ignore')
%matplotlib inline


print(os.curdir)
print(os.listdir("input"))


.
['test.csv', 'train.csv', 'gender_submission.csv']


In [3]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
data = pd.concat([train, test], sort=False)
data


# pclass： 旅客クラス（1＝1等、2＝2等、3＝3等）。裕福さの目安となる
# name： 乗客の名前
# sex： 性別（male＝男性、female＝女性）
# age： 年齢。一部の乳児は小数値
# sibsp： タイタニック号に同乗している兄弟（Siblings）や配偶者（Spouses）の数
# parch： タイタニック号に同乗している親（Parents）や子供（Children）の数
# ticket： チケット番号
# fare： 旅客運賃
# cabin： 客室番号
# embarked： 出港地（C＝Cherbourg：シェルブール、Q＝Queenstown：クイーンズタウン、S＝Southampton：サウサンプトン）
# boat： 救命ボート番号
# body： 遺体収容時の識別番号
# home.dest： 自宅または目的地
# survived：生存状況（0＝死亡、1＝生存）。通常はこの数値が目的変数として使われる


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [4]:
data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
data['Embarked'].fillna('S', inplace=True)
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)
age_avg = data['Age'].mean()
age_std = data['Age'].std()
data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)

data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,0
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",0,34.0,0,0,A.5. 3236,8.0500,,0
414,1306,,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,1
415,1307,,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,0
416,1308,,3,"Ware, Mr. Frederick",0,34.0,0,0,359309,8.0500,,0


In [5]:
data.describe()
# data.info()


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,1309.0,891.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,655.0,0.383838,2.294882,0.355997,30.708686,0.498854,0.385027,33.295479,0.394194
std,378.020061,0.486592,0.837836,0.478997,12.988552,1.041658,0.86556,51.738879,0.653499
min,1.0,0.0,1.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,328.0,0.0,2.0,0.0,22.0,0.0,0.0,7.8958,0.0
50%,655.0,0.0,3.0,0.0,32.0,0.0,0.0,14.4542,0.0
75%,982.0,1.0,3.0,1.0,35.0,1.0,0.0,31.275,1.0
max,1309.0,1.0,3.0,1.0,80.0,8.0,9.0,512.3292,2.0


In [6]:
delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

train = data[:len(train)]
test = data[len(train):]

train
# test

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0.0,3,0,22.0,7.2500,0
1,1.0,1,1,38.0,71.2833,1
2,1.0,3,1,26.0,7.9250,0
3,1.0,1,1,35.0,53.1000,0
4,0.0,3,0,35.0,8.0500,0
...,...,...,...,...,...,...
886,0.0,2,0,27.0,13.0000,0
887,1.0,1,1,19.0,30.0000,0
888,0.0,3,1,34.0,23.4500,0
889,1.0,1,0,26.0,30.0000,1


In [7]:
y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop(['Survived'], axis=1)



print(y_train.head())
print(X_train.head())
print(X_test.head())

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64
   Pclass  Sex   Age     Fare  Embarked
0       3    0  22.0   7.2500         0
1       1    1  38.0  71.2833         1
2       3    1  26.0   7.9250         0
3       1    1  35.0  53.1000         0
4       3    0  35.0   8.0500         0
   Pclass  Sex   Age     Fare  Embarked
0       3    0  34.5   7.8292         2
1       3    1  47.0   7.0000         0
2       2    0  62.0   9.6875         2
3       3    0  27.0   8.6625         0
4       3    1  22.0  12.2875         0


In [8]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

y_pred[:20]
y_pred

array([0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [13]:
# output rounded
sub = pd.read_csv('input/gender_submission.csv')
sub['Survived'] = list(map(int, y_pred))
sub.to_csv('output/submission/submission_21-random-rorest.csv', index=False)



In [10]:
sub

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
