# Titanic Project

## 1. Data preprocessing

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Read the Titanic passenger table from disk, indexed by PassengerId.
data_file = 'data/titanic.csv'
titanic = pd.read_csv(
    data_file,
    index_col='PassengerId',
)
titanic.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


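Create dummy (one-hot) variables for the categorical columns. The baseline category is deliberately kept (no `drop_first=True`), so every category gets its own indicator column.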

In [3]:
# One-hot encode the port of embarkation (columns prefixed with 'port_'),
# append the indicators and remove the original categorical column.
embark_dum = pd.get_dummies(titanic['Embarked'], prefix='port')
titanic = pd.concat([titanic, embark_dum], axis=1).drop(columns=['Embarked'])

In [4]:
# Bucket every passenger into an age group and one-hot encode the result.
#   Age <  20  -> 'child'
#   Age >= 20  -> 'adult'
#   missing    -> 'unknown'
# Comparisons against NaN are always False, so a missing age matches neither
# condition and falls through to the default label.
ages = titanic['Age']
age_group = np.select(
    [ages < 20, ages >= 20],
    ['child', 'adult'],
    default='unknown',
)

# Encode the labels on the passenger index so the indicator columns line up
# with `titanic` without writing a temporary label column into the frame.
age_dum = pd.get_dummies(pd.Series(age_group, index=titanic.index), prefix='Age')

# Replace the raw numeric age with the indicator columns.
titanic = pd.concat([titanic.drop(columns=['Age']), age_dum], axis=1)

In [5]:
# One-hot encode the ticket class (columns prefixed with 'pclass_'),
# append the indicators and remove the original column.
pclass_dum = pd.get_dummies(titanic['Pclass'], prefix='pclass')
titanic = pd.concat([titanic, pclass_dum], axis=1).drop(columns=['Pclass'])
titanic.head()

Unnamed: 0_level_0,Survived,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,port_C,port_Q,port_S,Age_adult,Age_child,Age_unknown,pclass_1,pclass_2,pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,,False,False,True,True,False,False,False,False,True
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C85,True,False,False,True,False,False,True,False,False
3,1,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,,False,False,True,True,False,False,False,False,True
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,C123,False,False,True,True,False,False,True,False,False
5,0,"Allen, Mr. William Henry",male,0,0,373450,8.05,,False,False,True,True,False,False,False,False,True


In [6]:
# One-hot encode sex (columns prefixed with 'sex_'),
# append the indicators and remove the original column.
sex_dum = pd.get_dummies(titanic['Sex'], prefix='sex')
titanic = pd.concat([titanic, sex_dum], axis=1).drop(columns=['Sex'])

titanic.head()

Unnamed: 0_level_0,Survived,Name,SibSp,Parch,Ticket,Fare,Cabin,port_C,port_Q,port_S,Age_adult,Age_child,Age_unknown,pclass_1,pclass_2,pclass_3,sex_female,sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,,False,False,True,True,False,False,False,False,True,False,True
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,C85,True,False,False,True,False,False,True,False,False,True,False
3,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,,False,False,True,True,False,False,False,False,True,True,False
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,C123,False,False,True,True,False,False,True,False,False,True,False
5,0,"Allen, Mr. William Henry",0,0,373450,8.05,,False,False,True,True,False,False,False,False,True,False,True


In [7]:
# Remove the Name, Ticket and Cabin columns, then preview the result.
titanic = titanic.drop(columns=['Name', 'Ticket', 'Cabin'])
titanic.head()

Unnamed: 0_level_0,Survived,SibSp,Parch,Fare,port_C,port_Q,port_S,Age_adult,Age_child,Age_unknown,pclass_1,pclass_2,pclass_3,sex_female,sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,1,0,7.25,False,False,True,True,False,False,False,False,True,False,True
2,1,1,0,71.2833,True,False,False,True,False,False,True,False,False,True,False
3,1,0,0,7.925,False,False,True,True,False,False,False,False,True,True,False
4,1,1,0,53.1,False,False,True,True,False,False,True,False,False,True,False
5,0,0,0,8.05,False,False,True,True,False,False,False,False,True,False,True


In [8]:
# `data` is bound to the same DataFrame object as `titanic` (an alias, not a copy).
data = titanic

In [9]:
# Get the column names of the prepared frame as an array
col_names = data.columns.values
print(col_names)

['Survived' 'SibSp' 'Parch' 'Fare' 'port_C' 'port_Q' 'port_S' 'Age_adult'
 'Age_child' 'Age_unknown' 'pclass_1' 'pclass_2' 'pclass_3' 'sex_female'
 'sex_male']


In [10]:
# Separate the target from the predictors by column name rather than by
# position, so the split stays correct even if the column order changes.
target_col = 'Survived'
X = data.drop(columns=[target_col])
y = data[target_col]

In [11]:
# Preview the first five rows of the feature matrix.
X.head()

Unnamed: 0_level_0,SibSp,Parch,Fare,port_C,port_Q,port_S,Age_adult,Age_child,Age_unknown,pclass_1,pclass_2,pclass_3,sex_female,sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,0,7.25,False,False,True,True,False,False,False,False,True,False,True
2,1,0,71.2833,True,False,False,True,False,False,True,False,False,True,False
3,0,0,7.925,False,False,True,True,False,False,False,False,True,True,False
4,1,0,53.1,False,False,True,True,False,False,True,False,False,True,False
5,0,0,8.05,False,False,True,True,False,False,False,False,True,False,True


In [12]:
# Preview the first five target values.
y.head()

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

## 2. Split data and train the model

In [13]:
# Hold out a test set. `test_size=0.25` is scikit-learn's default, spelled out
# here so the split proportion is visible; the fixed seed makes the split
# reproducible. (The scikit-learn imports live in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123
)

In [14]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf.fit(X_train, y_train)

In [15]:
# Predict labels for the test features and print the raw prediction array
y_pred = rf.predict(X_test)
print(y_pred)

[1 0 0 0 0 0 1 1 0 0 0 1 1 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 1 1 0
 0 1 0 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 0 1 0
 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 1 0 0 0 1 1
 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 1 1 0 0 1
 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0
 1 0 1 0 1 0 0 1 0 1 0 0 1 1 0 1 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0
 0]


In [16]:
# Keep the fitted forest's per-feature importance scores for the report below
importances = rf.feature_importances_

In [20]:
# Print each training column next to its importance score, four decimals each.
# The names and scores are paired by position.
for feature, importance in zip(X_train.columns, importances):
    print(f"{feature}: {importance:.4f}")

SibSp: 0.0661
Parch: 0.0548
Fare: 0.3527
port_C: 0.0138
port_Q: 0.0098
port_S: 0.0191
Age_adult: 0.0208
Age_child: 0.0230
Age_unknown: 0.0152
pclass_1: 0.0317
pclass_2: 0.0182
pclass_3: 0.0613
sex_female: 0.1501
sex_male: 0.1635


In [18]:
# Raw importance array, unformatted
print(importances)

[0.06608485 0.05477429 0.35265902 0.01377892 0.00976628 0.01913242
 0.02077648 0.02303171 0.01521953 0.03167176 0.01815719 0.06133453
 0.15011528 0.16349774]


In [19]:
# Column labels of the training features, in frame order
print(X_train.columns.values)

['SibSp' 'Parch' 'Fare' 'port_C' 'port_Q' 'port_S' 'Age_adult' 'Age_child'
 'Age_unknown' 'pclass_1' 'pclass_2' 'pclass_3' 'sex_female' 'sex_male']
