# 타이타닉 생존자 데이터 분석

## 타이타닉호에서 어떤 승객이 생존했는지 데이터로부터 분석


1912년 4월 15일 타이타닉호 침몰로 2224명중 1502이 사망했다. 생존한 승객이 성별, 나이, 선실 등급과 어떤 관계가 있었는지를 분석한다.


## 주어진 데이터 내용

  * **Survival** - Survival. 0 = No, 1 = Yes
  * **Pclass** - Ticket class. 1 = 1st, 2 = 2nd, 3 = 3rd
  * **Sex** - Sex.
  * **Age** - Age in years.
  * **SibSp** - # of siblings / spouses aboard the Titanic.
  * **Parch** - # of parents / children aboard the Titanic.
  * **Ticket** - Ticket number.
  * **Fare** - Passenger fare.
  * **Cabin** - Cabin number.
  * **Embarked** - Port of Embarkation. C = Cherbourg, Q = Queenstown, S = Southampton


## Load Dataset

'titanic_train.csv' 파일을 읽어서 변수 train에 저장하고,

PassengerId를 index로 지정

In [245]:
import pandas as pd
seed = 37

In [246]:
train = pd.read_csv('titanic_data/titanic_train.csv')

train.index = train['PassengerId']
train.head()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


'titanic_test.csv' 파일을 읽어서 변수 test에 저장하고,

PassengerId를 index로 지정

In [247]:
test = pd.read_csv('titanic_data/titanic_test.csv')

test.index = test['PassengerId']


## Preprocessing

### Encode Sex

'Sex_encode'라는 이름의 column을 생성하고 성별이 남자면 0, 여자면 1로 코딩

(train, test 둘 다 적용)

- .loc 활용

In [248]:
# test.loc[test['Sex'] == 'male', 'Sex_encode'] = 0
# test.loc[test['Sex'] == 'female', 'Sex_encode'] = 1

for (i,j)  in enumerate(train.index): 
    if train.loc[j, 'Sex'] == 'male':
        train.loc[j, 'Sex_encode'] = 0
    if train.loc[j, 'Sex'] == 'female':
        train.loc[j, 'Sex_encode'] = 1

        
for (i,j)  in enumerate(test.index): 
    if test.loc[j, 'Sex'] == 'male':
        test.loc[j, 'Sex_encode'] = 0
    if test.loc[j, 'Sex'] == 'female':
        test.loc[j, 'Sex_encode'] = 1
        
        
# if train['Sex'][900] == 'male':
#         train['Sex'][900] = 0
        
# for (i,j)  in enumerate(train.index): 
#     if train['Sex'][j] == 'male':
#         train['Sex'][j] = 0
#     if train['Sex'][j] == 'female':
#         train['Sex'][j] = 1
        
        
# train.head()

### Fill in missing fare
#### - delete row or filling in missing fare(mean)


train, test 모두 'Fare_fillin'이라는 이름의 column을 생성

missing(null)가 있는지 확인하고, missing value를 Fare 전체의 평균값으로 코딩

In [249]:
train["Fare_fillin"] = train["Fare"]
test["Fare_fillin"] = test["Fare"]

In [262]:
train['Fare_fillin'].isna().unique()
train[train['Fare_fillin'].isnull()]
test[test['Fare_fillin'].isnull()]

# test.loc[test['Sex'] == 'female', 'Sex_encode'] = 1
test.loc[test['Fare_fillin'].isnull(), 'Fare_fillin'] = test['Fare'].mean()
# test[test['Fare_fillin'].isnull()] = test['Fare'].mean()
test.loc[1044]
# test['Fare_fillin'].isnull().unique()

KeyError: 'Fare_fillin'

### Encode Embarked

Embarked column의 값이 C일 때, Q일 때, S일 때 값이 1이 되도록 원핫인코딩

(train, test 둘 다 적용)
- .get_dummies 활용

In [261]:
temp_test = pd.get_dummies(test['Embarked'], prefix='Embarked')
                    #, prefix = ['C','Q','S'], columns = ['C','Q','S'])
test = pd.concat([test, temp_test], axis=1)

temp_train = pd.get_dummies(train['Embarked'], prefix='Embarked')  
                    #, prefix = ['C','Q','S'], columns = ['C','Q','S'])
train = pd.concat([train, temp_train] ,axis=1)

train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,0,1
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1


## Train

In [260]:
feature_names = ["Pclass", "Sex_encode", "Fare_fillin", "SibSp", "Parch",
                 "Embarked_C", "Embarked_Q", "Embarked_S","Survived"]

X_train = train[feature_names]

print(X_train.shape)
X_train.head()

KeyError: "['Sex_encode' 'Fare_fillin' 'Embarked_C' 'Embarked_Q' 'Embarked_S'] not in index"

In [253]:
label_name = "Survived"

y_train = train[label_name]

print(y_train.shape)
y_train.head()

(891,)


PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

In [254]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(max_depth=7,
                               random_state=seed)

In [255]:
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=37,
            splitter='best')

In [256]:
import pandas as pd

train = pd.read_csv("titanic_data/titanic_train.csv", index_col="PassengerId")

train["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [257]:
1 - (549 / 891) ** 2 - (342 / 891) ** 2

0.4730129578614428

## Predict

In [258]:
X_test = test[feature_names]

print(X_test.shape)
X_test.head()

(418, 8)


Unnamed: 0_level_0,Pclass,Sex_encode,Fare_fillin,SibSp,Parch,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,0.0,7.8292,0,0,0,1,0
893,3,1.0,7.0,1,0,0,0,1
894,2,0.0,9.6875,0,0,0,1,0
895,3,0.0,8.6625,0,0,0,0,1
896,3,1.0,12.2875,1,1,0,0,1


In [259]:
prediction = model.predict(X_test)

print(prediction.shape)
prediction[:20]

(418,)


array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1],
      dtype=int64)