# Titanic Project

## 1. Data preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the Titanic passenger table and key the rows by PassengerId.
data_file = 'data/titanic.csv'
titanic = pd.read_csv(data_file).set_index('PassengerId')
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# One-hot encode the embarkation port into port_* indicator columns and
# swap them in for the original 'Embarked' column.
embark_dum = pd.get_dummies(titanic['Embarked'], prefix='port')
titanic = pd.concat([titanic.drop(columns=['Embarked']), embark_dum], axis=1)

In [5]:
# Bucket passengers by age and one-hot encode the bucket.
# Ages below ADULT_MIN_AGE become 'child', ages at or above it become 'adult'.
# A missing age (NaN) fails both comparisons and falls through to 'unknown'.
ADULT_MIN_AGE = 20

ages = titanic['Age']
# Vectorised labelling: one pass over the column instead of a per-row .iloc lookup.
age_group = np.select(
    [ages < ADULT_MIN_AGE, ages >= ADULT_MIN_AGE],
    ['child', 'adult'],
    default='unknown',
)

# Keep the passenger index on the labels so the indicator columns line up with titanic.
age_dum = pd.get_dummies(pd.Series(age_group, index=titanic.index), prefix='Age')

# Replace the raw Age column with the Age_* indicator columns.
titanic = pd.concat([titanic.drop(columns=['Age']), age_dum], axis=1)

In [6]:
# One-hot encode the ticket class into pclass_* indicator columns, replacing
# the original 'Pclass' column, then preview the result.
pclass_dum = pd.get_dummies(titanic['Pclass'], prefix='pclass')
titanic = pd.concat([titanic.drop(columns=['Pclass']), pclass_dum], axis=1)
titanic.head()

Unnamed: 0_level_0,Survived,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,port_C,port_Q,port_S,Age_adult,Age_child,Age_unknown,pclass_1,pclass_2,pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,,False,False,True,True,False,False,False,False,True
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C85,True,False,False,True,False,False,True,False,False
3,1,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,,False,False,True,True,False,False,False,False,True
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,C123,False,False,True,True,False,False,True,False,False
5,0,"Allen, Mr. William Henry",male,0,0,373450,8.05,,False,False,True,True,False,False,False,False,True


In [7]:
# One-hot encode 'Sex' into sex_* indicator columns, replacing the original
# column, then preview the result.
sex_dum = pd.get_dummies(titanic['Sex'], prefix='sex')
titanic = pd.concat([titanic.drop(columns=['Sex']), sex_dum], axis='columns')

titanic.head()

Unnamed: 0_level_0,Survived,Name,SibSp,Parch,Ticket,Fare,Cabin,port_C,port_Q,port_S,Age_adult,Age_child,Age_unknown,pclass_1,pclass_2,pclass_3,sex_female,sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,,False,False,True,True,False,False,False,False,True,False,True
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,C85,True,False,False,True,False,False,True,False,False,True,False
3,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,,False,False,True,True,False,False,False,False,True,True,False
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,C123,False,False,True,True,False,False,True,False,False,True,False
5,0,"Allen, Mr. William Henry",0,0,373450,8.05,,False,False,True,True,False,False,False,False,True,False,True


In [8]:
# Name, Ticket and Cabin are not used as model inputs; remove them.
titanic = titanic.drop(columns=['Name', 'Ticket', 'Cabin'])
titanic.head()

Unnamed: 0_level_0,Survived,SibSp,Parch,Fare,port_C,port_Q,port_S,Age_adult,Age_child,Age_unknown,pclass_1,pclass_2,pclass_3,sex_female,sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,1,0,7.25,False,False,True,True,False,False,False,False,True,False,True
2,1,1,0,71.2833,True,False,False,True,False,False,True,False,False,True,False
3,1,0,0,7.925,False,False,True,True,False,False,False,False,True,True,False
4,1,1,0,53.1,False,False,True,True,False,False,True,False,False,True,False
5,0,0,0,8.05,False,False,True,True,False,False,False,False,True,False,True


In [9]:
# Refer to the encoded frame as `data` from here on.
# This binds the same object; no copy is made.
data = titanic

In [10]:
# Collect the column (variable) names of the prepared frame and show them.
col_names = data.columns.values
print(col_names)

['Survived' 'SibSp' 'Parch' 'Fare' 'port_C' 'port_Q' 'port_S' 'Age_adult'
 'Age_child' 'Age_unknown' 'pclass_1' 'pclass_2' 'pclass_3' 'sex_female'
 'sex_male']


In [11]:
# Separate the target from the predictors by name rather than by column
# position, so the split stays correct even if the column order changes.
TARGET = 'Survived'
y = data[TARGET]
X = data.drop(columns=[TARGET])

In [12]:
# Preview the first rows of the feature matrix.
X.head(5)

Unnamed: 0_level_0,SibSp,Parch,Fare,port_C,port_Q,port_S,Age_adult,Age_child,Age_unknown,pclass_1,pclass_2,pclass_3,sex_female,sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,0,7.25,False,False,True,True,False,False,False,False,True,False,True
2,1,0,71.2833,True,False,False,True,False,False,True,False,False,True,False
3,0,0,7.925,False,False,True,True,False,False,False,False,True,True,False
4,1,0,53.1,False,False,True,True,False,False,True,False,False,True,False
5,0,0,8.05,False,False,True,True,False,False,False,False,True,False,True


In [13]:
# Preview the first rows of the target column.
y.head(5)

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

## Split data

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
# Hold out 25% of the rows for testing (scikit-learn's default fraction,
# written out here); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

XGB objective의 옵션들

이진 분류(Binary Classification):
"binary:logistic": 이진 분류에 사용되는 로지스틱 손실 함수로, 양성 클래스에 속할 확률을 출력합니다. XGBClassifier의 기본 objective입니다.

다중 클래스 분류(Multiclass Classification):
"multi:softmax": 다중 클래스 분류를 위한 소프트맥스 손실 함수로, 예측 결과로 클래스 레이블을 반환합니다. num_class 매개변수를 설정하여 클래스 수를 지정해야 합니다.
"multi:softprob": multi:softmax와 같은 손실 함수를 사용하지만, 레이블 대신 각 클래스에 속할 확률을 반환합니다. 마찬가지로 num_class 설정이 필요합니다.

회귀(Regression):
"reg:squarederror": 평균 제곱 오차(Mean Squared Error)를 최소화하는 회귀 문제에 사용됩니다.

In [16]:
# Import XGBClassifier, the model class used below.
from xgboost import XGBClassifier


In [20]:
# Hyperparameters for the boosted-tree classifier, gathered in one place.
xgb_params = {
    "objective": "binary:logistic",  # objective function for the classification task
    "eval_metric": "logloss",        # evaluation metric
    "max_depth": 3,                  # maximum depth of each tree
    "learning_rate": 0.1,            # learning rate
    "n_estimators": 100,             # number of trees
    "random_state": 42,              # random seed
}
model = XGBClassifier(**xgb_params)


In [21]:
# Fit the model on the training split.
model.fit(X_train, y_train)

In [22]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

In [23]:
# Score the predictions against the held-out labels and report the accuracy.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.82


In [24]:
# Print the fitted model's built-in importance score for each feature.
# NOTE(review): values are presumably in the column order of X_train — confirm before labelling them.
importances = model.feature_importances_
print(importances)

[0.03459869 0.04722945 0.02845795 0.01288024 0.02352615 0.04646812
 0.02871426 0.04560011 0.02428832 0.12665541 0.02817852 0.19788235
 0.35552046 0.        ]


In [25]:
# Permutation importance on the test split: each feature is shuffled
# 10 times, and the mean importance over those repeats is printed.
# (No plot is produced here; the numbers are printed as an array.)
from sklearn.inspection import permutation_importance


result = permutation_importance(
    estimator=model,
    X=X_test,
    y=y_test,
    n_repeats=10,
    random_state=42,
)
print(result.importances_mean)

[ 0.01704036 -0.00179372  0.00493274 -0.00313901  0.          0.00717489
 -0.00313901 -0.00986547 -0.0058296  -0.01434978 -0.00807175  0.0793722
  0.21659193  0.        ]
