## 타이타닉
### 타이타닉 승객의 나이, 성별, 승객 등급, 승선 위치 같은 속성을 기반으로 하여 승객의 생존 여부를 예측하는 것이 목표

### 1. 데이터 가져오기

In [None]:
import os
import pandas as pd

# Load the three Titanic CSV files from the local titanic/ directory.
test = pd.read_csv('titanic/test.csv')
train = pd.read_csv('titanic/train.csv')
gender_submission = pd.read_csv('titanic/gender_submission.csv')

In [None]:
test.head()

In [None]:
train.head()

In [None]:
gender_submission.head() # treated by the author as the answer key

In [None]:
test.shape # author's note: the test set lacks the Survived (survival outcome) column

In [None]:
train.shape

In [None]:
gender_submission.shape # PassengerId (passenger number) and Survived (survival outcome)

In [None]:
train.info() # Author's note: Age, Cabin and Embarked (port of embarkation:
             # C = Cherbourg, Q = Queenstown, S = Southampton) contain null values.

In [None]:
train.describe()

In [None]:
# Columns whose value distributions we want to inspect.
# The names are chosen so they do not shadow the built-ins `list` and `str`,
# which would otherwise stay rebound for the rest of the session.
categorical_columns = ['Survived', 'Pclass', 'Sex', 'Embarked']

for column in categorical_columns:
    print(train[column].value_counts())
    print('')

In [None]:
train[['Embarked','Survived']].groupby('Embarked').sum()
# Author's note: this sum does not seem particularly meaningful.

In [None]:
"""
1    216
2    184
3    491
"""
train[['Pclass','Survived']].groupby('Pclass').sum()


In [None]:
pclass_surv_mean = train[['Pclass','Survived']].groupby('Pclass').mean()
pclass_surv_mean

In [None]:
pclass_surv_mean.plot(kind='bar')

### pd.Series.plot(kind = 'hist')
- 히스토그램: 구간별로 속해있는 row의 개수를 시각화 합니다.
- 수치형에서만 가능, 범주는 안됩니다!

In [None]:
train['Age'].plot(kind='hist', bins=10, grid=True)

In [None]:
train.plot.scatter('Age','Fare',c='Survived', alpha=0.5, colormap='viridis')

### 3. 데이터 전처리

#### 3-1. 일반적인 전처리

##### (1) 결측치 여부 확인(pd.Series.isna()) 및 처리(pd.DataFrame.fillna())

In [None]:
train.info(0)

Age, Cabin, Embarked에는 null 값이 포함되어 있다. 특히 Cabin은 약 77%가 null이므로 속성에서 제외한다. Name과 Ticket은 값이 채워져 있지만 머신러닝 모델이 사용할 수 있는 숫자로 변환하기가 까다로우므로 역시 제외한다. 결론적으로 Cabin, Name, Ticket은 사용하지 않는다.

In [None]:
train['Age'].isna().sum()

In [None]:
train[train['Age'].isna()]

In [None]:
# Fraction of missing values per column; divide by the actual row count
# instead of a hard-coded 891 so the ratio stays correct if the data changes.
train.isna().sum() / len(train)

In [None]:
# Fill missing ports with 'S' by assigning the result back to the column.
# Calling fillna(inplace=True) on a column selection is chained assignment:
# it warns in pandas 2.x and leaves `train` unchanged under copy-on-write.
train['Embarked'] = train['Embarked'].fillna(value='S')

In [None]:
test['Embarked'].isna().sum()

In [None]:
age_mean = train['Age'].mean(0)

In [None]:
# Impute missing ages in both frames with the training-set mean (age_mean),
# so the test set is preprocessed with statistics learned from training data
# only rather than with its own mean.
train['Age'] = train['Age'].fillna(value=age_mean)
test['Age'] = test['Age'].fillna(value=age_mean)

In [None]:
test['Age'].isna().sum()

##### (2) 값 변환 ( pd.Series.map())

In [None]:
train['Sex'].unique()

In [None]:
# Encode Sex numerically, using one shared mapping for both frames.
sex_codes = {'male': 0, 'female': 1}
train['Sex'] = train['Sex'].map(sex_codes)
test['Sex'] = test['Sex'].map(sex_codes)


#### 3-2 파이프라인으로 전처리

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the requested columns of its input.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index ``X`` in :meth:`transform`.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.attribute_names``."""
        selected = X[self.attribute_names]
        return selected

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# Numeric branch: select the four numeric columns, then fill any missing
# values using SimpleImputer's "median" strategy.
num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
    ])
"""
num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector()),
        ("imputer", SimpleImputer(strategy="median")),
    ])
"""    

In [None]:
#num_pipeline.fit_transform(train)

In [None]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most common value."""

    def fit(self, X, y=None):
        """Record, per column of ``X``, the first entry of ``value_counts()``."""
        top_values = [X[column].value_counts().index[0] for column in X]
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries filled from ``most_frequent_``."""
        filled = X.fillna(self.most_frequent_)
        return filled

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the new keyword first and
# fall back to the old one so this cell runs on either side of the rename.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

# Categorical branch: select the categorical columns, fill missing values with
# each column's most frequent value, then one-hot encode into a dense array.
cat_pipeline = Pipeline([
    ("select_cat", DataFrameSelector(["Pclass", "Sex","Embarked"])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", cat_encoder),
])
"""
cat_pipeline = Pipeline([
    ("select_cat", DataFrameSelector()),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse=False)),
])
"""

In [None]:
#cat_pipeline.fit_transform(train)

In [None]:

from sklearn.pipeline import FeatureUnion

# Combine the numeric and categorical branches into a single transformer.
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])


In [None]:
"""
train_num = train[["Age", "SibSp", "Parch", "Fare"]]
train_cat = train[["Pclass", "Sex","Embarked"]]
"""

In [None]:
"""
from sklearn.compose import ColumnTransformer

num_attribs = list(train_num)
cat_attribs = list(train_cat)

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])
"""

In [None]:

X_train = preprocess_pipeline.fit_transform(train)

In [None]:
X_train

In [None]:
y_train = train["Survived"]

### 4. 변수 선택 및 모델 구축

In [None]:
# NOTE(review): this rebinds X_train, replacing the preprocess_pipeline output
# assigned earlier with just the raw Pclass and Age columns. Models fitted
# after this cell see two features unless X_train is rebuilt.
X_train = train[['Pclass','Age']]
y_train = train['Survived']
x_test = test[['Pclass','Age']]

In [None]:
gender_submission.head()

#### (1) 로지스틱 회귀

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(x_test)

In [None]:
gender_submission['Survived2'] = y_pred

In [None]:
gender_submission.head()

In [None]:
gender_submission.to_csv('Ir_model_Pclass_Age.csv', index=False)

In [None]:
# Output the predictions as probabilities (column 1 of predict_proba).
gender_submission['Survived2'] = model.predict_proba(x_test)[:,1]

In [None]:
gender_submission.to_csv('model_proba.csv', index=False)

#### (2) 의사결정나무 (결정 트리)

------ 첫번째 -------

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()

In [None]:
dt_model.fit(X_train, y_train)

In [None]:
gender_submission['Survived2'] = dt_model.predict(x_test)

In [None]:
gender_submission.to_csv('dt_model.csv', index=False)

In [None]:
# Output the predictions as probabilities (column 1 of predict_proba).
gender_submission['Survived2'] = dt_model.predict_proba(x_test)[:,1]

In [None]:
gender_submission.to_csv('dt_model_proba.csv', index=False)

------ 두번째 ------

In [None]:
dt_model_new = DecisionTreeClassifier(min_samples_split=10)

In [None]:
dt_model_new.fit(X_train,y_train)

In [None]:
gender_submission['Survived'] = dt_model_new.predict_proba(x_test)[:,1]

In [None]:
gender_submission.to_csv('dt_min_samples_10_proba.csv', index=False)

In [None]:
from sklearn import tree
tree.plot_tree(dt_model_new)

In [None]:
import graphviz 
dot_data = tree.export_graphviz(dt_model_new, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("titanic_survived")

#### (3) SVM

In [None]:
from sklearn.svm import SVC

# Section 4 rebinds X_train to the raw ['Pclass', 'Age'] columns, but the next
# cell predicts on preprocess_pipeline.transform(test). Rebuild the training
# matrix with the same pipeline so the SVM is fitted on the same feature set
# it is later asked to predict on (otherwise predict() fails on a
# feature-count mismatch).
X_train = preprocess_pipeline.fit_transform(train)
y_train = train["Survived"]

svm_clf = SVC(gamma="auto")
svm_clf.fit(X_train, y_train)

In [None]:
X_test = preprocess_pipeline.transform(test)
y_pred = svm_clf.predict(X_test)

In [None]:
from sklearn.model_selection import cross_val_score

svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean() # mean of the 10 cross-validation scores

In [None]:
svm_scores

#### (4) 랜덤 포레스트

In [None]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean() # mean of the 10 cross-validation scores

In [None]:
forest_scores

In [None]:
import matplotlib.pyplot as plt
# Compare the per-fold accuracies of the two models: raw points plus a boxplot.
plt.figure(figsize=(8, 4))
# Size the x-positions from the score arrays rather than a hard-coded 10, so
# the plot still works if the number of CV folds changes.
plt.plot([1] * len(svm_scores), svm_scores, ".")
plt.plot([2] * len(forest_scores), forest_scores, ".")
plt.boxplot([svm_scores, forest_scores])
# Label the boxes through xticks: boxplot's `labels=` keyword is deprecated in
# matplotlib 3.9, while xticks works on old and new versions alike.
plt.xticks([1, 2], ["SVM", "Random Forest"])
plt.ylabel("Accuracy", fontsize=14)
plt.show()

#### (5) 생각해보기
이 결과를 더 향상시키려면:
* 교차 검증과 그리드 탐색을 사용하여 더 많은 모델을 비교하고 하이퍼파라미터를 튜닝하세요.
* 특성 공학을 더 시도해 보세요, 예를 들면:
  * **SibSp**와 **Parch**을 이 두 특성의 합으로 바꿉니다.
  * **Survived** 특성과 관련된 이름을 구별해 보세요(가령, 이름에 "Countess"가 있는 경우 생존할 가능성이 높습니다).
* 수치 특성을 범주형 특성으로 바꾸어 보세요: 예를 들어, 나이대가 다른 경우 다른 생존 비율을 가질 수 있습니다(아래 참조). 그러므로 나이 구간을 범주로 만들어 나이 대신 사용하는 것이 도움이 될 수 있습니다. 비슷하게 혼자 여행하는 사람은 30%만 생존했기 때문에 이들을 위한 특별한 범주를 만드는 것이 도움이 될 수 있습니다(아래 참조).

In [None]:
# Bucket ages into fixed-width bins: floor-divide by the width, then scale
# back up so each bucket is labelled by its lower bound.
age_bin_width = 15
train["AgeBucket"] = (train["Age"] // age_bin_width) * age_bin_width

In [None]:
train["AgeBucket"].value_counts()

In [None]:
train[["AgeBucket", "Survived"]].groupby(["AgeBucket"]).mean()

In [None]:
train["RelativesOnboard"] = train["SibSp"] + train["Parch"]
train[["RelativesOnboard","Survived"]].groupby(["RelativesOnboard"]).mean()

### 5. Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score


In [None]:
# Refit the two-feature logistic regression and predict on the training rows
# themselves, so every metric computed from y_pred is a training-set score.
feature_columns = ['Pclass', 'Age']
y_train = train['Survived']
X_train = train[feature_columns]
X_test = test[feature_columns]

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)


In [None]:
cf_matrix = confusion_matrix(y_train, y_pred)

In [None]:
cf_matrix

In [None]:
precision_score(y_train, y_pred) # precision: accuracy of the positive predictions; author's run: 160 / (83+160)

In [None]:
recall_score(y_train, y_pred) # recall / sensitivity / true positive rate (TPR); author's run: 160 / (182 + 160)

In [None]:
accuracy_score(y_train, y_pred) # accuracy (correct classification rate); author's run: (466 + 160) / (466+83+182+160)

In [None]:
f1_score(y_train, y_pred) # harmonic mean of precision and recall

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
"""
ROC : 민감도(진짜양성비율(TPR))에 대한 거짓양성비율(FPR) = 1-특이도(TNR)의 그래프
    특이도 = TN/(FP+TN) = 466/(466+83)
"""

In [None]:
from sklearn.metrics import roc_curve

# An ROC curve sweeps the decision threshold, so it needs continuous scores.
# Hard 0/1 predictions collapse the curve to a single operating point; use the
# predicted probability of the positive class (column 1) instead.
train_scores = model.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = roc_curve(y_train, train_scores)

In [None]:
# TODO: the ROC curve should be plotted here (not yet implemented).

In [None]:
# AUC: area under the ROC curve. Score it with the positive-class
# probabilities rather than hard 0/1 labels, which would only measure a
# single threshold.
roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])