In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

### Part I. Data import and review

In [None]:
# Load the Titanic train/test splits; the paths are relative to the notebook's working directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [None]:
train.head()

In [None]:
test.head()

##### Columns Description  
###### Survived: 0 = No, 1 = Yes
###### pclass: Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd
###### sibsp: # of siblings / spouses aboard the Titanic
###### parch: # of parents / children aboard the Titanic
###### ticket: Ticket number
###### cabin: Cabin number
###### embarked: Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton

In [None]:
train.shape

In [None]:
test.shape

In [None]:
train.info()

In [None]:
test.info()

In [None]:
# Count the missing (null) values in each column of the training set
train.isnull().sum()

In [None]:
# Count the missing (null) values in each column of the test set
test.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style='darkgrid')

In [None]:
def bar_chart(feature, data=None, stacked=True):
    """Plot how many passengers survived vs. died for each value of ``feature``.

    Parameters
    ----------
    feature : str
        Name of the column whose value counts are compared.
    data : pandas.DataFrame, optional
        Frame holding ``feature`` and a ``'Survived'`` column coded 0/1.
        Defaults to the notebook-level ``train`` frame, so existing calls
        such as ``bar_chart('Sex')`` keep working unchanged.
    stacked : bool, default True
        Stack the per-value counts into one bar per outcome; ``False`` draws
        them side by side.
    """
    if data is None:
        data = train
    survived = data.loc[data['Survived'] == 1, feature].value_counts()
    dead = data.loc[data['Survived'] == 0, feature].value_counts()
    # One row per outcome, one column per distinct value of the feature.
    counts = pd.DataFrame([survived, dead])
    counts.index = ['LIVE', 'DEATH']
    ax = counts.plot(kind='bar', stacked=stacked, figsize=(10, 5))
    ax.set(title=f'Survival by {feature}', ylabel='Passengers')

In [None]:
bar_chart('Sex')

In [None]:
bar_chart('Pclass')

In [None]:
bar_chart('SibSp')

In [None]:
bar_chart('Parch')

In [None]:
bar_chart('Embarked')

### Part II. Feature Engineering 

In [None]:
train.head()

In [None]:
from IPython.display import Image
Image(url= "https://static1.squarespace.com/static/5006453fe4b09ef2252ba068/t/5090b249e4b047ba54dfd258/1351660113175/TItanic-Survival-Infographic.jpg?format=1500w")

In [None]:
train_test_data = [train, test]  # list of references to both frames (not a concatenation), used to loop over them

In [None]:
# Extract the honorific from each name: the run of letters directly before a '.'
# (Mr., Mrs., Miss., Ms., ...). The pattern is a raw string so that `\.` reaches the
# regex engine as an escaped dot instead of being an invalid Python string escape.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
train_test_data

In [None]:
train['Title'].value_counts()

In [None]:
test['Title'].value_counts()

In [None]:
# Encode titles as integers: Mr -> 0, Miss -> 1, Mrs -> 2, every other title -> 3.
OTHER_TITLE = 3
title_mapping = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Dr': 3, 'Rev': 3, 'Col': 3, 'Mlle': 3,
                 'Major': 3, 'Capt': 3, 'Jonkheer': 3, 'Don': 3, 'Sir': 3, 'Lady': 3, 'Countess': 3,
                 'Mme': 3, 'Ms': 3, 'Dona': 3}

for dataset in train_test_data:
    if pd.api.types.is_numeric_dtype(dataset['Title']):
        # Already encoded (the cell was re-run): mapping again would wipe the codes.
        continue
    # Titles absent from the mapping (or names without a title) go to the "other"
    # bucket rather than NaN, so the later group-by-title imputation still covers them.
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(OTHER_TITLE).astype(int)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Remove the raw 'Name' column from both frames.
# The drop is done in place on purpose: train_test_data holds references to these
# same objects, so rebinding train/test to new frames would leave that list stale.
for frame in (train, test):
    frame.drop(columns='Name', inplace=True)

In [None]:
# Encode sex numerically: male -> 0, female -> 1 (map turns any other value into NaN)
sex_mapping = dict(male=0, female=1)
for frame in train_test_data:
    frame['Sex'] = frame['Sex'].map(sex_mapping)

In [None]:
train.head(10)

In [None]:
# Is there out lier in age? 
#sns.catplot(data = train, y = 'Age', kind = 'box') 
sns.catplot(data = test, y = 'Age', kind = 'box') 

In [None]:
# Fill missing ages with the median age of the passengers sharing the same encoded title;
# each frame uses its own medians.
# The result is assigned back to the column: fillna(..., inplace=True) on `frame['Age']`
# acts on an intermediate object, which pandas warns about and which no longer updates
# the frame under Copy-on-Write.
train['Age'] = train['Age'].fillna(train.groupby('Title')['Age'].transform('median'))
test['Age'] = test['Age'].fillna(test.groupby('Title')['Age'].transform('median'))

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
# KDE of age split by survival outcome, over the full observed age range.
# `fill=True` replaces kdeplot's deprecated `shade=True`.
facet = sns.FacetGrid(train, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Age', fill=True)
facet.set(xlim=(0, train['Age'].max()))
facet.add_legend()

plt.show()

In [None]:
# KDE of age split by survival outcome, zoomed to ages 0-20.
# `fill=True` replaces kdeplot's deprecated `shade=True`; the x-limits are set once
# on the grid instead of being set to the full range and then overridden.
facet = sns.FacetGrid(train, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Age', fill=True)
facet.set(xlim=(0, 20))
facet.add_legend()

plt.show()

In [None]:
# KDE of age split by survival outcome, zoomed to ages 20-40.
# `fill=True` replaces kdeplot's deprecated `shade=True`; the x-limits are set once
# on the grid instead of being set to the full range and then overridden.
facet = sns.FacetGrid(train, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Age', fill=True)
facet.set(xlim=(20, 40))
facet.add_legend()

plt.show()

In [None]:
# KDE of age split by survival outcome, zoomed to ages 40-80.
# `fill=True` replaces kdeplot's deprecated `shade=True`; the x-limits are set once
# on the grid instead of being set to the full range and then overridden.
facet = sns.FacetGrid(train, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Age', fill=True)
facet.set(xlim=(40, 80))
facet.add_legend()

plt.show()

##### binning the age and re-coding 
##### child : 0 
##### young : 1 
##### adult : 2 
##### mid-age : 3 
##### senior : 4 

In [None]:
# Replace each age with its ordinal bin code:
#   0: <= 12, 1: (12, 20], 2: (20, 35], 3: (35, 60], 4: > 60
# np.select takes the first matching condition, so the ladder of upper bounds
# reproduces the half-open intervals; rows matching nothing (NaN ages) stay NaN.
for frame in train_test_data:
    age = frame['Age']
    frame['Age'] = np.select(
        [age <= 12, age <= 20, age <= 35, age <= 60, age > 60],
        [0, 1, 2, 3, 4],
        default=np.nan,
    )

In [None]:
train.head(10)

In [None]:
bar_chart('Age')

In [None]:
# Do some embarkation ports carry wealthier passengers?
# Count the ports within each ticket class and stack them into one bar per class.
class_labels = {1: '1st class', 2: '2nd class', 3: '3rd class'}
df = pd.DataFrame([
    train.loc[train['Pclass'] == pclass, 'Embarked'].value_counts()
    for pclass in class_labels
])
df.index = list(class_labels.values())
df.plot(kind='bar', stacked=True)

In [None]:
# Fill missing embarkation ports with 'S' (the original note gives it as the port with the most passengers)
for frame in train_test_data:
    frame.loc[frame['Embarked'].isna(), 'Embarked'] = 'S'

In [None]:
# Encode the embarkation port numerically: S -> 0, C -> 1, Q -> 2
embarked_mapping = dict(S=0, C=1, Q=2)
for frame in train_test_data:
    frame['Embarked'] = frame['Embarked'].map(embarked_mapping)

In [None]:
train.head(10)

In [None]:
#train.isnull().sum()
test.isnull().sum()

In [None]:
# Fill missing fares with the median fare of the passenger's ticket class;
# each frame uses its own medians.
# The result is assigned back to the column: fillna(..., inplace=True) on `frame['Fare']`
# acts on an intermediate object, which pandas warns about and which no longer updates
# the frame under Copy-on-Write.
train['Fare'] = train['Fare'].fillna(train.groupby('Pclass')['Fare'].transform('median'))
test['Fare'] = test['Fare'].fillna(test.groupby('Pclass')['Fare'].transform('median'))

In [None]:
test.isnull().sum()

In [None]:
# KDE of fare split by survival outcome, over the full observed fare range.
# `fill=True` replaces kdeplot's deprecated `shade=True`.
facet = sns.FacetGrid(train, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Fare', fill=True)
facet.set(xlim=(0, train['Fare'].max()))
facet.add_legend()

plt.show()

In [None]:
# KDE of fare split by survival outcome, zoomed to fares 0-20.
# `fill=True` replaces kdeplot's deprecated `shade=True`; the x-limits are set once
# on the grid instead of being set to the full range and then overridden.
facet = sns.FacetGrid(train, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Fare', fill=True)
facet.set(xlim=(0, 20))
facet.add_legend()

plt.show()

In [None]:
# KDE of fare split by survival outcome, zoomed to fares 20-50.
# `fill=True` replaces kdeplot's deprecated `shade=True`; the x-limits are set once
# on the grid instead of being set to the full range and then overridden.
facet = sns.FacetGrid(train, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Fare', fill=True)
facet.set(xlim=(20, 50))
facet.add_legend()

plt.show()

In [None]:
# Replace each fare with its ordinal bin code:
#   0: <= 17, 1: (17, 30], 2: (30, 100], 3: > 100
# np.select takes the first matching condition, so the ladder of upper bounds
# reproduces the half-open intervals; rows matching nothing (NaN fares) stay NaN.
for frame in train_test_data:
    fare = frame['Fare']
    frame['Fare'] = np.select(
        [fare <= 17, fare <= 30, fare <= 100, fare > 100],
        [0, 1, 2, 3],
        default=np.nan,
    )

In [None]:
train.head(10)

In [None]:
train.Cabin.value_counts()

In [None]:
# Keep only the first character of each cabin value; missing cabins stay NaN
for frame in train_test_data:
    frame['Cabin'] = frame['Cabin'].str.slice(stop=1)

In [None]:
train.Cabin.value_counts()

In [None]:
# Is there a relationship between ticket class and cabin letter?
# Count the cabin letters within each ticket class and stack them into one bar per class.
Pclass1 = train[train['Pclass']==1]['Cabin'].value_counts()
Pclass2 = train[train['Pclass']==2]['Cabin'].value_counts()
Pclass3 = train[train['Pclass']==3]['Cabin'].value_counts()

df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ['1st class', '2nd class', '3rd class']
# `stacked` expects a boolean; the previous string 'True' only worked because it is truthy.
df.plot(kind='bar', stacked=True)

In [None]:
# Encode the cabin letter on an evenly spaced scale in steps of 0.4 (A -> 0 ... T -> 2.8).
# NOTE(review): the original comment called this "Ucladian" (Euclidean) distance; presumably
# the small step keeps this feature's range comparable to the other encoded columns - confirm.
cabin_mapping = dict(zip('ABCDEFGT', (0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8)))
for frame in train_test_data:
    frame['Cabin'] = frame['Cabin'].map(cabin_mapping)

In [None]:
train.head(10)

In [None]:
# Fill missing cabin codes with the median code of the passenger's ticket class;
# each frame uses its own medians.
# The result is assigned back to the column: fillna(..., inplace=True) on `frame['Cabin']`
# acts on an intermediate object, which pandas warns about and which no longer updates
# the frame under Copy-on-Write.
train['Cabin'] = train['Cabin'].fillna(train.groupby('Pclass')['Cabin'].transform('median'))
test['Cabin'] = test['Cabin'].fillna(test.groupby('Pclass')['Cabin'].transform('median'))

In [None]:
train.head(10)

In [None]:
# FamilySize counts the passenger plus their siblings/spouses and parents/children aboard
for frame in (train, test):
    frame['FamilySize'] = 1 + frame['SibSp'] + frame['Parch']

In [None]:
# KDE of family size split by survival outcome, over the full observed range.
# `fill=True` replaces kdeplot's deprecated `shade=True`.
facet = sns.FacetGrid(train, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'FamilySize', fill=True)
facet.set(xlim=(0, train['FamilySize'].max()))
facet.add_legend()

plt.show()

In [None]:
# KDE of family size split by survival outcome, zoomed to sizes 0-2.
# `fill=True` replaces kdeplot's deprecated `shade=True`; the x-limits are set once
# on the grid instead of being set to the full range and then overridden.
facet = sns.FacetGrid(train, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'FamilySize', fill=True)
facet.set(xlim=(0, 2))
facet.add_legend()

plt.show()

In [None]:
# KDE of family size split by survival outcome, zoomed to sizes 2-6.
# `fill=True` replaces kdeplot's deprecated `shade=True`; the x-limits are set once
# on the grid instead of being set to the full range and then overridden.
facet = sns.FacetGrid(train, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'FamilySize', fill=True)
facet.set(xlim=(2, 6))
facet.add_legend()

plt.show()

In [None]:
# Encode family size on an evenly spaced scale in steps of 0.4 (1 -> 0 ... 11 -> 4).
# Sizes outside 1-11 are not in the mapping and become NaN.
# NOTE(review): the original comment called this "Ucladian" (Euclidean) distance; presumably
# the small step keeps this feature's range comparable to the other encoded columns - confirm.
family_mapping = dict(zip(range(1, 12), (0, 0.4, 0.8, 1.2, 1.6, 2, 2.4, 2.8, 3.2, 3.6, 4)))
for frame in train_test_data:
    frame['FamilySize'] = frame['FamilySize'].map(family_mapping)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
#train.isnull().sum()
test.isnull().sum()

In [None]:
# Drop the columns that are no longer needed.
# train/test are rebound to new frames here, so the train_test_data list keeps
# pointing at the pre-drop frames from this point on.
features_drop = ['Ticket', 'SibSp', 'Parch']
train, test = (frame.drop(columns=features_drop) for frame in (train, test))

In [None]:
# Only the training frame loses PassengerId; test keeps it for building the submission file
train = train.drop(columns=['PassengerId'])

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Separate the label column from the feature matrix
target = train['Survived']
train_data = train.drop(columns='Survived')
train_data.shape, target.shape

In [None]:
train_data.head(10)

In [None]:
train_data.info()

### Part III. Modeling 

##### About Classification Algorithms 
###### 1. kNN (k-Nearest Neighbors) : 근접한 k개의 이웃 변수들의 결과에 따라 판단 
###### 2. Decision tree : 의사결정 나무 
###### 3. Random Forest : 다수의 의사결정 나무들이 도출한 결과값들에 대한 다수결 
###### 4. Naive Bayes : 여러 변수들(Sex, Age, Pclass, ...) 각각이 y값(Live or Dead)에 도달할 확률의 총계 
###### 5. SVM (Support Vector Machine) : 기준 vector에 대해 해당 case의 위치로써 해당 case의 결과(Live or Dead) 판단 

In [None]:
# import the Classifier pkgs from Scikit-learn 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
train.info()

In [None]:
# Cross-validation setup shared by all models below
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score
# 10 shuffled folds; the fixed random_state keeps the fold assignment identical across runs
k_fold = KFold(n_splits = 10, shuffle = True, random_state = 0)

In [None]:
# 1. kNN: cross-validating on the 10 folds of train_data yields 10 accuracy scores
scoring = 'accuracy'
clf = KNeighborsClassifier(n_neighbors=13)
score = cross_val_score(estimator=clf, X=train_data, y=target,
                        scoring=scoring, cv=k_fold, n_jobs=1)
score

In [None]:
# Average the 10 accuracy scores and express the result on a 0-100 scale
round(score.mean() * 100, 2)

In [None]:
# 2. Decision tree
# random_state is fixed so the cross-validated scores are the same on every run.
clf = DecisionTreeClassifier(random_state=0)
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
score

In [None]:
round(np.mean(score)*100, 2)

In [None]:
# 3. Random Forest
# random_state is fixed so the cross-validated scores are the same on every run.
clf = RandomForestClassifier(n_estimators=13, random_state=0)
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
score

In [None]:
round(np.mean(score)*100, 2)

In [None]:
# 4. Naive Bayes: one accuracy score per fold
scoring = 'accuracy'
clf = GaussianNB()
score = cross_val_score(estimator=clf, X=train_data, y=target,
                        scoring=scoring, cv=k_fold, n_jobs=1)
score

In [None]:
round(np.mean(score)*100, 2)

In [None]:
# 5. SVM: one accuracy score per fold
scoring = 'accuracy'
clf = SVC(gamma='auto')
score = cross_val_score(estimator=clf, X=train_data, y=target,
                        scoring=scoring, cv=k_fold, n_jobs=1)
score

In [None]:
round(np.mean(score)*100, 2)

##### Summary of the model comparison (mean 10-fold CV accuracy, %) 
##### 1. kNN = 82.6 
##### 2. Decision tree = 79.91 
##### 3. Random Forest = 80.58 
##### 4. Naive Bayes = 78.78 
##### 5. SVM = 83.5 

In [None]:
#test.head()
train.head()

In [None]:
# Fit the final SVM on the full training set and predict the test set
clf = SVC(gamma='auto')
clf.fit(train_data, target)

# Select the test features by the training column names, so the model receives exactly
# the columns it was fitted on, in the same order, and a missing column fails loudly
# with a KeyError instead of relying on the two frames happening to line up.
test_data = test[train_data.columns].copy()
prediction = clf.predict(test_data)

In [None]:
# Pair each test PassengerId with its predicted label and write the result
# to the working directory without the index column.
submission = test[['PassengerId']].assign(Survived=prediction)

submission.to_csv('submission.csv', index=False)

In [None]:
# Read the written file back and show the first 20 rows as a sanity check
submission = pd.read_csv('submission.csv')
submission.head(20)

In [None]:
# Also store a copy under output/. The directory is created first because
# to_csv does not create missing parent directories.
output_path = Path('output') / 'submission.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)