### Data
- Survived - Survival (0 = No; 1 = Yes)
- Pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
- Sex - Sex of the passenger (male / female)
- Age - Age in years
- SibSp - Number of siblings / spouses aboard the Titanic
- Parch - Number of parents / children aboard the Titanic
- Ticket - Ticket number
- Fare - Passenger fare
- Cabin - Cabin number
- Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

In [154]:
import pandas as pd
import numpy as np

In [155]:
# Load the labelled training split; the trailing expression displays (rows, columns).
train = pd.read_csv('../data/titanic/train.csv')
train.shape

(891, 12)

In [156]:
# Load the test split; the trailing expression displays (rows, columns).
test = pd.read_csv('../data/titanic/test.csv')
test.shape

(418, 11)

In [157]:
# Peek at the first rows to see what the raw columns look like.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [158]:
# Column dtypes and non-null counts: reveals which columns contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [159]:
# Summary statistics for the numeric columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [160]:
# Missing-value count for every column of the training frame.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [161]:
# Impute missing ages with the column mean into a new 'Age_mean' column,
# leaving the raw 'Age' column untouched.
# The fillna() result is assigned directly: calling fillna(inplace=True) on the
# selection train['Age_mean'] is chained assignment, which pandas warns about
# and which no longer updates the frame once Copy-on-Write is enabled.
train['Age_mean'] = train['Age'].fillna(train['Age'].mean())

In [162]:
# Same imputation for the test split: fill missing ages with the column mean
# into 'Age_mean', keeping the raw 'Age' column intact. The fillna() result is
# assigned directly rather than mutated with inplace=True on a column selection
# (chained assignment, which stops working under pandas Copy-on-Write).
# NOTE(review): this uses the test split's own mean; consider reusing
# train['Age'].mean() so both splits share the statistic the model is fit on.
test['Age_mean'] = test['Age'].fillna(test['Age'].mean())

In [163]:
# The raw Age column keeps its missing values.
train['Age'].isna().sum()

177

In [164]:
# The imputed column should have no missing values left.
train['Age_mean'].isna().sum()

0

In [165]:
# Sanity check: filling gaps with the mean should leave the column mean unchanged.
train['Age_mean'].mean()

29.69911764705882

In [166]:
# Mean of the raw Age column, for comparison with the imputed Age_mean column.
train['Age'].mean()

29.69911764705882

In [167]:
# Distribution of the Sex column before encoding it.
train['Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [168]:
# Encode sex as a boolean feature on both splits: True for female passengers.
train['Gender'] = train['Sex'].eq('female')
test['Gender'] = test['Sex'].eq('female')

In [169]:
# Confirm the encoding: True marks rows whose Sex is 'female'.
train['Gender'].head()

0    False
1     True
2     True
3     True
4    False
Name: Gender, dtype: bool

In [170]:
# Passenger counts per embarkation port.
train['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [171]:
# How many training rows have no embarkation port recorded.
train['Embarked'].isna().sum()

2

In [172]:
# How many test rows have no embarkation port recorded.
test['Embarked'].isna().sum()

0

In [173]:
# One boolean indicator column per embarkation port (S, C, Q).
# Rows with a missing port compare False in all three columns.
for port in ('S', 'C', 'Q'):
    train[f'Embarked_{port}'] = train['Embarked'] == port

In [174]:
# Spot-check the Southampton indicator on the first rows.
train['Embarked_S'].head()

0     True
1    False
2     True
3     True
4     True
Name: Embarked_S, dtype: bool

In [175]:
# Summing booleans counts the True rows; this should match the 'S' count from value_counts().
train['Embarked_S'].sum()

644

In [176]:
# Show the original port next to its three indicator columns to verify the encoding.
train[['Embarked', 'Embarked_S', 'Embarked_C', 'Embarked_Q']].head()

Unnamed: 0,Embarked,Embarked_S,Embarked_C,Embarked_Q
0,S,True,False,False
1,C,False,True,False
2,S,True,False,False
3,S,True,False,False
4,S,True,False,False


In [177]:
# Mirror the training encoding: one boolean indicator column per port (S, C, Q).
for port in ('S', 'C', 'Q'):
    test[f'Embarked_{port}'] = test['Embarked'] == port

In [178]:
# Family size = siblings/spouses + parents/children + the passenger themself.
train['FamilySize'] = train['SibSp'].add(train['Parch']).add(1)

In [179]:
# Distribution of family sizes; used to choose the Single / Medium / Large buckets.
train["FamilySize"].value_counts()

FamilySize
1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: count, dtype: int64

In [180]:
# Bucket family size into 'S' (1), 'M' (2-4) and 'L' (5 or more).
# pd.cut builds the label column in one step. The earlier approach copied the
# int64 FamilySize column and then wrote strings into it through .loc, an
# incompatible-dtype assignment that pandas has deprecated.
# Bins are right-inclusive: (0, 1] -> S, (1, 4] -> M, (4, inf) -> L.
train['Family'] = pd.cut(
    train['FamilySize'],
    bins=[0, 1, 4, np.inf],
    labels=['S', 'M', 'L'],
).astype(object)  # plain string labels, as before, rather than a categorical

  train.loc[train['FamilySize'] == 1, 'Family'] = 'S'


In [181]:
# Verify the bucket label against the underlying family size.
train[['Family', 'FamilySize']].head()

Unnamed: 0,Family,FamilySize
0,M,2
1,M,2
2,S,1
3,M,2
4,S,1


In [182]:
# One boolean indicator column per family-size bucket (S, M, L).
for size_label in ('S', 'M', 'L'):
    train[f'Family_{size_label}'] = train['Family'] == size_label

In [183]:
# Show the bucket label next to its three indicator columns to verify the encoding.
train[['Family', 'Family_S', 'Family_M', 'Family_L']].head()

Unnamed: 0,Family,Family_S,Family_M,Family_L
0,M,False,True,False
1,M,False,True,False
2,S,True,False,False
3,M,False,True,False
4,S,True,False,False


In [184]:
# Repeat the family-size features on the test split.
# Family size = siblings/spouses + parents/children + the passenger themself.
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

# Bucket into 'S' (1), 'M' (2-4) and 'L' (5 or more) with pd.cut instead of
# copying the numeric FamilySize column and overwriting it with strings via
# .loc, which is an incompatible-dtype assignment that pandas has deprecated.
# Bins are right-inclusive: (0, 1] -> S, (1, 4] -> M, (4, inf) -> L.
test['Family'] = pd.cut(
    test['FamilySize'],
    bins=[0, 1, 4, np.inf],
    labels=['S', 'M', 'L'],
).astype(object)  # plain string labels, as before, rather than a categorical

  test.loc[test['FamilySize'] == 1, 'Family'] = 'S'


In [185]:
# Mirror the training encoding: one boolean indicator column per family-size bucket.
for size_label in ('S', 'M', 'L'):
    test[f'Family_{size_label}'] = test['Family'] == size_label

In [186]:
# How many training passengers fall into each family-size bucket.
train['Family'].value_counts()

Family
S    537
M    292
L     62
Name: count, dtype: int64

In [187]:
# Summary statistics for Fare (not part of the feature set selected for the model in this notebook).
train['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [188]:
# Passenger counts per ticket class.
train['Pclass'].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [189]:
# Treat ticket class as an unordered categorical rather than a number,
# then display the resulting dtype.
train['Pclass'] = train['Pclass'].astype(pd.CategoricalDtype())
train['Pclass'].dtype

CategoricalDtype(categories=[1, 2, 3], ordered=False, categories_dtype=int64)

In [190]:
# Columns fed to the model: sex flag, imputed age, then the port and
# family-size indicator columns (in S/C/Q and S/M/L order respectively).
feature_names = (
    ['Gender', 'Age_mean']
    + [f'Embarked_{port}' for port in 'SCQ']
    + [f'Family_{bucket}' for bucket in 'SML']
)

In [191]:
# Training feature matrix: every row, only the selected feature columns.
X_train = train.loc[:, feature_names]
X_train.head()

Unnamed: 0,Gender,Age_mean,Embarked_S,Embarked_C,Embarked_Q,Family_S,Family_M,Family_L
0,False,22.0,True,False,False,False,True,False
1,True,38.0,False,True,False,False,True,False
2,True,26.0,True,False,False,True,False,False
3,True,35.0,True,False,False,False,True,False
4,False,35.0,True,False,False,True,False,False


In [192]:
# List every column now on the training frame, including the engineered ones.
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_mean', 'Gender',
       'Embarked_S', 'Embarked_C', 'Embarked_Q', 'FamilySize', 'Family',
       'Family_S', 'Family_M', 'Family_L'],
      dtype='object')

In [193]:
# Target vector: the Survived column (0 = No, 1 = Yes).
y_label = train['Survived']

In [194]:
# Test feature matrix with the same columns, in the same order, as X_train.
X_test = test.loc[:, feature_names]
X_test.head()

Unnamed: 0,Gender,Age_mean,Embarked_S,Embarked_C,Embarked_Q,Family_S,Family_M,Family_L
0,False,34.5,False,False,True,True,False,False
1,True,47.0,True,False,False,False,True,False
2,False,62.0,False,False,True,True,False,False
3,False,27.0,True,False,False,True,False,False
4,True,22.0,True,False,False,False,True,False


### Modeling

In [195]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (depth capped at 3) with a fixed random_state.
model = DecisionTreeClassifier(max_depth=3, random_state=2018)
model

In [196]:
# Fit the tree on the full training set.
model.fit(X_train, y_label)

In [197]:
from sklearn.model_selection import KFold, cross_val_score
# 10 shuffled folds with a fixed random_state.
k_fold = KFold(n_splits=10, shuffle=True, random_state=2018)

In [198]:
# Cross-validate the tree on the training data using the k_fold splitter;
# `score` holds one accuracy value per fold.
scoring = 'accuracy'
score = cross_val_score(model, X_train, y_label, cv=k_fold, n_jobs=-1, scoring=scoring)
print(score)

[0.83333333 0.87640449 0.87640449 0.83146067 0.84269663 0.78651685
 0.80898876 0.83146067 0.69662921 0.83146067]


In [199]:
# Mean cross-validated accuracy, as a percentage rounded to two decimals.
round(score.mean() * 100, 2)

82.15

In [200]:
# Predict survival for the test rows; the shape check confirms one prediction per row.
prediction = model.predict(X_test)
prediction.shape

(418,)

In [201]:
# Attach the predicted labels to the test frame.
test['Survived'] = prediction

In [202]:
# Confirm the test frame now carries the predicted 'Survived' column.
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_mean', 'Gender',
       'Embarked_S', 'Embarked_C', 'Embarked_Q', 'FamilySize', 'Family',
       'Family_S', 'Family_M', 'Family_L', 'Survived'],
      dtype='object')

In [203]:
# Two-column frame: passenger id plus the predicted survival label.
submission = test.loc[:, ['PassengerId', 'Survived']]
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
