# 0. Libraries and Datasets

## 0.1. Import Libraries

In [15]:
import pandas            as pd
import numpy             as np
import seaborn           as sns
import matplotlib.pyplot as plt
import xgboost           as xgb
import warnings

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error
from IPython.core.display import display, HTML



%matplotlib inline
warnings.filterwarnings('ignore')
display(HTML("<style>.container { width:80% !important; }</style>"))

## 0.2. Importing Datasets

In [5]:
# Read the train and test CSV files (paths are relative to the notebook's working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# 1. Data Understanding

## 1.1. Datasets Dimensions

In [100]:
# Work on an independent (deep) copy so the loaded `train` frame stays untouched.
df1 = train.copy(deep=True)

In [37]:
# Preview the first five rows of the training copy.
df1.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,G6,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
# Preview the first five rows of the test set.
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [38]:
# Report (rows, cols) for both frames; each DataFrame.shape unpacks into two placeholders.
print('Train Dataset\n\nRows: {}\nCols: {}\n_____________________\n\nTest Dataset\n\nRows: {}\nCols: {}'.format(*df1.shape, *test.shape))

Train Dataset

Rows: 891
Cols: 12
_____________________

Test Dataset

Rows: 418
Cols: 11


## 1.2. Dataset dtypes

In [39]:
# Show the dtype pandas assigned to each column of the training copy.
df1.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [18]:
# Show the dtype pandas assigned to each column of the test set.
test.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## 1.3. Checking Missing Values

In [50]:
# Count missing values per column in the training copy, largest count first.
df1.isna().sum().sort_values(ascending=False)

Cabin          685
Embarked         2
Fare             0
Ticket           0
Parch            0
SibSp            0
Age              0
Sex              0
Name             0
Pclass           0
Survived         0
PassengerId      0
dtype: int64

In [21]:
# Count missing values per column in the test set, largest count first.
test.isna().sum().sort_values(ascending=False)

Cabin          327
Age             86
Fare             1
Embarked         0
Ticket           0
Parch            0
SibSp            0
Sex              0
Name             0
Pclass           0
PassengerId      0
dtype: int64

## 1.4. Dealing with missing values

In [101]:
# Impute missing values in the training copy.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the df1[...] selection: that chained in-place form operates on a temporary
# object and leaves df1 unchanged when pandas Copy-on-Write is active.
df1['Age'] = df1['Age'].fillna(train['Age'].median())                # fill with the median age
df1['Embarked'] = df1['Embarked'].fillna(df1['Embarked'].mode()[0])  # fill with the most frequent value

In [102]:
# List the column labels of the training copy.
df1.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [103]:
# Keep only the listed columns, in this order; 'Cabin' is not in the list and is dropped.
df1 = df1.loc[:, [
    'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Embarked',
]]

In [104]:
# Re-count missing values per column after imputation and column selection.
df1.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [99]:
# Show the most frequent 'Embarked' value (first entry of the mode result).
df1['Embarked'].mode()[0]

'S'

In [112]:
# Impute missing values in the test set using the test set's own statistics.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the test[...] selection: that chained in-place form operates on a temporary
# object and leaves `test` unchanged when pandas Copy-on-Write is active.
test['Age'] = test['Age'].fillna(test['Age'].median())   # fill with the median age
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())  # fill with the mean fare

In [113]:
# Keep only the listed columns, in this order; 'Cabin' is not in the list and is dropped.
test = test.loc[:, [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Embarked',
]]

In [114]:
# Re-count missing values per column in the test set after imputation and column selection.
test.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [119]:
# Summary statistics for the numeric columns, transposed so each row is one column of df1.
df1.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,891.0,29.361582,13.019697,0.42,22.0,28.0,35.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


# 2. Feature Engineering

In [150]:
# Start the feature-engineering stage from an independent (deep) copy of df1.
df2 = df1.copy(deep=True)

In [151]:
# Preview the first five rows of the feature-engineering frame.
# NOTE(review): the saved output already lists Family_Size / Fare_Per_Person, which the
# visible cells only create further down -- looks stale; confirm with Restart & Run All.
df2.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Family_Size,Fare_Per_Person,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,3.625,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,35.64165,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,7.925,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,26.55,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0,8.05,0


In [152]:
# Inspect ten randomly drawn rows (no seed is set, so the rows differ on every run).
df2.sample(n=10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Family_Size,Fare_Per_Person,Survived
73,74,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,C,1,7.2271,0
316,317,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24.0,1,0,244367,26.0,S,1,13.0,1
90,91,3,"Christmann, Mr. Emil",male,29.0,0,0,343276,8.05,S,0,8.05,0
565,566,3,"Davies, Mr. Alfred J",male,24.0,2,0,A/4 48871,24.15,S,2,8.05,0
725,726,3,"Oreskovic, Mr. Luka",male,20.0,0,0,315094,8.6625,S,0,8.6625,0
783,784,3,"Johnston, Mr. Andrew G",male,28.0,1,2,W./C. 6607,23.45,S,3,5.8625,0
794,795,3,"Dantcheff, Mr. Ristiu",male,25.0,0,0,349203,7.8958,S,0,7.8958,0
113,114,3,"Jussila, Miss. Katriina",female,20.0,1,0,4136,9.825,S,1,4.9125,0
574,575,3,"Rush, Mr. Alfred George John",male,16.0,0,0,A/4. 20589,8.05,S,0,8.05,0
548,549,3,"Goldsmith, Mr. Frank John",male,33.0,1,1,363291,20.525,S,2,6.841667,0


## 2.1. Hypotheses

**1.** Younger passengers had a better chance of surviving.

**2.** Passengers travelling in a higher class had a better chance of surviving.

**3.** The more expensive the fare, the better the chance of surviving.

**4.** The larger the family travelling with a passenger, the lower the chance of surviving.

## 2.2. Feature Engineering

In [124]:
# Count how many rows share each 'Ticket' value.
df1['Ticket'].value_counts()

1601        7
CA. 2343    7
347082      7
3101295     6
347088      6
           ..
4579        1
345783      1
A/5 3540    1
8475        1
218629      1
Name: Ticket, Length: 681, dtype: int64

In [153]:
def add_family_features(frame):
    """Return a copy of ``frame`` with the two family-derived columns added.

    Family_Size     -- SibSp + Parch.
    Fare_Per_Person -- Fare / (Family_Size + 1).

    The input frame must contain the 'SibSp', 'Parch' and 'Fare' columns.
    """
    return frame.assign(
        Family_Size=lambda d: d['SibSp'] + d['Parch'],
        # The +1 counts the passenger; without it, rows with Family_Size == 0
        # divide by zero and produce inf (or NaN when Fare is also 0).
        Fare_Per_Person=lambda d: d['Fare'] / (d['Family_Size'] + 1),
    )


# Apply the same transformation to both splits so their features stay consistent.
df2 = add_family_features(df2)
test = add_family_features(test)

In [154]:
# Reorder the columns so the engineered features follow the raw ones and 'Survived' comes last.
df2 = df2.loc[:, [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Embarked', 'Family_Size',
    'Fare_Per_Person', 'Survived',
]]

In [144]:
# Preview the first five rows of df1.
# NOTE(review): the saved output lists Family_Size / Fare_Per_Person, but no visible cell
# adds them to df1 -- likely stale output from an earlier run; confirm with Restart & Run All.
df1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Family_Size,Fare_Per_Person
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,3.625
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,35.64165
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,26.55
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0,8.05


In [126]:
# Count how many rows have each 'SibSp' value.
df1['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [155]:
# Preview the first five rows of df2 after the features were added and the columns reordered.
df2.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Family_Size,Fare_Per_Person,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,3.625,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,35.64165,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,7.925,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,26.55,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0,8.05,0
