# Titanic Survivor prediction
## dataset
### file
- dataset/train.csv
- dataset/test.csv
### Data fields
- survival : 0 - dead, 1 - survive
- pclass : ticket class
- sex
- age
- sibsp : total of brothers and partner
- parch : total of sons and daughters and parents
- tikect : number of ticket
- fare : boarding charge
- cabin : number of cabin
- embarked : boarding Port

## 1. Load data

In [1]:
# import library for data analysis
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [46]:
# load data in local
dataset = pd.read_csv('dataset/train.csv')
dataset_name = dataset.columns
print(dataset.info())
print(dataset.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.

### Data analysis
- RangeIndex : 891
- Missing value : Age, Cabin, Embarked
    - Age : insert mean
    - Cabin, Embarked : delete columns
 
## 2. Feature Engineering

In [47]:
# insert mean in age column
data = dataset.copy()
data['Age'] = data['Age'].replace(np.nan, '29.6')

In [48]:
# delete Cabin and Embarked columns
del data['Cabin']
del data['Embarked']

In [49]:
# check data
print(data.info())
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null object
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
dtypes: float64(1), int64(5), object(4)
memory usage: 69.7+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05


In [50]:
# Sex change 0 and 1  ( male > 1, female > 0)
data['Sex'] = data['Sex'].replace('male', 1)
data['Sex'] = data['Sex'].replace('female', 0)

In [51]:
# feature selection
# delete Name and Ticket
del data['Name']
del data['Ticket']
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,1,22,1,0,7.25
1,2,1,1,0,38,1,0,71.2833
2,3,1,3,0,26,0,0,7.925
3,4,1,1,0,35,1,0,53.1
4,5,0,3,1,35,0,0,8.05


In [64]:
# pandas data save
data.to_pickle('dataset/titanic_dataset')

## 3. Data split

In [52]:
# import library for data split
from sklearn.model_selection import train_test_split

In [53]:
# data devide X, y 
y = data.pop('Survived')
X = data

In [54]:
# data split train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [55]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(712, 7) (179, 7) (712,) (179,)


## 4. build model

In [56]:
# import library for model
from sklearn.ensemble import RandomForestClassifier

### 4.1. base model

In [59]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [60]:
clf.score(X_test, y_test)

0.8156424581005587

### 4.2. change depth None

In [61]:
clf = RandomForestClassifier(max_depth=None, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [62]:
clf.score(X_test, y_test)

0.8659217877094972