In [355]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [356]:
df = pd.read_csv('train.csv',sep=',')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [357]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [358]:
# mean age is 29.69. Filling the NaNs in Age with the average age of 30.
df = df.fillna(30)

In [359]:
df.Embarked.unique()

array(['S', 'C', 'Q', 30], dtype=object)

In [360]:
df = df.replace({'Embarked' : { 'S' : 0, 'C' : 1, 'Q' : 2}})
# Use 0 to represent male and 1 to represent female.
df = df.replace({'Sex' : { 'male' : 0, 'female' : 1}})

In [361]:
# Drop string columns that does not help with the model.
df = df.drop(df.columns[[3, 8, 10]], axis=1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0,22.0,1,0,7.25,0
1,2,1,1,1,38.0,1,0,71.2833,1
2,3,1,3,1,26.0,0,0,7.925,0
3,4,1,1,1,35.0,1,0,53.1,0
4,5,0,3,0,35.0,0,0,8.05,0


In [362]:
# Split train and test set
X, y = df.iloc[:,df.columns != 'Survived'], df.iloc[:, 1]
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify = y, random_state = 60)

## Random Forest Model

In [363]:
for n in range(1,9):
    ran_forest = RandomForestClassifier(random_state = 24, max_features = n)
    ran_forest.fit(X_train, y_train)
    print('Accuracy on training set with {} max features: {:3f}'.format(n, ran_forest.score(X_train, y_train)))
    print('Accuracy on test set with {} max features: {:3f}'.format(n, ran_forest.score(X_test, y_test)))

Accuracy on training set with 1 max features: 0.982036
Accuracy on test set with 1 max features: 0.780269
Accuracy on training set with 2 max features: 0.977545
Accuracy on test set with 2 max features: 0.798206
Accuracy on training set with 3 max features: 0.983533
Accuracy on test set with 3 max features: 0.820628
Accuracy on training set with 4 max features: 0.980539
Accuracy on test set with 4 max features: 0.798206
Accuracy on training set with 5 max features: 0.980539
Accuracy on test set with 5 max features: 0.816143
Accuracy on training set with 6 max features: 0.977545
Accuracy on test set with 6 max features: 0.789238
Accuracy on training set with 7 max features: 0.977545
Accuracy on test set with 7 max features: 0.798206
Accuracy on training set with 8 max features: 0.974551
Accuracy on test set with 8 max features: 0.802691


## Gradient Boosted Decision Tree

In [364]:
for n in range(1,9):
    gradient = GradientBoostingClassifier(random_state = 24, max_depth = n)
    gradient.fit(X_train, y_train)
    print('Accuracy on training set with {} max depth: {:3f}'.format(n, gradient.score(X_train, y_train)))
    print('Accuracy on test set with {} max depth: {:3f}'.format(n, gradient.score(X_test, y_test)))

Accuracy on training set with 1 max depth: 0.812874
Accuracy on test set with 1 max depth: 0.811659
Accuracy on training set with 2 max depth: 0.850299
Accuracy on test set with 2 max depth: 0.816143
Accuracy on training set with 3 max depth: 0.910180
Accuracy on test set with 3 max depth: 0.820628
Accuracy on training set with 4 max depth: 0.953593
Accuracy on test set with 4 max depth: 0.816143
Accuracy on training set with 5 max depth: 0.986527
Accuracy on test set with 5 max depth: 0.811659
Accuracy on training set with 6 max depth: 1.000000
Accuracy on test set with 6 max depth: 0.793722
Accuracy on training set with 7 max depth: 1.000000
Accuracy on test set with 7 max depth: 0.780269
Accuracy on training set with 8 max depth: 1.000000
Accuracy on test set with 8 max depth: 0.793722


In [365]:
# Modify the test DataFrame so it matches with the training set
df_test = pd.read_csv('test.csv',sep=',')
df_test = df_test.fillna(30)
df_test = df_test.replace({'Embarked' : { 'S' : 0, 'C' : 1, 'Q' : 2}})
df_test = df_test.replace({'Sex' : { 'male' : 0, 'female' : 1}})
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,30,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,30,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,30,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,30,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,30,0


In [366]:
df_test = df_test.drop(df_test.columns[[2,7, 9]], axis = 1)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,0,34.5,0,0,7.8292,2
1,893,3,1,47.0,1,0,7.0,0
2,894,2,0,62.0,0,0,9.6875,2
3,895,3,0,27.0,0,0,8.6625,0
4,896,3,1,22.0,1,1,12.2875,0


### I decided to use Random Forest Model with 3 max features.

In [367]:
titanic_model = RandomForestClassifier(random_state = 44, max_features = 3)
titanic_model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=44, verbose=0, warm_start=False)

In [368]:
result = titanic_model.predict(df_test)

In [369]:
df_test['Survived'] = result

In [370]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,892,3,0,34.5,0,0,7.8292,2,0
1,893,3,1,47.0,1,0,7.0,0,0
2,894,2,0,62.0,0,0,9.6875,2,0
3,895,3,0,27.0,0,0,8.6625,0,0
4,896,3,1,22.0,1,1,12.2875,0,0


In [371]:
df_test = df_test.drop(df_test.columns[[1,2,3,4,5,6,7]], axis = 1)

In [372]:
df_test.to_csv('prediction.csv')

#### Remove the first column of prediction.csv and rename as prediction_titanic.csv
#### And Submit!!

### The Final Result was 0.78, which is disappointing. I will try gradient Boosting Model next.

In [256]:
#Fit the Gradient Boosting Model with max_depth = 3
gradient_titanic = GradientBoostingClassifier(random_state = 24, max_depth = 3)
gradient_titanic.fit(X,y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=24, subsample=1.0, verbose=0,
              warm_start=False)

In [257]:
result = gradient_titanic.predict(df_test)

In [258]:
df_test['Survived'] = result

In [259]:
df_test = df_test.drop(df_test.columns[[1,2,3,4,5,6,7]], axis = 1)
df_test.head()
df_test.to_csv('prediction2.csv')