# Get your hands dirty ! All on your own 
##### (If you don't know what to do next. Don't be shy. Ask me anything

In this section, we are sloving classification problem again. The objective is to predict whether the person survived or not from titanic dataset. 



### Overview of the dataset

The data has been split into two groups:

1. training set (train.csv)

2. test set (test.csv)

The training set should be used to build your machine learning models. For the training set, we provide the outcome (also known as the “ground truth”) for each passenger. Your model will be based on “features” like passengers’ gender and class. You can also use feature engineering to create new features.

The test set should be used to see how well your model performs on unseen data. For the test set, we do not provide the ground truth for each passenger. It is your job to predict these outcomes. For each passenger in the test set, use the model you trained to predict whether or not they survived the sinking of the Titanic.

In [1]:
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [2]:
df_train = pd.read_csv('train.csv', encoding='utf-8', low_memory=False)
df_test = pd.read_csv('test.csv', encoding='utf-8', low_memory=False)

# Understanding of the dataset

In [3]:
df_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# check the missing values in which column

print(df_train.columns[df_train.isnull().any()].tolist())

['Age', 'Cabin', 'Embarked']


In [6]:
# see the frequencies value in Embarked column

df_train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
# imput missing values to Age and Embarked as we dont need Cabin
# as you can see from the above result, S is the most frequently
# occured, so I decide to impute the missing values in Embarked 
# with 'S'
# while for Age I prefer impute it using mean

df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())
df_train['Embarked'] = df_train['Embarked'].fillna('S')

In [8]:
# check the value in Sex column

df_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [9]:
# since LabelEncoder only takes 1d array as an input, so need to use cat.codes to encode string to numeric

df_train.Sex = pd.Categorical(df_train.Sex)
df_test.Sex = pd.Categorical(df_test.Sex)
df_train['SexCode'] = df_train.Sex.cat.codes
df_test['SexCode'] = df_test.Sex.cat.codes

df_train.Embarked = pd.Categorical(df_train.Embarked)
df_test.Embarked = pd.Categorical(df_test.Embarked)
df_train['EmbarkedCode'] = df_train.Embarked.cat.codes
df_test['EmbarkedCode'] = df_test.Embarked.cat.codes

In [10]:
# now check the new column for train dataset

df_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexCode,EmbarkedCode
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,2


In [11]:
# new column for test dataset

df_test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexCode,EmbarkedCode
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,2
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,2


In [12]:
# split data into input and output
# remove column which I think not important such as
# name, ticket, cabin, passenger id
# also remove column Embarked and Sex since we have
# EmbarkedCode and SexCode

train = df_train.drop(['Survived','Name','Ticket','Cabin','PassengerId','Embarked','Sex'], axis=1)
X_train = train.values
X_train = X_train.astype('int')
Y_train = df_train['Survived'].values
Y_train = Y_train.astype('int')

test = df_test.drop(['Name','Ticket','Cabin','PassengerId','Embarked','Sex'], axis=1)
X_test = test.values
X_test = X_test.astype('int')

In [13]:
# split the data into training and validation set

seed = 42
validation_size = 20
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X_train, Y_train, test_size=validation_size, random_state=seed)

# Modelling and Prediction

In [14]:
# Setup the models
# Using 4 different algorithms which are Logistic Regression
# Decision Tree, Support Vector Machine and Multi Layer
# Perceptron

seed = 42
scoring = 'accuracy'
models = []
models.append(('LR', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('SVM', SVC()))
models.append(('NN', MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)))

In [15]:
# evaluate each model in turn
results = []
names = []
for name, model in models:
    
    # kfold here means we split the dataset into 10 fold
    # so there will be 9 fold for training and 1 fold
    # for validate, we will do this 10 times using different
    # fold.
    
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.793312 (0.061121)
CART: 0.772636 (0.037549)
SVM: 0.700287 (0.044369)
NN: 0.738231 (0.049779)


In [16]:
# read test file

df_testY = pd.read_csv('gender_submission.csv')
Y_test = df_testY['Survived'].values

In [17]:
# train each model AND test your model with new dataset

cls = LogisticRegression()
cls.fit(X_train, Y_train)
Y_predict = cls.predict(X_test)
print('LR: '+ str(accuracy_score(Y_predict,Y_test)))

cls = DecisionTreeClassifier()
cls.fit(X_train, Y_train)
Y_predict = cls.predict(X_test)
print('CART: '+ str(accuracy_score(Y_predict,Y_test)))

cls = SVC()
cls.fit(X_train, Y_train)
Y_predict = cls.predict(X_test)
print('SVM: '+ str(accuracy_score(Y_predict,Y_test)))

cls = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
cls.fit(X_train, Y_train)
Y_predict = cls.predict(X_test)
print('NN: '+ str(accuracy_score(Y_predict,Y_test)))


LR: 0.820574162679
CART: 0.665071770335
SVM: 0.543062200957
NN: 0.648325358852
