## Titanic Survival Prediction (Machine Learning)

Predict which passengers will survive in the Titanic shipwreck using a Support Vector Machine (SVM) classifier.

#### Import Packages and Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
test = pd.read_csv("data/test.csv")
train = pd.read_csv("data/train.csv")

In [3]:
print("Test set:",test.shape)
print("Train set:",train.shape)

Test set: (418, 11)
Train set: (891, 12)


In [4]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Exploratory Data Analysis

In [6]:
# Check data for null values
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Descriptive statistics for numerical values
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# Check ratio of males to females
train.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

#### Data Cleaning

In [9]:
# Drop missing values from the train and test set
train.dropna(axis=0, inplace=True)
test.dropna(axis=0, inplace=True)

In [10]:
# Converting categorical features to numerical
train['Embarked'] = train['Embarked'].map({'C':1, 'S':2, 'Q':3})
train['Sex']      = train['Sex'].map({'male': 1,'female':0})
train['CabinGroup'] = train['Cabin'].map({'H': 0, 'M': 1, 'L': 2, 'X':3})

test['Embarked'] = test['Embarked'].map({'C':1, 'S':2, 'Q':3})
test['Sex']      = test['Sex'].map({'male': 1,'female':0})
test['CabinGroup'] = test['Cabin'].map({'H': 0, 'M': 1, 'L': 2, 'X':3})

#### Classification using SVM

In [11]:
# Split data into features (X) and target (y)
features = train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch',  'Fare', 'Embarked']]
target = train[['Survived']]

In [12]:
# Split train/validation data into random train and test subsets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.30,random_state=42)

In [13]:
# Use GridSearchCV to tune model hyperparamaters
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = SVC()
model = GridSearchCV(svc, parameters)

In [14]:
# fit model to training data
model.fit(X_train, y_train)

# make predictions
predictions = model.predict(X_test)
predictions

array([0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0])

In [15]:
# compute model accuracy score
accuracy_score(y_test, predictions)

0.6909090909090909

In [16]:
# generate classification report
confusion_matrix(y_test, predictions, labels=[0,1])

array([[18,  2],
       [15, 20]])

In [17]:
# predict on test data
test_set = test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch',  'Fare', 'Embarked']]
test_predictions = model.predict(test_set)

results = test.copy()
results['Survived'] = test_predictions.astype('int')
results.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CabinGroup,Survived
12,904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",0,23.0,1,0,21228,82.2667,B45,2,,1
14,906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",0,47.0,1,0,W.E.P. 5734,61.175,E31,2,,1
24,916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",0,48.0,1,3,PC 17608,262.375,B57 B59 B63 B66,1,,1
26,918,1,"Ostby, Miss. Helene Ragnhild",0,22.0,0,1,113509,61.9792,B36,1,,1
28,920,1,"Brady, Mr. John Bertram",1,41.0,0,0,113054,30.5,A21,2,,0
