# Titanic - Prediction using SVM-Classifier

In [1]:
# Import statements
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn import metrics

In [2]:
# Reading the training and testing datasets (given as .csv files)
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

datasets = [train,test] # Making a list of datasets.

Let's have a look at the datasets we are dealing with.

In [3]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
test.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [5]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


## Data cleaning

First, let's check how many of the column entries from both the datasets have missing values. 

In [7]:
print(train.isnull().sum())
print('_'*20)
print(test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
____________________
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


A lot of entries in the 'Age' column of both the datasets are missing. Along with this, two entries of 'Embarked' in the training dataset and one entry of 'Fare' in the testing dataset are missing. I will fill-up these discrepencies based on the data that we have.<br><br>
Note: I decided to ignore the 'Cabin' entries which were missing as they are Alphanumberic strings which are difficult to assume and fill-up.

### Filling up all the NaN values in 'Age' column of both the testing and training datasets.

I decided to fill up the 'Age' of each passenger (in both the training and testing datasets) based on the 'Pclass' they booked their ticket under. So, for a passenger whose 'Age' entry is missing, his/her 'Age' will be filled up with the average age of passengers from his/her 'Pclass'.

In [8]:
for df in datasets: # Loops through the dataset
    for _ in range(1,4): # Loops through range(1,4) i.e. from 1 to 3 (since there are three 'PClass' values i.e. 1,2,3)
        df.loc[df.Pclass==_,'Age'] = df.loc[df.Pclass==_,'Age'].fillna(df.loc[df.Pclass==_].Age.mean()) # Fill-up statement

Next, I will be filling up the missing 'Embarked' values in the training dataset. The 'Embarked' column contains either 'S', 'C' or 'Q'.

In [9]:
# Checking the number of passengers embarked at 'S','C' and 'Q'
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [10]:
# Since, the majority of passengers embarked at 'S', I will replace the missing values with 'S'.
train['Embarked'] = train['Embarked'].fillna('S')

Next, I will be filling up the missing 'Fare' value in the training dataset.

In [11]:
test[test.Fare.isnull()] # Printing the row where the 'Fare' value is missing.

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [12]:
# Since there is only one passenger whose 'Fare' entry is missing, I decided to replace it with average 'Fare' paid by 
# passengers in his/her 'Pclass'.
test['Fare'] = test['Fare'].fillna(test.loc[test.Pclass==3].Fare.mean())

In [13]:
# Chceking if there are anymore missing entries in both the datasets (except for 'Cabin' entries)
print(train.isnull().sum())
print('_'*20)
print(test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64
____________________
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


## Feature Generation and Encoding <br>
In this section, I created two new columns/features in both the given datasets i.e. 'FamilySize' and 'IsAlone'. Along with this, I used LabelEncoder() to convert the 'Sex' and 'Embarked' columns to numeric codes, for simplicity while model training and testing.

In [14]:
for df in datasets:
    df['FamilySize'] = df['SibSp']+df['Parch']+1 # Number of siblings + Number of parents + Passenger = Size of the Family.
    # If a particular passenger is alone or not. 
    # # 0 represents 'not alone' and 1 represents 'alone'.
    df['IsAlone'] = 1 # Initially setting the value as 1.
    df['IsAlone'].loc[df['FamilySize'] > 1] = 0 # Changing the value to 0, if 'FamilySize' is greater than 1.
    
    # Encoding
    label = LabelEncoder()   
    df['Sex_Code'] = label.fit_transform(df['Sex']) # Converts 'male' and 'female' to 0 and 1.
    df['Embarked_Code'] = label.fit_transform(df['Embarked']) # Converts the 'Embarked' values to 0,1,2.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## Splitting the datasets, model training and model application
The model I decided to use is **Support Vector Machine - Classifier** with a **Linear** kernel.

In [15]:
# The feature classes that I have selected to train the model on.
features = ['Pclass', 'Sex_Code', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_Code', 'FamilySize', 'IsAlone']

In [16]:
# Splitting the training dataset into training entries (90% of training dataset) and validation entries (10% of training dataset).
x_train, x_val, y_train, y_val = train_test_split(train[features], train['Survived'], train_size = 0.9)

In [17]:
# Application of Support Vector Machine - Classifier
model = SVC(kernel='linear') # Using Linear Kernel
model.fit(x_train, y_train) # Model training

y_val_pred = model.predict(x_val) # Predicting y_val from x_val [val == validation set]

In [18]:
# Model accuracy
print("Model Accuracy:",metrics.accuracy_score(y_val, y_val_pred))

Model Accuracy: 0.8111111111111111


## Applying the model on testing dataset.

In [19]:
test['Survived'] = model.predict(test[features]) # Predicting 'Survived' attribute using the above-trained model.

In [20]:
final_df = test[['PassengerId','Survived']]  
# Final dataframe containing the testing datasets 'PassengerId' and predicted 'Survived' attribute.

In [21]:
# Saving the final dataframe as a .csv file.
final_df.to_csv('final_submission.csv', index = False)