## Step 1 : Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session so cell outputs stay short.
# NOTE(review): this blanket filter also hides deprecation warnings coming from
# pandas / scikit-learn calls made further down.
warnings.filterwarnings('ignore')

## Step 2 : Load the data

In [4]:
# Read the Titanic workbook into a DataFrame and preview its first five rows.
# NOTE(review): the recorded preview shows stray spreadsheet content
# ('Row Labels', 'Count of Sex') under the trailing 'Unnamed' columns;
# those columns are dropped in Step 3.
data = pd.read_excel(io='Titanic.xlsx')
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,,Row Labels,Count of Sex
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,,female,314
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,,male,577
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,,Grand Total,891
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,,,


In [5]:
# (rows, columns) of the frame exactly as loaded, before any cleaning.
data.shape

(891, 15)

## Step 3 : Data Cleaning, Data Preprocessing

In [7]:
## Number of missing entries in each column

data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Unnamed: 12    891
Unnamed: 13    887
Unnamed: 14    887
dtype: int64

In [10]:
### Share of missing entries in each column, as a percentage of all rows

missing_counts = data.isna().sum()
missing_counts.div(len(data)).mul(100)

PassengerId      0.000000
Survived         0.000000
Pclass           0.000000
Name             0.000000
Sex              0.000000
Age             19.865320
SibSp            0.000000
Parch            0.000000
Ticket           0.000000
Fare             0.000000
Cabin           77.104377
Embarked         0.224467
Unnamed: 12    100.000000
Unnamed: 13     99.551066
Unnamed: 14     99.551066
dtype: float64

## As we see, Cabin, Unnamed: 12, Unnamed: 13 and Unnamed: 14 each contain more than 30% missing values. Hence, we drop them.

In [11]:
# Remove the four columns with too many missing values, then confirm the new dimensions.
mostly_missing = ['Cabin', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14']
data = data.drop(columns=mostly_missing)
data.shape

(891, 11)

## Treating Age column with median imputation and Embarked column with mode imputation

In [12]:
from sklearn.impute import SimpleImputer
# Imputer configured with the median strategy; it is applied to the Age column in the next cell.
sim = SimpleImputer(strategy = 'median')
sim

In [13]:
# Fill the missing Age values using the imputer built in the previous cell,
# then re-check the per-column null counts.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split performed in Step 5.
age_filled = sim.fit_transform(data[['Age']])
data['Age'] = age_filled
data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [14]:
from sklearn.impute import SimpleImputer
# Rebinds `sim` to a most-frequent-value imputer; it is applied to Embarked in the next cell.
# NOTE(review): this replaces the median imputer previously stored under the same name.
sim = SimpleImputer(strategy = 'most_frequent')
sim

In [15]:
# Fill the two missing Embarked values using the imputer built in the previous
# cell, then confirm that no nulls remain.
embarked_filled = sim.fit_transform(data[['Embarked']])
data['Embarked'] = embarked_filled
data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

## Observations :

1. PassengerId, Name and Ticket are of no use for our analysis, hence we delete these columns from the data.

In [17]:
# Discard the identifier-like columns and preview what is left.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket'])
data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## Apply Robust Scaler on Age, Fare columns

In [18]:
from sklearn.preprocessing import RobustScaler
# Scaler built with no constructor arguments; it is applied to Age and Fare in the next cell.
rs = RobustScaler()
rs

In [19]:
# Rescale the two continuous columns in place of their raw values and preview the result.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed in Step 5.
scaled_cols = ['Age', 'Fare']
data[scaled_cols] = rs.fit_transform(data[scaled_cols])
data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,-0.461538,1,0,-0.312011,S
1,1,1,female,0.769231,1,0,2.461242,C
2,1,3,female,-0.153846,0,0,-0.282777,S
3,1,1,female,0.538462,1,0,1.673732,S
4,0,3,male,0.538462,0,0,-0.277363,S


## Apply Feature Encoding on Sex and Embarked variables

In [20]:
# Encode Sex as an integer flag: 0 for 'male', 1 for 'female'.
# The explicit cast makes the integer dtype part of the code: relying on
# replace() to downcast the object column on its own is deprecated in recent
# pandas. It also fails loudly if a label outside the mapping is left as a string.
dic = {'male' : 0, 'female' : 1}
data['Sex'] = data['Sex'].replace(dic).astype('int64')
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,-0.461538,1,0,-0.312011,S
1,1,1,1,0.769231,1,0,2.461242,C
2,1,3,1,-0.153846,0,0,-0.282777,S
3,1,1,1,0.538462,1,0,1.673732,S
4,0,3,0,0.538462,0,0,-0.277363,S


In [23]:
## One hot encoding on Embarked column
#
# get_dummies names each indicator column after the category it encodes (the
# empty prefix / separator leave the bare category label, e.g. 'C', 'Q', 'S'),
# so no hand-written column list has to line up with the category order.
# It also removes the original Embarked column and appends the indicators at
# the end of the frame. The dtype is pinned so the indicators are 0/1 integers
# regardless of the pandas default.

data = pd.get_dummies(data, columns=['Embarked'], prefix='', prefix_sep='', dtype='uint8')
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,3,0,-0.461538,1,0,-0.312011,0,0,1
1,1,1,1,0.769231,1,0,2.461242,1,0,0
2,1,3,1,-0.153846,0,0,-0.282777,0,0,1
3,1,1,1,0.538462,1,0,1.673732,0,0,1
4,0,3,0,0.538462,0,0,-0.277363,0,0,1


## Step 4 : Separate X and y

In [24]:
# Target vector: the Survived column. Feature matrix: every other column.
y = data['Survived']
X = data.drop(columns='Survived')

## Step 5 : Split the data into train and test set

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Step 6 : Apply Support Vector Classification on X_train and y_train

In [26]:
from sklearn.svm import SVC
# Support vector classifier created with no constructor arguments; it is fitted in the next cell.
svc = SVC()
svc

In [27]:
# Train the classifier on the training portion of the split.
svc.fit(X=X_train, y=y_train)

## Step 7 : Performing Predictions on the X_test data

In [28]:
# Predict a Survived label for every held-out row and display the resulting array.
y_pred = svc.predict(X=X_test)
y_pred

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0], dtype=int64)

In [29]:
# Preview the held-out feature rows; the index shows their row labels from the full frame.
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
495,3,0,0.0,0,0,0.000178,1,0,0
648,3,0,0.0,0,0,-0.299018,0,0,1
278,3,0,-1.615385,4,1,0.635386,0,1,0
31,1,1,0.0,1,0,5.719744,1,0,0
255,3,1,0.076923,0,2,0.034284,1,0,0


## Step 8 : Perform Evaluations

In [30]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label equals the true label.
# Ground truth goes first and predictions second, matching the function's
# (y_true, y_pred) signature and the confusion_matrix call that follows.
accuracy_score(y_test, y_pred)

0.8097014925373134

In [31]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the predicted ones.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[146,  22],
       [ 29,  71]], dtype=int64)