In [1]:
# Import important libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
# Load the passenger table from the Excel workbook 'titanic.xlsx' (path is relative to the working directory).
df = pd.read_excel('titanic.xlsx')

In [3]:
# Render the loaded frame; the notebook shows a truncated preview (first and last rows).
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [4]:
# Summarize column dtypes and non-null counts to see which columns have missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


## Describing the data
- Survived - Survival (0 = No, 1 = Yes)
- Pclass - Ticket class
- Sex - Sex
- Age - Age in years
- SibSp - Number of siblings / spouses aboard the Titanic
- Parch - Number of parents / children aboard the Titanic
- Ticket - Ticket number
- Fare - Passenger fare
- Cabin - Cabin number
- Embarked - Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [5]:
# Count the missing entries in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [6]:
# Drop every row that is missing 'Age' or 'Fare' (418 -> 331 rows for this file).
# The frame is modified in place, so all later cells see the filtered rows only.
df.dropna(subset=['Age','Fare'], inplace=True)

In [7]:
# Target vector: the 0/1 'Survived' column of the filtered frame.
y = df['Survived']
y

0      0
1      1
2      0
3      0
4      1
      ..
409    1
411    1
412    1
414    1
415    0
Name: Survived, Length: 331, dtype: int64

In [8]:
# Provisional feature table: every column except the target and the row identifier.
X = df.drop(['Survived', 'PassengerId'], axis=1)
X

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...
409,3,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.7750,,S
411,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0000,C78,Q
412,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.7750,,S
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C


In [9]:
# Check which candidate features still contain missing values after the row filter.
X.isna().sum()

Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       244
Embarked      0
dtype: int64

In [10]:
# Feature table: besides the target and identifier, also discard the free-text
# Name/Ticket columns and the mostly-empty Cabin column.
X = df.drop(['Name', 'Cabin', 'Survived', 'PassengerId', 'Ticket'], axis=1)

In [11]:
# Preview the reduced feature table (Sex and Embarked are still strings at this point).
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0000,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...
409,3,female,3.0,1,1,13.7750,S
411,1,female,37.0,1,0,90.0000,Q
412,3,female,28.0,0,0,7.7750,S
414,1,female,39.0,0,0,108.9000,C


In [12]:
# Inspect the port-of-embarkation categories and their frequencies before encoding them.
embarked = df['Embarked']
unique_embarked, value_counts = embarked.unique(), embarked.value_counts()

print(unique_embarked)
print(value_counts)

['Q' 'S' 'C']
S    227
C     82
Q     22
Name: Embarked, dtype: int64


In [13]:
# Turn the two string-valued features into integer codes so the estimators can use them.
replace = dict(Q=0, S=1, C=2)
gender = dict(female=1, male=2)

df['Embarked'] = df['Embarked'].replace(replace)
df['Sex'] = df['Sex'].replace(gender)
# NOTE(review): Embarked is a nominal category; the integer codes impose an
# artificial Q < S < C ordering on the models - consider one-hot encoding instead.

In [14]:
# Re-display the frame to confirm that Sex and Embarked now hold integer codes.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",2,34.5,0,0,330911,7.8292,,0
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0000,,1
2,894,0,2,"Myles, Mr. Thomas Francis",2,62.0,0,0,240276,9.6875,,0
3,895,0,3,"Wirz, Mr. Albert",2,27.0,0,0,315154,8.6625,,1
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
409,1301,1,3,"Peacock, Miss. Treasteall",1,3.0,1,1,SOTON/O.Q. 3101315,13.7750,,1
411,1303,1,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",1,37.0,1,0,19928,90.0000,C78,0
412,1304,1,3,"Henriksson, Miss. Jenny Lovisa",1,28.0,0,0,347086,7.7750,,1
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,2


In [15]:
# Rebuild the feature table so it picks up the integer-coded Sex and Embarked columns.
X = df.drop(
    labels=['Name', 'Cabin', 'Survived', 'PassengerId', 'Ticket'],
    axis='columns',
)
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,2,34.5,0,0,7.8292,0
1,3,1,47.0,1,0,7.0000,1
2,2,2,62.0,0,0,9.6875,0
3,3,2,27.0,0,0,8.6625,1
4,3,1,22.0,1,1,12.2875,1
...,...,...,...,...,...,...,...
409,3,1,3.0,1,1,13.7750,1
411,1,1,37.0,1,0,90.0000,0
412,3,1,28.0,0,0,7.7750,1
414,1,1,39.0,0,0,108.9000,2


In [16]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [17]:
# Standardizing features: learn the per-column mean and standard deviation from
# the training rows only, then apply that same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
# Report (n_samples, n_features); .size would return only the total element count.
X_train.shape

(248, 7)

In [19]:
# Report (n_samples, n_features); .size would return only the total element count.
X_test.shape

(83, 7)

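#### I am training three models with near-default parameters to see which one yields the best result (the only overrides are `max_iter=1000` for Logistic Regression and `probability=True` for SVC).
#### Models selected - Logistic Regression, K Nearest Neighbours and Support Vector Classifier.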

In [20]:
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

#### Logistic Regression

In [21]:
# Logistic regression, scored with 5-fold cross-validation on the training split.
# No `scoring` is passed, so each fold reports the estimator's own score (accuracy).
lr = LogisticRegression(max_iter=1000)
cv = cross_val_score(estimator=lr, X=X_train, y=y_train, cv=5)
print(cv, cv.mean(), sep='\n')

[1. 1. 1. 1. 1.]
1.0


#### K Nearest Neighbours

In [22]:
import warnings

# NOTE(review): this silences every warning for the rest of the session, not
# just the ones raised by this cell - consider filtering a specific category.
warnings.filterwarnings('ignore')

# k-nearest neighbours with library defaults, scored with 5-fold cross-validation.
knn = KNeighborsClassifier()
cv = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=5)
print(cv, cv.mean(), sep='\n')

[0.98 1.   0.98 1.   1.  ]
0.992


#### Support Vector Classifier

In [23]:
# Support vector classifier, scored with 5-fold cross-validation on the training split.
# NOTE(review): probability=True is not needed for accuracy scoring - confirm whether
# probability estimates are required later.
svc = SVC(probability=True)
cv = cross_val_score(estimator=svc, X=X_train, y=y_train, cv=5)
print(cv, cv.mean(), sep='\n')

[1.         1.         1.         0.97959184 1.        ]
0.9959183673469388


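#### Therefore the mean 5-fold cross-validation accuracies on the training split are:
#### Logistic Regression: 100.0%
#### K Nearest Neighbours: 99.2%
#### SVC: 99.59%
#### All three models score almost perfectly, with Logistic Regression the highest. Scores this close to 100% are suspicious: in every row displayed above, Survived is 1 exactly when the passenger is female, so Sex alone may determine the label. None of the models has yet been evaluated on the held-out test split (X_test, y_test), which should be done before drawing conclusions.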