In [43]:
# import required libraries
import pandas as pd
import numpy as np
from plotly import graph_objects as go
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [44]:
# read the Titanic passenger records from the CSV file in the working directory
# and preview the first five rows
dataset = pd.read_csv('titanic.csv')
dataset.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [45]:
# list the column labels of the loaded dataset
dataset.columns

Index(['PassengerId', 'Name', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Survived'],
      dtype='object')

In [46]:
# keep only the columns used for modelling (four features plus the target).
# Selecting the columns to keep, instead of dropping the others, makes this
# cell safe to re-run: calling drop() again on already-removed columns would
# raise a KeyError because `dataset` is overwritten here.
dataset = dataset[['Pclass', 'Sex', 'Age', 'Fare', 'Survived']]
dataset.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [47]:
# separate the label column (Survived) from the feature columns
target = dataset.loc[:, 'Survived']
inputs = dataset.drop(columns='Survived')
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [48]:
# count the missing entries in each feature column
inputs.isna().sum()

Pclass      0
Sex         0
Age       177
Fare        0
dtype: int64

In [49]:
# frequency of each distinct Age value, most common first
inputs.Age.value_counts()

Age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64

In [50]:
# fill the missing Age values with a replacement chosen from the row index,
# so the gaps are spread over several typical ages instead of a single value.
# The first matching rule wins:
#   index divisible by 5 -> 24.0
#   index divisible by 3 -> 27.0
#   index even           -> truncated mean of the known ages
#   anything else (odd)  -> 18.0
mean = int(inputs['Age'].mean())  # computed before any gap is filled
row_ids = inputs.index
fallback_age = np.select(
    [row_ids % 5 == 0, row_ids % 3 == 0, row_ids % 2 == 0],
    [24.0, 27.0, float(mean)],
    default=18.0,
)
# fillna only touches rows that are currently missing; known ages stay as they are
inputs['Age'] = inputs['Age'].fillna(pd.Series(fallback_age, index=row_ids))

In [51]:
# confirm that no missing values remain in the features
inputs.isna().sum()

Pclass    0
Sex       0
Age       0
Fare      0
dtype: int64

In [52]:
# count the missing labels in the target column
target.isna().sum()

0

In [53]:
# data type of each feature column (object columns still need encoding)
inputs.dtypes

Pclass      int64
Sex        object
Age       float64
Fare      float64
dtype: object

In [54]:
# replace the categorical Sex column with 0/1 indicator columns
# (one integer column per category)
inputs = pd.get_dummies(data=inputs, columns=['Sex'], dtype=int)
inputs

Unnamed: 0,Pclass,Age,Fare,Sex_female,Sex_male
0,3,22.0,7.2500,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.9250,1,0
3,1,35.0,53.1000,1,0
4,3,35.0,8.0500,0,1
...,...,...,...,...,...
886,2,27.0,13.0000,0,1
887,1,19.0,30.0000,1,0
888,3,27.0,23.4500,1,0
889,1,26.0,30.0000,0,1


In [55]:
# min-max scale every feature into the [0, 1] range.
# The scaler learns its min/max from the training rows only (the first 85%,
# the same positional split used by the train/test cells below), so no
# statistics from the held-out rows leak into preprocessing. Held-out rows
# may therefore land slightly outside [0, 1] after the transform.
train_size = int(len(inputs) * 0.85)
scaler = MinMaxScaler()
scaler.fit(inputs[:train_size])

inputs = scaler.transform(inputs)

In [56]:
# sequential hold-out split: the first 85% of rows train the model,
# the remaining rows are kept for testing (no shuffling)
train_size = int(0.85 * len(inputs))

x_train, x_test = inputs[:train_size], inputs[train_size:]
x_test.shape

(134, 5)

In [57]:
# cut the labels at the same position as the features so rows stay aligned
y_train = target.iloc[:train_size]
y_test = target.iloc[train_size:]
y_test.shape

(134,)

In [58]:
# Gaussian Naive Bayes: models each feature as normally distributed within a class
bayes_model = GaussianNB()

# train the classifier on the training split
bayes_model.fit(X=x_train, y=y_train)

In [59]:
# predicted survival label for every held-out passenger
y_pred = bayes_model.predict(X=x_test)
y_pred

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0], dtype=int64)

In [60]:
# fraction of held-out passengers whose survival was predicted correctly
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8059701492537313