# Importing all necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing

# Reading data

In [2]:
# Load the training sample from the working directory into a DataFrame.
# The bare trailing expression displays (n_rows, n_columns) as the cell output.
data = pd.read_csv(filepath_or_buffer='SampleTrain.csv')
data.shape

(700, 12)

# Cleaning data

In [3]:
# Encode Sex as an integer feature: male -> 1, female -> 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: without
# them, a second execution maps the already-encoded values to NaN and
# astype(int) raises. Any other value still maps to NaN and fails loudly in
# astype(int), exactly as before.
data['Sex'] = data['Sex'].map({'male': 1, 'female': 0, 1: 1, 0: 0}).astype(int)
data.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,250,0,2,"Carter, Rev. Ernest Courtenay",1,54,1,0,244252,26.0,,S
1,357,1,1,"Bowerman, Miss. Elsie Edith",0,22,0,1,113505,55.0,E33,S
2,372,0,3,"Wiklund, Mr. Jakob Alfred",1,18,1,0,3101267,6.4958,,S
3,384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline To...",0,35,1,0,113789,52.0,,S
4,25,0,3,"Palsson, Miss. Torborg Danira",0,8,3,1,349909,21.075,,S
5,414,0,2,"Cunningham, Mr. Alfred Fleming",1,*,0,0,239853,0.0,,S
6,245,0,3,"Attalah, Mr. Sleiman",1,30,0,0,2694,7.225,,C
7,708,1,1,"Calderhead, Mr. Edward Pennington",1,42,0,0,PC 17476,26.2875,E24,S
8,85,1,2,"Ilett, Miss. Bertha",0,17,0,0,SO/C 14885,10.5,,S
9,330,1,1,"Hippach, Miss. Jean Gertrude",0,16,0,1,111361,57.9792,B18,C


In [4]:
# Unknown ages are recorded as '*' in the CSV, which makes pandas read the
# column as strings. Convert to a real numeric column; anything that is not a
# number (including '*') becomes NaN.
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

# Impute the missing ages with the median of the known ages.
median = data['Age'].median()

# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form is chained assignment, which is deprecated and
# does not update the frame under pandas Copy-on-Write.
data['Age'] = data['Age'].fillna(median)
data.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,250,0,2,"Carter, Rev. Ernest Courtenay",1,54,1,0,244252,26.0,,S
1,357,1,1,"Bowerman, Miss. Elsie Edith",0,22,0,1,113505,55.0,E33,S
2,372,0,3,"Wiklund, Mr. Jakob Alfred",1,18,1,0,3101267,6.4958,,S
3,384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline To...",0,35,1,0,113789,52.0,,S
4,25,0,3,"Palsson, Miss. Torborg Danira",0,8,3,1,349909,21.075,,S
5,414,0,2,"Cunningham, Mr. Alfred Fleming",1,28,0,0,239853,0.0,,S
6,245,0,3,"Attalah, Mr. Sleiman",1,30,0,0,2694,7.225,,C
7,708,1,1,"Calderhead, Mr. Edward Pennington",1,42,0,0,PC 17476,26.2875,E24,S
8,85,1,2,"Ilett, Miss. Bertha",0,17,0,0,SO/C 14885,10.5,,S
9,330,1,1,"Hippach, Miss. Jean Gertrude",0,16,0,1,111361,57.9792,B18,C


In [5]:
# Stack the five predictor columns into an array with one row per feature ...
features = np.array(
    [data[name] for name in ('Pclass', 'Sex', 'Age', 'SibSp', 'Parch')]
)
# ... then transpose so that each row is a passenger and each column a feature.
# Column labels are the default integers 0..4, not the original names.
f = pd.DataFrame(features).transpose()

# Target the classifier is trained to predict.
label = data.Survived

# Naive Bayes Classifier

In [6]:
# Fit a Gaussian Naive Bayes classifier on the training features and labels.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = GaussianNB().fit(X=f, y=label)
model

GaussianNB()

In [7]:
# Load the hold-out passengers and apply the same cleaning steps as for the
# training data.
test_data = pd.read_csv('test.csv')

# Encode Sex as an integer feature: male -> 1, female -> 0.
test_data['Sex'] = test_data['Sex'].map({'male':1,'female':0}).astype(int)

# Turn Age into a numeric column; '*' or any other non-numeric marker becomes
# NaN. This also works when the column is already numeric with NaN gaps.
test_data['Age'] = pd.to_numeric(test_data['Age'], errors='coerce')

# NOTE(review): missing ages are imputed with the *test* set's own median, and
# this reassigns the `median` computed from the training data earlier. Imputing
# with the training median is the usual practice -- confirm which is intended.
median = test_data['Age'].median()

# Assign back rather than fillna(inplace=True) on the column selection, which
# is chained assignment and does not update the frame under Copy-on-Write.
test_data['Age'] = test_data['Age'].fillna(median)

# Feature matrix in the same column order the model was trained on:
# one row per passenger, one column per feature.
test_values = np.array([test_data['Pclass'],test_data['Sex'],test_data['Age'],test_data['SibSp'],test_data['Parch']]).T

In [8]:
# Predict a Survived class for every row of the test feature matrix.
predicted_value = model.predict(X=test_values)
# Display the raw prediction array as the cell output.
predicted_value

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [9]:
# Pair every test PassengerId with its predicted class, in row order.
x = [
    (passenger_id, prediction)
    for passenger_id, prediction in zip(test_data['PassengerId'], predicted_value)
]
# Two-column table of (PassengerId, Survived) built from those pairs.
df = pd.DataFrame(data=x, columns=['PassengerId', 'Survived'])
df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


# Accuracy

In [10]:
# accuracy1 is the accuracy on the same rows the model was fitted on, i.e. a
# training score.
accuracy1 = model.score(X=f, y=label)

# accuracy2 compares the test predictions with the 'Survived' column of
# gender_submission.csv, matched purely by row position.
# NOTE(review): this is only a true test accuracy if that file holds the real
# outcomes and its rows are in the same order as test.csv -- confirm both. If
# it is a baseline/sample submission, this number measures agreement with that
# baseline rather than correctness.
gender_csv = pd.read_csv('gender_submission.csv')
accuracy2 = model.score(X=test_values, y=gender_csv['Survived'])

print("The original model had an accuracy score of :",accuracy1)
print("Our Model has an accuracy score of :",accuracy2)

The original model had an accuracy score of : 0.7828571428571428
Our Model has an accuracy score of : 0.9617224880382775


# Misclassification Rate

In [11]:
# The misclassification rate is the complement of the accuracy computed against
# the reference labels (accuracy2).
Misclassfication_Rate = 1.0 - accuracy2
print(
    "The Misclassfication Rate for our model is :",
    Misclassfication_Rate,
)

The Misclassfication Rate for our model is : 0.038277511961722466
