# Naive Bayes in Python

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('Heart.csv')
df

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,299,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
299,300,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
300,301,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes
301,302,57,0,nontypical,130,236,0,2,174,0,0.0,2,1.0,normal,Yes


In [3]:
df = df.drop(columns=df.columns[0])
df

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
299,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
300,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes
301,57,0,nontypical,130,236,0,2,174,0,0.0,2,1.0,normal,Yes


In [5]:
df.isnull().sum()

Age          0
Sex          0
ChestPain    0
RestBP       0
Chol         0
Fbs          0
RestECG      0
MaxHR        0
ExAng        0
Oldpeak      0
Slope        0
Ca           4
Thal         2
AHD          0
dtype: int64

In [6]:
df = df.dropna()
df

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57,0,asymptomatic,140,241,0,0,123,1,0.2,2,0.0,reversable,Yes
298,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
299,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
300,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes


In [8]:
# set seed
np.random.seed(1234)

# Randomize the dataset (i.e. shuffle the rows); 
df_randomized = df.sample(frac=1)

# Calculate index for split (size of train data) -take first 70% of the training data
trainsize = round(len(df_randomized) * 0.7)

# Split into training and test sets
training_set = df_randomized[:trainsize].reset_index(drop=True)
test_set = df_randomized[trainsize:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(208, 14)
(89, 14)


In [9]:
## Sanity Check
print(type(training_set))
training_set.head(6)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,65,1,asymptomatic,110,248,0,2,158,0,0.6,1,2.0,fixed,Yes
1,39,1,asymptomatic,118,219,0,0,140,0,1.2,2,0.0,reversable,Yes
2,71,0,asymptomatic,112,149,0,0,125,0,1.6,2,0.0,normal,No
3,43,0,asymptomatic,132,341,1,2,136,1,3.0,2,0.0,reversable,Yes
4,52,1,typical,118,186,0,2,190,0,0.0,2,0.0,fixed,No
5,55,1,asymptomatic,140,217,0,0,111,1,5.6,3,0.0,reversable,Yes


In [10]:
training_set['AHD'].value_counts(normalize=True)

No     0.533654
Yes    0.466346
Name: AHD, dtype: float64

In [11]:
test_set['AHD'].value_counts(normalize=True)

No     0.550562
Yes    0.449438
Name: AHD, dtype: float64

# Creating the Naive Bayes Model

In [12]:
trainX = training_set.iloc[:,:-1]
trainy = training_set['AHD']

colnames = trainX.columns

trainX.head()

trainy.head()

0    Yes
1    Yes
2     No
3    Yes
4     No
Name: AHD, dtype: object

In [13]:
test_set.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,62,0,asymptomatic,160,164,0,2,145,0,6.2,3,3.0,reversable,Yes
1,48,1,asymptomatic,122,222,0,2,186,0,0.0,1,0.0,normal,No
2,51,1,nonanginal,100,222,0,0,143,1,1.2,2,0.0,normal,No
3,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
4,59,1,asymptomatic,140,177,0,0,162,1,0.0,1,1.0,reversable,Yes


In [14]:
testX = test_set.drop(columns=test_set.columns[13])
testX

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal
0,62,0,asymptomatic,160,164,0,2,145,0,6.2,3,3.0,reversable
1,48,1,asymptomatic,122,222,0,2,186,0,0.0,1,0.0,normal
2,51,1,nonanginal,100,222,0,0,143,1,1.2,2,0.0,normal
3,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed
4,59,1,asymptomatic,140,177,0,0,162,1,0.0,1,1.0,reversable
...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,55,1,asymptomatic,160,289,0,2,145,1,0.8,2,1.0,reversable
85,50,1,asymptomatic,144,200,0,2,126,1,0.9,2,0.0,reversable
86,44,1,nontypical,130,219,0,2,188,0,0.0,1,0.0,normal
87,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable


In [15]:
testy = test_set['AHD']
testy

0     Yes
1      No
2      No
3      No
4     Yes
     ... 
84    Yes
85    Yes
86     No
87    Yes
88    Yes
Name: AHD, Length: 89, dtype: object

In [16]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

#0-1 encoding train labels (think of it as a Bernoulli variable)
trainBrnli = le.fit_transform(trainy)

trainBrnli[:5]

array([1, 1, 0, 1, 0])

In [17]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

trainX = enc.fit_transform(trainX)

trainX = pd.DataFrame(trainX, columns=colnames)

trainX.head()  #sanity check

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal
0,31.0,1.0,0.0,7.0,63.0,0.0,2.0,48.0,0.0,6.0,0.0,2.0,0.0
1,5.0,1.0,0.0,12.0,39.0,0.0,0.0,31.0,0.0,12.0,1.0,0.0,2.0
2,37.0,0.0,0.0,8.0,3.0,0.0,0.0,20.0,0.0,16.0,1.0,0.0,1.0
3,9.0,0.0,0.0,22.0,118.0,1.0,2.0,28.0,1.0,27.0,1.0,0.0,2.0
4,18.0,1.0,3.0,12.0,16.0,0.0,2.0,73.0,0.0,0.0,1.0,0.0,0.0


In [18]:
model = CategoricalNB()
model.fit(trainX,trainBrnli)

In [20]:
# predict on train data
yhattrain = model.predict(trainX) 
yhattrain

array([1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0])

In [21]:
# Confusion Matrix
pd.crosstab(yhattrain, trainy)

AHD,No,Yes
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,103,11
1,8,86


In [22]:
# Accuracy Score
accuracy_score(yhattrain, trainBrnli)

0.9086538461538461

In [24]:
testBrnli = le.fit_transform(testy)
testX = enc.fit_transform(testX)
yhattest = model.predict(testX)
yhattest



array([1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0])

In [25]:
# Confusion Matrix
confM = pd.crosstab(yhattest, testy)
confM

AHD,No,Yes
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,42,7
1,7,33


In [26]:
# Accuracy Score
acc = accuracy_score(yhattest, testBrnli)
acc

0.8426966292134831

In [27]:
# False Negative Rate
FNR = (7) / (33+7)
FNR

0.175