In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score,classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
dataFrame = pd.read_csv("heart.csv")

In [3]:
train, test = train_test_split(dataFrame, test_size=.2, random_state=42)

In [4]:
#separating the continuous and categorical columns

df_continuous_train = train[['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'HeartDisease']]
df_categorical_train = train[['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope', 'HeartDisease']]

df_continuous_test = test[['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'HeartDisease']]
df_categorical_test = test[['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope', 'HeartDisease']]

In [35]:
df_continuous_test

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
668,63,140,195,0,179,0.0,0
30,53,145,518,0,130,0.0,1
377,65,160,0,1,122,1.2,1
535,56,130,0,0,122,1.0,1
807,54,108,309,0,156,0.0,0
...,...,...,...,...,...,...,...
211,50,140,288,0,140,0.0,1
745,63,108,269,0,169,1.8,1
584,64,141,244,1,116,1.5,1
878,49,130,266,0,171,0.6,0


In [5]:
df_continuous_train

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
795,42,120,240,1,194,0.8,0
25,36,130,209,0,178,0.0,0
84,56,150,213,1,125,1.0,1
10,37,130,211,0,142,0.0,0
344,51,120,0,1,104,0.0,1
...,...,...,...,...,...,...,...
106,48,120,254,0,110,0.0,0
270,45,120,225,0,140,0.0,0
860,60,130,253,0,144,1.4,1
435,60,152,0,0,118,0.0,0


In [6]:
df_categorical_test

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope,HeartDisease
668,F,ATA,Normal,N,Up,0
30,M,NAP,Normal,N,Flat,1
377,M,ASY,ST,N,Flat,1
535,M,ASY,LVH,Y,Flat,1
807,M,ATA,Normal,N,Up,0
...,...,...,...,...,...,...
211,F,NAP,Normal,Y,Flat,1
745,F,ASY,Normal,Y,Flat,1
584,M,ASY,ST,Y,Flat,1
878,M,ATA,Normal,N,Up,0


In [7]:
def calculate_prior(df, Y):
    classes = sorted(list(df[Y].unique()))
    prior = []
    for i in classes:
        prior.append(len(df[df[Y]==i])/len(df))
    return prior

In [8]:
def calculate_likelihood_categorical(df, feat_name, feat_val, Y, label):
    feat = list(df.columns)
    df = df[df[Y]==label]
    p_x_given_y = len(df[df[feat_name]==feat_val]) / len(df)
    return p_x_given_y

In [9]:
def calculate_likelihood_gaussian(df, feat_name, feat_val, Y, label):
    feat = list(df.columns)
    df = df[df[Y]==label]
    mean, std = df[feat_name].mean(), df[feat_name].std()
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) *  np.exp(-((feat_val-mean)**2 / (2 * std**2 )))
    return p_x_given_y

In [9]:
def naive_bayes_categorical(df, X, Y):
    # get feature names
    features = list(df.columns)[:-1]

    # calculate prior
    prior = calculate_prior(df, Y)

    Y_pred = []
    # loop over every data sample
    for x in X:
        # calculate likelihood
        labels = sorted(list(df[Y].unique()))
        likelihood = [1]*len(labels)
        for j in range(len(labels)):
            for i in range(len(features)):
                likelihood[j] *= calculate_likelihood_categorical(df, features[i], x[i], Y, labels[j])

        # calculate posterior probability (numerator only)
        post_prob = [1]*len(labels)
        for j in range(len(labels)):
            post_prob[j] = likelihood[j] * prior[j]

        Y_pred.append(np.argmax(post_prob))

    return np.array(Y_pred),likelihood 

In [10]:
X_test = df_categorical_test.iloc[:,:-1].values
Y_test = df_categorical_test.iloc[:,-1].values

Y_pred,likelihood_cat = naive_bayes_categorical(df_categorical_train, X=X_test, Y="HeartDisease")

from sklearn.metrics import confusion_matrix, f1_score
print(confusion_matrix(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred))
print(likelihood_cat)

[[66 11]
 [23 84]]
0.8316831683168316
[0.0006966273106428987, 0.07847069498891543]


In [12]:
def naive_bayes_gaussian(df, X, Y):
    # get feature names
    features = list(df.columns)[:-1]

    # calculate prior
    prior = calculate_prior(df, Y)

    Y_pred = []
    # loop over every data sample
    for x in X:
        # calculate likelihood
        labels = sorted(list(df[Y].unique()))
        likelihood = [1]*len(labels)
        for j in range(len(labels)):
            for i in range(len(features)):
                likelihood[j] *= calculate_likelihood_gaussian(df, features[i], x[i], Y, labels[j])

        # calculate posterior probability (numerator only)
        post_prob = [1]*len(labels)
        for j in range(len(labels)):
            post_prob[j] = likelihood[j] * prior[j]

        Y_pred.append(np.argmax(post_prob))

    return np.array(Y_pred), likelihood

In [13]:
X_test_2 = df_continuous_test.iloc[:,:-1].values
Y_test_2 = df_continuous_test.iloc[:,-1].values

Y_pred_2,likelihood_cont = naive_bayes_categorical(df_continuous_train, X=X_test_2, Y="HeartDisease")

from sklearn.metrics import confusion_matrix, f1_score
print(confusion_matrix(Y_test_2, Y_pred_2))
print(f1_score(Y_test_2, Y_pred_2))
print(likelihood_cont)

[[63 14]
 [50 57]]
0.6404494382022471
[0.0, 2.3336694884864405e-07]
