In [1]:
import numpy as np
import pandas as pd
import pprint as pp
from numpy import *

In [12]:
# pull in data
X = pd.read_csv('heart_disease_uci.csv')
display(X)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [13]:
# drop irrelevant columns 
X = X.drop("id", axis=1)
X = X.drop("dataset", axis=1)
# drop rows with NaN in relevant feature columns
feature_names = ["age", "sex", "cp", "trestbps", "chol", "fbs", "thalch", "exang", "oldpeak", "slope", "ca", "thal"]
X = X.dropna(subset=feature_names)
# get feature indices 
features = [X.columns.get_loc(col) for col in feature_names]
# convert to numpy matrix for calculations
X = X.to_numpy()

In [14]:
# Split Data for Naive Bayes Classifier 80:20/training:verification
X_t = mat(X[0:int(len(X) * .8), :])
X_v = mat(X[len(X_t):, :])

In [15]:
# split data into c_0 = no disease, c_1 = disease
c_0, c_1 = split_classes(X_t)
# calculate class likelihood
c0_prior = len(c_0)/(len(X_t))
c1_prior = 1 - c0_prior

In [23]:
# calculate class_conditioned
class_likelihood = fill_class_conditioned(features)
# get results from generating for verification set
results = classify_bayes(features, class_likelihood)
# sum correct/incorrect generative predictions
correct = 0
incorrect = 0
for i in range(len(results[0])):
    if results[0][i] == results[1][i]:
        correct += 1
    else:
        incorrect += 1
# show accuracy achieved on verification set
print(f"Percentage Accuracy: {correct / (correct + incorrect)}")

Percentage Accuracy: 0.8166666666666667


In [26]:
# get confusion matrix from validation results
conf_matrix = get_confusion_matrix(results)
TrueP = conf_matrix[0][0]
FalsN = conf_matrix[1][0]
TrueN = conf_matrix[1][1]
FalsP = conf_matrix[0][1]
# calculate performance metrics from confusion matrix
accuracy = (TrueP + TrueN) / (TrueP + FalsN + FalsP + TrueN)
precision = TrueP / (TrueP + FalsP)
recall = TrueP / (TrueP + FalsN)
f_score = 2 * ((precision * recall) / (precision + recall))
print(f"accuracy: {accuracy}")
print(f"precision: {precision}")
print(f"recall: {recall}")
print(f"F-score: {f_score}")

60
accuracy: 0.8166666666666667
precision: 0.7878787878787878
recall: 0.8666666666666667
F-score: 0.8253968253968254


In [27]:
def get_confusion_matrix(R):
    # in: training results => row 0 := true value; row 1 := test value
    # out: confusion matrix
    confusion = [[0, 0], [0, 0]]
    for i in zip(R[0], R[1]):
        if i[0] == 1 and i[1] == 1:
            confusion[0][0] += 1
    for i in zip(R[0], R[1]):
        if i[0] == 1 and i[1] == 0:
            confusion[1][0] += 1
    for i in zip(R[0], R[1]):
        if i[0] == 0 and i[1] == 0:
            confusion[1][1] += 1
    for i in zip(R[0], R[1]):
        if i[0] == 0 and i[1] == 1:
            confusion[0][1] += 1
    
    return confusion

In [17]:
def classify_bayes(feats, class_likelihood):
    # in: feature list
    # out: prediction/actual matrix for verification set
    # loops through each instance of verification data 
    pred_actual = [[], []]
    for i in range(len(X_v)):
        prediction = calc_posterior(asarray(X_v[i])[0], feats, class_likelihood)
        pred_actual[0].append(0 if prediction >= 1 else 1)
        pred_actual[1].append(0 if X_v[i, 13] == 0 else 1)

    return pred_actual


In [18]:
def calc_posterior(A, J, class_likelihood):
    # input: new test feature vector, J feature indices
    # out: prediction, >1 => no disease, <1 => disease
    posterior_0 = 0
    posterior_1 = 0
    for i in J:
        if type(A[i]) == float or type(A[i]) == int:
            sigma_0, sigma_1 = class_likelihood[0][i]["std"], class_likelihood[1][i]["std"]
            mu_0, mu_1 = class_likelihood[0][i]["mean"], class_likelihood[1][i]["mean"]
            x = A[i]
            posterior_0 += (1.0 / (sigma_0 * np.sqrt(2 * np.pi))) * np.exp(-0.5 * ((x - mu_0) / sigma_0) ** 2)
            posterior_1 += (1.0 / (sigma_1 * np.sqrt(2 * np.pi))) * np.exp(-0.5 * ((x - mu_1) / sigma_1) ** 2)
        else:
            posterior_0 += log(class_likelihood[0][i][A[i]]) 
            posterior_1 += log(class_likelihood[1][i][A[i]]) 
    return((posterior_1 + log(c1_prior)) / (posterior_0 + log(c0_prior)))

In [19]:
def fill_class_conditioned(J):
    # in: J, a list of feature indices
    # out: lookup table for priors w/ shape (yes/no, feature, name:prior)
    curr_table = [[0] * 14 for _ in range(2)] 
    
    # add categorical or continuous here
    for i in J:

        if type(c_0[0, i]) == float or type(c_0[0, i]) == int:
        # if issubdtype(c_0[0, i], np.number):
            curr_table[0][i] = calc_class_continuous(c_0, i)
            curr_table[1][i] = calc_class_continuous(c_1, i)

        else:
            curr_table[0][i] = calc_class_categorical(c_0, i)
            curr_table[1][i] = calc_class_categorical(c_1, i)

    return curr_table

In [20]:
def calc_class_continuous(A, j):
    # in: Training data matrix and current feature index
    # out: dictionary of priors for relevant features
    sigma = std(A[:, j].astype(float))
    mu = mean(A[:, j])
    curr_conditional = {}
    curr_conditional["mean"] = mu
    curr_conditional["std"] = sigma
    return curr_conditional

In [21]:
def calc_class_categorical(A, j):
    # in: Training data matrix and current feature index
    # out: dictionary of priors for relevant features
    vals, counts = unique(asarray(A[:, j]), return_counts=True)
    tot = len(asarray(A[:, j]))
    curr_conditional = {}
    for i in range(len(vals)):
        curr_conditional[vals[i]] = counts[i] / tot

    return curr_conditional

In [22]:
def split_classes(A):
    # in: training data
    # out: two matrices split by class label
    # num => 0 = no disease, num = 1-4 => yes disease
    c_0 = mat([asarray(x)[0] for x in A if x[0, -1] == 0])
    c_1 = mat([asarray(x)[0] for x in A if x[0, -1] > 0])
    return c_0, c_1