In [1]:
import numpy as np 
import pandas as pd
from math import sqrt, exp, pi
import pprint

In [2]:
data = pd.read_csv('caesarian.csv')
target = data.columns[-1]
print(target)
# print(data.columns.get_loc('Delivery Time'))


# naive bayes classifi

# Calculate the mean of a list of numbers
def mean(numbers: list):
    return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers: list):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
	exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
	return (1 / (sqrt(2 * pi) * stdev)) * exponent


def prior_probability(df, target):
      
    rows = len(df)
    values = df[target].value_counts()
    prior = []

    for i in values:
        prob = i/rows
        prior.append(prob)
        
    return prior     


def training(df, className, CategoryIndices):
    feature_prob_given_true = []
    feature_prob_given_false = []

    features = df.columns[:-1]                 # Age, Delivery Number, Delivery Time, Blood Pressure, Heart Problem, Caesarian
    class_count = df[className].value_counts()  # 2
    class_values = df[className].unique()       # 0, 1

    id = 0
    for feature in features:
        
        if id in CategoryIndices:
            true_count = []
            false_count = []

            for i, row in df.iterrows():
                if row[className] == class_values[0]: true_count.append(df[feature][i])
                elif row[className] == class_values[1]: false_count.append(df[feature][i])
            
            true_mean = mean(true_count)
            false_mean = mean(false_count)
            
            true_std = stdev(true_count)
            false_std = stdev(false_count)
            
            feature_prob_given_true.append({'m': true_mean, 'std': true_std})
            feature_prob_given_false.append({'m': false_mean, 'std': false_std})
        else: 
            
            distinct_values = df[feature].unique()  # 22, 26, 31 => Age[Categorical], 0,1 => Delivery Number[Numerical]
            
            f_ls = []
            t_ls = []
            for value in distinct_values:           # iterate over each of Numerical Value
                true_count = 0
                false_count = 0

                for i, row in df.iterrows():
                    if row[className] == class_values[0] and row[feature] == value:
                        true_count += 1
                    elif row[className] == class_values[1] and row[feature] == value:
                        false_count += 1
                
                prob_true = true_count / class_count[class_values[0]]
                prob_false = false_count / class_count[class_values[1]]

                
                t_ls.append({value: prob_true})
                f_ls.append({value: prob_false})
                
            feature_prob_given_true.append(t_ls)
            feature_prob_given_false.append(f_ls)
        
        id+=1
    return feature_prob_given_true, feature_prob_given_false


def naive_bayes_classifer(df, X_test, Y_test, CategoryIndices: list): 
     
    mle_given_True, mle_given_false = training(df, Y_test, CategoryIndices)   
    # print(mle_given_True)
    # print(mle_given_false)
    # Testing Starts...

    class_values = df[Y_test].unique()
    descision = []

    for each_test_case in X_test:
        posterior_T = 1
        posterior_F = 1
       
        for index, feature in enumerate(each_test_case, 0):
            # print(index, feature)
            tfeature_values = mle_given_True[index]
            ffeature_values = mle_given_false[index]
            
            if index in CategoryIndices:
                posterior_T *= calculate_probability(feature, tfeature_values['m'], tfeature_values['std'])
                posterior_F *= calculate_probability(feature, ffeature_values['m'], ffeature_values['std'])
                
            else:
                posterior_T *= [i[feature] for i in tfeature_values if feature in i][0]
                posterior_F *= [i[feature] for i in ffeature_values if feature in i][0]

        posterior_T *= prior_probability(data, Y_test)[0]
        posterior_F *= prior_probability(data, Y_test)[1]
    
        descision.append(class_values[0] if posterior_T > posterior_F else class_values[1])
        
    return descision

 Caesarian


In [3]:

from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=.2)

X_test = test.iloc[:,:-1].values
Y_test = test.iloc[:,-1].values
# category = [data.columns.get_loc(col) for col in data.columns[:-1]]
category = [0]

Y_pred = naive_bayes_classifer(df=train, X_test=X_test, Y_test=target, CategoryIndices=category)
print(Y_pred, Y_test)

[1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0] [1 1 0 0 1 0 0 1 1 1 1 1 1 1 1 1]


In [4]:
# manual calculation on Accuracy score and Confusion matrix

TP = 0  # Prediction (+) , Actual(+) 
FP = 0  # Prediction (+) , Actual(-) 
TN = 0  # Prediction (-) , Actual(-) 
FN = 0  # Prediction (-) , Actual(+)

for i in range(len(Y_pred)):
    if Y_pred[i] == 1 and Y_test[i] == 1:
        TP+=1
    elif Y_pred[i] == 1 and Y_test[i] == 0:
        FP+=1
    elif Y_pred[i] == 0 and Y_test[i] == 0:
        TN+=1
    elif Y_pred[i] == 0 and Y_test[i] == 1:
        FN+=1
        
accuracy = (TP+TN)/(TP+TN+FP+FN)
CF = np.zeros(shape=(2,2),dtype=int)
CF[0,0] = TN
CF[0,1] = FP
CF[1,0] = FN
CF[1,1] = TP
print(CF)
print("TP: ", TP, " TN: ", TN, " FP: ", FP, " FN: ", FN)
print("Accuracy Score: ", accuracy)

[[2 2]
 [5 7]]
TP:  7  TN:  2  FP:  2  FN:  5
Accuracy Score:  0.5625


In [5]:
# Library Based Model Train and Accuracy, Confusion Matrix Calculation

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, f1_score , accuracy_score

model = GaussianNB()
model.fit(train.iloc[:,:-1], train.iloc[:,-1])
Y_pred2 = model.predict(test.iloc[:,:-1])
# print(Y_pred)

print("Confusion Matrix: \n", confusion_matrix(Y_test, Y_pred2))
print("Accuracy: ",accuracy_score(Y_test, Y_pred2))

Confusion Matrix: 
 [[1 3]
 [4 8]]
Accuracy:  0.5625
