Implementing Naive Bayes algorithm from scratch

In [7]:
import pandas as pd
import numpy as np
import math
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score,accuracy_score

Importing all the essential packages and libraries, along with the dataset loader.

In [8]:
# Load the breast-cancer dataset bundled with scikit-learn.
data = load_breast_cancer()
# Feature matrix (one row per sample); later cells index it by integer row position.
df = data.data
# Class labels; the cells below treat label 0 as malignant and every other label as benign.
y = data.target
# Hold out 30% of the samples for testing; random_state pins the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(df, y,  test_size = 0.3, random_state =42)
# Baseline classifier for the comparison at the end; it is only constructed here and fitted later.
log_reg = LogisticRegression(solver = 'liblinear', random_state=42)

Loading the dataset, splitting it into training and test sets, and creating the Logistic Regression model used for the comparison later.

In [9]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    ``numbers`` must support both ``sum()`` and ``len()``. An empty
    sequence raises ``ZeroDivisionError`` because the count is zero.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

Finding the mean

In [10]:
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1`` before taking the square root, so a sequence with
    fewer than two values raises ``ZeroDivisionError``.
    """
    centre = mean(numbers)
    squared_deviations = [pow(value - centre, 2) for value in numbers]
    sample_variance = sum(squared_deviations) / float(len(numbers) - 1)
    return math.sqrt(sample_variance)

Finding the standard deviation

In [11]:
def separateByClass(x_train, y_train):
    """Split the rows of ``x_train`` into two lists according to ``y_train``.

    Rows whose label equals 0 go into the first (malignant) list; rows with
    any other label go into the second (benign) list. Row order is preserved.

    Returns a ``(malignant_rows, benign_rows)`` tuple of lists.
    """
    malignant_rows, benign_rows = [], []
    for idx in range(len(x_train)):
        # Pick the destination list first, then append the row to it.
        bucket = malignant_rows if y_train[idx] == 0 else benign_rows
        bucket.append(x_train[idx])
    return malignant_rows, benign_rows

Separating the malignant and benign samples so that each class can be summarized on its own.

In [12]:
def calculateProbability(x, mean, stdev):
    """Return the Gaussian probability density of ``x``.

    ``mean`` and ``stdev`` are the centre and spread of the normal
    distribution. A ``stdev`` of zero raises ``ZeroDivisionError``.
    """
    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squared_deviation / twice_variance))
    # Normalising constant 1 / (sqrt(2*pi) * stdev).
    normaliser = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normaliser * exponent

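Calculating the Gaussian probability density of a feature value: $\frac{1}{\sqrt{2\pi}\,\sigma}\exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)$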

In [13]:
def calculateClassProbabilities(summaries, input_1):
    """Return the product of per-feature Gaussian densities for one sample.

    ``summaries`` is a sequence of ``(mean, stdev)`` pairs, one per feature,
    and ``input_1`` holds the sample's feature values at the same positions.
    The running product starts at 1, so an empty ``summaries`` yields 1.
    """
    likelihood = 1
    for idx, (mu, sigma) in enumerate(summaries):
        likelihood *= calculateProbability(input_1[idx], mu, sigma)
    return likelihood

In [14]:
def summarize(x_train):
    """Return a list of ``(mean, stdev)`` pairs, one per feature column.

    ``x_train`` is an iterable of rows; it is transposed with ``zip`` so that
    each feature column is summarized independently.
    """
    summaries = []
    for attribute in zip(*x_train):
        summaries.append((mean(attribute), stdev(attribute)))
    return summaries
 
def summarizeByClass(x_train, y_train):
    """Return per-feature summaries for each class.

    The training rows are split with ``separateByClass`` and each group is
    passed to ``summarize``. The result is a
    ``(malignant_summary, benign_summary)`` tuple.
    """
    malignant_rows, benign_rows = separateByClass(x_train, y_train)
    return summarize(malignant_rows), summarize(benign_rows)

# Fit the model: per-feature (mean, stdev) summaries for each class. These are stored as
# module-level names because getPredictions reads summary_mal and summary_ben directly.
summary_mal, summary_ben = summarizeByClass(x_train, y_train)

In [15]:
def getPredictions(x_test):
    """Return a list with one predicted label (0 or 1) per row of ``x_test``.

    Each row is scored against the module-level ``summary_mal`` and
    ``summary_ben`` summaries. The row is labelled 0 only when the malignant
    likelihood is strictly greater; ties are labelled 1. Only the likelihood
    products are compared, and no class prior is applied.
    """
    def classify(sample):
        # Score the sample under both class models and keep the larger one.
        mal_likelihood = calculateClassProbabilities(summary_mal, sample)
        ben_likelihood = calculateClassProbabilities(summary_ben, sample)
        return 0 if mal_likelihood > ben_likelihood else 1

    return [classify(x_test[idx]) for idx in range(len(x_test))]

In [16]:
# Predicted labels for the held-out test split (one 0/1 entry per row, as returned by getPredictions).
y_predictions = getPredictions(x_test)

In [17]:
def getAccuracy(testSet, predictions):
    """Return the percentage (0-100) of positions where two label sequences agree.

    Parameters
    ----------
    testSet : sequence
        Ground-truth labels.
    predictions : sequence
        Predicted labels; must have the same length as ``testSet``.

    Returns
    -------
    float
        ``100 * matches / len(testSet)``. The comparison is symmetric, so
        swapping the two arguments gives the same value.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    ZeroDivisionError
        If the sequences are empty.
    """
    if len(testSet) != len(predictions):
        raise ValueError(
            "testSet and predictions must have the same length "
            "(got {} and {})".format(len(testSet), len(predictions))
        )
    # Score the arguments actually passed in, not the notebook globals
    # y_test / y_predictions, so the function works for any pair of sequences.
    correct = sum(1 for actual, predicted in zip(testSet, predictions) if actual == predicted)
    return (correct / float(len(testSet))) * 100.0

The accuracy using the Naive Bayes algorithm is:

In [18]:
# Ground truth first, predictions second, matching the getAccuracy(testSet, predictions) signature.
getAccuracy(y_test, y_predictions)

93.56725146198829

The result using Logistic Regression:

In [13]:
# Fit the logistic-regression baseline on the same training split used for Naive Bayes.
log_reg.fit(x_train, y_train)
# NOTE(review): the score on the training split is computed but discarded; it is neither
# assigned, printed, nor the last expression of the cell, so nothing is displayed.
log_reg.score(x_train,y_train)
# Predictions on the held-out test split; all metrics below are computed from these.
log_pred = log_reg.predict(x_test)
# NOTE(review): the confusion matrix is likewise computed but never shown.
confusion_matrix(y_test,log_pred)
print('Test Data Metrics')
# '{:.4}' formats each score to four significant digits.
print ('Accuracy Score :\t{:.4}'.format(accuracy_score(y_test,log_pred)))
print ('Recall Score :\t\t{:.4}'.format(recall_score(y_test,log_pred)))
print ('Precision Score :\t{:.4}'.format(precision_score(y_test,log_pred)))

Test Data Metrics
Accuracy Score :	0.9649
Recall Score :		0.9815
Precision Score :	0.9636


The accuracy using Logistic Regression came out to be 96.49%

We can see that Logistic Regression predicts better than Naive Bayes on this split, with an accuracy of 96.49% compared with 93.57%, a difference of about 2.9 percentage points.