# Implementing Gaussian Naive Bayes from scratch on Breast Cancer Data

In [6]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Function for grouping the data by each class

In [7]:
def GroupDataByClass(trainX,trainY):
    trainX["Class"]=trainY
    C=trainX["Class"].unique()
    dataByClass={}
    indexByClass={}
    for i,row in trainX.iterrows():
        if row["Class"] not in indexByClass:
            indexByClass[row["Class"]]=[]
            indexByClass[row["Class"]].append(i)
        else:
            indexByClass[row["Class"]].append(i)
    for c in C:
        inner=trainX.loc[indexByClass[c]]
        dataByClass[c]=inner
    return dataByClass

## Function for training the Naive Bayes classifier
Here I am calculating the Prior Probability and dictionary with mean and variances for each feature of data in every class

In [None]:
def naive_bayes_train(dataByClass,C,features):
    Pcs={}
    #   calc Prior Probability P(c)
    Tc=0
    for c in C:
        Tc+=len(dataByClass[c])

    for c in C:
        Nc=len(dataByClass[c])
        Pc=Nc/Tc
        Pcs[c]=Pc

    #   create dictionary for mean and variances for each feature, for every class
    info_perClass={}
    for c in C:
        info={}
        for feature in features:
            inner=[]
            m = mean(dataByClass[c][feature])
            v = variance(dataByClass[c][feature])
            inner.append(m)
            inner.append(v)
            info[feature]=inner
        info_perClass[c]=info
    return info_perClass,Pcs

### Calculating the mean of each feature in a given class

In [9]:
def mean(listt):
    m=0
    for l in listt:
        m+=l
    m=m/float(len(listt))
    return m

### Calculating the variance of each feature in a given class

In [10]:
def variance(listt):
    m=sum(listt)/float(len(listt))
    variance = sum([pow(x - m, 2) for x in listt]) / float(len(listt) - 1)
    return variance

## Predicting test data
Here, I am predicting the class value of test data with all the features row by row

In [11]:
def predict(testX,info,C,features,Pcs):
    cValues=[]
    for i,row in testX.iterrows():
        predC,classprob=predicting(row,info,C,features,Pcs)
        cValues.append(predC)
    return cValues

## Calculating Posterior probability
Now for each row, I am calculating the probability of the features in the row, given the class, which in Naive bayes, is the product of the probabilities of individual features of the row of test data

In [12]:
def predicting(row,info,C,features,Pcs):
    predC=C[0]
    maxprob=0
    for c in C:
        probs = 1
        for feature in features:
            exponent=math.exp(-(math.pow(row[feature]-info[c][feature][0],2)))/(2*info[c][feature][1])
            prob=(1/(math.sqrt(2*math.pi*info[c][feature][1])))*exponent
            probs*=prob
        probs*=Pcs[c]
        if probs>maxprob:
            maxprob=probs
            predC=c
    return predC,maxprob

## Starting the Main Function by reading the data into a Pandas Dataframe

In [13]:
url="https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
data=pd.read_csv(url,names=["Sample code number","Clump Thickness","Uniformity of Cell Size","Uniformity of Cell Shape","Marginal Adhesion","Single Epithelial Cell Size","Bare Nuclei","Bland Chromatin","Normal Nucleoli","Mitoses","Class"])

## Preprocessing the data
Here I am transforming the Bare Nuclei attribute of the Breast Cancer data from string to int and creating a new Pandas Dataframe

In [14]:
#Preprocessing the data because the column Bare Nuclei is string type
features=["Clump Thickness","Uniformity of Cell Size","Uniformity of Cell Shape","Marginal Adhesion","Single Epithelial Cell Size","Bare Nuclei","Bland Chromatin","Normal Nucleoli","Mitoses"]
features_withoutBareNucei=["Clump Thickness","Uniformity of Cell Size","Uniformity of Cell Shape","Marginal Adhesion","Single Epithelial Cell Size","Bland Chromatin","Normal Nucleoli","Mitoses","Class"]
predictor=["Class"]

right_data=data.loc[data["Bare Nuclei"]!="?",features_withoutBareNucei]
temp_data=data.loc[data["Bare Nuclei"]!="?",["Bare Nuclei"]]

right_data["Bare Nuclei"]=temp_data["Bare Nuclei"].apply(np.int64)

## Splitting data
Then I split the data into training and testing data with 80% to training data and 20% of testing data

In [15]:
C=right_data["Class"].unique()
trainX,testX,trainY,testY=train_test_split(right_data[features],right_data[predictor].values.ravel(),test_size=0.20,random_state=1)

## Training and Testing the Data using Gaussian Naive Bayes Classifier and checking the accuracy of the model

In [17]:
dataByClass=GroupDataByClass(trainX,trainY)
info,Pcs=naive_bayes_train(dataByClass,C,features)

predY=predict(testX,info,C,features,Pcs)


print("Accuracy Score",accuracy_score(testY,predY))

Accuracy Score 0.9635036496350365
