In [143]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [144]:
class NaiveBayesClassifier(object):
    
    def __init__(self):
        pass
    
    #Input: X - features of a trainset
    #       y - labels of a trainset
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        
        self.no_of_classes = np.max(self.y_train) + 1
     
        
    #This is our function to calculate all nodes/samples in our radius    
    def euclidianDistance(self, Xtest, Xtrain):
        return np.sqrt(np.sum(np.power((Xtest - Xtrain), 2)))
    
       
    #our main function is predict
    #All calculation is done by using our test or new samples
    #There are 4 steps to be performed:
    # 1. calculate Prior probability. Ex. P(A) = No_of_elements_of_one_class / total_no_of_samples
    # 2. calculate Margin probability P(X) = No_of_elements_in_radius / total_no_of_samples
    # 3. calculate Likeliyhood (P(X|A) = No_of_elements_of_current_class / total_no_of_samples
    # 4. calculate Posterior probability: P(A|X) = (P(X|A) * P(A)) / P(X)
    # NOTE: Do these steps for all clases in dataset!
    #
    #Inputs: X - test dataset
    #       radius - this parameter is how big circle is going to be around our new datapoint, default = 2
    def predict(self, X, radius=0.4):   
        pred = []
        
#         number of malignant and benign elements in member_of_class
        members_of_class = []
        for i in range(self.no_of_classes):
            counter = 0
            print(i)
            for j in range(len(self.y_train)):
                if self.y_train[j] == i:
                    counter += 1
            members_of_class.append(counter)
        print(members_of_class)
        
        #prediction starts
        for t in range(len(X)):
            #Creating empty list for every class probability
            prob_of_classes = []
#            for malignant and benign
            for i in range(self.no_of_classes):
                
                #1. step > Prior probability P(class) = no_of_elements_of_that_class/total_no_of_elements
                prior_prob = members_of_class[i]/len(self.y_train)

                #2. step > Margin probability P(X) = no_of_elements_in_radius/total_no_of_elements
                #NOTE: In the same loop collecting infromation for 3. step as well
                
                inRadius = 0
                #counter for how many points are from the current class in circle
                inRadius_current_class = 0
                
#                 finding all points inside the given radius circle
                
                for j in range(len(self.X_train)):
                    if self.euclidianDistance(X[t], self.X_train[j]) < radius:
                        inRadius += 1
                        if self.y_train[j] == i:
                            inRadius_current_class += 1
                
                #finding margin probability
                margin_prob = inRadius/len(self.X_train)
                if margin_prob == 0:
                    margin_prob = 0.0000000000000000000000000000000000000000000000001
                
                #3. step > Likelihood P(X|current_class) = no_of_elements_in_circle_of_current_class/total_no_of_elements
                likelihood = inRadius_current_class/len(self.X_train)
                
#                 #4. step > Posterial Probability > formula from Bayes theorem: P(current_class | X) = (likelihood*prior_prob)/margin_prob
                post_prob = (likelihood * prior_prob)/margin_prob
                prob_of_classes.append(post_prob)
            
            #Getting index of the biggest element (class with the biggest probability)
            pred.append(np.argmax(prob_of_classes))
            
                
        return pred

In [145]:
def accuracy(y_tes, y_pred):
    correct = 0
    for i in range(len(y_pred)):
        if(y_tes[i] == y_pred[i]):
            correct += 1
    return (correct/len(y_tes))*100


In [146]:
def breastCancerTest():
    # Importing the dataset
#     dataset = pd.read_csv('breastCancer.csv')
# #     dataset.replace('?', 0, inplace=True)
# #     dataset = dataset.applymap(np.int64)
# #     X = dataset.iloc[:, 1:-1].values    
# #     y = dataset.iloc[:,0].values
# #     #This part is necessery beacuse of NUMBER of features part of algo
# #     #and in this dataset classes are marked with 2 and 4
# #     print(y)

#     # Splitting the dataset into the Training set and Test set
#     from sklearn.model_selection import train_test_split
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
    
    df = pd.read_csv('data.csv')
    
#     list of all features from the data
    
    list_of_columns = list(df.columns)
    
#     list of features for my model
    
    prediction_vars = ['diagnosis','radius_mean','perimeter_mean','area_mean','compactness_mean','concavity_mean','concave points_mean',
                   'radius_se','area_se' ]
    
# removing non- required features
    
    for ele in list_of_columns:
        if ele in prediction_vars:
            list_of_columns.remove(ele)
            
    
    df = df.drop(columns = list_of_columns ,axis=1)
    df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0})
    
#     assigning features to x
#     assigning labels to y
       
    X = df.iloc[:,1:].values
    y = df.iloc[:,0:1].values
    
#     splitting data into train and test
    
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
    

#     mean_features = list(df.columns[1:11])
#     se_features = list(df.columns[11:21])
#     worst_features = list(df.columns[21:31])
    
#     mean_features.append('diagnosis')
#     se_features.append('diagnosis')
#     worst_features.append('diagnosis')
    
    
#     train_x = train[prediction_vars]
#     train_y = train['diagnosis']
#     test_x = test[prediction_vars]
#     test_y = test['diagnosis']
    
    #Testing my Naive Bayes Classifier
    NB = NaiveBayesClassifier()
    NB.fit(X_train, y_train)
    
    y_pred = NB.predict(X_test, radius=8)
    
#     sklearn
    from sklearn.naive_bayes import GaussianNB
    NB_sk = GaussianNB()
    NB_sk.fit(X_train, y_train)
    
    sk_pred = NB_sk.predict(X_test)
     
    
    print("Accuracy for my Naive Bayes Classifier: ", accuracy(y_test, y_pred), "%")
    print("Accuracy for sklearn Naive Bayes Classifier: ",accuracy(y_test, sk_pred), "%")

In [147]:
breastCancerTest()

2
0
1
[267, 159]
Accuracy for my Naive Bayes Classifier:  85.3146853146853 %
Accuracy for sklearn Naive Bayes Classifier:  90.20979020979021 %


  y = column_or_1d(y, warn=True)
