In [44]:
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from scipy.stats import norm

In [45]:
data = pd.read_csv('data.csv')

In [46]:
TRAIN_SIZE_RATIO = 0.8

#train-test split
train_data = data[:int(TRAIN_SIZE_RATIO * len(data))]
test_data = data[int(TRAIN_SIZE_RATIO * len(data)):]

In [47]:
train_labels = train_data['diagnosis'].to_numpy()
train_features = train_data.drop(columns=['id', 'diagnosis', 'Unnamed: 32'], inplace=False).to_numpy()

test_labels = test_data['diagnosis'].to_numpy()
test_features = test_data.drop(columns=['id', 'diagnosis', 'Unnamed: 32'], inplace=False).to_numpy()

# Implementation

In [48]:
class GaussianNaiveBayesClassifier:
    def __init__(self):
        self.class_probs = {}
        self.feature_stats = {}
        self.classes = []

    def fit(self, X, y):
        # Calculate class probabilities
        self.classes, class_counts = np.unique(y, return_counts=True)
        total_samples = len(y)
        for c, count in zip(self.classes, class_counts):
            self.class_probs[c] = count / total_samples

        # Calculate feature statistics (mean and standard deviation)
        for c in self.classes:
            self.feature_stats[c] = {
                'mean': X[y == c].mean(axis=0),
                'std': X[y == c].std(axis=0)
            }

    def predict(self, X):
        predictions = []
        for x in X:
            posteriors = {}
            for c in self.classes:
                class_prob = np.log(self.class_probs[c])
                feature_probs = norm.logpdf(x, loc=self.feature_stats[c]['mean'], scale=self.feature_stats[c]['std'])
                posterior = class_prob + np.sum(feature_probs)
                posteriors[c] = posterior
            predictions.append(max(posteriors, key=posteriors.get))
        return predictions


classifier = GaussianNaiveBayesClassifier()
classifier.fit(train_features, train_labels)

train_predictions = classifier.predict(train_features)
test_predictions = classifier.predict(test_features)



In [49]:
# train performance
def evaluate_model(train_labels, test_labels, train_predictions, test_predictions):
    train_precision = precision_score(train_labels, train_predictions, pos_label='M')
    train_recall = recall_score(train_labels, train_predictions, pos_label='M')
    train_accucary = accuracy_score(train_labels, train_predictions)
    train_confusion_matrix = confusion_matrix(train_labels, train_predictions)

    # test performance
    test_precision = precision_score(test_labels, test_predictions, pos_label='M')
    test_recall = recall_score(test_labels, test_predictions, pos_label='M')
    test_accucary = accuracy_score(test_labels, test_predictions)
    test_confusion_matrix = confusion_matrix(test_labels, test_predictions)

    print(f'Train performance:\n\taccuracy : {train_accucary}\n\tpresicion : {train_precision}\n\trecall : {train_recall}\nconfusion matrix : \n{train_confusion_matrix}')
    print(f'Test performance:\n\taccuracy : {test_accucary}\n\tpresicion : {test_precision}\n\trecall : {test_recall}\nconfusion matrix : \n{test_confusion_matrix}')

evaluate_model(train_labels, test_labels, train_predictions, test_predictions)

Train performance:
	accuracy : 0.9406593406593406
	presicion : 0.9491525423728814
	recall : 0.9032258064516129
confusion matrix : 
[[260   9]
 [ 18 168]]
Test performance:
	accuracy : 0.9210526315789473
	presicion : 0.7741935483870968
	recall : 0.9230769230769231
confusion matrix : 
[[81  7]
 [ 2 24]]


# Using sklearn

In [50]:
classifier = GaussianNB()

classifier.fit(train_features, train_labels)
train_predictions = classifier.predict(train_features)
test_predictions = classifier.predict(test_features)

evaluate_model(train_labels, test_labels, train_predictions, test_predictions)

Train performance:
	accuracy : 0.945054945054945
	presicion : 0.9497206703910615
	recall : 0.9139784946236559
confusion matrix : 
[[260   9]
 [ 16 170]]
Test performance:
	accuracy : 0.956140350877193
	presicion : 0.8888888888888888
	recall : 0.9230769230769231
confusion matrix : 
[[85  3]
 [ 2 24]]
