In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Fetch the Iris dataset and pull out the feature matrix and the label vector
iris = load_iris()
X, y = iris.data, iris.target

# Standardize each feature column: subtract its mean and divide by its
# standard deviation (both computed over the full dataset, before splitting)
X = (X - X.mean(axis=0)) / X.std(axis=0)

# Hold out 30% of the samples for testing (70% train, 30% test);
# random_state pins the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# posterior is proportional to likelihood * prior (the normalizing evidence term is identical for every class)
# Fit the Gaussian Naive Bayes model
def fit(X, y):
    """Estimate the Gaussian Naive Bayes parameters from labelled samples.

    Args:
        X: Array of samples; the first axis indexes samples and is the axis
            that is masked by label and reduced over.
        y: Array of class labels aligned with the first axis of ``X``.

    Returns:
        A tuple ``(classes, means, variances, priors)`` where ``classes`` is
        the sorted array of unique labels and the other three are dicts keyed
        by label holding, respectively, the per-feature mean, the per-feature
        population variance (divides by N, not N - 1), and the fraction of
        samples that carry that label.
    """
    classes = np.unique(y)

    # Select the rows belonging to each label once; every statistic below is
    # derived from these per-class groups.
    rows_by_class = {label: X[y == label] for label in classes}

    # Per-class feature means: sum of the class's samples / number of samples
    means = {
        label: np.mean(rows, axis=0) for label, rows in rows_by_class.items()
    }

    # Per-class feature variances: mean squared deviation from the class mean
    variances = {
        label: np.var(rows, axis=0) for label, rows in rows_by_class.items()
    }

    # Class priors: samples in the class / samples in the whole data set
    priors = {
        label: rows.shape[0] / X.shape[0]
        for label, rows in rows_by_class.items()
    }

    return classes, means, variances, priors

# Function to calculate the likelihood of each feature in x given the class
def calculate_likelihood(mean, var, x, eps=1e-6):
    """Evaluate the Gaussian density of ``x`` feature by feature.

    Args:
        mean: Per-feature mean(s) of the class; a scalar or an array that
            broadcasts against ``x``.
        var: Per-feature variance(s) of the class; same broadcasting rules
            as ``mean``.
        x: Feature value(s) at which the density is evaluated.
        eps: Small constant added to the variance so that a feature with
            zero variance does not cause a division by zero.

    Returns:
        The elementwise density of a normal distribution with mean ``mean``
        and variance ``var + eps`` evaluated at ``x``.
    """
    # Smooth the variance once and reuse it in both the normalizing
    # coefficient and the exponent. Adding eps separately to ``2 * pi * var``
    # and to ``2 * var`` would make the two factors describe Gaussians of
    # different widths, so the result would no longer integrate to one.
    smoothed_var = var + eps

    coefficient = 1.0 / np.sqrt(2.0 * np.pi * smoothed_var)
    exponent = -((x - mean) ** 2) / (2.0 * smoothed_var)

    return coefficient * np.exp(exponent)

# log(posterior) = log(prior) + sum over features of log(likelihood), up to an additive constant shared by all classes
def calculate_posterior(x, classes, means, variances, priors):
    """Return the most probable class label for a single sample.

    Each class is scored with its unnormalized log-posterior::

        log(prior) + sum over features of log N(x_j; mean_j, var_j + eps)

    The Gaussian log-density is evaluated directly in log space. Computing
    the density first and taking its logarithm afterwards underflows to
    ``log(0) = -inf`` for samples far from a class mean; when that happens
    for every class the scores tie and ``argmax`` silently returns the first
    class regardless of which one is actually closest.

    Args:
        x: Feature vector of one sample.
        classes: Sequence of class labels, indexable by integer position.
        means: Mapping from class label to that class's per-feature means.
        variances: Mapping from class label to that class's per-feature
            variances.
        priors: Mapping from class label to that class's prior probability.

    Returns:
        The entry of ``classes`` with the highest log-posterior score.
    """
    # Small constant added to each variance so that a feature with zero
    # variance does not cause a division by zero or log(0).
    eps = 1e-6

    scores = []
    for cls in classes:
        smoothed_var = variances[cls] + eps

        # log N(x; m, v) = -0.5 * (log(2 * pi * v) + (x - m)^2 / v), summed
        # over features: the product of per-feature densities becomes a sum.
        log_likelihood = -0.5 * np.sum(
            np.log(2.0 * np.pi * smoothed_var)
            + (x - means[cls]) ** 2 / smoothed_var
        )

        scores.append(np.log(priors[cls]) + log_likelihood)

    # Return the class with the highest log-posterior score
    return classes[np.argmax(scores)]

def predict(X, classes, means, variances, priors):
    """Classify every sample in ``X`` and return the labels as an array.

    Args:
        X: Iterable of feature vectors, one per sample.
        classes: Class labels, passed through to ``calculate_posterior``.
        means: Per-class feature means, passed through unchanged.
        variances: Per-class feature variances, passed through unchanged.
        priors: Per-class prior probabilities, passed through unchanged.

    Returns:
        A NumPy array holding the label chosen by ``calculate_posterior``
        for each sample, in the order the samples appear in ``X``.
    """
    labels = []
    for sample in X:
        # Each sample is scored independently of the others
        labels.append(
            calculate_posterior(sample, classes, means, variances, priors)
        )
    return np.array(labels)

# Estimate the per-class means, variances and priors from the training split
classes, means, variances, priors = fit(X_train, y_train)

# Classify every held-out sample with the fitted parameters
y_pred = predict(X_test, classes, means, variances, priors)

# Accuracy is the fraction of test samples whose predicted label is correct
accuracy = np.mean(y_pred == y_test)
print("Accuracy:", accuracy)

# Show the predicted labels under their heading
print("Predicted class labels:", y_pred, sep="\n")

# Show the ground-truth labels under their heading for comparison
print("Actual class labels:", y_test, sep="\n")


Accuracy: 0.9777777777777777
Predicted class labels:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 2 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
Actual class labels:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
