#### Question: Implement a Naive Bayes classifier from scratch in Python. Your implementation should work with both discrete (categorical) and continuous (Gaussian) data. You can use a simple dataset such as Iris or Titanic for testing the classifier.

Importing requirements

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
import pandas as pd

Using a Gaussian probability function for Continuous Features

In [2]:
def calculate_gaussian_probability(x, mean, std, min_std=1e-9):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Parameters
    ----------
    x : float or array-like
        Point(s) at which the density is evaluated.
    mean : float or array-like
        Mean of the distribution.
    std : float or array-like
        Standard deviation of the distribution.
    min_std : float, optional
        Lower bound applied to ``std`` before it is used. A feature that is
        constant within a class has ``std == 0``, which would otherwise cause
        a division by zero and a NaN/inf density. Any ``std`` at or above this
        bound is used unchanged.

    Returns
    -------
    float or numpy.ndarray
        The density value(s); always finite for finite inputs.
    """
    # Floor the spread so a degenerate (zero-variance) distribution behaves
    # like an extremely narrow Gaussian instead of producing NaN.
    safe_std = np.maximum(std, min_std)
    denominator = np.sqrt(2 * np.pi) * safe_std
    exponent = -((x - mean) ** 2 / (2 * safe_std ** 2))
    return (1 / denominator) * np.exp(exponent)

Implementing the Naive Bayes Class

In [3]:
class NaiveBayes:
    """Naive Bayes classifier handling continuous and discrete features.

    Continuous features are modelled with one Gaussian per class; discrete
    features are modelled with per-class relative frequencies. Predictions are
    made by comparing log-posteriors, so long products of small probabilities
    do not underflow.

    Attributes set by ``fit``
    -------------------------
    classes : numpy.ndarray
        Sorted unique class labels.
    class_priors : dict
        ``{class: fraction of training rows with that class}``.
    feature_types : list of str
        ``'continuous'`` or ``'discrete'`` for every column.
    means, stds : dict
        ``{class: {column index: value}}`` for continuous columns.
    discrete_probs : dict
        ``{class: {column index: {category: relative frequency}}}`` for
        discrete columns.
    """

    # Probability used for a category that was never seen with a given class
    # during training; keeps the logarithm finite.
    UNSEEN_CATEGORY_PROB = 1e-6
    # Lower bound for a per-class standard deviation. A feature that is
    # constant within a class would otherwise give std == 0 and NaN posteriors.
    MIN_STD = 1e-9

    @staticmethod
    def _is_numeric_column(column):
        """Return True when every value of ``column`` is a real number."""
        if column.dtype.kind != 'O':
            return bool(np.issubdtype(column.dtype, np.number))
        # Object arrays (e.g. ``DataFrame.values`` with mixed columns) carry no
        # per-column dtype, so inspect the values themselves. Booleans are
        # excluded because they are categories rather than measurements.
        return all(
            isinstance(value, (int, float, np.number))
            and not isinstance(value, (bool, np.bool_))
            for value in column
        )

    @staticmethod
    def _log_gaussian(x, mean, std):
        """Log of the Gaussian density, computed without exponentiating."""
        return -0.5 * np.log(2 * np.pi) - np.log(std) - (x - mean) ** 2 / (2 * std ** 2)

    def fit(self, X, y, categorical_features=None):
        """Estimate class priors and per-class feature statistics.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. May be a numeric array or an object array that
            mixes numeric and non-numeric columns.
        y : array-like of shape (n_samples,)
            Class labels.
        categorical_features : iterable of int, optional
            Column indices to treat as discrete even though their values are
            numeric (for example integer-coded categories). Columns whose
            values are not numeric are always treated as discrete.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        self.classes = np.unique(y)
        self.means = {}
        self.stds = {}
        self.class_priors = {}
        self.discrete_probs = {}

        # Determine feature types column by column.
        forced_discrete = set(categorical_features) if categorical_features is not None else set()
        self.feature_types = []
        for i in range(X.shape[1]):
            if i not in forced_discrete and self._is_numeric_column(X[:, i]):
                self.feature_types.append('continuous')
            else:
                self.feature_types.append('discrete')

        # Compute class-wise statistics.
        for cls in self.classes:
            X_cls = X[y == cls]
            self.class_priors[cls] = len(X_cls) / len(X)
            self.means[cls] = {}
            self.stds[cls] = {}
            self.discrete_probs[cls] = {}

            for i, feature_type in enumerate(self.feature_types):
                column = X_cls[:, i]
                if feature_type == 'continuous':
                    # Cast so numeric columns taken from an object array work
                    # with np.mean / np.std.
                    numeric = column.astype(float)
                    self.means[cls][i] = np.mean(numeric)
                    self.stds[cls][i] = max(np.std(numeric), self.MIN_STD)
                else:
                    values, counts = np.unique(column, return_counts=True)
                    self.discrete_probs[cls][i] = dict(zip(values, counts / len(X_cls)))

    def _discrete_prob(self, x, cls, i):
        """Return P(feature i == x | cls), with a small fallback for unseen values."""
        return self.discrete_probs[cls][i].get(x, self.UNSEEN_CATEGORY_PROB)

    def predict(self, X):
        """Predict the most probable class for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Rows to classify; columns must be in the same order as in ``fit``.

        Returns
        -------
        numpy.ndarray
            Predicted class label for each row.
        """
        X = np.asarray(X)
        preds = []
        for x in X:
            posteriors = []
            for cls in self.classes:
                # Work in log space: log prior + sum of log likelihoods.
                log_posterior = np.log(self.class_priors[cls])
                for i, feature_type in enumerate(self.feature_types):
                    if feature_type == 'continuous':
                        log_posterior += self._log_gaussian(
                            float(x[i]), self.means[cls][i], self.stds[cls][i]
                        )
                    else:
                        log_posterior += np.log(self._discrete_prob(x[i], cls, i))
                posteriors.append(log_posterior)

            preds.append(self.classes[np.argmax(posteriors)])
        return np.array(preds)

Testing on Iris Data

In [4]:
# Iris: four numeric measurements per flower, so every feature is modelled
# as continuous by NaiveBayes.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

nb = NaiveBayes()
nb.fit(X_train, y_train)
iris_accuracy = accuracy_score(y_test, nb.predict(X_test)) * 100
print("Iris Accuracy:", iris_accuracy)

Iris Accuracy: 100.0


Testing on Titanic Data

In [5]:
# Titanic: keep the label plus four predictors, drop rows with missing values,
# and encode Sex as 0/1. Every column is numeric after this encoding, so
# NaiveBayes treats all four features as continuous.
data = (
    pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
    .loc[:, ["Survived", "Pclass", "Sex", "Age", "Fare"]]
    .dropna()
    .assign(Sex=lambda frame: frame["Sex"].map({"male": 0, "female": 1}))
)

# "Survived" is the target; the remaining columns are the features.
X = data[["Pclass", "Sex", "Age", "Fare"]].to_numpy()
y = data["Survived"].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() rebuilds all learned statistics, so the classifier instance is reused.
nb.fit(X_train, y_train)
titanic_accuracy = accuracy_score(y_test, nb.predict(X_test)) * 100
print("Titanic Accuracy:", titanic_accuracy)

Titanic Accuracy: 74.12587412587412
