In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import accuracy_score

## Categorical Naive Bayes

Categorical Naive Bayes is a variant of the Naive Bayes algorithm that is specifically designed to handle categorical features — features that can take on a limited, fixed number of possible values (e.g., "red", "blue", "green" or "low", "medium", "high").

In [2]:
# Load the Play Tennis data and discard the "day" column in the same expression,
# so the frame bound to `df` never contains it.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
df = (
    pd.read_csv(r"C:\Users\Mohit\OneDrive\Desktop\ml\datasets\play_tennis.csv")
    .drop(columns=["day"])
)

In [3]:
# Features are every column except the "play" target. Selecting them by name (instead of
# "all but the last column") keeps the target out of X even if the CSV column order changes.
X = df.drop(columns=["play"])
y = df["play"]
# Keep 80% of the rows for training; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [4]:
class NaiveBayes:
    """Categorical Naive Bayes classifier built from per-class frequency tables.

    The model works on pandas objects: ``X`` must be a DataFrame whose columns are
    categorical features and ``y`` a Series of class labels aligned with ``X``.

    Parameters
    ----------
    alpha : float, default 0.0
        Additive smoothing strength. With ``alpha > 0`` every conditional
        probability is ``(count + alpha) / (class_count + alpha * n_categories)``,
        so a value that never co-occurred with a class no longer gets probability 0.
        The default of 0 keeps the plain relative-frequency estimate.
    unseen_prob : float, default 1e-6
        Probability used at prediction time for a feature value that has no entry
        in the fitted table (only when ``alpha == 0``; with smoothing enabled the
        smoothed zero-count estimate is used instead). This is a small constant
        floor, not Laplace smoothing.

    Raises
    ------
    ValueError
        If ``alpha`` is negative or ``unseen_prob`` is outside ``[0, 1]``.
    """

    def __init__(self, alpha=0.0, unseen_prob=1e-6):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        if not 0 <= unseen_prob <= 1:
            raise ValueError("unseen_prob must lie in [0, 1]")
        self.alpha = alpha
        self.unseen_prob = unseen_prob
        self._reset_state()

    def _reset_state(self):
        """Drop everything learned so far so that a new fit starts from scratch."""
        # class labels in order of first appearance in y
        self.classes = []
        # number of distinct classes
        self.n_classes = 0
        # number of training rows
        self.n_samples = 0
        # probability of a class
        self.probab_classes = dict()
        # probability of each feature value given class: [cls][col][val]
        self.probab_feature_val = defaultdict(lambda: defaultdict(dict))
        # probability used for a value missing from the table: [cls][col]
        self.unseen_feature_probab = defaultdict(dict)
        # number of times a class appeared
        self.occurence_of_classes = dict()

    def calc_probab_of_classes(self, y):
        """Count every class in ``y`` and store its prior probability.

        Parameters
        ----------
        y : pandas.Series
            Class label of every training row.
        """
        self.classes = list(y.unique())
        self.n_samples = len(y)
        self.n_classes = len(self.classes)

        for cls in self.classes:
            # vectorised count instead of summing the boolean Series element by element
            count = int((y == cls).sum())
            self.occurence_of_classes[cls] = count
            self.probab_classes[cls] = count / self.n_samples

    def calc_probabs_for_unique_vals(self, X, y):
        """Fill the per-class conditional probability table of every feature value.

        Must run after ``calc_probab_of_classes`` because it reads the class counts.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features, one categorical column per feature.
        y : pandas.Series
            Class labels aligned with ``X``.
        """
        # for every col in X
        for col in X:
            # rows: observed values of the column, columns: classes, cells: counts
            crosstab = pd.crosstab(X[col], y)
            n_categories = len(crosstab.index)
            # for every class in self.classes
            for cls in self.classes:
                denominator = self.occurence_of_classes[cls] + self.alpha * n_categories
                table = self.probab_feature_val[cls][col]
                # for every value of the column and its count within class 'cls'
                for val, count in crosstab[cls].items():
                    table[val] = (count + self.alpha) / denominator
                # With smoothing on, an unseen value is treated as a zero-count category;
                # otherwise fall back to the fixed floor.
                if self.alpha > 0:
                    self.unseen_feature_probab[cls][col] = self.alpha / denominator
                else:
                    self.unseen_feature_probab[cls][col] = self.unseen_prob

    def fit(self, X, y):
        """Learn class priors and conditional probabilities from the training data.

        Any state from a previous ``fit`` call is discarded first.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : pandas.Series
            Class labels aligned with ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length or contain no rows.
        """
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of rows")
        if len(y) == 0:
            raise ValueError("cannot fit NaiveBayes on empty data")
        self._reset_state()
        self.calc_probab_of_classes(y)
        self.calc_probabs_for_unique_vals(X, y)

    def predict(self, X):
        """Return the most probable class for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Rows to classify.

        Returns
        -------
        list
            One class label per row. Ties go to the class that appeared first in
            the training labels, and a label is returned even when every class
            scores 0.

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        """
        if not getattr(self, "classes", None):
            raise AttributeError("NaiveBayes is not fitted yet; call fit() first")

        columns = list(X.columns)
        # Resolve the (table, fallback) pair of every class/column once, outside the row loop.
        # .get() keeps prediction from inserting empty tables for columns not seen in fit().
        lookups = {
            cls: [
                (
                    self.probab_feature_val[cls].get(col, {}),
                    self.unseen_feature_probab[cls].get(col, self.unseen_prob),
                )
                for col in columns
            ]
            for cls in self.classes
        }

        # the list of class with the highest probability for each row
        class_outcome = list()

        # go through every row
        for row in X.itertuples(index=False, name=None):
            # Scores are never negative, so starting below zero guarantees that some
            # class is selected even if every score is exactly 0.
            max_probab_cls = self.classes[0]
            max_probab = -1.0
            # for each class in self.classes
            for cls in self.classes:
                # start from the prior of the current class
                p = self.probab_classes[cls]
                # multiply in the conditional probability of every feature value
                for (table, fallback), val in zip(lookups[cls], row):
                    p *= table.get(val, fallback)
                # strict comparison: the earliest class wins a tie
                if p > max_probab:
                    max_probab = p
                    max_probab_cls = cls
            # append the class with highest probability
            class_outcome.append(max_probab_cls)

        return class_outcome

    def accuracy(self, y_true, y_pred):
        """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
        return np.mean(np.asarray(y_true) == np.asarray(y_pred))

    def score(self, X, y_true):
        """Predict ``X`` and return the accuracy of the predictions against ``y_true``."""
        return self.accuracy(y_true, self.predict(X))

In [5]:
# Fit the from-scratch NaiveBayes on the training split, then predict the held-out rows.
nb = NaiveBayes()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [6]:
# Accuracy on the held-out split. score() calls predict(X_test) again internally
# rather than reusing the y_pred computed in the previous cell.
nb.score(X_test, y_test)

0.6666666666666666

In [7]:
# Let's train the CategoricalNB model from sklearn.
# Ordinal-encode every feature column first; the encoder is fitted on the full X, before the split.
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

# Same split parameters (train_size=0.8, random_state=42) as the split used for the custom model.
Xe_train, Xe_test, ye_train, ye_test = train_test_split(X_encoded, y, train_size=0.8, random_state=42)
cnb = CategoricalNB()
# NOTE(review): the fit call below is disabled, so `cnb` is created but not trained in this cell.
# cnb.fit(Xe_train, ye_train)


In [8]:
# Cross-validate a fresh CategoricalNB on the encoded play-tennis features using cv=5,
# then display the mean of the per-fold scores.
scores = cross_val_score(CategoricalNB(), X_encoded, y, cv=5)
scores.mean()

0.5666666666666667

### Let's work on a larger dataset: the Mushroom Classification data from Kaggle (https://www.kaggle.com/datasets/uciml/mushroom-classification)

In [9]:
# Load the mushroom data. From here on `df`, `X`, `y` and the train/test names are rebound
# to this dataset, replacing the play-tennis objects created earlier.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
df = pd.read_csv(r"C:\Users\Mohit\OneDrive\Desktop\ml\datasets\mushrooms.csv")
y = df["class"]
X = df.drop(columns=["class"])

# Encode the target labels and every feature column as numeric codes.
y_encoded = LabelEncoder().fit_transform(y)
X_encoded = OrdinalEncoder().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    train_size=0.8,
    random_state=42,
)

In [10]:
# Fit sklearn's CategoricalNB on the encoded mushroom training split.
cnb = CategoricalNB()
cnb.fit(X_train, y_train)

In [11]:
# Predict the held-out rows with sklearn's model and display its accuracy.
y_pred1 = cnb.predict(X_test)
accuracy_score(y_test, y_pred1)

0.9507692307692308

In [12]:
# Wrap the encoded arrays in pandas objects, because the from-scratch NaiveBayes reads its
# input through pandas APIs (column iteration, pd.crosstab).
# Keeping the original feature names makes the fitted probability tables readable.
X_df = pd.DataFrame(X_encoded, columns=X.columns)
y_df = pd.Series(y_encoded)

# Same split parameters as before, so the custom model sees the same rows as sklearn's model.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, train_size=0.8, random_state=42)
# Start from a fresh instance so nothing learned from the play-tennis data can carry over.
nb = NaiveBayes()
nb.fit(X_train, y_train)

In [13]:
# Predict the held-out mushroom rows with the from-scratch NaiveBayes fitted in the
# previous cell and display its accuracy.
y_pred2 = nb.predict(X_test)
accuracy_score(y_test, y_pred2)

0.9963076923076923

In [14]:
# Let's try cross-validation: score a fresh CategoricalNB on the full encoded mushroom
# data using cv=20. The per-fold scores are kept in `scores` for the next cell.
scores = cross_val_score(CategoricalNB(), X_encoded, y_encoded, cv=20)

In [15]:
# Mean of the per-fold cross-validation scores computed in the previous cell.
scores.mean()

0.9370553491243145