# W6 : Naive Bayes

In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd

Naive Bayes is a **generative probabilistic model**. Let’s assume we have a dataset sampled from a **Bernoulli experiment**:

$$
\mathcal{D} = {(x_i, y_i) }_{i=1}^n, \quad x_i \in {0,1}^m, \ y_i \in {1,\dots,K}
$$

Then:

1. **Naive Bayes assumption**: each feature is independent given the class (y):

$$
P(x \mid y=c) = \prod_{j=1}^m P(x_j \mid y=c)
$$

2. **Bernoulli distribution**: each feature follows a Bernoulli distribution conditioned on the class:

$$
P(x_j = 1 \mid y=c) = \theta_{jc}, \quad P(x_j = 0 \mid y=c) = 1 - \theta_{jc}
$$

In [3]:
# binary class classification task dataset
X1 = np.array([[1,0], [0,1], [0,1], [1,0]])
y1 = np.array([1, 0, 0, 1])
print(X1.shape, y1.shape)
# multi-class classification task dataset
X2 = np.array([[1,0], [0,1], [0,1], [1,0], [1, 1], [1, 1]])
y2 = np.array([1, 0, 0, 1, 2, 2])
print(X2.shape, y2.shape)

(4, 2) (4,)
(6, 2) (6,)


For class $(c)$, let $N_c$ be the number of samples with $y_i = c$. Using **Laplace smoothing** $\alpha>0$, the **conditional probability** of feature $j$ is:

$$
\theta_{jc} = P(x_j=1 \mid y=c) = \frac{\sum_{i:y_i=c} x_{ij} + \alpha}{N_c + 2\alpha}
$$

The **class prior** is estimated as:

$$
\pi_c = P(y=c) = \frac{N_c + \alpha}{n + K \alpha}
$$


In [9]:
class NaiveBayesBernoulli(object):
    def __init__(self, alpha=0.1):
        self.alpha = alpha
        
    def fit(self, X, y):
        """
        Task : Estimation of parameters of bernoulli distribution by naive bayes
        Inputs :
            X: Feature matrix of shape (n,m)
            y: Label vector of shape (n,) 
        Outputs:
            w_{j y_c}, w_prior
        """
        n_samples, m_features = X.shape
        n_classes = len(np.unique(y))

        # Inialization of weight matrix
        self.W = np.zeros((n_classes,m_features), dtype=np.float64)
        self.W_prior = np.zeros(n_classes, dtype=np.float64)

        # Processing samples for each classes seperatly
        for c in range(n_classes):
            # 1. find example with label c
            X_c = X[y==c]
            # 2. Estimation of W{j, y_c} = P(x_j | y_c = c)
            self.W[c, :] = (np.sum(X_c, axis=0) + self.alpha)/(X_c.shape[0] + 2.0 * self.alpha)
            # 3. Estimation of prior
            self.W_prior[c] = (X_c.shape[0] + self.alpha)/(float(n_samples) + n_classes * self.alpha)
        
        return self.W, self.W_prior

model = NaiveBayesBernoulli()
print("--------------Binary Classification----------")
class_cond_density, prior = model.fit(X1, y1)
print(f"Class conditional density : {class_cond_density}")
print(f"Prior probabilities of class : {prior}")

print("-------------multi-class classification-----------")
class_cond_density, prior = model.fit(X2, y2)
print(f"Class conditional density : {class_cond_density}")
print(f"Prior probabilities of class : {prior}")

--------------Binary Classification----------
Class conditional density : [[0.04545455 0.95454545]
 [0.95454545 0.04545455]]
Prior probabilities of class : [0.5 0.5]
-------------multi-class classification-----------
Class conditional density : [[0.04545455 0.95454545]
 [0.95454545 0.04545455]
 [0.95454545 0.95454545]]
Prior probabilities of class : [0.33333333 0.33333333 0.33333333]



For a single sample $x$, the **log-likelihood of class (c)** is:

$$
\log P(x \mid y=c) + \log P(y=c)
= \sum_{j=1}^m \big$$ x_j \log \theta_{jc} + (1-x_j) \log(1-\theta_{jc}) \big$$ + \log \pi_c
$$

In [13]:
class NaiveBayesBernoulli(object):
    def __init__(self, alpha=0.1):
        self.alpha = alpha
        
    def fit(self, X, y):
        """
        Task : Estimation of parameters of bernoulli distribution by naive bayes
        Inputs :
            X: Feature matrix of shape (n,m)
            y: Label vector of shape (n,) 
        Outputs:
            w_{j y_c}, w_prior
        """
        n_samples, m_features = X.shape
        n_classes = len(np.unique(y))

        # Inialization of weight matrix
        self.W = np.zeros((n_classes,m_features), dtype=np.float64)
        self.W_prior = np.zeros(n_classes, dtype=np.float64)

        # Processing samples for each classes seperatly
        for c in range(n_classes):
            # 1. find example with label c
            X_c = X[y==c]
            # 2. Estimation of W{j, y_c} = P(x_j | y_c = c)
            self.W[c, :] = (np.sum(X_c, axis=0) + self.alpha)/(X_c.shape[0] + 2.0 * self.alpha)
            # 3. Estimation of prior
            self.W_priors[c] = (X_c.shape[0] + self.alpha)/(float(n_samples) + n_classes * self.alpha)
        
        return self.W, self.W_prior
    def log_likelihood_prior_probability(self, X):
        return X @ (np.log(self.W).T) + (1 - X) @ (np.log(1-self.W).T) + np.log(self.W_prior)

model = NaiveBayesBernoulli()       
log_likelihood_prior = model.log_likelihood_prior_probability(X1)
print("--------------Binary Classification----------")
print(f"log likelihood of Prior probabilities of class : {log_likelihood_prior}")

print("-------------multi-class classification-----------")
log_likelihood_prior = model.log_likelihood_prior_probability(X2)
print(f"log likelihood of prior probabilities of class : {log_likelihood_prior}")

AttributeError: 'NaiveBayesBernoulli' object has no attribute 'W'

Using Bayes’ theorem, the **posterior probability** of class (c) is:

$$
P(y=c \mid x) = \frac{P(x \mid y=c)\pi_c}{\sum_{c'=1}^K P(x \mid y=c')\pi_{c'}} = \frac{P(x \mid y=c) P(y = c)}{\sum_{c'=1}^K P(x \mid y=c')P(y=c')}
$$

In [None]:
class NaiveBayesBernoulli(object):
    def __init__(self, alpha=0.1):
        self.alpha = alpha
        
    def fit(self, X, y):
        """
        Task : Estimation of parameters of bernoulli distribution by naive bayes
        Inputs :
            X: Feature matrix of shape (n,m)
            y: Label vector of shape (n,) 
        Outputs:
            w_{j y_c}, w_prior
        """
        n_samples, m_features = X.shape
        n_classes = len(np.unique(y))

        # Inialization of weight matrix
        self.W = np.zeros((n_classes,m_features), dtype=np.float64)
        self.W_prior = np.zeros(n_classes, dtype=np.float64)

        # Processing samples for each classes seperatly
        for c in range(n_classes):
            # 1. find example with label c
            X_c = X[y==c]
            # 2. Estimation of W{j, y_c} = P(x_j | y_c = c)
            self.W[c, :] = (np.sum(X_c, axis=0) + self.alpha)/(X_c.shape[0] + 2.0 * self.alpha)
            # 3. Estimation of prior
            self.W_priors[c] = (X_c.shape[0] + self.alpha)/(float(n_samples) + n_classes * self.alpha)
        
        return self.W, self.W_prior
    def log_likelihood_prior_probability(self, X):
        return X @ (np.log(self.W).T) + (1 - X) @ (np.log(1-self.W).T) + np.log(self.W_prior)

        
    def posterior_probability(self, X):
        q = self.log_likelihood_prior_probability(X)
        return np.exp(q) / np.expand_dims(np.sum(np.exp(q), axis=1), axis = 1)



In [None]:
model = NaiveBayesBernoulli()       
log_likelihood_prior = model.posterior_probability(X1)
print("--------------Binary Classification----------")
print(f"log likelihood of Prior probabilities of class : {log_likelihood_prior}")

print("-------------multi-class classification-----------")
log_likelihood_prior = model.log_likelihood_prior_probability(X2)
print(f"log likelihood of prior probabilities of class : {log_likelihood_prior}")

Finally, we predict the **most probable class** using the **maximum a posteriori (MAP) estimate**:

$$
\hat{y} = \arg\max_c P(y=c \mid x)
$$


In [None]:
class NaiveBayesBernoulli(object):
    def __init__(self, alpha=0.1):
        self.alpha = alpha
        
    def fit(self, X, y):
        """
        Task : Estimation of parameters of bernoulli distribution by naive bayes
        Inputs :
            X: Feature matrix of shape (n,m)
            y: Label vector of shape (n,) 
        Outputs:
            w_{j y_c}, w_prior
        """
        n_samples, m_features = X.shape
        n_classes = len(np.unique(y))

        # Inialization of weight matrix
        self.W = np.zeros((n_classes,m_features), dtype=np.float64)
        self.W_prior = np.zeros(n_classes, dtype=np.float64)

        # Processing samples for each classes seperatly
        for c in range(n_classes):
            # 1. find example with label c
            X_c = X[y==c]
            # 2. Estimation of W{j, y_c} = P(x_j | y_c = c)
            self.W[c, :] = (np.sum(X_c, axis=0) + self.alpha)/(X_c.shape[0] + 2.0 * self.alpha)
            # 3. Estimation of prior
            self.W_priors[c] = (X_c.shape[0] + self.alpha)/(float(n_samples) + n_classes * self.alpha)
        
        return self.W, self.W_prior
    def log_likelihood_prior_probability(self, X):
        return X @ (np.log(self.W).T) + (1 - X) @ (np.log(1-self.W).T) + np.log(self.W_prior)

        
    def posterior_probability(self, X):
        q = self.log_likelihood_prior_probability(X)
        return np.exp(q) / np.expand_dims(np.sum(np.exp(q), axis=1), axis = 1)
      
    def predict(self, X):
        return np.argmax(self.log_likelihood_prior_probability(X), axis=1)
    

In [None]:
model = NaiveBayesBernoulli()       
log_likelihood_prior = model.posterior_probability(X1)
print("--------------Binary Classification----------")
print(f"log likelihood of Prior probabilities of class : {log_likelihood_prior}")

print("-------------multi-class classification-----------")
log_likelihood_prior = model.log_likelihood_prior_probability(X2)
print(f"log likelihood of prior probabilities of class : {log_likelihood_prior}")