Trabalho realizado por: Bárbara Freixo, PG49169

Nesta notebook será implementado o Multinomial Naive Bayes Classifier, o Bernoulli Naive Bayes Classifier e o Gaussian Naive Bayes Classifier.

O Classificador Naive Bayes Multinomial é um modelo de classificação baseado no Teorema de Bayes que supõe que as características seguem uma distribuição multinomial. É utilizado principalmente para a classificação de texto, onde as características são representadas pelo número de ocorrências de palavras.

O Classificador Naive Bayes de Bernoulli é semelhante ao Classificador Naive Bayes Multinomial, mas pressupõe que as características seguem uma distribuição de Bernoulli, ou seja, cada característica representa a presença ou ausência de uma determinada característica. É frequentemente utilizado para classificação binária.

O Classificador Naive Bayes Gaussiano é outro modelo de classificação baseado no Teorema de Bayes que supe que as características seguem uma distribuição normal. É principalmente utilizado para a classificação de dados numéricos, onde as características podem ser representadas por uma distribuição normal.


In [2]:
import numpy as np

class MultinomialNaiveBayesClassifier:
    """
    A classifier that uses the multinomial naive bayes algorithm.
    The model assumes that the features are multinomial distributions.
    It is used primarily for text classification where the features are word counts.
    """
    def __init__(self, alpha=1.0):
        """
        Initialize the classifier with an optional smoothing factor (alpha).
        """
        self.alpha = alpha
        self.classes = None
        self.class_probs = None
        self.feature_probs = None

    def fit(self, X, y):
        """
        Fit the classifier to the training data.
        X: a 2D numpy array of shape (n_samples, n_features)
        y: a 1D numpy array of shape (n_samples,)
        """
        self.classes, counts = np.unique(y, return_counts=True)
        self.class_probs = counts / y.shape[0]

        n_classes = len(self.classes)
        n_features = X.shape[1]

        self.feature_probs = np.zeros((n_classes, n_features))

        for i, cls in enumerate(self.classes):
            X_cls = X[y == cls]
            self.feature_probs[i, :] = (X_cls.sum(axis=0) + self.alpha) / (np.sum(X_cls) + self.alpha * n_features)

    def predict(self, X):
        """
        Predict the class labels for a set of samples.
        X: a 2D numpy array of shape (n_samples, n_features)
        Returns: a 1D numpy array of shape (n_samples,)
        """
        log_probs = np.log(self.class_probs) + X @ np.log1p(np.clip(self.feature_probs.T, a_min=np.finfo(float).eps, a_max=None))
        return self.classes[np.argmax(log_probs, axis=1)]

# Exemplo de uso

In [3]:
# Data Splitting
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load Digits dataset from scikit-learn library
digits = load_digits()
X, y = digits.data, digits.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create an instance of MultinomialNaiveBayesClassifier with a smoothing parameter alpha=1.0
mnb = MultinomialNaiveBayesClassifier(alpha=1.0)

# Fit the classifier to the training data
mnb.fit(X_train, y_train)

# Predict the labels of the test data
y_pred = mnb.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = np.mean(y_test == y_pred)
print("Multinomial Naive Bayes Accuracy:", accuracy)

Multinomial Naive Bayes Accuracy: 0.8722222222222222


# Teste "Unittest"

In [4]:
import unittest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class TestMultinomialNaiveBayesClassifier(unittest.TestCase):
    """
    Test class for the MultinomialNaiveBayesClassifier
    """

    def setUp(self):
        """
        Initialize the classifier
        """
        self.clf = MultinomialNaiveBayesClassifier()

    def test_fit(self):
        """
        Test the fit method of the classifier
        """
        X, y = make_classification(n_samples=100, n_features=20, n_classes=3, n_informative=3, random_state=42)
        self.clf.fit(X, y)
        # Check if the classes, class probabilities and feature probabilities are set
        self.assertIsNotNone(self.clf.classes)
        self.assertIsNotNone(self.clf.class_probs)
        self.assertIsNotNone(self.clf.feature_probs)

    def test_predict(self):
        """
        Test the predict method of the classifier
        """
        X, y = make_classification(n_samples=100, n_features=20, n_classes=3, n_informative=3, random_state=42)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        self.clf.fit(X_train, y_train)
        y_pred = self.clf.predict(X_test)
        # Check if the predictions are not None and the length of the predictions is equal to the length of the test data
        self.assertIsNotNone(y_pred)
        self.assertEqual(len(y_pred), len(y_test))

if __name__ == "__main__":
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

..
----------------------------------------------------------------------
Ran 2 tests in 0.014s

OK


In [5]:
import numpy as np

class BernoulliNaiveBayesClassifier:
    """
    A Bernoulli Naive Bayes Classifier model. This model is based on Bayes' theorem and assumes that the features are Bernoulli distributions, 
    meaning that each feature represents the presence or absence of a characteristic. It is commonly used for binary classification.
    """

    def __init__(self, alpha=1.0):
        """
        Initialize the classifier with a smoothing parameter alpha.
        
        Parameters:
        - alpha (float): Smoothing parameter to handle cases where features are not present in the training data.
        
        Returns:
        None
        """
        self.alpha = alpha
        self.classes = None
        self.class_probs = None
        self.feature_probs = None

    def fit(self, X, y):
        """
        Fit the classifier to the training data.
        
        Parameters:
        - X (numpy array): Features of the training data.
        - y (numpy array): Labels of the training data.
        
        Returns:
        None
        """
        self.classes, counts = np.unique(y, return_counts=True)
        if np.all(counts > 0):
            self.class_probs = counts / y.shape[0]
        else:
            raise ValueError("Classes with zero examples not supported")

        n_classes = len(self.classes)
        n_features = X.shape[1]

        self.feature_probs = np.zeros((n_classes, n_features))

        for i, cls in enumerate(self.classes):
            X_cls = X[y == cls]
            if X_cls.shape[0] > 0:
                self.feature_probs[i, :] = (X_cls.sum(axis=0) + self.alpha) / (X_cls.shape[0] + 2 * self.alpha)
            else:
                raise ValueError("Classes with zero examples not supported")

    def predict(self, X):
        """
        Predict the class labels for the given feature data.
        
        Parameters:
        - X (numpy array): Features of the data to be classified.
        
        Returns:
        - numpy array: Predicted class labels.
        """
        log_feature_probs = np.log(self.feature_probs)
        log_feature_probs_neg = np.log(1 - self.feature_probs)
        log_probs = np.zeros((X.shape[0], len(self.classes)))

        for i, x in enumerate(X):
            log_probs[i] = np.log(self.class_probs) + np.sum(x * log_feature_probs - x * log_feature_probs_neg + log_feature_probs_neg, axis=1)

        return self.classes[np.argmax(log_probs, axis=1)]

# Exemplo de uso

In [6]:
# This code trains and tests a Bernoulli Naive Bayes classifier using the binarized version of the digits dataset from scikit-learn.
# The classifier is initialized with an alpha value of 1.0, and is trained using the fit method on the training data. 
# The accuracy of the classifier is then computed by comparing its predictions on the test data to the actual target values, and printing the resulting accuracy.

from sklearn.datasets import load_digits
from sklearn.preprocessing import binarize
from sklearn.model_selection import train_test_split

# Load the digits dataset and extract features and target variables
digits = load_digits()
X, y = digits.data, digits.target

# Binarize the data
X = binarize(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Bernoulli Naive Bayes classifier with alpha = 1.0
bnb = BernoulliNaiveBayesClassifier(alpha=1.0)

# Fit the classifier on the training data
bnb.fit(X_train, y_train)

# Predict on the test data
y_pred = bnb.predict(X_test)

# Compute the accuracy of the classifier by comparing its predictions to the actual target values
accuracy = np.mean(y_test == y_pred)

# Print the accuracy
print("Bernoulli Naive Bayes Accuracy:", accuracy)

Bernoulli Naive Bayes Accuracy: 0.8592592592592593


# Teste "Unittest"

In [7]:
import numpy as np
import unittest
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

class TestBernoulliNaiveBayesClassifier(unittest.TestCase):
    """
    A class for testing the BernoulliNaiveBayesClassifier.
    """
    def setUp(self):
        """
        Initialize the classifier object.
        """
        self.classifier = BernoulliNaiveBayesClassifier()

    def test_init(self):
        """
        Test the initialization of the classifier object.
        """
        self.assertEqual(self.classifier.alpha, 1.0)
        self.assertIsNone(self.classifier.classes)
        self.assertIsNone(self.classifier.class_probs)
        self.assertIsNone(self.classifier.feature_probs)

    def test_fit(self):
        """
        Test the fit method of the classifier.
        """
        X, y = make_classification(n_samples=100, n_features=10, random_state=42)
        X = (X > 0).astype(int)
        self.classifier.fit(X, y)
        self.assertIsNotNone(self.classifier.classes)
        self.assertIsNotNone(self.classifier.class_probs)
        self.assertIsNotNone(self.classifier.feature_probs)

    def test_predict(self):
        """
        Test the predict method of the classifier.
        """
        X, y = make_classification(n_samples=100, n_features=10, random_state=42)
        X = (X > 0).astype(int)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.classifier.fit(X_train, y_train)
        y_pred = self.classifier.predict(X_test)
        self.assertIsNotNone(y_pred)
        self.assertEqual(len(y_pred), len(y_test))

    def test_accuracy(self):
        """
        Test the accuracy of the classifier.
        """
        X, y = make_classification(n_samples=100, n_features=10, random_state=42)
        X = (X > 0).astype(int)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.classifier.fit(X_train, y_train)
        y_pred = self.classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        self.assertGreater(accuracy, 0.5)

if __name__ == "__main__":
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

......
----------------------------------------------------------------------
Ran 6 tests in 0.016s

OK


In [8]:
import numpy as np

class GaussianNaiveBayesClassifier:
    """
    Gaussian Naive Bayes Classifier

    This classifier implements the Gaussian Naive Bayes algorithm for classification.
    The algorithm assumes that the features are normally distributed and uses the mean
    and variance of each feature for each class to calculate the class probabilities.
    """

    def __init__(self):
        """
        Initialize the classifier.
        """
        self.classes = None
        self.class_prior = None
        self.mean = None
        self.var = None
        
    def fit(self, X, y):
        """
        Fit the classifier to the training data.

        Parameters:
            X (numpy.ndarray): The training data.
            y (numpy.ndarray): The target labels.
        """
        self.classes, counts = np.unique(y, return_counts=True)
        self.class_prior = counts / y.shape[0]
        
        n_classes = len(self.classes)
        n_features = X.shape[1]
        
        self.mean = np.zeros((n_classes, n_features))
        self.var = np.zeros((n_classes, n_features))
        
        for i, cls in enumerate(self.classes):
            X_cls = X[y == cls]
            self.mean[i, :] = X_cls.mean(axis=0)
            self.var[i, :] = X_cls.var(axis=0)
    
    def predict(self, X):
        """
        Predict the class labels for the given data.

        Parameters:
            X (numpy.ndarray): The data to predict.

        Returns:
            numpy.ndarray: The predicted class labels.
        """
        log_likelihood = -0.5 * np.sum(np.log(2.0 * np.pi * self.var), axis=1) \
                         -0.5 * np.sum((X[:, np.newaxis] - self.mean) ** 2 / self.var, axis=2)
        log_probs = np.log(self.class_prior) + log_likelihood
        return self.classes[np.argmax(log_probs, axis=1)]

# Exemplo de uso

In [9]:
from sklearn.model_selection import train_test_split

# Set seed for reproducibility
np.random.seed(42)

# Generate data
n_samples = 300
X1 = np.random.normal(2, 1, (n_samples // 2, 2))
X2 = np.random.normal(5, 1, (n_samples // 2, 2))
X = np.vstack((X1, X2))
y = np.hstack((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit Gaussian Naive Bayes classifier and predict test data
gnb = GaussianNaiveBayesClassifier()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Calculate accuracy
accuracy = np.mean(y_test == y_pred)
print("Gaussian Naive Bayes Accuracy:", accuracy)

Gaussian Naive Bayes Accuracy: 0.9666666666666667


# Teste "Unittest"

In [10]:
import unittest
import numpy as np

class TestGaussianNaiveBayesClassifier(unittest.TestCase):

    def setUp(self):
        self.X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
        self.y = np.array([0, 0, 0, 1, 1, 1])

        self.clf = GaussianNaiveBayesClassifier()
        self.clf.fit(self.X, self.y)

    # Test the fit method of the GaussianNaiveBayesClassifier
    def test_fit(self):
        np.testing.assert_array_equal(self.clf.classes, np.array([0, 1]))
        np.testing.assert_almost_equal(self.clf.class_prior, np.array([0.5, 0.5]), decimal=6)

        np.testing.assert_almost_equal(self.clf.mean, np.array([[2., 3.], [5., 6.]]), decimal=6)
        np.testing.assert_almost_equal(self.clf.var, np.array([[0.66666667, 0.66666667], [0.66666667, 0.66666667]]), decimal=6)

    # Test the predict method of the GaussianNaiveBayesClassifier
    def test_predict(self):
        X_test = np.array([[1.5, 2.5], [4.5, 5.5]])
        y_pred = self.clf.predict(X_test)
        np.testing.assert_array_equal(y_pred, np.array([0, 1]))

if __name__ == "__main__":
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

........
----------------------------------------------------------------------
Ran 8 tests in 0.022s

OK
