<a href="https://colab.research.google.com/github/benbaz-2/comp551/blob/main/A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

# Data processing

In [2]:
df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/raw/train-00000-of-00001.parquet")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
labels = df.columns[9:]

In [4]:
df1 = df[df[labels].sum(axis=1) == 1]

In [5]:
comments = df1['text'].tolist()
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(comments)
y = df1[labels].values

# Naive Bayes Implementation

In [8]:
# Note X must be a sparse matrix. This is because the session crashes otherwise

class NaiveBayes:
    def __init__(self):
        self.px = None
        self.py = None

    def fit(self, X, y):
        # Shape of X is (N, L) where L is the length of the embedding vectors, X is a sparse matrix
        # Shape of y is (N, C) where N is sample size and C is the number of classes

        self.X = X    # Bag of words representation
        self.y = y    # One-hot encoded labels (multiclass)
        n = X.shape[0]
        C = y.shape[1]

        # Compute class priors (py)
        self.py = np.sum(y, axis=0) / n    # Shape (C,) where each entry corresponds to the proportion of the class in the dataset

        # Initialize px as a list to store likelihoods
        self.px = []

        for c in range(C):
            # Select samples where the class is c
            y_c = y[:, c]  # Binary mask for samples with class c
            X_c = X[y_c == 1]  # Extract samples where class is c

            # Compute the likelihood P(x_i | y_c) for each feature
            px_c = (X_c.sum(axis=0) + 1) / (y_c.sum() + X.shape[1])
            px_c = np.asarray(px_c).ravel()  # Ensure it's a dense 1D array

            self.px.append(px_c)  # Add the likelihoods for class c

        # Convert px to numpy array of shape (C, L)
        self.px = np.array(self.px)

    def predict(self, X):
        # Compute the log of the posterior probabilities for each class
        log_py = np.log(self.py)  # Log of class priors
        log_px = np.log(self.px)  # Log of feature likelihoods

        # Compute log-posterior for each class (N samples, C classes)
        log_posterior = X.dot(log_px.T) + log_py  # `X` remains sparse

        # Return the class with the highest posterior probability for each sample
        return np.argmax(log_posterior, axis=1)

    def evaluate_acc(self, Y, Yh):
        # Convert one-hot encoded Y to class indices
        Y_indices = np.argmax(Y, axis=1)
        return np.mean(Y_indices == Yh)


In [9]:
model = NaiveBayes()
model.fit(X, y)
Yh = model.predict(X)
model.evaluate_acc(y, Yh)

0.33581073216156443

In [None]:
class_counts = np.sum(y, axis=0)  # Sum one-hot encoded labels along rows
print("Class counts:", class_counts)


In [12]:
baseline_accuracy = np.max(np.sum(y, axis=0)) / y.shape[0]
print(f"Baseline Accuracy: {baseline_accuracy:.2f}")


Baseline Accuracy: 0.32
