<a href="https://colab.research.google.com/github/benbaz-2/comp551/blob/main/A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

# Data processing

In [None]:
# Load the raw GoEmotions training shard (a Parquet file addressed through the
# hf:// Hugging Face Hub path) into a DataFrame. This needs network access.
df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/raw/train-00000-of-00001.parquet")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Label columns: every column from positional index 9 onward.
# NOTE(review): assumes the first 9 columns are text/metadata and all remaining
# columns are per-emotion indicator columns — confirm against df.columns.
labels = df.columns[9:]

In [None]:
# Keep only the rows whose label columns sum to exactly 1 (single-label rows).
df1 = df.loc[df[labels].sum(axis=1).eq(1)]

In [None]:
# Targets: position of the largest entry among the label columns of each row,
# i.e. an integer class index rather than a one-hot vector.
y = df1[labels].to_numpy().argmax(axis=1)

# Features: TF-IDF weights over the comment text, vocabulary capped at 5000 terms.
comments = df1['text'].tolist()
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(comments)

# Naive Bayes Implementation

In [None]:
# Note: X should be passed as a sparse matrix; the session crashes otherwise
# (presumably because the dense matrix does not fit in memory).
# y is integer-encoded, not one-hot encoded.

class NaiveBayes:
    """Multinomial Naive Bayes classifier with additive (Laplace) smoothing.

    Attributes (set by ``fit``):
        classes_: sorted array of the distinct labels seen during fitting.
        py: array of shape (C,) with the prior probability of each class.
        px: array of shape (C, L) with the smoothed per-class feature
            likelihoods; every row sums to 1.
    """

    def __init__(self, alpha=1.0):
        """Create an unfitted model.

        Args:
            alpha: additive smoothing strength; must be positive so that no
                likelihood is zero (its log is taken in ``predict``).

        Raises:
            ValueError: if ``alpha`` is not positive.
        """
        if alpha <= 0:
            raise ValueError("alpha must be positive")
        self.alpha = alpha
        self.px = None
        self.py = None
        self.classes_ = None

    def fit(self, X, y):
        """Estimate class priors and per-class feature likelihoods.

        Args:
            X: (N, L) matrix of non-negative feature weights; a scipy sparse
                matrix or a dense array both work.
            y: (N,) integer class labels (not one-hot encoded).

        Raises:
            ValueError: if X has no rows or X and y disagree on N.
        """
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        self.X = X    # Bag of words representation
        self.y = y    # Integer labeled

        # One prior per class. Deriving the classes from the labels actually
        # present means gaps in the label range cannot produce empty classes.
        classes, counts = np.unique(y, return_counts=True)
        self.classes_ = classes
        self.py = counts / n_samples

        px = np.empty((len(classes), n_features), dtype=float)
        for row, c in enumerate(classes):
            X_c = X[y == c]  # Samples belonging to class c

            # Total weight of every feature within the class, as a dense 1D array
            feature_totals = np.asarray(X_c.sum(axis=0), dtype=float).ravel()

            # Smoothed likelihood P(x_i | y_c). The denominator is the class's
            # total feature mass plus alpha per feature, so the row sums to 1.
            px[row] = (feature_totals + self.alpha) / (
                feature_totals.sum() + self.alpha * n_features
            )

        self.px = px

    def predict(self, X):
        """Return the most probable class label for every row of X.

        Args:
            X: (N, L) feature matrix with the same L used in ``fit``.

        Returns:
            Array of shape (N,) holding labels drawn from ``classes_``.

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        if self.px is None or self.py is None:
            raise RuntimeError("NaiveBayes.predict called before fit")

        log_py = np.log(self.py)  # Log of class priors, shape (C,)
        log_px = np.log(self.px)  # Log of feature likelihoods, shape (C, L)

        # Log-posterior up to a constant: (N, L) @ (L, C) + (C,) -> (N, C).
        # X itself is never densified; only the (N, C) result is dense.
        log_posterior = np.asarray(X.dot(log_px.T)) + log_py

        # Map the winning column back to the original label values
        return self.classes_[np.argmax(log_posterior, axis=1)]

    def evaluate_acc(self, Y, Yh):
        """Return the fraction of positions where Y and Yh agree."""
        return np.mean(np.asarray(Y) == np.asarray(Yh))


In [None]:
# Fit Naive Bayes on the full feature matrix and score it on the same data, so
# the value displayed by this cell is training accuracy, not held-out accuracy.
model = NaiveBayes()
model.fit(X, y)
Yh = model.predict(X)
model.evaluate_acc(y, Yh)

In [None]:
# Count the samples per integer label (entry i is the number of samples with label i).
class_counts = np.bincount(y)
print("Class counts:", class_counts)


In [None]:
# Majority-class baseline: the accuracy of always predicting the most frequent label.
baseline_accuracy = class_counts.max() / y.shape[0]
print(f"Baseline Accuracy: {baseline_accuracy:.2f}")


# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Hold out 20% of the samples for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest: 40 trees, depth capped at 40, built in parallel (n_jobs=-1)
rf_classifier = RandomForestClassifier(
    n_estimators=40,
    max_depth=40,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
rf_classifier.fit(X_train, y_train)

# Score the fitted forest on the held-out split
y_pred = rf_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Score the random forest on its own training split; comparing this with the
# held-out scores shows how much the forest overfits.
y_train_pred = rf_classifier.predict(X_train)
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Classification Report:\n", classification_report(y_train, y_train_pred))

# Fine-tune a Large Language Model

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the pretrained "google-bert/bert-base-uncased" tokenizer and weights.
# NOTE(review): this rebinds `model`, which earlier held the NaiveBayes instance.
# NOTE(review): AutoModelForMaskedLM loads the checkpoint with a masked-LM head;
# if the goal is emotion classification, a different head may be intended — confirm.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")

In [None]:
# Freeze every parameter of the loaded model so that no gradients are tracked for it.
for weight in model.parameters():
    weight.requires_grad_(False)

In [None]:
# Tokenize the comments in chunks of `batch_size`. Each chunk is padded to its
# own longest sequence and truncated by the tokenizer, and comes back as
# PyTorch tensors.
batch_size = 1000
tokenized_batches = [
    tokenizer(
        comments[start:start + batch_size],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    for start in range(0, len(comments), batch_size)
]

In [None]:
import torch

In [None]:
# Optional: Concatenate all batches if needed
# `tokenized_batches` is a list of per-batch encodings, so it cannot be indexed
# by field name; gather each field across the batches instead. Every batch was
# padded only to its own longest sequence, so all batches are first padded to
# the global maximum length before being concatenated along the sample axis.
# The result goes into a new dict (`tokenized_inputs`), which leaves
# `tokenized_batches` untouched and keeps this cell safe to re-run.
max_len = max(batch["input_ids"].shape[1] for batch in tokenized_batches)

# Fill value for the extra positions of each field: the tokenizer's pad token
# for the ids, and 0 ("ignore this position") for the attention mask.
pad_values = {"input_ids": tokenizer.pad_token_id, "attention_mask": 0}

# Add the extra padding on the same side the tokenizer itself pads on.
pad_on_right = getattr(tokenizer, "padding_side", "right") == "right"

tokenized_inputs = {
    key: torch.cat(
        [
            torch.nn.functional.pad(
                batch[key],
                (0, max_len - batch[key].shape[1])
                if pad_on_right
                else (max_len - batch[key].shape[1], 0),
                value=pad_value,
            )
            for batch in tokenized_batches
        ],
        dim=0,
    )
    for key, pad_value in pad_values.items()
}

# Check result size
print(f"Total tokenized inputs: {tokenized_inputs['input_ids'].shape}")