<a href="https://colab.research.google.com/github/benbaz-2/comp551/blob/main/A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

# Data processing

In [2]:
df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/raw/train-00000-of-00001.parquet")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
labels = df.columns[9:]

In [4]:
df1 = df[df[labels].sum(axis=1) == 1]

In [5]:
comments = df1['text'].tolist()
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(comments)
y = df1[labels].values
y = np.argmax(y, axis=1)

# Naive Bayes Implementation

In [6]:
# Note X must be a sparse matrix. This is because the session crashes otherwise
# y is integer encoded not one hot encoded

class NaiveBayes:
    def __init__(self):
        self.px = None
        self.py = None

    def fit(self, X, y):
        # Shape of X is (N, L) where L is the length of the embedding vectors, X is a sparse matrix
        # Shape of y is (N,) where N is sample size

        self.X = X    # Bag of words representation
        self.y = y    # Integer labeled
        n = X.shape[0]
        C = len(np.unique(y))

        # Compute class priors (py)
        for c in range(C):
            self.py = np.bincount(y)[c] / n

        # Initialize px as a list to store likelihoods
        self.px = []

        for c in range(C):
            # Select samples where the class is c
            y_c = (y == c)  # Binary mask for samples with class c
            X_c = X[y_c == 1]  # Extract samples where class is c

            # Compute the likelihood P(x_i | y_c) for each feature
            px_c = (X_c.sum(axis=0) + 1) / (y_c.sum() + X.shape[1])
            px_c = np.asarray(px_c).ravel()  # Ensure it's a dense 1D array

            self.px.append(px_c)  # Add the likelihoods for class c

        # Convert px to numpy array of shape (C, L)
        self.px = np.array(self.px)

    def predict(self, X):
        # Compute the log of the posterior probabilities for each class
        log_py = np.log(self.py)  # Log of class priors
        log_px = np.log(self.px)  # Log of feature likelihoods

        # Compute log-posterior for each class (N samples, C classes)
        log_posterior = X.dot(log_px.T) + log_py  # `X` remains sparse

        # Return the class with the highest posterior probability for each sample
        return np.argmax(log_posterior, axis=1)

    def evaluate_acc(self, Y, Yh):
        return np.mean(Y == Yh)


In [7]:
model = NaiveBayes()
model.fit(X, y)
Yh = model.predict(X)
model.evaluate_acc(y, Yh)

0.41989873123035737

In [8]:
class_counts = np.bincount(y)
print("Class counts:", class_counts)


Class counts: [10531  6130  5202  8342 11259  3523  4938  5885  2147  4706  7686  2914
  1433  3020  1778  7075   351  4329  4957   796  4519   690  4714   788
  1510  3827  3472 55298]


In [13]:
baseline_accuracy = np.max(class_counts) / len(y)
print(f"Baseline Accuracy: {baseline_accuracy:.2f}")


Baseline Accuracy: 0.32


# Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=40, random_state=42, n_jobs=-1, verbose=1, max_depth=40)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


Accuracy: 0.37940868350599466
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.23      0.34      2121
           1       0.56      0.22      0.32      1200
           2       0.39      0.01      0.02      1009
           3       0.27      0.00      0.01      1663
           4       0.63      0.03      0.06      2366
           5       0.42      0.01      0.01       708
           6       0.63      0.03      0.06       964
           7       0.80      0.05      0.09      1175
           8       0.40      0.11      0.17       416
           9       0.46      0.01      0.01       957
          10       0.25      0.00      0.01      1542
          11       0.77      0.06      0.11       584
          12       0.75      0.01      0.02       299
          13       0.71      0.06      0.11       572
          14       0.78      0.02      0.04       372
          15       0.85      0.70      0.77      1396
          16       0.00    

[Parallel(n_jobs=2)]: Done  40 out of  40 | elapsed:    0.3s finished
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
y_train_pred = rf_classifier.predict(X_train)
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Classification Report:\n", classification_report(y_train, y_train_pred))

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  40 out of  40 | elapsed:    1.0s finished
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.40796327552089395
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.27      0.41      8410
           1       0.75      0.33      0.46      4930
           2       0.87      0.03      0.06      4193
           3       0.80      0.01      0.02      6679
           4       0.81      0.04      0.07      8893
           5       0.78      0.02      0.03      2815
           6       0.92      0.05      0.09      3974
           7       0.92      0.06      0.11      4710
           8       0.68      0.19      0.29      1731
           9       0.82      0.01      0.02      3749
          10       0.85      0.01      0.02      6144
          11       0.93      0.07      0.13      2330
          12       0.90      0.02      0.04      1134
          13       0.89      0.06      0.12      2448
          14       0.95      0.04      0.08      1406
          15       0.90      0.75      0.82      5679
          16       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Finetune Large Language Model

In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture clas

In [13]:
for name, param in model.named_parameters():
    print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight True
bert.embeddings.position_embeddings.weight True
bert.embeddings.token_type_embeddings.weight True
bert.embeddings.LayerNorm.weight True
bert.embeddings.LayerNorm.bias True
bert.encoder.layer.0.attention.self.query.weight True
bert.encoder.layer.0.attention.self.query.bias True
bert.encoder.layer.0.attention.self.key.weight True
bert.encoder.layer.0.attention.self.key.bias True
bert.encoder.layer.0.attention.self.value.weight True
bert.encoder.layer.0.attention.self.value.bias True
bert.encoder.layer.0.attention.output.dense.weight True
bert.encoder.layer.0.attention.output.dense.bias True
bert.encoder.layer.0.attention.output.LayerNorm.weight True
bert.encoder.layer.0.attention.output.LayerNorm.bias True
bert.encoder.layer.0.intermediate.dense.weight True
bert.encoder.layer.0.intermediate.dense.bias True
bert.encoder.layer.0.output.dense.weight True
bert.encoder.layer.0.output.dense.bias True
bert.encoder.layer.0.output.LayerNorm.weight True
