In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertModel
from torch import nn

In [3]:
tokenizer = DistilBertTokenizerFast.from_pretrained("/content/drive/MyDrive/Colab Notebooks/Final Project AI/tokenizer/G15_tokenizer")

In [4]:
import os
print(os.listdir('/content/drive/MyDrive/Colab Notebooks/Final Project AI/tokenizer/G15_tokenizer'))

['tokenizer.json', 'vocab.txt', 'tokenizer_config.json', 'special_tokens_map.json']


In [5]:
# Load the model architecture
class CustomDistilBertModel(nn.Module):
    def __init__(self, base_model, num_labels, freeze_base=True, use_custom_head=True):
        super(CustomDistilBertModel, self).__init__()
        self.base_model = base_model
        if freeze_base:
            for param in self.base_model.parameters():
                param.requires_grad = False

        if use_custom_head:
            self.dropout = nn.Dropout(p=0.3)
            self.classifier = nn.Linear(768, num_labels)  # Assuming 768 is the base model output size
        else:
            self.classifier = nn.Identity()

        self.use_custom_head = use_custom_head

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0, :]
        if self.use_custom_head:
            hidden_state = self.dropout(hidden_state)
        logits = self.classifier(hidden_state)
        return logits

In [6]:
# Initialize the model
base_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
model = CustomDistilBertModel(base_model, num_labels=5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# load the saved state dict

model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/Final Project AI/model.pth', map_location=torch.device('cpu')))
model.eval()

CustomDistilBertModel(
  (base_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (li

In [8]:
def predict(comment):
    # tokenize my input
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # make prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)
        probs = torch.softmax(outputs, dim=1)

    return preds, probs

In [9]:
comment = "I loved this product. It was amazing!"

In [10]:
prediction, probs = predict(comment)

In [11]:
print(prediction)

tensor([4])


In [12]:
print(probs)

tensor([[0.0105, 0.0063, 0.0083, 0.0759, 0.8990]])


In [13]:
print(f"Comment: {comment}")
print(f"Predicted class: {prediction.item() + 1}")
print(f"Probability: {probs[0][prediction].item():.4f}")

Comment: I loved this product. It was amazing!
Predicted class: 5
Probability: 0.8990


In [14]:
import transformers
print(transformers.__version__)
print(torch.__version__)

4.42.4
2.3.1+cu121


# Final Evaluation of Model

In [15]:
!pip install datasets



In [16]:
from datasets import load_dataset
import pandas as pd
from torch.utils.data import DataLoader, Dataset

dataset = load_dataset('gyoungjr/amazon-electronics-reviews')

test_data = pd.DataFrame(dataset["test"])

In [17]:
# clean the test data
import re

def remove_special(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(text.split())
    return text

def clean_data(data):
    data["labels"] = data["labels"].astype(int)
    data["text"] = data["text"].astype(str)
    data.drop_duplicates(keep='last')
    data["text"].apply(remove_special)
    return data

test_data = clean_data(test_data)

In [28]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, max_length=512):
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = tokenizer(self.texts[idx], return_tensors="pt", truncation=True, padding='max_length', max_length=self.max_length)
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_ids, attention_mask, label

In [29]:
# create custom dataset
test_dataset = CustomDataset(test_data["text"].tolist(), test_data["labels"].tolist())

In [30]:
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [31]:
model.eval()

CustomDistilBertModel(
  (base_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (li

In [32]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

with torch.no_grad():
    all_preds = []
    all_labels = []
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())

accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='weighted')
conf_matrix = confusion_matrix(all_labels, all_preds)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.4939
Precision: 0.4804
Confusion Matrix:
[[1362  336  133   74   45]
 [ 594  549  320  141   51]
 [ 323  396  671  463  186]
 [ 105  110  296  778  735]
 [  73   31   72  405 1411]]


Overall we can say that the model is about 50% accurate with it's predictions.