In [1]:
import os
import sys

from google.colab import drive

# Make Google Drive visible inside the Colab VM.
drive.mount('/content/drive')

# Project folder, given relative to the root of MyDrive.
FOLDERNAME = "Academics/DATA512/Project/llm-roberta-sentiment"
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Put the project folder on the import path and make it the working
# directory, so local Python files and relative paths resolve from it.
_project_dir = '/content/drive/MyDrive/{}'.format(FOLDERNAME)
sys.path.append(_project_dir)
os.chdir(_project_dir)


Mounted at /content/drive


In [2]:
! pip install -r requirements.txt

Collecting datasets==3.6.0 (from -r requirements.txt (line 2))
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dotenv==0.9.9 (from -r requirements.txt (line 3))
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting groq==0.36.0 (from -r requirements.txt (line 5))
  Downloading groq-0.36.0-py3-none-any.whl.metadata (16 kB)
Collecting imblearn==0.0 (from -r requirements.txt (line 7))
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting torchmetrics>=1.8.2 (from -r requirements.txt (line 24))
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics>=1.8.2->-r requirements.txt (line 24))
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel==6.17.1->-r requirements.txt (line 8))
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading datasets-3.6.0-py

In [3]:
import pandas as pd
import polars as pl
import numpy as np
from datasets import load_dataset

# Evaluate the fine-tuned RoBERTa sentiment models on the test set

In [4]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import torch

# Load the tokenizer and the sequence classifier from the 10k_5 checkpoint
# directory (path is relative to the project folder).
model_path = "model/sentiment_model_10k_5"

tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)

In [5]:
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Read the test split, then tokenize every text to a fixed length of 512
# tokens: longer inputs are truncated, shorter ones are padded.
test = pd.read_parquet("data/test_10k_5.parquet")
data = tokenizer(
    test["text"].tolist(),
    padding="max_length",
    max_length=512,
    truncation=True,
    return_attention_mask=True,
)

def create_dataloader(data, batch_size=32):
    """Wrap tokenizer output in a sequential (unshuffled) DataLoader.

    Args:
        data: Mapping with 'input_ids' and 'attention_mask' entries, each
            convertible by ``torch.tensor`` (e.g. equal-length lists of ints).
        batch_size: Number of rows per batch.

    Returns:
        A DataLoader whose batches hold the input-id tensor followed by the
        attention-mask tensor, in the original row order so that predictions
        line up with the source rows.
    """
    tensors = [torch.tensor(data[key]) for key in ('input_ids', 'attention_mask')]
    dataset = TensorDataset(*tensors)
    return DataLoader(dataset, shuffle=False, batch_size=batch_size)

In [6]:
# Run the currently loaded model over the tokenized test set and collect one
# predicted class index per row, in loader order.
dataloader = create_dataloader(data)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # evaluation mode for inference

all_predictions = []
# Inference only: under no_grad() no gradients are tracked, so there is
# nothing to reset and no backward pass.
with torch.no_grad():
    for batch in dataloader:
        b_input_ids, b_attention_mask = [t.to(device) for t in batch]
        # forward pass; logits has one score per class for each row
        outcome = model(b_input_ids, attention_mask=b_attention_mask)
        # the highest-scoring class is the predicted label
        pred = outcome.logits.argmax(dim=1).tolist()
        all_predictions.extend(pred)



In [7]:
# Store the predictions as a new column; row order matches `test` because
# create_dataloader builds its loader with shuffle=False.
test['pred_bert_10k'] = all_predictions

In [8]:
# Accuracy: fraction of rows whose predicted class equals the rating.
test['pred_bert_10k'].eq(test['rating']).mean()

np.float64(0.754)

In [9]:
# Swap in the tokenizer and classifier from the 50k_5 checkpoint directory,
# then re-tokenize the test texts with that tokenizer (fixed length 512).
model_path = "model/sentiment_model_50k_5"

tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)
data = tokenizer(
    test["text"].tolist(),
    padding="max_length",
    max_length=512,
    truncation=True,
    return_attention_mask=True,
)

In [10]:
# Run the currently loaded model over the tokenized test set and collect one
# predicted class index per row, in loader order.
dataloader = create_dataloader(data)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # evaluation mode for inference

all_predictions = []
# Inference only: under no_grad() no gradients are tracked, so there is
# nothing to reset and no backward pass.
with torch.no_grad():
    for batch in dataloader:
        b_input_ids, b_attention_mask = [t.to(device) for t in batch]
        # forward pass; logits has one score per class for each row
        outcome = model(b_input_ids, attention_mask=b_attention_mask)
        # the highest-scoring class is the predicted label
        pred = outcome.logits.argmax(dim=1).tolist()
        all_predictions.extend(pred)

# Row order matches `test` because the loader is built with shuffle=False.
test['pred_bert_50k'] = all_predictions

In [11]:
# Accuracy: fraction of rows whose predicted class equals the rating.
test['pred_bert_50k'].eq(test['rating']).mean()

np.float64(0.761)

In [12]:
# Swap in the tokenizer and classifier from the 100k_5 checkpoint directory,
# then re-tokenize the test texts with that tokenizer (fixed length 512).
model_path = "model/sentiment_model_100k_5"

tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)
data = tokenizer(
    test["text"].tolist(),
    padding="max_length",
    max_length=512,
    truncation=True,
    return_attention_mask=True,
)

In [13]:
# Run the currently loaded model over the tokenized test set and collect one
# predicted class index per row, in loader order.
dataloader = create_dataloader(data)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # evaluation mode for inference

all_predictions = []
# Inference only: under no_grad() no gradients are tracked, so there is
# nothing to reset and no backward pass.
with torch.no_grad():
    for batch in dataloader:
        b_input_ids, b_attention_mask = [t.to(device) for t in batch]
        # forward pass; logits has one score per class for each row
        outcome = model(b_input_ids, attention_mask=b_attention_mask)
        # the highest-scoring class is the predicted label
        pred = outcome.logits.argmax(dim=1).tolist()
        all_predictions.extend(pred)

# Row order matches `test` because the loader is built with shuffle=False.
test['pred_bert_100k'] = all_predictions

In [14]:
# Accuracy: fraction of rows whose predicted class equals the rating.
test['pred_bert_100k'].eq(test['rating']).mean()

np.float64(0.771)

In [15]:
# Display the test frame with the three prediction columns side by side.
test

Unnamed: 0,rating,text,text_cleaned,input_ids,attention_mask,pred_bert_10k,pred_bert_50k,pred_bert_100k
0,4.0,Used to freshen up linens,Used to freshen up linens,"[0, 47640, 7, 21862, 2457, 62, 24248, 1290, 2,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",4,4,4
1,0.0,need more proof:(to order!!!!,need more proof:(to order!!!!,"[0, 30484, 55, 6461, 48329, 560, 645, 32376, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",4,4,4
2,4.0,"This is a good flat iron, it has different tem...","This is a good flat iron, it has different tem...","[0, 713, 16, 10, 205, 3269, 6440, 6, 24, 34, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4,4,4
3,4.0,This is the best gel I’ve tried. Super hold w...,This is the best gel I’ve tried. Super hold wi...,"[0, 713, 16, 5, 275, 17916, 38, 17, 27, 548, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4,4,4
4,4.0,It comes with 1 set of the 4 smaller sizes and...,It comes with 1 set of the 4 smaller sizes and...,"[0, 243, 606, 19, 112, 278, 9, 5, 204, 2735, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4,4,4
...,...,...,...,...,...,...,...,...
995,4.0,Very effective. Can see good change :),Very effective. Can see good change :),"[0, 25101, 2375, 4, 2615, 192, 205, 464, 44660...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...",4,4,4
996,4.0,This product works! The treatment works on sh...,This product works! The treatment works on she...,"[0, 713, 1152, 1364, 328, 20, 1416, 1364, 15, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4,4,4
997,4.0,"Great price. Great product, and very fast ship...","Great price. Great product, and very fast ship...","[0, 19065, 425, 4, 2860, 1152, 6, 8, 182, 1769...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...",4,4,4
998,3.0,I actually liked the hair. It did shed a littl...,I actually liked the hair. It did shed a littl...,"[0, 100, 888, 6640, 5, 2549, 4, 85, 222, 7722,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",3,3,3


In [16]:
# Persist the test frame, including the prediction columns, as CSV.
# Create the output folder first so the write does not fail when it is missing
# (`os` is imported in the setup cell at the top of the notebook).
os.makedirs('results', exist_ok=True)
test.to_csv('results/test_5_bert.csv', index=False)