In [1]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# TODO: Enter the foldername in your Drive where you have saved the unzipped
FOLDERNAME = "Academics/DATA512/Project/llm-roberta-sentiment"
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
import os
sys.path.append('/content/drive/MyDrive/{}'.format(FOLDERNAME))
os.chdir('/content/drive/MyDrive/{}'.format(FOLDERNAME))


Mounted at /content/drive


In [2]:
! pip install -r requirements.txt

Collecting datasets==3.6.0 (from -r requirements.txt (line 2))
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dotenv==0.9.9 (from -r requirements.txt (line 3))
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting groq==0.36.0 (from -r requirements.txt (line 5))
  Downloading groq-0.36.0-py3-none-any.whl.metadata (16 kB)
Collecting imblearn==0.0 (from -r requirements.txt (line 7))
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting torchmetrics>=1.8.2 (from -r requirements.txt (line 24))
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics>=1.8.2->-r requirements.txt (line 24))
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel==6.17.1->-r requirements.txt (line 8))
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading datasets-3.6.0-py

In [3]:
import pandas as pd
import polars as pl
import numpy as np
from datasets import load_dataset

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import AutoTokenizer

In [5]:
import re

val_size= 1000
test_size= 1000
train_size = 10000

dataset_all_beauty = load_dataset("McAuley-Lab/Amazon-Reviews-2023"
                                  , "raw_review_All_Beauty"
                                  , trust_remote_code=True)

dataset_all_beauty = dataset_all_beauty['full']

README.md: 0.00B [00:00, ?B/s]

Amazon-Reviews-2023.py: 0.00B [00:00, ?B/s]

raw/review_categories/All_Beauty.jsonl:   0%|          | 0.00/327M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

In [6]:
len(dataset_all_beauty)

701528

In [7]:
class TokenizedDataGenerator:
    def __init__(self, dataset, tokenizer, seed=1234):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.data_dict = {}
        self.seed = seed

    def clean_text(self, text: str) -> str:
        # Remove HTML tags
        text = re.sub(r"<[^>]+>", " ", text)

        # Remove URLs
        text = re.sub(r"https?://\S+|www\.\S+", " ", text)

        # Remove bracketed content: (), [], {}, <>
        text = re.sub(r"\([^)]*\)|\[[^\]]*\]|\{[^}]*\}|<[^>]*>", " ", text)

        # Replace newlines with space
        text = text.replace("\n", " ")

        # Collapse multiple spaces
        text = re.sub(r"\s+", " ", text)

        return text.strip()

    def clean_text_function(self, example):
        example["text_cleaned"] = self.clean_text(example["text"])
        return example

    def clean_rating_function(self, example, rating_cats):
        if len(rating_cats) == 2:
            # Binary classification: map ratings to 0 and 1
            example["rating"] = 1 if example["rating"] >= 4 else 0
        else:
            # Multi-class classification: map ratings to 0-4
            example["rating"] = example["rating"] - 1
        return example

    def get_training_validation_test_split(self, data, train_size, val_size, test_size,
                                           rating_cats, seed, use_cols):
        data = data.shuffle(seed=seed).filter(lambda example: example['rating'] in rating_cats)
        data_dict = {}

        # random sample 1000 as validation set and 1000 as test set
        train_test = data.train_test_split(test_size=test_size, seed=seed)
        valid_train = train_test['train'].train_test_split(test_size=val_size, seed=seed)
        data_train = train_test['train'].select(range(train_size))

        data_dict['train'] = data_train.select_columns(use_cols)
        data_dict['val'] = valid_train['test'].select_columns(use_cols)
        data_dict['test'] = train_test['test'].select_columns(use_cols)

        for key in data_dict.keys():
            print(f"{key} size: {data_dict[key].shape}")

            data_dict[key] = data_dict[key].map(self.clean_text_function)
            data_dict[key] = data_dict[key].map(self.clean_rating_function, fn_kwargs={"rating_cats": rating_cats})

        return data_dict

    def tokenize_function(self, example):
        return self.tokenizer(example["text_cleaned"], padding="max_length", max_length=512, truncation=True, return_attention_mask=True,)


    def get_dataloaders(self, train_size, val_size, test_size, rating_cats):
        self.data_dict = self.get_training_validation_test_split(self.dataset, train_size=train_size, val_size=val_size, test_size=test_size,
                                           rating_cats=rating_cats, seed=self.seed, use_cols = ['rating', 'text'])
        for key in self.data_dict.keys():
            tokenized_data = self.data_dict[key].map(self.tokenize_function, batched=True)
            tokenized_data.to_parquet(f"data/{key}_{train_size//1000}k_{len(rating_cats)}.parquet")
            print(f"Saved tokenized_data for {key} set.")
            self.data_dict[key] = tokenized_data

        return self.data_dict

In [8]:
train_size = 10000
val_size = 1000
test_size = 1000
rating_cats = [1,2,3,4,5]
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
tokenized_data_generator = TokenizedDataGenerator(dataset_all_beauty, tokenizer, seed=1234)
data_dict_10k = tokenized_data_generator.get_dataloaders(train_size=train_size, val_size=val_size, test_size=test_size, rating_cats=rating_cats)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Filter:   0%|          | 0/701528 [00:00<?, ? examples/s]

train size: (10000, 2)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

val size: (1000, 2)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

test size: (1000, 2)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Saved tokenized_data for train set.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tokenized_data for val set.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tokenized_data for test set.


In [24]:
tokenized_test = data_dict_10k['test']
def create_dataloader(data, batch_size=32):

    labels = torch.tensor(data['rating'], dtype=torch.long)
    input_ids = torch.tensor(data['input_ids'])
    attention_mask = torch.tensor(data['attention_mask'])
    train_data = TensorDataset(input_ids, attention_mask, labels)

    return DataLoader(train_data, batch_size=batch_size)

test_dataloader = create_dataloader(tokenized_test)

In [25]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

model_path = "model/sentiment_model_10k_5"

model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
with torch.no_grad():
    model.eval()
    total_eval_loss = 0
    preds = torch.tensor([]).to(device)
    target = torch.tensor([]).to(device)

    for batch in test_dataloader:
        b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        preds = torch.cat([preds, torch.argmax(outputs.logits, axis=1)])
        target = torch.cat([target, b_labels])

In [26]:
test = pd.DataFrame(tokenized_test)

In [27]:
preds_np = preds.detach().cpu().numpy()
test['pred_10k_5'] = preds_np

In [28]:
(test['pred_10k_5'] == test['rating']).sum()

np.int64(753)

In [None]:
train_size = 50000
val_size = 1000
test_size = 1000
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
rating_cats = [1,2,3,4,5]
tokenized_data_generator = TokenizedDataGenerator(dataset_all_beauty, tokenizer, seed=1234)
data_dict = tokenized_data_generator.get_dataloaders(train_size=train_size, val_size=val_size, test_size=test_size, rating_cats=rating_cats)

train size: (50000, 2)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

val size: (1000, 2)
test size: (1000, 2)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Saved tokenized_data for train set.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tokenized_data for val set.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tokenized_data for test set.


In [None]:
train_size = 10000
val_size = 1000
test_size = 1000
rating_cats = [1,5]
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
tokenized_data_generator = TokenizedDataGenerator(dataset_all_beauty, tokenizer, seed=1234)
data_dict = tokenized_data_generator.get_dataloaders(train_size=train_size, val_size=val_size, test_size=test_size, rating_cats=rating_cats)

train size: (10000, 2)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

val size: (1000, 2)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

test size: (1000, 2)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Saved tokenized_data for train set.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tokenized_data for val set.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tokenized_data for test set.


In [None]:
train_size = 50000
val_size = 1000
test_size = 1000
rating_cats = [1,5]
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
tokenized_data_generator = TokenizedDataGenerator(dataset_all_beauty, tokenizer, seed=1234)
data_dict = tokenized_data_generator.get_dataloaders(train_size=train_size, val_size=val_size, test_size=test_size, rating_cats=rating_cats)

train size: (50000, 2)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

val size: (1000, 2)
test size: (1000, 2)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Saved tokenized_data for train set.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tokenized_data for val set.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tokenized_data for test set.


In [11]:
train_size = 100000
val_size = 1000
test_size = 1000
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
rating_cats = [1,2,3,4,5]
tokenized_data_generator = TokenizedDataGenerator(dataset_all_beauty, tokenizer, seed=1234)
data_dict = tokenized_data_generator.get_dataloaders(train_size=train_size, val_size=val_size, test_size=test_size, rating_cats=rating_cats)

Filter:   0%|          | 0/701528 [00:00<?, ? examples/s]

train size: (100000, 2)


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

val size: (1000, 2)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

test size: (1000, 2)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

Saved tokenized_data for train set.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tokenized_data for val set.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tokenized_data for test set.


In [13]:
!ls data

test_10k_2.parquet     test_50k_5.parquet   val_10k_2.parquet
test_10k_5.parquet     train_10k_2.parquet  val_10k_5.parquet
test_10k_5_texts.csv   train_10k_5.parquet  val_50k_2.parquet
test_10k_5_texts.json  train_50k_2.parquet  val_50k_5.parquet
test_50k_2.parquet     train_50k_5.parquet


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset


tokenized_train = pd.read_parquet("data/train_10k_5.parquet")
tokenized_val = pd.read_parquet("data/val_10k_5.parquet")
tokenized_test = pd.read_parquet("data/test_10k_5.parquet")


def create_dataloader(data, batch_size=32):

    labels = torch.tensor(data['rating'], dtype=torch.long)
    input_ids = torch.tensor(data['input_ids'])
    attention_mask = torch.tensor(data['attention_mask'])
    train_data = TensorDataset(input_ids, attention_mask, labels)

    return DataLoader(train_data, batch_size=batch_size, shuffle=True)

train_dataloader = create_dataloader(tokenized_train)
val_dataloader = create_dataloader(tokenized_val)
test_dataloader = create_dataloader(tokenized_test)

In [None]:
from torchmetrics.classification import MulticlassF1Score
from transformers import RobertaForSequenceClassification


# put the model on device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
mcf1s = MulticlassF1Score(num_classes=5, average=None).to(device)

model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=5)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
from torch.optim import AdamW
import time
# set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)


epochs = 5
start_time = time.time()
best_loss = float('inf')
best_model = None

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
        # resetting gradients before backpropagation
        model.zero_grad()
        # performing a forward pass to calculate outputs
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        # calculating the loss
        loss = outputs.loss
        total_loss += loss.item()
        # backpropagating the loss to compute gradients
        loss.backward()
        # updating model parameters using the computed gradients
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)

    with torch.no_grad():
        model.eval()
        total_eval_loss = 0
        preds = torch.tensor([]).to(device)
        target = torch.tensor([]).to(device)

        for batch in val_dataloader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            loss = outputs.loss
            preds = torch.cat([preds, torch.argmax(outputs.logits, axis=1)])
            target = torch.cat([target, b_labels])
            total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(val_dataloader)

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            best_model = model

    print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}, Average Validation Loss: {avg_val_loss:.4f}")
    print(mcf1s(preds, target), '\n')

end_time = time.time()
# print(end_time)
print(f'{(end_time - start_time)/60} mins' )

Epoch 1, Average Training Loss: 0.9307, Average Validation Loss: 0.6203
tensor([0.7929, 0.0741, 0.4000, 0.2390, 0.9152], device='cuda:0') 

Epoch 2, Average Training Loss: 0.6503, Average Validation Loss: 0.6040
tensor([0.7988, 0.1053, 0.4507, 0.3068, 0.9190], device='cuda:0') 

Epoch 3, Average Training Loss: 0.6004, Average Validation Loss: 0.6025
tensor([0.7885, 0.2500, 0.4648, 0.3945, 0.9199], device='cuda:0') 

Epoch 4, Average Training Loss: 0.5608, Average Validation Loss: 0.6219
tensor([0.7841, 0.1639, 0.5098, 0.3279, 0.9201], device='cuda:0') 

Epoch 5, Average Training Loss: 0.5246, Average Validation Loss: 0.6403
tensor([0.7937, 0.1882, 0.4474, 0.3210, 0.9212], device='cuda:0') 

15.379777880509694 mins


In [None]:
# Save the model and tokenizer
best_model.save_pretrained("model/sentiment_model_10k_5")
tokenizer.save_pretrained("model/sentiment_model_10k_5")

('model/sentiment_model_10k_5/tokenizer_config.json',
 'model/sentiment_model_10k_5/special_tokens_map.json',
 'model/sentiment_model_10k_5/vocab.json',
 'model/sentiment_model_10k_5/merges.txt',
 'model/sentiment_model_10k_5/added_tokens.json',
 'model/sentiment_model_10k_5/tokenizer.json')

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset


tokenized_train = pd.read_parquet("data/train_50k_5.parquet")
tokenized_val = pd.read_parquet("data/val_50k_5.parquet")
tokenized_test = pd.read_parquet("data/test_50k_5.parquet")


def create_dataloader(data, batch_size=32):

    labels = torch.tensor(data['rating'], dtype=torch.long)
    input_ids = torch.tensor(data['input_ids'])
    attention_mask = torch.tensor(data['attention_mask'])
    train_data = TensorDataset(input_ids, attention_mask, labels)

    return DataLoader(train_data, batch_size=batch_size, shuffle=True)

train_dataloader = create_dataloader(tokenized_train)
val_dataloader = create_dataloader(tokenized_val)
test_dataloader = create_dataloader(tokenized_test)

In [None]:
from torchmetrics.classification import MulticlassF1Score
from transformers import RobertaForSequenceClassification


# put the model on device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
mcf1s = MulticlassF1Score(num_classes=5, average=None).to(device)

model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=5)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
from torch.optim import AdamW
import time
# set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)


epochs = 5
start_time = time.time()
best_loss = float('inf')
best_model = None

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
        # resetting gradients before backpropagation
        model.zero_grad()
        # performing a forward pass to calculate outputs
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        # calculating the loss
        loss = outputs.loss
        total_loss += loss.item()
        # backpropagating the loss to compute gradients
        loss.backward()
        # updating model parameters using the computed gradients
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)

    with torch.no_grad():
        model.eval()
        total_eval_loss = 0
        preds = torch.tensor([]).to(device)
        target = torch.tensor([]).to(device)

        for batch in val_dataloader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            loss = outputs.loss
            preds = torch.cat([preds, torch.argmax(outputs.logits, axis=1)])
            target = torch.cat([target, b_labels])
            total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(val_dataloader)

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            best_model = model

    print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}, Average Validation Loss: {avg_val_loss:.4f}")
    print(mcf1s(preds, target), '\n')

end_time = time.time()
# print(end_time)
print(f'{(end_time - start_time)/60} mins' )

Epoch 1, Average Training Loss: 0.7082, Average Validation Loss: 0.5770
tensor([0.8049, 0.2785, 0.4444, 0.3452, 0.9180], device='cuda:0') 

Epoch 2, Average Training Loss: 0.5982, Average Validation Loss: 0.5524
tensor([0.8254, 0.3656, 0.4861, 0.3210, 0.9176], device='cuda:0') 

Epoch 3, Average Training Loss: 0.5681, Average Validation Loss: 0.5940
tensor([0.8207, 0.2500, 0.5067, 0.3804, 0.9196], device='cuda:0') 

Epoch 4, Average Training Loss: 0.5365, Average Validation Loss: 0.5645
tensor([0.7937, 0.2778, 0.5486, 0.3544, 0.9234], device='cuda:0') 

Epoch 5, Average Training Loss: 0.5078, Average Validation Loss: 0.5837
tensor([0.7900, 0.3256, 0.4930, 0.3394, 0.9224], device='cuda:0') 

74.96768669684728 mins


In [None]:
# Save the model and tokenizer
best_model.save_pretrained("model/sentiment_model_50k_5")
tokenizer.save_pretrained("model/sentiment_model_50k_5")

('model/sentiment_model_50k_5/tokenizer_config.json',
 'model/sentiment_model_50k_5/special_tokens_map.json',
 'model/sentiment_model_50k_5/vocab.json',
 'model/sentiment_model_50k_5/merges.txt',
 'model/sentiment_model_50k_5/added_tokens.json',
 'model/sentiment_model_50k_5/tokenizer.json')

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset


tokenized_train = pd.read_parquet("data/train_10k_2.parquet")
tokenized_val = pd.read_parquet("data/val_10k_2.parquet")
tokenized_test = pd.read_parquet("data/test_10k_2.parquet")


def create_dataloader(data, batch_size=32):

    labels = torch.tensor(data['rating'], dtype=torch.long)
    input_ids = torch.tensor(data['input_ids'])
    attention_mask = torch.tensor(data['attention_mask'])
    train_data = TensorDataset(input_ids, attention_mask, labels)

    return DataLoader(train_data, batch_size=batch_size, shuffle=True)

train_dataloader = create_dataloader(tokenized_train)
val_dataloader = create_dataloader(tokenized_val)
test_dataloader = create_dataloader(tokenized_test)

In [None]:
from torchmetrics.classification import MulticlassF1Score
from transformers import RobertaForSequenceClassification


# put the model on device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
mcf1s = MulticlassF1Score(num_classes=2, average=None).to(device)

model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=2)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
from torch.optim import AdamW
import time
# set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)


epochs = 5
start_time = time.time()
best_loss = float('inf')
best_model = None

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
        # resetting gradients before backpropagation
        model.zero_grad()
        # performing a forward pass to calculate outputs
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        # calculating the loss
        loss = outputs.loss
        total_loss += loss.item()
        # backpropagating the loss to compute gradients
        loss.backward()
        # updating model parameters using the computed gradients
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)

    with torch.no_grad():
        model.eval()
        total_eval_loss = 0
        preds = torch.tensor([]).to(device)
        target = torch.tensor([]).to(device)

        for batch in val_dataloader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            loss = outputs.loss
            preds = torch.cat([preds, torch.argmax(outputs.logits, axis=1)])
            target = torch.cat([target, b_labels])
            total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(val_dataloader)

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            best_model = model

    print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}, Average Validation Loss: {avg_val_loss:.4f}")
    print(mcf1s(preds, target), '\n')

end_time = time.time()
# print(end_time)
print(f'{(end_time - start_time)/60} mins' )

Epoch 1, Average Training Loss: 0.2300, Average Validation Loss: 0.0877
tensor([0.8929, 0.9784], device='cuda:0') 

Epoch 2, Average Training Loss: 0.0725, Average Validation Loss: 0.0944
tensor([0.9046, 0.9815], device='cuda:0') 

Epoch 3, Average Training Loss: 0.0519, Average Validation Loss: 0.0799
tensor([0.9204, 0.9837], device='cuda:0') 

Epoch 4, Average Training Loss: 0.0407, Average Validation Loss: 0.0867
tensor([0.9288, 0.9848], device='cuda:0') 

Epoch 5, Average Training Loss: 0.0340, Average Validation Loss: 0.0839
tensor([0.9341, 0.9861], device='cuda:0') 

15.370466430981955 mins


In [None]:
# Save the model and tokenizer
best_model.save_pretrained("model/sentiment_model_10k_2")
tokenizer.save_pretrained("model/sentiment_model_10k_2")

('model/sentiment_model_10k_2/tokenizer_config.json',
 'model/sentiment_model_10k_2/special_tokens_map.json',
 'model/sentiment_model_10k_2/vocab.json',
 'model/sentiment_model_10k_2/merges.txt',
 'model/sentiment_model_10k_2/added_tokens.json',
 'model/sentiment_model_10k_2/tokenizer.json')

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset


tokenized_train = pd.read_parquet("data/train_50k_2.parquet")
tokenized_val = pd.read_parquet("data/val_50k_2.parquet")
tokenized_test = pd.read_parquet("data/test_50k_2.parquet")


def create_dataloader(data, batch_size=32):

    labels = torch.tensor(data['rating'], dtype=torch.long)
    input_ids = torch.tensor(data['input_ids'])
    attention_mask = torch.tensor(data['attention_mask'])
    train_data = TensorDataset(input_ids, attention_mask, labels)

    return DataLoader(train_data, batch_size=batch_size, shuffle=True)

train_dataloader = create_dataloader(tokenized_train)
val_dataloader = create_dataloader(tokenized_val)
test_dataloader = create_dataloader(tokenized_test)

In [None]:
from torchmetrics.classification import MulticlassF1Score
from transformers import RobertaForSequenceClassification


# put the model on device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
mcf1s = MulticlassF1Score(num_classes=2, average=None).to(device)

model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=2)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
from torch.optim import AdamW
import time
# set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)


epochs = 5
start_time = time.time()
best_loss = float('inf')
best_model = None

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
        # resetting gradients before backpropagation
        model.zero_grad()
        # performing a forward pass to calculate outputs
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        # calculating the loss
        loss = outputs.loss
        total_loss += loss.item()
        # backpropagating the loss to compute gradients
        loss.backward()
        # updating model parameters using the computed gradients
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)

    with torch.no_grad():
        model.eval()
        total_eval_loss = 0
        preds = torch.tensor([]).to(device)
        target = torch.tensor([]).to(device)

        for batch in val_dataloader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            loss = outputs.loss
            preds = torch.cat([preds, torch.argmax(outputs.logits, axis=1)])
            target = torch.cat([target, b_labels])
            total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(val_dataloader)

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            best_model = model

    print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}, Average Validation Loss: {avg_val_loss:.4f}")
    print(mcf1s(preds, target), '\n')

end_time = time.time()
# print(end_time)
print(f'{(end_time - start_time)/60} mins' )

Epoch 1, Average Training Loss: 0.1001, Average Validation Loss: 0.0737
tensor([0.9244, 0.9843], device='cuda:0') 

Epoch 2, Average Training Loss: 0.0512, Average Validation Loss: 0.0786
tensor([0.9446, 0.9885], device='cuda:0') 

Epoch 3, Average Training Loss: 0.0400, Average Validation Loss: 0.0835
tensor([0.9353, 0.9867], device='cuda:0') 

Epoch 4, Average Training Loss: 0.0318, Average Validation Loss: 0.0798
tensor([0.9345, 0.9868], device='cuda:0') 

Epoch 5, Average Training Loss: 0.0235, Average Validation Loss: 0.0688
tensor([0.9560, 0.9910], device='cuda:0') 

74.97515540917715 mins


In [None]:
# Save the model and tokenizer
best_model.save_pretrained("model/sentiment_model_50k_2")
tokenizer.save_pretrained("model/sentiment_model_50k_2")

('model/sentiment_model_50k_2/tokenizer_config.json',
 'model/sentiment_model_50k_2/special_tokens_map.json',
 'model/sentiment_model_50k_2/vocab.json',
 'model/sentiment_model_50k_2/merges.txt',
 'model/sentiment_model_50k_2/added_tokens.json',
 'model/sentiment_model_50k_2/tokenizer.json')

In [None]:
data_dict

{'train': Dataset({
     features: ['rating', 'text', 'text_cleaned', 'input_ids', 'attention_mask'],
     num_rows: 100000
 }),
 'val': Dataset({
     features: ['rating', 'text', 'text_cleaned', 'input_ids', 'attention_mask'],
     num_rows: 1000
 }),
 'test': Dataset({
     features: ['rating', 'text', 'text_cleaned', 'input_ids', 'attention_mask'],
     num_rows: 1000
 })}

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset


tokenized_train = pd.read_parquet("data/train_100k_5.parquet")
tokenized_val = pd.read_parquet("data/val_100k_5.parquet")
tokenized_test = pd.read_parquet("data/test_100k_5.parquet")

# tokenized_train = data_dict['train']
# tokenized_val = data_dict['val']
# tokenized_test = data_dict['test']

train_dataloader = create_dataloader(tokenized_train)
val_dataloader = create_dataloader(tokenized_val)
test_dataloader = create_dataloader(tokenized_test)

  input_ids = torch.tensor(data['input_ids'])


In [None]:
from torchmetrics.classification import MulticlassF1Score
from transformers import RobertaForSequenceClassification


# put the model on device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
mcf1s = MulticlassF1Score(num_classes=5, average=None).to(device)

model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=5)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
from torch.optim import AdamW
import time
# set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)


epochs = 5
start_time = time.time()
best_loss = float('inf')
best_model = None

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
        # resetting gradients before backpropagation
        model.zero_grad()
        # performing a forward pass to calculate outputs
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        # calculating the loss
        loss = outputs.loss
        total_loss += loss.item()
        # backpropagating the loss to compute gradients
        loss.backward()
        # updating model parameters using the computed gradients
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)

    with torch.no_grad():
        model.eval()
        total_eval_loss = 0
        preds = torch.tensor([]).to(device)
        target = torch.tensor([]).to(device)

        for batch in val_dataloader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            loss = outputs.loss
            preds = torch.cat([preds, torch.argmax(outputs.logits, axis=1)])
            target = torch.cat([target, b_labels])
            total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(val_dataloader)

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            best_model = model

    print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}, Average Validation Loss: {avg_val_loss:.4f}")
    print(mcf1s(preds, target), '\n')

end_time = time.time()
# print(end_time)
print(f'{(end_time - start_time)/60} mins' )

Epoch 1, Average Training Loss: 0.6587, Average Validation Loss: 0.5855
tensor([0.7976, 0.2462, 0.5000, 0.3247, 0.9199], device='cuda:0') 

Epoch 2, Average Training Loss: 0.5835, Average Validation Loss: 0.5526
tensor([0.8137, 0.2712, 0.5233, 0.3316, 0.9143], device='cuda:0') 

Epoch 3, Average Training Loss: 0.5562, Average Validation Loss: 0.5376
tensor([0.8075, 0.2903, 0.4845, 0.3584, 0.9220], device='cuda:0') 

Epoch 4, Average Training Loss: 0.5299, Average Validation Loss: 0.5405
tensor([0.7974, 0.3056, 0.6022, 0.3953, 0.9241], device='cuda:0') 

Epoch 5, Average Training Loss: 0.5039, Average Validation Loss: 0.5564
tensor([0.8025, 0.3243, 0.5476, 0.3736, 0.9196], device='cuda:0') 

148.95719420115154 mins


In [None]:
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# Save the model and tokenizer
best_model.save_pretrained("model/sentiment_model_100k_5")
tokenizer.save_pretrained("model/sentiment_model_100k_5")

('model/sentiment_model_100k_5/tokenizer_config.json',
 'model/sentiment_model_100k_5/special_tokens_map.json',
 'model/sentiment_model_100k_5/vocab.json',
 'model/sentiment_model_100k_5/merges.txt',
 'model/sentiment_model_100k_5/added_tokens.json',
 'model/sentiment_model_100k_5/tokenizer.json')

In [None]:
with torch.no_grad():
    model.eval()
    total_eval_loss = 0
    preds = torch.tensor([]).to(device)
    target = torch.tensor([]).to(device)

    for batch in test_dataloader:
        b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        preds = torch.cat([preds, torch.argmax(outputs.logits, axis=1)])
        target = torch.cat([target, b_labels])

In [None]:
(target == preds).sum()

tensor(773, device='cuda:0')

In [None]:
from google.colab import runtime
runtime.unassign()

In [None]:
import gc
best_model.to('cpu')
del best_model #, checkpoint
gc.collect()
torch.cuda.empty_cache()