# Fine-tune roBERTa Model

This notebook is used to fine-tune the pre-trained roBERTa Model.


To run this notebook successfully, you may need online computing resources. In my case, I used Google Colab with a GPU to fine-tune the RoBERTa model.


To access and download the required data, we need to mount our Google Drive in the Colab environment.

One can skip the following block, if not using google colab.

In [None]:
#### Skip this block if not using google colab. ####

import os
import sys

from google.colab import drive

# Attach Google Drive to the Colab VM's filesystem.
drive.mount('/content/drive')

# TODO: Enter the foldername in your Drive where this repo is located.
FOLDERNAME = "Academics/DATA512/Project/llm-roberta-sentiment" # <--- ENTER FOLDERNAME HERE
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Build the repo location once, then (1) add it to the module search path so
# python files inside it can be imported and (2) make it the working directory
# so the relative paths used below resolve inside the repo.
PROJECT_DIR = '/content/drive/MyDrive/{}'.format(FOLDERNAME)
sys.path.append(PROJECT_DIR)
os.chdir(PROJECT_DIR)


Mounted at /content/drive


## 0. Settings

In [None]:
# Install dependencies
! pip install -r requirements.txt



Load libraries

In [None]:
import os
import re
import sys
import time

from datasets import load_dataset
import numpy as np
import pandas as pd

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, TensorDataset
import transformers
from transformers import AutoTokenizer

In [None]:
# Make sure the output folders used below exist; folders that are already
# present are left untouched (exist_ok=True).
for folder in ("data", "models", "results"):
    os.makedirs(folder, exist_ok=True)

## 1. Load Data

We will use a small subset of the data from the `All Beauty` category in Amazon Review Data 2023 for finetuning and evaluation.

In [None]:
# Load the raw "All Beauty" reviews of Amazon Reviews 2023.
# NOTE: trust_remote_code=True allows the dataset repository's loading script
# to run locally, so only use it with sources you trust.
dataset_splits = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    trust_remote_code=True,
)

# Keep only the 'full' split and display how many reviews it holds.
dataset_all_beauty = dataset_splits['full']
len(dataset_all_beauty)

701528

## 2. Create Dataloader for Fine-tuning

Below we define a helper class to generate tokenized data.

In [None]:
class TokenizedDataGenerator:
    '''
    A helper class to generate tokenized data for roBERTa model fine-tuning.

    Pipeline: keep only the reviews whose rating is in the requested
    categories, split them into mutually disjoint train / validation / test
    sets, clean the review text, remap the ratings to zero-based class labels
    and finally tokenize the cleaned text.
    '''
    def __init__(self,
                 dataset: "datasets.Dataset",
                 tokenizer: "transformers.PreTrainedTokenizerBase",
                 seed: int = 1234) -> None:
        '''
        Args:
            dataset: Review dataset with at least a ``rating`` and a ``text``
                column. It must offer the Hugging Face ``datasets.Dataset``
                methods used below (``shuffle``, ``filter``,
                ``train_test_split``, ``select``, ``select_columns``, ``map``).
            tokenizer: Callable tokenizer applied to the cleaned text.
            seed: Seed forwarded to every shuffle / split for reproducibility.
        '''
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.data_dict = {}  # filled by get_tokenized_data(): split name -> dataset
        self.seed = seed

    def clean_text(self, text: str) -> str:
        '''
        Clean the input text by removing HTML tags, URLs, bracketed content, and extra spaces.

        Returns:
            The cleaned text with leading/trailing whitespace stripped.
        '''
        # Remove HTML tags
        text = re.sub(r"<[^>]+>", " ", text)

        # Remove URLs
        text = re.sub(r"https?://\S+|www\.\S+", " ", text)

        # Remove bracketed content: (), [], {}, <>
        text = re.sub(r"\([^)]*\)|\[[^\]]*\]|\{[^}]*\}|<[^>]*>", " ", text)

        # Replace newlines with space
        text = text.replace("\n", " ")

        # Collapse multiple spaces
        text = re.sub(r"\s+", " ", text)

        return text.strip()

    def clean_text_function(self, example: dict) -> dict:
        '''
        Add a ``text_cleaned`` entry holding the cleaned version of ``example["text"]``.
        '''
        example["text_cleaned"] = self.clean_text(example["text"])
        return example

    def clean_rating_function(self, example: dict, rating_cats: list) -> dict:
        '''
        Replace ``example["rating"]`` with a zero-based class label.

        With exactly two rating categories the task is treated as binary:
        ratings >= 4 become 1, everything else becomes 0. Otherwise the rating
        is shifted down by one (1..5 -> 0..4).
        '''
        if len(rating_cats) == 2:
            # Binary classification: map ratings to 0 and 1
            example["rating"] = 1 if example["rating"] >= 4 else 0
        else:
            # Multi-class classification: map ratings to 0-4
            example["rating"] = example["rating"] - 1
        return example

    def get_training_validation_test_split(self,
                                           data: "datasets.Dataset",
                                           train_size: int,
                                           val_size: int,
                                           test_size: int,
                                           rating_cats: list,
                                           seed: int,
                                           use_cols: list) -> dict:
        '''
        Split the dataset into disjoint training, validation, and test sets.

        Args:
            data: Dataset to split.
            train_size: Number of training rows to keep.
            val_size: Number of validation rows.
            test_size: Number of test rows.
            rating_cats: Ratings to keep; rows with other ratings are dropped.
            seed: Seed for the shuffle and both splits.
            use_cols: Columns kept in every split.

        Returns:
            Dict with keys ``'train'``, ``'val'`` and ``'test'``; each value has
            cleaned text (``text_cleaned``) and remapped ratings.

        Raises:
            ValueError: If fewer than ``train_size`` rows remain once the test
                and validation rows have been set aside.
        '''
        # Shuffle and filter the dataset based on rating categories
        filtered = data.shuffle(seed=seed).filter(lambda example: example['rating'] in rating_cats)

        # Set aside the test rows first, then the validation rows from the rest.
        train_test = filtered.train_test_split(test_size=test_size, seed=seed)
        valid_train = train_test['train'].train_test_split(test_size=val_size, seed=seed)

        # Take the training rows from what is left AFTER removing validation rows.
        # Selecting from train_test['train'] instead would allow validation
        # examples to end up in the training set (data leakage).
        train_pool = valid_train['train']
        if train_size > len(train_pool):
            raise ValueError(
                f"train_size={train_size} exceeds the {len(train_pool)} rows left "
                "after removing the validation and test sets."
            )
        data_train = train_pool.select(range(train_size))

        data_dict = {
            'train': data_train.select_columns(use_cols),
            'val': valid_train['test'].select_columns(use_cols),
            'test': train_test['test'].select_columns(use_cols),
        }

        # Clean text and ratings and save into data_dict
        for key in data_dict.keys():
            print(f"{key} size: {data_dict[key].shape}")

            data_dict[key] = data_dict[key].map(self.clean_text_function)
            data_dict[key] = data_dict[key].map(self.clean_rating_function, fn_kwargs={"rating_cats": rating_cats})

        return data_dict

    def tokenize_function(self, example: dict) -> dict:
        '''
        Tokenize the cleaned text in the examples.

        Every sequence is padded or truncated to 512 tokens and the attention
        mask is returned alongside the token ids.
        '''
        return self.tokenizer(
            example["text_cleaned"],
            padding="max_length",
            max_length=512,
            truncation=True,
            return_attention_mask=True,
        )

    def get_tokenized_data(self, train_size: int, val_size: int, test_size: int, rating_cats: list, save=True) -> dict:
        '''
        Get tokenized data for training, validation, and test sets.

        When ``save`` is True, each split is also written to
        ``data/<split>_<train_size // 1000>k_<number of rating categories>.parquet``.

        Returns:
            Dict with keys ``'train'``, ``'val'`` and ``'test'`` holding the
            tokenized splits (also stored on ``self.data_dict``).
        '''
        # Get training, validation, and test splits
        self.data_dict = self.get_training_validation_test_split(
            self.dataset,
            train_size=train_size,
            val_size=val_size,
            test_size=test_size,
            rating_cats=rating_cats,
            seed=self.seed,
            use_cols=['rating', 'text'],
        )

        # Tokenize the data
        for key in self.data_dict.keys():
            tokenized_data = self.data_dict[key].map(self.tokenize_function, batched=True)
            # Save tokenized data as parquet files if save is True
            if save:
                tokenized_data.to_parquet(f"data/{key}_{train_size//1000}k_{len(rating_cats)}.parquet")
                print(f"Saved tokenized_data for {key} set.")
            self.data_dict[key] = tokenized_data

        return self.data_dict

After defining the class, we can create a `TokenizedDataGenerator` and generate the tokenized data.

In [None]:
# complex task (multi-class classification) with 10k training samples
train_size = 10000
val_size = 1000
test_size = 1000
# All five star ratings are kept -> 5-class problem.
rating_cats = [1,2,3,4,5]
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# The fixed seed is forwarded to the shuffle and the splits.
tokenized_data_generator = TokenizedDataGenerator(dataset_all_beauty, tokenizer, seed=1234)
# Splits, cleans and tokenizes the data; with the default save=True each split is also written to data/*.parquet.
data_dict_10k_5 = tokenized_data_generator.get_tokenized_data(train_size=train_size, val_size=val_size, test_size=test_size, rating_cats=rating_cats)

train size: (10000, 2)


Map: 100%|██████████| 10000/10000 [00:00<00:00, 40262.79 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 41532.94 examples/s]


val size: (1000, 2)


Map: 100%|██████████| 1000/1000 [00:00<00:00, 35090.26 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 80921.13 examples/s]


test size: (1000, 2)


Map: 100%|██████████| 1000/1000 [00:00<00:00, 33531.36 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 83668.54 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 14451.80 examples/s]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 95.28ba/s]


Saved tokenized_data for train set.


Map: 100%|██████████| 1000/1000 [00:00<00:00, 12608.31 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 108.57ba/s]


Saved tokenized_data for val set.


Map: 100%|██████████| 1000/1000 [00:00<00:00, 13972.96 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 100.22ba/s]

Saved tokenized_data for test set.





In [None]:
# complex task (multi-class classification) with 50k training samples
train_size = 50000
val_size = 1000
test_size = 1000
# All five star ratings are kept -> 5-class problem.
rating_cats = [1,2,3,4,5]
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

# The fixed seed is forwarded to the shuffle and the splits.
tokenized_data_generator = TokenizedDataGenerator(dataset_all_beauty, tokenizer, seed=1234)
# Splits, cleans and tokenizes the data; with the default save=True each split is also written to data/*.parquet.
data_dict_50k_5 = tokenized_data_generator.get_tokenized_data(train_size=train_size, val_size=val_size, test_size=test_size, rating_cats=rating_cats)

train size: (50000, 2)


Map: 100%|██████████| 50000/50000 [00:01<00:00, 37636.55 examples/s]
Map: 100%|██████████| 50000/50000 [00:00<00:00, 85596.15 examples/s]


val size: (1000, 2)
test size: (1000, 2)


Map: 100%|██████████| 50000/50000 [00:04<00:00, 12320.63 examples/s]
Creating parquet from Arrow format: 100%|██████████| 50/50 [00:00<00:00, 94.49ba/s]


Saved tokenized_data for train set.


Map: 100%|██████████| 1000/1000 [00:00<00:00, 14030.92 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 94.04ba/s]


Saved tokenized_data for val set.


Map: 100%|██████████| 1000/1000 [00:00<00:00, 14252.91 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 95.36ba/s]

Saved tokenized_data for test set.





In [None]:
# complex task (multi-class classification) with 100k training samples
train_size = 100000
val_size = 1000
test_size = 1000
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# All five star ratings are kept -> 5-class problem.
rating_cats = [1,2,3,4,5]
# The fixed seed is forwarded to the shuffle and the splits.
tokenized_data_generator = TokenizedDataGenerator(dataset_all_beauty, tokenizer, seed=1234)
# Splits, cleans and tokenizes the data; with the default save=True each split is also written to data/*.parquet.
data_dict_100k_5 = tokenized_data_generator.get_tokenized_data(train_size=train_size, val_size=val_size, test_size=test_size, rating_cats=rating_cats)

train size: (100000, 2)


Map: 100%|██████████| 100000/100000 [00:02<00:00, 37650.59 examples/s]
Map: 100%|██████████| 100000/100000 [00:01<00:00, 79683.75 examples/s]


val size: (1000, 2)
test size: (1000, 2)


Map: 100%|██████████| 100000/100000 [00:07<00:00, 12955.35 examples/s]
Creating parquet from Arrow format: 100%|██████████| 100/100 [00:01<00:00, 88.67ba/s]


Saved tokenized_data for train set.


Map: 100%|██████████| 1000/1000 [00:00<00:00, 13091.45 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 91.63ba/s]


Saved tokenized_data for val set.


Map: 100%|██████████| 1000/1000 [00:00<00:00, 12067.40 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 92.76ba/s]

Saved tokenized_data for test set.





In [None]:
# simple task (binary classification) with 10k training samples
train_size = 10000
val_size = 1000
test_size = 1000
# Only 1-star and 5-star reviews are kept; two categories switch the label mapping to binary (0/1).
rating_cats = [1,5]
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# The fixed seed is forwarded to the shuffle and the splits.
tokenized_data_generator = TokenizedDataGenerator(dataset_all_beauty, tokenizer, seed=1234)
# Splits, cleans and tokenizes the data; with the default save=True each split is also written to data/*.parquet.
data_dict_10k_2 = tokenized_data_generator.get_tokenized_data(train_size=train_size, val_size=val_size, test_size=test_size, rating_cats=rating_cats)

train size: (10000, 2)


Map: 100%|██████████| 10000/10000 [00:00<00:00, 40591.15 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 75736.94 examples/s]


val size: (1000, 2)


Map: 100%|██████████| 1000/1000 [00:00<00:00, 9421.17 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 80611.64 examples/s]


test size: (1000, 2)


Map: 100%|██████████| 1000/1000 [00:00<00:00, 33539.14 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 70522.14 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 12940.07 examples/s]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 99.95ba/s]


Saved tokenized_data for train set.


Map: 100%|██████████| 1000/1000 [00:00<00:00, 11255.34 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 85.40ba/s]


Saved tokenized_data for val set.


Map: 100%|██████████| 1000/1000 [00:00<00:00, 11868.60 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 101.30ba/s]

Saved tokenized_data for test set.





In [None]:
# simple task (binary classification) with 50k training samples
train_size = 50000
val_size = 1000
test_size = 1000
# Only 1-star and 5-star reviews are kept; two categories switch the label mapping to binary (0/1).
rating_cats = [1,5]
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# The fixed seed is forwarded to the shuffle and the splits.
tokenized_data_generator = TokenizedDataGenerator(dataset_all_beauty, tokenizer, seed=1234)
# Splits, cleans and tokenizes the data; with the default save=True each split is also written to data/*.parquet.
data_dict_50k_2 = tokenized_data_generator.get_tokenized_data(train_size=train_size, val_size=val_size, test_size=test_size, rating_cats=rating_cats)

train size: (50000, 2)


Map: 100%|██████████| 50000/50000 [00:01<00:00, 36877.42 examples/s]
Map: 100%|██████████| 50000/50000 [00:00<00:00, 75355.91 examples/s]


val size: (1000, 2)
test size: (1000, 2)


Map: 100%|██████████| 50000/50000 [00:03<00:00, 13106.13 examples/s]
Creating parquet from Arrow format: 100%|██████████| 50/50 [00:00<00:00, 87.22ba/s]


Saved tokenized_data for train set.


Map: 100%|██████████| 1000/1000 [00:00<00:00, 13650.76 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 90.14ba/s]


Saved tokenized_data for val set.


Map: 100%|██████████| 1000/1000 [00:00<00:00, 13256.08 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 87.46ba/s]

Saved tokenized_data for test set.





## 3. Fine-tune

In this part, we will fine-tune the RoBERTa model with different training sizes and predict targets.

Before doing it, we will need to create dataloader for batch training.

In [None]:
def create_dataloader(data: dict, batch_size:int=32, shuffle: bool = True) -> DataLoader:
    '''
    Create a DataLoader from the tokenized data.

    Args:
        data: Mapping-like object with the columns ``'rating'``,
            ``'input_ids'`` and ``'attention_mask'``. Works with the tokenized
            datasets built above, a plain dict of lists, or a pandas DataFrame
            read back from the saved parquet files.
        batch_size: Number of examples per batch.
        shuffle: Whether to reshuffle the examples every epoch. Defaults to
            True (the previous behavior); pass False for evaluation loaders.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` batches,
        all as ``torch.long`` tensors.
    '''

    def _column_to_tensor(column) -> torch.Tensor:
        # list(...) materializes any column type (list, pandas Series, dataset
        # column); np.asarray then stacks the per-row sequences into one array,
        # which torch.tensor cannot do for a pandas Series of arrays.
        return torch.as_tensor(np.asarray(list(column)), dtype=torch.long)

    labels = _column_to_tensor(data['rating'])
    input_ids = _column_to_tensor(data['input_ids'])
    attention_mask = _column_to_tensor(data['attention_mask'])
    tensor_data = TensorDataset(input_ids, attention_mask, labels)

    return DataLoader(tensor_data, batch_size=batch_size, shuffle=shuffle)

Load pre-trained model and target metrics

In [None]:
from torchmetrics.classification import MulticlassF1Score
from transformers import RobertaForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# average=None reports one F1 score per class instead of a single averaged value.
mcf1s = MulticlassF1Score(num_classes=5, average=None).to(device)
# Pre-trained encoder with a 5-label classification head (one label per star rating).
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=5)
# Move the model to the selected device; as the last expression this also displays the architecture.
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

Define the train_model function

In [None]:
def train_model(model, train_dataloader, val_dataloader, optimizer, device, mcf1s, epochs):
    """
    Train the model with the given dataloaders and optimizer.

    After every epoch the model is evaluated on the validation data. The
    weights of the epoch with the lowest average validation loss are
    snapshotted and restored into ``model`` before returning, so the returned
    model really is the best one seen (keeping only a reference to ``model``
    would always yield the last-epoch weights).

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        train_dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        val_dataloader: Same batch layout, used for validation.
        optimizer: Optimizer over the model parameters.
        device: Device every batch tensor is moved to.
        mcf1s: Metric callable invoked as ``mcf1s(preds, target)``; its result is printed.
        epochs: Number of passes over the training data.

    Returns:
        ``model`` itself, loaded with the weights from the epoch with the
        lowest validation loss. If no epoch produced a comparable loss
        (e.g. ``epochs == 0``), the model is returned with its current weights.
    """

    start_time = time.time()
    best_loss = float('inf')
    best_state = None  # CPU copy of the best epoch's weights

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
            # resetting gradients before backpropagation
            model.zero_grad()
            # performing a forward pass to calculate outputs
            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            # calculating the loss
            loss = outputs.loss
            total_loss += loss.item()
            # backpropagating the loss to compute gradients
            loss.backward()
            # updating model parameters using the computed gradients
            optimizer.step()
        avg_train_loss = total_loss / len(train_dataloader)

        # ---- validation pass (no gradients needed) ----
        model.eval()
        total_eval_loss = 0
        # Collect per-batch integer tensors and concatenate once; this keeps
        # the class ids integral instead of promoting them to float.
        batch_preds = []
        batch_targets = []
        with torch.no_grad():
            for batch in val_dataloader:
                b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]
                outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
                batch_preds.append(torch.argmax(outputs.logits, dim=1))
                batch_targets.append(b_labels)
                total_eval_loss += outputs.loss.item()

        avg_val_loss = total_eval_loss / len(val_dataloader)
        preds = torch.cat(batch_preds)
        target = torch.cat(batch_targets)

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            # Copy the tensors: state_dict() returns references that later
            # optimizer steps would overwrite. Stored on the CPU so the
            # snapshot does not take up accelerator memory.
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }

        print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}, Average Validation Loss: {avg_val_loss:.4f}")
        print(mcf1s(preds, target), '\n')

    # Roll the model back to the best epoch before handing it to the caller.
    if best_state is not None:
        model.load_state_dict(best_state)

    end_time = time.time()
    print(f'{(end_time - start_time)/60} mins' )
    return model

### Fine-tune RoBERTa model (10k training; 5 classes)

Create dataloader for batch training

In [None]:
# Optional: if the runtime was restarted, reload the tokenized splits from the
# parquet files written by the tokenization step (uncomment to use).
# tokenized_train = pd.read_parquet("data/train_10k_5.parquet")
# tokenized_val = pd.read_parquet("data/val_10k_5.parquet")
# tokenized_test = pd.read_parquet("data/test_10k_5.parquet")

# Reuse the in-memory splits produced by the tokenization cell above.
tokenized_train = data_dict_10k_5['train']
tokenized_val = data_dict_10k_5['val']
# tokenized_test = data_dict_10k_5['test']

# Batch the train/validation splits; the test split is not needed for fine-tuning here.
train_dataloader = create_dataloader(tokenized_train)
val_dataloader = create_dataloader(tokenized_val)
# test_dataloader = create_dataloader(tokenized_test)

Train model

In [None]:
# set up the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 5

# train the model (pass the `epochs` setting instead of repeating the literal)
best_model = train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    device=device,
    mcf1s=mcf1s,
    epochs=epochs,
)

# Save the model returned by train_model together with its tokenizer
save_dir = "models/sentiment_model_10k_5"
best_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Epoch 1, Average Training Loss: 0.8269, Average Validation Loss: 0.6256
tensor([0.7769, 0.1379, 0.4678, 0.2500, 0.9151], device='cuda:0') 

Epoch 2, Average Training Loss: 0.6250, Average Validation Loss: 0.6057
tensor([0.7666, 0.1791, 0.5371, 0.2500, 0.9224], device='cuda:0') 

Epoch 3, Average Training Loss: 0.5601, Average Validation Loss: 0.5879
tensor([0.8000, 0.1000, 0.4932, 0.3913, 0.9198], device='cuda:0') 

Epoch 4, Average Training Loss: 0.5065, Average Validation Loss: 0.6523
tensor([0.7551, 0.2553, 0.4493, 0.3663, 0.9214], device='cuda:0') 

Epoch 5, Average Training Loss: 0.4510, Average Validation Loss: 0.6680
tensor([0.7915, 0.1928, 0.4507, 0.3648, 0.9214], device='cuda:0') 

15.372741870085399 mins


### Fine-tune RoBERTa model (50k training; 5 classes)

In [None]:
# Optional: if the runtime was restarted, reload the tokenized splits from the
# parquet files written by the tokenization step (uncomment to use).
# tokenized_train = pd.read_parquet("data/train_50k_5.parquet")
# tokenized_val = pd.read_parquet("data/val_50k_5.parquet")
# tokenized_test = pd.read_parquet("data/test_50k_5.parquet")

# Reuse the in-memory splits produced by the tokenization cell above.
tokenized_train = data_dict_50k_5['train']
tokenized_val = data_dict_50k_5['val']
# tokenized_test = data_dict_50k_5['test']

# Batch the train/validation splits; the test split is not needed for fine-tuning here.
train_dataloader = create_dataloader(tokenized_train)
val_dataloader = create_dataloader(tokenized_val)
# test_dataloader = create_dataloader(tokenized_test)

In [None]:
# Re-create the metric and reload the pre-trained checkpoint so this experiment
# does not continue from the weights fine-tuned in the previous section.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# average=None reports one F1 score per class instead of a single averaged value.
mcf1s = MulticlassF1Score(num_classes=5, average=None).to(device)
# Pre-trained encoder with a 5-label classification head (one label per star rating).
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=5)
# Move the model to the selected device; as the last expression this also displays the architecture.
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# set up the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 5

# train the model (pass the `epochs` setting instead of repeating the literal)
best_model = train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    device=device,
    mcf1s=mcf1s,
    epochs=epochs,
)

# Save the model returned by train_model together with its tokenizer
save_dir = "models/sentiment_model_50k_5"
best_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Epoch 1, Average Training Loss: 0.7466, Average Validation Loss: 0.5845
tensor([0.8098, 0.2295, 0.4324, 0.2924, 0.9181], device='cuda:0') 

Epoch 2, Average Training Loss: 0.5990, Average Validation Loss: 0.5659
tensor([0.8000, 0.2740, 0.5814, 0.3247, 0.9218], device='cuda:0') 

Epoch 3, Average Training Loss: 0.5691, Average Validation Loss: 0.5490
tensor([0.8061, 0.2456, 0.5581, 0.3522, 0.9220], device='cuda:0') 

Epoch 4, Average Training Loss: 0.5432, Average Validation Loss: 0.5495
tensor([0.8037, 0.2571, 0.5647, 0.3875, 0.9257], device='cuda:0') 

Epoch 5, Average Training Loss: 0.5129, Average Validation Loss: 0.5536
tensor([0.7810, 0.2619, 0.5562, 0.3558, 0.9251], device='cuda:0') 

71.17498952150345 mins


### Fine-tune RoBERTa model (100k training; 5 classes)

In [None]:
# Optional: if the runtime was restarted, reload the tokenized splits from the
# parquet files written by the tokenization step (uncomment to use).
# tokenized_train = pd.read_parquet("data/train_100k_5.parquet")
# tokenized_val = pd.read_parquet("data/val_100k_5.parquet")
# tokenized_test = pd.read_parquet("data/test_100k_5.parquet")

# Reuse the in-memory splits produced by the tokenization cell above.
tokenized_train = data_dict_100k_5['train']
tokenized_val = data_dict_100k_5['val']
# tokenized_test = data_dict_100k_5['test']

# Batch the train/validation splits; the test split is not needed for fine-tuning here.
train_dataloader = create_dataloader(tokenized_train)
val_dataloader = create_dataloader(tokenized_val)
# test_dataloader = create_dataloader(tokenized_test)

In [None]:
# Re-create the metric and reload the pre-trained checkpoint so this experiment
# does not continue from the weights fine-tuned in the previous section.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# average=None reports one F1 score per class instead of a single averaged value.
mcf1s = MulticlassF1Score(num_classes=5, average=None).to(device)
# Pre-trained encoder with a 5-label classification head (one label per star rating).
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=5)
# Move the model to the selected device; as the last expression this also displays the architecture.
model.to(device)

In [None]:
# set up the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 5

# train the model (pass the `epochs` setting instead of repeating the literal)
best_model = train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    device=device,
    mcf1s=mcf1s,
    epochs=epochs,
)

# Save the model returned by train_model together with its tokenizer
save_dir = "models/sentiment_model_100k_5"
best_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Epoch 1, Average Training Loss: 0.6745, Average Validation Loss: 0.5541
tensor([0.7942, 0.0769, 0.5434, 0.3593, 0.9200], device='cuda:0') 

Epoch 2, Average Training Loss: 0.5850, Average Validation Loss: 0.5486
tensor([0.8224, 0.2222, 0.5269, 0.3409, 0.9236], device='cuda:0') 

Epoch 3, Average Training Loss: 0.5565, Average Validation Loss: 0.5301
tensor([0.8012, 0.0930, 0.5680, 0.3580, 0.9237], device='cuda:0') 

Epoch 4, Average Training Loss: 0.5318, Average Validation Loss: 0.5424
tensor([0.8373, 0.3051, 0.5444, 0.3593, 0.9254], device='cuda:0') 

Epoch 5, Average Training Loss: 0.5054, Average Validation Loss: 0.5442
tensor([0.7846, 0.2821, 0.5967, 0.4343, 0.9156], device='cuda:0') 

141.42560269435248 mins


### Fine-tune RoBERTa model (10k training; 2 classes)

In [None]:
# Optional: if the runtime was restarted, reload the tokenized splits from the
# parquet files written by the tokenization step (uncomment to use).
# tokenized_train = pd.read_parquet("data/train_10k_2.parquet")
# tokenized_val = pd.read_parquet("data/val_10k_2.parquet")
# tokenized_test = pd.read_parquet("data/test_10k_2.parquet")

# Reuse the in-memory splits produced by the tokenization cell above.
tokenized_train = data_dict_10k_2['train']
tokenized_val = data_dict_10k_2['val']
# tokenized_test = data_dict_10k_2['test']

# Batch the train/validation splits; the test split is not needed for fine-tuning here.
train_dataloader = create_dataloader(tokenized_train)
val_dataloader = create_dataloader(tokenized_val)
# test_dataloader = create_dataloader(tokenized_test)

In [None]:
# Re-create the metric and reload the pre-trained checkpoint so this experiment
# does not continue from the weights fine-tuned in the previous section.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# Binary task: two classes; average=None reports one F1 score per class.
mcf1s = MulticlassF1Score(num_classes=2, average=None).to(device)

# Pre-trained encoder with a 2-label classification head.
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=2)
# Move the model to the selected device; as the last expression this also displays the architecture.
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# set up the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 5

# train the model (pass the `epochs` setting instead of repeating the literal)
best_model = train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    device=device,
    mcf1s=mcf1s,
    epochs=epochs,
)

# Save the model returned by train_model together with its tokenizer
save_dir = "models/sentiment_model_10k_2"
best_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Epoch 1, Average Training Loss: 0.2300, Average Validation Loss: 0.0877
tensor([0.8929, 0.9784], device='cuda:0') 

Epoch 2, Average Training Loss: 0.0725, Average Validation Loss: 0.0944
tensor([0.9046, 0.9815], device='cuda:0') 

Epoch 3, Average Training Loss: 0.0519, Average Validation Loss: 0.0799
tensor([0.9204, 0.9837], device='cuda:0') 

Epoch 4, Average Training Loss: 0.0407, Average Validation Loss: 0.0867
tensor([0.9288, 0.9848], device='cuda:0') 

Epoch 5, Average Training Loss: 0.0340, Average Validation Loss: 0.0839
tensor([0.9341, 0.9861], device='cuda:0') 

15.370466430981955 mins


### Fine-tune RoBERTa model (50k training; 2 classes)

In [None]:
# Optional: if the runtime was restarted, reload the tokenized splits from the
# parquet files written by the tokenization step (uncomment to use).
# tokenized_train = pd.read_parquet("data/train_50k_2.parquet")
# tokenized_val = pd.read_parquet("data/val_50k_2.parquet")
# tokenized_test = pd.read_parquet("data/test_50k_2.parquet")

# Reuse the in-memory splits produced by the tokenization cell above.
tokenized_train = data_dict_50k_2['train']
tokenized_val = data_dict_50k_2['val']
# tokenized_test = data_dict_50k_2['test']

# Batch the train/validation splits; the test split is not needed for fine-tuning here.
train_dataloader = create_dataloader(tokenized_train)
val_dataloader = create_dataloader(tokenized_val)
# test_dataloader = create_dataloader(tokenized_test)

In [None]:
# Re-create the metric and reload the pre-trained checkpoint so this experiment
# does not continue from the weights fine-tuned in the previous section.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# Binary task: two classes; average=None reports one F1 score per class.
mcf1s = MulticlassF1Score(num_classes=2, average=None).to(device)

# Pre-trained encoder with a 2-label classification head.
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=2)
# Move the model to the selected device; as the last expression this also displays the architecture.
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# set up the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 5

# train the model (pass the `epochs` setting instead of repeating the literal)
best_model = train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    device=device,
    mcf1s=mcf1s,
    epochs=epochs,
)

# Save the model returned by train_model together with its tokenizer
save_dir = "models/sentiment_model_50k_2"
best_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Epoch 1, Average Training Loss: 0.1001, Average Validation Loss: 0.0737
tensor([0.9244, 0.9843], device='cuda:0') 

Epoch 2, Average Training Loss: 0.0512, Average Validation Loss: 0.0786
tensor([0.9446, 0.9885], device='cuda:0') 

Epoch 3, Average Training Loss: 0.0400, Average Validation Loss: 0.0835
tensor([0.9353, 0.9867], device='cuda:0') 

Epoch 4, Average Training Loss: 0.0318, Average Validation Loss: 0.0798
tensor([0.9345, 0.9868], device='cuda:0') 

Epoch 5, Average Training Loss: 0.0235, Average Validation Loss: 0.0688
tensor([0.9560, 0.9910], device='cuda:0') 

74.97515540917715 mins


Clean cache and checkpoint

In [None]:
import gc
# Move the model's parameters off the GPU before dropping the reference.
best_model.to('cpu')
# Remove this name only; NOTE(review): the global `model` may still refer to
# the same object, in which case the object itself stays alive — confirm.
del best_model #, checkpoint
# Run a garbage-collection pass, then ask PyTorch to release its cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

Delete runtime

In [None]:
# Colab only: release the runtime assigned to this notebook once everything is saved.
# Skip this cell when not running on Google Colab (the import is unavailable elsewhere).
from google.colab import runtime
runtime.unassign()