In [4]:
#Installing libraries
!pip install evaluate



In [5]:
# Load the training data and flatten it to one row per (Text, Aspect) pair.
import os

import pandas as pd
from datasets import Dataset

# SECURITY: do not embed access tokens in the notebook. If the repository
# needs authentication, export GITHUB_RAW_TOKEN in the environment (or the
# Colab secrets) before running this cell; otherwise the plain URL is used.
# Any token that was ever pasted into this URL should be treated as leaked
# and revoked.
DATA_URL = (
    "https://raw.githubusercontent.com/affan002/DimABSA-SemEval-task03/"
    "refs/heads/main/train/eng_laptop_train_alltasks.jsonl"
)
github_token = os.environ.get("GITHUB_RAW_TOKEN")
url = f"{DATA_URL}?token={github_token}" if github_token else DATA_URL

# One JSON object per line.
df = pd.read_json(url, lines=True)

# Each record carries a list of quadruplets; emit one training row per
# quadruplet. "VA" is a "<valence>#<arousal>" string, kept verbatim and also
# parsed into two float columns.
rows = []
for text, quads in zip(df["Text"], df["Quadruplet"]):
    for quad in quads:
        va = quad["VA"]
        valence, arousal = map(float, va.split("#"))
        rows.append({
            "Text": text,
            "Aspect": quad["Aspect"],
            "VA": va,
            "Valence": valence,
            "Arousal": arousal
        })

raw_datasets = Dataset.from_pandas(pd.DataFrame(rows))


In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Name of the pretrained checkpoint; the same name is reused when the model
# itself is loaded further down.
checkpoint = "bert-base-uncased"
# Tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(batch):
    """Tokenize a batch of (Aspect, Text) pairs and attach regression labels.

    Args:
        batch: mapping of column name to a sequence of values; must provide
            "Aspect", "Text" and "VA" (strings of the form "valence#arousal").

    Returns:
        The tokenizer's output with an extra "labels" entry holding one
        (valence, arousal) float tuple per example.
    """
    aspects = list(batch["Aspect"])
    texts = list(batch["Text"])
    # Aspect is the first segment and the sentence the second one.
    encoded = tokenizer(aspects, texts, truncation=True)

    # Split every VA string once, then read the two components column-wise.
    va_parts = [va.split("#") for va in batch["VA"]]
    valence_scores = [float(parts[0]) for parts in va_parts]
    arousal_scores = [float(parts[1]) for parts in va_parts]

    encoded["labels"] = list(zip(valence_scores, arousal_scores))
    return encoded

# Tokenize the whole dataset. With batched=True, tokenize_function is handed
# whole columns at once (it reads batch["Aspect"], batch["Text"], batch["VA"]
# as sequences) rather than a single example.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Data collator for dynamic padding: examples are left unpadded here and are
# padded when they are collated into a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Preview one processed example (raw columns plus the tokenizer outputs and labels).
print(tokenized_datasets[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5773 [00:00<?, ? examples/s]

{'Text': 'this unit is ` ` pretty ` ` and stylish , so my high school daughter was attracted to it for that reason .', 'Aspect': 'unit', 'VA': '7.12#7.12', 'Valence': 7.12, 'Arousal': 7.12, 'input_ids': [101, 3131, 102, 2023, 3131, 2003, 1036, 1036, 3492, 1036, 1036, 1998, 2358, 8516, 4509, 1010, 2061, 2026, 2152, 2082, 2684, 2001, 6296, 2000, 2009, 2005, 2008, 3114, 1012, 102], 'token_type_ids': [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [7.12, 7.12]}


In [7]:
# The raw text/label columns are only inputs to tokenization; the model just
# needs the tokenizer outputs and "labels".
raw_columns = ["Text", "Aspect", "VA", "Valence", "Arousal"]
tokenized_datasets = tokenized_datasets.remove_columns(raw_columns)

# Have the dataset hand back PyTorch tensors when indexed.
tokenized_datasets.set_format("torch")

# Show which columns are left.
print(tokenized_datasets.column_names)

['input_ids', 'token_type_ids', 'attention_mask', 'labels']


In [8]:
# Build an 80/10/10 train / validation / test split and wrap each part in a
# DataLoader.
from torch.utils.data import DataLoader

# Stage 1: hold out 20% of the examples ("temp"), keep 80% for training.
dataset_splits = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset, temp_dataset = dataset_splits["train"], dataset_splits["test"]

# Stage 2: cut the held-out part in half → 10% validation, 10% test overall.
temp_splits = temp_dataset.train_test_split(test_size=0.5, seed=42)
eval_dataset, test_dataset = temp_splits["train"], temp_splits["test"]

# All loaders share batch size and collator; only the training loader shuffles.
loader_kwargs = dict(batch_size=8, collate_fn=data_collator)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(eval_dataset, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, **loader_kwargs)

In [9]:
# Pull a single training batch to sanity-check the tensor shapes.
# next(iter(...)) raises StopIteration right here if the loader is empty,
# instead of leaving `batch` undefined (or silently stale from an earlier run).
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 49]),
 'token_type_ids': torch.Size([8, 49]),
 'attention_mask': torch.Size([8, 49]),
 'labels': torch.Size([8, 2])}

In [10]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Tell HF to treat this as regression with 2 outputs (Valence, Arousal):
# num_labels=2 sizes the output head, problem_type="regression" selects a
# regression loss (the sanity-check cell's output shows MSE).
config = AutoConfig.from_pretrained(checkpoint, num_labels=2, problem_type="regression")
# The pretrained encoder is loaded; the classifier head is newly initialized
# (see the warning printed by this cell), so it must be trained before use.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Sanity check before training: pass the sample batch through the model to
# get outputs (predictions + loss). Because the batch contains "labels", the
# model also returns a loss.
outputs = model(**batch)

# Print:
#  - outputs.loss → the current Mean Squared Error (MSE) loss; expected to be
#    large here because the output head has not been trained yet
#  - outputs.logits.shape → the shape of the predictions tensor (batch_size × 2)
print(outputs.loss, outputs.logits.shape)

tensor(38.8081, grad_fn=<MseLossBackward0>) torch.Size([8, 2])


In [12]:
from torch.optim import AdamW

# Define optimizer that updates the model's parameters
optimizer = AdamW(model.parameters(), lr=5e-5)
# Hyperparameters you might tune later:
# Learning rate (lr) → try 5e-5, 3e-5, or 1e-5 to see which gives better results.
# Weight decay → torch.optim.AdamW already defaults to weight_decay=0.01, so
# decay is active here; pass weight_decay=... explicitly to change it
# (e.g. a larger value if the model overfits).

In [13]:
from transformers import get_scheduler

# The dataset is small, so train for several epochs (5, 8 or even 10 are
# reasonable values to try).
num_epochs = 5

# The scheduler is stepped once per batch, so the step budget is
# batches-per-epoch × epochs; the first 10% of those steps are warmup.
num_training_steps = len(train_dataloader) * num_epochs
num_warmup_steps = int(0.1 * num_training_steps)

# Linear schedule: ramp up during warmup, then decay linearly.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

print(f"Total steps: {num_training_steps}, Warmup steps: {num_warmup_steps}")


Total steps: 2890, Warmup steps: 289


In [15]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Put the model on that device, then display which device was picked.
model.to(device)
device


device(type='cuda')

In [16]:
from tqdm.auto import tqdm

# Create a progress bar for visual feedback (one tick per optimizer step)
progress_bar = tqdm(range(num_training_steps))

model.train()  # Set model to training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}  # Move inputs & labels to GPU/CPU
        outputs = model(**batch)  # Forward pass; labels are in the batch, so a loss is returned
        loss = outputs.loss
        loss.backward()  # Backpropagate: compute gradients of the loss w.r.t. the parameters

        # Order matters: apply the update, advance the schedule, then clear
        # the gradients so they do not accumulate into the next step.
        optimizer.step()       # Update model weights using gradients
        lr_scheduler.step()    # Update learning rate according to schedule
        optimizer.zero_grad()  # Reset gradients for next step
        progress_bar.update(1) # Advance progress bar




  0%|          | 0/2890 [00:00<?, ?it/s]

In [18]:
import torch
import numpy as np

# Score the fine-tuned model on the validation split, reporting MSE and MAE
# separately for valence (output column 0) and arousal (output column 1).
model.eval()  # switch the model to evaluation mode


def _mse_mae(preds, refs):
    """Return (mean squared error, mean absolute error) of two 1-D arrays."""
    return ((preds - refs) ** 2).mean(), np.abs(preds - refs).mean()


pred_batches, ref_batches = [], []
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():  # no gradients needed for evaluation
        outputs = model(**batch)

    predictions = outputs.logits  # [batch_size, 2]
    references = batch["labels"]   # [batch_size, 2]
    pred_batches.append(predictions.cpu().numpy())
    ref_batches.append(references.cpu().numpy())

# Stack all batches into [num_examples, 2] arrays, then pull out each column.
all_preds = np.concatenate(pred_batches)
all_refs = np.concatenate(ref_batches)

valence_preds_all = np.ascontiguousarray(all_preds[:, 0])
arousal_preds_all = np.ascontiguousarray(all_preds[:, 1])
valence_refs_all = np.ascontiguousarray(all_refs[:, 0])
arousal_refs_all = np.ascontiguousarray(all_refs[:, 1])

# Per-dimension error metrics.
valence_mse, valence_mae = _mse_mae(valence_preds_all, valence_refs_all)
arousal_mse, arousal_mae = _mse_mae(arousal_preds_all, arousal_refs_all)

print(f"Valence → MSE: {valence_mse:.4f}, MAE: {valence_mae:.4f}")
print(f"Arousal → MSE: {arousal_mse:.4f}, MAE: {arousal_mae:.4f}")


Valence → MSE: 0.6087, MAE: 0.5287
Arousal → MSE: 0.4739, MAE: 0.5129
