In [38]:
!pip install evaluate



In [40]:
import os

import pandas as pd
from datasets import Dataset

# Load the training JSONL from the GitHub repo.
#
# SECURITY: do not paste an access token into the URL. Notebooks get shared and
# committed, and a token embedded in the source leaks with them. If the repo is
# private, export a GitHub personal access token as the GITHUB_TOKEN environment
# variable; it is then sent as an Authorization header. For a public repo no
# token is needed and the plain URL is used.
url = "https://raw.githubusercontent.com/affan002/DimABSA-SemEval-task03/refs/heads/main/train/eng_laptop_train_alltasks.jsonl"
github_token = os.environ.get("GITHUB_TOKEN")
storage_options = {"Authorization": f"token {github_token}"} if github_token else None
df = pd.read_json(url, lines=True, storage_options=storage_options)

# Flatten the data: one output row per (sentence, quadruplet) pair.
rows = []
for _, row in df.iterrows():
    text = row["Text"]
    for quad in row["Quadruplet"]:
        # "VA" is a "<valence>#<arousal>" string; split it into two floats.
        valence, arousal = map(float, quad["VA"].split("#"))

        rows.append({
            "Text": text,
            "Aspect": quad["Aspect"],
            "Opinion": quad["Opinion"],
            "Valence": valence,
            "Arousal": arousal
        })

raw_datasets = Dataset.from_pandas(pd.DataFrame(rows))

In [45]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification

# --- Pretrained tokenizer, extended with a [NULL] special token ---
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# [NULL] is registered as an additional special token, which adds a new entry to
# the tokenizer's vocabulary.
special_tokens_dict = dict(additional_special_tokens=["[NULL]"])
tokenizer.add_special_tokens(special_tokens_dict)

# --- Define label mapping (BIO tags for aspect and opinion spans) ---
label_list = ["O", "B-ASP", "I-ASP", "B-OPI", "I-OPI"]
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

print("Label mapping:", label2id, id2label)


# --- Helper: tag every full occurrence of a span ---
def _tag_span(lowered_words, labels, span, tag):
    """Write B-<tag>/I-<tag> over every full occurrence of `span` in the sentence.

    Args:
        lowered_words: the sentence's whitespace-split words, already lower-cased.
        labels: list of string tags, same length as `lowered_words`; modified in place.
        span: the aspect/opinion phrase to locate (whitespace-separated words).
        tag: tag suffix, e.g. "ASP" or "OPI".

    The whole phrase must match (case-insensitively), not just its first word, so
    a common first word such as "the" does not tag unrelated positions. An empty
    or whitespace-only span is ignored.
    """
    span_tokens = [token.lower() for token in span.split()]
    width = len(span_tokens)
    if width == 0:
        return

    for start in range(len(lowered_words) - width + 1):
        if lowered_words[start:start + width] == span_tokens:
            labels[start] = f"B-{tag}"
            for offset in range(1, width):
                labels[start + offset] = f"I-{tag}"


# --- Function to create BIO labels ---
def create_bio_labels(text, quadruplets):
    """Build word-level BIO tag ids for one sentence.

    Args:
        text: the sentence; it is split on whitespace to obtain words.
        quadruplets: iterable of dicts with "Aspect" and "Opinion" string entries.
            The literal string "NULL" means the span is absent and is skipped.

    Returns:
        (words, label_ids): `words` is the whitespace-split sentence with a
        leading "[NULL]" token; `label_ids` holds the matching `label2id` ids,
        with the "[NULL]" position always tagged "O".

    Spans are written in quadruplet order (aspect first, then opinion), so a later
    span overwrites an earlier one where they overlap.
    """
    words = text.split()
    lowered_words = [word.lower() for word in words]
    labels = ["O"] * len(words)

    for quad in quadruplets:
        aspect = quad["Aspect"]
        opinion = quad["Opinion"]

        if aspect != "NULL":
            _tag_span(lowered_words, labels, aspect, "ASP")
        if opinion != "NULL":
            _tag_span(lowered_words, labels, opinion, "OPI")

    # Prepend [NULL] token (always O)
    words = ["[NULL]"] + words
    labels = ["O"] + labels

    return words, [label2id[l] for l in labels]

# --- Build the word-level BIO dataset: one record per sentence ---
# Each record pairs the word list (with the leading [NULL]) with its tag ids.
rows = [
    dict(zip(("tokens", "labels"), create_bio_labels(row["Text"], row["Quadruplet"])))
    for _, row in df.iterrows()
]

bio_dataset = Dataset.from_pandas(pd.DataFrame(rows))

print(bio_dataset[0])

# --- Tokenize & align labels ---
def tokenize_and_align_labels(batch):
    """Tokenize pre-split words and project word-level labels onto sub-tokens.

    Args:
        batch: mapping with "tokens" (list of word lists) and "labels"
            (list of per-word label-id lists), one entry per example.

    Returns:
        The tokenizer output with an added "labels" entry. For each example,
        positions with no source word and non-initial sub-tokens of a word
        carry -100; the first sub-token of each word carries that word's label.
    """
    encoded = tokenizer(batch["tokens"], is_split_into_words=True, truncation=True, padding=False)

    def _align(word_ids, word_labels):
        # Walk the sub-tokens, emitting a real label only when a new word starts.
        aligned = []
        previous = None
        for current in word_ids:
            if current is None or current == previous:
                aligned.append(-100)
            else:
                aligned.append(word_labels[current])
            previous = current
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=index), word_labels)
        for index, word_labels in enumerate(batch["labels"])
    ]
    return encoded

# Apply tokenization + label alignment to the whole dataset, in batches.
tokenized_datasets = bio_dataset.map(function=tokenize_and_align_labels, batched=True)

# --- Data collator ---
# Passed to the DataLoaders as collate_fn.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

print(tokenized_datasets.column_names)
print(tokenized_datasets[0])

Label mapping: {'O': 0, 'B-ASP': 1, 'I-ASP': 2, 'B-OPI': 3, 'I-OPI': 4} {0: 'O', 1: 'B-ASP', 2: 'I-ASP', 3: 'B-OPI', 4: 'I-OPI'}
{'tokens': ['[NULL]', 'this', 'unit', 'is', '`', '`', 'pretty', '`', '`', 'and', 'stylish', ',', 'so', 'my', 'high', 'school', 'daughter', 'was', 'attracted', 'to', 'it', 'for', 'that', 'reason', '.'], 'labels': [0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


Map:   0%|          | 0/4076 [00:00<?, ? examples/s]

DataCollatorForTokenClassification(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[NULL]']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30522: AddedToken("[NULL]", rstrip=False, l

In [46]:
# Drop the raw word lists, keeping only the model inputs and labels.
# The membership check keeps this cell re-runnable: removing a column that is
# already gone would raise.
if "tokens" in tokenized_datasets.column_names:
    tokenized_datasets = tokenized_datasets.remove_columns(["tokens"])

# Convert dataset to PyTorch tensors
tokenized_datasets.set_format("torch")

# Check final columns
print(tokenized_datasets.column_names)

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [47]:
from torch.utils.data import DataLoader

# Tunables for this cell, named once instead of repeated as magic numbers.
SEED = 42        # seed for both train_test_split calls
BATCH_SIZE = 8   # examples per batch for every DataLoader

# Split into train/validation/test
# First: train + temp (where temp will be split further into val + test)
dataset_splits = tokenized_datasets.train_test_split(test_size=0.2, seed=SEED)

train_dataset = dataset_splits["train"]
temp_dataset = dataset_splits["test"]

# Now split temp into validation and test (50/50 → 10% val, 10% test overall)
temp_splits = temp_dataset.train_test_split(test_size=0.5, seed=SEED)

eval_dataset = temp_splits["train"]   # validation set
test_dataset = temp_splits["test"]    # final test set

# Make DataLoaders. Only the training loader shuffles; validation and test keep
# their order.
train_dataloader = DataLoader(
    train_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator
)


In [48]:
# Pull a single collated batch and show the shape of each tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 45]),
 'token_type_ids': torch.Size([8, 45]),
 'attention_mask': torch.Size([8, 45]),
 'labels': torch.Size([8, 45])}

In [50]:
from transformers import AutoConfig, AutoModelForTokenClassification

checkpoint = "bert-base-uncased"

# Configure the classification head: one output per BIO tag, with the tag names
# stored in the config.
config = AutoConfig.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label_list),   # 5 tags: O, B-ASP, I-ASP, B-OPI, I-OPI
)

model = AutoModelForTokenClassification.from_pretrained(checkpoint, config=config)

# The tokenizer gained a [NULL] entry, so the embedding matrix is resized to the
# tokenizer's current length.
model.resize_token_embeddings(len(tokenizer))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(30523, 768, padding_idx=0)

In [51]:
# Sanity check: run the single batch grabbed during DataLoader inspection
# through the freshly initialised model (no training has happened yet).
outputs = model(**batch)

# The batch includes "labels", so the model output carries a loss
# (cross-entropy for token classification).
print("Loss:", outputs.loss)

# Logits shape: (batch_size, seq_len, num_labels)
print("Logits shape:", outputs.logits.shape)


Loss: tensor(1.5172, grad_fn=<NllLossBackward0>)
Logits shape: torch.Size([8, 45, 5])


In [52]:
from torch.optim import AdamW

# Define optimizer that updates the model's parameters
optimizer = AdamW(model.parameters(), lr=5e-5)
# Things you might tune later:
# Learning rate (lr) → try 5e-5, 3e-5, or 1e-5 to see which gives better results.
# Weight decay → NOTE: torch.optim.AdamW already defaults to weight_decay=0.01
# (see the PyTorch docs), so decay is on even though it is not passed here;
# pass weight_decay=... explicitly to strengthen it, or 0.0 to turn it off.

In [53]:
from transformers import get_scheduler

# The dataset is small, so make several passes over it.
num_epochs = 5   # 5, 8, or 10 are reasonable values to try

# Total step count is derived from the number of batches per epoch.
num_training_steps = len(train_dataloader) * num_epochs

# The first 10% of those steps are used for warmup.
num_warmup_steps = int(0.1 * num_training_steps)

# Linear schedule with warmup, sized from the step counts above.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=num_warmup_steps,
)

print(f"Total steps: {num_training_steps}, Warmup steps: {num_warmup_steps}")


Total steps: 2040, Warmup steps: 204


In [None]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to that device, then display which device was picked.
model.to(device)
device


In [54]:
! pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=29a439e30dfa96b5ed4575bb7c98889b0fa4cac7409464db9e581ccdd0f570d1
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
from tqdm.auto import tqdm
from seqeval.metrics import f1_score, precision_score, recall_score
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# NOTE: `num_epochs` is deliberately NOT redefined here. The LR scheduler was
# built with num_training_steps = num_epochs * len(train_dataloader); changing
# the epoch count in this cell alone would desynchronise the warmup/decay
# schedule from the loop. Change it in the scheduler cell and re-run from there.
best_f1 = 0.0   # best validation F1 seen so far; drives checkpointing

for epoch in range(num_epochs):
    # ------------------- TRAINING -------------------
    model.train()
    total_train_loss = 0
    train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training")

    for batch in train_progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()   # the schedule advances once per optimizer step
        optimizer.zero_grad()

        step_loss = loss.item()
        total_train_loss += step_loss
        train_progress_bar.set_postfix({"loss": f"{step_loss:.4f}"})

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"\nEpoch {epoch+1}: Avg Train Loss = {avg_train_loss:.4f}")

    # ------------------- VALIDATION -------------------
    model.eval()
    all_preds = []    # one list of predicted tag strings per sentence
    all_labels = []   # one list of gold tag strings per sentence

    val_progress_bar = tqdm(eval_dataloader, desc=f"Epoch {epoch+1} Validation", leave=False)
    with torch.no_grad():
        for batch in val_progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            # Convert each tensor to Python lists once per batch, instead of
            # calling .item() on every token (one device sync per token).
            predictions = torch.argmax(outputs.logits, dim=-1).tolist()
            gold_ids = batch["labels"].tolist()
            attention = batch["attention_mask"].tolist()

            for pred_seq, label_seq, mask_seq in zip(predictions, gold_ids, attention):
                # Keep only positions that carry a real label (-100 marks
                # ignored positions) and are not padding.
                kept = [
                    (p, l)
                    for p, l, m in zip(pred_seq, label_seq, mask_seq)
                    if l != -100 and m == 1
                ]
                all_preds.append([id2label[p] for p, _ in kept])
                all_labels.append([id2label[l] for _, l in kept])

    # Compute metrics with the seqeval scorers
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    print(f"Validation — F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    # ------------------- CHECKPOINT -------------------
    # Save model and tokenizer together whenever validation F1 improves.
    if f1 > best_f1:
        best_f1 = f1
        save_path = "./best_model"
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        print(f"New best model saved at F1 = {f1:.4f}\n")
