<a href="https://colab.research.google.com/github/affan002/DimABSA-SemEval-task03/blob/main/ST2_semeval_duplet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip install evaluate



In [20]:
import pandas as pd
from datasets import Dataset

# Load JSONL from your repo
url = "https://raw.githubusercontent.com/affan002/DimABSA-SemEval-task03/refs/heads/main/train/eng_laptop_train_alltasks.jsonl?token=GHSAT0AAAAAADFHNCHZE65KEAHTGJMOQSIC2GT55EQ"
df = pd.read_json(url, lines=True)

rows = []
for _, row in df.iterrows():
    text = row["Text"]
    for quad in row["Quadruplet"]:
        aspect = quad["Aspect"]
        opinion = quad["Opinion"]
        va = quad["VA"]
        valence, arousal = map(float, va.split("#"))

        rows.append({
            "Text": text,
            "Aspect": aspect,
            "Opinion": opinion,
            "Valence": valence,
            "Arousal": arousal
        })

raw_datasets = Dataset.from_pandas(pd.DataFrame(rows))

In [21]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification

# --- Load pretrained tokenizer and add [NULL] token ---
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

special_tokens_dict = {'additional_special_tokens': ['[NULL]']}
tokenizer.add_special_tokens(special_tokens_dict)

# --- Define label mapping (added NULL label) ---
label_list = ["O", "B-ASP", "I-ASP", "B-OPI", "I-OPI"]
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

print("Label mapping:", label2id, id2label)

# --- Function to create BIO labels ---
def create_bio_labels(text, quadruplets):
    words = text.split()
    labels = ["O"] * len(words)

    for quad in quadruplets:
        aspect = quad["Aspect"]
        opinion = quad["Opinion"]

        # Aspect span
        if aspect != "NULL":
            aspect_tokens = aspect.split()
            for i, token in enumerate(words):
                if token.lower() == aspect_tokens[0].lower():  # simple first token match
                    labels[i] = "B-ASP"
                    if len(aspect_tokens) > 1:
                        for j in range(1, len(aspect_tokens)):
                            if i + j < len(words):
                                labels[i + j] = "I-ASP"

        # Opinion span
        if opinion != "NULL":
            opinion_tokens = opinion.split()
            for i, token in enumerate(words):
                if token.lower() == opinion_tokens[0].lower():
                    labels[i] = "B-OPI"
                    if len(opinion_tokens) > 1:
                        for j in range(1, len(opinion_tokens)):
                            if i + j < len(words):
                                labels[i + j] = "I-OPI"

    # Prepend [NULL] token (always O)
    words = ["[NULL]"] + words
    labels = ["O"] + labels

    return words, [label2id[l] for l in labels]

# --- Convert dataset to word-level BIO format ---
rows = []
for _, row in df.iterrows():
    words, labels = create_bio_labels(row["Text"], row["Quadruplet"])
    rows.append({"tokens": words, "labels": labels})

bio_dataset = Dataset.from_pandas(pd.DataFrame(rows))

print(bio_dataset[0])

# --- Tokenize & align labels ---
def tokenize_and_align_labels(batch):
    tokenized = tokenizer(batch["tokens"], is_split_into_words=True, truncation=True, padding=False)
    aligned_labels = []

    for i, labels in enumerate(batch["labels"]):
        word_ids = tokenized.word_ids(batch_index=i)
        new_labels = []
        prev_word = None
        for word_id in word_ids:
            if word_id is None:
                new_labels.append(-100)  # ignore special tokens
            elif word_id != prev_word:
                new_labels.append(labels[word_id])
            else:
                new_labels.append(-100)  # only label first subword
            prev_word = word_id
        aligned_labels.append(new_labels)

    tokenized["labels"] = aligned_labels
    return tokenized

tokenized_datasets = bio_dataset.map(tokenize_and_align_labels, batched=True)

# --- Data collator ---
data_collator = DataCollatorForTokenClassification(tokenizer) # this collator will be used in dataloader

print(tokenized_datasets.column_names)
print(tokenized_datasets[0])

Label mapping: {'O': 0, 'B-ASP': 1, 'I-ASP': 2, 'B-OPI': 3, 'I-OPI': 4} {0: 'O', 1: 'B-ASP', 2: 'I-ASP', 3: 'B-OPI', 4: 'I-OPI'}
{'tokens': ['[NULL]', 'this', 'unit', 'is', '`', '`', 'pretty', '`', '`', 'and', 'stylish', ',', 'so', 'my', 'high', 'school', 'daughter', 'was', 'attracted', 'to', 'it', 'for', 'that', 'reason', '.'], 'labels': [0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


Map:   0%|          | 0/4076 [00:00<?, ? examples/s]

['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']
{'tokens': ['[NULL]', 'this', 'unit', 'is', '`', '`', 'pretty', '`', '`', 'and', 'stylish', ',', 'so', 'my', 'high', 'school', 'daughter', 'was', 'attracted', 'to', 'it', 'for', 'that', 'reason', '.'], 'labels': [-100, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 3, -100, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100], 'input_ids': [101, 30522, 2023, 3131, 2003, 1036, 1036, 3492, 1036, 1036, 1998, 2358, 8516, 4509, 1010, 2061, 2026, 2152, 2082, 2684, 2001, 6296, 2000, 2009, 2005, 2008, 3114, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [22]:
tokenized_datasets = tokenized_datasets.remove_columns(["tokens"])

# Convert dataset to PyTorch tensors
tokenized_datasets.set_format("torch")

# Check final columns
print(tokenized_datasets.column_names)

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [23]:
# Split into train/validation/test
# First: train + temp (where temp will be split further into val + test)
dataset_splits = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

train_dataset = dataset_splits["train"]
temp_dataset = dataset_splits["test"]

# Now split temp into validation and test (50/50 → 10% val, 10% test overall)
temp_splits = temp_dataset.train_test_split(test_size=0.5, seed=42)

eval_dataset = temp_splits["train"]   # validation set
test_dataset = temp_splits["test"]    # final test set

# Make DataLoaders
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=8, collate_fn=data_collator
)
test_dataloader = DataLoader(
    test_dataset, batch_size=8, collate_fn=data_collator
)


In [24]:
for batch in train_dataloader: #inspecting the batch
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 51]),
 'token_type_ids': torch.Size([8, 51]),
 'attention_mask': torch.Size([8, 51]),
 'labels': torch.Size([8, 51])}

In [25]:
from transformers import AutoConfig, AutoModelForTokenClassification

checkpoint = "bert-base-uncased"

config = AutoConfig.from_pretrained(
    checkpoint,
    num_labels=len(label_list),   # e.g., 5: O, B-ASP, I-ASP, B-OPI, I-OPI
    id2label=id2label,
    label2id=label2id
)

model = AutoModelForTokenClassification.from_pretrained(checkpoint, config=config)

# Resize embeddings if you added special tokens like [NULL]
model.resize_token_embeddings(len(tokenizer))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(30523, 768, padding_idx=0)

In [26]:
# Pass the batch through the model
outputs = model(**batch)

# Cross-entropy loss for token classification
print("Loss:", outputs.loss)

# Logits shape: (batch_size, seq_len, num_labels)
print("Logits shape:", outputs.logits.shape)


Loss: tensor(1.7091, grad_fn=<NllLossBackward0>)
Logits shape: torch.Size([8, 51, 5])


In [27]:
from torch.optim import AdamW

# Define optimizer that updates the model's parameters
optimizer = AdamW(model.parameters(), lr=5e-5)
# ⚠️ The only thing you might tune later:
# Learning rate (lr) → try 5e-5, 3e-5, or 1e-5 to see which gives better results.
# Weight decay → if overfitting, you can add e.g. weight_decay=0.01.

In [28]:
from transformers import get_scheduler

# Train for more epochs since dataset is small
num_epochs = 5   # you can try 5, 8, or even 10

# Total number of training steps
num_training_steps = num_epochs * len(train_dataloader)

# Warmup = 10% of training steps
num_warmup_steps = int(0.1 * num_training_steps)

# Define learning rate scheduler
lr_scheduler = get_scheduler(
    "linear",                # linear decay schedule
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,   # gradual warmup
    num_training_steps=num_training_steps,
)

print(f"Total steps: {num_training_steps}, Warmup steps: {num_warmup_steps}")


Total steps: 2040, Warmup steps: 204


In [29]:
import torch

# Step 1: Choose device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Step 2: Move the model to the chosen device
model.to(device)
device


device(type='cuda')

In [30]:
! pip install seqeval



In [31]:
from tqdm.auto import tqdm
from seqeval.metrics import f1_score, precision_score, recall_score
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 8  # you can adjust
best_f1 = 0.0   # track best F1 for checkpointing

for epoch in range(num_epochs):
    # ------------------- TRAINING -------------------
    model.train()
    total_train_loss = 0
    train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training")

    for batch in train_progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()
        train_progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"\nEpoch {epoch+1}: Avg Train Loss = {avg_train_loss:.4f}")

    # ------------------- VALIDATION -------------------
    model.eval()
    all_preds = []
    all_labels = []

    val_progress_bar = tqdm(eval_dataloader, desc=f"Epoch {epoch+1} Validation", leave=False)
    with torch.no_grad():
        for batch in val_progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            for pred_seq, label_seq, mask_seq in zip(predictions, batch["labels"], batch["attention_mask"]):
                pred_labels = []
                true_labels = []
                for p, l, m in zip(pred_seq, label_seq, mask_seq):
                    if l.item() != -100 and m.item() == 1:
                        pred_labels.append(id2label[p.item()])
                        true_labels.append(id2label[l.item()])
                all_preds.append(pred_labels)
                all_labels.append(true_labels)

    # Compute metrics
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    print(f"Validation — F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    # ------------------- CHECKPOINT -------------------
    if f1 > best_f1:
        best_f1 = f1
        save_path = "./best_model"
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        print(f"New best model saved at F1 = {f1:.4f}\n")


Epoch 1 Training:   0%|          | 0/408 [00:00<?, ?it/s]


Epoch 1: Avg Train Loss = 0.3577


Epoch 1 Validation:   0%|          | 0/51 [00:00<?, ?it/s]

Validation — F1: 0.7767, Precision: 0.7271, Recall: 0.8335
New best model saved at F1 = 0.7767



Epoch 2 Training:   0%|          | 0/408 [00:00<?, ?it/s]


Epoch 2: Avg Train Loss = 0.1185


Epoch 2 Validation:   0%|          | 0/51 [00:00<?, ?it/s]

Validation — F1: 0.8116, Precision: 0.7726, Recall: 0.8548
New best model saved at F1 = 0.8116



Epoch 3 Training:   0%|          | 0/408 [00:00<?, ?it/s]


Epoch 3: Avg Train Loss = 0.0644


Epoch 3 Validation:   0%|          | 0/51 [00:00<?, ?it/s]

Validation — F1: 0.8160, Precision: 0.7608, Recall: 0.8798
New best model saved at F1 = 0.8160



Epoch 4 Training:   0%|          | 0/408 [00:00<?, ?it/s]


Epoch 4: Avg Train Loss = 0.0339


Epoch 4 Validation:   0%|          | 0/51 [00:00<?, ?it/s]

Validation — F1: 0.8310, Precision: 0.7998, Recall: 0.8648
New best model saved at F1 = 0.8310



Epoch 5 Training:   0%|          | 0/408 [00:00<?, ?it/s]


Epoch 5: Avg Train Loss = 0.0183


Epoch 5 Validation:   0%|          | 0/51 [00:00<?, ?it/s]

Validation — F1: 0.8337, Precision: 0.8037, Recall: 0.8661
New best model saved at F1 = 0.8337



Epoch 6 Training:   0%|          | 0/408 [00:00<?, ?it/s]


Epoch 6: Avg Train Loss = 0.0144


Epoch 6 Validation:   0%|          | 0/51 [00:00<?, ?it/s]

Validation — F1: 0.8337, Precision: 0.8037, Recall: 0.8661


Epoch 7 Training:   0%|          | 0/408 [00:00<?, ?it/s]


Epoch 7: Avg Train Loss = 0.0139


Epoch 7 Validation:   0%|          | 0/51 [00:00<?, ?it/s]

Validation — F1: 0.8337, Precision: 0.8037, Recall: 0.8661


Epoch 8 Training:   0%|          | 0/408 [00:00<?, ?it/s]


Epoch 8: Avg Train Loss = 0.0143


Epoch 8 Validation:   0%|          | 0/51 [00:00<?, ?it/s]

Validation — F1: 0.8337, Precision: 0.8037, Recall: 0.8661


## Saving the model on hugging face

In [62]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [63]:
repo_name = "laptop-aspect-opinion-bio"

In [64]:
# Make sure model and tokenizer are the ones you fine-tuned
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...4lldo55/model.safetensors:   0%|          |  567kB /  436MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/affan002/laptop-aspect-opinion-bio/commit/0ac7d263a42285014677da98eab2cd4ff3969baf', commit_message='Upload tokenizer', commit_description='', oid='0ac7d263a42285014677da98eab2cd4ff3969baf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/affan002/laptop-aspect-opinion-bio', endpoint='https://huggingface.co', repo_type='model', repo_id='affan002/laptop-aspect-opinion-bio'), pr_revision=None, pr_num=None)

## Testing the locally saved model

In [46]:
finetuned_path = "./best_model"

finetunedmodel = AutoModelForTokenClassification.from_pretrained(finetuned_path) #loading finetuned model
finetunedtokenizer = AutoTokenizer.from_pretrained(finetuned_path)





In [61]:


def predict_aspect_opinion(sentence, model, tokenizer, id2label, device="cpu"):
    """
    Predict all aspect-opinion spans for a given sentence.
    """
    # --- Preprocess sentence the same as training ---
    words = ["[NULL]"] + sentence.split()  # prepend [NULL]

    # Tokenize with word-level info
    tokens = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True)
    word_ids = tokens.word_ids(batch_index=0)
    tokens = {k: v.to(device) for k, v in tokens.items()}

    # --- Model forward pass ---
    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(**tokens)
        preds = torch.argmax(outputs.logits, dim=-1)[0]  # shape: [seq_len]

    # --- Map subword predictions back to words ---

    final_labels = []
    prev_word = None
    for idx, word_id in enumerate(word_ids):
        if word_id is None:
            continue  # skip special tokens
        elif word_id != prev_word:
            final_labels.append((words[word_id], id2label[preds[idx].item()]))
            prev_word = word_id

    # --- Merge B/I spans into aspect and opinion phrases ---
    aspects, opinions = [], []
    current_aspect, current_opinion = [], []

    for token, label in final_labels[1:]:  # skip [NULL] at index 0
        if label == "B-ASP":
            if current_aspect:
                aspects.append(" ".join(current_aspect))
            current_aspect = [token]
        elif label == "I-ASP":
            current_aspect.append(token)
        elif label == "B-OPI":
            if current_opinion:
                opinions.append(" ".join(current_opinion))
            current_opinion = [token]
        elif label == "I-OPI":
            current_opinion.append(token)
        else:
            if current_aspect:
                aspects.append(" ".join(current_aspect))
                current_aspect = []
            if current_opinion:
                opinions.append(" ".join(current_opinion))
                current_opinion = []

    # Catch last spans
    if current_aspect:
        aspects.append(" ".join(current_aspect))
    if current_opinion:
        opinions.append(" ".join(current_opinion))

    # Return results as tuples (aspect, opinion)
    # Simple pairing: each aspect with closest opinion (can refine later)
    triplets = list(zip(aspects, opinions))
    return triplets

# --- Example usage ---
sentence = "Performance is great for everyday tasks, and the screen brightness is sufficient even outdoors, however the charger is a bit bulky."
triplets = predict_aspect_opinion(sentence, model, tokenizer, id2label, device="cpu")
print(triplets)


[('screen brightness', 'great'), ('charger', 'sufficient')]
