<a href="https://colab.research.google.com/github/affan002/DimABSA-SemEval-task03/blob/main/ST2_Triplet_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A pipelined approach
## Integrating BERT from task 1 with BERT from task 2

### Initializing the two models

In [1]:
device = "cpu"

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

st2_repo = "affan002/laptop-aspect-opinion-bio"  # Hugging Face repo
tokenizer_st2 = AutoTokenizer.from_pretrained(st2_repo)
st2_model = AutoModelForTokenClassification.from_pretrained(st2_repo)
st2_model.to(device)  # move to GPU/CPU


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/302 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30523, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

st1_repo = "hassanshahzad2003/bert-base-uncased-finetuned-task1-semeval"  # Hugging Face repo
tokenizer_st1 = AutoTokenizer.from_pretrained(st1_repo)
st1_model = AutoModelForSequenceClassification.from_pretrained(st1_repo)
st1_model.to(device)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
id2label_st2 = {0: "O", 1: "B-ASP", 2: "I-ASP", 3: "B-OPI", 4: "I-OPI"}


### Building an inference pipeline for triplet extraction

In [5]:
import torch

def predict_triplets(sentence, st2_model, st2_tokenizer, st2_id2label,
                     st1_model, st1_tokenizer, device="cpu"):
    """
    Given a sentence, predict aspect-opinion pairs using ST2 model,
    then predict Valence#Arousal scores for each aspect using ST1 model.

    Returns:
        List of tuples: (aspect, opinion, "Valence#Arousal")
    """

    # --- Step 1: Tokenize for ST2 ---
    words = sentence.split()
    tokens = st2_tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True)
    word_ids = tokens.word_ids(batch_index=0)
    tokens = {k: v.to(device) for k, v in tokens.items()}

    # --- Step 2: Predict aspect/opinion labels ---
    st2_model.eval()
    with torch.no_grad():
        outputs = st2_model(**tokens)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)[0]  # shape: [seq_len]

    # Align predictions to words
    pred_labels = []
    for p, word_id in zip(preds, word_ids):
        if word_id is None:
            continue
        pred_labels.append((words[word_id], st2_id2label[p.item()]))

    # --- Step 3: Extract aspect-opinion pairs from BIO labels ---
    def extract_aspect_opinion(pred_labels):
        aspects, opinions = [], []
        current_aspect, current_opinion = [], []

        for word, label in pred_labels:
            if label == "B-ASP":
                if current_aspect:
                    aspects.append(" ".join(current_aspect))
                current_aspect = [word]
            elif label == "I-ASP" and current_aspect:
                current_aspect.append(word)
            elif label == "B-OPI":
                if current_opinion:
                    opinions.append(" ".join(current_opinion))
                current_opinion = [word]
            elif label == "I-OPI" and current_opinion:
                current_opinion.append(word)
            else:
                if current_aspect:
                    aspects.append(" ".join(current_aspect))
                    current_aspect = []
                if current_opinion:
                    opinions.append(" ".join(current_opinion))
                    current_opinion = []
        if current_aspect:
            aspects.append(" ".join(current_aspect))
        if current_opinion:
            opinions.append(" ".join(current_opinion))

        # Pairing aspects and opinions (simple zip, assumes same order)
        return list(zip(aspects, opinions))

    aspect_opinion_pairs = extract_aspect_opinion(pred_labels)

    # --- Step 4: Predict Valence & Arousal for each aspect ---
    va_triplets = []
    st1_model.eval()
    for aspect, opinion in aspect_opinion_pairs:
        inputs = st1_tokenizer(aspect, sentence, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            outputs = st1_model(**inputs)
            valence, arousal = outputs.logits[0].tolist()
            va_score = f"{valence:.2f}#{arousal:.2f}"
        va_triplets.append((aspect, opinion, va_score))

    return va_triplets


In [6]:
sentence = "this unit is ` ` pretty ` ` and stylish , so my high school daughter was attracted to it for that reason ."
triplets = predict_triplets(sentence, st2_model, tokenizer_st2, id2label_st2,
                            st1_model, tokenizer_st1, device="cpu")
print(triplets)


[('unit', 'pretty', '7.39#7.39')]


### Loading trial dataset and calculating F1

In [6]:
import pandas as pd
from datasets import Dataset

# Load JSONL from your repo
url = "https://raw.githubusercontent.com/affan002/DimABSA-SemEval-task03/refs/heads/main/train/eng_laptop_train_alltasks.jsonl?token=GHSAT0AAAAAADFHNCHZE65KEAHTGJMOQSIC2GT55EQ"
df = pd.read_json(url, lines=True)

rows = []
for _, row in df.iterrows():
    text = row["Text"]
    for quad in row["Quadruplet"]:
        aspect = quad["Aspect"]
        opinion = quad["Opinion"]
        va = quad["VA"]
        valence, arousal = map(float, va.split("#"))

        rows.append({
            "Text": text,
            "Aspect": aspect,
            "Opinion": opinion,
            "Valence": valence,
            "Arousal": arousal
        })

raw_datasets = Dataset.from_pandas(pd.DataFrame(rows))