In [9]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaV2Tokenizer, AutoModel, RobertaTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report
import numpy as np
import shap
from captum.attr import IntegratedGradients
from transformers import AutoTokenizer
import torch.nn.functional as F
import hf_xet

# Single-Task Learning

Below is the current pipeline for single-task learning. The code chunk bellow allows to switch between the two tasks: "narrative_classification" and "entity_framing".

We can also specify the training and test domains.

In [11]:
# ==========================
# CONTROL PANEL
# ==========================

# choose a task for the pipeline below: "narrative_classification" or "entity_framing"
TASK = "narrative_classification"

# select domains for training and testing: "UA"; "CC"; "UA", "CC";
TRAIN_DOMAIN = ["UA"]
TEST_DOMAIN = ["UA", "CC"] # The test data comes from a separate dataset. 
# The test data is always the same regardless of the domain we choose to train on. This is for consistency.

"""
Note that all articles are now in English, but if we wanted to control for e.g. certain cultural variations of a specific language,
we could exclude articles that were originally written in that language.

Not to use the functionality, 'ALL' should be selected.

"""
# select languages for training and testing: "ALL";"EN";"HI";"BG";"RU";"PT"
TRAIN_LANGUAGES = ["ALL"] 
TEST_LANGUAGES = ["ALL"]

# debug mode -- reduced samples
DEBUG_MODE = False

# change the training hyperparameters here
MODEL_NAME = "roberta-base" # OR "deberta-v3-base"
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 2e-5
MODEL_PATH = f"{TASK}_STL_{'-'.join(TRAIN_DOMAIN)}_to_{'-'.join(TEST_DOMAIN)}.pt" # -- to save the model later


In [13]:
# ==========================
# LOAD AND MERGE DATA
# ==========================

articles = pd.read_csv("train-all-articles.csv")
s1 = pd.read_csv("train-S1-labels.csv")
s2 = pd.read_csv("train-S2-labels.csv")

test_s1_articles = pd.read_csv("test-S1-articles.csv")
test_s1_labels = pd.read_csv("test-S1-labels.csv")
test_s2_articles = pd.read_csv("test-S2-articles.csv")
test_s2_labels = pd.read_csv("test-S2-labels.csv")

# ==========================
# STANDARDISE TEST SET COLUMNS
# ==========================

if TASK == "entity_framing":
    test_s1_labels.rename(columns={"Translated_Entity": "Entity"}, inplace=True)
elif TASK == "narrative_classification":
    test_s2_labels.columns = ["Filename", "Narrative", "Subnarrative"]

# ==========================
# FILTER + SPLIT TRAIN/VAL
# ==========================

# filter domains/languages for train/val
filtered_articles = articles[articles["Domain"].isin(TRAIN_DOMAIN)]
if "ALL" not in TRAIN_LANGUAGES:
    filtered_articles = filtered_articles[filtered_articles["Language"].isin(TRAIN_LANGUAGES)]

# 80/20 train/val split
filtered_articles = filtered_articles.sample(frac=1, random_state=42).reset_index(drop=True)
split_idx = int(0.8 * len(filtered_articles))
train_articles = filtered_articles.iloc[:split_idx].copy()
val_articles = filtered_articles.iloc[split_idx:].copy()

# debug subsampling if needed -- off by default
if DEBUG_MODE:
    train_articles = train_articles.sample(100)
    val_articles = val_articles.sample(100)
    test_s1_articles = test_s1_articles.sample(100)
    test_s2_articles = test_s2_articles.sample(100)


In [15]:
# ==========================
# TASK-SPECIFIC MERGE + PROCESSING
# ==========================

if TASK == "narrative_classification":
    # Merge articles with S2 labels
    df_train = pd.merge(train_articles, s2, on="Filename")
    df_val   = pd.merge(val_articles, s2, on="Filename")
    df_test  = pd.merge(test_s2_articles, test_s2_labels, on="Filename")

    TEXT_COL = "Translated_Text"
    LABEL_COL = "Narrative"

    for df in [df_train, df_val, df_test]:
        df.dropna(subset=[TEXT_COL, LABEL_COL], inplace=True)
        df[LABEL_COL] = df[LABEL_COL].apply(
            lambda x: [s.strip() for s in str(x).split(";") if s.strip().lower() != "nan"]
        )

    # Create shared label space from all available narrative data (zero-shot setup)
    full_set = pd.concat([df_train, df_val, df_test])
    mlb = MultiLabelBinarizer()
    mlb.fit(full_set[LABEL_COL])

    y_train = mlb.transform(df_train[LABEL_COL])
    y_val   = mlb.transform(df_val[LABEL_COL])
    y_test  = mlb.transform(df_test[LABEL_COL])
    num_classes = len(mlb.classes_)

elif TASK == "entity_framing":
    # Merge entity labels with articles
    df_train = pd.merge(s1, train_articles, on="Filename")
    df_val   = pd.merge(s1, val_articles, on="Filename")
    df_test  = pd.merge(test_s1_labels, test_s1_articles, on="Filename")

    TEXT_COL = "Translated_Text"
    LABEL_COL = "Label"

    def insert_entity_marker(text, start, end):
        try:
            start, end = int(start), int(end)
            return text[:start] + "[ENTITY]" + text[start:end] + "[/ENTITY]" + text[end:]
        except:
            return text

    for df in [df_train, df_val, df_test]:
        df.dropna(subset=[TEXT_COL, "Entity", LABEL_COL, "Start", "End"], inplace=True)
        df["Start"] = df["Start"].astype(int)
        df["End"] = df["End"].astype(int)
        df["Input_Text"] = df.apply(lambda row: insert_entity_marker(row[TEXT_COL], row["Start"], row["End"]), axis=1)
        df[LABEL_COL] = df[LABEL_COL].apply(lambda x: [s.strip() for s in str(x).split(",") if s.strip().lower() != "nan"])

    # For entity framing, create a separate label binarizer
    mlb = MultiLabelBinarizer()
    mlb.fit(df_train[LABEL_COL] + df_val[LABEL_COL] + df_test[LABEL_COL])  # full entity label space

    y_train = mlb.transform(df_train[LABEL_COL])
    y_val   = mlb.transform(df_val[LABEL_COL])
    y_test  = mlb.transform(df_test[LABEL_COL])
    num_classes = len(mlb.classes_)

else:
    raise ValueError("Unknown TASK specified.")


In [17]:
# ==========================
# TOKENISATION and DATASET CLASS
# ==========================

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
#tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)

class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

train_dataset = MultiLabelDataset(df_train[TEXT_COL].tolist(), y_train, tokenizer, MAX_LEN)
val_dataset   = MultiLabelDataset(df_val[TEXT_COL].tolist(), y_val, tokenizer, MAX_LEN)
test_dataset  = MultiLabelDataset(df_test[TEXT_COL].tolist(), y_test, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE)




In [19]:
# ==========================
# MODEL CLASS
# ==========================

class TransformerClassifier(nn.Module):
    def __init__(self, model_name, num_classes):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # CLS token
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)


In [21]:
# ==========================
# TRAINING UTILS
# ==========================

def predict_proba(model, loader, device):
    model.eval()
    probs = []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids, attention_mask)
            probs.extend(torch.sigmoid(outputs).cpu().numpy())
    return np.array(probs)

def evaluate_threshold_sweep(y_true, y_pred, thresholds=np.arange(0.1, 0.9, 0.05)):
    best_thresh = 0.5
    best_f1 = 0
    results = []

    for thresh in thresholds:
        y_pred_bin = (y_pred > thresh).astype(int)
        macro = f1_score(y_true, y_pred_bin, average='macro', zero_division=0)
        micro = f1_score(y_true, y_pred_bin, average='micro', zero_division=0)
        exact = (y_pred_bin == y_true).all(axis=1).mean()

        results.append((thresh, macro, micro, exact))
        if macro > best_f1:
            best_f1 = macro
            best_thresh = thresh

    print("Threshold sweep results:")
    for t, macro, micro, exact in results:
        print(f"Thresh {t:.2f} | Macro F1: {macro:.3f} | Micro F1: {micro:.3f} | Exact Match: {exact:.3f}")

    print(f"\n Best threshold = {best_thresh:.2f} with Macro F1 = {best_f1:.3f}")
    return best_thresh


In [24]:
# ==========================
# TRAINING LOOP
# ==========================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(MODEL_NAME, num_classes).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()
best_macro_f1 = 0.0 

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"\nEpoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")

    # validation
    val_probs = predict_proba(model, val_loader, device)
    threshold = evaluate_threshold_sweep(y_val, val_probs)
    y_val_pred = (val_probs > threshold).astype(int)
    macro_f1 = f1_score(y_val, y_val_pred, average="macro", zero_division=0)
    print(f"Validation Macro F1 (Epoch {epoch+1}): {macro_f1:.4f}")

    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(model.state_dict(), MODEL_PATH)
        print(f"Saved best model (Epoch {epoch+1}) to {MODEL_PATH}")


  _torch_pytree._register_pytree_node(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 192/192 [01:18<00:00,  2.43it/s]

Epoch 1: Loss = 0.2709
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.241 | Micro F1: 0.402 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.237 | Micro F1: 0.456 | Exact Match: 0.008
Thresh 0.20 | Macro F1: 0.219 | Micro F1: 0.491 | Exact Match: 0.089
Thresh 0.25 | Macro F1: 0.225 | Micro F1: 0.505 | Exact Match: 0.189
Thresh 0.30 | Macro F1: 0.201 | Micro F1: 0.488 | Exact Match: 0.197
Thresh 0.35 | Macro F1: 0.191 | Micro F1: 0.456 | Exact Match: 0.194
Thresh 0.40 | Macro F1: 0.143 | Micro F1: 0.366 | Exact Match: 0.173
Thresh 0.45 | Macro F1: 0.099 | Micro F1: 0.287 | Exact Match: 0.121
Thresh 0.5

In [None]:
torch.cuda.empty_cache()

In [25]:
# ==========================
# FIXED THRESHOLD EVALUATION
# ==========================

def evaluate(loader, df_source, mlb, label="TEST", threshold=0.25): 

    """
    Evaluates a multi-label classification model using a fixed probability threshold.

    Args:
        loader (DataLoader): A PyTorch DataLoader yielding batches of tokenised input data
        df_source (pd.DataFrame): Source dataframe containing metadata for each example, including domain info.
        mlb (MultiLabelBinarizer): The fitted multi-label binarizer used for encoding and decoding labels.
        label (str, optional): Label for the dataset (e.g., 'TEST', 'VALIDATION'). Used for logging. Defaults to "TEST".
        threshold (float, optional): Probability threshold to convert predicted probabilities into binary labels. Defaults to 0.25.

    Returns:
        dict: A dictionary containing overall macro F1, micro F1, exact match score, 
              the threshold used, and the list of labels used after filtering.
              Also prints per-domain breakdowns of these metrics.

    Notes:
        - Filters out labels that are completely unseen in both predictions and ground truths 
          to avoid skewed metric calculations.
        - Performs evaluation on the entire dataset as well as broken down by domain.
    """
    model.eval()
    y_true, y_pred, domains = [], [], []

    with torch.no_grad():
        for i, batch in enumerate(tqdm(loader, desc=f"Evaluating {label}")):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].cpu().numpy()

            outputs = model(input_ids, attention_mask)
            probs = torch.sigmoid(outputs).cpu().numpy()

            y_pred.extend(probs)
            y_true.extend(labels)

            start = i * loader.batch_size
            end = start + len(labels)
            domains.extend(df_source["Domain"].iloc[start:end].tolist())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    domains = np.array(domains)

    y_pred_bin = (y_pred > threshold).astype(int)

    # filter columns where y_true or y_pred has no samples (i.e., unseen label)
    mask = (y_true.sum(axis=0) + y_pred_bin.sum(axis=0)) > 0
    y_true = y_true[:, mask]
    y_pred_bin = y_pred_bin[:, mask]
    filtered_labels = np.array(mlb.classes_)[mask]

    macro = f1_score(y_true, y_pred_bin, average="macro", zero_division=0)
    micro = f1_score(y_true, y_pred_bin, average="micro", zero_division=0)
    exact = (y_pred_bin == y_true).all(axis=1).mean()

    print(f"\n {label} (Fixed Threshold={threshold:.2f}):")
    print(f"Macro F1: {macro:.3f}")
    print(f"Micro F1: {micro:.3f}")
    print(f"Exact Match: {exact:.3f}")

    print("\n----------------------------")
    print("Per-Domain Breakdown")
    print("----------------------------")
    for domain in np.unique(domains):
        idx = np.where(domains == domain)[0]
        y_true_d = y_true[idx]
        y_pred_d = y_pred_bin[idx]

        macro_d = f1_score(y_true_d, y_pred_d, average="macro", zero_division=0)
        micro_d = f1_score(y_true_d, y_pred_d, average="micro", zero_division=0)
        exact_d = (y_pred_d == y_true_d).all(axis=1).mean()

        print(f"\n Domain: {domain}")
        print(f"Macro F1: {macro_d:.3f}")
        print(f"Micro F1: {micro_d:.3f}")
        print(f"Exact Match: {exact_d:.3f}")

    return {
        "macro": macro,
        "micro": micro,
        "exact": exact,
        "threshold": threshold,
        "labels_used": filtered_labels.tolist()
    }


def evaluate_and_compare_fixed_thresh(val_loader, df_val, test_loader, df_test, mlb, threshold=0.25):
    print("\n=========================")
    print("Validation (Fixed Threshold)")
    print("=========================")
    val_results = evaluate(val_loader, df_val.reset_index(drop=True), mlb, label="VALIDATION", threshold=threshold)

    print("\n=========================")
    print("Test (Fixed Threshold)")
    print("=========================")
    test_results = evaluate(test_loader, df_test.reset_index(drop=True), mlb, label="TEST", threshold=threshold)

    print("\n=========================")
    print("OOD Generalization (Fixed Threshold)")
    print("=========================")
    macro_drop = val_results["macro"] - test_results["macro"]
    print(f"Δ Macro F1 (val - test): {macro_drop:.3f}")

    return {
        "val": val_results,
        "test": test_results,
        "ood_gap_macro": macro_drop
    }



In [26]:
results = evaluate_and_compare_fixed_thresh(
    val_loader, df_val,
    test_loader, df_test,
    mlb
)


Validation (Fixed Threshold)
Evaluating VALIDATION: 100%|██████████| 48/48 [00:07<00:00,  6.66it/s]

 VALIDATION (Fixed Threshold=0.25):
Macro F1: 0.355
Micro F1: 0.578
Exact Match: 0.299

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.192
Micro F1: 0.570
Exact Match: 0.460

 Domain: UA
Macro F1: 0.189
Micro F1: 0.581
Exact Match: 0.231

Test (Fixed Threshold)
Evaluating TEST: 100%|██████████| 23/23 [00:03<00:00,  6.74it/s]

 TEST (Fixed Threshold=0.25):
Macro F1: 0.268
Micro F1: 0.513
Exact Match: 0.247

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.229
Micro F1: 0.592
Exact Match: 0.452

 Domain: UA
Macro F1: 0.150
Micro F1: 0.473
Exact Match: 0.105

OOD Generalization (Fixed Threshold)
Δ Macro F1 (val - test): 0.087


## Zero-Shot Narrative Classification -- experimenting with label embeddings

In [32]:

#Zero-Shot Narrative Classification


zs_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load your fine-tuned model from checkpoint
fine_tuned_model = AutoModel.from_pretrained(MODEL_NAME)
state_dict = torch.load(MODEL_PATH, map_location='cpu')
fine_tuned_model.load_state_dict(state_dict, strict=False)
fine_tuned_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fine_tuned_model.to(device)

# Use narrative labels from MultiLabelBinarizer
zs_labels = list(mlb.classes_)

# Encode label names into embeddings
with torch.no_grad():
    zs_label_inputs = zs_tokenizer(zs_labels, padding=True, truncation=True, return_tensors="pt").to(device)
    zs_label_outputs = fine_tuned_model(**zs_label_inputs)
    zs_label_embeddings = zs_label_outputs.last_hidden_state[:, 0, :]
    zs_label_embeddings = F.normalize(zs_label_embeddings, dim=1)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:

# Zero-shot prediction function
def zero_shot_predict(text, top_k=3):
    with torch.no_grad():
        input_enc = zs_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        output = fine_tuned_model(**input_enc)
        text_embedding = output.last_hidden_state[:, 0, :]
        text_embedding = F.normalize(text_embedding, dim=1)

        sims = F.cosine_similarity(text_embedding, zs_label_embeddings)
        topk_indices = torch.topk(sims, k=top_k).indices.tolist()
        return [(zs_labels[i], sims[i].item()) for i in topk_indices]


In [40]:

# Run zero-shot predictions on validation or test set
# You can switch df_val to df_test here
print("Running zero-shot predictions on test set...")

zs_results = []
for text in df_test[TEXT_COL].tolist():
    zs_results.append(zero_shot_predict(text, top_k=3))

# Example: print the first 3 predictions
for i in range(3):
    print(f"Text: {df_test[TEXT_COL].iloc[i][:120]}...")
    print("Predictions:", zs_results[i])
    print()


Running zero-shot predictions on test set...
Text: General Milley: Russian military stocks rapidly depleting, soldiers demoralized 

“Russia remains isolated. Their milita...
Predictions: [('URW: Speculating war outcomes', 0.995092511177063), ('URW: Discrediting Ukraine', 0.995059609413147), ('URW: Blaming the war on others rather than the invader', 0.9949306845664978)]

Text: Ukrainian nationalism, Ukrainian patriotism will be their downfall. 

 Ukrainian nationalism, Ukrainian patriotism will ...
Predictions: [('URW: Russia is the Victim', 0.9955462217330933), ('URW: Discrediting Ukraine', 0.9954404234886169), ('URW: Blaming the war on others rather than the invader', 0.9953905344009399)]

Text: Medvedev: Russia Seeks More in Ukraine, 'Probably Should Be Kyiv' 

 Russian troops will go much further into Ukraine, t...
Predictions: [('URW: Praise of Russia', 0.9958609342575073), ('URW: Speculating war outcomes', 0.9958573579788208), ('URW: Discrediting Ukraine', 0.9958275556564331)]



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=49d39932-ba1f-4621-a036-ab99ade88496' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>