In [111]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import jensenshannon, cosine
from scipy.stats import chi2_contingency
from scipy.spatial.distance import cdist
from scipy.linalg import norm
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from wordcloud import WordCloud

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import DebertaV2Tokenizer, RobertaTokenizer
from transformers import AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
import gc
from tqdm import tqdm
import sentencepiece


# Multi-Task Learning

Below is the current pipeline for multi-task learning. The control-panel cell below configures the run; the pipeline covers two tasks: "narrative_classification" and "entity_framing".

We can also specify the training and test domains.

In [159]:
# ==========================
# CONTROL PANEL
# ==========================

# Pipeline mode. The training cell below always passes both the
# "narrative_classification" and the "entity_framing" loaders to train_mtl, so in the
# cells visible here this value is only used to build MODEL_PATH.
TASK = "multi_task"

# Domains for training and testing: "UA", "CC", or both.
# The test data comes from a separate dataset and is the same regardless of the
# domain chosen for training. This is for consistency.
TRAIN_DOMAIN = ["CC"]
TEST_DOMAIN = ["UA", "CC"]

# All articles are now in English, but to control for e.g. cultural variations tied
# to a specific language, articles originally written in that language can be
# excluded. Select "ALL" to switch this filter off.
# Options: "ALL"; "EN"; "HI"; "BG"; "RU"; "PT"
# NOTE(review): TEST_LANGUAGES is not read by any of the cells visible here.
TRAIN_LANGUAGES = ["ALL"]
TEST_LANGUAGES = ["ALL"]

# Debug mode: when True, the split cell subsamples every article table.
DEBUG_MODE = False

# Training hyperparameters
MODEL_NAME = "roberta-base"  # OR "deberta-v3-base"
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 2e-5

# Consistent checkpoint naming -- to save the model later
MODEL_PATH = "{}_MTL_{}_to_{}.pt".format(TASK, "-".join(TRAIN_DOMAIN), "-".join(TEST_DOMAIN))


In [161]:
# ==========================
# LOAD DATA
# ==========================

# Training pool: one article table plus one label table per subtask
# (S1 labels feed entity framing, S2 labels feed narrative classification).
articles = pd.read_csv("train-all-articles.csv")
s1 = pd.read_csv("train-S1-labels.csv")
s2 = pd.read_csv("train-S2-labels.csv")

# Test articles, one file per subtask.
test_s1_articles = pd.read_csv("test-S1-articles.csv")
test_s2_articles = pd.read_csv("test-S2-articles.csv")

# Test labels, with column names aligned to the ones the later cells expect.
test_s1_labels = pd.read_csv("test-S1-labels.csv").rename(columns={"Translated_Entity": "Entity"})
test_s2_labels = pd.read_csv("test-S2-labels.csv")
test_s2_labels.columns = ["Filename", "Narrative", "Subnarrative"]


In [163]:
# ==========================
# FILTER + SPLIT TRAIN/VAL
# ==========================

def _debug_subsample(frame, n=100, seed=42):
    """Return at most `n` rows of `frame`, drawn with a fixed seed.

    Capping at len(frame) avoids the ValueError that DataFrame.sample raises when
    asked for more rows than exist; the seed makes debug runs repeatable.
    """
    return frame.sample(n=min(n, len(frame)), random_state=seed)


# filter domains/languages for shared train/val
filtered_articles = articles[articles["Domain"].isin(TRAIN_DOMAIN)]
if "ALL" not in TRAIN_LANGUAGES:
    filtered_articles = filtered_articles[filtered_articles["Language"].isin(TRAIN_LANGUAGES)]

# 80/20 train/val split at article level (shuffle with a fixed seed, then cut)
filtered_articles = filtered_articles.sample(frac=1, random_state=42).reset_index(drop=True)
split_idx = int(0.8 * len(filtered_articles))
train_articles = filtered_articles.iloc[:split_idx].copy()
val_articles = filtered_articles.iloc[split_idx:].copy()

# debug subsampling
if DEBUG_MODE:
    train_articles = _debug_subsample(train_articles)
    val_articles = _debug_subsample(val_articles)
    test_s1_articles = _debug_subsample(test_s1_articles)
    test_s2_articles = _debug_subsample(test_s2_articles)


In [165]:
# ==========================
# MERGE WITH LABELS
# ==========================

def insert_entity_marker(text, start, end):
    """Wrap text[start:end] in [ENTITY] ... [/ENTITY] markers.

    Returns `text` unchanged when it cannot be sliced/concatenated as a string
    (TypeError). Offsets beyond the end of the string do not raise: Python slicing
    clamps them, so the markers end up at the end of the text.
    NOTE(review): assumes Start/End are character offsets into Translated_Text -- confirm.
    """
    try:
        return text[:start] + "[ENTITY]" + text[start:end] + "[/ENTITY]" + text[end:]
    except TypeError:
        return text


def _split_labels(raw, sep):
    """Split a separator-delimited label cell into stripped labels, skipping 'nan' entries."""
    return [s.strip() for s in str(raw).split(sep) if s.strip().lower() != "nan"]


# ENTITY FRAMING
df_train_s1 = pd.merge(s1, train_articles, on="Filename")
df_val_s1   = pd.merge(s1, val_articles, on="Filename")
df_test_s1  = pd.merge(test_s1_labels, test_s1_articles, on="Filename")

for df in [df_train_s1, df_val_s1, df_test_s1]:
    df.dropna(subset=["Translated_Text", "Entity", "Label", "Start", "End"], inplace=True)
    df["Start"] = df["Start"].astype(int)
    df["End"] = df["End"].astype(int)

    # Built as a plain list so the assignment also works when a split is empty
    # (a row-wise DataFrame.apply on an empty frame does not return a Series).
    df["Input_Text"] = [
        insert_entity_marker(text, start, end)
        for text, start, end in zip(df["Translated_Text"], df["Start"], df["End"])
    ]
    df["Entity_Labels"] = df["Label"].apply(lambda x: _split_labels(x, ","))

# NARRATIVE CLASSIFICATION
df_train_s2 = pd.merge(train_articles, s2, on="Filename")
df_val_s2   = pd.merge(val_articles, s2, on="Filename")
df_test_s2  = pd.merge(test_s2_articles, test_s2_labels, on="Filename")

for df in [df_train_s2, df_val_s2, df_test_s2]:
    df.dropna(subset=["Translated_Text", "Narrative"], inplace=True)
    df["Narrative_Labels"] = df["Narrative"].apply(lambda x: _split_labels(x, ";"))


In [167]:
# ==========================
# MULTI-TASK LABEL BINARISATION
# ==========================

def _binarise_splits(train_labels, val_labels, test_labels):
    """Fit one MultiLabelBinarizer on all three splits and encode each split with it.

    The label vocabulary is taken from train + val + test together, so the three
    encoded matrices share the same columns.
    """
    binariser = MultiLabelBinarizer()
    binariser.fit(pd.concat([train_labels, val_labels, test_labels]))
    encoded = tuple(
        binariser.transform(split) for split in (train_labels, val_labels, test_labels)
    )
    return (binariser, *encoded)


# Narrative Classification
mlb_s2, y_train_s2, y_val_s2, y_test_s2 = _binarise_splits(
    df_train_s2["Narrative_Labels"],
    df_val_s2["Narrative_Labels"],
    df_test_s2["Narrative_Labels"],
)
num_classes_s2 = len(mlb_s2.classes_)

# Entity Framing
mlb_s1, y_train_s1, y_val_s1, y_test_s1 = _binarise_splits(
    df_train_s1["Entity_Labels"],
    df_val_s1["Entity_Labels"],
    df_test_s1["Entity_Labels"],
)
num_classes_s1 = len(mlb_s1.classes_)

# Output width of each task head = number of columns in its label matrix.
task_classes = {
    "narrative_classification": y_train_s2.shape[1],
    "entity_framing": y_train_s1.shape[1],
}


In [169]:
# ==========================
# TOKENISATION and DATASET CLASS
# ==========================

# Resolve the tokenizer class from MODEL_NAME so that switching the checkpoint in the
# control panel does not also require editing this cell. use_fast=False requests the
# slow (Python) tokenizer implementation, the family the explicitly imported
# RobertaTokenizer / DebertaV2Tokenizer classes belong to.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

class MultiTaskDataset(Dataset):
    """Torch dataset that tokenises one text per item and attaches its task labels.

    Args:
        texts: position-indexable collection of input strings.
        task_labels_dict: dict of task_name -> label matrix; row `idx` of each
            matrix holds the labels of `texts[idx]`.
        tokenizer: callable accepting (text, truncation=, padding=, max_length=,
            return_tensors=) and returning a mapping with "input_ids" and
            "attention_mask".
        max_len: length every sequence is padded / truncated to.
    """

    def __init__(self, texts, task_labels_dict, tokenizer, max_len):
        self.texts = texts
        self.task_labels_dict = task_labels_dict  # dict of task_name: label matrix
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return input_ids, attention_mask and one `<task>_labels` float tensor per task."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        # return_tensors="pt" adds a leading batch axis of size 1. Drop exactly that
        # axis: an argument-less squeeze() would also remove the sequence axis
        # whenever max_len is 1.
        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

        for task_name, label_matrix in self.task_labels_dict.items():
            item[f"{task_name}_labels"] = torch.tensor(label_matrix[idx], dtype=torch.float)

        return item


def _make_dataset(frame, text_column, task_name, label_matrix):
    """Wrap one split of one task in a MultiTaskDataset using the shared tokenizer and MAX_LEN."""
    return MultiTaskDataset(frame[text_column].tolist(), {task_name: label_matrix}, tokenizer, MAX_LEN)


# Datasets for Entity Framing (text with the entity markers inserted)
dataset_train_s1 = _make_dataset(df_train_s1, "Input_Text", "entity_framing", y_train_s1)
dataset_val_s1 = _make_dataset(df_val_s1, "Input_Text", "entity_framing", y_val_s1)
dataset_test_s1 = _make_dataset(df_test_s1, "Input_Text", "entity_framing", y_test_s1)

# Datasets for Narrative Classification (plain translated text)
dataset_train_s2 = _make_dataset(df_train_s2, "Translated_Text", "narrative_classification", y_train_s2)
dataset_val_s2 = _make_dataset(df_val_s2, "Translated_Text", "narrative_classification", y_val_s2)
dataset_test_s2 = _make_dataset(df_test_s2, "Translated_Text", "narrative_classification", y_test_s2)

# ==========================
# DATALOADERS FOR MTL
# ==========================
# Only the training loaders shuffle; val/test keep dataset order so that predictions
# stay aligned with the rows of the corresponding dataframe.

# Narrative Classification
train_loader_nc = DataLoader(dataset_train_s2, batch_size=BATCH_SIZE, shuffle=True)
val_loader_nc = DataLoader(dataset_val_s2, batch_size=BATCH_SIZE, shuffle=False)
test_loader_nc = DataLoader(dataset_test_s2, batch_size=BATCH_SIZE, shuffle=False)

# Entity Framing
train_loader_ef = DataLoader(dataset_train_s1, batch_size=BATCH_SIZE, shuffle=True)
val_loader_ef = DataLoader(dataset_val_s1, batch_size=BATCH_SIZE, shuffle=False)
test_loader_ef = DataLoader(dataset_test_s1, batch_size=BATCH_SIZE, shuffle=False)


In [171]:
# ==========================
# MODEL CLASS MTL
# ==========================

class MultiTaskTransformer(nn.Module):
    """Shared transformer encoder with one linear classification head per task.

    Args:
        model_name: checkpoint name passed to AutoModel.from_pretrained.
        num_classes_dict: dict of task_name -> number of output logits. One head
            is created per entry, so the set of tasks is defined by the caller.
    """

    def __init__(self, model_name, num_classes_dict):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)

        hidden_size = self.encoder.config.hidden_size

        # task-specific classifier heads, one per entry of num_classes_dict
        self.task_heads = nn.ModuleDict({
            task_name: nn.Linear(hidden_size, n_classes)
            for task_name, n_classes in num_classes_dict.items()
        })

    def forward(self, input_ids, attention_mask, task):
        """Return the raw logits of head `task` for the given batch.

        Raises:
            KeyError: if `task` has no head.
        """
        # Fail before the (expensive) encoder pass when the task name is wrong.
        if task not in self.task_heads:
            raise KeyError(f"Unknown task '{task}'; available heads: {list(self.task_heads.keys())}")

        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # first (CLS) token
        pooled_output = self.dropout(pooled_output)

        return self.task_heads[task](pooled_output)


In [173]:
# ==========================
# TRAINING LOOP UTILS (MTL)
# ==========================

def train_mtl(model, loaders, val_data, mlbs, optimizer, criterion, device, epochs):
    """Jointly train all task heads and checkpoint the best epoch of each task.

    Every optimisation step draws one batch per task, sums the task losses and
    performs a single backward/step. An epoch has as many steps as the longest
    training loader; shorter loaders are restarted when they run out.

    Args:
        model: module called as model(input_ids, attention_mask, task=...).
        loaders: dict of task_name -> training DataLoader whose batches contain
            "input_ids", "attention_mask" and "<task_name>_labels".
        val_data: dict of task_name -> (val_loader, df_val, y_val, mlb); only
            val_loader and y_val are used here.
        mlbs: unused; kept so existing calls keep working.
        optimizer: optimizer over the model parameters.
        criterion: loss called as criterion(logits, labels).
        device: device the batches are moved to.
        epochs: number of epochs.

    Side effects:
        Whenever a task's validation macro F1 improves, the full model state_dict
        is saved to "<task>_MTL_<train domains>_to_<test domains>.pt"; the domain
        parts come from the module-level TRAIN_DOMAIN / TEST_DOMAIN.

    Raises:
        ValueError: if any training loader yields no batches.
    """
    task_names = list(loaders.keys())

    empty_tasks = [task for task in task_names if len(loaders[task]) == 0]
    if empty_tasks:
        raise ValueError(f"Training loaders with no batches: {empty_tasks}")

    def _endless(loader):
        # Restart the loader each time it is exhausted. Unlike itertools.cycle this
        # keeps no copy of past batches in memory, and a shuffling DataLoader draws
        # a new order on every restart instead of replaying the first pass.
        while True:
            yield from loader

    best_macro_f1 = {task: 0.0 for task in task_names}

    # Find the length of the longest dataloader
    max_len = max(len(loader) for loader in loaders.values())

    for epoch in range(epochs):
        print(f"\n Starting Epoch {epoch+1}/{epochs}...")
        model.train()
        total_loss = 0.0

        # fresh iterators every epoch (shorter loaders repeat to match max_len)
        iters = {task: _endless(loaders[task]) for task in task_names}

        for _ in range(max_len):
            optimizer.zero_grad()
            loss = 0.0

            for task in task_names:
                batch = next(iters[task])
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch[f"{task}_labels"].to(device)

                outputs = model(input_ids, attention_mask, task=task)
                task_loss = criterion(outputs, labels)
                loss += task_loss

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / max_len
        print(f"\nEpoch {epoch+1} - Average MTL Loss: {avg_loss:.4f}")

        # =====================
        # VALIDATION PER TASK
        # =====================
        model.eval()
        for task in val_data:
            val_loader, _df_val, y_val, _mlb = val_data[task]

            print(f"\n Validating task: {task}")
            threshold = 0.25  # fixed threshold

            # predict_proba (defined in this cell) runs the no-grad forward pass
            # and returns the sigmoid probabilities for the whole loader.
            y_pred = predict_proba(model, val_loader, task, device)
            y_pred_bin = (y_pred > threshold).astype(int)

            macro = f1_score(y_val, y_pred_bin, average="macro", zero_division=0)
            print(f"[{task}] Macro F1: {macro:.4f}")

            # Strict '>' against an initial 0.0: a task whose macro F1 never rises
            # above zero never writes a checkpoint.
            if macro > best_macro_f1[task]:
                best_macro_f1[task] = macro
                save_path = f"{task}_MTL_{'-'.join(TRAIN_DOMAIN)}_to_{'-'.join(TEST_DOMAIN)}.pt"
                torch.save(model.state_dict(), save_path)
                print(f" Best model for task '{task}' saved to {save_path}")


def predict_proba(model, loader, task, device):
    """Run `model` over `loader` for `task` and return sigmoid probabilities.

    The model is switched to eval mode and gradients are disabled. The result is a
    numpy array built from one row per sample, in loader order.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in loader:
            logits = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                task=task,
            )
            collected.extend(torch.sigmoid(logits).cpu().numpy())

    return np.array(collected)



In [179]:
# ==========================
# RUN MTL TRAINING
# ==========================

# Seed the generators used for head initialisation, dropout and DataLoader shuffling
# so that a re-run of this cell reproduces the same training trajectory.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model with correct class counts per task
task_classes = {
    "narrative_classification": y_train_s2.shape[1],
    "entity_framing": y_train_s1.shape[1]
}

model = MultiTaskTransformer(MODEL_NAME, task_classes).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# Sigmoid + binary cross-entropy per label: both tasks are multi-label.
criterion = nn.BCEWithLogitsLoss()

# Run training loop
train_mtl(
    model=model,
    loaders={
        "narrative_classification": train_loader_nc,
        "entity_framing": train_loader_ef
    },
    val_data={
        "narrative_classification": (val_loader_nc, df_val_s2, y_val_s2, mlb_s2),
        "entity_framing": (val_loader_ef, df_val_s1, y_val_s1, mlb_s1)
    },
    mlbs={
        "narrative_classification": mlb_s2,
        "entity_framing": mlb_s1
    },
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

 Starting Epoch 1/3...

Epoch 1 - Average MTL Loss: 0.6962

 Validating task: narrative_classification
[narrative_classification] Macro F1: 0.0499
 Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

 Validating task: entity_framing
[entity_framing] Macro F1: 0.0559
 Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

 Starting Epoch 2/3...

Epoch 2 - Average MTL Loss: 0.4104

 Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1685
 Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

 Validating task: entity_framing
[entity_framing] Macro F1: 0.1257
 Best mode

In [None]:
# ==========================
# FIXED THRESHOLD EVALUATION
# ==========================

def evaluate_mtl(model, loader, df_source, y_true, mlb, task, label="TEST", threshold=0.25, device=None):
    """Evaluate one task at a fixed threshold, overall and per domain.

    Args:
        model: module called as model(input_ids, attention_mask, task=...).
        loader: non-shuffling DataLoader whose samples are in the row order of
            `df_source` / `y_true`.
        df_source: dataframe with a "Domain" column, one row per sample.
        y_true: binary label matrix aligned with `df_source`.
        mlb: fitted binarizer; `classes_` supplies the label names.
        task: name of the head to evaluate.
        label: tag used in the progress bar and printed header.
        threshold: probability above which a label counts as predicted.
        device: device for the batches; defaults to the device of the model's
            parameters.

    Returns:
        dict with "macro", "micro", "exact" (overall scores) and "labels_used".

    Raises:
        ValueError: if the number of predictions differs from len(df_source).
    """
    if device is None:
        device = next(model.parameters()).device

    model.eval()
    y_pred = []

    with torch.no_grad():
        for batch in tqdm(loader, desc=f"Evaluating {label} [{task}]"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids, attention_mask, task=task)
            probs = torch.sigmoid(outputs).cpu().numpy()
            y_pred.extend(probs)

    y_pred = np.array(y_pred)
    y_true = np.array(y_true)

    # Predictions come back in loader order, which is the row order of df_source, so
    # the domain of each sample is read directly from the frame instead of being
    # reconstructed from batch offsets.
    if len(df_source) != len(y_pred):
        raise ValueError(
            f"df_source has {len(df_source)} rows but the loader produced {len(y_pred)} predictions"
        )
    domains = np.array(df_source["Domain"].tolist())

    y_pred_bin = (y_pred > threshold).astype(int)

    # For narrative classification: drop labels that are neither present in y_true
    # nor predicted, so they do not enter the macro average.
    if task == "narrative_classification":
        mask = (y_true.sum(axis=0) + y_pred_bin.sum(axis=0)) > 0
        y_true = y_true[:, mask]
        y_pred_bin = y_pred_bin[:, mask]
        filtered_labels = np.array(mlb.classes_)[mask]
    else:
        filtered_labels = np.array(mlb.classes_)

    macro = f1_score(y_true, y_pred_bin, average="macro", zero_division=0)
    micro = f1_score(y_true, y_pred_bin, average="micro", zero_division=0)
    exact = (y_pred_bin == y_true).all(axis=1).mean()

    print(f"\n{label} ({task}) [Threshold={threshold:.2f}]")
    print(f"Macro F1: {macro:.3f}")
    print(f"Micro F1: {micro:.3f}")
    print(f"Exact Match: {exact:.3f}")

    print("\n----------------------------")
    print("Per-Domain Breakdown")
    print("----------------------------")
    # Note: the per-domain macro F1 averages over the same label columns as the
    # overall score, including labels that never occur in that domain.
    for domain in np.unique(domains):
        idx = np.where(domains == domain)[0]
        y_true_d = y_true[idx]
        y_pred_d = y_pred_bin[idx]

        macro_d = f1_score(y_true_d, y_pred_d, average="macro", zero_division=0)
        micro_d = f1_score(y_true_d, y_pred_d, average="micro", zero_division=0)
        exact_d = (y_pred_d == y_true_d).all(axis=1).mean()

        print(f"\nDomain: {domain}")
        print(f"Macro F1: {macro_d:.3f}")
        print(f"Micro F1: {micro_d:.3f}")
        print(f"Exact Match: {exact_d:.3f}")

    return {
        "macro": macro,
        "micro": micro,
        "exact": exact,
        "labels_used": filtered_labels.tolist()
    }


In [180]:
# ==========================
# TEST SET EVALUATION
# ==========================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("\n=========================")
print("FINAL TEST EVALUATION")
print("=========================")

# Task-specific test data: (loader, source dataframe, label matrix, binarizer)
test_bundles = {
    "narrative_classification": (test_loader_nc, df_test_s2, y_test_s2, mlb_s2),
    "entity_framing": (test_loader_ef, df_test_s1, y_test_s1, mlb_s1)
}

for task in ["narrative_classification", "entity_framing"]:
    print(f"\n--- Task: {task.upper()} ---")

    # Load the best checkpoint saved for this task during training.
    # map_location lets a checkpoint written on a GPU be loaded on a CPU-only machine.
    task_model_path = f"{task}_MTL_{'-'.join(TRAIN_DOMAIN)}_to_{'-'.join(TEST_DOMAIN)}.pt"
    model.load_state_dict(torch.load(task_model_path, map_location=device))
    model.to(device)
    model.eval()

    test_loader, df_test, y_test, mlb = test_bundles[task]

    # Evaluate
    evaluate_mtl(
        model=model,
        loader=test_loader,
        df_source=df_test,
        y_true=y_test,
        mlb=mlb,
        task=task,
        label="TEST"
    )



FINAL TEST EVALUATION

--- Task: NARRATIVE_CLASSIFICATION ---
Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:02<00:00,  8.69it/s]

TEST (narrative_classification) [Threshold=0.25]
Macro F1: 0.116
Micro F1: 0.229
Exact Match: 0.140

----------------------------
Per-Domain Breakdown
----------------------------

Domain: CC
Macro F1: 0.187
Micro F1: 0.570
Exact Match: 0.315

Domain: UA
Macro F1: 0.016
Micro F1: 0.025
Exact Match: 0.019

--- Task: ENTITY_FRAMING ---
Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:06<00:00,  8.56it/s]
TEST (entity_framing) [Threshold=0.25]
Macro F1: 0.126
Micro F1: 0.498
Exact Match: 0.145

----------------------------
Per-Domain Breakdown
----------------------------

Domain: CC
Macro F1: 0.165
Micro F1: 0.713
Exact Match: 0.526

Domain: UA
Macro F1: 0.101
Micro F1: 0.434
Exact Match: 0.040



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=49d39932-ba1f-4621-a036-ab99ade88496' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>