In [7]:
# RUN THESE IMPORTS FIRST
# Later cells use names imported here (e.g. `nn`, `eval_util`, `MultiTaskDataset`)
# without importing them again, so this cell has to run before them.
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaV2Tokenizer, AutoModel, RobertaTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report
import numpy as np
import shap
from captum.attr import IntegratedGradients
from transformers import AutoTokenizer, Trainer, TrainingArguments
import torch.nn.functional as F
import hf_xet
import optuna
import sys
import importlib # standard-library module; no pip install is needed
sys.path.append('.')  # make the project modules in the current directory importable


### Custom built modules ###
# Each project module is imported and then passed to importlib.reload so that
# edits to the .py files are picked up without restarting the kernel. The
# `from ... import ...` line after each reload rebinds the names to the
# reloaded definitions.
import importlib

import data_loader_STL
importlib.reload(data_loader_STL)
from data_loader_STL import prepare_data_STL_fine, prepare_data_STL_hierarchical, prepare_data_STL_coarse

import single_task
importlib.reload(single_task)
from single_task import TransformerClassifier, MultiLabelDataset, train_single_task_model, train_hierarchical_classifier

import multi_task
importlib.reload(multi_task)
from multi_task import MultiTaskTransformer, train_mtl_flat, train_mtl_hierarchical, apply_hierarchical_constraints_mtl, hierarchical_loss_mtl, AdapterMultiTaskTransformer

import data_loader_MTL
importlib.reload(data_loader_MTL)
from data_loader_MTL import prepare_data_MTL_fine_flat, prepare_data_MTL_hierarchical, prepare_data_MTL_coarse, MultiTaskDataset

# The evaluation module is also kept under the alias `eval_util`; the training
# cell passes eval_util.predict_proba / eval_util.evaluate_threshold_sweep.
import evaluation_utils as eval_util
importlib.reload(eval_util)
from evaluation_utils import evaluate_flat, evaluate_hierarchy, evaluate_mtl_all_tasks, evaluate_per_class_flat, evaluate_per_domain_flat, predict_proba, evaluate_threshold_sweep, evaluate_mtl_hierarchical_task, evaluate_mtl_hierarchical_all_tasks, evaluate_flat_custom, compute_fine_vs_coarse_metrics, get_coarse_label_list


  from .autonotebook import tqdm as notebook_tqdm


# Master Notebook

This notebook is the single entry point for the thesis experiments: from here the user can run every model and experimental condition used in the thesis.

## Hyperparameters

In [2]:
import sys
sys.path.append(".")  # Ensure current directory is in path

from merged_optuna_script import objective_stl, objective_mtl, objective_mtl_adapter
import optuna
import pandas as pd

# === Fast Experiment Sweep ===
# Runs a short Optuna study for every (setup, task, encoder) combination below
# and collects the best trial of each study into one CSV file.

# Number of Optuna trials per study (kept small: this is a quick sweep).
N_TRIALS = 3
# Single source of truth for the output path, so the file that is written and
# the path reported in the final log message cannot drift apart.
RESULTS_CSV = "optuna_quick_sweep_results_adapter.csv"

EXPERIMENTS = [
    {"setup": "stl", "task": "entity_framing", "encoder": "roberta-base"},
    {"setup": "stl", "task": "narrative_classification", "encoder": "roberta-base"},
    {"setup": "mtl", "task": None, "encoder": "roberta-base"},
    {"setup": "stl", "task": "entity_framing", "encoder": "distilbert-base-uncased"},
    {"setup": "stl", "task": "narrative_classification", "encoder": "distilbert-base-uncased"},
    {"setup": "mtl", "task": None, "encoder": "distilbert-base-uncased"},
    {"setup": "mtl_adapter", "task": None, "encoder": "roberta-base"},
    {"setup": "mtl_adapter", "task": None, "encoder": "distilbert-base-uncased"},
]


def _build_objective(setup, task, encoder):
    """Return the one-argument Optuna objective for the given experiment.

    Args:
        setup: "stl", "mtl" or "mtl_adapter".
        task: task name forwarded to ``objective_stl`` (unused for MTL setups).
        encoder: model name forwarded as ``model_name``.

    Raises:
        ValueError: if ``setup`` is not one of the supported values.
    """
    if setup == "stl":
        return lambda trial: objective_stl(trial, task_type=task, model_name=encoder)
    if setup == "mtl":
        return lambda trial: objective_mtl(trial, model_name=encoder)
    if setup == "mtl_adapter":
        return lambda trial: objective_mtl_adapter(trial, model_name=encoder)
    raise ValueError(f"Unknown setup: {setup}")


all_results = []

for config in EXPERIMENTS:
    setup = config["setup"]
    task = config["task"]
    encoder = config["encoder"]

    print(f"\n Starting Optuna Study → Setup: {setup.upper()} | Task: {task or 'MTL'} | Encoder: {encoder}")

    # Resolve the objective first so an unknown setup fails before a study is created.
    objective = _build_objective(setup, task, encoder)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=N_TRIALS)

    best_params = study.best_trial.params
    best_score = study.best_trial.value

    print(f"\n Best hyperparameters for {setup.upper()} | {task or 'MTL'} | {encoder}:")
    for k, v in best_params.items():
        print(f"  {k}: {v}")
    print(f"  score: {best_score:.4f}")

    # One flat record per study: identifying fields, best score, then the
    # best trial's hyperparameters as extra columns.
    all_results.append({
        "setup": setup,
        "task": task or "mtl",
        "encoder": encoder,
        "score": best_score,
        **best_params
    })

# === Save results ===
df = pd.DataFrame(all_results)
df.to_csv(RESULTS_CSV, index=False)
# Report the path that was actually written (the previous message named a
# different file than the one saved).
print(f"\n Saved results to {RESULTS_CSV}")


[I 2025-05-10 10:29:04,051] A new study created in memory with name: no-name-efb61a5a-df9a-4bc7-a131-8da529d3017d

 Starting Optuna Study → Setup: MTL_ADAPTER | Task: MTL | Encoder: roberta-base
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fca33dea170>>
Traceback (most recent call last):
  File "/toolkit-cache/0.2.16/python3.10/kernel-libs/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Starting Epoch 1/2...
[W 2025-05-10 10:29:12,061] Trial 0 failed with parameters: {'learning_rate': 2.5620923423875518e-05, 'batch_s

KeyboardInterrupt: 

In [40]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

from single_task import TransformerClassifier, train_single_task_model, MultiLabelDataset
from multi_task import MultiTaskTransformer, AdapterMultiTaskTransformer, train_mtl_flat
from data_loader_STL import prepare_data_STL_fine
from data_loader_MTL import prepare_data_MTL_fine_flat
from evaluation_utils import evaluate_flat_custom, compute_fine_vs_coarse_metrics, get_coarse_label_list

# Train every configuration listed in SETUPS and write one summary CSV per
# configuration with fine/coarse metrics for each evaluation domain.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fixed hyperparameters applied to every setup trained in this cell.
PARAMS = {
    "learning_rate": 3e-5,
    "batch_size": 8,
    "epochs": 3,
    "threshold": 0.35,
    "max_len": 512
}

# Un-comment an entry to include that configuration in the run.
SETUPS = [
    #{"setup": "stl", "task": "entity_framing", "encoder": "roberta-base"},
    #{"setup": "stl", "task": "narrative_classification", "encoder": "roberta-base"},
    #{"setup": "mtl", "task": None, "encoder": "roberta-base"},
    #{"setup": "mtl_adapter", "task": None, "encoder": "roberta-base"},
    #{"setup": "stl", "task": "entity_framing", "encoder": "distilbert-base-uncased"},
    #{"setup": "stl", "task": "narrative_classification", "encoder": "distilbert-base-uncased"},
    {"setup": "mtl", "task": None, "encoder": "distilbert-base-uncased"},
    #{"setup": "mtl_adapter", "task": None, "encoder": "distilbert-base-uncased"},
]

TRAIN_SPLITS = [["CC"]]
EVAL_SPLITS = ["UA", "CC"]

SUMMARY_COLUMNS = [
    "setup", "encoder", "task", "train_domain", "eval_domain",
    "overall_macro", "overall_micro", "overall_exact",
    "macro_fine", "micro_fine", "macro_coarse", "micro_coarse"
]


def _summary_row(setup, encoder, task, train_str, domain, score_dict, eval_result):
    """Build one row of the summary table (keys match SUMMARY_COLUMNS).

    The "overall" macro/micro values are the mean of the fine and coarse
    scores in ``score_dict``; "overall_exact" comes from ``eval_result["exact"]``.
    All metrics are rounded to four decimals.
    """
    return {
        "setup": setup,
        "encoder": encoder,
        "task": task,
        "train_domain": train_str,
        "eval_domain": domain,
        "overall_macro": round((score_dict["macro_fine"] + score_dict["macro_coarse"]) / 2, 4),
        "overall_micro": round((score_dict["micro_fine"] + score_dict["micro_coarse"]) / 2, 4),
        "overall_exact": round(eval_result["exact"], 4),
        "macro_fine": round(score_dict["macro_fine"], 4),
        "micro_fine": round(score_dict["micro_fine"], 4),
        "macro_coarse": round(score_dict["macro_coarse"], 4),
        "micro_coarse": round(score_dict["micro_coarse"], 4),
    }


for config in SETUPS:
    setup = config["setup"]
    task = config["task"]
    encoder = config["encoder"]
    tokenizer = AutoTokenizer.from_pretrained(encoder)

    setup_name = f"{setup}_{task or 'mtl'}_{encoder.replace('/', '-')}"
    csv_path = f"results_summary__{setup_name}.csv"
    all_rows = []

    for train_domains in TRAIN_SPLITS:
        train_str = "+".join(train_domains)

        if setup == "stl":
            df_train, df_val, df_test, y_train, y_val, y_test, mlb, text_col, label_col = prepare_data_STL_fine(
                task, train_domains, EVAL_SPLITS
            )
            train_dataset = MultiLabelDataset(df_train[text_col].tolist(), y_train, tokenizer, PARAMS["max_len"])
            val_dataset = MultiLabelDataset(df_val[text_col].tolist(), y_val, tokenizer, PARAMS["max_len"])
            train_loader = DataLoader(train_dataset, batch_size=PARAMS["batch_size"], shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=PARAMS["batch_size"])

            model = TransformerClassifier(encoder, len(mlb.classes_)).to(device)
            model = train_single_task_model(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                y_val=y_val,
                MODEL_PATH="tmp.pt",
                LEARNING_RATE=PARAMS["learning_rate"],
                EPOCHS=PARAMS["epochs"],
                device=device
            )

            # Both values depend only on the task / fitted binarizer, not on
            # the evaluation domain, so compute them once.
            coarse_list = get_coarse_label_list(task)
            known_labels = set(mlb.classes_)

            for domain in EVAL_SPLITS:
                df_eval = df_test[df_test["Domain"] == domain].copy()
                # Drop labels the binarizer was not fitted on before transforming.
                df_eval[label_col] = df_eval[label_col].apply(lambda labels: [l for l in labels if l in known_labels])
                y_eval = mlb.transform(df_eval[label_col])
                test_loader = DataLoader(
                    MultiLabelDataset(df_eval[text_col].tolist(), y_eval, tokenizer, PARAMS["max_len"]),
                    batch_size=PARAMS["batch_size"]
                )
                eval_result = evaluate_flat_custom(model, test_loader, df_eval, mlb, device, threshold=PARAMS["threshold"])
                score_dict = compute_fine_vs_coarse_metrics(eval_result["y_true"], eval_result["y_pred_bin"], list(mlb.classes_), coarse_list)

                all_rows.append(_summary_row(setup, encoder, task, train_str, domain, score_dict, eval_result))

        elif setup in ["mtl", "mtl_adapter"]:
            (
                df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                train_loader_s1, val_loader_s1, test_loader_s1,
                train_loader_s2, val_loader_s2, test_loader_s2,
                num_classes_dict
            ) = prepare_data_MTL_fine_flat(
                TASK="multi_task",
                model_name=encoder,
                max_len=PARAMS["max_len"],
                batch_size=PARAMS["batch_size"],
                train_domains=train_domains,
                test_domains=EVAL_SPLITS,
                train_languages=["ALL"]
            )

            task_classes = {
                "entity_framing": y_train_s1.shape[1],
                "narrative_classification": y_train_s2.shape[1]
            }
            if setup == "mtl":
                model = MultiTaskTransformer(encoder, task_classes).to(device)
            else:
                model = AdapterMultiTaskTransformer(model_name=encoder, num_classes_dict=task_classes).to(device)

            optimizer = torch.optim.AdamW(model.parameters(), lr=PARAMS["learning_rate"])
            criterion = torch.nn.BCEWithLogitsLoss()

            # test_domain uses EVAL_SPLITS (not a second hard-coded list) so it
            # always agrees with the checkpoint filename built below.
            # NOTE(review): presumably train_mtl_flat derives its checkpoint
            # names from train_domain/test_domain - confirm in multi_task.py.
            train_mtl_flat(
                model=model,
                loaders={"entity_framing": train_loader_s1, "narrative_classification": train_loader_s2},
                val_data={
                    "entity_framing": (val_loader_s1, df_val_s1, y_val_s1, mlb_s1),
                    "narrative_classification": (val_loader_s2, df_val_s2, y_val_s2, mlb_s2)
                },
                mlbs={"entity_framing": mlb_s1, "narrative_classification": mlb_s2},
                optimizer=optimizer,
                criterion=criterion,
                device=device,
                epochs=PARAMS["epochs"],
                train_domain=train_domains,
                test_domain=EVAL_SPLITS
            )

            # (subtask, test frame, binarizer, text column, label column)
            eval_specs = [
                ("entity_framing", df_test_s1, mlb_s1, "Input_Text", "Label"),
                ("narrative_classification", df_test_s2, mlb_s2, "Translated_Text", "Label"),
            ]

            for domain in EVAL_SPLITS:
                for subtask, df_test, mlb, text_key, label_key in eval_specs:
                    df_eval = df_test[df_test["Domain"] == domain].copy()
                    known_labels = set(mlb.classes_)
                    # Keep only labels known to the binarizer; non-list cells become [].
                    df_eval[label_key] = df_eval[label_key].apply(
                        lambda labels: [l for l in labels if l in known_labels] if isinstance(labels, list) else []
                    )
                    y_eval = mlb.transform(df_eval[label_key])

                    # NOTE(review): the evaluation-only cell further down wraps
                    # the same data in MultiTaskDataset instead - confirm which
                    # batch format evaluate_flat_custom expects when task= is set.
                    test_loader = DataLoader(
                        MultiLabelDataset(df_eval[text_key].tolist(), y_eval, tokenizer, PARAMS["max_len"]),
                        batch_size=PARAMS["batch_size"]
                    )

                    model_path = f"{subtask}_MTL_{'-'.join(train_domains)}_to_{'-'.join(EVAL_SPLITS)}.pt"
                    if os.path.exists(model_path):
                        # map_location lets a checkpoint saved on GPU load on a
                        # CPU-only host (the device above may fall back to CPU).
                        model.load_state_dict(torch.load(model_path, map_location=device))
                        model.to(device)
                        print(f"✅ Loaded model for {subtask}")
                    else:
                        # Evaluation continues with the weights currently in `model`.
                        print(f"⚠️ Missing checkpoint: {model_path}")

                    eval_result = evaluate_flat_custom(
                        model=model,
                        loader=test_loader,
                        df_source=df_eval,
                        mlb=mlb,
                        device=device,
                        threshold=PARAMS["threshold"],
                        task=subtask
                    )
                    coarse_list = get_coarse_label_list(subtask)
                    score_dict = compute_fine_vs_coarse_metrics(
                        eval_result["y_true"], eval_result["y_pred_bin"], list(mlb.classes_), coarse_list
                    )

                    all_rows.append(_summary_row(setup, encoder, subtask, train_str, domain, score_dict, eval_result))

        else:
            # Fail loudly instead of silently writing an empty summary file.
            raise ValueError(f"Unknown setup: {setup}")

    pd.DataFrame(all_rows, columns=SUMMARY_COLUMNS).to_csv(csv_path, index=False)
    print(f"✅ Saved: {csv_path}")


In [13]:

# Evaluation-oriented variant of the previous cell: the STL branch still
# trains in place, while the MTL branches build a fresh model and only load
# previously saved per-task checkpoints before scoring each domain.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PARAMS = {
    "learning_rate": 3e-5,
    "batch_size": 8,
    "epochs": 3,
    "threshold": 0.35,
    "max_len": 512
}

SETUPS = [
    {"setup": "mtl", "task": None, "encoder": "distilbert-base-uncased"},
]

TRAIN_SPLITS = [["CC"]]
EVAL_SPLITS = ["UA", "CC"]

SUMMARY_COLUMNS = [
    "setup", "encoder", "task", "train_domain", "eval_domain",
    "overall_macro", "overall_micro", "overall_exact",
    "macro_fine", "micro_fine", "macro_coarse", "micro_coarse"
]

for config in SETUPS:
    setup = config["setup"]
    task = config["task"]
    encoder = config["encoder"]
    tokenizer = AutoTokenizer.from_pretrained(encoder)

    setup_name = f"{setup}_{task or 'mtl'}_{encoder.replace('/', '-')}"
    csv_path = f"results_summary__{setup_name}.csv"
    all_rows = []

    for train_domains in TRAIN_SPLITS:
        train_str = "+".join(train_domains)

        if setup == "stl":
            df_train, df_val, df_test, y_train, y_val, y_test, mlb, text_col, label_col = prepare_data_STL_fine(
                task, train_domains, EVAL_SPLITS
            )
            train_dataset = MultiLabelDataset(df_train[text_col].tolist(), y_train, tokenizer, PARAMS["max_len"])
            val_dataset = MultiLabelDataset(df_val[text_col].tolist(), y_val, tokenizer, PARAMS["max_len"])
            train_loader = DataLoader(train_dataset, batch_size=PARAMS["batch_size"], shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=PARAMS["batch_size"])

            model = TransformerClassifier(encoder, len(mlb.classes_)).to(device)
            model = train_single_task_model(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                y_val=y_val,
                MODEL_PATH="tmp.pt",
                LEARNING_RATE=PARAMS["learning_rate"],
                EPOCHS=PARAMS["epochs"],
                device=device
            )

            # Independent of the evaluation domain, so computed once.
            known_labels = set(mlb.classes_)
            coarse_list = get_coarse_label_list(task)

            for domain in EVAL_SPLITS:
                df_eval = df_test[df_test["Domain"] == domain].copy()
                # Drop labels the binarizer was not fitted on before transforming.
                df_eval[label_col] = df_eval[label_col].apply(lambda labels: [l for l in labels if l in known_labels])
                y_eval = mlb.transform(df_eval[label_col])
                test_loader = DataLoader(
                    MultiLabelDataset(df_eval[text_col].tolist(), y_eval, tokenizer, PARAMS["max_len"]),
                    batch_size=PARAMS["batch_size"]
                )
                eval_result = evaluate_flat_custom(model, test_loader, df_eval, mlb, device, threshold=PARAMS["threshold"])
                score_dict = compute_fine_vs_coarse_metrics(eval_result["y_true"], eval_result["y_pred_bin"], list(mlb.classes_), coarse_list)

                all_rows.append({
                    "setup": setup,
                    "encoder": encoder,
                    "task": task,
                    "train_domain": train_str,
                    "eval_domain": domain,
                    "overall_macro": round((score_dict["macro_fine"] + score_dict["macro_coarse"]) / 2, 4),
                    "overall_micro": round((score_dict["micro_fine"] + score_dict["micro_coarse"]) / 2, 4),
                    "overall_exact": round(eval_result["exact"], 4),
                    "macro_fine": round(score_dict["macro_fine"], 4),
                    "micro_fine": round(score_dict["micro_fine"], 4),
                    "macro_coarse": round(score_dict["macro_coarse"], 4),
                    "micro_coarse": round(score_dict["micro_coarse"], 4),
                })

        elif setup in ["mtl", "mtl_adapter"]:
            (
                df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                train_loader_s1, val_loader_s1, test_loader_s1,
                train_loader_s2, val_loader_s2, test_loader_s2,
                num_classes_dict
            ) = prepare_data_MTL_fine_flat(
                TASK="multi_task",
                model_name=encoder,
                max_len=PARAMS["max_len"],
                batch_size=PARAMS["batch_size"],
                train_domains=train_domains,
                test_domains=EVAL_SPLITS,
                train_languages=["ALL"]
            )

            task_classes = {
                "entity_framing": y_train_s1.shape[1],
                "narrative_classification": y_train_s2.shape[1]
            }
            if setup == "mtl":
                model = MultiTaskTransformer(encoder, task_classes).to(device)
            else:
                model = AdapterMultiTaskTransformer(model_name=encoder, num_classes_dict=task_classes).to(device)

            for subtask in ["entity_framing", "narrative_classification"]:
                model_path = f"{subtask}_MTL_{'-'.join(train_domains)}_to_{'-'.join(EVAL_SPLITS)}.pt"
                if os.path.exists(model_path):
                    # map_location lets a checkpoint saved on GPU load on a
                    # CPU-only host (the device above may fall back to CPU).
                    # strict=False tolerates key mismatches between checkpoint
                    # and model; mismatched keys are skipped without an error.
                    model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
                    print(f"✅ Loaded model for {subtask}")
                else:
                    # NOTE(review): evaluation still runs with whatever weights
                    # `model` currently holds. If no checkpoint has been loaded
                    # yet, the rows below describe a model that was never
                    # trained in this cell - treat them with care.
                    print(f"⚠️ Missing checkpoint: {model_path}")

                is_entity_task = subtask == "entity_framing"
                df_test = df_test_s1 if is_entity_task else df_test_s2
                mlb = mlb_s1 if is_entity_task else mlb_s2
                text_key = "Input_Text" if is_entity_task else "Translated_Text"
                label_key = "Label"

                # Per-subtask invariants hoisted out of the domain loop.
                known_labels = set(mlb.classes_)
                coarse_list = get_coarse_label_list(subtask)

                for domain in EVAL_SPLITS:
                    df_eval = df_test[df_test["Domain"] == domain].copy()
                    # Keep only labels known to the binarizer; non-list cells become [].
                    df_eval[label_key] = df_eval[label_key].apply(lambda labels: [l for l in labels if l in known_labels] if isinstance(labels, list) else [])
                    y_eval = mlb.transform(df_eval[label_key])

                    test_loader = DataLoader(
                        MultiTaskDataset(df_eval[text_key].tolist(), {subtask: y_eval}, tokenizer, PARAMS["max_len"]),
                        batch_size=PARAMS["batch_size"]
                    )

                    eval_result = evaluate_flat_custom(
                        model=model,
                        loader=test_loader,
                        df_source=df_eval,
                        mlb=mlb,
                        device=device,
                        threshold=PARAMS["threshold"],
                        task=subtask
                    )
                    score_dict = compute_fine_vs_coarse_metrics(eval_result["y_true"], eval_result["y_pred_bin"], list(mlb.classes_), coarse_list)

                    all_rows.append({
                        "setup": setup,
                        "encoder": encoder,
                        "task": subtask,
                        "train_domain": train_str,
                        "eval_domain": domain,
                        "overall_macro": round((score_dict["macro_fine"] + score_dict["macro_coarse"]) / 2, 4),
                        "overall_micro": round((score_dict["micro_fine"] + score_dict["micro_coarse"]) / 2, 4),
                        "overall_exact": round(eval_result["exact"], 4),
                        "macro_fine": round(score_dict["macro_fine"], 4),
                        "micro_fine": round(score_dict["micro_fine"], 4),
                        "macro_coarse": round(score_dict["macro_coarse"], 4),
                        "micro_coarse": round(score_dict["micro_coarse"], 4),
                    })

        else:
            # Fail loudly instead of silently writing an empty summary file.
            raise ValueError(f"Unknown setup: {setup}")

    pd.DataFrame(all_rows, columns=SUMMARY_COLUMNS).to_csv(csv_path, index=False)
    print(f"✅ Saved: {csv_path}")


## Control Panel

In [31]:
# Control panel: every setting the dataset-assembly and training cells below read.
# Task for the pipeline below. One of:
#   "narrative_classification", "entity_framing", "multi_task", "multi_task_adapter"
TASK = "narrative_classification"

# Domains used for training and testing. Valid entries are "UA" and "CC";
# a list may contain one or both.
TRAIN_DOMAIN = ["UA"]
TEST_DOMAIN = ["UA", "CC"] # The test data comes from a separate dataset.
# The test data is always the same regardless of the domain we choose to train on. This is for consistency.

# Languages used for training and testing: "ALL", "EN", "HI", "BG", "RU", "PT".
# NOTE(review): TEST_LANGUAGES is not referenced by any cell visible in this
# part of the notebook - confirm whether it is used further down.
TRAIN_LANGUAGES = ["ALL"]
TEST_LANGUAGES = ["ALL"]

# Taxonomy depth: which label granularity the data loaders prepare.
TAXONOMY_DEPTH = "FINE" # "COARSE" or "FINE"

# Classifier complexity: selects the flat or the hierarchical data/training path.
CLASSIFIER_COMPLEXITY = "FLAT" # "FLAT" or "HIERARCHICAL"

# Training hyperparameters.
MODEL_NAME = "roberta-base" # alternatives used in this notebook: "distilbert-base-uncased"
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 3e-5
MODEL_PATH = f"{TASK}_{'-'.join(TRAIN_DOMAIN)}_to_{'-'.join(TEST_DOMAIN)}.pt" # checkpoint path handed to the single-task training functions

# AutoTokenizer resolves the tokenizer class from MODEL_NAME, replacing the
# model-specific RobertaTokenizer / DebertaV2Tokenizer calls used previously.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# Debug mode -- reduced samples.
# NOTE(review): no cell visible in this part of the notebook reads DEBUG_MODE.
DEBUG_MODE = False

In [34]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Utils: Assemble Dataset

In [37]:
# Assemble the data for the configuration chosen in the control panel.
#
# Single-task runs define: df_train/df_val/df_test, y_train/y_val/y_test, mlb,
# TEXT_COL, LABEL_COL, the three DataLoaders and num_classes (plus
# child_to_parent / label_to_index for the hierarchical classifier).
# Multi-task runs define the *_s1 / *_s2 frames, label matrices, binarizers,
# loaders and num_classes_dict (plus child_to_parent_map / label_to_index_map
# for the hierarchical classifier).

# Validate the switches first. Without this, an unrecognised value falls
# through every branch and later code either raises NameError or silently
# reuses variables left over from an earlier run of this cell.
if TAXONOMY_DEPTH not in ("FINE", "COARSE"):
    raise ValueError(f"Unknown TAXONOMY_DEPTH: {TAXONOMY_DEPTH!r} (expected 'FINE' or 'COARSE')")
if TAXONOMY_DEPTH == "FINE" and CLASSIFIER_COMPLEXITY not in ("FLAT", "HIERARCHICAL"):
    raise ValueError(
        f"Unknown CLASSIFIER_COMPLEXITY: {CLASSIFIER_COMPLEXITY!r} (expected 'FLAT' or 'HIERARCHICAL')"
    )

if TASK != "multi_task" and TASK != "multi_task_adapter":
    if TAXONOMY_DEPTH == 'FINE':
        if CLASSIFIER_COMPLEXITY == 'FLAT':
            df_train, df_val, df_test, y_train, y_val, y_test, mlb, TEXT_COL, LABEL_COL = prepare_data_STL_fine(
                TASK,
                TRAIN_DOMAIN,
                TEST_DOMAIN,
            )
        else:  # 'HIERARCHICAL' (validated above)
            df_train, df_val, df_test, y_train, y_val, y_test, mlb, TEXT_COL, LABEL_COL, child_to_parent, label_to_index = prepare_data_STL_hierarchical(
                TASK,
                TRAIN_DOMAIN,
                TEST_DOMAIN,
            )
    else:  # 'COARSE' (validated above); CLASSIFIER_COMPLEXITY is not consulted here
        df_train, df_val, df_test, y_train, y_val, y_test, mlb, TEXT_COL, LABEL_COL = prepare_data_STL_coarse(
            TASK,
            TRAIN_DOMAIN,
            TEST_DOMAIN,
        )

    # Wrap each split for the model; only the training loader is shuffled.
    train_dataset = MultiLabelDataset(df_train[TEXT_COL].tolist(), y_train, tokenizer, MAX_LEN)
    val_dataset = MultiLabelDataset(df_val[TEXT_COL].tolist(), y_val, tokenizer, MAX_LEN)
    test_dataset = MultiLabelDataset(df_test[TEXT_COL].tolist(), y_test, tokenizer, MAX_LEN)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    num_classes = len(mlb.classes_)

else:
    # TASK is "multi_task" or "multi_task_adapter". All three MTL loaders take
    # the same keyword arguments, so they are built once and shared.
    mtl_kwargs = dict(
        train_domains=TRAIN_DOMAIN,
        test_domains=TEST_DOMAIN,
        train_languages=TRAIN_LANGUAGES,
        model_name=MODEL_NAME,
        max_len=MAX_LEN,
        batch_size=BATCH_SIZE,
    )

    if TAXONOMY_DEPTH == 'FINE':

        if CLASSIFIER_COMPLEXITY == 'FLAT':
            (
                df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                train_loader_s1, val_loader_s1, test_loader_s1,
                train_loader_s2, val_loader_s2, test_loader_s2,
                num_classes_dict
            ) = prepare_data_MTL_fine_flat(TASK, **mtl_kwargs)

        else:  # 'HIERARCHICAL' (validated above)
            (
                df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                train_loader_s1, val_loader_s1, test_loader_s1,
                train_loader_s2, val_loader_s2, test_loader_s2,
                num_classes_dict,
                child_to_parent_map,
                label_to_index_map
            ) = prepare_data_MTL_hierarchical(TASK, **mtl_kwargs)

    else:  # 'COARSE' (validated above)
        (
            df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
            df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
            train_loader_s1, val_loader_s1, test_loader_s1,
            train_loader_s2, val_loader_s2, test_loader_s2,
            num_classes_dict
        ) = prepare_data_MTL_coarse(TASK, **mtl_kwargs)




## Training Loop

In [None]:
# Train the model selected in the control panel and leave the result in
# `trained_model`, reloaded from the checkpoint(s) written during training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if TASK != "multi_task" and TASK != "multi_task_adapter":
    print("\n>>> Running Single-Task (no adapter) Model <<<")
    model = TransformerClassifier(MODEL_NAME, num_classes).to(device)

    if CLASSIFIER_COMPLEXITY == 'FLAT':
        trained_model = train_single_task_model(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            y_val=y_val,
            MODEL_PATH=MODEL_PATH,
            LEARNING_RATE=LEARNING_RATE,
            EPOCHS=EPOCHS,
            device=device,
            predict_proba=eval_util.predict_proba,
            evaluate_threshold_sweep=eval_util.evaluate_threshold_sweep
        )

    elif CLASSIFIER_COMPLEXITY == 'HIERARCHICAL':
        trained_model = train_hierarchical_classifier(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            y_val=y_val,
            MODEL_PATH=MODEL_PATH,
            child_to_parent=child_to_parent,
            label_to_index=label_to_index,
            predict_proba=eval_util.predict_proba,
            evaluate_threshold_sweep=eval_util.evaluate_threshold_sweep,
            LEARNING_RATE=LEARNING_RATE,
            EPOCHS=EPOCHS
        )

    else:
        # Previously an unknown value skipped training without any message.
        raise ValueError(f"Unknown CLASSIFIER_COMPLEXITY: {CLASSIFIER_COMPLEXITY!r}")

    # Restore the checkpoint written to MODEL_PATH during training.
    # map_location lets a checkpoint saved on GPU load on a CPU-only host.
    trained_model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    trained_model.to(device)

else:
    # TASK is "multi_task" or "multi_task_adapter"; both share everything
    # except the model class.
    use_adapter = TASK == "multi_task_adapter"
    if use_adapter:
        print("\n>>> Running Multi-Task Adapter Model <<<")
    else:
        print("\n>>> Running Multi-Task (no adapter) Model <<<")

    # Number of output labels per task, taken from the label-matrix widths.
    task_classes = {
        "narrative_classification": y_train_s2.shape[1],
        "entity_framing": y_train_s1.shape[1]
    }

    if use_adapter:
        model = AdapterMultiTaskTransformer(
            model_name=MODEL_NAME,
            num_classes_dict=task_classes,
            adapter_dim=128
        ).to(device)
    else:
        model = MultiTaskTransformer(MODEL_NAME, task_classes).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.BCEWithLogitsLoss()

    mtl_loaders = {
        "narrative_classification": train_loader_s2,
        "entity_framing": train_loader_s1
    }
    mtl_val_data = {
        "narrative_classification": (val_loader_s2, df_val_s2, y_val_s2, mlb_s2),
        "entity_framing": (val_loader_s1, df_val_s1, y_val_s1, mlb_s1)
    }

    if CLASSIFIER_COMPLEXITY == 'FLAT':
        train_mtl_flat(
            model=model,
            loaders=mtl_loaders,
            val_data=mtl_val_data,
            mlbs={
                "narrative_classification": mlb_s2,
                "entity_framing": mlb_s1
            },
            optimizer=optimizer,
            criterion=criterion,
            device=device,
            epochs=EPOCHS,
            train_domain=TRAIN_DOMAIN,
            test_domain=TEST_DOMAIN
        )

    elif CLASSIFIER_COMPLEXITY == 'HIERARCHICAL' and not use_adapter:
        train_mtl_hierarchical(
            model=model,
            loaders=mtl_loaders,
            val_data=mtl_val_data,
            child_to_parent_map=child_to_parent_map,
            label_to_index_map=label_to_index_map,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
            epochs=EPOCHS,
            train_domain=TRAIN_DOMAIN,
            test_domain=TEST_DOMAIN
        )

    else:
        # Previously these combinations (e.g. adapter + HIERARCHICAL) skipped
        # training and then loaded whatever checkpoint files already existed
        # on disk, which could silently evaluate a model from an earlier run.
        raise ValueError(
            f"Unsupported combination: TASK={TASK!r}, CLASSIFIER_COMPLEXITY={CLASSIFIER_COMPLEXITY!r}"
        )

    # Re-load best saved model per task.
    # NOTE(review): both checkpoints are loaded into the same model with
    # strict=False, so any parameter present in both files ends up with the
    # value from the checkpoint loaded last - confirm this is intended.
    for subtask in ("entity_framing", "narrative_classification"):
        checkpoint_path = f"{subtask}_MTL_{'-'.join(TRAIN_DOMAIN)}_to_{'-'.join(TEST_DOMAIN)}.pt"
        model.load_state_dict(torch.load(checkpoint_path, map_location=device), strict=False)
    trained_model = model


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

>>> Running Single-Task (no adapter) Model <<<
Epoch 1:  38%|███▊      | 57/152 [00:40<01:07,  1.40it/s]

In [25]:

# Stand-alone reload of the per-task checkpoints (same files and same order as
# the reload at the end of the training cell).
# Hidden-state warning: this cell only works when `model`, `device`,
# TRAIN_DOMAIN and TEST_DOMAIN already exist in the kernel, i.e. after the
# training cell has built a model of the matching architecture.
# map_location=device remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU can still be loaded in a CPU-only session.
# NOTE(review): strict=False ignores missing/unexpected keys without raising;
# confirm that is intended.
_ckpt_tag = f"{'-'.join(TRAIN_DOMAIN)}_to_{'-'.join(TEST_DOMAIN)}"
for _ckpt_task in ("entity_framing", "narrative_classification"):
    model.load_state_dict(
        torch.load(f"{_ckpt_task}_MTL_{_ckpt_tag}.pt", map_location=device),
        strict=False,
    )
trained_model = model

## Evaluation

In [28]:
# ==============================================================
# EVALUATION -- dispatch on TASK, then on CLASSIFIER_COMPLEXITY
# ==============================================================
# TASK values "multi_task" and "multi_task_adapter" go through the multi-task
# helpers and use the *_s1 / *_s2 test objects; every other TASK value is
# handled as a single-task run and uses val_loader / test_loader.
# A CLASSIFIER_COMPLEXITY other than 'FLAT' or 'HIERARCHICAL' only prints the
# banner; nothing is evaluated.
if TASK in ("multi_task", "multi_task_adapter"):
    # ---------------- Multi-task ----------------
    print(f"\nEvaluating Multi-Task Model ({TASK})")

    if CLASSIFIER_COMPLEXITY == 'HIERARCHICAL':
        # Return value is not stored, matching the rest of this notebook.
        eval_util.evaluate_mtl_hierarchical_all_tasks(
            model=trained_model,
            test_loaders=dict(
                narrative_classification=test_loader_s2,
                entity_framing=test_loader_s1,
            ),
            df_tests=dict(
                narrative_classification=df_test_s2,
                entity_framing=df_test_s1,
            ),
            y_tests=dict(
                narrative_classification=y_test_s2,
                entity_framing=y_test_s1,
            ),
            mlbs=dict(
                narrative_classification=mlb_s2,
                entity_framing=mlb_s1,
            ),
            child_to_parent_map=child_to_parent_map,
            label_to_index_map=label_to_index_map,
            device=device,
        )

    elif CLASSIFIER_COMPLEXITY == 'FLAT':
        # Per-task inputs, each keyed by task name.
        task_loaders = dict(
            narrative_classification=test_loader_s2,
            entity_framing=test_loader_s1,
        )
        task_dfs = dict(
            narrative_classification=df_test_s2,
            entity_framing=df_test_s1,
        )
        task_targets = dict(
            narrative_classification=y_test_s2,
            entity_framing=y_test_s1,
        )
        task_mlbs = dict(
            narrative_classification=mlb_s2,
            entity_framing=mlb_s1,
        )

        # load_from_disk=False: the in-memory `trained_model` is passed in.
        # NOTE(review): domain_list receives TRAIN_DOMAIN although the loaders
        # and frames are the test splits -- confirm against
        # evaluate_mtl_all_tasks whether TEST_DOMAIN is expected here.
        results_mtl = eval_util.evaluate_mtl_all_tasks(
            model=trained_model,
            task_loaders=task_loaders,
            task_dfs=task_dfs,
            task_targets=task_targets,
            task_mlbs=task_mlbs,
            domain_list=TRAIN_DOMAIN,
            device=device,
            load_from_disk=False,
        )

else:
    # ---------------- Single task ----------------
    print(f"\nEvaluating Single-Task Model ({TASK})")

    if CLASSIFIER_COMPLEXITY == 'HIERARCHICAL':
        # Validation and test inputs are both handed over, together with the
        # child->parent and label->index mappings.
        results_hierarchical = eval_util.evaluate_and_compare_hierarchical(
            model=trained_model,
            val_loader=val_loader,
            val_df=df_val.reset_index(drop=True),
            val_targets=y_val,
            test_loader=test_loader,
            test_df=df_test.reset_index(drop=True),
            test_targets=y_test,
            mlb=mlb,
            device=device,
            child_to_parent=child_to_parent,
            label_to_index=label_to_index,
        )

    elif CLASSIFIER_COMPLEXITY == 'FLAT':
        # Per-domain evaluation; receives both the validation and test inputs.
        # Each frame is passed with a fresh 0..n-1 index.
        results_domain = eval_util.evaluate_per_domain_flat(
            trained_model,
            val_loader,
            df_val.reset_index(drop=True),
            test_loader,
            df_test.reset_index(drop=True),
            mlb,
            device=device,
        )

        # Per-class evaluation on the test inputs only, labelled "TEST".
        results_class = eval_util.evaluate_per_class_flat(
            trained_model,
            test_loader,
            df_test.reset_index(drop=True),
            mlb,
            device=device,
            label="TEST",
        )



Evaluating Multi-Task Model (multi_task)

--- Task: NARRATIVE_CLASSIFICATION ---
Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:04<00:00,  5.08it/s]

TEST (narrative_classification) [Threshold=0.35]
Macro F1: 0.173
Micro F1: 0.411
Exact Match: 0.062

----------------------------
Per-Domain Breakdown
----------------------------

Domain: CC
Macro F1: 0.035
Micro F1: 0.367
Exact Match: 0.043

Domain: UA
Macro F1: 0.140
Micro F1: 0.429
Exact Match: 0.074

--- Task: ENTITY_FRAMING ---
Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:11<00:00,  4.84it/s]
TEST (entity_framing) [Threshold=0.35]
Macro F1: 0.238
Micro F1: 0.575
Exact Match: 0.342

----------------------------
Per-Domain Breakdown
----------------------------

Domain: CC
Macro F1: 0.168
Micro F1: 0.793
Exact Match: 0.626

Domain: UA
Macro F1: 0.221
Micro F1: 0.521
Exact Match: 0.269



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=49d39932-ba1f-4621-a036-ab99ade88496' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>