In [3]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
# Work from the thesis repository on the mounted Drive: the relative result
# folders and the `sys.path.append('.')` used for the custom modules both
# resolve against the current working directory.
os.chdir("/content/drive/MyDrive/Thesis Repository")

In [5]:
pip install scikit-multilearn hf_xet

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl.metadata (6.0 kB)
Collecting hf_xet
  Downloading hf_xet-1.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hf_xet-1.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn, hf_xet
Successfully installed hf_xet-1.1.2 scikit-multilearn-0.2.0


In [6]:
# RUN THESE IMPORTS FIRST
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaV2Tokenizer, AutoModel, RobertaTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report
import numpy as np
import shap
#from captum.attr import IntegratedGradients
from transformers import AutoTokenizer, Trainer, TrainingArguments
import torch.nn.functional as F
import hf_xet
import itertools
#import optuna
import sys
import importlib # !pip install importlib
sys.path.append('.')


### Custom built modules ###
import importlib

import data_loader_STL
importlib.reload(data_loader_STL)
from data_loader_STL import prepare_data_STL_fine, prepare_data_STL_hierarchical, prepare_data_STL_coarse

import single_task
importlib.reload(single_task)
from single_task import TransformerClassifier, MultiLabelDataset, train_single_task_model, train_hierarchical_classifier

import multi_task
importlib.reload(multi_task)
from multi_task import MultiTaskTransformer, train_mtl_flat, train_mtl_hierarchical, apply_hierarchical_constraints_mtl, hierarchical_loss_mtl, AdapterMultiTaskTransformer

import data_loader_MTL
importlib.reload(data_loader_MTL)
from data_loader_MTL import prepare_data_MTL_fine_flat, prepare_data_MTL_hierarchical, prepare_data_MTL_coarse, MultiTaskDataset, prepare_data_MTL_mixed

import evaluation_utils as eval_util
importlib.reload(eval_util)
from evaluation_utils import evaluate_flat, evaluate_hierarchy, evaluate_mtl_all_tasks, evaluate_mtl_task, evaluate_per_class_flat, evaluate_per_domain_flat, predict_proba, evaluate_threshold_sweep, evaluate_mtl_hierarchical_task, evaluate_mtl_hierarchical_all_tasks, evaluate_flat_custom, compute_fine_vs_coarse_metrics, get_coarse_label_list


# Ablation helper: mixed-granularity data preparation for MTL

In [7]:
def prepare_data_MTL_mixed(
    task,
    train_domains,
    test_domains,
    train_languages,
    model_name,
    max_len,
    batch_size,
    granularity_s1="coarse",
    granularity_s2="fine"
):
    """Prepare multi-task data where each subtask picks its own label granularity.

    Subtask 1 and subtask 2 are loaded with ``prepare_data_MTL_fine_flat`` or
    ``prepare_data_MTL_coarse`` depending on ``granularity_s1`` /
    ``granularity_s2``. Both loaders are expected to return the same 21-element
    tuple; positions 0-6 and 14-16 belong to subtask 1, positions 7-13 and
    17-19 to subtask 2 (the last element is ignored and rebuilt here).

    NOTE: this notebook-level definition shadows the ``prepare_data_MTL_mixed``
    imported from ``data_loader_MTL`` in the import cell.

    Args:
        task: Forwarded unchanged as the first positional argument of the loader.
        train_domains: Training domains, forwarded to the loader for BOTH subtasks.
        test_domains: Evaluation domains, forwarded to the loader.
        train_languages: Training languages, forwarded to the loader.
        model_name: Model identifier, forwarded to the loader.
        max_len: Maximum sequence length, forwarded to the loader.
        batch_size: Batch size, forwarded to the loader.
        granularity_s1: ``"fine"`` or ``"coarse"`` for subtask 1.
        granularity_s2: ``"fine"`` or ``"coarse"`` for subtask 2.

    Returns:
        A 21-element tuple: the seven subtask-1 items (train/val/test frames,
        train/val/test targets, label binarizer), the seven subtask-2 items,
        the three subtask-1 loaders, the three subtask-2 loaders, and
        ``{"task1": n_classes_s1, "task2": n_classes_s2}``.

    Raises:
        ValueError: If either granularity is not ``"fine"`` or ``"coarse"``.
    """
    valid_granularities = ("fine", "coarse")
    # Validate both arguments before any (expensive) loading starts. Without
    # this, an unknown value silently skipped every branch and only surfaced
    # later as an unbound-variable error.
    for arg_name, value in (
        ("granularity_s1", granularity_s1),
        ("granularity_s2", granularity_s2),
    ):
        if value not in valid_granularities:
            raise ValueError(
                f"{arg_name} must be one of {valid_granularities}, got {value!r}"
            )

    bundles = {}

    def _load(granularity):
        # Each granularity is loaded at most once: when both subtasks use the
        # same granularity, a single loader call already contains both halves.
        if granularity not in bundles:
            loader_fn = (
                prepare_data_MTL_fine_flat
                if granularity == "fine"
                else prepare_data_MTL_coarse
            )
            bundles[granularity] = loader_fn(
                task,
                train_domains=train_domains,
                test_domains=test_domains,
                train_languages=train_languages,
                model_name=model_name,
                max_len=max_len,
                batch_size=batch_size
            )
        return bundles[granularity]

    # --- Task 1: items 0-6 and loaders 14-16 ---
    bundle_s1 = _load(granularity_s1)
    (
        df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1
    ) = bundle_s1[0:7]
    train_loader_s1, val_loader_s1, test_loader_s1 = bundle_s1[14:17]

    # --- Task 2: items 7-13 and loaders 17-19 ---
    bundle_s2 = _load(granularity_s2)
    (
        df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2
    ) = bundle_s2[7:14]
    train_loader_s2, val_loader_s2, test_loader_s2 = bundle_s2[17:20]

    # --- Return consistent structure ---
    num_classes_dict = {
        "task1": len(mlb_s1.classes_),
        "task2": len(mlb_s2.classes_)
    }

    return (
        df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
        df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
        train_loader_s1, val_loader_s1, test_loader_s1,
        train_loader_s2, val_loader_s2, test_loader_s2,
        num_classes_dict
    )


# Ablation - Granularity - MTL

In [None]:
import itertools
import os
import random

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer

# === Configuration ===
MODEL_NAME = "distilbert-base-uncased"  # or "roberta-base", "facebook/roberta-base"
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 3e-5
TRAIN_LANGUAGES = ["ALL"]

# Single source of truth for the result folder. Previously the folder that was
# created and the folder the CSV was written to had different names, so the
# save could fail or overwrite the results of earlier seeds.
GRANULARITY_OUTPUT_DIR = "ablation_results_final_seeds"

granularity_options = ['fine','coarse'] #  38, 39, 40, 54, 55, 71, 72, 73, 74, 75
seeds = [71, 72, 73, 74, 75] # 42, 43, 44
# Each pair is (entity-framing granularity, narrative-classification granularity).
granularity_configs = [('coarse', 'coarse')]


train_domains_all = [["UA"], ["CC"], ["UA", "CC"]]
test_domains = ["UA", "CC"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

os.makedirs(GRANULARITY_OUTPUT_DIR, exist_ok=True)

# === Experiment Loop ===
for gran_s1, gran_s2 in granularity_configs:
    config_id = f"ef-{gran_s1}_nc-{gran_s2}"
    out_path = os.path.join(GRANULARITY_OUTPUT_DIR, f"ablation_{config_id}.csv")
    results = []

    for train_domain in train_domains_all:
        for task_type in ["multi_task", "multi_task_adapter"]:
            for seed in seeds:
                # Seed every RNG that may be involved, not only torch.
                random.seed(seed)
                np.random.seed(seed)
                torch.manual_seed(seed)

                # Set model path if needed for saving
                model_path = f"{task_type}_{'-'.join(train_domain)}_to_{'-'.join(test_domains)}_ef-{gran_s1}_nc-{gran_s2}_seed{seed}.pt"

                # --- Data Prep ---
                # Suffix s1 is the entity-framing task, s2 the narrative task.
                (
                    df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                    df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                    train_loader_s1, val_loader_s1, test_loader_s1,
                    train_loader_s2, val_loader_s2, test_loader_s2,
                    num_classes_dict
                ) = prepare_data_MTL_mixed(
                    task=task_type,
                    train_domains=train_domain,
                    test_domains=test_domains,
                    train_languages=TRAIN_LANGUAGES,
                    model_name=MODEL_NAME,
                    max_len=MAX_LEN,
                    batch_size=BATCH_SIZE,
                    granularity_s1=gran_s1,
                    granularity_s2=gran_s2
                )

                # Output width of each task head = number of label columns.
                task_classes = {
                    "narrative_classification": y_train_s2.shape[1],
                    "entity_framing": y_train_s1.shape[1]
                }

                # --- Model ---
                if task_type == "multi_task":
                    model = MultiTaskTransformer(MODEL_NAME, task_classes).to(device)
                else:
                    model = AdapterMultiTaskTransformer(
                        model_name=MODEL_NAME,
                        num_classes_dict=task_classes,
                        adapter_dim=128
                    ).to(device)

                optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
                criterion = torch.nn.BCEWithLogitsLoss()

                # --- Train ---
                train_mtl_flat(
                    model=model,
                    loaders={
                        "narrative_classification": train_loader_s2,
                        "entity_framing": train_loader_s1
                    },
                    val_data={
                        "narrative_classification": (val_loader_s2, df_val_s2, y_val_s2, mlb_s2),
                        "entity_framing": (val_loader_s1, df_val_s1, y_val_s1, mlb_s1)
                    },
                    mlbs={
                        "narrative_classification": mlb_s2,
                        "entity_framing": mlb_s1
                    },
                    optimizer=optimizer,
                    criterion=criterion,
                    device=device,
                    epochs=EPOCHS,
                    train_domain=train_domain,
                    test_domain=test_domains
                )

                # torch.save(model.state_dict(), model_path)  # Optionally save

                # --- Evaluate ---
                # NOTE(review): domain_list receives the TRAIN domains, yet the
                # metrics below are read for both "UA" and "CC". Confirm that
                # evaluate_mtl_all_tasks reports every test domain regardless
                # of domain_list; otherwise test_domains may be intended here.
                eval_results = evaluate_mtl_all_tasks(
                    model=model,
                    task_loaders={
                        "narrative_classification": test_loader_s2,
                        "entity_framing": test_loader_s1
                    },
                    task_dfs={
                        "narrative_classification": df_test_s2,
                        "entity_framing": df_test_s1
                    },
                    task_targets={
                        "narrative_classification": y_test_s2,
                        "entity_framing": y_test_s1
                    },
                    task_mlbs={
                        "narrative_classification": mlb_s2,
                        "entity_framing": mlb_s1
                    },
                    domain_list=train_domain,
                    device=device
                )

                ef = eval_results["entity_framing"]
                nc = eval_results["narrative_classification"]

                row = {
                    "task_type": task_type,
                    "seed": seed,
                    "train_domain": "-".join(train_domain),
                    "ef_granularity": gran_s1,
                    "nc_granularity": gran_s2,
                }
                # Column order: all EF metrics (UA then CC), then all NC metrics,
                # e.g. ef_micro_ua, ef_macro_ua, ef_exact_ua, ef_micro_cc, ...
                for prefix, per_domain in (("ef", ef), ("nc", nc)):
                    for domain in ("UA", "CC"):
                        for metric in ("micro", "macro", "exact"):
                            row[f"{prefix}_{metric}_{domain.lower()}"] = per_domain[domain][metric]
                results.append(row)

                # Checkpoint after every run so an interrupted sweep keeps the
                # rows that were already computed.
                pd.DataFrame(results).to_csv(out_path, index=False)

    # --- Save CSV ---
    df_out = pd.DataFrame(results)
    df_out.to_csv(out_path, index=False)
    print(f" Saved: {out_path}")


KeyboardInterrupt: 

# Ablation - Domain/Task - MTL

In [None]:
import itertools
import os
import random

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer

# === Configuration ===
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 3e-5
TRAIN_LANGUAGES = ["ALL"]
TEST_LANGUAGES = ["ALL"]
CROSS_DOMAIN_OUTPUT_DIR = "ablation_results_cross_domain_more_seeds"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
os.makedirs(CROSS_DOMAIN_OUTPUT_DIR, exist_ok=True)

# === Domain Configurations for Task Splits ===
domain_configs = [
    ("UA", "CC"),  # EF on UA, NC on CC
    ("CC", "UA")   # EF on CC, NC on UA
]
seeds = [71, 72, 73, 74, 75] # 42, 43, 44, 31, 32
eval_domains = ["UA", "CC"]  # evaluate on both

for ef_domain, nc_domain in domain_configs:
    config_id = f"EF-{ef_domain}_NC-{nc_domain}"
    out_path = os.path.join(CROSS_DOMAIN_OUTPUT_DIR, f"ablation_{config_id}.csv")
    results = []

    for task_type in ["multi_task", "multi_task_adapter"]:
        for seed in seeds:
            # Seed every RNG that may be involved, not only torch.
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

            # Set model path if you want to save
            model_path = f"{task_type}_EF-{ef_domain}_NC-{nc_domain}_seed{seed}.pt"

            # --- Prepare data ---
            # Each task must be trained on its OWN domain. A single call with
            # train_domains=[ef_domain, nc_domain] hands the same two-domain
            # list to both tasks, which makes the two configurations train on
            # identical data. So the data is prepared once per task and only
            # that task's slots are kept from each call.
            shared_prep_kwargs = dict(
                test_domains=eval_domains,
                train_languages=TRAIN_LANGUAGES,
                model_name=MODEL_NAME,
                max_len=MAX_LEN,
                batch_size=BATCH_SIZE
            )

            # Entity framing (suffix s1): trained on ef_domain only.
            (
                df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                _, _, _, _, _, _, _,
                train_loader_s1, val_loader_s1, test_loader_s1,
                _, _, _,
                _
            ) = prepare_data_MTL_fine_flat(
                task_type,
                train_domains=[ef_domain],
                **shared_prep_kwargs
            )

            # Narrative classification (suffix s2): trained on nc_domain only.
            (
                _, _, _, _, _, _, _,
                df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                _, _, _,
                train_loader_s2, val_loader_s2, test_loader_s2,
                _
            ) = prepare_data_MTL_fine_flat(
                task_type,
                train_domains=[nc_domain],
                **shared_prep_kwargs
            )

            num_classes_dict = {
                "task1": len(mlb_s1.classes_),
                "task2": len(mlb_s2.classes_)
            }

            # Output width of each task head = number of label columns.
            task_classes = {
                "entity_framing": y_train_s1.shape[1],
                "narrative_classification": y_train_s2.shape[1]
            }

            # --- Model ---
            if task_type == "multi_task":
                model = MultiTaskTransformer(MODEL_NAME, task_classes).to(device)
            else:
                model = AdapterMultiTaskTransformer(
                    model_name=MODEL_NAME,
                    num_classes_dict=task_classes,
                    adapter_dim=128
                ).to(device)

            optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
            criterion = torch.nn.BCEWithLogitsLoss()

            # --- Train ---
            train_mtl_flat(
                model=model,
                loaders={
                    "narrative_classification": train_loader_s2,
                    "entity_framing": train_loader_s1
                },
                val_data={
                    "narrative_classification": (val_loader_s2, df_val_s2, y_val_s2, mlb_s2),
                    "entity_framing": (val_loader_s1, df_val_s1, y_val_s1, mlb_s1)
                },
                mlbs={
                    "narrative_classification": mlb_s2,
                    "entity_framing": mlb_s1
                },
                optimizer=optimizer,
                criterion=criterion,
                device=device,
                epochs=EPOCHS,
                train_domain=[ef_domain, nc_domain],
                test_domain=eval_domains
            )

            # --- Evaluate ---
            eval_results = evaluate_mtl_all_tasks(
                model=model,
                task_loaders={
                    "narrative_classification": test_loader_s2,
                    "entity_framing": test_loader_s1
                },
                task_dfs={
                    "narrative_classification": df_test_s2,
                    "entity_framing": df_test_s1
                },
                task_targets={
                    "narrative_classification": y_test_s2,
                    "entity_framing": y_test_s1
                },
                task_mlbs={
                    "narrative_classification": mlb_s2,
                    "entity_framing": mlb_s1
                },
                domain_list=[ef_domain, nc_domain],
                device=device
            )

            ef = eval_results["entity_framing"]
            nc = eval_results["narrative_classification"]

            row = {
                "task_type": task_type,
                "seed": seed,
                "ef_train_domain": ef_domain,
                "nc_train_domain": nc_domain,
            }
            # Column order: all EF metrics (UA then CC), then all NC metrics,
            # e.g. ef_micro_ua, ef_macro_ua, ef_exact_ua, ef_micro_cc, ...
            for prefix, per_domain in (("ef", ef), ("nc", nc)):
                for domain in ("UA", "CC"):
                    for metric in ("micro", "macro", "exact"):
                        row[f"{prefix}_{metric}_{domain.lower()}"] = per_domain[domain][metric]
            results.append(row)

            # Checkpoint after every run so an interrupted sweep keeps the
            # rows that were already computed.
            pd.DataFrame(results).to_csv(out_path, index=False)

    # --- Save CSV for this config ---
    df_out = pd.DataFrame(results)
    df_out.to_csv(out_path, index=False)
    print(f" Saved: {out_path}")


# STL - Coarse and Fine + RoBERTa + DistilBERT

In [None]:
import itertools
import os
import random

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# === Config ===
MODELS = ["distilbert-base-uncased"]
TASKS = ["entity_framing","narrative_classification"]
TAXONOMY_DEPTHS = ["fine"]
SEEDS = [71, 72, 73, 74, 75, 42, 43, 44] # 42,43,44, 31,32
TRAIN_DOMAINS = [["UA"],["UA","CC"],["CC"]]
TEST_DOMAIN = ["UA", "CC"]
TRAIN_LANGUAGES = ["ALL"]
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 3e-5
THRESHOLD = 0.35  # fixed decision threshold applied on the test set
STL_OUTPUT_DIR = "ablation_results_stl_augmented_new"
STL_OUTPUT_CSV = os.path.join(STL_OUTPUT_DIR, "stl_all_results_augmented_more_seeds.csv")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.makedirs(STL_OUTPUT_DIR, exist_ok=True)
results = []
# The tokenizer depends only on the model name, so load it once per model
# instead of once per (task, taxonomy, domain, seed) combination.
tokenizers = {}

for model_name, task, taxonomy, train_domain, seed in itertools.product(MODELS, TASKS, TAXONOMY_DEPTHS, TRAIN_DOMAINS, SEEDS):
    domain_str = "-".join(train_domain)
    print(f"\n--- Running STL: {task} | {taxonomy} | {model_name} | Seed={seed} | Train on {domain_str} ---")
    # Seed every RNG that may be involved, not only torch.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if model_name not in tokenizers:
        tokenizers[model_name] = AutoTokenizer.from_pretrained(model_name)
    tokenizer = tokenizers[model_name]

    # === Data Prep ===
    # Anything other than "fine" falls back to the coarse loader.
    prepare_stl = prepare_data_STL_fine if taxonomy == "fine" else prepare_data_STL_coarse
    df_train, df_val, df_test, y_train, y_val, y_test, mlb, TEXT_COL, LABEL_COL = prepare_stl(
        TASK=task,
        train_domains=train_domain,
        test_domains=TEST_DOMAIN,
        train_languages=TRAIN_LANGUAGES
    )

    # === Dataset Setup ===
    train_dataset = MultiLabelDataset(df_train[TEXT_COL].tolist(), y_train, tokenizer, MAX_LEN)
    val_dataset = MultiLabelDataset(df_val[TEXT_COL].tolist(), y_val, tokenizer, MAX_LEN)
    test_dataset = MultiLabelDataset(df_test[TEXT_COL].tolist(), y_test, tokenizer, MAX_LEN)

    # Only the training loader shuffles; validation and test keep dataset order.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # === Model Init ===
    num_classes = y_train.shape[1]
    model = TransformerClassifier(model_name, num_classes)

    # "/" in hub-style model names is replaced so the name is a valid file name.
    model_path = (
    f"model_{task}_{taxonomy}_trained_on_{domain_str}"
    f"_{model_name.replace('/', '-')}_seed{seed}.pt"
    )

    # === Train ===
    model = train_single_task_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        y_val=y_val,
        MODEL_PATH=model_path,
        LEARNING_RATE=LEARNING_RATE,
        EPOCHS=EPOCHS,
        device=device,
        predict_proba=predict_proba,
        evaluate_threshold_sweep=evaluate_threshold_sweep
    )

    # === Evaluate ===
    eval_result = evaluate_flat(
        model=model,
        loader=test_loader,
        df_source=df_test,
        mlb=mlb,
        device=device,
        label="TEST",
        threshold=THRESHOLD
    )

    # === Extract Per-Domain Metrics ===
    # One result row per evaluation domain.
    for domain in TEST_DOMAIN:
        domain_metrics = eval_result["per_domain"][domain]
        results.append({
            "model": model_name,
            "task": task,
            "taxonomy": taxonomy,
            "train_domain": domain_str,
            "seed": seed,
            "eval_domain": domain,
            "micro": domain_metrics["micro"],
            "macro": domain_metrics["macro"],
            "exact": domain_metrics["exact"]
        })

    # Checkpoint after every run so an interrupted sweep keeps the rows that
    # were already computed.
    pd.DataFrame(results).to_csv(STL_OUTPUT_CSV, index=False)

# === Save to CSV ===
df_out = pd.DataFrame(results)
df_out.to_csv(STL_OUTPUT_CSV, index=False)
print(STL_OUTPUT_CSV)



--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=71 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1221/1221 [02:03<00:00,  9.91it/s]



Epoch 1: Loss = 0.2241
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.257 | Micro F1: 0.445 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.251 | Micro F1: 0.518 | Exact Match: 0.017
Thresh 0.20 | Macro F1: 0.226 | Micro F1: 0.547 | Exact Match: 0.075
Thresh 0.25 | Macro F1: 0.210 | Micro F1: 0.556 | Exact Match: 0.144
Thresh 0.30 | Macro F1: 0.195 | Micro F1: 0.554 | Exact Match: 0.171
Thresh 0.35 | Macro F1: 0.170 | Micro F1: 0.537 | Exact Match: 0.176
Thresh 0.40 | Macro F1: 0.151 | Micro F1: 0.520 | Exact Match: 0.164
Thresh 0.45 | Macro F1: 0.136 | Micro F1: 0.497 | Exact Match: 0.143
Thresh 0.50 | Macro F1: 0.121 | Micro F1: 0.473 | Exact Match: 0.118
Thresh 0.55 | Macro F1: 0.107 | Micro F1: 0.444 | Exact Match: 0.091
Thresh 0.60 | Macro F1: 0.095 | Micro F1: 0.412 | Exact Match: 0.069
Thresh 0.65 | Macro F1: 0.081 | Micro F1: 0.375 | Exact Match: 0.046
Thresh 0.70 | Macro F1: 0.065 | Micro F1: 0.328 | Exact Match: 0.026
Thresh 0.75 | Macro F1: 0.050 | Micro F1: 0.281 | Exac

Epoch 2: 100%|██████████| 1221/1221 [02:03<00:00,  9.89it/s]



Epoch 2: Loss = 0.1676
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.423 | Micro F1: 0.587 | Exact Match: 0.056
Thresh 0.15 | Macro F1: 0.435 | Micro F1: 0.666 | Exact Match: 0.196
Thresh 0.20 | Macro F1: 0.439 | Micro F1: 0.708 | Exact Match: 0.299
Thresh 0.25 | Macro F1: 0.434 | Micro F1: 0.728 | Exact Match: 0.376
Thresh 0.30 | Macro F1: 0.415 | Micro F1: 0.738 | Exact Match: 0.420
Thresh 0.35 | Macro F1: 0.400 | Micro F1: 0.741 | Exact Match: 0.434
Thresh 0.40 | Macro F1: 0.380 | Micro F1: 0.737 | Exact Match: 0.426
Thresh 0.45 | Macro F1: 0.341 | Micro F1: 0.725 | Exact Match: 0.398
Thresh 0.50 | Macro F1: 0.313 | Micro F1: 0.714 | Exact Match: 0.369
Thresh 0.55 | Macro F1: 0.291 | Micro F1: 0.697 | Exact Match: 0.328
Thresh 0.60 | Macro F1: 0.271 | Micro F1: 0.676 | Exact Match: 0.290
Thresh 0.65 | Macro F1: 0.247 | Micro F1: 0.651 | Exact Match: 0.254
Thresh 0.70 | Macro F1: 0.220 | Micro F1: 0.616 | Exact Match: 0.211
Thresh 0.75 | Macro F1: 0.188 | Micro F1: 0.572 | Exac

Epoch 3: 100%|██████████| 1221/1221 [02:02<00:00,  9.94it/s]



Epoch 3: Loss = 0.1183
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.574 | Micro F1: 0.709 | Exact Match: 0.250
Thresh 0.15 | Macro F1: 0.609 | Micro F1: 0.770 | Exact Match: 0.373
Thresh 0.20 | Macro F1: 0.605 | Micro F1: 0.801 | Exact Match: 0.462
Thresh 0.25 | Macro F1: 0.600 | Micro F1: 0.816 | Exact Match: 0.523
Thresh 0.30 | Macro F1: 0.593 | Micro F1: 0.828 | Exact Match: 0.568
Thresh 0.35 | Macro F1: 0.585 | Micro F1: 0.832 | Exact Match: 0.585
Thresh 0.40 | Macro F1: 0.555 | Micro F1: 0.829 | Exact Match: 0.579
Thresh 0.45 | Macro F1: 0.532 | Micro F1: 0.821 | Exact Match: 0.556
Thresh 0.50 | Macro F1: 0.502 | Micro F1: 0.814 | Exact Match: 0.530
Thresh 0.55 | Macro F1: 0.477 | Micro F1: 0.803 | Exact Match: 0.503
Thresh 0.60 | Macro F1: 0.451 | Micro F1: 0.792 | Exact Match: 0.476
Thresh 0.65 | Macro F1: 0.413 | Micro F1: 0.776 | Exact Match: 0.437
Thresh 0.70 | Macro F1: 0.372 | Micro F1: 0.756 | Exact Match: 0.391
Thresh 0.75 | Macro F1: 0.328 | Micro F1: 0.732 | Exac

Epoch 4: 100%|██████████| 1221/1221 [02:03<00:00,  9.89it/s]



Epoch 4: Loss = 0.0829
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.684 | Micro F1: 0.799 | Exact Match: 0.403
Thresh 0.15 | Macro F1: 0.743 | Micro F1: 0.849 | Exact Match: 0.540
Thresh 0.20 | Macro F1: 0.769 | Micro F1: 0.877 | Exact Match: 0.634
Thresh 0.25 | Macro F1: 0.777 | Micro F1: 0.892 | Exact Match: 0.697
Thresh 0.30 | Macro F1: 0.766 | Micro F1: 0.901 | Exact Match: 0.728
Thresh 0.35 | Macro F1: 0.741 | Micro F1: 0.904 | Exact Match: 0.745
Thresh 0.40 | Macro F1: 0.731 | Micro F1: 0.907 | Exact Match: 0.755
Thresh 0.45 | Macro F1: 0.711 | Micro F1: 0.909 | Exact Match: 0.759
Thresh 0.50 | Macro F1: 0.693 | Micro F1: 0.905 | Exact Match: 0.745
Thresh 0.55 | Macro F1: 0.674 | Micro F1: 0.902 | Exact Match: 0.733
Thresh 0.60 | Macro F1: 0.646 | Micro F1: 0.895 | Exact Match: 0.711
Thresh 0.65 | Macro F1: 0.622 | Micro F1: 0.888 | Exact Match: 0.687
Thresh 0.70 | Macro F1: 0.595 | Micro F1: 0.878 | Exact Match: 0.655
Thresh 0.75 | Macro F1: 0.567 | Micro F1: 0.864 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 22.13it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.268
Micro F1: 0.522
Exact Match: 0.297

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.145
Micro F1: 0.704
Exact Match: 0.604

 Domain: UA
Macro F1: 0.255
Micro F1: 0.478
Exact Match: 0.218

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=72 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1221/1221 [02:05<00:00,  9.76it/s]



Epoch 1: Loss = 0.2204
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.293 | Micro F1: 0.462 | Exact Match: 0.002
Thresh 0.15 | Macro F1: 0.275 | Micro F1: 0.530 | Exact Match: 0.032
Thresh 0.20 | Macro F1: 0.239 | Micro F1: 0.563 | Exact Match: 0.129
Thresh 0.25 | Macro F1: 0.228 | Micro F1: 0.577 | Exact Match: 0.210
Thresh 0.30 | Macro F1: 0.216 | Micro F1: 0.581 | Exact Match: 0.241
Thresh 0.35 | Macro F1: 0.200 | Micro F1: 0.574 | Exact Match: 0.246
Thresh 0.40 | Macro F1: 0.187 | Micro F1: 0.562 | Exact Match: 0.234
Thresh 0.45 | Macro F1: 0.174 | Micro F1: 0.547 | Exact Match: 0.217
Thresh 0.50 | Macro F1: 0.160 | Micro F1: 0.523 | Exact Match: 0.191
Thresh 0.55 | Macro F1: 0.141 | Micro F1: 0.493 | Exact Match: 0.160
Thresh 0.60 | Macro F1: 0.123 | Micro F1: 0.458 | Exact Match: 0.130
Thresh 0.65 | Macro F1: 0.108 | Micro F1: 0.413 | Exact Match: 0.102
Thresh 0.70 | Macro F1: 0.090 | Micro F1: 0.353 | Exact Match: 0.068
Thresh 0.75 | Macro F1: 0.070 | Micro F1: 0.277 | Exac

Epoch 2: 100%|██████████| 1221/1221 [02:05<00:00,  9.76it/s]



Epoch 2: Loss = 0.1566
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.473 | Micro F1: 0.625 | Exact Match: 0.128
Thresh 0.15 | Macro F1: 0.487 | Micro F1: 0.700 | Exact Match: 0.257
Thresh 0.20 | Macro F1: 0.496 | Micro F1: 0.740 | Exact Match: 0.366
Thresh 0.25 | Macro F1: 0.479 | Micro F1: 0.763 | Exact Match: 0.432
Thresh 0.30 | Macro F1: 0.464 | Micro F1: 0.774 | Exact Match: 0.469
Thresh 0.35 | Macro F1: 0.445 | Micro F1: 0.774 | Exact Match: 0.478
Thresh 0.40 | Macro F1: 0.415 | Micro F1: 0.768 | Exact Match: 0.467
Thresh 0.45 | Macro F1: 0.373 | Micro F1: 0.760 | Exact Match: 0.445
Thresh 0.50 | Macro F1: 0.340 | Micro F1: 0.749 | Exact Match: 0.419
Thresh 0.55 | Macro F1: 0.313 | Micro F1: 0.733 | Exact Match: 0.384
Thresh 0.60 | Macro F1: 0.285 | Micro F1: 0.714 | Exact Match: 0.347
Thresh 0.65 | Macro F1: 0.267 | Micro F1: 0.698 | Exact Match: 0.321
Thresh 0.70 | Macro F1: 0.245 | Micro F1: 0.675 | Exact Match: 0.284
Thresh 0.75 | Macro F1: 0.219 | Micro F1: 0.641 | Exac

Epoch 3: 100%|██████████| 1221/1221 [02:05<00:00,  9.76it/s]



Epoch 3: Loss = 0.1060
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.616 | Micro F1: 0.750 | Exact Match: 0.325
Thresh 0.15 | Macro F1: 0.647 | Micro F1: 0.807 | Exact Match: 0.450
Thresh 0.20 | Macro F1: 0.662 | Micro F1: 0.836 | Exact Match: 0.535
Thresh 0.25 | Macro F1: 0.669 | Micro F1: 0.852 | Exact Match: 0.591
Thresh 0.30 | Macro F1: 0.669 | Micro F1: 0.863 | Exact Match: 0.630
Thresh 0.35 | Macro F1: 0.644 | Micro F1: 0.867 | Exact Match: 0.651
Thresh 0.40 | Macro F1: 0.624 | Micro F1: 0.868 | Exact Match: 0.657
Thresh 0.45 | Macro F1: 0.608 | Micro F1: 0.865 | Exact Match: 0.650
Thresh 0.50 | Macro F1: 0.582 | Micro F1: 0.857 | Exact Match: 0.630
Thresh 0.55 | Macro F1: 0.554 | Micro F1: 0.850 | Exact Match: 0.609
Thresh 0.60 | Macro F1: 0.516 | Micro F1: 0.838 | Exact Match: 0.576
Thresh 0.65 | Macro F1: 0.480 | Micro F1: 0.824 | Exact Match: 0.530
Thresh 0.70 | Macro F1: 0.438 | Micro F1: 0.807 | Exact Match: 0.491
Thresh 0.75 | Macro F1: 0.382 | Micro F1: 0.785 | Exac

Epoch 4: 100%|██████████| 1221/1221 [02:05<00:00,  9.75it/s]



Epoch 4: Loss = 0.0717
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.749 | Micro F1: 0.845 | Exact Match: 0.551
Thresh 0.15 | Macro F1: 0.795 | Micro F1: 0.885 | Exact Match: 0.659
Thresh 0.20 | Macro F1: 0.808 | Micro F1: 0.906 | Exact Match: 0.735
Thresh 0.25 | Macro F1: 0.805 | Micro F1: 0.915 | Exact Match: 0.772
Thresh 0.30 | Macro F1: 0.802 | Micro F1: 0.923 | Exact Match: 0.799
Thresh 0.35 | Macro F1: 0.798 | Micro F1: 0.926 | Exact Match: 0.811
Thresh 0.40 | Macro F1: 0.780 | Micro F1: 0.925 | Exact Match: 0.811
Thresh 0.45 | Macro F1: 0.771 | Micro F1: 0.925 | Exact Match: 0.810
Thresh 0.50 | Macro F1: 0.761 | Micro F1: 0.924 | Exact Match: 0.803
Thresh 0.55 | Macro F1: 0.739 | Micro F1: 0.919 | Exact Match: 0.784
Thresh 0.60 | Macro F1: 0.717 | Micro F1: 0.915 | Exact Match: 0.771
Thresh 0.65 | Macro F1: 0.690 | Micro F1: 0.909 | Exact Match: 0.749
Thresh 0.70 | Macro F1: 0.657 | Micro F1: 0.901 | Exact Match: 0.721
Thresh 0.75 | Macro F1: 0.622 | Micro F1: 0.889 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.46it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.276
Micro F1: 0.543
Exact Match: 0.337

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.161
Micro F1: 0.764
Exact Match: 0.626

 Domain: UA
Macro F1: 0.262
Micro F1: 0.487
Exact Match: 0.263

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=73 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1221/1221 [02:04<00:00,  9.85it/s]



Epoch 1: Loss = 0.2218
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.274 | Micro F1: 0.463 | Exact Match: 0.001
Thresh 0.15 | Macro F1: 0.265 | Micro F1: 0.521 | Exact Match: 0.018
Thresh 0.20 | Macro F1: 0.264 | Micro F1: 0.559 | Exact Match: 0.068
Thresh 0.25 | Macro F1: 0.252 | Micro F1: 0.581 | Exact Match: 0.132
Thresh 0.30 | Macro F1: 0.223 | Micro F1: 0.582 | Exact Match: 0.187
Thresh 0.35 | Macro F1: 0.209 | Micro F1: 0.571 | Exact Match: 0.202
Thresh 0.40 | Macro F1: 0.195 | Micro F1: 0.553 | Exact Match: 0.182
Thresh 0.45 | Macro F1: 0.168 | Micro F1: 0.525 | Exact Match: 0.145
Thresh 0.50 | Macro F1: 0.124 | Micro F1: 0.478 | Exact Match: 0.091
Thresh 0.55 | Macro F1: 0.098 | Micro F1: 0.428 | Exact Match: 0.050
Thresh 0.60 | Macro F1: 0.079 | Micro F1: 0.383 | Exact Match: 0.026
Thresh 0.65 | Macro F1: 0.061 | Micro F1: 0.335 | Exact Match: 0.008
Thresh 0.70 | Macro F1: 0.051 | Micro F1: 0.286 | Exact Match: 0.003
Thresh 0.75 | Macro F1: 0.041 | Micro F1: 0.232 | Exac

Epoch 2: 100%|██████████| 1221/1221 [02:04<00:00,  9.79it/s]



Epoch 2: Loss = 0.1573
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.448 | Micro F1: 0.620 | Exact Match: 0.105
Thresh 0.15 | Macro F1: 0.484 | Micro F1: 0.696 | Exact Match: 0.248
Thresh 0.20 | Macro F1: 0.506 | Micro F1: 0.737 | Exact Match: 0.360
Thresh 0.25 | Macro F1: 0.494 | Micro F1: 0.755 | Exact Match: 0.419
Thresh 0.30 | Macro F1: 0.464 | Micro F1: 0.762 | Exact Match: 0.452
Thresh 0.35 | Macro F1: 0.438 | Micro F1: 0.764 | Exact Match: 0.464
Thresh 0.40 | Macro F1: 0.410 | Micro F1: 0.761 | Exact Match: 0.457
Thresh 0.45 | Macro F1: 0.377 | Micro F1: 0.755 | Exact Match: 0.438
Thresh 0.50 | Macro F1: 0.354 | Micro F1: 0.745 | Exact Match: 0.411
Thresh 0.55 | Macro F1: 0.329 | Micro F1: 0.734 | Exact Match: 0.380
Thresh 0.60 | Macro F1: 0.305 | Micro F1: 0.720 | Exact Match: 0.350
Thresh 0.65 | Macro F1: 0.279 | Micro F1: 0.701 | Exact Match: 0.315
Thresh 0.70 | Macro F1: 0.258 | Micro F1: 0.678 | Exact Match: 0.281
Thresh 0.75 | Macro F1: 0.232 | Micro F1: 0.647 | Exac

Epoch 3: 100%|██████████| 1221/1221 [02:04<00:00,  9.79it/s]



Epoch 3: Loss = 0.1076
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.591 | Micro F1: 0.724 | Exact Match: 0.280
Thresh 0.15 | Macro F1: 0.646 | Micro F1: 0.790 | Exact Match: 0.416
Thresh 0.20 | Macro F1: 0.664 | Micro F1: 0.825 | Exact Match: 0.515
Thresh 0.25 | Macro F1: 0.659 | Micro F1: 0.842 | Exact Match: 0.580
Thresh 0.30 | Macro F1: 0.658 | Micro F1: 0.855 | Exact Match: 0.628
Thresh 0.35 | Macro F1: 0.652 | Micro F1: 0.862 | Exact Match: 0.654
Thresh 0.40 | Macro F1: 0.635 | Micro F1: 0.862 | Exact Match: 0.660
Thresh 0.45 | Macro F1: 0.619 | Micro F1: 0.860 | Exact Match: 0.655
Thresh 0.50 | Macro F1: 0.593 | Micro F1: 0.858 | Exact Match: 0.642
Thresh 0.55 | Macro F1: 0.568 | Micro F1: 0.851 | Exact Match: 0.619
Thresh 0.60 | Macro F1: 0.544 | Micro F1: 0.842 | Exact Match: 0.585
Thresh 0.65 | Macro F1: 0.512 | Micro F1: 0.829 | Exact Match: 0.551
Thresh 0.70 | Macro F1: 0.478 | Micro F1: 0.814 | Exact Match: 0.513
Thresh 0.75 | Macro F1: 0.439 | Micro F1: 0.792 | Exac

Epoch 4: 100%|██████████| 1221/1221 [02:04<00:00,  9.84it/s]



Epoch 4: Loss = 0.0739
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.712 | Micro F1: 0.827 | Exact Match: 0.481
Thresh 0.15 | Macro F1: 0.770 | Micro F1: 0.874 | Exact Match: 0.622
Thresh 0.20 | Macro F1: 0.795 | Micro F1: 0.898 | Exact Match: 0.705
Thresh 0.25 | Macro F1: 0.802 | Micro F1: 0.911 | Exact Match: 0.755
Thresh 0.30 | Macro F1: 0.800 | Micro F1: 0.919 | Exact Match: 0.789
Thresh 0.35 | Macro F1: 0.787 | Micro F1: 0.923 | Exact Match: 0.800
Thresh 0.40 | Macro F1: 0.779 | Micro F1: 0.925 | Exact Match: 0.808
Thresh 0.45 | Macro F1: 0.761 | Micro F1: 0.925 | Exact Match: 0.806
Thresh 0.50 | Macro F1: 0.746 | Micro F1: 0.921 | Exact Match: 0.788
Thresh 0.55 | Macro F1: 0.727 | Micro F1: 0.917 | Exact Match: 0.777
Thresh 0.60 | Macro F1: 0.716 | Micro F1: 0.913 | Exact Match: 0.760
Thresh 0.65 | Macro F1: 0.696 | Micro F1: 0.906 | Exact Match: 0.739
Thresh 0.70 | Macro F1: 0.663 | Micro F1: 0.898 | Exact Match: 0.712
Thresh 0.75 | Macro F1: 0.632 | Micro F1: 0.886 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.89it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.244
Micro F1: 0.501
Exact Match: 0.268

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.149
Micro F1: 0.665
Exact Match: 0.505

 Domain: UA
Macro F1: 0.227
Micro F1: 0.459
Exact Match: 0.207

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=74 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1221/1221 [02:05<00:00,  9.71it/s]



Epoch 1: Loss = 0.2222
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.276 | Micro F1: 0.489 | Exact Match: 0.002
Thresh 0.15 | Macro F1: 0.265 | Micro F1: 0.549 | Exact Match: 0.040
Thresh 0.20 | Macro F1: 0.261 | Micro F1: 0.577 | Exact Match: 0.102
Thresh 0.25 | Macro F1: 0.243 | Micro F1: 0.583 | Exact Match: 0.150
Thresh 0.30 | Macro F1: 0.205 | Micro F1: 0.575 | Exact Match: 0.182
Thresh 0.35 | Macro F1: 0.186 | Micro F1: 0.562 | Exact Match: 0.190
Thresh 0.40 | Macro F1: 0.169 | Micro F1: 0.545 | Exact Match: 0.175
Thresh 0.45 | Macro F1: 0.147 | Micro F1: 0.516 | Exact Match: 0.140
Thresh 0.50 | Macro F1: 0.126 | Micro F1: 0.485 | Exact Match: 0.103
Thresh 0.55 | Macro F1: 0.106 | Micro F1: 0.453 | Exact Match: 0.073
Thresh 0.60 | Macro F1: 0.081 | Micro F1: 0.409 | Exact Match: 0.043
Thresh 0.65 | Macro F1: 0.064 | Micro F1: 0.373 | Exact Match: 0.017
Thresh 0.70 | Macro F1: 0.053 | Micro F1: 0.331 | Exact Match: 0.003
Thresh 0.75 | Macro F1: 0.047 | Micro F1: 0.288 | Exac

Epoch 2: 100%|██████████| 1221/1221 [02:05<00:00,  9.70it/s]



Epoch 2: Loss = 0.1600
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.455 | Micro F1: 0.619 | Exact Match: 0.082
Thresh 0.15 | Macro F1: 0.462 | Micro F1: 0.697 | Exact Match: 0.233
Thresh 0.20 | Macro F1: 0.466 | Micro F1: 0.738 | Exact Match: 0.353
Thresh 0.25 | Macro F1: 0.457 | Micro F1: 0.757 | Exact Match: 0.430
Thresh 0.30 | Macro F1: 0.434 | Micro F1: 0.764 | Exact Match: 0.454
Thresh 0.35 | Macro F1: 0.402 | Micro F1: 0.763 | Exact Match: 0.455
Thresh 0.40 | Macro F1: 0.371 | Micro F1: 0.757 | Exact Match: 0.436
Thresh 0.45 | Macro F1: 0.330 | Micro F1: 0.745 | Exact Match: 0.408
Thresh 0.50 | Macro F1: 0.297 | Micro F1: 0.733 | Exact Match: 0.377
Thresh 0.55 | Macro F1: 0.274 | Micro F1: 0.719 | Exact Match: 0.347
Thresh 0.60 | Macro F1: 0.256 | Micro F1: 0.706 | Exact Match: 0.317
Thresh 0.65 | Macro F1: 0.236 | Micro F1: 0.686 | Exact Match: 0.282
Thresh 0.70 | Macro F1: 0.215 | Micro F1: 0.659 | Exact Match: 0.241
Thresh 0.75 | Macro F1: 0.196 | Micro F1: 0.627 | Exac

Epoch 3: 100%|██████████| 1221/1221 [02:05<00:00,  9.71it/s]



Epoch 3: Loss = 0.1105
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.582 | Micro F1: 0.715 | Exact Match: 0.265
Thresh 0.15 | Macro F1: 0.624 | Micro F1: 0.775 | Exact Match: 0.389
Thresh 0.20 | Macro F1: 0.648 | Micro F1: 0.810 | Exact Match: 0.487
Thresh 0.25 | Macro F1: 0.655 | Micro F1: 0.831 | Exact Match: 0.555
Thresh 0.30 | Macro F1: 0.650 | Micro F1: 0.842 | Exact Match: 0.601
Thresh 0.35 | Macro F1: 0.644 | Micro F1: 0.849 | Exact Match: 0.627
Thresh 0.40 | Macro F1: 0.620 | Micro F1: 0.850 | Exact Match: 0.635
Thresh 0.45 | Macro F1: 0.606 | Micro F1: 0.850 | Exact Match: 0.637
Thresh 0.50 | Macro F1: 0.583 | Micro F1: 0.846 | Exact Match: 0.624
Thresh 0.55 | Macro F1: 0.546 | Micro F1: 0.839 | Exact Match: 0.603
Thresh 0.60 | Macro F1: 0.511 | Micro F1: 0.828 | Exact Match: 0.569
Thresh 0.65 | Macro F1: 0.467 | Micro F1: 0.813 | Exact Match: 0.530
Thresh 0.70 | Macro F1: 0.430 | Micro F1: 0.795 | Exact Match: 0.484
Thresh 0.75 | Macro F1: 0.386 | Micro F1: 0.771 | Exac

Epoch 4: 100%|██████████| 1221/1221 [02:05<00:00,  9.70it/s]



Epoch 4: Loss = 0.0755
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.693 | Micro F1: 0.827 | Exact Match: 0.485
Thresh 0.15 | Macro F1: 0.760 | Micro F1: 0.874 | Exact Match: 0.617
Thresh 0.20 | Macro F1: 0.788 | Micro F1: 0.894 | Exact Match: 0.690
Thresh 0.25 | Macro F1: 0.800 | Micro F1: 0.905 | Exact Match: 0.739
Thresh 0.30 | Macro F1: 0.808 | Micro F1: 0.914 | Exact Match: 0.771
Thresh 0.35 | Macro F1: 0.800 | Micro F1: 0.916 | Exact Match: 0.781
Thresh 0.40 | Macro F1: 0.798 | Micro F1: 0.917 | Exact Match: 0.785
Thresh 0.45 | Macro F1: 0.790 | Micro F1: 0.917 | Exact Match: 0.783
Thresh 0.50 | Macro F1: 0.779 | Micro F1: 0.916 | Exact Match: 0.776
Thresh 0.55 | Macro F1: 0.752 | Micro F1: 0.912 | Exact Match: 0.763
Thresh 0.60 | Macro F1: 0.720 | Micro F1: 0.907 | Exact Match: 0.745
Thresh 0.65 | Macro F1: 0.697 | Micro F1: 0.900 | Exact Match: 0.719
Thresh 0.70 | Macro F1: 0.680 | Micro F1: 0.893 | Exact Match: 0.695
Thresh 0.75 | Macro F1: 0.624 | Micro F1: 0.877 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.21it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.220
Micro F1: 0.481
Exact Match: 0.261

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.155
Micro F1: 0.735
Exact Match: 0.538

 Domain: UA
Macro F1: 0.200
Micro F1: 0.420
Exact Match: 0.190

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=75 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1221/1221 [02:05<00:00,  9.76it/s]



Epoch 1: Loss = 0.2213
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.286 | Micro F1: 0.470 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.288 | Micro F1: 0.541 | Exact Match: 0.024
Thresh 0.20 | Macro F1: 0.255 | Micro F1: 0.571 | Exact Match: 0.071
Thresh 0.25 | Macro F1: 0.223 | Micro F1: 0.583 | Exact Match: 0.123
Thresh 0.30 | Macro F1: 0.213 | Micro F1: 0.588 | Exact Match: 0.172
Thresh 0.35 | Macro F1: 0.205 | Micro F1: 0.579 | Exact Match: 0.193
Thresh 0.40 | Macro F1: 0.196 | Micro F1: 0.570 | Exact Match: 0.196
Thresh 0.45 | Macro F1: 0.181 | Micro F1: 0.547 | Exact Match: 0.175
Thresh 0.50 | Macro F1: 0.166 | Micro F1: 0.523 | Exact Match: 0.147
Thresh 0.55 | Macro F1: 0.144 | Micro F1: 0.492 | Exact Match: 0.111
Thresh 0.60 | Macro F1: 0.124 | Micro F1: 0.459 | Exact Match: 0.086
Thresh 0.65 | Macro F1: 0.103 | Micro F1: 0.418 | Exact Match: 0.060
Thresh 0.70 | Macro F1: 0.085 | Micro F1: 0.365 | Exact Match: 0.041
Thresh 0.75 | Macro F1: 0.067 | Micro F1: 0.299 | Exac

Epoch 2: 100%|██████████| 1221/1221 [02:05<00:00,  9.71it/s]



Epoch 2: Loss = 0.1582
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.454 | Micro F1: 0.628 | Exact Match: 0.112
Thresh 0.15 | Macro F1: 0.463 | Micro F1: 0.702 | Exact Match: 0.265
Thresh 0.20 | Macro F1: 0.472 | Micro F1: 0.744 | Exact Match: 0.384
Thresh 0.25 | Macro F1: 0.463 | Micro F1: 0.762 | Exact Match: 0.449
Thresh 0.30 | Macro F1: 0.441 | Micro F1: 0.767 | Exact Match: 0.476
Thresh 0.35 | Macro F1: 0.416 | Micro F1: 0.766 | Exact Match: 0.473
Thresh 0.40 | Macro F1: 0.395 | Micro F1: 0.761 | Exact Match: 0.461
Thresh 0.45 | Macro F1: 0.370 | Micro F1: 0.752 | Exact Match: 0.436
Thresh 0.50 | Macro F1: 0.341 | Micro F1: 0.741 | Exact Match: 0.401
Thresh 0.55 | Macro F1: 0.314 | Micro F1: 0.727 | Exact Match: 0.361
Thresh 0.60 | Macro F1: 0.292 | Micro F1: 0.712 | Exact Match: 0.328
Thresh 0.65 | Macro F1: 0.267 | Micro F1: 0.692 | Exact Match: 0.290
Thresh 0.70 | Macro F1: 0.237 | Micro F1: 0.667 | Exact Match: 0.252
Thresh 0.75 | Macro F1: 0.209 | Micro F1: 0.635 | Exac

Epoch 3: 100%|██████████| 1221/1221 [02:06<00:00,  9.68it/s]



Epoch 3: Loss = 0.1066
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.617 | Micro F1: 0.740 | Exact Match: 0.295
Thresh 0.15 | Macro F1: 0.652 | Micro F1: 0.800 | Exact Match: 0.439
Thresh 0.20 | Macro F1: 0.674 | Micro F1: 0.832 | Exact Match: 0.530
Thresh 0.25 | Macro F1: 0.662 | Micro F1: 0.849 | Exact Match: 0.601
Thresh 0.30 | Macro F1: 0.657 | Micro F1: 0.857 | Exact Match: 0.634
Thresh 0.35 | Macro F1: 0.636 | Micro F1: 0.861 | Exact Match: 0.653
Thresh 0.40 | Macro F1: 0.620 | Micro F1: 0.862 | Exact Match: 0.654
Thresh 0.45 | Macro F1: 0.600 | Micro F1: 0.859 | Exact Match: 0.644
Thresh 0.50 | Macro F1: 0.578 | Micro F1: 0.853 | Exact Match: 0.622
Thresh 0.55 | Macro F1: 0.554 | Micro F1: 0.845 | Exact Match: 0.596
Thresh 0.60 | Macro F1: 0.525 | Micro F1: 0.835 | Exact Match: 0.566
Thresh 0.65 | Macro F1: 0.487 | Micro F1: 0.821 | Exact Match: 0.523
Thresh 0.70 | Macro F1: 0.443 | Micro F1: 0.802 | Exact Match: 0.480
Thresh 0.75 | Macro F1: 0.397 | Micro F1: 0.779 | Exac

Epoch 4: 100%|██████████| 1221/1221 [02:06<00:00,  9.65it/s]



Epoch 4: Loss = 0.0728
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.740 | Micro F1: 0.837 | Exact Match: 0.513
Thresh 0.15 | Macro F1: 0.790 | Micro F1: 0.879 | Exact Match: 0.626
Thresh 0.20 | Macro F1: 0.810 | Micro F1: 0.901 | Exact Match: 0.709
Thresh 0.25 | Macro F1: 0.808 | Micro F1: 0.914 | Exact Match: 0.754
Thresh 0.30 | Macro F1: 0.801 | Micro F1: 0.919 | Exact Match: 0.777
Thresh 0.35 | Macro F1: 0.800 | Micro F1: 0.922 | Exact Match: 0.797
Thresh 0.40 | Macro F1: 0.790 | Micro F1: 0.925 | Exact Match: 0.806
Thresh 0.45 | Macro F1: 0.775 | Micro F1: 0.927 | Exact Match: 0.811
Thresh 0.50 | Macro F1: 0.742 | Micro F1: 0.924 | Exact Match: 0.803
Thresh 0.55 | Macro F1: 0.723 | Micro F1: 0.921 | Exact Match: 0.789
Thresh 0.60 | Macro F1: 0.701 | Micro F1: 0.917 | Exact Match: 0.773
Thresh 0.65 | Macro F1: 0.682 | Micro F1: 0.909 | Exact Match: 0.748
Thresh 0.70 | Macro F1: 0.656 | Micro F1: 0.899 | Exact Match: 0.714
Thresh 0.75 | Macro F1: 0.613 | Micro F1: 0.886 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 20.62it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.264
Micro F1: 0.505
Exact Match: 0.310

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.153
Micro F1: 0.721
Exact Match: 0.571

 Domain: UA
Macro F1: 0.245
Micro F1: 0.451
Exact Match: 0.244

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=42 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1221/1221 [02:05<00:00,  9.71it/s]



Epoch 1: Loss = 0.2239
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.267 | Micro F1: 0.439 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.266 | Micro F1: 0.507 | Exact Match: 0.011
Thresh 0.20 | Macro F1: 0.243 | Micro F1: 0.543 | Exact Match: 0.059
Thresh 0.25 | Macro F1: 0.207 | Micro F1: 0.551 | Exact Match: 0.109
Thresh 0.30 | Macro F1: 0.186 | Micro F1: 0.547 | Exact Match: 0.139
Thresh 0.35 | Macro F1: 0.165 | Micro F1: 0.533 | Exact Match: 0.141
Thresh 0.40 | Macro F1: 0.139 | Micro F1: 0.509 | Exact Match: 0.116
Thresh 0.45 | Macro F1: 0.117 | Micro F1: 0.474 | Exact Match: 0.089
Thresh 0.50 | Macro F1: 0.103 | Micro F1: 0.439 | Exact Match: 0.072
Thresh 0.55 | Macro F1: 0.091 | Micro F1: 0.397 | Exact Match: 0.055
Thresh 0.60 | Macro F1: 0.075 | Micro F1: 0.348 | Exact Match: 0.029
Thresh 0.65 | Macro F1: 0.062 | Micro F1: 0.294 | Exact Match: 0.015
Thresh 0.70 | Macro F1: 0.043 | Micro F1: 0.221 | Exact Match: 0.004
Thresh 0.75 | Macro F1: 0.029 | Micro F1: 0.151 | Exac

Epoch 2: 100%|██████████| 1221/1221 [02:05<00:00,  9.73it/s]



Epoch 2: Loss = 0.1641
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.444 | Micro F1: 0.623 | Exact Match: 0.103
Thresh 0.15 | Macro F1: 0.476 | Micro F1: 0.704 | Exact Match: 0.256
Thresh 0.20 | Macro F1: 0.480 | Micro F1: 0.743 | Exact Match: 0.368
Thresh 0.25 | Macro F1: 0.468 | Micro F1: 0.758 | Exact Match: 0.422
Thresh 0.30 | Macro F1: 0.442 | Micro F1: 0.762 | Exact Match: 0.437
Thresh 0.35 | Macro F1: 0.405 | Micro F1: 0.759 | Exact Match: 0.435
Thresh 0.40 | Macro F1: 0.374 | Micro F1: 0.753 | Exact Match: 0.421
Thresh 0.45 | Macro F1: 0.339 | Micro F1: 0.744 | Exact Match: 0.398
Thresh 0.50 | Macro F1: 0.312 | Micro F1: 0.732 | Exact Match: 0.370
Thresh 0.55 | Macro F1: 0.282 | Micro F1: 0.715 | Exact Match: 0.334
Thresh 0.60 | Macro F1: 0.254 | Micro F1: 0.693 | Exact Match: 0.297
Thresh 0.65 | Macro F1: 0.225 | Micro F1: 0.668 | Exact Match: 0.258
Thresh 0.70 | Macro F1: 0.204 | Micro F1: 0.636 | Exact Match: 0.214
Thresh 0.75 | Macro F1: 0.175 | Micro F1: 0.588 | Exac

Epoch 3: 100%|██████████| 1221/1221 [02:05<00:00,  9.69it/s]



Epoch 3: Loss = 0.1068
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.646 | Micro F1: 0.759 | Exact Match: 0.330
Thresh 0.15 | Macro F1: 0.677 | Micro F1: 0.815 | Exact Match: 0.458
Thresh 0.20 | Macro F1: 0.670 | Micro F1: 0.844 | Exact Match: 0.555
Thresh 0.25 | Macro F1: 0.672 | Micro F1: 0.861 | Exact Match: 0.617
Thresh 0.30 | Macro F1: 0.653 | Micro F1: 0.870 | Exact Match: 0.650
Thresh 0.35 | Macro F1: 0.622 | Micro F1: 0.875 | Exact Match: 0.664
Thresh 0.40 | Macro F1: 0.600 | Micro F1: 0.876 | Exact Match: 0.667
Thresh 0.45 | Macro F1: 0.574 | Micro F1: 0.872 | Exact Match: 0.658
Thresh 0.50 | Macro F1: 0.556 | Micro F1: 0.867 | Exact Match: 0.641
Thresh 0.55 | Macro F1: 0.529 | Micro F1: 0.860 | Exact Match: 0.619
Thresh 0.60 | Macro F1: 0.491 | Micro F1: 0.849 | Exact Match: 0.582
Thresh 0.65 | Macro F1: 0.454 | Micro F1: 0.833 | Exact Match: 0.540
Thresh 0.70 | Macro F1: 0.423 | Micro F1: 0.818 | Exact Match: 0.506
Thresh 0.75 | Macro F1: 0.387 | Micro F1: 0.798 | Exac

Epoch 4: 100%|██████████| 1221/1221 [02:05<00:00,  9.69it/s]



Epoch 4: Loss = 0.0696
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.726 | Micro F1: 0.844 | Exact Match: 0.546
Thresh 0.15 | Macro F1: 0.784 | Micro F1: 0.884 | Exact Match: 0.658
Thresh 0.20 | Macro F1: 0.812 | Micro F1: 0.904 | Exact Match: 0.728
Thresh 0.25 | Macro F1: 0.821 | Micro F1: 0.915 | Exact Match: 0.769
Thresh 0.30 | Macro F1: 0.820 | Micro F1: 0.921 | Exact Match: 0.794
Thresh 0.35 | Macro F1: 0.807 | Micro F1: 0.922 | Exact Match: 0.802
Thresh 0.40 | Macro F1: 0.792 | Micro F1: 0.922 | Exact Match: 0.804
Thresh 0.45 | Macro F1: 0.779 | Micro F1: 0.921 | Exact Match: 0.799
Thresh 0.50 | Macro F1: 0.766 | Micro F1: 0.918 | Exact Match: 0.790
Thresh 0.55 | Macro F1: 0.751 | Micro F1: 0.915 | Exact Match: 0.778
Thresh 0.60 | Macro F1: 0.722 | Micro F1: 0.912 | Exact Match: 0.760
Thresh 0.65 | Macro F1: 0.700 | Micro F1: 0.905 | Exact Match: 0.735
Thresh 0.70 | Macro F1: 0.673 | Micro F1: 0.897 | Exact Match: 0.711
Thresh 0.75 | Macro F1: 0.628 | Micro F1: 0.885 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.28it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.243
Micro F1: 0.531
Exact Match: 0.326

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.156
Micro F1: 0.776
Exact Match: 0.659

 Domain: UA
Macro F1: 0.231
Micro F1: 0.470
Exact Match: 0.241

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=43 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1221/1221 [02:05<00:00,  9.77it/s]



Epoch 1: Loss = 0.2203
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.291 | Micro F1: 0.485 | Exact Match: 0.001
Thresh 0.15 | Macro F1: 0.266 | Micro F1: 0.549 | Exact Match: 0.038
Thresh 0.20 | Macro F1: 0.243 | Micro F1: 0.578 | Exact Match: 0.100
Thresh 0.25 | Macro F1: 0.221 | Micro F1: 0.591 | Exact Match: 0.144
Thresh 0.30 | Macro F1: 0.203 | Micro F1: 0.591 | Exact Match: 0.174
Thresh 0.35 | Macro F1: 0.191 | Micro F1: 0.584 | Exact Match: 0.190
Thresh 0.40 | Macro F1: 0.176 | Micro F1: 0.568 | Exact Match: 0.182
Thresh 0.45 | Macro F1: 0.155 | Micro F1: 0.542 | Exact Match: 0.146
Thresh 0.50 | Macro F1: 0.133 | Micro F1: 0.511 | Exact Match: 0.111
Thresh 0.55 | Macro F1: 0.115 | Micro F1: 0.478 | Exact Match: 0.077
Thresh 0.60 | Macro F1: 0.102 | Micro F1: 0.448 | Exact Match: 0.054
Thresh 0.65 | Macro F1: 0.083 | Micro F1: 0.404 | Exact Match: 0.027
Thresh 0.70 | Macro F1: 0.068 | Micro F1: 0.361 | Exact Match: 0.012
Thresh 0.75 | Macro F1: 0.055 | Micro F1: 0.310 | Exac

Epoch 2: 100%|██████████| 1221/1221 [02:04<00:00,  9.77it/s]



Epoch 2: Loss = 0.1582
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.449 | Micro F1: 0.599 | Exact Match: 0.069
Thresh 0.15 | Macro F1: 0.482 | Micro F1: 0.678 | Exact Match: 0.194
Thresh 0.20 | Macro F1: 0.485 | Micro F1: 0.721 | Exact Match: 0.314
Thresh 0.25 | Macro F1: 0.476 | Micro F1: 0.747 | Exact Match: 0.400
Thresh 0.30 | Macro F1: 0.458 | Micro F1: 0.759 | Exact Match: 0.448
Thresh 0.35 | Macro F1: 0.434 | Micro F1: 0.762 | Exact Match: 0.462
Thresh 0.40 | Macro F1: 0.402 | Micro F1: 0.753 | Exact Match: 0.441
Thresh 0.45 | Macro F1: 0.365 | Micro F1: 0.742 | Exact Match: 0.408
Thresh 0.50 | Macro F1: 0.332 | Micro F1: 0.729 | Exact Match: 0.374
Thresh 0.55 | Macro F1: 0.302 | Micro F1: 0.713 | Exact Match: 0.337
Thresh 0.60 | Macro F1: 0.264 | Micro F1: 0.694 | Exact Match: 0.298
Thresh 0.65 | Macro F1: 0.229 | Micro F1: 0.666 | Exact Match: 0.252
Thresh 0.70 | Macro F1: 0.203 | Micro F1: 0.639 | Exact Match: 0.214
Thresh 0.75 | Macro F1: 0.176 | Micro F1: 0.600 | Exac

Epoch 3: 100%|██████████| 1221/1221 [02:05<00:00,  9.76it/s]



Epoch 3: Loss = 0.1101
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.601 | Micro F1: 0.731 | Exact Match: 0.264
Thresh 0.15 | Macro F1: 0.657 | Micro F1: 0.791 | Exact Match: 0.404
Thresh 0.20 | Macro F1: 0.677 | Micro F1: 0.825 | Exact Match: 0.511
Thresh 0.25 | Macro F1: 0.680 | Micro F1: 0.843 | Exact Match: 0.586
Thresh 0.30 | Macro F1: 0.662 | Micro F1: 0.852 | Exact Match: 0.619
Thresh 0.35 | Macro F1: 0.644 | Micro F1: 0.857 | Exact Match: 0.635
Thresh 0.40 | Macro F1: 0.617 | Micro F1: 0.855 | Exact Match: 0.634
Thresh 0.45 | Macro F1: 0.589 | Micro F1: 0.851 | Exact Match: 0.617
Thresh 0.50 | Macro F1: 0.562 | Micro F1: 0.845 | Exact Match: 0.600
Thresh 0.55 | Macro F1: 0.518 | Micro F1: 0.834 | Exact Match: 0.567
Thresh 0.60 | Macro F1: 0.479 | Micro F1: 0.820 | Exact Match: 0.525
Thresh 0.65 | Macro F1: 0.439 | Micro F1: 0.802 | Exact Match: 0.478
Thresh 0.70 | Macro F1: 0.397 | Micro F1: 0.783 | Exact Match: 0.433
Thresh 0.75 | Macro F1: 0.347 | Micro F1: 0.756 | Exac

Epoch 4: 100%|██████████| 1221/1221 [02:05<00:00,  9.74it/s]



Epoch 4: Loss = 0.0766
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.724 | Micro F1: 0.819 | Exact Match: 0.472
Thresh 0.15 | Macro F1: 0.773 | Micro F1: 0.864 | Exact Match: 0.601
Thresh 0.20 | Macro F1: 0.782 | Micro F1: 0.886 | Exact Match: 0.676
Thresh 0.25 | Macro F1: 0.785 | Micro F1: 0.899 | Exact Match: 0.725
Thresh 0.30 | Macro F1: 0.771 | Micro F1: 0.905 | Exact Match: 0.755
Thresh 0.35 | Macro F1: 0.746 | Micro F1: 0.910 | Exact Match: 0.770
Thresh 0.40 | Macro F1: 0.729 | Micro F1: 0.910 | Exact Match: 0.772
Thresh 0.45 | Macro F1: 0.708 | Micro F1: 0.908 | Exact Match: 0.770
Thresh 0.50 | Macro F1: 0.687 | Micro F1: 0.905 | Exact Match: 0.756
Thresh 0.55 | Macro F1: 0.665 | Micro F1: 0.901 | Exact Match: 0.740
Thresh 0.60 | Macro F1: 0.640 | Micro F1: 0.895 | Exact Match: 0.718
Thresh 0.65 | Macro F1: 0.617 | Micro F1: 0.887 | Exact Match: 0.692
Thresh 0.70 | Macro F1: 0.592 | Micro F1: 0.877 | Exact Match: 0.661
Thresh 0.75 | Macro F1: 0.557 | Micro F1: 0.860 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.57it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.225
Micro F1: 0.504
Exact Match: 0.279

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.156
Micro F1: 0.748
Exact Match: 0.582

 Domain: UA
Macro F1: 0.210
Micro F1: 0.442
Exact Match: 0.202

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=44 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1221/1221 [02:06<00:00,  9.68it/s]



Epoch 1: Loss = 0.2228
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.259 | Micro F1: 0.438 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.244 | Micro F1: 0.496 | Exact Match: 0.010
Thresh 0.20 | Macro F1: 0.232 | Micro F1: 0.527 | Exact Match: 0.048
Thresh 0.25 | Macro F1: 0.201 | Micro F1: 0.539 | Exact Match: 0.103
Thresh 0.30 | Macro F1: 0.194 | Micro F1: 0.537 | Exact Match: 0.155
Thresh 0.35 | Macro F1: 0.185 | Micro F1: 0.528 | Exact Match: 0.178
Thresh 0.40 | Macro F1: 0.174 | Micro F1: 0.513 | Exact Match: 0.175
Thresh 0.45 | Macro F1: 0.160 | Micro F1: 0.488 | Exact Match: 0.152
Thresh 0.50 | Macro F1: 0.139 | Micro F1: 0.454 | Exact Match: 0.120
Thresh 0.55 | Macro F1: 0.118 | Micro F1: 0.415 | Exact Match: 0.085
Thresh 0.60 | Macro F1: 0.096 | Micro F1: 0.371 | Exact Match: 0.054
Thresh 0.65 | Macro F1: 0.074 | Micro F1: 0.320 | Exact Match: 0.034
Thresh 0.70 | Macro F1: 0.054 | Micro F1: 0.257 | Exact Match: 0.018
Thresh 0.75 | Macro F1: 0.038 | Micro F1: 0.186 | Exac

Epoch 2: 100%|██████████| 1221/1221 [02:06<00:00,  9.69it/s]



Epoch 2: Loss = 0.1644
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.450 | Micro F1: 0.592 | Exact Match: 0.083
Thresh 0.15 | Macro F1: 0.452 | Micro F1: 0.668 | Exact Match: 0.227
Thresh 0.20 | Macro F1: 0.460 | Micro F1: 0.713 | Exact Match: 0.339
Thresh 0.25 | Macro F1: 0.453 | Micro F1: 0.735 | Exact Match: 0.413
Thresh 0.30 | Macro F1: 0.428 | Micro F1: 0.744 | Exact Match: 0.452
Thresh 0.35 | Macro F1: 0.407 | Micro F1: 0.753 | Exact Match: 0.468
Thresh 0.40 | Macro F1: 0.382 | Micro F1: 0.752 | Exact Match: 0.465
Thresh 0.45 | Macro F1: 0.360 | Micro F1: 0.749 | Exact Match: 0.451
Thresh 0.50 | Macro F1: 0.339 | Micro F1: 0.742 | Exact Match: 0.435
Thresh 0.55 | Macro F1: 0.318 | Micro F1: 0.733 | Exact Match: 0.412
Thresh 0.60 | Macro F1: 0.295 | Micro F1: 0.721 | Exact Match: 0.382
Thresh 0.65 | Macro F1: 0.275 | Micro F1: 0.707 | Exact Match: 0.351
Thresh 0.70 | Macro F1: 0.246 | Micro F1: 0.687 | Exact Match: 0.316
Thresh 0.75 | Macro F1: 0.220 | Micro F1: 0.663 | Exac

Epoch 3: 100%|██████████| 1221/1221 [02:05<00:00,  9.69it/s]



Epoch 3: Loss = 0.1097
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.617 | Micro F1: 0.732 | Exact Match: 0.278
Thresh 0.15 | Macro F1: 0.636 | Micro F1: 0.791 | Exact Match: 0.413
Thresh 0.20 | Macro F1: 0.633 | Micro F1: 0.824 | Exact Match: 0.504
Thresh 0.25 | Macro F1: 0.623 | Micro F1: 0.843 | Exact Match: 0.571
Thresh 0.30 | Macro F1: 0.608 | Micro F1: 0.853 | Exact Match: 0.610
Thresh 0.35 | Macro F1: 0.593 | Micro F1: 0.859 | Exact Match: 0.640
Thresh 0.40 | Macro F1: 0.572 | Micro F1: 0.860 | Exact Match: 0.647
Thresh 0.45 | Macro F1: 0.557 | Micro F1: 0.859 | Exact Match: 0.647
Thresh 0.50 | Macro F1: 0.535 | Micro F1: 0.856 | Exact Match: 0.636
Thresh 0.55 | Macro F1: 0.510 | Micro F1: 0.850 | Exact Match: 0.610
Thresh 0.60 | Macro F1: 0.479 | Micro F1: 0.841 | Exact Match: 0.576
Thresh 0.65 | Macro F1: 0.451 | Micro F1: 0.829 | Exact Match: 0.538
Thresh 0.70 | Macro F1: 0.421 | Micro F1: 0.813 | Exact Match: 0.498
Thresh 0.75 | Macro F1: 0.381 | Micro F1: 0.795 | Exac

Epoch 4: 100%|██████████| 1221/1221 [02:06<00:00,  9.68it/s]



Epoch 4: Loss = 0.0761
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.723 | Micro F1: 0.818 | Exact Match: 0.456
Thresh 0.15 | Macro F1: 0.764 | Micro F1: 0.857 | Exact Match: 0.558
Thresh 0.20 | Macro F1: 0.787 | Micro F1: 0.882 | Exact Match: 0.650
Thresh 0.25 | Macro F1: 0.779 | Micro F1: 0.892 | Exact Match: 0.693
Thresh 0.30 | Macro F1: 0.774 | Micro F1: 0.902 | Exact Match: 0.729
Thresh 0.35 | Macro F1: 0.764 | Micro F1: 0.905 | Exact Match: 0.745
Thresh 0.40 | Macro F1: 0.752 | Micro F1: 0.907 | Exact Match: 0.755
Thresh 0.45 | Macro F1: 0.734 | Micro F1: 0.908 | Exact Match: 0.759
Thresh 0.50 | Macro F1: 0.708 | Micro F1: 0.907 | Exact Match: 0.756
Thresh 0.55 | Macro F1: 0.689 | Micro F1: 0.904 | Exact Match: 0.744
Thresh 0.60 | Macro F1: 0.663 | Micro F1: 0.899 | Exact Match: 0.723
Thresh 0.65 | Macro F1: 0.640 | Micro F1: 0.893 | Exact Match: 0.705
Thresh 0.70 | Macro F1: 0.596 | Micro F1: 0.884 | Exact Match: 0.675
Thresh 0.75 | Macro F1: 0.571 | Micro F1: 0.874 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.22it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.253
Micro F1: 0.512
Exact Match: 0.315

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.163
Micro F1: 0.720
Exact Match: 0.604

 Domain: UA
Macro F1: 0.238
Micro F1: 0.462
Exact Match: 0.241

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=71 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1365/1365 [02:21<00:00,  9.67it/s]



Epoch 1: Loss = 0.2165
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.287 | Micro F1: 0.499 | Exact Match: 0.010
Thresh 0.15 | Macro F1: 0.274 | Micro F1: 0.555 | Exact Match: 0.078
Thresh 0.20 | Macro F1: 0.236 | Micro F1: 0.575 | Exact Match: 0.113
Thresh 0.25 | Macro F1: 0.220 | Micro F1: 0.589 | Exact Match: 0.157
Thresh 0.30 | Macro F1: 0.210 | Micro F1: 0.593 | Exact Match: 0.190
Thresh 0.35 | Macro F1: 0.201 | Micro F1: 0.589 | Exact Match: 0.205
Thresh 0.40 | Macro F1: 0.190 | Micro F1: 0.580 | Exact Match: 0.203
Thresh 0.45 | Macro F1: 0.176 | Micro F1: 0.560 | Exact Match: 0.174
Thresh 0.50 | Macro F1: 0.158 | Micro F1: 0.532 | Exact Match: 0.141
Thresh 0.55 | Macro F1: 0.133 | Micro F1: 0.497 | Exact Match: 0.102
Thresh 0.60 | Macro F1: 0.113 | Micro F1: 0.462 | Exact Match: 0.073
Thresh 0.65 | Macro F1: 0.098 | Micro F1: 0.422 | Exact Match: 0.055
Thresh 0.70 | Macro F1: 0.083 | Micro F1: 0.377 | Exact Match: 0.040
Thresh 0.75 | Macro F1: 0.071 | Micro F1: 0.324 | Exac

Epoch 2: 100%|██████████| 1365/1365 [02:21<00:00,  9.66it/s]



Epoch 2: Loss = 0.1560
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.463 | Micro F1: 0.607 | Exact Match: 0.106
Thresh 0.15 | Macro F1: 0.497 | Micro F1: 0.685 | Exact Match: 0.230
Thresh 0.20 | Macro F1: 0.485 | Micro F1: 0.726 | Exact Match: 0.341
Thresh 0.25 | Macro F1: 0.464 | Micro F1: 0.748 | Exact Match: 0.416
Thresh 0.30 | Macro F1: 0.441 | Micro F1: 0.758 | Exact Match: 0.452
Thresh 0.35 | Macro F1: 0.421 | Micro F1: 0.761 | Exact Match: 0.468
Thresh 0.40 | Macro F1: 0.395 | Micro F1: 0.757 | Exact Match: 0.460
Thresh 0.45 | Macro F1: 0.361 | Micro F1: 0.749 | Exact Match: 0.439
Thresh 0.50 | Macro F1: 0.337 | Micro F1: 0.740 | Exact Match: 0.412
Thresh 0.55 | Macro F1: 0.314 | Micro F1: 0.727 | Exact Match: 0.379
Thresh 0.60 | Macro F1: 0.288 | Micro F1: 0.710 | Exact Match: 0.348
Thresh 0.65 | Macro F1: 0.264 | Micro F1: 0.688 | Exact Match: 0.311
Thresh 0.70 | Macro F1: 0.239 | Micro F1: 0.658 | Exact Match: 0.268
Thresh 0.75 | Macro F1: 0.209 | Micro F1: 0.616 | Exac

Epoch 3: 100%|██████████| 1365/1365 [02:21<00:00,  9.68it/s]



Epoch 3: Loss = 0.1076
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.579 | Micro F1: 0.730 | Exact Match: 0.299
Thresh 0.15 | Macro F1: 0.601 | Micro F1: 0.783 | Exact Match: 0.422
Thresh 0.20 | Macro F1: 0.605 | Micro F1: 0.813 | Exact Match: 0.508
Thresh 0.25 | Macro F1: 0.596 | Micro F1: 0.828 | Exact Match: 0.561
Thresh 0.30 | Macro F1: 0.582 | Micro F1: 0.838 | Exact Match: 0.593
Thresh 0.35 | Macro F1: 0.566 | Micro F1: 0.843 | Exact Match: 0.615
Thresh 0.40 | Macro F1: 0.553 | Micro F1: 0.846 | Exact Match: 0.626
Thresh 0.45 | Macro F1: 0.535 | Micro F1: 0.845 | Exact Match: 0.624
Thresh 0.50 | Macro F1: 0.524 | Micro F1: 0.843 | Exact Match: 0.615
Thresh 0.55 | Macro F1: 0.508 | Micro F1: 0.838 | Exact Match: 0.598
Thresh 0.60 | Macro F1: 0.485 | Micro F1: 0.830 | Exact Match: 0.572
Thresh 0.65 | Macro F1: 0.460 | Micro F1: 0.820 | Exact Match: 0.542
Thresh 0.70 | Macro F1: 0.426 | Micro F1: 0.806 | Exact Match: 0.509
Thresh 0.75 | Macro F1: 0.389 | Micro F1: 0.791 | Exac

Epoch 4: 100%|██████████| 1365/1365 [02:20<00:00,  9.69it/s]



Epoch 4: Loss = 0.0753
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.704 | Micro F1: 0.828 | Exact Match: 0.500
Thresh 0.15 | Macro F1: 0.750 | Micro F1: 0.871 | Exact Match: 0.627
Thresh 0.20 | Macro F1: 0.772 | Micro F1: 0.894 | Exact Match: 0.703
Thresh 0.25 | Macro F1: 0.782 | Micro F1: 0.908 | Exact Match: 0.750
Thresh 0.30 | Macro F1: 0.785 | Micro F1: 0.916 | Exact Match: 0.780
Thresh 0.35 | Macro F1: 0.777 | Micro F1: 0.919 | Exact Match: 0.793
Thresh 0.40 | Macro F1: 0.776 | Micro F1: 0.919 | Exact Match: 0.792
Thresh 0.45 | Macro F1: 0.765 | Micro F1: 0.918 | Exact Match: 0.785
Thresh 0.50 | Macro F1: 0.749 | Micro F1: 0.915 | Exact Match: 0.771
Thresh 0.55 | Macro F1: 0.736 | Micro F1: 0.910 | Exact Match: 0.756
Thresh 0.60 | Macro F1: 0.720 | Micro F1: 0.904 | Exact Match: 0.736
Thresh 0.65 | Macro F1: 0.699 | Micro F1: 0.897 | Exact Match: 0.714
Thresh 0.70 | Macro F1: 0.658 | Micro F1: 0.887 | Exact Match: 0.682
Thresh 0.75 | Macro F1: 0.609 | Micro F1: 0.872 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.35it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.249
Micro F1: 0.510
Exact Match: 0.288

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.142
Micro F1: 0.737
Exact Match: 0.626

 Domain: UA
Macro F1: 0.234
Micro F1: 0.452
Exact Match: 0.202

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=72 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1365/1365 [02:21<00:00,  9.67it/s]



Epoch 1: Loss = 0.2161
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.279 | Micro F1: 0.465 | Exact Match: 0.001
Thresh 0.15 | Macro F1: 0.297 | Micro F1: 0.542 | Exact Match: 0.059
Thresh 0.20 | Macro F1: 0.282 | Micro F1: 0.581 | Exact Match: 0.137
Thresh 0.25 | Macro F1: 0.265 | Micro F1: 0.595 | Exact Match: 0.200
Thresh 0.30 | Macro F1: 0.250 | Micro F1: 0.602 | Exact Match: 0.242
Thresh 0.35 | Macro F1: 0.232 | Micro F1: 0.598 | Exact Match: 0.261
Thresh 0.40 | Macro F1: 0.215 | Micro F1: 0.586 | Exact Match: 0.248
Thresh 0.45 | Macro F1: 0.194 | Micro F1: 0.568 | Exact Match: 0.225
Thresh 0.50 | Macro F1: 0.175 | Micro F1: 0.548 | Exact Match: 0.196
Thresh 0.55 | Macro F1: 0.158 | Micro F1: 0.519 | Exact Match: 0.164
Thresh 0.60 | Macro F1: 0.137 | Micro F1: 0.478 | Exact Match: 0.128
Thresh 0.65 | Macro F1: 0.119 | Micro F1: 0.439 | Exact Match: 0.101
Thresh 0.70 | Macro F1: 0.099 | Micro F1: 0.388 | Exact Match: 0.077
Thresh 0.75 | Macro F1: 0.083 | Micro F1: 0.335 | Exac

Epoch 2: 100%|██████████| 1365/1365 [02:21<00:00,  9.65it/s]



Epoch 2: Loss = 0.1526
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.466 | Micro F1: 0.635 | Exact Match: 0.143
Thresh 0.15 | Macro F1: 0.479 | Micro F1: 0.697 | Exact Match: 0.249
Thresh 0.20 | Macro F1: 0.481 | Micro F1: 0.737 | Exact Match: 0.345
Thresh 0.25 | Macro F1: 0.482 | Micro F1: 0.764 | Exact Match: 0.428
Thresh 0.30 | Macro F1: 0.465 | Micro F1: 0.775 | Exact Match: 0.469
Thresh 0.35 | Macro F1: 0.446 | Micro F1: 0.779 | Exact Match: 0.487
Thresh 0.40 | Macro F1: 0.427 | Micro F1: 0.779 | Exact Match: 0.489
Thresh 0.45 | Macro F1: 0.400 | Micro F1: 0.773 | Exact Match: 0.471
Thresh 0.50 | Macro F1: 0.373 | Micro F1: 0.763 | Exact Match: 0.442
Thresh 0.55 | Macro F1: 0.339 | Micro F1: 0.750 | Exact Match: 0.410
Thresh 0.60 | Macro F1: 0.300 | Micro F1: 0.736 | Exact Match: 0.376
Thresh 0.65 | Macro F1: 0.273 | Micro F1: 0.718 | Exact Match: 0.341
Thresh 0.70 | Macro F1: 0.247 | Micro F1: 0.697 | Exact Match: 0.305
Thresh 0.75 | Macro F1: 0.224 | Micro F1: 0.672 | Exac

Epoch 3: 100%|██████████| 1365/1365 [02:20<00:00,  9.69it/s]



Epoch 3: Loss = 0.0996
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.646 | Micro F1: 0.769 | Exact Match: 0.370
Thresh 0.15 | Macro F1: 0.691 | Micro F1: 0.823 | Exact Match: 0.493
Thresh 0.20 | Macro F1: 0.690 | Micro F1: 0.853 | Exact Match: 0.581
Thresh 0.25 | Macro F1: 0.681 | Micro F1: 0.867 | Exact Match: 0.640
Thresh 0.30 | Macro F1: 0.670 | Micro F1: 0.875 | Exact Match: 0.672
Thresh 0.35 | Macro F1: 0.656 | Micro F1: 0.879 | Exact Match: 0.689
Thresh 0.40 | Macro F1: 0.630 | Micro F1: 0.879 | Exact Match: 0.693
Thresh 0.45 | Macro F1: 0.614 | Micro F1: 0.876 | Exact Match: 0.682
Thresh 0.50 | Macro F1: 0.592 | Micro F1: 0.870 | Exact Match: 0.663
Thresh 0.55 | Macro F1: 0.566 | Micro F1: 0.864 | Exact Match: 0.640
Thresh 0.60 | Macro F1: 0.529 | Micro F1: 0.855 | Exact Match: 0.612
Thresh 0.65 | Macro F1: 0.499 | Micro F1: 0.843 | Exact Match: 0.574
Thresh 0.70 | Macro F1: 0.459 | Micro F1: 0.828 | Exact Match: 0.533
Thresh 0.75 | Macro F1: 0.421 | Micro F1: 0.813 | Exac

Epoch 4: 100%|██████████| 1365/1365 [02:20<00:00,  9.70it/s]



Epoch 4: Loss = 0.0654
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.771 | Micro F1: 0.867 | Exact Match: 0.597
Thresh 0.15 | Macro F1: 0.809 | Micro F1: 0.899 | Exact Match: 0.697
Thresh 0.20 | Macro F1: 0.822 | Micro F1: 0.918 | Exact Match: 0.768
Thresh 0.25 | Macro F1: 0.821 | Micro F1: 0.927 | Exact Match: 0.800
Thresh 0.30 | Macro F1: 0.821 | Micro F1: 0.932 | Exact Match: 0.817
Thresh 0.35 | Macro F1: 0.800 | Micro F1: 0.934 | Exact Match: 0.826
Thresh 0.40 | Macro F1: 0.789 | Micro F1: 0.934 | Exact Match: 0.827
Thresh 0.45 | Macro F1: 0.771 | Micro F1: 0.933 | Exact Match: 0.823
Thresh 0.50 | Macro F1: 0.757 | Micro F1: 0.931 | Exact Match: 0.814
Thresh 0.55 | Macro F1: 0.742 | Micro F1: 0.927 | Exact Match: 0.803
Thresh 0.60 | Macro F1: 0.728 | Micro F1: 0.923 | Exact Match: 0.787
Thresh 0.65 | Macro F1: 0.701 | Micro F1: 0.917 | Exact Match: 0.765
Thresh 0.70 | Macro F1: 0.673 | Micro F1: 0.910 | Exact Match: 0.740
Thresh 0.75 | Macro F1: 0.646 | Micro F1: 0.898 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.28it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.258
Micro F1: 0.530
Exact Match: 0.308

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.150
Micro F1: 0.745
Exact Match: 0.648

 Domain: UA
Macro F1: 0.244
Micro F1: 0.478
Exact Match: 0.221

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=73 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1365/1365 [02:20<00:00,  9.71it/s]



Epoch 1: Loss = 0.2123
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.304 | Micro F1: 0.505 | Exact Match: 0.042
Thresh 0.15 | Macro F1: 0.307 | Micro F1: 0.569 | Exact Match: 0.083
Thresh 0.20 | Macro F1: 0.299 | Micro F1: 0.604 | Exact Match: 0.154
Thresh 0.25 | Macro F1: 0.275 | Micro F1: 0.620 | Exact Match: 0.231
Thresh 0.30 | Macro F1: 0.251 | Micro F1: 0.624 | Exact Match: 0.269
Thresh 0.35 | Macro F1: 0.232 | Micro F1: 0.619 | Exact Match: 0.276
Thresh 0.40 | Macro F1: 0.218 | Micro F1: 0.612 | Exact Match: 0.263
Thresh 0.45 | Macro F1: 0.203 | Micro F1: 0.599 | Exact Match: 0.245
Thresh 0.50 | Macro F1: 0.191 | Micro F1: 0.586 | Exact Match: 0.224
Thresh 0.55 | Macro F1: 0.175 | Micro F1: 0.568 | Exact Match: 0.197
Thresh 0.60 | Macro F1: 0.160 | Micro F1: 0.548 | Exact Match: 0.172
Thresh 0.65 | Macro F1: 0.143 | Micro F1: 0.522 | Exact Match: 0.145
Thresh 0.70 | Macro F1: 0.127 | Micro F1: 0.492 | Exact Match: 0.116
Thresh 0.75 | Macro F1: 0.108 | Micro F1: 0.456 | Exac

Epoch 2: 100%|██████████| 1365/1365 [02:20<00:00,  9.71it/s]



Epoch 2: Loss = 0.1477
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.469 | Micro F1: 0.646 | Exact Match: 0.160
Thresh 0.15 | Macro F1: 0.486 | Micro F1: 0.713 | Exact Match: 0.277
Thresh 0.20 | Macro F1: 0.490 | Micro F1: 0.751 | Exact Match: 0.386
Thresh 0.25 | Macro F1: 0.479 | Micro F1: 0.771 | Exact Match: 0.464
Thresh 0.30 | Macro F1: 0.462 | Micro F1: 0.778 | Exact Match: 0.500
Thresh 0.35 | Macro F1: 0.447 | Micro F1: 0.780 | Exact Match: 0.510
Thresh 0.40 | Macro F1: 0.416 | Micro F1: 0.773 | Exact Match: 0.492
Thresh 0.45 | Macro F1: 0.386 | Micro F1: 0.764 | Exact Match: 0.469
Thresh 0.50 | Macro F1: 0.362 | Micro F1: 0.755 | Exact Match: 0.438
Thresh 0.55 | Macro F1: 0.334 | Micro F1: 0.742 | Exact Match: 0.406
Thresh 0.60 | Macro F1: 0.307 | Micro F1: 0.725 | Exact Match: 0.371
Thresh 0.65 | Macro F1: 0.277 | Micro F1: 0.708 | Exact Match: 0.336
Thresh 0.70 | Macro F1: 0.248 | Micro F1: 0.688 | Exact Match: 0.299
Thresh 0.75 | Macro F1: 0.219 | Micro F1: 0.659 | Exac

Epoch 3: 100%|██████████| 1365/1365 [02:20<00:00,  9.71it/s]



Epoch 3: Loss = 0.0997
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.636 | Micro F1: 0.764 | Exact Match: 0.357
Thresh 0.15 | Macro F1: 0.670 | Micro F1: 0.819 | Exact Match: 0.490
Thresh 0.20 | Macro F1: 0.667 | Micro F1: 0.845 | Exact Match: 0.574
Thresh 0.25 | Macro F1: 0.667 | Micro F1: 0.859 | Exact Match: 0.630
Thresh 0.30 | Macro F1: 0.664 | Micro F1: 0.869 | Exact Match: 0.665
Thresh 0.35 | Macro F1: 0.649 | Micro F1: 0.873 | Exact Match: 0.677
Thresh 0.40 | Macro F1: 0.635 | Micro F1: 0.874 | Exact Match: 0.684
Thresh 0.45 | Macro F1: 0.613 | Micro F1: 0.871 | Exact Match: 0.675
Thresh 0.50 | Macro F1: 0.589 | Micro F1: 0.866 | Exact Match: 0.655
Thresh 0.55 | Macro F1: 0.571 | Micro F1: 0.860 | Exact Match: 0.639
Thresh 0.60 | Macro F1: 0.552 | Micro F1: 0.852 | Exact Match: 0.613
Thresh 0.65 | Macro F1: 0.517 | Micro F1: 0.838 | Exact Match: 0.574
Thresh 0.70 | Macro F1: 0.483 | Micro F1: 0.823 | Exact Match: 0.536
Thresh 0.75 | Macro F1: 0.431 | Micro F1: 0.802 | Exac

Epoch 4: 100%|██████████| 1365/1365 [02:20<00:00,  9.71it/s]



Epoch 4: Loss = 0.0669
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.750 | Micro F1: 0.849 | Exact Match: 0.560
Thresh 0.15 | Macro F1: 0.782 | Micro F1: 0.884 | Exact Match: 0.661
Thresh 0.20 | Macro F1: 0.788 | Micro F1: 0.901 | Exact Match: 0.721
Thresh 0.25 | Macro F1: 0.784 | Micro F1: 0.911 | Exact Match: 0.771
Thresh 0.30 | Macro F1: 0.767 | Micro F1: 0.914 | Exact Match: 0.784
Thresh 0.35 | Macro F1: 0.757 | Micro F1: 0.917 | Exact Match: 0.795
Thresh 0.40 | Macro F1: 0.748 | Micro F1: 0.919 | Exact Match: 0.799
Thresh 0.45 | Macro F1: 0.724 | Micro F1: 0.917 | Exact Match: 0.796
Thresh 0.50 | Macro F1: 0.711 | Micro F1: 0.916 | Exact Match: 0.788
Thresh 0.55 | Macro F1: 0.690 | Micro F1: 0.912 | Exact Match: 0.774
Thresh 0.60 | Macro F1: 0.678 | Micro F1: 0.907 | Exact Match: 0.759
Thresh 0.65 | Macro F1: 0.650 | Micro F1: 0.901 | Exact Match: 0.738
Thresh 0.70 | Macro F1: 0.624 | Micro F1: 0.894 | Exact Match: 0.716
Thresh 0.75 | Macro F1: 0.597 | Micro F1: 0.884 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.42it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.239
Micro F1: 0.516
Exact Match: 0.324

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.129
Micro F1: 0.745
Exact Match: 0.681

 Domain: UA
Macro F1: 0.225
Micro F1: 0.457
Exact Match: 0.232

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=74 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1365/1365 [02:21<00:00,  9.63it/s]



Epoch 1: Loss = 0.2183
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.270 | Micro F1: 0.458 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.281 | Micro F1: 0.529 | Exact Match: 0.037
Thresh 0.20 | Macro F1: 0.278 | Micro F1: 0.564 | Exact Match: 0.098
Thresh 0.25 | Macro F1: 0.265 | Micro F1: 0.577 | Exact Match: 0.138
Thresh 0.30 | Macro F1: 0.233 | Micro F1: 0.573 | Exact Match: 0.162
Thresh 0.35 | Macro F1: 0.195 | Micro F1: 0.560 | Exact Match: 0.169
Thresh 0.40 | Macro F1: 0.176 | Micro F1: 0.543 | Exact Match: 0.165
Thresh 0.45 | Macro F1: 0.160 | Micro F1: 0.525 | Exact Match: 0.154
Thresh 0.50 | Macro F1: 0.141 | Micro F1: 0.498 | Exact Match: 0.127
Thresh 0.55 | Macro F1: 0.124 | Micro F1: 0.468 | Exact Match: 0.101
Thresh 0.60 | Macro F1: 0.102 | Micro F1: 0.427 | Exact Match: 0.067
Thresh 0.65 | Macro F1: 0.087 | Micro F1: 0.388 | Exact Match: 0.052
Thresh 0.70 | Macro F1: 0.075 | Micro F1: 0.345 | Exact Match: 0.040
Thresh 0.75 | Macro F1: 0.064 | Micro F1: 0.293 | Exac

Epoch 2: 100%|██████████| 1365/1365 [02:20<00:00,  9.70it/s]



Epoch 2: Loss = 0.1608
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.434 | Micro F1: 0.608 | Exact Match: 0.100
Thresh 0.15 | Macro F1: 0.478 | Micro F1: 0.688 | Exact Match: 0.246
Thresh 0.20 | Macro F1: 0.492 | Micro F1: 0.731 | Exact Match: 0.374
Thresh 0.25 | Macro F1: 0.475 | Micro F1: 0.752 | Exact Match: 0.438
Thresh 0.30 | Macro F1: 0.459 | Micro F1: 0.756 | Exact Match: 0.460
Thresh 0.35 | Macro F1: 0.431 | Micro F1: 0.752 | Exact Match: 0.459
Thresh 0.40 | Macro F1: 0.394 | Micro F1: 0.744 | Exact Match: 0.440
Thresh 0.45 | Macro F1: 0.360 | Micro F1: 0.730 | Exact Match: 0.409
Thresh 0.50 | Macro F1: 0.326 | Micro F1: 0.716 | Exact Match: 0.375
Thresh 0.55 | Macro F1: 0.299 | Micro F1: 0.699 | Exact Match: 0.342
Thresh 0.60 | Macro F1: 0.280 | Micro F1: 0.683 | Exact Match: 0.314
Thresh 0.65 | Macro F1: 0.256 | Micro F1: 0.659 | Exact Match: 0.278
Thresh 0.70 | Macro F1: 0.229 | Micro F1: 0.632 | Exact Match: 0.243
Thresh 0.75 | Macro F1: 0.201 | Micro F1: 0.588 | Exac

Epoch 3: 100%|██████████| 1365/1365 [02:20<00:00,  9.68it/s]



Epoch 3: Loss = 0.1106
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.599 | Micro F1: 0.720 | Exact Match: 0.276
Thresh 0.15 | Macro F1: 0.634 | Micro F1: 0.776 | Exact Match: 0.395
Thresh 0.20 | Macro F1: 0.636 | Micro F1: 0.806 | Exact Match: 0.493
Thresh 0.25 | Macro F1: 0.627 | Micro F1: 0.822 | Exact Match: 0.548
Thresh 0.30 | Macro F1: 0.605 | Micro F1: 0.829 | Exact Match: 0.575
Thresh 0.35 | Macro F1: 0.589 | Micro F1: 0.835 | Exact Match: 0.603
Thresh 0.40 | Macro F1: 0.575 | Micro F1: 0.836 | Exact Match: 0.610
Thresh 0.45 | Macro F1: 0.549 | Micro F1: 0.833 | Exact Match: 0.604
Thresh 0.50 | Macro F1: 0.525 | Micro F1: 0.829 | Exact Match: 0.596
Thresh 0.55 | Macro F1: 0.502 | Micro F1: 0.822 | Exact Match: 0.574
Thresh 0.60 | Macro F1: 0.469 | Micro F1: 0.812 | Exact Match: 0.543
Thresh 0.65 | Macro F1: 0.448 | Micro F1: 0.800 | Exact Match: 0.517
Thresh 0.70 | Macro F1: 0.412 | Micro F1: 0.782 | Exact Match: 0.479
Thresh 0.75 | Macro F1: 0.378 | Micro F1: 0.763 | Exac

Epoch 4: 100%|██████████| 1365/1365 [02:20<00:00,  9.68it/s]



Epoch 4: Loss = 0.0744
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.723 | Micro F1: 0.835 | Exact Match: 0.533
Thresh 0.15 | Macro F1: 0.775 | Micro F1: 0.874 | Exact Match: 0.646
Thresh 0.20 | Macro F1: 0.796 | Micro F1: 0.893 | Exact Match: 0.709
Thresh 0.25 | Macro F1: 0.802 | Micro F1: 0.905 | Exact Match: 0.748
Thresh 0.30 | Macro F1: 0.804 | Micro F1: 0.912 | Exact Match: 0.775
Thresh 0.35 | Macro F1: 0.799 | Micro F1: 0.915 | Exact Match: 0.788
Thresh 0.40 | Macro F1: 0.788 | Micro F1: 0.916 | Exact Match: 0.794
Thresh 0.45 | Macro F1: 0.769 | Micro F1: 0.914 | Exact Match: 0.784
Thresh 0.50 | Macro F1: 0.753 | Micro F1: 0.910 | Exact Match: 0.770
Thresh 0.55 | Macro F1: 0.735 | Micro F1: 0.908 | Exact Match: 0.759
Thresh 0.60 | Macro F1: 0.707 | Micro F1: 0.902 | Exact Match: 0.737
Thresh 0.65 | Macro F1: 0.683 | Micro F1: 0.893 | Exact Match: 0.710
Thresh 0.70 | Macro F1: 0.658 | Micro F1: 0.883 | Exact Match: 0.678
Thresh 0.75 | Macro F1: 0.613 | Micro F1: 0.869 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.08it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.248
Micro F1: 0.508
Exact Match: 0.317

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.145
Micro F1: 0.763
Exact Match: 0.659

 Domain: UA
Macro F1: 0.231
Micro F1: 0.445
Exact Match: 0.230

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=75 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1365/1365 [02:20<00:00,  9.71it/s]



Epoch 1: Loss = 0.2122
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.301 | Micro F1: 0.506 | Exact Match: 0.028
Thresh 0.15 | Macro F1: 0.293 | Micro F1: 0.570 | Exact Match: 0.067
Thresh 0.20 | Macro F1: 0.280 | Micro F1: 0.603 | Exact Match: 0.139
Thresh 0.25 | Macro F1: 0.265 | Micro F1: 0.622 | Exact Match: 0.201
Thresh 0.30 | Macro F1: 0.248 | Micro F1: 0.626 | Exact Match: 0.240
Thresh 0.35 | Macro F1: 0.232 | Micro F1: 0.625 | Exact Match: 0.255
Thresh 0.40 | Macro F1: 0.217 | Micro F1: 0.614 | Exact Match: 0.256
Thresh 0.45 | Macro F1: 0.203 | Micro F1: 0.601 | Exact Match: 0.237
Thresh 0.50 | Macro F1: 0.184 | Micro F1: 0.578 | Exact Match: 0.201
Thresh 0.55 | Macro F1: 0.168 | Micro F1: 0.557 | Exact Match: 0.170
Thresh 0.60 | Macro F1: 0.149 | Micro F1: 0.529 | Exact Match: 0.138
Thresh 0.65 | Macro F1: 0.130 | Micro F1: 0.496 | Exact Match: 0.107
Thresh 0.70 | Macro F1: 0.115 | Micro F1: 0.464 | Exact Match: 0.081
Thresh 0.75 | Macro F1: 0.100 | Micro F1: 0.427 | Exac

Epoch 2: 100%|██████████| 1365/1365 [02:20<00:00,  9.71it/s]



Epoch 2: Loss = 0.1444
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.466 | Micro F1: 0.642 | Exact Match: 0.168
Thresh 0.15 | Macro F1: 0.483 | Micro F1: 0.712 | Exact Match: 0.292
Thresh 0.20 | Macro F1: 0.483 | Micro F1: 0.748 | Exact Match: 0.379
Thresh 0.25 | Macro F1: 0.479 | Micro F1: 0.770 | Exact Match: 0.446
Thresh 0.30 | Macro F1: 0.465 | Micro F1: 0.779 | Exact Match: 0.484
Thresh 0.35 | Macro F1: 0.448 | Micro F1: 0.779 | Exact Match: 0.492
Thresh 0.40 | Macro F1: 0.428 | Micro F1: 0.779 | Exact Match: 0.490
Thresh 0.45 | Macro F1: 0.399 | Micro F1: 0.773 | Exact Match: 0.470
Thresh 0.50 | Macro F1: 0.372 | Micro F1: 0.767 | Exact Match: 0.446
Thresh 0.55 | Macro F1: 0.343 | Micro F1: 0.755 | Exact Match: 0.416
Thresh 0.60 | Macro F1: 0.317 | Micro F1: 0.738 | Exact Match: 0.381
Thresh 0.65 | Macro F1: 0.290 | Micro F1: 0.720 | Exact Match: 0.341
Thresh 0.70 | Macro F1: 0.262 | Micro F1: 0.696 | Exact Match: 0.299
Thresh 0.75 | Macro F1: 0.233 | Micro F1: 0.665 | Exac

Epoch 3: 100%|██████████| 1365/1365 [02:21<00:00,  9.67it/s]



Epoch 3: Loss = 0.0959
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.624 | Micro F1: 0.771 | Exact Match: 0.354
Thresh 0.15 | Macro F1: 0.659 | Micro F1: 0.821 | Exact Match: 0.481
Thresh 0.20 | Macro F1: 0.678 | Micro F1: 0.852 | Exact Match: 0.579
Thresh 0.25 | Macro F1: 0.662 | Micro F1: 0.866 | Exact Match: 0.637
Thresh 0.30 | Macro F1: 0.641 | Micro F1: 0.874 | Exact Match: 0.674
Thresh 0.35 | Macro F1: 0.635 | Micro F1: 0.877 | Exact Match: 0.684
Thresh 0.40 | Macro F1: 0.618 | Micro F1: 0.877 | Exact Match: 0.686
Thresh 0.45 | Macro F1: 0.604 | Micro F1: 0.876 | Exact Match: 0.683
Thresh 0.50 | Macro F1: 0.593 | Micro F1: 0.874 | Exact Match: 0.671
Thresh 0.55 | Macro F1: 0.577 | Micro F1: 0.869 | Exact Match: 0.651
Thresh 0.60 | Macro F1: 0.554 | Micro F1: 0.860 | Exact Match: 0.622
Thresh 0.65 | Macro F1: 0.531 | Micro F1: 0.849 | Exact Match: 0.590
Thresh 0.70 | Macro F1: 0.497 | Micro F1: 0.835 | Exact Match: 0.552
Thresh 0.75 | Macro F1: 0.452 | Micro F1: 0.817 | Exac

Epoch 4: 100%|██████████| 1365/1365 [02:21<00:00,  9.63it/s]



Epoch 4: Loss = 0.0627
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.748 | Micro F1: 0.860 | Exact Match: 0.609
Thresh 0.15 | Macro F1: 0.791 | Micro F1: 0.894 | Exact Match: 0.702
Thresh 0.20 | Macro F1: 0.809 | Micro F1: 0.910 | Exact Match: 0.752
Thresh 0.25 | Macro F1: 0.821 | Micro F1: 0.922 | Exact Match: 0.786
Thresh 0.30 | Macro F1: 0.827 | Micro F1: 0.928 | Exact Match: 0.811
Thresh 0.35 | Macro F1: 0.819 | Micro F1: 0.931 | Exact Match: 0.823
Thresh 0.40 | Macro F1: 0.803 | Micro F1: 0.931 | Exact Match: 0.825
Thresh 0.45 | Macro F1: 0.791 | Micro F1: 0.931 | Exact Match: 0.825
Thresh 0.50 | Macro F1: 0.780 | Micro F1: 0.929 | Exact Match: 0.817
Thresh 0.55 | Macro F1: 0.768 | Micro F1: 0.927 | Exact Match: 0.808
Thresh 0.60 | Macro F1: 0.749 | Micro F1: 0.924 | Exact Match: 0.793
Thresh 0.65 | Macro F1: 0.722 | Micro F1: 0.918 | Exact Match: 0.768
Thresh 0.70 | Macro F1: 0.695 | Micro F1: 0.910 | Exact Match: 0.742
Thresh 0.75 | Macro F1: 0.673 | Micro F1: 0.900 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.19it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.243
Micro F1: 0.514
Exact Match: 0.319

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.192
Micro F1: 0.776
Exact Match: 0.659

 Domain: UA
Macro F1: 0.216
Micro F1: 0.447
Exact Match: 0.232

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=42 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1365/1365 [02:21<00:00,  9.64it/s]



Epoch 1: Loss = 0.2163
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.289 | Micro F1: 0.473 | Exact Match: 0.007
Thresh 0.15 | Macro F1: 0.299 | Micro F1: 0.548 | Exact Match: 0.067
Thresh 0.20 | Macro F1: 0.260 | Micro F1: 0.576 | Exact Match: 0.135
Thresh 0.25 | Macro F1: 0.236 | Micro F1: 0.591 | Exact Match: 0.181
Thresh 0.30 | Macro F1: 0.217 | Micro F1: 0.595 | Exact Match: 0.209
Thresh 0.35 | Macro F1: 0.201 | Micro F1: 0.588 | Exact Match: 0.218
Thresh 0.40 | Macro F1: 0.192 | Micro F1: 0.575 | Exact Match: 0.207
Thresh 0.45 | Macro F1: 0.177 | Micro F1: 0.550 | Exact Match: 0.184
Thresh 0.50 | Macro F1: 0.157 | Micro F1: 0.513 | Exact Match: 0.152
Thresh 0.55 | Macro F1: 0.137 | Micro F1: 0.472 | Exact Match: 0.122
Thresh 0.60 | Macro F1: 0.120 | Micro F1: 0.429 | Exact Match: 0.099
Thresh 0.65 | Macro F1: 0.102 | Micro F1: 0.377 | Exact Match: 0.078
Thresh 0.70 | Macro F1: 0.084 | Micro F1: 0.318 | Exact Match: 0.057
Thresh 0.75 | Macro F1: 0.065 | Micro F1: 0.240 | Exac

Epoch 2: 100%|██████████| 1365/1365 [02:21<00:00,  9.66it/s]



Epoch 2: Loss = 0.1519
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.458 | Micro F1: 0.661 | Exact Match: 0.228
Thresh 0.15 | Macro F1: 0.482 | Micro F1: 0.722 | Exact Match: 0.332
Thresh 0.20 | Macro F1: 0.481 | Micro F1: 0.751 | Exact Match: 0.408
Thresh 0.25 | Macro F1: 0.471 | Micro F1: 0.768 | Exact Match: 0.457
Thresh 0.30 | Macro F1: 0.457 | Micro F1: 0.777 | Exact Match: 0.483
Thresh 0.35 | Macro F1: 0.428 | Micro F1: 0.775 | Exact Match: 0.488
Thresh 0.40 | Macro F1: 0.403 | Micro F1: 0.772 | Exact Match: 0.482
Thresh 0.45 | Macro F1: 0.375 | Micro F1: 0.767 | Exact Match: 0.462
Thresh 0.50 | Macro F1: 0.348 | Micro F1: 0.761 | Exact Match: 0.437
Thresh 0.55 | Macro F1: 0.318 | Micro F1: 0.750 | Exact Match: 0.406
Thresh 0.60 | Macro F1: 0.292 | Micro F1: 0.737 | Exact Match: 0.376
Thresh 0.65 | Macro F1: 0.266 | Micro F1: 0.722 | Exact Match: 0.345
Thresh 0.70 | Macro F1: 0.236 | Micro F1: 0.701 | Exact Match: 0.309
Thresh 0.75 | Macro F1: 0.208 | Micro F1: 0.676 | Exac

Epoch 3: 100%|██████████| 1365/1365 [02:21<00:00,  9.67it/s]



Epoch 3: Loss = 0.0987
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.617 | Micro F1: 0.763 | Exact Match: 0.342
Thresh 0.15 | Macro F1: 0.654 | Micro F1: 0.819 | Exact Match: 0.484
Thresh 0.20 | Macro F1: 0.646 | Micro F1: 0.848 | Exact Match: 0.574
Thresh 0.25 | Macro F1: 0.636 | Micro F1: 0.864 | Exact Match: 0.636
Thresh 0.30 | Macro F1: 0.627 | Micro F1: 0.875 | Exact Match: 0.674
Thresh 0.35 | Macro F1: 0.628 | Micro F1: 0.880 | Exact Match: 0.694
Thresh 0.40 | Macro F1: 0.618 | Micro F1: 0.883 | Exact Match: 0.704
Thresh 0.45 | Macro F1: 0.610 | Micro F1: 0.883 | Exact Match: 0.701
Thresh 0.50 | Macro F1: 0.598 | Micro F1: 0.880 | Exact Match: 0.689
Thresh 0.55 | Macro F1: 0.583 | Micro F1: 0.876 | Exact Match: 0.674
Thresh 0.60 | Macro F1: 0.569 | Micro F1: 0.869 | Exact Match: 0.647
Thresh 0.65 | Macro F1: 0.548 | Micro F1: 0.860 | Exact Match: 0.619
Thresh 0.70 | Macro F1: 0.513 | Micro F1: 0.846 | Exact Match: 0.576
Thresh 0.75 | Macro F1: 0.480 | Micro F1: 0.829 | Exac

Epoch 4: 100%|██████████| 1365/1365 [02:21<00:00,  9.67it/s]



Epoch 4: Loss = 0.0645
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.747 | Micro F1: 0.850 | Exact Match: 0.561
Thresh 0.15 | Macro F1: 0.795 | Micro F1: 0.886 | Exact Match: 0.671
Thresh 0.20 | Macro F1: 0.815 | Micro F1: 0.904 | Exact Match: 0.729
Thresh 0.25 | Macro F1: 0.808 | Micro F1: 0.913 | Exact Match: 0.765
Thresh 0.30 | Macro F1: 0.802 | Micro F1: 0.921 | Exact Match: 0.791
Thresh 0.35 | Macro F1: 0.794 | Micro F1: 0.924 | Exact Match: 0.804
Thresh 0.40 | Macro F1: 0.782 | Micro F1: 0.926 | Exact Match: 0.813
Thresh 0.45 | Macro F1: 0.772 | Micro F1: 0.927 | Exact Match: 0.813
Thresh 0.50 | Macro F1: 0.751 | Micro F1: 0.925 | Exact Match: 0.809
Thresh 0.55 | Macro F1: 0.733 | Micro F1: 0.923 | Exact Match: 0.798
Thresh 0.60 | Macro F1: 0.708 | Micro F1: 0.919 | Exact Match: 0.785
Thresh 0.65 | Macro F1: 0.687 | Micro F1: 0.914 | Exact Match: 0.765
Thresh 0.70 | Macro F1: 0.663 | Micro F1: 0.908 | Exact Match: 0.747
Thresh 0.75 | Macro F1: 0.629 | Micro F1: 0.898 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.35it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.264
Micro F1: 0.525
Exact Match: 0.299

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.159
Micro F1: 0.747
Exact Match: 0.604

 Domain: UA
Macro F1: 0.250
Micro F1: 0.471
Exact Match: 0.221

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=43 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1365/1365 [02:21<00:00,  9.67it/s]



Epoch 1: Loss = 0.2113
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.304 | Micro F1: 0.503 | Exact Match: 0.009
Thresh 0.15 | Macro F1: 0.305 | Micro F1: 0.557 | Exact Match: 0.068
Thresh 0.20 | Macro F1: 0.285 | Micro F1: 0.587 | Exact Match: 0.140
Thresh 0.25 | Macro F1: 0.262 | Micro F1: 0.602 | Exact Match: 0.184
Thresh 0.30 | Macro F1: 0.244 | Micro F1: 0.608 | Exact Match: 0.218
Thresh 0.35 | Macro F1: 0.228 | Micro F1: 0.605 | Exact Match: 0.231
Thresh 0.40 | Macro F1: 0.216 | Micro F1: 0.597 | Exact Match: 0.230
Thresh 0.45 | Macro F1: 0.208 | Micro F1: 0.587 | Exact Match: 0.220
Thresh 0.50 | Macro F1: 0.196 | Micro F1: 0.572 | Exact Match: 0.193
Thresh 0.55 | Macro F1: 0.177 | Micro F1: 0.552 | Exact Match: 0.161
Thresh 0.60 | Macro F1: 0.153 | Micro F1: 0.524 | Exact Match: 0.124
Thresh 0.65 | Macro F1: 0.129 | Micro F1: 0.493 | Exact Match: 0.091
Thresh 0.70 | Macro F1: 0.108 | Micro F1: 0.459 | Exact Match: 0.063
Thresh 0.75 | Macro F1: 0.090 | Micro F1: 0.422 | Exac

Epoch 2: 100%|██████████| 1365/1365 [02:21<00:00,  9.65it/s]



Epoch 2: Loss = 0.1445
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.471 | Micro F1: 0.642 | Exact Match: 0.154
Thresh 0.15 | Macro F1: 0.508 | Micro F1: 0.718 | Exact Match: 0.284
Thresh 0.20 | Macro F1: 0.519 | Micro F1: 0.761 | Exact Match: 0.396
Thresh 0.25 | Macro F1: 0.518 | Micro F1: 0.785 | Exact Match: 0.471
Thresh 0.30 | Macro F1: 0.500 | Micro F1: 0.797 | Exact Match: 0.515
Thresh 0.35 | Macro F1: 0.475 | Micro F1: 0.798 | Exact Match: 0.526
Thresh 0.40 | Macro F1: 0.445 | Micro F1: 0.793 | Exact Match: 0.510
Thresh 0.45 | Macro F1: 0.419 | Micro F1: 0.786 | Exact Match: 0.481
Thresh 0.50 | Macro F1: 0.388 | Micro F1: 0.774 | Exact Match: 0.448
Thresh 0.55 | Macro F1: 0.352 | Micro F1: 0.760 | Exact Match: 0.408
Thresh 0.60 | Macro F1: 0.330 | Micro F1: 0.745 | Exact Match: 0.373
Thresh 0.65 | Macro F1: 0.306 | Micro F1: 0.725 | Exact Match: 0.332
Thresh 0.70 | Macro F1: 0.275 | Micro F1: 0.698 | Exact Match: 0.286
Thresh 0.75 | Macro F1: 0.249 | Micro F1: 0.671 | Exac

Epoch 3: 100%|██████████| 1365/1365 [02:21<00:00,  9.63it/s]



Epoch 3: Loss = 0.0974
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.628 | Micro F1: 0.760 | Exact Match: 0.328
Thresh 0.15 | Macro F1: 0.671 | Micro F1: 0.816 | Exact Match: 0.464
Thresh 0.20 | Macro F1: 0.686 | Micro F1: 0.848 | Exact Match: 0.553
Thresh 0.25 | Macro F1: 0.682 | Micro F1: 0.866 | Exact Match: 0.618
Thresh 0.30 | Macro F1: 0.677 | Micro F1: 0.878 | Exact Match: 0.664
Thresh 0.35 | Macro F1: 0.661 | Micro F1: 0.883 | Exact Match: 0.688
Thresh 0.40 | Macro F1: 0.647 | Micro F1: 0.883 | Exact Match: 0.693
Thresh 0.45 | Macro F1: 0.634 | Micro F1: 0.885 | Exact Match: 0.697
Thresh 0.50 | Macro F1: 0.621 | Micro F1: 0.882 | Exact Match: 0.685
Thresh 0.55 | Macro F1: 0.592 | Micro F1: 0.877 | Exact Match: 0.669
Thresh 0.60 | Macro F1: 0.567 | Micro F1: 0.870 | Exact Match: 0.640
Thresh 0.65 | Macro F1: 0.538 | Micro F1: 0.862 | Exact Match: 0.614
Thresh 0.70 | Macro F1: 0.502 | Micro F1: 0.847 | Exact Match: 0.578
Thresh 0.75 | Macro F1: 0.457 | Micro F1: 0.830 | Exac

Epoch 4: 100%|██████████| 1365/1365 [02:21<00:00,  9.68it/s]



Epoch 4: Loss = 0.0663
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.759 | Micro F1: 0.865 | Exact Match: 0.594
Thresh 0.15 | Macro F1: 0.799 | Micro F1: 0.901 | Exact Match: 0.705
Thresh 0.20 | Macro F1: 0.813 | Micro F1: 0.917 | Exact Match: 0.758
Thresh 0.25 | Macro F1: 0.814 | Micro F1: 0.926 | Exact Match: 0.793
Thresh 0.30 | Macro F1: 0.802 | Micro F1: 0.930 | Exact Match: 0.811
Thresh 0.35 | Macro F1: 0.791 | Micro F1: 0.932 | Exact Match: 0.823
Thresh 0.40 | Macro F1: 0.775 | Micro F1: 0.932 | Exact Match: 0.819
Thresh 0.45 | Macro F1: 0.760 | Micro F1: 0.930 | Exact Match: 0.811
Thresh 0.50 | Macro F1: 0.744 | Micro F1: 0.928 | Exact Match: 0.802
Thresh 0.55 | Macro F1: 0.722 | Micro F1: 0.925 | Exact Match: 0.791
Thresh 0.60 | Macro F1: 0.702 | Micro F1: 0.920 | Exact Match: 0.773
Thresh 0.65 | Macro F1: 0.662 | Micro F1: 0.914 | Exact Match: 0.750
Thresh 0.70 | Macro F1: 0.633 | Micro F1: 0.905 | Exact Match: 0.724
Thresh 0.75 | Macro F1: 0.611 | Micro F1: 0.897 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.26it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.255
Micro F1: 0.538
Exact Match: 0.333

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.188
Micro F1: 0.798
Exact Match: 0.692

 Domain: UA
Macro F1: 0.233
Micro F1: 0.473
Exact Match: 0.241

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=44 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1365/1365 [02:20<00:00,  9.71it/s]



Epoch 1: Loss = 0.2131
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.305 | Micro F1: 0.488 | Exact Match: 0.007
Thresh 0.15 | Macro F1: 0.330 | Micro F1: 0.559 | Exact Match: 0.075
Thresh 0.20 | Macro F1: 0.316 | Micro F1: 0.599 | Exact Match: 0.141
Thresh 0.25 | Macro F1: 0.289 | Micro F1: 0.621 | Exact Match: 0.219
Thresh 0.30 | Macro F1: 0.259 | Micro F1: 0.626 | Exact Match: 0.253
Thresh 0.35 | Macro F1: 0.234 | Micro F1: 0.621 | Exact Match: 0.252
Thresh 0.40 | Macro F1: 0.220 | Micro F1: 0.610 | Exact Match: 0.242
Thresh 0.45 | Macro F1: 0.202 | Micro F1: 0.594 | Exact Match: 0.217
Thresh 0.50 | Macro F1: 0.185 | Micro F1: 0.574 | Exact Match: 0.189
Thresh 0.55 | Macro F1: 0.164 | Micro F1: 0.550 | Exact Match: 0.158
Thresh 0.60 | Macro F1: 0.146 | Micro F1: 0.524 | Exact Match: 0.128
Thresh 0.65 | Macro F1: 0.128 | Micro F1: 0.498 | Exact Match: 0.101
Thresh 0.70 | Macro F1: 0.110 | Micro F1: 0.468 | Exact Match: 0.076
Thresh 0.75 | Macro F1: 0.097 | Micro F1: 0.436 | Exac

Epoch 2: 100%|██████████| 1365/1365 [02:20<00:00,  9.70it/s]



Epoch 2: Loss = 0.1446
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.452 | Micro F1: 0.649 | Exact Match: 0.163
Thresh 0.15 | Macro F1: 0.476 | Micro F1: 0.712 | Exact Match: 0.272
Thresh 0.20 | Macro F1: 0.480 | Micro F1: 0.748 | Exact Match: 0.371
Thresh 0.25 | Macro F1: 0.478 | Micro F1: 0.772 | Exact Match: 0.452
Thresh 0.30 | Macro F1: 0.468 | Micro F1: 0.782 | Exact Match: 0.492
Thresh 0.35 | Macro F1: 0.451 | Micro F1: 0.787 | Exact Match: 0.514
Thresh 0.40 | Macro F1: 0.428 | Micro F1: 0.787 | Exact Match: 0.519
Thresh 0.45 | Macro F1: 0.405 | Micro F1: 0.782 | Exact Match: 0.502
Thresh 0.50 | Macro F1: 0.384 | Micro F1: 0.777 | Exact Match: 0.483
Thresh 0.55 | Macro F1: 0.366 | Micro F1: 0.769 | Exact Match: 0.456
Thresh 0.60 | Macro F1: 0.349 | Micro F1: 0.759 | Exact Match: 0.424
Thresh 0.65 | Macro F1: 0.333 | Micro F1: 0.750 | Exact Match: 0.397
Thresh 0.70 | Macro F1: 0.314 | Micro F1: 0.733 | Exact Match: 0.364
Thresh 0.75 | Macro F1: 0.285 | Micro F1: 0.709 | Exac

Epoch 3: 100%|██████████| 1365/1365 [02:21<00:00,  9.64it/s]



Epoch 3: Loss = 0.0958
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.635 | Micro F1: 0.782 | Exact Match: 0.380
Thresh 0.15 | Macro F1: 0.668 | Micro F1: 0.835 | Exact Match: 0.523
Thresh 0.20 | Macro F1: 0.685 | Micro F1: 0.863 | Exact Match: 0.613
Thresh 0.25 | Macro F1: 0.688 | Micro F1: 0.880 | Exact Match: 0.673
Thresh 0.30 | Macro F1: 0.667 | Micro F1: 0.889 | Exact Match: 0.705
Thresh 0.35 | Macro F1: 0.649 | Micro F1: 0.893 | Exact Match: 0.716
Thresh 0.40 | Macro F1: 0.636 | Micro F1: 0.893 | Exact Match: 0.718
Thresh 0.45 | Macro F1: 0.625 | Micro F1: 0.888 | Exact Match: 0.705
Thresh 0.50 | Macro F1: 0.612 | Micro F1: 0.884 | Exact Match: 0.686
Thresh 0.55 | Macro F1: 0.595 | Micro F1: 0.876 | Exact Match: 0.664
Thresh 0.60 | Macro F1: 0.569 | Micro F1: 0.866 | Exact Match: 0.632
Thresh 0.65 | Macro F1: 0.538 | Micro F1: 0.852 | Exact Match: 0.591
Thresh 0.70 | Macro F1: 0.500 | Micro F1: 0.834 | Exact Match: 0.545
Thresh 0.75 | Macro F1: 0.452 | Micro F1: 0.814 | Exac

Epoch 4: 100%|██████████| 1365/1365 [02:20<00:00,  9.69it/s]



Epoch 4: Loss = 0.0643
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.745 | Micro F1: 0.854 | Exact Match: 0.575
Thresh 0.15 | Macro F1: 0.789 | Micro F1: 0.889 | Exact Match: 0.681
Thresh 0.20 | Macro F1: 0.807 | Micro F1: 0.906 | Exact Match: 0.740
Thresh 0.25 | Macro F1: 0.812 | Micro F1: 0.916 | Exact Match: 0.772
Thresh 0.30 | Macro F1: 0.810 | Micro F1: 0.923 | Exact Match: 0.796
Thresh 0.35 | Macro F1: 0.810 | Micro F1: 0.929 | Exact Match: 0.819
Thresh 0.40 | Macro F1: 0.800 | Micro F1: 0.931 | Exact Match: 0.822
Thresh 0.45 | Macro F1: 0.792 | Micro F1: 0.928 | Exact Match: 0.813
Thresh 0.50 | Macro F1: 0.780 | Micro F1: 0.925 | Exact Match: 0.804
Thresh 0.55 | Macro F1: 0.760 | Micro F1: 0.923 | Exact Match: 0.796
Thresh 0.60 | Macro F1: 0.747 | Micro F1: 0.919 | Exact Match: 0.782
Thresh 0.65 | Macro F1: 0.719 | Micro F1: 0.914 | Exact Match: 0.763
Thresh 0.70 | Macro F1: 0.683 | Micro F1: 0.907 | Exact Match: 0.739
Thresh 0.75 | Macro F1: 0.654 | Micro F1: 0.898 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.19it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.248
Micro F1: 0.500
Exact Match: 0.301

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.146
Micro F1: 0.743
Exact Match: 0.626

 Domain: UA
Macro F1: 0.234
Micro F1: 0.439
Exact Match: 0.218

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=71 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 145/145 [00:15<00:00,  9.56it/s]



Epoch 1: Loss = 0.2621
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.169 | Micro F1: 0.406 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.201 | Micro F1: 0.540 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.197 | Micro F1: 0.631 | Exact Match: 0.130
Thresh 0.25 | Macro F1: 0.157 | Micro F1: 0.662 | Exact Match: 0.252
Thresh 0.30 | Macro F1: 0.142 | Micro F1: 0.663 | Exact Match: 0.278
Thresh 0.35 | Macro F1: 0.139 | Micro F1: 0.667 | Exact Match: 0.316
Thresh 0.40 | Macro F1: 0.139 | Micro F1: 0.666 | Exact Match: 0.318
Thresh 0.45 | Macro F1: 0.134 | Micro F1: 0.652 | Exact Match: 0.318
Thresh 0.50 | Macro F1: 0.121 | Micro F1: 0.628 | Exact Match: 0.316
Thresh 0.55 | Macro F1: 0.112 | Micro F1: 0.615 | Exact Match: 0.316
Thresh 0.60 | Macro F1: 0.111 | Micro F1: 0.609 | Exact Match: 0.309
Thresh 0.65 | Macro F1: 0.109 | Micro F1: 0.592 | Exact Match: 0.296
Thresh 0.70 | Macro F1: 0.101 | Micro F1: 0.543 | Exact Match: 0.265
Thresh 0.75 | Macro F1: 0.095 | Micro F1: 0.499 | Exac

Epoch 2: 100%|██████████| 145/145 [00:15<00:00,  9.60it/s]



Epoch 2: Loss = 0.1726
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.233 | Micro F1: 0.558 | Exact Match: 0.201
Thresh 0.15 | Macro F1: 0.252 | Micro F1: 0.652 | Exact Match: 0.305
Thresh 0.20 | Macro F1: 0.265 | Micro F1: 0.740 | Exact Match: 0.435
Thresh 0.25 | Macro F1: 0.235 | Micro F1: 0.750 | Exact Match: 0.397
Thresh 0.30 | Macro F1: 0.210 | Micro F1: 0.749 | Exact Match: 0.373
Thresh 0.35 | Macro F1: 0.205 | Micro F1: 0.749 | Exact Match: 0.371
Thresh 0.40 | Macro F1: 0.208 | Micro F1: 0.754 | Exact Match: 0.380
Thresh 0.45 | Macro F1: 0.208 | Micro F1: 0.752 | Exact Match: 0.391
Thresh 0.50 | Macro F1: 0.207 | Micro F1: 0.749 | Exact Match: 0.397
Thresh 0.55 | Macro F1: 0.203 | Micro F1: 0.745 | Exact Match: 0.395
Thresh 0.60 | Macro F1: 0.195 | Micro F1: 0.731 | Exact Match: 0.380
Thresh 0.65 | Macro F1: 0.174 | Micro F1: 0.715 | Exact Match: 0.362
Thresh 0.70 | Macro F1: 0.163 | Micro F1: 0.698 | Exact Match: 0.340
Thresh 0.75 | Macro F1: 0.154 | Micro F1: 0.680 | Exac

Epoch 3: 100%|██████████| 145/145 [00:15<00:00,  9.55it/s]



Epoch 3: Loss = 0.1407
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.310 | Micro F1: 0.625 | Exact Match: 0.311
Thresh 0.15 | Macro F1: 0.312 | Micro F1: 0.723 | Exact Match: 0.364
Thresh 0.20 | Macro F1: 0.300 | Micro F1: 0.758 | Exact Match: 0.483
Thresh 0.25 | Macro F1: 0.275 | Micro F1: 0.764 | Exact Match: 0.519
Thresh 0.30 | Macro F1: 0.268 | Micro F1: 0.772 | Exact Match: 0.536
Thresh 0.35 | Macro F1: 0.262 | Micro F1: 0.775 | Exact Match: 0.530
Thresh 0.40 | Macro F1: 0.253 | Micro F1: 0.781 | Exact Match: 0.523
Thresh 0.45 | Macro F1: 0.235 | Micro F1: 0.775 | Exact Match: 0.481
Thresh 0.50 | Macro F1: 0.217 | Micro F1: 0.766 | Exact Match: 0.444
Thresh 0.55 | Macro F1: 0.206 | Micro F1: 0.761 | Exact Match: 0.424
Thresh 0.60 | Macro F1: 0.182 | Micro F1: 0.742 | Exact Match: 0.393
Thresh 0.65 | Macro F1: 0.150 | Micro F1: 0.722 | Exact Match: 0.355
Thresh 0.70 | Macro F1: 0.147 | Micro F1: 0.713 | Exact Match: 0.347
Thresh 0.75 | Macro F1: 0.143 | Micro F1: 0.702 | Exac

Epoch 4: 100%|██████████| 145/145 [00:15<00:00,  9.54it/s]



Epoch 4: Loss = 0.1122
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.354 | Micro F1: 0.721 | Exact Match: 0.380
Thresh 0.15 | Macro F1: 0.361 | Micro F1: 0.801 | Exact Match: 0.497
Thresh 0.20 | Macro F1: 0.369 | Micro F1: 0.833 | Exact Match: 0.561
Thresh 0.25 | Macro F1: 0.371 | Micro F1: 0.845 | Exact Match: 0.565
Thresh 0.30 | Macro F1: 0.372 | Micro F1: 0.856 | Exact Match: 0.589
Thresh 0.35 | Macro F1: 0.350 | Micro F1: 0.854 | Exact Match: 0.592
Thresh 0.40 | Macro F1: 0.317 | Micro F1: 0.849 | Exact Match: 0.565
Thresh 0.45 | Macro F1: 0.306 | Micro F1: 0.845 | Exact Match: 0.565
Thresh 0.50 | Macro F1: 0.294 | Micro F1: 0.840 | Exact Match: 0.554
Thresh 0.55 | Macro F1: 0.275 | Micro F1: 0.826 | Exact Match: 0.506
Thresh 0.60 | Macro F1: 0.244 | Micro F1: 0.808 | Exact Match: 0.459
Thresh 0.65 | Macro F1: 0.213 | Micro F1: 0.790 | Exact Match: 0.419
Thresh 0.70 | Macro F1: 0.190 | Micro F1: 0.773 | Exact Match: 0.391
Thresh 0.75 | Macro F1: 0.178 | Micro F1: 0.762 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.25it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.118
Micro F1: 0.415
Exact Match: 0.176

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.163
Micro F1: 0.734
Exact Match: 0.593

 Domain: UA
Macro F1: 0.086
Micro F1: 0.323
Exact Match: 0.070

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=72 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 145/145 [00:15<00:00,  9.57it/s]



Epoch 1: Loss = 0.2609
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.164 | Micro F1: 0.396 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.164 | Micro F1: 0.502 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.137 | Micro F1: 0.584 | Exact Match: 0.231
Thresh 0.25 | Macro F1: 0.131 | Micro F1: 0.614 | Exact Match: 0.325
Thresh 0.30 | Macro F1: 0.132 | Micro F1: 0.652 | Exact Match: 0.352
Thresh 0.35 | Macro F1: 0.119 | Micro F1: 0.644 | Exact Match: 0.365
Thresh 0.40 | Macro F1: 0.110 | Micro F1: 0.633 | Exact Match: 0.365
Thresh 0.45 | Macro F1: 0.111 | Micro F1: 0.631 | Exact Match: 0.361
Thresh 0.50 | Macro F1: 0.109 | Micro F1: 0.617 | Exact Match: 0.350
Thresh 0.55 | Macro F1: 0.109 | Micro F1: 0.609 | Exact Match: 0.339
Thresh 0.60 | Macro F1: 0.106 | Micro F1: 0.587 | Exact Match: 0.330
Thresh 0.65 | Macro F1: 0.095 | Micro F1: 0.532 | Exact Match: 0.327
Thresh 0.70 | Macro F1: 0.077 | Micro F1: 0.475 | Exact Match: 0.325
Thresh 0.75 | Macro F1: 0.075 | Micro F1: 0.458 | Exac

Epoch 2: 100%|██████████| 145/145 [00:15<00:00,  9.65it/s]



Epoch 2: Loss = 0.1805
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.233 | Micro F1: 0.545 | Exact Match: 0.220
Thresh 0.15 | Macro F1: 0.223 | Micro F1: 0.644 | Exact Match: 0.318
Thresh 0.20 | Macro F1: 0.228 | Micro F1: 0.687 | Exact Match: 0.383
Thresh 0.25 | Macro F1: 0.237 | Micro F1: 0.724 | Exact Match: 0.401
Thresh 0.30 | Macro F1: 0.227 | Micro F1: 0.741 | Exact Match: 0.419
Thresh 0.35 | Macro F1: 0.191 | Micro F1: 0.723 | Exact Match: 0.386
Thresh 0.40 | Macro F1: 0.191 | Micro F1: 0.725 | Exact Match: 0.397
Thresh 0.45 | Macro F1: 0.188 | Micro F1: 0.723 | Exact Match: 0.390
Thresh 0.50 | Macro F1: 0.177 | Micro F1: 0.722 | Exact Match: 0.377
Thresh 0.55 | Macro F1: 0.169 | Micro F1: 0.715 | Exact Match: 0.363
Thresh 0.60 | Macro F1: 0.162 | Micro F1: 0.705 | Exact Match: 0.350
Thresh 0.65 | Macro F1: 0.152 | Micro F1: 0.696 | Exact Match: 0.336
Thresh 0.70 | Macro F1: 0.142 | Micro F1: 0.676 | Exact Match: 0.332
Thresh 0.75 | Macro F1: 0.137 | Micro F1: 0.659 | Exac

Epoch 3: 100%|██████████| 145/145 [00:14<00:00,  9.70it/s]



Epoch 3: Loss = 0.1485
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.294 | Micro F1: 0.604 | Exact Match: 0.332
Thresh 0.15 | Macro F1: 0.329 | Micro F1: 0.718 | Exact Match: 0.399
Thresh 0.20 | Macro F1: 0.318 | Micro F1: 0.765 | Exact Match: 0.489
Thresh 0.25 | Macro F1: 0.287 | Micro F1: 0.776 | Exact Match: 0.491
Thresh 0.30 | Macro F1: 0.267 | Micro F1: 0.780 | Exact Match: 0.475
Thresh 0.35 | Macro F1: 0.253 | Micro F1: 0.784 | Exact Match: 0.480
Thresh 0.40 | Macro F1: 0.244 | Micro F1: 0.783 | Exact Match: 0.482
Thresh 0.45 | Macro F1: 0.244 | Micro F1: 0.787 | Exact Match: 0.482
Thresh 0.50 | Macro F1: 0.236 | Micro F1: 0.784 | Exact Match: 0.471
Thresh 0.55 | Macro F1: 0.226 | Micro F1: 0.776 | Exact Match: 0.460
Thresh 0.60 | Macro F1: 0.221 | Micro F1: 0.773 | Exact Match: 0.446
Thresh 0.65 | Macro F1: 0.200 | Micro F1: 0.761 | Exact Match: 0.419
Thresh 0.70 | Macro F1: 0.178 | Micro F1: 0.751 | Exact Match: 0.390
Thresh 0.75 | Macro F1: 0.166 | Micro F1: 0.732 | Exac

Epoch 4: 100%|██████████| 145/145 [00:15<00:00,  9.62it/s]



Epoch 4: Loss = 0.1203
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.409 | Micro F1: 0.697 | Exact Match: 0.374
Thresh 0.15 | Macro F1: 0.405 | Micro F1: 0.779 | Exact Match: 0.455
Thresh 0.20 | Macro F1: 0.390 | Micro F1: 0.812 | Exact Match: 0.525
Thresh 0.25 | Macro F1: 0.373 | Micro F1: 0.828 | Exact Match: 0.583
Thresh 0.30 | Macro F1: 0.353 | Micro F1: 0.830 | Exact Match: 0.587
Thresh 0.35 | Macro F1: 0.333 | Micro F1: 0.831 | Exact Match: 0.581
Thresh 0.40 | Macro F1: 0.318 | Micro F1: 0.830 | Exact Match: 0.574
Thresh 0.45 | Macro F1: 0.296 | Micro F1: 0.826 | Exact Match: 0.547
Thresh 0.50 | Macro F1: 0.283 | Micro F1: 0.824 | Exact Match: 0.531
Thresh 0.55 | Macro F1: 0.265 | Micro F1: 0.818 | Exact Match: 0.511
Thresh 0.60 | Macro F1: 0.254 | Micro F1: 0.810 | Exact Match: 0.498
Thresh 0.65 | Macro F1: 0.235 | Micro F1: 0.801 | Exact Match: 0.473
Thresh 0.70 | Macro F1: 0.224 | Micro F1: 0.792 | Exact Match: 0.462
Thresh 0.75 | Macro F1: 0.188 | Micro F1: 0.763 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.51it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.111
Micro F1: 0.405
Exact Match: 0.232

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.143
Micro F1: 0.697
Exact Match: 0.593

 Domain: UA
Macro F1: 0.083
Micro F1: 0.328
Exact Match: 0.140

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=73 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 145/145 [00:14<00:00,  9.67it/s]



Epoch 1: Loss = 0.2594
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.177 | Micro F1: 0.422 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.191 | Micro F1: 0.503 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.162 | Micro F1: 0.562 | Exact Match: 0.048
Thresh 0.25 | Macro F1: 0.165 | Micro F1: 0.625 | Exact Match: 0.279
Thresh 0.30 | Macro F1: 0.174 | Micro F1: 0.672 | Exact Match: 0.338
Thresh 0.35 | Macro F1: 0.160 | Micro F1: 0.679 | Exact Match: 0.333
Thresh 0.40 | Macro F1: 0.145 | Micro F1: 0.667 | Exact Match: 0.322
Thresh 0.45 | Macro F1: 0.135 | Micro F1: 0.649 | Exact Match: 0.320
Thresh 0.50 | Macro F1: 0.130 | Micro F1: 0.639 | Exact Match: 0.320
Thresh 0.55 | Macro F1: 0.118 | Micro F1: 0.620 | Exact Match: 0.318
Thresh 0.60 | Macro F1: 0.110 | Micro F1: 0.600 | Exact Match: 0.303
Thresh 0.65 | Macro F1: 0.105 | Micro F1: 0.562 | Exact Match: 0.288
Thresh 0.70 | Macro F1: 0.099 | Micro F1: 0.525 | Exact Match: 0.261
Thresh 0.75 | Macro F1: 0.085 | Micro F1: 0.435 | Exac

Epoch 2: 100%|██████████| 145/145 [00:14<00:00,  9.68it/s]



Epoch 2: Loss = 0.1742
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.215 | Micro F1: 0.529 | Exact Match: 0.235
Thresh 0.15 | Macro F1: 0.261 | Micro F1: 0.659 | Exact Match: 0.373
Thresh 0.20 | Macro F1: 0.218 | Micro F1: 0.680 | Exact Match: 0.388
Thresh 0.25 | Macro F1: 0.199 | Micro F1: 0.690 | Exact Match: 0.377
Thresh 0.30 | Macro F1: 0.182 | Micro F1: 0.692 | Exact Match: 0.373
Thresh 0.35 | Macro F1: 0.178 | Micro F1: 0.696 | Exact Match: 0.375
Thresh 0.40 | Macro F1: 0.176 | Micro F1: 0.699 | Exact Match: 0.379
Thresh 0.45 | Macro F1: 0.170 | Micro F1: 0.695 | Exact Match: 0.383
Thresh 0.50 | Macro F1: 0.156 | Micro F1: 0.685 | Exact Match: 0.373
Thresh 0.55 | Macro F1: 0.146 | Micro F1: 0.680 | Exact Match: 0.366
Thresh 0.60 | Macro F1: 0.136 | Micro F1: 0.672 | Exact Match: 0.357
Thresh 0.65 | Macro F1: 0.132 | Micro F1: 0.657 | Exact Match: 0.349
Thresh 0.70 | Macro F1: 0.125 | Micro F1: 0.636 | Exact Match: 0.346
Thresh 0.75 | Macro F1: 0.119 | Micro F1: 0.617 | Exac

Epoch 3: 100%|██████████| 145/145 [00:14<00:00,  9.72it/s]



Epoch 3: Loss = 0.1414
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.368 | Micro F1: 0.655 | Exact Match: 0.336
Thresh 0.15 | Macro F1: 0.331 | Micro F1: 0.697 | Exact Match: 0.366
Thresh 0.20 | Macro F1: 0.319 | Micro F1: 0.749 | Exact Match: 0.440
Thresh 0.25 | Macro F1: 0.311 | Micro F1: 0.780 | Exact Match: 0.497
Thresh 0.30 | Macro F1: 0.314 | Micro F1: 0.799 | Exact Match: 0.547
Thresh 0.35 | Macro F1: 0.284 | Micro F1: 0.795 | Exact Match: 0.542
Thresh 0.40 | Macro F1: 0.265 | Micro F1: 0.797 | Exact Match: 0.534
Thresh 0.45 | Macro F1: 0.243 | Micro F1: 0.795 | Exact Match: 0.514
Thresh 0.50 | Macro F1: 0.235 | Micro F1: 0.795 | Exact Match: 0.497
Thresh 0.55 | Macro F1: 0.219 | Micro F1: 0.789 | Exact Match: 0.464
Thresh 0.60 | Macro F1: 0.204 | Micro F1: 0.777 | Exact Match: 0.438
Thresh 0.65 | Macro F1: 0.200 | Micro F1: 0.772 | Exact Match: 0.425
Thresh 0.70 | Macro F1: 0.184 | Micro F1: 0.760 | Exact Match: 0.392
Thresh 0.75 | Macro F1: 0.172 | Micro F1: 0.748 | Exac

Epoch 4: 100%|██████████| 145/145 [00:14<00:00,  9.72it/s]



Epoch 4: Loss = 0.1113
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.440 | Micro F1: 0.742 | Exact Match: 0.394
Thresh 0.15 | Macro F1: 0.436 | Micro F1: 0.821 | Exact Match: 0.516
Thresh 0.20 | Macro F1: 0.422 | Micro F1: 0.855 | Exact Match: 0.614
Thresh 0.25 | Macro F1: 0.415 | Micro F1: 0.870 | Exact Match: 0.658
Thresh 0.30 | Macro F1: 0.386 | Micro F1: 0.872 | Exact Match: 0.649
Thresh 0.35 | Macro F1: 0.369 | Micro F1: 0.868 | Exact Match: 0.627
Thresh 0.40 | Macro F1: 0.346 | Micro F1: 0.855 | Exact Match: 0.610
Thresh 0.45 | Macro F1: 0.315 | Micro F1: 0.845 | Exact Match: 0.580
Thresh 0.50 | Macro F1: 0.298 | Micro F1: 0.843 | Exact Match: 0.569
Thresh 0.55 | Macro F1: 0.280 | Micro F1: 0.840 | Exact Match: 0.547
Thresh 0.60 | Macro F1: 0.271 | Micro F1: 0.837 | Exact Match: 0.529
Thresh 0.65 | Macro F1: 0.258 | Micro F1: 0.835 | Exact Match: 0.514
Thresh 0.70 | Macro F1: 0.247 | Micro F1: 0.823 | Exact Match: 0.484
Thresh 0.75 | Macro F1: 0.229 | Micro F1: 0.812 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.80it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.123
Micro F1: 0.463
Exact Match: 0.221

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.141
Micro F1: 0.731
Exact Match: 0.648

 Domain: UA
Macro F1: 0.095
Micro F1: 0.387
Exact Match: 0.112

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=74 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 145/145 [00:15<00:00,  9.65it/s]



Epoch 1: Loss = 0.2573
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.169 | Micro F1: 0.412 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.186 | Micro F1: 0.522 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.161 | Micro F1: 0.582 | Exact Match: 0.018
Thresh 0.25 | Macro F1: 0.142 | Micro F1: 0.620 | Exact Match: 0.098
Thresh 0.30 | Macro F1: 0.133 | Micro F1: 0.641 | Exact Match: 0.251
Thresh 0.35 | Macro F1: 0.136 | Micro F1: 0.653 | Exact Match: 0.310
Thresh 0.40 | Macro F1: 0.126 | Micro F1: 0.636 | Exact Match: 0.315
Thresh 0.45 | Macro F1: 0.112 | Micro F1: 0.615 | Exact Match: 0.302
Thresh 0.50 | Macro F1: 0.112 | Micro F1: 0.616 | Exact Match: 0.299
Thresh 0.55 | Macro F1: 0.108 | Micro F1: 0.593 | Exact Match: 0.273
Thresh 0.60 | Macro F1: 0.105 | Micro F1: 0.573 | Exact Match: 0.257
Thresh 0.65 | Macro F1: 0.096 | Micro F1: 0.514 | Exact Match: 0.213
Thresh 0.70 | Macro F1: 0.082 | Micro F1: 0.432 | Exact Match: 0.153
Thresh 0.75 | Macro F1: 0.057 | Micro F1: 0.301 | Exac

Epoch 2: 100%|██████████| 145/145 [00:14<00:00,  9.68it/s]



Epoch 2: Loss = 0.1782
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.255 | Micro F1: 0.543 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.254 | Micro F1: 0.667 | Exact Match: 0.322
Thresh 0.20 | Macro F1: 0.273 | Micro F1: 0.739 | Exact Match: 0.459
Thresh 0.25 | Macro F1: 0.244 | Micro F1: 0.740 | Exact Match: 0.437
Thresh 0.30 | Macro F1: 0.217 | Micro F1: 0.728 | Exact Match: 0.395
Thresh 0.35 | Macro F1: 0.213 | Micro F1: 0.728 | Exact Match: 0.386
Thresh 0.40 | Macro F1: 0.202 | Micro F1: 0.719 | Exact Match: 0.373
Thresh 0.45 | Macro F1: 0.194 | Micro F1: 0.711 | Exact Match: 0.379
Thresh 0.50 | Macro F1: 0.172 | Micro F1: 0.701 | Exact Match: 0.357
Thresh 0.55 | Macro F1: 0.161 | Micro F1: 0.693 | Exact Match: 0.350
Thresh 0.60 | Macro F1: 0.140 | Micro F1: 0.677 | Exact Match: 0.328
Thresh 0.65 | Macro F1: 0.138 | Micro F1: 0.671 | Exact Match: 0.326
Thresh 0.70 | Macro F1: 0.132 | Micro F1: 0.660 | Exact Match: 0.324
Thresh 0.75 | Macro F1: 0.127 | Micro F1: 0.643 | Exac

Epoch 3: 100%|██████████| 145/145 [00:14<00:00,  9.73it/s]



Epoch 3: Loss = 0.1462
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.300 | Micro F1: 0.616 | Exact Match: 0.335
Thresh 0.15 | Macro F1: 0.292 | Micro F1: 0.675 | Exact Match: 0.397
Thresh 0.20 | Macro F1: 0.291 | Micro F1: 0.724 | Exact Match: 0.448
Thresh 0.25 | Macro F1: 0.297 | Micro F1: 0.753 | Exact Match: 0.494
Thresh 0.30 | Macro F1: 0.270 | Micro F1: 0.756 | Exact Match: 0.466
Thresh 0.35 | Macro F1: 0.249 | Micro F1: 0.755 | Exact Match: 0.446
Thresh 0.40 | Macro F1: 0.232 | Micro F1: 0.749 | Exact Match: 0.428
Thresh 0.45 | Macro F1: 0.230 | Micro F1: 0.755 | Exact Match: 0.437
Thresh 0.50 | Macro F1: 0.235 | Micro F1: 0.762 | Exact Match: 0.448
Thresh 0.55 | Macro F1: 0.231 | Micro F1: 0.762 | Exact Match: 0.446
Thresh 0.60 | Macro F1: 0.211 | Micro F1: 0.747 | Exact Match: 0.417
Thresh 0.65 | Macro F1: 0.183 | Micro F1: 0.730 | Exact Match: 0.388
Thresh 0.70 | Macro F1: 0.156 | Micro F1: 0.716 | Exact Match: 0.361
Thresh 0.75 | Macro F1: 0.150 | Micro F1: 0.694 | Exac

Epoch 4: 100%|██████████| 145/145 [00:14<00:00,  9.71it/s]



Epoch 4: Loss = 0.1149
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.428 | Micro F1: 0.733 | Exact Match: 0.408
Thresh 0.15 | Macro F1: 0.443 | Micro F1: 0.818 | Exact Match: 0.523
Thresh 0.20 | Macro F1: 0.439 | Micro F1: 0.851 | Exact Match: 0.588
Thresh 0.25 | Macro F1: 0.415 | Micro F1: 0.859 | Exact Match: 0.621
Thresh 0.30 | Macro F1: 0.386 | Micro F1: 0.857 | Exact Match: 0.614
Thresh 0.35 | Macro F1: 0.385 | Micro F1: 0.860 | Exact Match: 0.616
Thresh 0.40 | Macro F1: 0.326 | Micro F1: 0.848 | Exact Match: 0.568
Thresh 0.45 | Macro F1: 0.294 | Micro F1: 0.839 | Exact Match: 0.539
Thresh 0.50 | Macro F1: 0.285 | Micro F1: 0.834 | Exact Match: 0.519
Thresh 0.55 | Macro F1: 0.268 | Micro F1: 0.822 | Exact Match: 0.490
Thresh 0.60 | Macro F1: 0.248 | Micro F1: 0.809 | Exact Match: 0.463
Thresh 0.65 | Macro F1: 0.223 | Micro F1: 0.794 | Exact Match: 0.432
Thresh 0.70 | Macro F1: 0.186 | Micro F1: 0.771 | Exact Match: 0.388
Thresh 0.75 | Macro F1: 0.165 | Micro F1: 0.749 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.86it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.095
Micro F1: 0.370
Exact Match: 0.152

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.130
Micro F1: 0.649
Exact Match: 0.538

 Domain: UA
Macro F1: 0.069
Micro F1: 0.291
Exact Match: 0.053

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=75 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 145/145 [00:14<00:00,  9.68it/s]



Epoch 1: Loss = 0.2532
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.211 | Micro F1: 0.402 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.208 | Micro F1: 0.513 | Exact Match: 0.068
Thresh 0.20 | Macro F1: 0.210 | Micro F1: 0.604 | Exact Match: 0.185
Thresh 0.25 | Macro F1: 0.193 | Micro F1: 0.649 | Exact Match: 0.282
Thresh 0.30 | Macro F1: 0.172 | Micro F1: 0.662 | Exact Match: 0.304
Thresh 0.35 | Macro F1: 0.148 | Micro F1: 0.653 | Exact Match: 0.295
Thresh 0.40 | Macro F1: 0.146 | Micro F1: 0.650 | Exact Match: 0.302
Thresh 0.45 | Macro F1: 0.136 | Micro F1: 0.638 | Exact Match: 0.295
Thresh 0.50 | Macro F1: 0.130 | Micro F1: 0.626 | Exact Match: 0.284
Thresh 0.55 | Macro F1: 0.122 | Micro F1: 0.616 | Exact Match: 0.278
Thresh 0.60 | Macro F1: 0.115 | Micro F1: 0.592 | Exact Match: 0.253
Thresh 0.65 | Macro F1: 0.108 | Micro F1: 0.583 | Exact Match: 0.247
Thresh 0.70 | Macro F1: 0.101 | Micro F1: 0.558 | Exact Match: 0.227
Thresh 0.75 | Macro F1: 0.092 | Micro F1: 0.503 | Exac

Epoch 2: 100%|██████████| 145/145 [00:14<00:00,  9.67it/s]



Epoch 2: Loss = 0.1684
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.268 | Micro F1: 0.588 | Exact Match: 0.286
Thresh 0.15 | Macro F1: 0.277 | Micro F1: 0.681 | Exact Match: 0.363
Thresh 0.20 | Macro F1: 0.279 | Micro F1: 0.747 | Exact Match: 0.463
Thresh 0.25 | Macro F1: 0.274 | Micro F1: 0.778 | Exact Match: 0.509
Thresh 0.30 | Macro F1: 0.273 | Micro F1: 0.793 | Exact Match: 0.518
Thresh 0.35 | Macro F1: 0.252 | Micro F1: 0.789 | Exact Match: 0.474
Thresh 0.40 | Macro F1: 0.241 | Micro F1: 0.785 | Exact Match: 0.460
Thresh 0.45 | Macro F1: 0.217 | Micro F1: 0.777 | Exact Match: 0.443
Thresh 0.50 | Macro F1: 0.185 | Micro F1: 0.763 | Exact Match: 0.405
Thresh 0.55 | Macro F1: 0.178 | Micro F1: 0.756 | Exact Match: 0.392
Thresh 0.60 | Macro F1: 0.174 | Micro F1: 0.750 | Exact Match: 0.383
Thresh 0.65 | Macro F1: 0.167 | Micro F1: 0.741 | Exact Match: 0.370
Thresh 0.70 | Macro F1: 0.158 | Micro F1: 0.730 | Exact Match: 0.361
Thresh 0.75 | Macro F1: 0.149 | Micro F1: 0.719 | Exac

Epoch 3: 100%|██████████| 145/145 [00:14<00:00,  9.75it/s]



Epoch 3: Loss = 0.1303
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.330 | Micro F1: 0.653 | Exact Match: 0.350
Thresh 0.15 | Macro F1: 0.355 | Micro F1: 0.716 | Exact Match: 0.379
Thresh 0.20 | Macro F1: 0.348 | Micro F1: 0.778 | Exact Match: 0.471
Thresh 0.25 | Macro F1: 0.346 | Micro F1: 0.820 | Exact Match: 0.568
Thresh 0.30 | Macro F1: 0.326 | Micro F1: 0.834 | Exact Match: 0.588
Thresh 0.35 | Macro F1: 0.307 | Micro F1: 0.831 | Exact Match: 0.570
Thresh 0.40 | Macro F1: 0.278 | Micro F1: 0.824 | Exact Match: 0.531
Thresh 0.45 | Macro F1: 0.248 | Micro F1: 0.809 | Exact Match: 0.487
Thresh 0.50 | Macro F1: 0.222 | Micro F1: 0.799 | Exact Match: 0.443
Thresh 0.55 | Macro F1: 0.207 | Micro F1: 0.789 | Exact Match: 0.421
Thresh 0.60 | Macro F1: 0.190 | Micro F1: 0.779 | Exact Match: 0.399
Thresh 0.65 | Macro F1: 0.180 | Micro F1: 0.771 | Exact Match: 0.388
Thresh 0.70 | Macro F1: 0.167 | Micro F1: 0.760 | Exact Match: 0.374
Thresh 0.75 | Macro F1: 0.161 | Micro F1: 0.751 | Exac

Epoch 4: 100%|██████████| 145/145 [00:14<00:00,  9.74it/s]



Epoch 4: Loss = 0.1006
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.483 | Micro F1: 0.743 | Exact Match: 0.434
Thresh 0.15 | Macro F1: 0.455 | Micro F1: 0.824 | Exact Match: 0.513
Thresh 0.20 | Macro F1: 0.469 | Micro F1: 0.866 | Exact Match: 0.604
Thresh 0.25 | Macro F1: 0.457 | Micro F1: 0.882 | Exact Match: 0.643
Thresh 0.30 | Macro F1: 0.434 | Micro F1: 0.889 | Exact Match: 0.672
Thresh 0.35 | Macro F1: 0.412 | Micro F1: 0.888 | Exact Match: 0.667
Thresh 0.40 | Macro F1: 0.389 | Micro F1: 0.886 | Exact Match: 0.650
Thresh 0.45 | Macro F1: 0.340 | Micro F1: 0.880 | Exact Match: 0.634
Thresh 0.50 | Macro F1: 0.320 | Micro F1: 0.874 | Exact Match: 0.612
Thresh 0.55 | Macro F1: 0.312 | Micro F1: 0.873 | Exact Match: 0.601
Thresh 0.60 | Macro F1: 0.297 | Micro F1: 0.869 | Exact Match: 0.579
Thresh 0.65 | Macro F1: 0.276 | Micro F1: 0.859 | Exact Match: 0.548
Thresh 0.70 | Macro F1: 0.264 | Micro F1: 0.851 | Exact Match: 0.524
Thresh 0.75 | Macro F1: 0.236 | Micro F1: 0.830 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.85it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.116
Micro F1: 0.420
Exact Match: 0.221

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.152
Micro F1: 0.703
Exact Match: 0.626

 Domain: UA
Macro F1: 0.088
Micro F1: 0.343
Exact Match: 0.118

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=42 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 145/145 [00:14<00:00,  9.68it/s]



Epoch 1: Loss = 0.2581
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.202 | Micro F1: 0.421 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.191 | Micro F1: 0.510 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.157 | Micro F1: 0.590 | Exact Match: 0.160
Thresh 0.25 | Macro F1: 0.167 | Micro F1: 0.643 | Exact Match: 0.240
Thresh 0.30 | Macro F1: 0.163 | Micro F1: 0.663 | Exact Match: 0.307
Thresh 0.35 | Macro F1: 0.157 | Micro F1: 0.673 | Exact Match: 0.318
Thresh 0.40 | Macro F1: 0.154 | Micro F1: 0.673 | Exact Match: 0.329
Thresh 0.45 | Macro F1: 0.147 | Micro F1: 0.657 | Exact Match: 0.329
Thresh 0.50 | Macro F1: 0.137 | Micro F1: 0.654 | Exact Match: 0.331
Thresh 0.55 | Macro F1: 0.130 | Micro F1: 0.633 | Exact Match: 0.322
Thresh 0.60 | Macro F1: 0.117 | Micro F1: 0.604 | Exact Match: 0.316
Thresh 0.65 | Macro F1: 0.107 | Micro F1: 0.575 | Exact Match: 0.304
Thresh 0.70 | Macro F1: 0.100 | Micro F1: 0.540 | Exact Match: 0.284
Thresh 0.75 | Macro F1: 0.088 | Micro F1: 0.469 | Exac

Epoch 2: 100%|██████████| 145/145 [00:15<00:00,  9.67it/s]



Epoch 2: Loss = 0.1763
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.234 | Micro F1: 0.568 | Exact Match: 0.313
Thresh 0.15 | Macro F1: 0.238 | Micro F1: 0.674 | Exact Match: 0.396
Thresh 0.20 | Macro F1: 0.205 | Micro F1: 0.687 | Exact Match: 0.373
Thresh 0.25 | Macro F1: 0.199 | Micro F1: 0.700 | Exact Match: 0.373
Thresh 0.30 | Macro F1: 0.187 | Micro F1: 0.707 | Exact Match: 0.382
Thresh 0.35 | Macro F1: 0.192 | Micro F1: 0.717 | Exact Match: 0.393
Thresh 0.40 | Macro F1: 0.177 | Micro F1: 0.708 | Exact Match: 0.391
Thresh 0.45 | Macro F1: 0.170 | Micro F1: 0.704 | Exact Match: 0.382
Thresh 0.50 | Macro F1: 0.155 | Micro F1: 0.687 | Exact Match: 0.371
Thresh 0.55 | Macro F1: 0.147 | Micro F1: 0.673 | Exact Match: 0.360
Thresh 0.60 | Macro F1: 0.139 | Micro F1: 0.659 | Exact Match: 0.353
Thresh 0.65 | Macro F1: 0.129 | Micro F1: 0.622 | Exact Match: 0.342
Thresh 0.70 | Macro F1: 0.118 | Micro F1: 0.585 | Exact Match: 0.340
Thresh 0.75 | Macro F1: 0.104 | Micro F1: 0.542 | Exac

Epoch 3: 100%|██████████| 145/145 [00:14<00:00,  9.69it/s]



Epoch 3: Loss = 0.1477
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.316 | Micro F1: 0.646 | Exact Match: 0.351
Thresh 0.15 | Macro F1: 0.340 | Micro F1: 0.710 | Exact Match: 0.409
Thresh 0.20 | Macro F1: 0.328 | Micro F1: 0.751 | Exact Match: 0.478
Thresh 0.25 | Macro F1: 0.337 | Micro F1: 0.786 | Exact Match: 0.549
Thresh 0.30 | Macro F1: 0.287 | Micro F1: 0.786 | Exact Match: 0.529
Thresh 0.35 | Macro F1: 0.267 | Micro F1: 0.782 | Exact Match: 0.498
Thresh 0.40 | Macro F1: 0.251 | Micro F1: 0.777 | Exact Match: 0.473
Thresh 0.45 | Macro F1: 0.245 | Micro F1: 0.775 | Exact Match: 0.462
Thresh 0.50 | Macro F1: 0.218 | Micro F1: 0.763 | Exact Match: 0.438
Thresh 0.55 | Macro F1: 0.208 | Micro F1: 0.754 | Exact Match: 0.418
Thresh 0.60 | Macro F1: 0.203 | Micro F1: 0.749 | Exact Match: 0.407
Thresh 0.65 | Macro F1: 0.181 | Micro F1: 0.728 | Exact Match: 0.369
Thresh 0.70 | Macro F1: 0.155 | Micro F1: 0.707 | Exact Match: 0.340
Thresh 0.75 | Macro F1: 0.151 | Micro F1: 0.696 | Exac

Epoch 4: 100%|██████████| 145/145 [00:14<00:00,  9.67it/s]



Epoch 4: Loss = 0.1178
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.391 | Micro F1: 0.713 | Exact Match: 0.396
Thresh 0.15 | Macro F1: 0.430 | Micro F1: 0.798 | Exact Match: 0.467
Thresh 0.20 | Macro F1: 0.433 | Micro F1: 0.837 | Exact Match: 0.556
Thresh 0.25 | Macro F1: 0.434 | Micro F1: 0.859 | Exact Match: 0.611
Thresh 0.30 | Macro F1: 0.408 | Micro F1: 0.868 | Exact Match: 0.636
Thresh 0.35 | Macro F1: 0.390 | Micro F1: 0.867 | Exact Match: 0.638
Thresh 0.40 | Macro F1: 0.375 | Micro F1: 0.862 | Exact Match: 0.629
Thresh 0.45 | Macro F1: 0.357 | Micro F1: 0.855 | Exact Match: 0.591
Thresh 0.50 | Macro F1: 0.332 | Micro F1: 0.848 | Exact Match: 0.576
Thresh 0.55 | Macro F1: 0.288 | Micro F1: 0.835 | Exact Match: 0.542
Thresh 0.60 | Macro F1: 0.265 | Micro F1: 0.824 | Exact Match: 0.513
Thresh 0.65 | Macro F1: 0.257 | Micro F1: 0.817 | Exact Match: 0.502
Thresh 0.70 | Macro F1: 0.225 | Micro F1: 0.799 | Exact Match: 0.449
Thresh 0.75 | Macro F1: 0.202 | Micro F1: 0.781 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.53it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.123
Micro F1: 0.457
Exact Match: 0.176

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.148
Micro F1: 0.700
Exact Match: 0.582

 Domain: UA
Macro F1: 0.092
Micro F1: 0.388
Exact Match: 0.073

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=43 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 145/145 [00:15<00:00,  9.66it/s]



Epoch 1: Loss = 0.2536
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.188 | Micro F1: 0.435 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.204 | Micro F1: 0.544 | Exact Match: 0.018
Thresh 0.20 | Macro F1: 0.168 | Micro F1: 0.621 | Exact Match: 0.190
Thresh 0.25 | Macro F1: 0.164 | Micro F1: 0.654 | Exact Match: 0.287
Thresh 0.30 | Macro F1: 0.145 | Micro F1: 0.661 | Exact Match: 0.320
Thresh 0.35 | Macro F1: 0.143 | Micro F1: 0.667 | Exact Match: 0.340
Thresh 0.40 | Macro F1: 0.138 | Micro F1: 0.668 | Exact Match: 0.340
Thresh 0.45 | Macro F1: 0.134 | Micro F1: 0.656 | Exact Match: 0.340
Thresh 0.50 | Macro F1: 0.129 | Micro F1: 0.648 | Exact Match: 0.340
Thresh 0.55 | Macro F1: 0.122 | Micro F1: 0.633 | Exact Match: 0.331
Thresh 0.60 | Macro F1: 0.111 | Micro F1: 0.612 | Exact Match: 0.325
Thresh 0.65 | Macro F1: 0.107 | Micro F1: 0.585 | Exact Match: 0.320
Thresh 0.70 | Macro F1: 0.101 | Micro F1: 0.548 | Exact Match: 0.307
Thresh 0.75 | Macro F1: 0.085 | Micro F1: 0.476 | Exac

Epoch 2: 100%|██████████| 145/145 [00:15<00:00,  9.60it/s]



Epoch 2: Loss = 0.1727
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.259 | Micro F1: 0.561 | Exact Match: 0.234
Thresh 0.15 | Macro F1: 0.277 | Micro F1: 0.652 | Exact Match: 0.353
Thresh 0.20 | Macro F1: 0.292 | Micro F1: 0.724 | Exact Match: 0.411
Thresh 0.25 | Macro F1: 0.270 | Micro F1: 0.750 | Exact Match: 0.448
Thresh 0.30 | Macro F1: 0.252 | Micro F1: 0.764 | Exact Match: 0.457
Thresh 0.35 | Macro F1: 0.225 | Micro F1: 0.761 | Exact Match: 0.435
Thresh 0.40 | Macro F1: 0.210 | Micro F1: 0.760 | Exact Match: 0.411
Thresh 0.45 | Macro F1: 0.204 | Micro F1: 0.758 | Exact Match: 0.404
Thresh 0.50 | Macro F1: 0.199 | Micro F1: 0.753 | Exact Match: 0.395
Thresh 0.55 | Macro F1: 0.189 | Micro F1: 0.746 | Exact Match: 0.384
Thresh 0.60 | Macro F1: 0.172 | Micro F1: 0.735 | Exact Match: 0.373
Thresh 0.65 | Macro F1: 0.157 | Micro F1: 0.717 | Exact Match: 0.362
Thresh 0.70 | Macro F1: 0.150 | Micro F1: 0.705 | Exact Match: 0.349
Thresh 0.75 | Macro F1: 0.146 | Micro F1: 0.692 | Exac

Epoch 3: 100%|██████████| 145/145 [00:14<00:00,  9.72it/s]



Epoch 3: Loss = 0.1339
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.353 | Micro F1: 0.671 | Exact Match: 0.336
Thresh 0.15 | Macro F1: 0.355 | Micro F1: 0.754 | Exact Match: 0.424
Thresh 0.20 | Macro F1: 0.345 | Micro F1: 0.795 | Exact Match: 0.488
Thresh 0.25 | Macro F1: 0.338 | Micro F1: 0.818 | Exact Match: 0.536
Thresh 0.30 | Macro F1: 0.318 | Micro F1: 0.828 | Exact Match: 0.541
Thresh 0.35 | Macro F1: 0.304 | Micro F1: 0.827 | Exact Match: 0.534
Thresh 0.40 | Macro F1: 0.290 | Micro F1: 0.827 | Exact Match: 0.519
Thresh 0.45 | Macro F1: 0.284 | Micro F1: 0.826 | Exact Match: 0.510
Thresh 0.50 | Macro F1: 0.272 | Micro F1: 0.821 | Exact Match: 0.497
Thresh 0.55 | Macro F1: 0.231 | Micro F1: 0.804 | Exact Match: 0.446
Thresh 0.60 | Macro F1: 0.205 | Micro F1: 0.792 | Exact Match: 0.408
Thresh 0.65 | Macro F1: 0.184 | Micro F1: 0.779 | Exact Match: 0.377
Thresh 0.70 | Macro F1: 0.173 | Micro F1: 0.769 | Exact Match: 0.366
Thresh 0.75 | Macro F1: 0.163 | Micro F1: 0.751 | Exac

Epoch 4: 100%|██████████| 145/145 [00:14<00:00,  9.67it/s]



Epoch 4: Loss = 0.1010
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.437 | Micro F1: 0.732 | Exact Match: 0.391
Thresh 0.15 | Macro F1: 0.437 | Micro F1: 0.804 | Exact Match: 0.468
Thresh 0.20 | Macro F1: 0.438 | Micro F1: 0.849 | Exact Match: 0.583
Thresh 0.25 | Macro F1: 0.447 | Micro F1: 0.882 | Exact Match: 0.658
Thresh 0.30 | Macro F1: 0.449 | Micro F1: 0.895 | Exact Match: 0.702
Thresh 0.35 | Macro F1: 0.434 | Micro F1: 0.897 | Exact Match: 0.702
Thresh 0.40 | Macro F1: 0.426 | Micro F1: 0.893 | Exact Match: 0.684
Thresh 0.45 | Macro F1: 0.391 | Micro F1: 0.887 | Exact Match: 0.656
Thresh 0.50 | Macro F1: 0.363 | Micro F1: 0.874 | Exact Match: 0.620
Thresh 0.55 | Macro F1: 0.348 | Micro F1: 0.868 | Exact Match: 0.592
Thresh 0.60 | Macro F1: 0.312 | Micro F1: 0.852 | Exact Match: 0.545
Thresh 0.65 | Macro F1: 0.288 | Micro F1: 0.842 | Exact Match: 0.514
Thresh 0.70 | Macro F1: 0.258 | Micro F1: 0.826 | Exact Match: 0.470
Thresh 0.75 | Macro F1: 0.213 | Micro F1: 0.803 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.12it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.118
Micro F1: 0.430
Exact Match: 0.201

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.144
Micro F1: 0.696
Exact Match: 0.626

 Domain: UA
Macro F1: 0.092
Micro F1: 0.354
Exact Match: 0.092

--- Running STL: entity_framing | fine | distilbert-base-uncased | Seed=44 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 145/145 [00:15<00:00,  9.65it/s]



Epoch 1: Loss = 0.2557
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.184 | Micro F1: 0.431 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.200 | Micro F1: 0.536 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.154 | Micro F1: 0.570 | Exact Match: 0.018
Thresh 0.25 | Macro F1: 0.163 | Micro F1: 0.620 | Exact Match: 0.140
Thresh 0.30 | Macro F1: 0.157 | Micro F1: 0.662 | Exact Match: 0.281
Thresh 0.35 | Macro F1: 0.152 | Micro F1: 0.674 | Exact Match: 0.316
Thresh 0.40 | Macro F1: 0.138 | Micro F1: 0.666 | Exact Match: 0.318
Thresh 0.45 | Macro F1: 0.129 | Micro F1: 0.648 | Exact Match: 0.316
Thresh 0.50 | Macro F1: 0.122 | Micro F1: 0.639 | Exact Match: 0.314
Thresh 0.55 | Macro F1: 0.113 | Micro F1: 0.623 | Exact Match: 0.307
Thresh 0.60 | Macro F1: 0.113 | Micro F1: 0.615 | Exact Match: 0.300
Thresh 0.65 | Macro F1: 0.108 | Micro F1: 0.581 | Exact Match: 0.268
Thresh 0.70 | Macro F1: 0.103 | Micro F1: 0.551 | Exact Match: 0.243
Thresh 0.75 | Macro F1: 0.091 | Micro F1: 0.480 | Exac

Epoch 2: 100%|██████████| 145/145 [00:15<00:00,  9.63it/s]



Epoch 2: Loss = 0.1719
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.260 | Micro F1: 0.576 | Exact Match: 0.160
Thresh 0.15 | Macro F1: 0.258 | Micro F1: 0.679 | Exact Match: 0.318
Thresh 0.20 | Macro F1: 0.243 | Micro F1: 0.725 | Exact Match: 0.399
Thresh 0.25 | Macro F1: 0.251 | Micro F1: 0.755 | Exact Match: 0.428
Thresh 0.30 | Macro F1: 0.242 | Micro F1: 0.762 | Exact Match: 0.423
Thresh 0.35 | Macro F1: 0.228 | Micro F1: 0.772 | Exact Match: 0.412
Thresh 0.40 | Macro F1: 0.215 | Micro F1: 0.774 | Exact Match: 0.414
Thresh 0.45 | Macro F1: 0.213 | Micro F1: 0.774 | Exact Match: 0.423
Thresh 0.50 | Macro F1: 0.198 | Micro F1: 0.765 | Exact Match: 0.397
Thresh 0.55 | Macro F1: 0.182 | Micro F1: 0.755 | Exact Match: 0.384
Thresh 0.60 | Macro F1: 0.167 | Micro F1: 0.743 | Exact Match: 0.360
Thresh 0.65 | Macro F1: 0.159 | Micro F1: 0.732 | Exact Match: 0.351
Thresh 0.70 | Macro F1: 0.156 | Micro F1: 0.720 | Exact Match: 0.344
Thresh 0.75 | Macro F1: 0.153 | Micro F1: 0.710 | Exac

Epoch 3: 100%|██████████| 145/145 [00:14<00:00,  9.70it/s]



Epoch 3: Loss = 0.1342
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.324 | Micro F1: 0.659 | Exact Match: 0.307
Thresh 0.15 | Macro F1: 0.329 | Micro F1: 0.719 | Exact Match: 0.351
Thresh 0.20 | Macro F1: 0.318 | Micro F1: 0.762 | Exact Match: 0.408
Thresh 0.25 | Macro F1: 0.312 | Micro F1: 0.793 | Exact Match: 0.482
Thresh 0.30 | Macro F1: 0.294 | Micro F1: 0.810 | Exact Match: 0.515
Thresh 0.35 | Macro F1: 0.286 | Micro F1: 0.818 | Exact Match: 0.493
Thresh 0.40 | Macro F1: 0.273 | Micro F1: 0.816 | Exact Match: 0.471
Thresh 0.45 | Macro F1: 0.258 | Micro F1: 0.812 | Exact Match: 0.461
Thresh 0.50 | Macro F1: 0.249 | Micro F1: 0.809 | Exact Match: 0.467
Thresh 0.55 | Macro F1: 0.237 | Micro F1: 0.804 | Exact Match: 0.467
Thresh 0.60 | Macro F1: 0.215 | Micro F1: 0.797 | Exact Match: 0.456
Thresh 0.65 | Macro F1: 0.201 | Micro F1: 0.792 | Exact Match: 0.445
Thresh 0.70 | Macro F1: 0.199 | Micro F1: 0.789 | Exact Match: 0.432
Thresh 0.75 | Macro F1: 0.194 | Micro F1: 0.780 | Exac

Epoch 4: 100%|██████████| 145/145 [00:15<00:00,  9.65it/s]



Epoch 4: Loss = 0.1069
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.428 | Micro F1: 0.718 | Exact Match: 0.371
Thresh 0.15 | Macro F1: 0.402 | Micro F1: 0.773 | Exact Match: 0.395
Thresh 0.20 | Macro F1: 0.425 | Micro F1: 0.813 | Exact Match: 0.452
Thresh 0.25 | Macro F1: 0.437 | Micro F1: 0.846 | Exact Match: 0.550
Thresh 0.30 | Macro F1: 0.448 | Micro F1: 0.873 | Exact Match: 0.634
Thresh 0.35 | Macro F1: 0.442 | Micro F1: 0.884 | Exact Match: 0.656
Thresh 0.40 | Macro F1: 0.403 | Micro F1: 0.883 | Exact Match: 0.660
Thresh 0.45 | Macro F1: 0.386 | Micro F1: 0.882 | Exact Match: 0.638
Thresh 0.50 | Macro F1: 0.356 | Micro F1: 0.876 | Exact Match: 0.614
Thresh 0.55 | Macro F1: 0.325 | Micro F1: 0.866 | Exact Match: 0.579
Thresh 0.60 | Macro F1: 0.296 | Micro F1: 0.854 | Exact Match: 0.542
Thresh 0.65 | Macro F1: 0.270 | Micro F1: 0.844 | Exact Match: 0.507
Thresh 0.70 | Macro F1: 0.233 | Micro F1: 0.822 | Exact Match: 0.447
Thresh 0.75 | Macro F1: 0.195 | Micro F1: 0.802 | Exac

Evaluating TEST: 100%|██████████| 56/56 [00:02<00:00, 21.06it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.116
Micro F1: 0.414
Exact Match: 0.190

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.159
Micro F1: 0.711
Exact Match: 0.604

 Domain: UA
Macro F1: 0.088
Micro F1: 0.333
Exact Match: 0.084

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=71 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 354/354 [00:35<00:00, 10.00it/s]



Epoch 1: Loss = 0.2214
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.068 | Micro F1: 0.287 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.059 | Micro F1: 0.339 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.051 | Micro F1: 0.355 | Exact Match: 0.007
Thresh 0.25 | Macro F1: 0.036 | Micro F1: 0.331 | Exact Match: 0.003
Thresh 0.30 | Macro F1: 0.033 | Micro F1: 0.314 | Exact Match: 0.000
Thresh 0.35 | Macro F1: 0.029 | Micro F1: 0.274 | Exact Match: 0.000
Thresh 0.40 | Macro F1: 0.022 | Micro F1: 0.215 | Exact Match: 0.010
Thresh 0.45 | Macro F1: 0.018 | Micro F1: 0.166 | Exact Match: 0.017
Thresh 0.50 | Macro F1: 0.011 | Micro F1: 0.099 | Exact Match: 0.003
Thresh 0.55 | Macro F1: 0.005 | Micro F1: 0.047 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.003 | Micro F1: 0.026 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.002 | Micro F1: 0.014 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 354/354 [00:35<00:00, 10.03it/s]



Epoch 2: Loss = 0.1389
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.153 | Micro F1: 0.402 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.152 | Micro F1: 0.490 | Exact Match: 0.048
Thresh 0.20 | Macro F1: 0.131 | Micro F1: 0.523 | Exact Match: 0.138
Thresh 0.25 | Macro F1: 0.111 | Micro F1: 0.524 | Exact Match: 0.162
Thresh 0.30 | Macro F1: 0.098 | Micro F1: 0.489 | Exact Match: 0.134
Thresh 0.35 | Macro F1: 0.080 | Micro F1: 0.453 | Exact Match: 0.090
Thresh 0.40 | Macro F1: 0.069 | Micro F1: 0.409 | Exact Match: 0.036
Thresh 0.45 | Macro F1: 0.060 | Micro F1: 0.360 | Exact Match: 0.024
Thresh 0.50 | Macro F1: 0.050 | Micro F1: 0.298 | Exact Match: 0.017
Thresh 0.55 | Macro F1: 0.041 | Micro F1: 0.244 | Exact Match: 0.009
Thresh 0.60 | Macro F1: 0.033 | Micro F1: 0.200 | Exact Match: 0.005
Thresh 0.65 | Macro F1: 0.027 | Micro F1: 0.160 | Exact Match: 0.002
Thresh 0.70 | Macro F1: 0.020 | Micro F1: 0.120 | Exact Match: 0.002
Thresh 0.75 | Macro F1: 0.012 | Micro F1: 0.084 | Exac

Epoch 3: 100%|██████████| 354/354 [00:35<00:00,  9.97it/s]



Epoch 3: Loss = 0.1128
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.259 | Micro F1: 0.426 | Exact Match: 0.016
Thresh 0.15 | Macro F1: 0.294 | Micro F1: 0.535 | Exact Match: 0.074
Thresh 0.20 | Macro F1: 0.288 | Micro F1: 0.598 | Exact Match: 0.148
Thresh 0.25 | Macro F1: 0.273 | Micro F1: 0.635 | Exact Match: 0.197
Thresh 0.30 | Macro F1: 0.251 | Micro F1: 0.653 | Exact Match: 0.250
Thresh 0.35 | Macro F1: 0.222 | Micro F1: 0.655 | Exact Match: 0.262
Thresh 0.40 | Macro F1: 0.201 | Micro F1: 0.644 | Exact Match: 0.236
Thresh 0.45 | Macro F1: 0.168 | Micro F1: 0.625 | Exact Match: 0.205
Thresh 0.50 | Macro F1: 0.147 | Micro F1: 0.597 | Exact Match: 0.166
Thresh 0.55 | Macro F1: 0.124 | Micro F1: 0.556 | Exact Match: 0.128
Thresh 0.60 | Macro F1: 0.105 | Micro F1: 0.515 | Exact Match: 0.090
Thresh 0.65 | Macro F1: 0.087 | Micro F1: 0.471 | Exact Match: 0.069
Thresh 0.70 | Macro F1: 0.072 | Micro F1: 0.418 | Exact Match: 0.038
Thresh 0.75 | Macro F1: 0.060 | Micro F1: 0.375 | Exac

Epoch 4: 100%|██████████| 354/354 [00:35<00:00,  9.96it/s]



Epoch 4: Loss = 0.0905
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.353 | Micro F1: 0.533 | Exact Match: 0.159
Thresh 0.15 | Macro F1: 0.403 | Micro F1: 0.650 | Exact Match: 0.252
Thresh 0.20 | Macro F1: 0.393 | Micro F1: 0.722 | Exact Match: 0.328
Thresh 0.25 | Macro F1: 0.394 | Micro F1: 0.764 | Exact Match: 0.371
Thresh 0.30 | Macro F1: 0.395 | Micro F1: 0.786 | Exact Match: 0.398
Thresh 0.35 | Macro F1: 0.366 | Micro F1: 0.787 | Exact Match: 0.384
Thresh 0.40 | Macro F1: 0.338 | Micro F1: 0.778 | Exact Match: 0.352
Thresh 0.45 | Macro F1: 0.302 | Micro F1: 0.758 | Exact Match: 0.326
Thresh 0.50 | Macro F1: 0.271 | Micro F1: 0.739 | Exact Match: 0.317
Thresh 0.55 | Macro F1: 0.238 | Micro F1: 0.709 | Exact Match: 0.288
Thresh 0.60 | Macro F1: 0.201 | Micro F1: 0.673 | Exact Match: 0.262
Thresh 0.65 | Macro F1: 0.170 | Micro F1: 0.638 | Exact Match: 0.243
Thresh 0.70 | Macro F1: 0.136 | Micro F1: 0.576 | Exact Match: 0.186
Thresh 0.75 | Macro F1: 0.107 | Micro F1: 0.503 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.43it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.134
Micro F1: 0.372
Exact Match: 0.034

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.014
Micro F1: 0.271
Exact Match: 0.029

 Domain: UA
Macro F1: 0.121
Micro F1: 0.408
Exact Match: 0.037

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=72 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 354/354 [00:35<00:00, 10.03it/s]



Epoch 1: Loss = 0.2243
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.057 | Micro F1: 0.272 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.054 | Micro F1: 0.330 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.043 | Micro F1: 0.339 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.028 | Micro F1: 0.300 | Exact Match: 0.000
Thresh 0.30 | Macro F1: 0.022 | Micro F1: 0.273 | Exact Match: 0.000
Thresh 0.35 | Macro F1: 0.014 | Micro F1: 0.192 | Exact Match: 0.000
Thresh 0.40 | Macro F1: 0.008 | Micro F1: 0.134 | Exact Match: 0.000
Thresh 0.45 | Macro F1: 0.007 | Micro F1: 0.092 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.004 | Micro F1: 0.049 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.002 | Micro F1: 0.018 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.000 | Micro F1: 0.001 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 354/354 [00:35<00:00,  9.97it/s]



Epoch 2: Loss = 0.1399
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.155 | Micro F1: 0.375 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.157 | Micro F1: 0.454 | Exact Match: 0.003
Thresh 0.20 | Macro F1: 0.131 | Micro F1: 0.484 | Exact Match: 0.028
Thresh 0.25 | Macro F1: 0.117 | Micro F1: 0.494 | Exact Match: 0.076
Thresh 0.30 | Macro F1: 0.110 | Micro F1: 0.506 | Exact Match: 0.109
Thresh 0.35 | Macro F1: 0.099 | Micro F1: 0.490 | Exact Match: 0.095
Thresh 0.40 | Macro F1: 0.088 | Micro F1: 0.467 | Exact Match: 0.095
Thresh 0.45 | Macro F1: 0.080 | Micro F1: 0.442 | Exact Match: 0.078
Thresh 0.50 | Macro F1: 0.072 | Micro F1: 0.405 | Exact Match: 0.059
Thresh 0.55 | Macro F1: 0.061 | Micro F1: 0.357 | Exact Match: 0.048
Thresh 0.60 | Macro F1: 0.053 | Micro F1: 0.311 | Exact Match: 0.047
Thresh 0.65 | Macro F1: 0.045 | Micro F1: 0.267 | Exact Match: 0.021
Thresh 0.70 | Macro F1: 0.033 | Micro F1: 0.205 | Exact Match: 0.009
Thresh 0.75 | Macro F1: 0.021 | Micro F1: 0.154 | Exac

Epoch 3: 100%|██████████| 354/354 [00:35<00:00,  9.98it/s]



Epoch 3: Loss = 0.1141
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.268 | Micro F1: 0.486 | Exact Match: 0.029
Thresh 0.15 | Macro F1: 0.287 | Micro F1: 0.590 | Exact Match: 0.136
Thresh 0.20 | Macro F1: 0.283 | Micro F1: 0.644 | Exact Match: 0.229
Thresh 0.25 | Macro F1: 0.248 | Micro F1: 0.658 | Exact Match: 0.269
Thresh 0.30 | Macro F1: 0.216 | Micro F1: 0.654 | Exact Match: 0.262
Thresh 0.35 | Macro F1: 0.186 | Micro F1: 0.638 | Exact Match: 0.255
Thresh 0.40 | Macro F1: 0.163 | Micro F1: 0.619 | Exact Match: 0.243
Thresh 0.45 | Macro F1: 0.142 | Micro F1: 0.591 | Exact Match: 0.226
Thresh 0.50 | Macro F1: 0.120 | Micro F1: 0.549 | Exact Match: 0.191
Thresh 0.55 | Macro F1: 0.099 | Micro F1: 0.503 | Exact Match: 0.159
Thresh 0.60 | Macro F1: 0.086 | Micro F1: 0.467 | Exact Match: 0.112
Thresh 0.65 | Macro F1: 0.070 | Micro F1: 0.406 | Exact Match: 0.041
Thresh 0.70 | Macro F1: 0.056 | Micro F1: 0.345 | Exact Match: 0.005
Thresh 0.75 | Macro F1: 0.043 | Micro F1: 0.275 | Exac

Epoch 4: 100%|██████████| 354/354 [00:35<00:00,  9.95it/s]



Epoch 4: Loss = 0.0918
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.363 | Micro F1: 0.554 | Exact Match: 0.102
Thresh 0.15 | Macro F1: 0.382 | Micro F1: 0.665 | Exact Match: 0.188
Thresh 0.20 | Macro F1: 0.371 | Micro F1: 0.724 | Exact Match: 0.284
Thresh 0.25 | Macro F1: 0.359 | Micro F1: 0.757 | Exact Match: 0.340
Thresh 0.30 | Macro F1: 0.337 | Micro F1: 0.763 | Exact Match: 0.352
Thresh 0.35 | Macro F1: 0.315 | Micro F1: 0.759 | Exact Match: 0.341
Thresh 0.40 | Macro F1: 0.281 | Micro F1: 0.741 | Exact Match: 0.317
Thresh 0.45 | Macro F1: 0.247 | Micro F1: 0.716 | Exact Match: 0.307
Thresh 0.50 | Macro F1: 0.225 | Micro F1: 0.693 | Exact Match: 0.298
Thresh 0.55 | Macro F1: 0.192 | Micro F1: 0.653 | Exact Match: 0.259
Thresh 0.60 | Macro F1: 0.160 | Micro F1: 0.606 | Exact Match: 0.240
Thresh 0.65 | Macro F1: 0.128 | Micro F1: 0.564 | Exact Match: 0.212
Thresh 0.70 | Macro F1: 0.107 | Micro F1: 0.518 | Exact Match: 0.183
Thresh 0.75 | Macro F1: 0.090 | Micro F1: 0.471 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.09it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.142
Micro F1: 0.403
Exact Match: 0.073

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.026
Micro F1: 0.395
Exact Match: 0.129

 Domain: UA
Macro F1: 0.116
Micro F1: 0.407
Exact Match: 0.037

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=73 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 354/354 [00:35<00:00,  9.98it/s]



Epoch 1: Loss = 0.2239
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.076 | Micro F1: 0.295 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.066 | Micro F1: 0.358 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.058 | Micro F1: 0.365 | Exact Match: 0.007
Thresh 0.25 | Macro F1: 0.043 | Micro F1: 0.345 | Exact Match: 0.000
Thresh 0.30 | Macro F1: 0.035 | Micro F1: 0.294 | Exact Match: 0.000
Thresh 0.35 | Macro F1: 0.024 | Micro F1: 0.223 | Exact Match: 0.000
Thresh 0.40 | Macro F1: 0.019 | Micro F1: 0.179 | Exact Match: 0.000
Thresh 0.45 | Macro F1: 0.011 | Micro F1: 0.116 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.008 | Micro F1: 0.080 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.004 | Micro F1: 0.033 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.000 | Micro F1: 0.003 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 354/354 [00:35<00:00, 10.03it/s]



Epoch 2: Loss = 0.1371
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.161 | Micro F1: 0.403 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.171 | Micro F1: 0.497 | Exact Match: 0.028
Thresh 0.20 | Macro F1: 0.170 | Micro F1: 0.536 | Exact Match: 0.102
Thresh 0.25 | Macro F1: 0.155 | Micro F1: 0.543 | Exact Match: 0.174
Thresh 0.30 | Macro F1: 0.137 | Micro F1: 0.534 | Exact Match: 0.184
Thresh 0.35 | Macro F1: 0.123 | Micro F1: 0.511 | Exact Match: 0.150
Thresh 0.40 | Macro F1: 0.109 | Micro F1: 0.485 | Exact Match: 0.110
Thresh 0.45 | Macro F1: 0.097 | Micro F1: 0.454 | Exact Match: 0.084
Thresh 0.50 | Macro F1: 0.081 | Micro F1: 0.411 | Exact Match: 0.052
Thresh 0.55 | Macro F1: 0.067 | Micro F1: 0.362 | Exact Match: 0.017
Thresh 0.60 | Macro F1: 0.053 | Micro F1: 0.301 | Exact Match: 0.003
Thresh 0.65 | Macro F1: 0.038 | Micro F1: 0.237 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.026 | Micro F1: 0.177 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.017 | Micro F1: 0.127 | Exac

Epoch 3: 100%|██████████| 354/354 [00:35<00:00, 10.03it/s]



Epoch 3: Loss = 0.1114
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.308 | Micro F1: 0.465 | Exact Match: 0.029
Thresh 0.15 | Macro F1: 0.295 | Micro F1: 0.571 | Exact Match: 0.110
Thresh 0.20 | Macro F1: 0.287 | Micro F1: 0.637 | Exact Match: 0.212
Thresh 0.25 | Macro F1: 0.273 | Micro F1: 0.668 | Exact Match: 0.267
Thresh 0.30 | Macro F1: 0.260 | Micro F1: 0.684 | Exact Match: 0.284
Thresh 0.35 | Macro F1: 0.230 | Micro F1: 0.685 | Exact Match: 0.274
Thresh 0.40 | Macro F1: 0.204 | Micro F1: 0.675 | Exact Match: 0.259
Thresh 0.45 | Macro F1: 0.186 | Micro F1: 0.658 | Exact Match: 0.222
Thresh 0.50 | Macro F1: 0.158 | Micro F1: 0.621 | Exact Match: 0.174
Thresh 0.55 | Macro F1: 0.138 | Micro F1: 0.584 | Exact Match: 0.134
Thresh 0.60 | Macro F1: 0.111 | Micro F1: 0.530 | Exact Match: 0.093
Thresh 0.65 | Macro F1: 0.090 | Micro F1: 0.481 | Exact Match: 0.074
Thresh 0.70 | Macro F1: 0.074 | Micro F1: 0.435 | Exact Match: 0.048
Thresh 0.75 | Macro F1: 0.055 | Micro F1: 0.356 | Exac

Epoch 4: 100%|██████████| 354/354 [00:35<00:00, 10.00it/s]



Epoch 4: Loss = 0.0885
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.384 | Micro F1: 0.589 | Exact Match: 0.150
Thresh 0.15 | Macro F1: 0.408 | Micro F1: 0.706 | Exact Match: 0.271
Thresh 0.20 | Macro F1: 0.386 | Micro F1: 0.757 | Exact Match: 0.338
Thresh 0.25 | Macro F1: 0.361 | Micro F1: 0.779 | Exact Match: 0.379
Thresh 0.30 | Macro F1: 0.333 | Micro F1: 0.780 | Exact Match: 0.386
Thresh 0.35 | Macro F1: 0.286 | Micro F1: 0.767 | Exact Match: 0.378
Thresh 0.40 | Macro F1: 0.266 | Micro F1: 0.752 | Exact Match: 0.369
Thresh 0.45 | Macro F1: 0.232 | Micro F1: 0.730 | Exact Match: 0.343
Thresh 0.50 | Macro F1: 0.204 | Micro F1: 0.700 | Exact Match: 0.310
Thresh 0.55 | Macro F1: 0.173 | Micro F1: 0.667 | Exact Match: 0.278
Thresh 0.60 | Macro F1: 0.148 | Micro F1: 0.632 | Exact Match: 0.248
Thresh 0.65 | Macro F1: 0.133 | Micro F1: 0.591 | Exact Match: 0.212
Thresh 0.70 | Macro F1: 0.109 | Micro F1: 0.531 | Exact Match: 0.157
Thresh 0.75 | Macro F1: 0.081 | Micro F1: 0.459 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.48it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.114
Micro F1: 0.349
Exact Match: 0.045

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.010
Micro F1: 0.133
Exact Match: 0.057

 Domain: UA
Macro F1: 0.106
Micro F1: 0.418
Exact Match: 0.037

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=74 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 354/354 [00:35<00:00,  9.96it/s]



Epoch 1: Loss = 0.2209
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.069 | Micro F1: 0.302 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.049 | Micro F1: 0.330 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.048 | Micro F1: 0.350 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.036 | Micro F1: 0.299 | Exact Match: 0.003
Thresh 0.30 | Macro F1: 0.021 | Micro F1: 0.233 | Exact Match: 0.000
Thresh 0.35 | Macro F1: 0.015 | Micro F1: 0.175 | Exact Match: 0.000
Thresh 0.40 | Macro F1: 0.007 | Micro F1: 0.085 | Exact Match: 0.000
Thresh 0.45 | Macro F1: 0.006 | Micro F1: 0.067 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.004 | Micro F1: 0.044 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.003 | Micro F1: 0.024 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.001 | Micro F1: 0.006 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 354/354 [00:35<00:00,  9.98it/s]



Epoch 2: Loss = 0.1385
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.149 | Micro F1: 0.366 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.164 | Micro F1: 0.452 | Exact Match: 0.007
Thresh 0.20 | Macro F1: 0.154 | Micro F1: 0.492 | Exact Match: 0.034
Thresh 0.25 | Macro F1: 0.131 | Micro F1: 0.504 | Exact Match: 0.100
Thresh 0.30 | Macro F1: 0.115 | Micro F1: 0.509 | Exact Match: 0.126
Thresh 0.35 | Macro F1: 0.100 | Micro F1: 0.496 | Exact Match: 0.124
Thresh 0.40 | Macro F1: 0.090 | Micro F1: 0.482 | Exact Match: 0.121
Thresh 0.45 | Macro F1: 0.079 | Micro F1: 0.462 | Exact Match: 0.098
Thresh 0.50 | Macro F1: 0.070 | Micro F1: 0.437 | Exact Match: 0.071
Thresh 0.55 | Macro F1: 0.059 | Micro F1: 0.395 | Exact Match: 0.043
Thresh 0.60 | Macro F1: 0.051 | Micro F1: 0.362 | Exact Match: 0.031
Thresh 0.65 | Macro F1: 0.044 | Micro F1: 0.324 | Exact Match: 0.017
Thresh 0.70 | Macro F1: 0.034 | Micro F1: 0.266 | Exact Match: 0.003
Thresh 0.75 | Macro F1: 0.026 | Micro F1: 0.200 | Exac

Epoch 3: 100%|██████████| 354/354 [00:35<00:00,  9.98it/s]



Epoch 3: Loss = 0.1131
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.286 | Micro F1: 0.464 | Exact Match: 0.016
Thresh 0.15 | Macro F1: 0.291 | Micro F1: 0.562 | Exact Match: 0.067
Thresh 0.20 | Macro F1: 0.285 | Micro F1: 0.624 | Exact Match: 0.169
Thresh 0.25 | Macro F1: 0.258 | Micro F1: 0.648 | Exact Match: 0.233
Thresh 0.30 | Macro F1: 0.231 | Micro F1: 0.651 | Exact Match: 0.260
Thresh 0.35 | Macro F1: 0.194 | Micro F1: 0.645 | Exact Match: 0.266
Thresh 0.40 | Macro F1: 0.165 | Micro F1: 0.627 | Exact Match: 0.241
Thresh 0.45 | Macro F1: 0.148 | Micro F1: 0.608 | Exact Match: 0.212
Thresh 0.50 | Macro F1: 0.136 | Micro F1: 0.587 | Exact Match: 0.178
Thresh 0.55 | Macro F1: 0.121 | Micro F1: 0.555 | Exact Match: 0.148
Thresh 0.60 | Macro F1: 0.108 | Micro F1: 0.520 | Exact Match: 0.122
Thresh 0.65 | Macro F1: 0.091 | Micro F1: 0.478 | Exact Match: 0.091
Thresh 0.70 | Macro F1: 0.075 | Micro F1: 0.437 | Exact Match: 0.062
Thresh 0.75 | Macro F1: 0.063 | Micro F1: 0.392 | Exac

Epoch 4: 100%|██████████| 354/354 [00:35<00:00, 10.02it/s]



Epoch 4: Loss = 0.0916
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.370 | Micro F1: 0.548 | Exact Match: 0.074
Thresh 0.15 | Macro F1: 0.397 | Micro F1: 0.661 | Exact Match: 0.172
Thresh 0.20 | Macro F1: 0.404 | Micro F1: 0.728 | Exact Match: 0.274
Thresh 0.25 | Macro F1: 0.388 | Micro F1: 0.756 | Exact Match: 0.321
Thresh 0.30 | Macro F1: 0.335 | Micro F1: 0.763 | Exact Match: 0.352
Thresh 0.35 | Macro F1: 0.311 | Micro F1: 0.754 | Exact Match: 0.333
Thresh 0.40 | Macro F1: 0.279 | Micro F1: 0.739 | Exact Match: 0.322
Thresh 0.45 | Macro F1: 0.262 | Micro F1: 0.724 | Exact Match: 0.314
Thresh 0.50 | Macro F1: 0.221 | Micro F1: 0.693 | Exact Match: 0.288
Thresh 0.55 | Macro F1: 0.191 | Micro F1: 0.661 | Exact Match: 0.266
Thresh 0.60 | Macro F1: 0.164 | Micro F1: 0.617 | Exact Match: 0.214
Thresh 0.65 | Macro F1: 0.134 | Micro F1: 0.572 | Exact Match: 0.169
Thresh 0.70 | Macro F1: 0.110 | Micro F1: 0.519 | Exact Match: 0.138
Thresh 0.75 | Macro F1: 0.084 | Micro F1: 0.449 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.52it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.114
Micro F1: 0.360
Exact Match: 0.045

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.017
Micro F1: 0.296
Exact Match: 0.057

 Domain: UA
Macro F1: 0.098
Micro F1: 0.383
Exact Match: 0.037

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=75 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 354/354 [00:35<00:00,  9.91it/s]



Epoch 1: Loss = 0.2223
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.077 | Micro F1: 0.281 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.066 | Micro F1: 0.327 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.050 | Micro F1: 0.347 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.040 | Micro F1: 0.328 | Exact Match: 0.010
Thresh 0.30 | Macro F1: 0.032 | Micro F1: 0.263 | Exact Match: 0.016
Thresh 0.35 | Macro F1: 0.021 | Micro F1: 0.190 | Exact Match: 0.000
Thresh 0.40 | Macro F1: 0.012 | Micro F1: 0.118 | Exact Match: 0.000
Thresh 0.45 | Macro F1: 0.006 | Micro F1: 0.067 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.005 | Micro F1: 0.046 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.003 | Micro F1: 0.019 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.000 | Micro F1: 0.003 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 354/354 [00:35<00:00,  9.96it/s]



Epoch 2: Loss = 0.1386
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.165 | Micro F1: 0.398 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.162 | Micro F1: 0.482 | Exact Match: 0.017
Thresh 0.20 | Macro F1: 0.144 | Micro F1: 0.520 | Exact Match: 0.074
Thresh 0.25 | Macro F1: 0.128 | Micro F1: 0.520 | Exact Match: 0.105
Thresh 0.30 | Macro F1: 0.106 | Micro F1: 0.501 | Exact Match: 0.098
Thresh 0.35 | Macro F1: 0.098 | Micro F1: 0.491 | Exact Match: 0.110
Thresh 0.40 | Macro F1: 0.088 | Micro F1: 0.466 | Exact Match: 0.079
Thresh 0.45 | Macro F1: 0.079 | Micro F1: 0.442 | Exact Match: 0.072
Thresh 0.50 | Macro F1: 0.071 | Micro F1: 0.401 | Exact Match: 0.050
Thresh 0.55 | Macro F1: 0.060 | Micro F1: 0.350 | Exact Match: 0.029
Thresh 0.60 | Macro F1: 0.049 | Micro F1: 0.301 | Exact Match: 0.012
Thresh 0.65 | Macro F1: 0.038 | Micro F1: 0.249 | Exact Match: 0.003
Thresh 0.70 | Macro F1: 0.025 | Micro F1: 0.185 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.017 | Micro F1: 0.134 | Exac

Epoch 3: 100%|██████████| 354/354 [00:35<00:00,  9.99it/s]



Epoch 3: Loss = 0.1113
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.286 | Micro F1: 0.500 | Exact Match: 0.026
Thresh 0.15 | Macro F1: 0.287 | Micro F1: 0.603 | Exact Match: 0.126
Thresh 0.20 | Macro F1: 0.258 | Micro F1: 0.653 | Exact Match: 0.217
Thresh 0.25 | Macro F1: 0.233 | Micro F1: 0.676 | Exact Match: 0.267
Thresh 0.30 | Macro F1: 0.205 | Micro F1: 0.673 | Exact Match: 0.283
Thresh 0.35 | Macro F1: 0.192 | Micro F1: 0.664 | Exact Match: 0.284
Thresh 0.40 | Macro F1: 0.169 | Micro F1: 0.647 | Exact Match: 0.267
Thresh 0.45 | Macro F1: 0.150 | Micro F1: 0.617 | Exact Match: 0.240
Thresh 0.50 | Macro F1: 0.132 | Micro F1: 0.569 | Exact Match: 0.174
Thresh 0.55 | Macro F1: 0.110 | Micro F1: 0.521 | Exact Match: 0.117
Thresh 0.60 | Macro F1: 0.083 | Micro F1: 0.465 | Exact Match: 0.078
Thresh 0.65 | Macro F1: 0.066 | Micro F1: 0.412 | Exact Match: 0.026
Thresh 0.70 | Macro F1: 0.052 | Micro F1: 0.359 | Exact Match: 0.007
Thresh 0.75 | Macro F1: 0.040 | Micro F1: 0.293 | Exac

Epoch 4: 100%|██████████| 354/354 [00:35<00:00, 10.01it/s]



Epoch 4: Loss = 0.0887
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.383 | Micro F1: 0.576 | Exact Match: 0.103
Thresh 0.15 | Macro F1: 0.401 | Micro F1: 0.695 | Exact Match: 0.257
Thresh 0.20 | Macro F1: 0.403 | Micro F1: 0.755 | Exact Match: 0.347
Thresh 0.25 | Macro F1: 0.393 | Micro F1: 0.777 | Exact Match: 0.374
Thresh 0.30 | Macro F1: 0.365 | Micro F1: 0.783 | Exact Match: 0.374
Thresh 0.35 | Macro F1: 0.320 | Micro F1: 0.774 | Exact Match: 0.376
Thresh 0.40 | Macro F1: 0.292 | Micro F1: 0.759 | Exact Match: 0.352
Thresh 0.45 | Macro F1: 0.266 | Micro F1: 0.742 | Exact Match: 0.338
Thresh 0.50 | Macro F1: 0.240 | Micro F1: 0.711 | Exact Match: 0.303
Thresh 0.55 | Macro F1: 0.211 | Micro F1: 0.670 | Exact Match: 0.269
Thresh 0.60 | Macro F1: 0.187 | Micro F1: 0.637 | Exact Match: 0.253
Thresh 0.65 | Macro F1: 0.151 | Micro F1: 0.587 | Exact Match: 0.207
Thresh 0.70 | Macro F1: 0.129 | Micro F1: 0.543 | Exact Match: 0.174
Thresh 0.75 | Macro F1: 0.099 | Micro F1: 0.480 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.41it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.134
Micro F1: 0.397
Exact Match: 0.045

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.022
Micro F1: 0.360
Exact Match: 0.029

 Domain: UA
Macro F1: 0.114
Micro F1: 0.413
Exact Match: 0.056

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=42 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 354/354 [00:35<00:00,  9.94it/s]



Epoch 1: Loss = 0.2247
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.065 | Micro F1: 0.278 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.053 | Micro F1: 0.347 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.039 | Micro F1: 0.333 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.028 | Micro F1: 0.308 | Exact Match: 0.000
Thresh 0.30 | Macro F1: 0.026 | Micro F1: 0.296 | Exact Match: 0.000
Thresh 0.35 | Macro F1: 0.024 | Micro F1: 0.268 | Exact Match: 0.000
Thresh 0.40 | Macro F1: 0.016 | Micro F1: 0.208 | Exact Match: 0.003
Thresh 0.45 | Macro F1: 0.012 | Micro F1: 0.141 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.009 | Micro F1: 0.091 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.004 | Micro F1: 0.030 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 354/354 [00:35<00:00,  9.93it/s]



Epoch 2: Loss = 0.1404
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.161 | Micro F1: 0.386 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.158 | Micro F1: 0.466 | Exact Match: 0.007
Thresh 0.20 | Macro F1: 0.137 | Micro F1: 0.504 | Exact Match: 0.060
Thresh 0.25 | Macro F1: 0.116 | Micro F1: 0.505 | Exact Match: 0.117
Thresh 0.30 | Macro F1: 0.105 | Micro F1: 0.489 | Exact Match: 0.110
Thresh 0.35 | Macro F1: 0.093 | Micro F1: 0.467 | Exact Match: 0.090
Thresh 0.40 | Macro F1: 0.079 | Micro F1: 0.432 | Exact Match: 0.071
Thresh 0.45 | Macro F1: 0.066 | Micro F1: 0.393 | Exact Match: 0.059
Thresh 0.50 | Macro F1: 0.057 | Micro F1: 0.356 | Exact Match: 0.047
Thresh 0.55 | Macro F1: 0.048 | Micro F1: 0.313 | Exact Match: 0.029
Thresh 0.60 | Macro F1: 0.042 | Micro F1: 0.273 | Exact Match: 0.016
Thresh 0.65 | Macro F1: 0.030 | Micro F1: 0.224 | Exact Match: 0.007
Thresh 0.70 | Macro F1: 0.022 | Micro F1: 0.172 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.014 | Micro F1: 0.124 | Exac

Epoch 3: 100%|██████████| 354/354 [00:35<00:00,  9.97it/s]



Epoch 3: Loss = 0.1142
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.296 | Micro F1: 0.481 | Exact Match: 0.014
Thresh 0.15 | Macro F1: 0.288 | Micro F1: 0.582 | Exact Match: 0.095
Thresh 0.20 | Macro F1: 0.269 | Micro F1: 0.628 | Exact Match: 0.148
Thresh 0.25 | Macro F1: 0.254 | Micro F1: 0.653 | Exact Match: 0.205
Thresh 0.30 | Macro F1: 0.221 | Micro F1: 0.654 | Exact Match: 0.248
Thresh 0.35 | Macro F1: 0.185 | Micro F1: 0.637 | Exact Match: 0.248
Thresh 0.40 | Macro F1: 0.156 | Micro F1: 0.611 | Exact Match: 0.222
Thresh 0.45 | Macro F1: 0.139 | Micro F1: 0.577 | Exact Match: 0.181
Thresh 0.50 | Macro F1: 0.118 | Micro F1: 0.542 | Exact Match: 0.145
Thresh 0.55 | Macro F1: 0.103 | Micro F1: 0.504 | Exact Match: 0.116
Thresh 0.60 | Macro F1: 0.087 | Micro F1: 0.467 | Exact Match: 0.095
Thresh 0.65 | Macro F1: 0.074 | Micro F1: 0.427 | Exact Match: 0.048
Thresh 0.70 | Macro F1: 0.063 | Micro F1: 0.385 | Exact Match: 0.036
Thresh 0.75 | Macro F1: 0.049 | Micro F1: 0.330 | Exac

Epoch 4: 100%|██████████| 354/354 [00:35<00:00, 10.02it/s]



Epoch 4: Loss = 0.0920
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.384 | Micro F1: 0.569 | Exact Match: 0.134
Thresh 0.15 | Macro F1: 0.391 | Micro F1: 0.684 | Exact Match: 0.228
Thresh 0.20 | Macro F1: 0.375 | Micro F1: 0.745 | Exact Match: 0.322
Thresh 0.25 | Macro F1: 0.351 | Micro F1: 0.767 | Exact Match: 0.347
Thresh 0.30 | Macro F1: 0.325 | Micro F1: 0.771 | Exact Match: 0.366
Thresh 0.35 | Macro F1: 0.288 | Micro F1: 0.760 | Exact Match: 0.367
Thresh 0.40 | Macro F1: 0.259 | Micro F1: 0.741 | Exact Match: 0.338
Thresh 0.45 | Macro F1: 0.232 | Micro F1: 0.721 | Exact Match: 0.309
Thresh 0.50 | Macro F1: 0.210 | Micro F1: 0.696 | Exact Match: 0.290
Thresh 0.55 | Macro F1: 0.179 | Micro F1: 0.667 | Exact Match: 0.271
Thresh 0.60 | Macro F1: 0.159 | Micro F1: 0.625 | Exact Match: 0.234
Thresh 0.65 | Macro F1: 0.125 | Micro F1: 0.561 | Exact Match: 0.176
Thresh 0.70 | Macro F1: 0.107 | Micro F1: 0.508 | Exact Match: 0.133
Thresh 0.75 | Macro F1: 0.083 | Micro F1: 0.439 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.37it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.113
Micro F1: 0.316
Exact Match: 0.056

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.003
Micro F1: 0.051
Exact Match: 0.043

 Domain: UA
Macro F1: 0.112
Micro F1: 0.398
Exact Match: 0.065

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=43 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 354/354 [00:35<00:00,  9.98it/s]



Epoch 1: Loss = 0.2209
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.066 | Micro F1: 0.288 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.053 | Micro F1: 0.327 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.055 | Micro F1: 0.358 | Exact Match: 0.002
Thresh 0.25 | Macro F1: 0.039 | Micro F1: 0.313 | Exact Match: 0.007
Thresh 0.30 | Macro F1: 0.022 | Micro F1: 0.249 | Exact Match: 0.000
Thresh 0.35 | Macro F1: 0.014 | Micro F1: 0.214 | Exact Match: 0.000
Thresh 0.40 | Macro F1: 0.013 | Micro F1: 0.186 | Exact Match: 0.000
Thresh 0.45 | Macro F1: 0.010 | Micro F1: 0.144 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.008 | Micro F1: 0.110 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.006 | Micro F1: 0.075 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.004 | Micro F1: 0.037 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.001 | Micro F1: 0.005 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 354/354 [00:35<00:00,  9.99it/s]



Epoch 2: Loss = 0.1379
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.175 | Micro F1: 0.430 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.180 | Micro F1: 0.515 | Exact Match: 0.028
Thresh 0.20 | Macro F1: 0.154 | Micro F1: 0.540 | Exact Match: 0.114
Thresh 0.25 | Macro F1: 0.133 | Micro F1: 0.535 | Exact Match: 0.169
Thresh 0.30 | Macro F1: 0.112 | Micro F1: 0.515 | Exact Match: 0.166
Thresh 0.35 | Macro F1: 0.095 | Micro F1: 0.485 | Exact Match: 0.150
Thresh 0.40 | Macro F1: 0.082 | Micro F1: 0.442 | Exact Match: 0.095
Thresh 0.45 | Macro F1: 0.068 | Micro F1: 0.396 | Exact Match: 0.057
Thresh 0.50 | Macro F1: 0.056 | Micro F1: 0.350 | Exact Match: 0.028
Thresh 0.55 | Macro F1: 0.046 | Micro F1: 0.298 | Exact Match: 0.007
Thresh 0.60 | Macro F1: 0.035 | Micro F1: 0.243 | Exact Match: 0.003
Thresh 0.65 | Macro F1: 0.027 | Micro F1: 0.194 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.017 | Micro F1: 0.123 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.010 | Micro F1: 0.079 | Exac

Epoch 3: 100%|██████████| 354/354 [00:35<00:00,  9.92it/s]



Epoch 3: Loss = 0.1119
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.313 | Micro F1: 0.525 | Exact Match: 0.045
Thresh 0.15 | Macro F1: 0.292 | Micro F1: 0.627 | Exact Match: 0.157
Thresh 0.20 | Macro F1: 0.263 | Micro F1: 0.668 | Exact Match: 0.248
Thresh 0.25 | Macro F1: 0.218 | Micro F1: 0.662 | Exact Match: 0.278
Thresh 0.30 | Macro F1: 0.186 | Micro F1: 0.644 | Exact Match: 0.279
Thresh 0.35 | Macro F1: 0.163 | Micro F1: 0.626 | Exact Match: 0.272
Thresh 0.40 | Macro F1: 0.138 | Micro F1: 0.597 | Exact Match: 0.255
Thresh 0.45 | Macro F1: 0.125 | Micro F1: 0.571 | Exact Match: 0.245
Thresh 0.50 | Macro F1: 0.115 | Micro F1: 0.547 | Exact Match: 0.210
Thresh 0.55 | Macro F1: 0.097 | Micro F1: 0.504 | Exact Match: 0.152
Thresh 0.60 | Macro F1: 0.082 | Micro F1: 0.452 | Exact Match: 0.086
Thresh 0.65 | Macro F1: 0.068 | Micro F1: 0.401 | Exact Match: 0.029
Thresh 0.70 | Macro F1: 0.054 | Micro F1: 0.352 | Exact Match: 0.002
Thresh 0.75 | Macro F1: 0.042 | Micro F1: 0.291 | Exac

Epoch 4: 100%|██████████| 354/354 [00:35<00:00,  9.94it/s]



Epoch 4: Loss = 0.0896
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.387 | Micro F1: 0.595 | Exact Match: 0.145
Thresh 0.15 | Macro F1: 0.380 | Micro F1: 0.695 | Exact Match: 0.274
Thresh 0.20 | Macro F1: 0.374 | Micro F1: 0.754 | Exact Match: 0.338
Thresh 0.25 | Macro F1: 0.351 | Micro F1: 0.776 | Exact Match: 0.348
Thresh 0.30 | Macro F1: 0.321 | Micro F1: 0.780 | Exact Match: 0.353
Thresh 0.35 | Macro F1: 0.296 | Micro F1: 0.774 | Exact Match: 0.341
Thresh 0.40 | Macro F1: 0.277 | Micro F1: 0.760 | Exact Match: 0.334
Thresh 0.45 | Macro F1: 0.247 | Micro F1: 0.736 | Exact Match: 0.303
Thresh 0.50 | Macro F1: 0.214 | Micro F1: 0.706 | Exact Match: 0.288
Thresh 0.55 | Macro F1: 0.183 | Micro F1: 0.664 | Exact Match: 0.262
Thresh 0.60 | Macro F1: 0.157 | Micro F1: 0.624 | Exact Match: 0.240
Thresh 0.65 | Macro F1: 0.131 | Micro F1: 0.572 | Exact Match: 0.203
Thresh 0.70 | Macro F1: 0.112 | Micro F1: 0.522 | Exact Match: 0.152
Thresh 0.75 | Macro F1: 0.088 | Micro F1: 0.449 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.73it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.117
Micro F1: 0.322
Exact Match: 0.062

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.004
Micro F1: 0.059
Exact Match: 0.071

 Domain: UA
Macro F1: 0.114
Micro F1: 0.407
Exact Match: 0.056

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=44 | Train on UA ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 354/354 [00:35<00:00,  9.95it/s]



Epoch 1: Loss = 0.2240
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.052 | Micro F1: 0.272 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.050 | Micro F1: 0.318 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.038 | Micro F1: 0.325 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.025 | Micro F1: 0.297 | Exact Match: 0.000
Thresh 0.30 | Macro F1: 0.020 | Micro F1: 0.236 | Exact Match: 0.000
Thresh 0.35 | Macro F1: 0.013 | Micro F1: 0.170 | Exact Match: 0.000
Thresh 0.40 | Macro F1: 0.008 | Micro F1: 0.114 | Exact Match: 0.000
Thresh 0.45 | Macro F1: 0.003 | Micro F1: 0.032 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.000 | Micro F1: 0.003 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 354/354 [00:35<00:00,  9.93it/s]



Epoch 2: Loss = 0.1395
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.156 | Micro F1: 0.402 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.161 | Micro F1: 0.486 | Exact Match: 0.038
Thresh 0.20 | Macro F1: 0.149 | Micro F1: 0.522 | Exact Match: 0.098
Thresh 0.25 | Macro F1: 0.131 | Micro F1: 0.530 | Exact Match: 0.133
Thresh 0.30 | Macro F1: 0.112 | Micro F1: 0.518 | Exact Match: 0.150
Thresh 0.35 | Macro F1: 0.097 | Micro F1: 0.509 | Exact Match: 0.143
Thresh 0.40 | Macro F1: 0.083 | Micro F1: 0.472 | Exact Match: 0.102
Thresh 0.45 | Macro F1: 0.077 | Micro F1: 0.442 | Exact Match: 0.079
Thresh 0.50 | Macro F1: 0.065 | Micro F1: 0.392 | Exact Match: 0.062
Thresh 0.55 | Macro F1: 0.056 | Micro F1: 0.352 | Exact Match: 0.040
Thresh 0.60 | Macro F1: 0.050 | Micro F1: 0.307 | Exact Match: 0.022
Thresh 0.65 | Macro F1: 0.039 | Micro F1: 0.250 | Exact Match: 0.014
Thresh 0.70 | Macro F1: 0.031 | Micro F1: 0.202 | Exact Match: 0.009
Thresh 0.75 | Macro F1: 0.021 | Micro F1: 0.136 | Exac

Epoch 3: 100%|██████████| 354/354 [00:35<00:00,  9.94it/s]



Epoch 3: Loss = 0.1122
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.265 | Micro F1: 0.517 | Exact Match: 0.060
Thresh 0.15 | Macro F1: 0.258 | Micro F1: 0.608 | Exact Match: 0.176
Thresh 0.20 | Macro F1: 0.242 | Micro F1: 0.652 | Exact Match: 0.248
Thresh 0.25 | Macro F1: 0.220 | Micro F1: 0.666 | Exact Match: 0.278
Thresh 0.30 | Macro F1: 0.196 | Micro F1: 0.660 | Exact Match: 0.276
Thresh 0.35 | Macro F1: 0.180 | Micro F1: 0.661 | Exact Match: 0.276
Thresh 0.40 | Macro F1: 0.159 | Micro F1: 0.637 | Exact Match: 0.272
Thresh 0.45 | Macro F1: 0.143 | Micro F1: 0.610 | Exact Match: 0.250
Thresh 0.50 | Macro F1: 0.131 | Micro F1: 0.578 | Exact Match: 0.226
Thresh 0.55 | Macro F1: 0.117 | Micro F1: 0.534 | Exact Match: 0.191
Thresh 0.60 | Macro F1: 0.098 | Micro F1: 0.478 | Exact Match: 0.143
Thresh 0.65 | Macro F1: 0.072 | Micro F1: 0.400 | Exact Match: 0.060
Thresh 0.70 | Macro F1: 0.054 | Micro F1: 0.331 | Exact Match: 0.021
Thresh 0.75 | Macro F1: 0.039 | Micro F1: 0.272 | Exac

Epoch 4: 100%|██████████| 354/354 [00:35<00:00,  9.89it/s]



Epoch 4: Loss = 0.0903
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.363 | Micro F1: 0.535 | Exact Match: 0.059
Thresh 0.15 | Macro F1: 0.389 | Micro F1: 0.657 | Exact Match: 0.152
Thresh 0.20 | Macro F1: 0.396 | Micro F1: 0.727 | Exact Match: 0.269
Thresh 0.25 | Macro F1: 0.379 | Micro F1: 0.763 | Exact Match: 0.347
Thresh 0.30 | Macro F1: 0.361 | Micro F1: 0.777 | Exact Match: 0.372
Thresh 0.35 | Macro F1: 0.311 | Micro F1: 0.772 | Exact Match: 0.369
Thresh 0.40 | Macro F1: 0.285 | Micro F1: 0.757 | Exact Match: 0.334
Thresh 0.45 | Macro F1: 0.254 | Micro F1: 0.743 | Exact Match: 0.317
Thresh 0.50 | Macro F1: 0.233 | Micro F1: 0.728 | Exact Match: 0.303
Thresh 0.55 | Macro F1: 0.206 | Micro F1: 0.688 | Exact Match: 0.257
Thresh 0.60 | Macro F1: 0.175 | Micro F1: 0.644 | Exact Match: 0.234
Thresh 0.65 | Macro F1: 0.152 | Micro F1: 0.602 | Exact Match: 0.209
Thresh 0.70 | Macro F1: 0.125 | Micro F1: 0.547 | Exact Match: 0.162
Thresh 0.75 | Macro F1: 0.099 | Micro F1: 0.480 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.14it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.149
Micro F1: 0.406
Exact Match: 0.039

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.021
Micro F1: 0.349
Exact Match: 0.057

 Domain: UA
Macro F1: 0.129
Micro F1: 0.428
Exact Match: 0.028

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=71 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 476/476 [00:47<00:00,  9.92it/s]



Epoch 1: Loss = 0.1880
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.094 | Micro F1: 0.303 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.075 | Micro F1: 0.357 | Exact Match: 0.021
Thresh 0.20 | Macro F1: 0.058 | Micro F1: 0.361 | Exact Match: 0.038
Thresh 0.25 | Macro F1: 0.053 | Micro F1: 0.359 | Exact Match: 0.033
Thresh 0.30 | Macro F1: 0.046 | Micro F1: 0.337 | Exact Match: 0.021
Thresh 0.35 | Macro F1: 0.037 | Micro F1: 0.303 | Exact Match: 0.016
Thresh 0.40 | Macro F1: 0.029 | Micro F1: 0.252 | Exact Match: 0.012
Thresh 0.45 | Macro F1: 0.021 | Micro F1: 0.178 | Exact Match: 0.003
Thresh 0.50 | Macro F1: 0.015 | Micro F1: 0.121 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.010 | Micro F1: 0.077 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.004 | Micro F1: 0.035 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.002 | Micro F1: 0.013 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.002 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 476/476 [00:47<00:00,  9.94it/s]



Epoch 2: Loss = 0.1036
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.224 | Micro F1: 0.434 | Exact Match: 0.014
Thresh 0.15 | Macro F1: 0.227 | Micro F1: 0.518 | Exact Match: 0.090
Thresh 0.20 | Macro F1: 0.202 | Micro F1: 0.552 | Exact Match: 0.184
Thresh 0.25 | Macro F1: 0.180 | Micro F1: 0.556 | Exact Match: 0.204
Thresh 0.30 | Macro F1: 0.155 | Micro F1: 0.545 | Exact Match: 0.182
Thresh 0.35 | Macro F1: 0.135 | Micro F1: 0.525 | Exact Match: 0.151
Thresh 0.40 | Macro F1: 0.110 | Micro F1: 0.485 | Exact Match: 0.092
Thresh 0.45 | Macro F1: 0.097 | Micro F1: 0.445 | Exact Match: 0.059
Thresh 0.50 | Macro F1: 0.077 | Micro F1: 0.395 | Exact Match: 0.033
Thresh 0.55 | Macro F1: 0.062 | Micro F1: 0.347 | Exact Match: 0.023
Thresh 0.60 | Macro F1: 0.052 | Micro F1: 0.304 | Exact Match: 0.010
Thresh 0.65 | Macro F1: 0.043 | Micro F1: 0.245 | Exact Match: 0.003
Thresh 0.70 | Macro F1: 0.031 | Micro F1: 0.184 | Exact Match: 0.003
Thresh 0.75 | Macro F1: 0.018 | Micro F1: 0.119 | Exac

Epoch 3: 100%|██████████| 476/476 [00:47<00:00,  9.92it/s]



Epoch 3: Loss = 0.0832
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.290 | Micro F1: 0.523 | Exact Match: 0.089
Thresh 0.15 | Macro F1: 0.291 | Micro F1: 0.602 | Exact Match: 0.212
Thresh 0.20 | Macro F1: 0.266 | Micro F1: 0.629 | Exact Match: 0.264
Thresh 0.25 | Macro F1: 0.247 | Micro F1: 0.640 | Exact Match: 0.297
Thresh 0.30 | Macro F1: 0.216 | Micro F1: 0.635 | Exact Match: 0.293
Thresh 0.35 | Macro F1: 0.188 | Micro F1: 0.618 | Exact Match: 0.285
Thresh 0.40 | Macro F1: 0.166 | Micro F1: 0.592 | Exact Match: 0.270
Thresh 0.45 | Macro F1: 0.146 | Micro F1: 0.564 | Exact Match: 0.254
Thresh 0.50 | Macro F1: 0.131 | Micro F1: 0.538 | Exact Match: 0.233
Thresh 0.55 | Macro F1: 0.117 | Micro F1: 0.505 | Exact Match: 0.195
Thresh 0.60 | Macro F1: 0.097 | Micro F1: 0.457 | Exact Match: 0.151
Thresh 0.65 | Macro F1: 0.080 | Micro F1: 0.408 | Exact Match: 0.085
Thresh 0.70 | Macro F1: 0.064 | Micro F1: 0.344 | Exact Match: 0.031
Thresh 0.75 | Macro F1: 0.044 | Micro F1: 0.270 | Exac

Epoch 4: 100%|██████████| 476/476 [00:48<00:00,  9.88it/s]



Epoch 4: Loss = 0.0675
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.400 | Micro F1: 0.609 | Exact Match: 0.212
Thresh 0.15 | Macro F1: 0.381 | Micro F1: 0.689 | Exact Match: 0.307
Thresh 0.20 | Macro F1: 0.346 | Micro F1: 0.719 | Exact Match: 0.329
Thresh 0.25 | Macro F1: 0.336 | Micro F1: 0.741 | Exact Match: 0.363
Thresh 0.30 | Macro F1: 0.316 | Micro F1: 0.739 | Exact Match: 0.362
Thresh 0.35 | Macro F1: 0.287 | Micro F1: 0.721 | Exact Match: 0.340
Thresh 0.40 | Macro F1: 0.260 | Micro F1: 0.702 | Exact Match: 0.310
Thresh 0.45 | Macro F1: 0.232 | Micro F1: 0.679 | Exact Match: 0.298
Thresh 0.50 | Macro F1: 0.204 | Micro F1: 0.650 | Exact Match: 0.275
Thresh 0.55 | Macro F1: 0.178 | Micro F1: 0.611 | Exact Match: 0.255
Thresh 0.60 | Macro F1: 0.156 | Micro F1: 0.578 | Exact Match: 0.243
Thresh 0.65 | Macro F1: 0.134 | Micro F1: 0.533 | Exact Match: 0.223
Thresh 0.70 | Macro F1: 0.106 | Micro F1: 0.472 | Exact Match: 0.194
Thresh 0.75 | Macro F1: 0.083 | Micro F1: 0.408 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.71it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.132
Micro F1: 0.403
Exact Match: 0.118

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.060
Micro F1: 0.484
Exact Match: 0.214

 Domain: UA
Macro F1: 0.072
Micro F1: 0.361
Exact Match: 0.056

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=72 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 476/476 [00:47<00:00,  9.96it/s]



Epoch 1: Loss = 0.1899
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.139 | Micro F1: 0.315 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.108 | Micro F1: 0.377 | Exact Match: 0.030
Thresh 0.20 | Macro F1: 0.087 | Micro F1: 0.396 | Exact Match: 0.035
Thresh 0.25 | Macro F1: 0.075 | Micro F1: 0.394 | Exact Match: 0.033
Thresh 0.30 | Macro F1: 0.064 | Micro F1: 0.360 | Exact Match: 0.030
Thresh 0.35 | Macro F1: 0.051 | Micro F1: 0.317 | Exact Match: 0.018
Thresh 0.40 | Macro F1: 0.038 | Micro F1: 0.259 | Exact Match: 0.014
Thresh 0.45 | Macro F1: 0.029 | Micro F1: 0.203 | Exact Match: 0.008
Thresh 0.50 | Macro F1: 0.021 | Micro F1: 0.151 | Exact Match: 0.001
Thresh 0.55 | Macro F1: 0.015 | Micro F1: 0.104 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.008 | Micro F1: 0.057 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.003 | Micro F1: 0.025 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.001 | Micro F1: 0.009 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.001 | Exac

Epoch 2: 100%|██████████| 476/476 [00:47<00:00, 10.02it/s]



Epoch 2: Loss = 0.1042
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.190 | Micro F1: 0.411 | Exact Match: 0.009
Thresh 0.15 | Macro F1: 0.192 | Micro F1: 0.489 | Exact Match: 0.036
Thresh 0.20 | Macro F1: 0.176 | Micro F1: 0.521 | Exact Match: 0.098
Thresh 0.25 | Macro F1: 0.147 | Micro F1: 0.527 | Exact Match: 0.133
Thresh 0.30 | Macro F1: 0.121 | Micro F1: 0.522 | Exact Match: 0.145
Thresh 0.35 | Macro F1: 0.110 | Micro F1: 0.509 | Exact Match: 0.141
Thresh 0.40 | Macro F1: 0.100 | Micro F1: 0.486 | Exact Match: 0.115
Thresh 0.45 | Macro F1: 0.086 | Micro F1: 0.447 | Exact Match: 0.076
Thresh 0.50 | Macro F1: 0.077 | Micro F1: 0.415 | Exact Match: 0.052
Thresh 0.55 | Macro F1: 0.067 | Micro F1: 0.371 | Exact Match: 0.031
Thresh 0.60 | Macro F1: 0.055 | Micro F1: 0.321 | Exact Match: 0.022
Thresh 0.65 | Macro F1: 0.042 | Micro F1: 0.266 | Exact Match: 0.021
Thresh 0.70 | Macro F1: 0.031 | Micro F1: 0.205 | Exact Match: 0.008
Thresh 0.75 | Macro F1: 0.021 | Micro F1: 0.142 | Exac

Epoch 3: 100%|██████████| 476/476 [00:47<00:00, 10.00it/s]



Epoch 3: Loss = 0.0835
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.310 | Micro F1: 0.554 | Exact Match: 0.099
Thresh 0.15 | Macro F1: 0.299 | Micro F1: 0.620 | Exact Match: 0.206
Thresh 0.20 | Macro F1: 0.278 | Micro F1: 0.647 | Exact Match: 0.279
Thresh 0.25 | Macro F1: 0.241 | Micro F1: 0.644 | Exact Match: 0.290
Thresh 0.30 | Macro F1: 0.221 | Micro F1: 0.638 | Exact Match: 0.284
Thresh 0.35 | Macro F1: 0.189 | Micro F1: 0.618 | Exact Match: 0.284
Thresh 0.40 | Macro F1: 0.169 | Micro F1: 0.596 | Exact Match: 0.268
Thresh 0.45 | Macro F1: 0.146 | Micro F1: 0.566 | Exact Match: 0.243
Thresh 0.50 | Macro F1: 0.125 | Micro F1: 0.538 | Exact Match: 0.224
Thresh 0.55 | Macro F1: 0.108 | Micro F1: 0.499 | Exact Match: 0.178
Thresh 0.60 | Macro F1: 0.094 | Micro F1: 0.455 | Exact Match: 0.130
Thresh 0.65 | Macro F1: 0.078 | Micro F1: 0.400 | Exact Match: 0.062
Thresh 0.70 | Macro F1: 0.060 | Micro F1: 0.330 | Exact Match: 0.025
Thresh 0.75 | Macro F1: 0.044 | Micro F1: 0.258 | Exac

Epoch 4: 100%|██████████| 476/476 [00:47<00:00,  9.99it/s]



Epoch 4: Loss = 0.0676
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.416 | Micro F1: 0.634 | Exact Match: 0.237
Thresh 0.15 | Macro F1: 0.389 | Micro F1: 0.700 | Exact Match: 0.310
Thresh 0.20 | Macro F1: 0.359 | Micro F1: 0.726 | Exact Match: 0.345
Thresh 0.25 | Macro F1: 0.328 | Micro F1: 0.734 | Exact Match: 0.357
Thresh 0.30 | Macro F1: 0.298 | Micro F1: 0.724 | Exact Match: 0.355
Thresh 0.35 | Macro F1: 0.266 | Micro F1: 0.704 | Exact Match: 0.336
Thresh 0.40 | Macro F1: 0.237 | Micro F1: 0.683 | Exact Match: 0.315
Thresh 0.45 | Macro F1: 0.207 | Micro F1: 0.653 | Exact Match: 0.296
Thresh 0.50 | Macro F1: 0.180 | Micro F1: 0.627 | Exact Match: 0.281
Thresh 0.55 | Macro F1: 0.160 | Micro F1: 0.598 | Exact Match: 0.260
Thresh 0.60 | Macro F1: 0.138 | Micro F1: 0.554 | Exact Match: 0.243
Thresh 0.65 | Macro F1: 0.115 | Micro F1: 0.511 | Exact Match: 0.195
Thresh 0.70 | Macro F1: 0.096 | Micro F1: 0.464 | Exact Match: 0.150
Thresh 0.75 | Macro F1: 0.079 | Micro F1: 0.406 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.49it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.115
Micro F1: 0.380
Exact Match: 0.107

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.051
Micro F1: 0.448
Exact Match: 0.186

 Domain: UA
Macro F1: 0.065
Micro F1: 0.341
Exact Match: 0.056

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=73 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 476/476 [00:47<00:00,  9.95it/s]



Epoch 1: Loss = 0.1905
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.101 | Micro F1: 0.325 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.079 | Micro F1: 0.373 | Exact Match: 0.001
Thresh 0.20 | Macro F1: 0.062 | Micro F1: 0.370 | Exact Match: 0.004
Thresh 0.25 | Macro F1: 0.053 | Micro F1: 0.350 | Exact Match: 0.003
Thresh 0.30 | Macro F1: 0.039 | Micro F1: 0.307 | Exact Match: 0.003
Thresh 0.35 | Macro F1: 0.031 | Micro F1: 0.248 | Exact Match: 0.000
Thresh 0.40 | Macro F1: 0.024 | Micro F1: 0.190 | Exact Match: 0.000
Thresh 0.45 | Macro F1: 0.013 | Micro F1: 0.113 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.005 | Micro F1: 0.055 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.003 | Micro F1: 0.024 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.001 | Micro F1: 0.010 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.003 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 476/476 [00:47<00:00,  9.97it/s]



Epoch 2: Loss = 0.1041
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.259 | Micro F1: 0.424 | Exact Match: 0.010
Thresh 0.15 | Macro F1: 0.226 | Micro F1: 0.505 | Exact Match: 0.039
Thresh 0.20 | Macro F1: 0.200 | Micro F1: 0.540 | Exact Match: 0.092
Thresh 0.25 | Macro F1: 0.179 | Micro F1: 0.544 | Exact Match: 0.116
Thresh 0.30 | Macro F1: 0.154 | Micro F1: 0.533 | Exact Match: 0.141
Thresh 0.35 | Macro F1: 0.134 | Micro F1: 0.517 | Exact Match: 0.124
Thresh 0.40 | Macro F1: 0.105 | Micro F1: 0.489 | Exact Match: 0.108
Thresh 0.45 | Macro F1: 0.093 | Micro F1: 0.460 | Exact Match: 0.082
Thresh 0.50 | Macro F1: 0.083 | Micro F1: 0.434 | Exact Match: 0.072
Thresh 0.55 | Macro F1: 0.071 | Micro F1: 0.393 | Exact Match: 0.057
Thresh 0.60 | Macro F1: 0.062 | Micro F1: 0.347 | Exact Match: 0.044
Thresh 0.65 | Macro F1: 0.050 | Micro F1: 0.295 | Exact Match: 0.035
Thresh 0.70 | Macro F1: 0.038 | Micro F1: 0.232 | Exact Match: 0.021
Thresh 0.75 | Macro F1: 0.023 | Micro F1: 0.152 | Exac

Epoch 3: 100%|██████████| 476/476 [00:47<00:00,  9.96it/s]



Epoch 3: Loss = 0.0827
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.341 | Micro F1: 0.528 | Exact Match: 0.107
Thresh 0.15 | Macro F1: 0.329 | Micro F1: 0.615 | Exact Match: 0.212
Thresh 0.20 | Macro F1: 0.301 | Micro F1: 0.648 | Exact Match: 0.263
Thresh 0.25 | Macro F1: 0.281 | Micro F1: 0.665 | Exact Match: 0.294
Thresh 0.30 | Macro F1: 0.249 | Micro F1: 0.666 | Exact Match: 0.315
Thresh 0.35 | Macro F1: 0.223 | Micro F1: 0.650 | Exact Match: 0.297
Thresh 0.40 | Macro F1: 0.199 | Micro F1: 0.628 | Exact Match: 0.275
Thresh 0.45 | Macro F1: 0.179 | Micro F1: 0.603 | Exact Match: 0.254
Thresh 0.50 | Macro F1: 0.155 | Micro F1: 0.569 | Exact Match: 0.223
Thresh 0.55 | Macro F1: 0.133 | Micro F1: 0.529 | Exact Match: 0.185
Thresh 0.60 | Macro F1: 0.112 | Micro F1: 0.489 | Exact Match: 0.148
Thresh 0.65 | Macro F1: 0.093 | Micro F1: 0.438 | Exact Match: 0.094
Thresh 0.70 | Macro F1: 0.073 | Micro F1: 0.380 | Exact Match: 0.056
Thresh 0.75 | Macro F1: 0.056 | Micro F1: 0.317 | Exac

Epoch 4: 100%|██████████| 476/476 [00:47<00:00,  9.94it/s]



Epoch 4: Loss = 0.0667
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.424 | Micro F1: 0.599 | Exact Match: 0.227
Thresh 0.15 | Macro F1: 0.399 | Micro F1: 0.678 | Exact Match: 0.309
Thresh 0.20 | Macro F1: 0.380 | Micro F1: 0.716 | Exact Match: 0.358
Thresh 0.25 | Macro F1: 0.364 | Micro F1: 0.733 | Exact Match: 0.375
Thresh 0.30 | Macro F1: 0.325 | Micro F1: 0.743 | Exact Match: 0.382
Thresh 0.35 | Macro F1: 0.299 | Micro F1: 0.739 | Exact Match: 0.379
Thresh 0.40 | Macro F1: 0.270 | Micro F1: 0.724 | Exact Match: 0.368
Thresh 0.45 | Macro F1: 0.248 | Micro F1: 0.711 | Exact Match: 0.342
Thresh 0.50 | Macro F1: 0.228 | Micro F1: 0.691 | Exact Match: 0.315
Thresh 0.55 | Macro F1: 0.200 | Micro F1: 0.665 | Exact Match: 0.299
Thresh 0.60 | Macro F1: 0.173 | Micro F1: 0.631 | Exact Match: 0.285
Thresh 0.65 | Macro F1: 0.145 | Micro F1: 0.579 | Exact Match: 0.255
Thresh 0.70 | Macro F1: 0.125 | Micro F1: 0.536 | Exact Match: 0.215
Thresh 0.75 | Macro F1: 0.101 | Micro F1: 0.469 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.62it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.158
Micro F1: 0.438
Exact Match: 0.084

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.067
Micro F1: 0.506
Exact Match: 0.157

 Domain: UA
Macro F1: 0.091
Micro F1: 0.400
Exact Match: 0.037

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=74 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 476/476 [00:47<00:00,  9.95it/s]



Epoch 1: Loss = 0.1864
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.086 | Micro F1: 0.342 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.064 | Micro F1: 0.377 | Exact Match: 0.018
Thresh 0.20 | Macro F1: 0.049 | Micro F1: 0.344 | Exact Match: 0.029
Thresh 0.25 | Macro F1: 0.035 | Micro F1: 0.302 | Exact Match: 0.038
Thresh 0.30 | Macro F1: 0.030 | Micro F1: 0.242 | Exact Match: 0.038
Thresh 0.35 | Macro F1: 0.025 | Micro F1: 0.188 | Exact Match: 0.033
Thresh 0.40 | Macro F1: 0.020 | Micro F1: 0.141 | Exact Match: 0.025
Thresh 0.45 | Macro F1: 0.018 | Micro F1: 0.114 | Exact Match: 0.020
Thresh 0.50 | Macro F1: 0.014 | Micro F1: 0.081 | Exact Match: 0.014
Thresh 0.55 | Macro F1: 0.011 | Micro F1: 0.061 | Exact Match: 0.007
Thresh 0.60 | Macro F1: 0.006 | Micro F1: 0.037 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.004 | Micro F1: 0.021 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 476/476 [00:48<00:00,  9.91it/s]



Epoch 2: Loss = 0.1049
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.243 | Micro F1: 0.449 | Exact Match: 0.027
Thresh 0.15 | Macro F1: 0.209 | Micro F1: 0.517 | Exact Match: 0.095
Thresh 0.20 | Macro F1: 0.182 | Micro F1: 0.532 | Exact Match: 0.184
Thresh 0.25 | Macro F1: 0.151 | Micro F1: 0.525 | Exact Match: 0.207
Thresh 0.30 | Macro F1: 0.130 | Micro F1: 0.497 | Exact Match: 0.184
Thresh 0.35 | Macro F1: 0.110 | Micro F1: 0.458 | Exact Match: 0.163
Thresh 0.40 | Macro F1: 0.094 | Micro F1: 0.413 | Exact Match: 0.135
Thresh 0.45 | Macro F1: 0.074 | Micro F1: 0.356 | Exact Match: 0.082
Thresh 0.50 | Macro F1: 0.058 | Micro F1: 0.299 | Exact Match: 0.033
Thresh 0.55 | Macro F1: 0.045 | Micro F1: 0.251 | Exact Match: 0.014
Thresh 0.60 | Macro F1: 0.036 | Micro F1: 0.207 | Exact Match: 0.010
Thresh 0.65 | Macro F1: 0.026 | Micro F1: 0.157 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.017 | Micro F1: 0.107 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.009 | Micro F1: 0.069 | Exac

Epoch 3: 100%|██████████| 476/476 [00:47<00:00,  9.96it/s]



Epoch 3: Loss = 0.0840
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.333 | Micro F1: 0.502 | Exact Match: 0.074
Thresh 0.15 | Macro F1: 0.316 | Micro F1: 0.595 | Exact Match: 0.182
Thresh 0.20 | Macro F1: 0.294 | Micro F1: 0.635 | Exact Match: 0.236
Thresh 0.25 | Macro F1: 0.268 | Micro F1: 0.649 | Exact Match: 0.273
Thresh 0.30 | Macro F1: 0.241 | Micro F1: 0.650 | Exact Match: 0.286
Thresh 0.35 | Macro F1: 0.204 | Micro F1: 0.633 | Exact Match: 0.272
Thresh 0.40 | Macro F1: 0.181 | Micro F1: 0.612 | Exact Match: 0.247
Thresh 0.45 | Macro F1: 0.161 | Micro F1: 0.586 | Exact Match: 0.204
Thresh 0.50 | Macro F1: 0.142 | Micro F1: 0.556 | Exact Match: 0.176
Thresh 0.55 | Macro F1: 0.113 | Micro F1: 0.513 | Exact Match: 0.116
Thresh 0.60 | Macro F1: 0.096 | Micro F1: 0.464 | Exact Match: 0.061
Thresh 0.65 | Macro F1: 0.078 | Micro F1: 0.416 | Exact Match: 0.034
Thresh 0.70 | Macro F1: 0.063 | Micro F1: 0.360 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.049 | Micro F1: 0.308 | Exac

Epoch 4: 100%|██████████| 476/476 [00:47<00:00, 10.00it/s]



Epoch 4: Loss = 0.0678
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.384 | Micro F1: 0.528 | Exact Match: 0.142
Thresh 0.15 | Macro F1: 0.396 | Micro F1: 0.618 | Exact Match: 0.237
Thresh 0.20 | Macro F1: 0.388 | Micro F1: 0.677 | Exact Match: 0.302
Thresh 0.25 | Macro F1: 0.357 | Micro F1: 0.706 | Exact Match: 0.336
Thresh 0.30 | Macro F1: 0.348 | Micro F1: 0.726 | Exact Match: 0.339
Thresh 0.35 | Macro F1: 0.332 | Micro F1: 0.731 | Exact Match: 0.354
Thresh 0.40 | Macro F1: 0.315 | Micro F1: 0.729 | Exact Match: 0.354
Thresh 0.45 | Macro F1: 0.286 | Micro F1: 0.717 | Exact Match: 0.333
Thresh 0.50 | Macro F1: 0.260 | Micro F1: 0.703 | Exact Match: 0.320
Thresh 0.55 | Macro F1: 0.236 | Micro F1: 0.679 | Exact Match: 0.296
Thresh 0.60 | Macro F1: 0.203 | Micro F1: 0.645 | Exact Match: 0.266
Thresh 0.65 | Macro F1: 0.177 | Micro F1: 0.609 | Exact Match: 0.236
Thresh 0.70 | Macro F1: 0.146 | Micro F1: 0.545 | Exact Match: 0.164
Thresh 0.75 | Macro F1: 0.116 | Micro F1: 0.478 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.47it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.163
Micro F1: 0.434
Exact Match: 0.079

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.073
Micro F1: 0.517
Exact Match: 0.157

 Domain: UA
Macro F1: 0.090
Micro F1: 0.391
Exact Match: 0.028

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=75 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 476/476 [00:47<00:00,  9.94it/s]



Epoch 1: Loss = 0.1869
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.096 | Micro F1: 0.332 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.071 | Micro F1: 0.365 | Exact Match: 0.023
Thresh 0.20 | Macro F1: 0.058 | Micro F1: 0.345 | Exact Match: 0.030
Thresh 0.25 | Macro F1: 0.040 | Micro F1: 0.315 | Exact Match: 0.038
Thresh 0.30 | Macro F1: 0.034 | Micro F1: 0.295 | Exact Match: 0.036
Thresh 0.35 | Macro F1: 0.032 | Micro F1: 0.255 | Exact Match: 0.030
Thresh 0.40 | Macro F1: 0.029 | Micro F1: 0.209 | Exact Match: 0.021
Thresh 0.45 | Macro F1: 0.024 | Micro F1: 0.153 | Exact Match: 0.016
Thresh 0.50 | Macro F1: 0.013 | Micro F1: 0.089 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.008 | Micro F1: 0.047 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.003 | Micro F1: 0.015 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 476/476 [00:48<00:00,  9.86it/s]



Epoch 2: Loss = 0.1048
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.232 | Micro F1: 0.430 | Exact Match: 0.012
Thresh 0.15 | Macro F1: 0.200 | Micro F1: 0.491 | Exact Match: 0.062
Thresh 0.20 | Macro F1: 0.171 | Micro F1: 0.513 | Exact Match: 0.105
Thresh 0.25 | Macro F1: 0.152 | Micro F1: 0.523 | Exact Match: 0.115
Thresh 0.30 | Macro F1: 0.132 | Micro F1: 0.503 | Exact Match: 0.100
Thresh 0.35 | Macro F1: 0.116 | Micro F1: 0.478 | Exact Match: 0.099
Thresh 0.40 | Macro F1: 0.098 | Micro F1: 0.444 | Exact Match: 0.095
Thresh 0.45 | Macro F1: 0.086 | Micro F1: 0.418 | Exact Match: 0.085
Thresh 0.50 | Macro F1: 0.072 | Micro F1: 0.382 | Exact Match: 0.066
Thresh 0.55 | Macro F1: 0.059 | Micro F1: 0.337 | Exact Match: 0.057
Thresh 0.60 | Macro F1: 0.049 | Micro F1: 0.292 | Exact Match: 0.046
Thresh 0.65 | Macro F1: 0.041 | Micro F1: 0.253 | Exact Match: 0.029
Thresh 0.70 | Macro F1: 0.029 | Micro F1: 0.194 | Exact Match: 0.016
Thresh 0.75 | Macro F1: 0.020 | Micro F1: 0.147 | Exac

Epoch 3: 100%|██████████| 476/476 [00:48<00:00,  9.89it/s]



Epoch 3: Loss = 0.0849
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.312 | Micro F1: 0.520 | Exact Match: 0.141
Thresh 0.15 | Macro F1: 0.306 | Micro F1: 0.600 | Exact Match: 0.257
Thresh 0.20 | Macro F1: 0.289 | Micro F1: 0.641 | Exact Match: 0.307
Thresh 0.25 | Macro F1: 0.247 | Micro F1: 0.641 | Exact Match: 0.299
Thresh 0.30 | Macro F1: 0.231 | Micro F1: 0.635 | Exact Match: 0.277
Thresh 0.35 | Macro F1: 0.203 | Micro F1: 0.619 | Exact Match: 0.266
Thresh 0.40 | Macro F1: 0.177 | Micro F1: 0.590 | Exact Match: 0.246
Thresh 0.45 | Macro F1: 0.147 | Micro F1: 0.551 | Exact Match: 0.194
Thresh 0.50 | Macro F1: 0.125 | Micro F1: 0.513 | Exact Match: 0.168
Thresh 0.55 | Macro F1: 0.106 | Micro F1: 0.470 | Exact Match: 0.128
Thresh 0.60 | Macro F1: 0.085 | Micro F1: 0.406 | Exact Match: 0.060
Thresh 0.65 | Macro F1: 0.063 | Micro F1: 0.343 | Exact Match: 0.021
Thresh 0.70 | Macro F1: 0.050 | Micro F1: 0.288 | Exact Match: 0.004
Thresh 0.75 | Macro F1: 0.035 | Micro F1: 0.224 | Exac

Epoch 4: 100%|██████████| 476/476 [00:48<00:00,  9.89it/s]



Epoch 4: Loss = 0.0695
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.396 | Micro F1: 0.571 | Exact Match: 0.186
Thresh 0.15 | Macro F1: 0.407 | Micro F1: 0.659 | Exact Match: 0.259
Thresh 0.20 | Macro F1: 0.399 | Micro F1: 0.712 | Exact Match: 0.316
Thresh 0.25 | Macro F1: 0.366 | Micro F1: 0.725 | Exact Match: 0.316
Thresh 0.30 | Macro F1: 0.349 | Micro F1: 0.732 | Exact Match: 0.339
Thresh 0.35 | Macro F1: 0.308 | Micro F1: 0.726 | Exact Match: 0.353
Thresh 0.40 | Macro F1: 0.284 | Micro F1: 0.712 | Exact Match: 0.342
Thresh 0.45 | Macro F1: 0.248 | Micro F1: 0.691 | Exact Match: 0.332
Thresh 0.50 | Macro F1: 0.214 | Micro F1: 0.668 | Exact Match: 0.316
Thresh 0.55 | Macro F1: 0.196 | Micro F1: 0.643 | Exact Match: 0.302
Thresh 0.60 | Macro F1: 0.171 | Micro F1: 0.611 | Exact Match: 0.281
Thresh 0.65 | Macro F1: 0.148 | Micro F1: 0.571 | Exact Match: 0.254
Thresh 0.70 | Macro F1: 0.126 | Micro F1: 0.518 | Exact Match: 0.211
Thresh 0.75 | Macro F1: 0.101 | Micro F1: 0.457 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.28it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.141
Micro F1: 0.436
Exact Match: 0.107

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.061
Micro F1: 0.525
Exact Match: 0.214

 Domain: UA
Macro F1: 0.081
Micro F1: 0.378
Exact Match: 0.037

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=42 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 476/476 [00:48<00:00,  9.91it/s]



Epoch 1: Loss = 0.1884
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.089 | Micro F1: 0.346 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.072 | Micro F1: 0.379 | Exact Match: 0.014
Thresh 0.20 | Macro F1: 0.053 | Micro F1: 0.354 | Exact Match: 0.038
Thresh 0.25 | Macro F1: 0.043 | Micro F1: 0.321 | Exact Match: 0.030
Thresh 0.30 | Macro F1: 0.033 | Micro F1: 0.268 | Exact Match: 0.012
Thresh 0.35 | Macro F1: 0.024 | Micro F1: 0.223 | Exact Match: 0.000
Thresh 0.40 | Macro F1: 0.019 | Micro F1: 0.176 | Exact Match: 0.000
Thresh 0.45 | Macro F1: 0.012 | Micro F1: 0.131 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.008 | Micro F1: 0.092 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.006 | Micro F1: 0.065 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.004 | Micro F1: 0.043 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.003 | Micro F1: 0.023 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.002 | Micro F1: 0.013 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.001 | Micro F1: 0.009 | Exac

Epoch 2: 100%|██████████| 476/476 [00:48<00:00,  9.90it/s]



Epoch 2: Loss = 0.1040
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.223 | Micro F1: 0.443 | Exact Match: 0.009
Thresh 0.15 | Macro F1: 0.198 | Micro F1: 0.515 | Exact Match: 0.072
Thresh 0.20 | Macro F1: 0.176 | Micro F1: 0.542 | Exact Match: 0.130
Thresh 0.25 | Macro F1: 0.149 | Micro F1: 0.533 | Exact Match: 0.160
Thresh 0.30 | Macro F1: 0.131 | Micro F1: 0.516 | Exact Match: 0.151
Thresh 0.35 | Macro F1: 0.111 | Micro F1: 0.489 | Exact Match: 0.124
Thresh 0.40 | Macro F1: 0.092 | Micro F1: 0.455 | Exact Match: 0.099
Thresh 0.45 | Macro F1: 0.079 | Micro F1: 0.423 | Exact Match: 0.074
Thresh 0.50 | Macro F1: 0.068 | Micro F1: 0.377 | Exact Match: 0.053
Thresh 0.55 | Macro F1: 0.060 | Micro F1: 0.331 | Exact Match: 0.034
Thresh 0.60 | Macro F1: 0.046 | Micro F1: 0.272 | Exact Match: 0.004
Thresh 0.65 | Macro F1: 0.036 | Micro F1: 0.225 | Exact Match: 0.003
Thresh 0.70 | Macro F1: 0.025 | Micro F1: 0.167 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.014 | Micro F1: 0.099 | Exac

Epoch 3: 100%|██████████| 476/476 [00:48<00:00,  9.88it/s]



Epoch 3: Loss = 0.0834
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.305 | Micro F1: 0.508 | Exact Match: 0.074
Thresh 0.15 | Macro F1: 0.299 | Micro F1: 0.593 | Exact Match: 0.176
Thresh 0.20 | Macro F1: 0.267 | Micro F1: 0.631 | Exact Match: 0.264
Thresh 0.25 | Macro F1: 0.249 | Micro F1: 0.642 | Exact Match: 0.286
Thresh 0.30 | Macro F1: 0.224 | Micro F1: 0.644 | Exact Match: 0.292
Thresh 0.35 | Macro F1: 0.198 | Micro F1: 0.629 | Exact Match: 0.267
Thresh 0.40 | Macro F1: 0.176 | Micro F1: 0.607 | Exact Match: 0.247
Thresh 0.45 | Macro F1: 0.145 | Micro F1: 0.579 | Exact Match: 0.214
Thresh 0.50 | Macro F1: 0.124 | Micro F1: 0.541 | Exact Match: 0.177
Thresh 0.55 | Macro F1: 0.111 | Micro F1: 0.510 | Exact Match: 0.154
Thresh 0.60 | Macro F1: 0.097 | Micro F1: 0.474 | Exact Match: 0.108
Thresh 0.65 | Macro F1: 0.080 | Micro F1: 0.421 | Exact Match: 0.062
Thresh 0.70 | Macro F1: 0.064 | Micro F1: 0.362 | Exact Match: 0.031
Thresh 0.75 | Macro F1: 0.048 | Micro F1: 0.296 | Exac

Epoch 4: 100%|██████████| 476/476 [00:47<00:00,  9.94it/s]



Epoch 4: Loss = 0.0681
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.389 | Micro F1: 0.576 | Exact Match: 0.150
Thresh 0.15 | Macro F1: 0.390 | Micro F1: 0.660 | Exact Match: 0.240
Thresh 0.20 | Macro F1: 0.376 | Micro F1: 0.703 | Exact Match: 0.289
Thresh 0.25 | Macro F1: 0.352 | Micro F1: 0.720 | Exact Match: 0.323
Thresh 0.30 | Macro F1: 0.319 | Micro F1: 0.728 | Exact Match: 0.355
Thresh 0.35 | Macro F1: 0.295 | Micro F1: 0.724 | Exact Match: 0.354
Thresh 0.40 | Macro F1: 0.273 | Micro F1: 0.708 | Exact Match: 0.352
Thresh 0.45 | Macro F1: 0.237 | Micro F1: 0.681 | Exact Match: 0.326
Thresh 0.50 | Macro F1: 0.210 | Micro F1: 0.654 | Exact Match: 0.305
Thresh 0.55 | Macro F1: 0.183 | Micro F1: 0.624 | Exact Match: 0.284
Thresh 0.60 | Macro F1: 0.157 | Micro F1: 0.590 | Exact Match: 0.254
Thresh 0.65 | Macro F1: 0.134 | Micro F1: 0.553 | Exact Match: 0.220
Thresh 0.70 | Macro F1: 0.109 | Micro F1: 0.495 | Exact Match: 0.165
Thresh 0.75 | Macro F1: 0.087 | Micro F1: 0.436 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.69it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.138
Micro F1: 0.428
Exact Match: 0.096

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.055
Micro F1: 0.480
Exact Match: 0.186

 Domain: UA
Macro F1: 0.083
Micro F1: 0.401
Exact Match: 0.037

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=43 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 476/476 [00:48<00:00,  9.88it/s]



Epoch 1: Loss = 0.1878
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.093 | Micro F1: 0.341 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.082 | Micro F1: 0.397 | Exact Match: 0.007
Thresh 0.20 | Macro F1: 0.058 | Micro F1: 0.346 | Exact Match: 0.017
Thresh 0.25 | Macro F1: 0.036 | Micro F1: 0.296 | Exact Match: 0.022
Thresh 0.30 | Macro F1: 0.032 | Micro F1: 0.249 | Exact Match: 0.016
Thresh 0.35 | Macro F1: 0.026 | Micro F1: 0.197 | Exact Match: 0.009
Thresh 0.40 | Macro F1: 0.019 | Micro F1: 0.142 | Exact Match: 0.003
Thresh 0.45 | Macro F1: 0.015 | Micro F1: 0.102 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.010 | Micro F1: 0.063 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.005 | Micro F1: 0.033 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.001 | Micro F1: 0.007 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.004 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 476/476 [00:48<00:00,  9.90it/s]



Epoch 2: Loss = 0.1040
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.247 | Micro F1: 0.440 | Exact Match: 0.027
Thresh 0.15 | Macro F1: 0.218 | Micro F1: 0.514 | Exact Match: 0.076
Thresh 0.20 | Macro F1: 0.185 | Micro F1: 0.544 | Exact Match: 0.125
Thresh 0.25 | Macro F1: 0.170 | Micro F1: 0.552 | Exact Match: 0.164
Thresh 0.30 | Macro F1: 0.143 | Micro F1: 0.530 | Exact Match: 0.142
Thresh 0.35 | Macro F1: 0.125 | Micro F1: 0.506 | Exact Match: 0.118
Thresh 0.40 | Macro F1: 0.107 | Micro F1: 0.475 | Exact Match: 0.099
Thresh 0.45 | Macro F1: 0.091 | Micro F1: 0.447 | Exact Match: 0.074
Thresh 0.50 | Macro F1: 0.077 | Micro F1: 0.407 | Exact Match: 0.057
Thresh 0.55 | Macro F1: 0.063 | Micro F1: 0.364 | Exact Match: 0.030
Thresh 0.60 | Macro F1: 0.049 | Micro F1: 0.301 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.039 | Micro F1: 0.253 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.030 | Micro F1: 0.205 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.020 | Micro F1: 0.150 | Exac

Epoch 3: 100%|██████████| 476/476 [00:47<00:00,  9.98it/s]



Epoch 3: Loss = 0.0831
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.329 | Micro F1: 0.541 | Exact Match: 0.089
Thresh 0.15 | Macro F1: 0.312 | Micro F1: 0.615 | Exact Match: 0.191
Thresh 0.20 | Macro F1: 0.290 | Micro F1: 0.649 | Exact Match: 0.263
Thresh 0.25 | Macro F1: 0.258 | Micro F1: 0.656 | Exact Match: 0.294
Thresh 0.30 | Macro F1: 0.226 | Micro F1: 0.640 | Exact Match: 0.292
Thresh 0.35 | Macro F1: 0.192 | Micro F1: 0.620 | Exact Match: 0.289
Thresh 0.40 | Macro F1: 0.166 | Micro F1: 0.600 | Exact Match: 0.280
Thresh 0.45 | Macro F1: 0.146 | Micro F1: 0.575 | Exact Match: 0.253
Thresh 0.50 | Macro F1: 0.127 | Micro F1: 0.542 | Exact Match: 0.225
Thresh 0.55 | Macro F1: 0.111 | Micro F1: 0.510 | Exact Match: 0.198
Thresh 0.60 | Macro F1: 0.094 | Micro F1: 0.463 | Exact Match: 0.155
Thresh 0.65 | Macro F1: 0.078 | Micro F1: 0.416 | Exact Match: 0.118
Thresh 0.70 | Macro F1: 0.063 | Micro F1: 0.360 | Exact Match: 0.072
Thresh 0.75 | Macro F1: 0.050 | Micro F1: 0.301 | Exac

Epoch 4: 100%|██████████| 476/476 [00:47<00:00,  9.97it/s]



Epoch 4: Loss = 0.0674
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.412 | Micro F1: 0.605 | Exact Match: 0.171
Thresh 0.15 | Macro F1: 0.416 | Micro F1: 0.688 | Exact Match: 0.254
Thresh 0.20 | Macro F1: 0.386 | Micro F1: 0.729 | Exact Match: 0.320
Thresh 0.25 | Macro F1: 0.350 | Micro F1: 0.736 | Exact Match: 0.359
Thresh 0.30 | Macro F1: 0.330 | Micro F1: 0.746 | Exact Match: 0.372
Thresh 0.35 | Macro F1: 0.300 | Micro F1: 0.737 | Exact Match: 0.354
Thresh 0.40 | Macro F1: 0.268 | Micro F1: 0.714 | Exact Match: 0.333
Thresh 0.45 | Macro F1: 0.235 | Micro F1: 0.688 | Exact Match: 0.323
Thresh 0.50 | Macro F1: 0.204 | Micro F1: 0.657 | Exact Match: 0.309
Thresh 0.55 | Macro F1: 0.176 | Micro F1: 0.623 | Exact Match: 0.292
Thresh 0.60 | Macro F1: 0.157 | Micro F1: 0.588 | Exact Match: 0.271
Thresh 0.65 | Macro F1: 0.132 | Micro F1: 0.542 | Exact Match: 0.232
Thresh 0.70 | Macro F1: 0.112 | Micro F1: 0.498 | Exact Match: 0.197
Thresh 0.75 | Macro F1: 0.090 | Micro F1: 0.442 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.49it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.140
Micro F1: 0.425
Exact Match: 0.090

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.054
Micro F1: 0.473
Exact Match: 0.186

 Domain: UA
Macro F1: 0.086
Micro F1: 0.400
Exact Match: 0.028

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=44 | Train on UA-CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 476/476 [00:47<00:00,  9.97it/s]



Epoch 1: Loss = 0.1888
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.083 | Micro F1: 0.322 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.075 | Micro F1: 0.380 | Exact Match: 0.017
Thresh 0.20 | Macro F1: 0.050 | Micro F1: 0.345 | Exact Match: 0.035
Thresh 0.25 | Macro F1: 0.039 | Micro F1: 0.321 | Exact Match: 0.031
Thresh 0.30 | Macro F1: 0.034 | Micro F1: 0.283 | Exact Match: 0.022
Thresh 0.35 | Macro F1: 0.027 | Micro F1: 0.231 | Exact Match: 0.014
Thresh 0.40 | Macro F1: 0.021 | Micro F1: 0.161 | Exact Match: 0.009
Thresh 0.45 | Macro F1: 0.012 | Micro F1: 0.089 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.007 | Micro F1: 0.039 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.002 | Micro F1: 0.009 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 476/476 [00:48<00:00,  9.90it/s]



Epoch 2: Loss = 0.1046
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.217 | Micro F1: 0.417 | Exact Match: 0.018
Thresh 0.15 | Macro F1: 0.191 | Micro F1: 0.489 | Exact Match: 0.052
Thresh 0.20 | Macro F1: 0.154 | Micro F1: 0.512 | Exact Match: 0.102
Thresh 0.25 | Macro F1: 0.135 | Micro F1: 0.522 | Exact Match: 0.161
Thresh 0.30 | Macro F1: 0.116 | Micro F1: 0.504 | Exact Match: 0.150
Thresh 0.35 | Macro F1: 0.100 | Micro F1: 0.469 | Exact Match: 0.105
Thresh 0.40 | Macro F1: 0.087 | Micro F1: 0.431 | Exact Match: 0.056
Thresh 0.45 | Macro F1: 0.071 | Micro F1: 0.379 | Exact Match: 0.038
Thresh 0.50 | Macro F1: 0.059 | Micro F1: 0.328 | Exact Match: 0.027
Thresh 0.55 | Macro F1: 0.048 | Micro F1: 0.275 | Exact Match: 0.016
Thresh 0.60 | Macro F1: 0.036 | Micro F1: 0.219 | Exact Match: 0.004
Thresh 0.65 | Macro F1: 0.026 | Micro F1: 0.164 | Exact Match: 0.003
Thresh 0.70 | Macro F1: 0.015 | Micro F1: 0.109 | Exact Match: 0.001
Thresh 0.75 | Macro F1: 0.008 | Micro F1: 0.071 | Exac

Epoch 3: 100%|██████████| 476/476 [00:48<00:00,  9.91it/s]



Epoch 3: Loss = 0.0840
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.292 | Micro F1: 0.568 | Exact Match: 0.100
Thresh 0.15 | Macro F1: 0.266 | Micro F1: 0.625 | Exact Match: 0.195
Thresh 0.20 | Macro F1: 0.239 | Micro F1: 0.638 | Exact Match: 0.258
Thresh 0.25 | Macro F1: 0.207 | Micro F1: 0.629 | Exact Match: 0.283
Thresh 0.30 | Macro F1: 0.184 | Micro F1: 0.612 | Exact Match: 0.267
Thresh 0.35 | Macro F1: 0.158 | Micro F1: 0.589 | Exact Match: 0.255
Thresh 0.40 | Macro F1: 0.127 | Micro F1: 0.556 | Exact Match: 0.219
Thresh 0.45 | Macro F1: 0.112 | Micro F1: 0.526 | Exact Match: 0.191
Thresh 0.50 | Macro F1: 0.095 | Micro F1: 0.485 | Exact Match: 0.141
Thresh 0.55 | Macro F1: 0.081 | Micro F1: 0.445 | Exact Match: 0.100
Thresh 0.60 | Macro F1: 0.067 | Micro F1: 0.389 | Exact Match: 0.055
Thresh 0.65 | Macro F1: 0.052 | Micro F1: 0.330 | Exact Match: 0.023
Thresh 0.70 | Macro F1: 0.041 | Micro F1: 0.273 | Exact Match: 0.004
Thresh 0.75 | Macro F1: 0.031 | Micro F1: 0.223 | Exac

Epoch 4: 100%|██████████| 476/476 [00:47<00:00,  9.97it/s]



Epoch 4: Loss = 0.0682
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.395 | Micro F1: 0.584 | Exact Match: 0.173
Thresh 0.15 | Macro F1: 0.410 | Micro F1: 0.674 | Exact Match: 0.272
Thresh 0.20 | Macro F1: 0.397 | Micro F1: 0.727 | Exact Match: 0.329
Thresh 0.25 | Macro F1: 0.363 | Micro F1: 0.741 | Exact Match: 0.344
Thresh 0.30 | Macro F1: 0.343 | Micro F1: 0.749 | Exact Match: 0.387
Thresh 0.35 | Macro F1: 0.310 | Micro F1: 0.739 | Exact Match: 0.372
Thresh 0.40 | Macro F1: 0.281 | Micro F1: 0.726 | Exact Match: 0.355
Thresh 0.45 | Macro F1: 0.248 | Micro F1: 0.707 | Exact Match: 0.336
Thresh 0.50 | Macro F1: 0.221 | Micro F1: 0.686 | Exact Match: 0.307
Thresh 0.55 | Macro F1: 0.196 | Micro F1: 0.655 | Exact Match: 0.288
Thresh 0.60 | Macro F1: 0.171 | Micro F1: 0.622 | Exact Match: 0.268
Thresh 0.65 | Macro F1: 0.149 | Micro F1: 0.578 | Exact Match: 0.227
Thresh 0.70 | Macro F1: 0.129 | Micro F1: 0.529 | Exact Match: 0.172
Thresh 0.75 | Macro F1: 0.100 | Micro F1: 0.456 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.27it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.144
Micro F1: 0.427
Exact Match: 0.101

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.071
Micro F1: 0.529
Exact Match: 0.186

 Domain: UA
Macro F1: 0.073
Micro F1: 0.366
Exact Match: 0.046

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=71 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 122/122 [00:12<00:00,  9.76it/s]



Epoch 1: Loss = 0.3015
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.101 | Micro F1: 0.151 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.080 | Micro F1: 0.335 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.070 | Micro F1: 0.402 | Exact Match: 0.005
Thresh 0.25 | Macro F1: 0.047 | Micro F1: 0.381 | Exact Match: 0.052
Thresh 0.30 | Macro F1: 0.040 | Micro F1: 0.375 | Exact Match: 0.119
Thresh 0.35 | Macro F1: 0.031 | Micro F1: 0.347 | Exact Match: 0.144
Thresh 0.40 | Macro F1: 0.029 | Micro F1: 0.337 | Exact Match: 0.144
Thresh 0.45 | Macro F1: 0.026 | Micro F1: 0.324 | Exact Match: 0.134
Thresh 0.50 | Macro F1: 0.026 | Micro F1: 0.303 | Exact Match: 0.082
Thresh 0.55 | Macro F1: 0.023 | Micro F1: 0.259 | Exact Match: 0.046
Thresh 0.60 | Macro F1: 0.015 | Micro F1: 0.198 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.013 | Micro F1: 0.158 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.001 | Micro F1: 0.009 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 122/122 [00:12<00:00,  9.78it/s]



Epoch 2: Loss = 0.1933
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.152 | Micro F1: 0.339 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.165 | Micro F1: 0.462 | Exact Match: 0.015
Thresh 0.20 | Macro F1: 0.126 | Micro F1: 0.490 | Exact Match: 0.062
Thresh 0.25 | Macro F1: 0.105 | Micro F1: 0.497 | Exact Match: 0.057
Thresh 0.30 | Macro F1: 0.093 | Micro F1: 0.493 | Exact Match: 0.077
Thresh 0.35 | Macro F1: 0.070 | Micro F1: 0.447 | Exact Match: 0.155
Thresh 0.40 | Macro F1: 0.051 | Micro F1: 0.418 | Exact Match: 0.144
Thresh 0.45 | Macro F1: 0.046 | Micro F1: 0.402 | Exact Match: 0.144
Thresh 0.50 | Macro F1: 0.042 | Micro F1: 0.385 | Exact Match: 0.139
Thresh 0.55 | Macro F1: 0.036 | Micro F1: 0.368 | Exact Match: 0.139
Thresh 0.60 | Macro F1: 0.030 | Micro F1: 0.341 | Exact Match: 0.134
Thresh 0.65 | Macro F1: 0.030 | Micro F1: 0.340 | Exact Match: 0.129
Thresh 0.70 | Macro F1: 0.029 | Micro F1: 0.315 | Exact Match: 0.088
Thresh 0.75 | Macro F1: 0.024 | Micro F1: 0.261 | Exac

Epoch 3: 100%|██████████| 122/122 [00:12<00:00,  9.73it/s]



Epoch 3: Loss = 0.1596
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.265 | Micro F1: 0.444 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.230 | Micro F1: 0.526 | Exact Match: 0.005
Thresh 0.20 | Macro F1: 0.199 | Micro F1: 0.561 | Exact Match: 0.041
Thresh 0.25 | Macro F1: 0.171 | Micro F1: 0.585 | Exact Match: 0.222
Thresh 0.30 | Macro F1: 0.145 | Micro F1: 0.569 | Exact Match: 0.309
Thresh 0.35 | Macro F1: 0.125 | Micro F1: 0.560 | Exact Match: 0.335
Thresh 0.40 | Macro F1: 0.108 | Micro F1: 0.542 | Exact Match: 0.345
Thresh 0.45 | Macro F1: 0.086 | Micro F1: 0.502 | Exact Match: 0.335
Thresh 0.50 | Macro F1: 0.081 | Micro F1: 0.486 | Exact Match: 0.299
Thresh 0.55 | Macro F1: 0.070 | Micro F1: 0.463 | Exact Match: 0.273
Thresh 0.60 | Macro F1: 0.063 | Micro F1: 0.436 | Exact Match: 0.237
Thresh 0.65 | Macro F1: 0.048 | Micro F1: 0.387 | Exact Match: 0.170
Thresh 0.70 | Macro F1: 0.036 | Micro F1: 0.343 | Exact Match: 0.119
Thresh 0.75 | Macro F1: 0.029 | Micro F1: 0.307 | Exac

Epoch 4: 100%|██████████| 122/122 [00:12<00:00,  9.70it/s]



Epoch 4: Loss = 0.1354
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.349 | Micro F1: 0.550 | Exact Match: 0.015
Thresh 0.15 | Macro F1: 0.309 | Micro F1: 0.637 | Exact Match: 0.119
Thresh 0.20 | Macro F1: 0.246 | Micro F1: 0.646 | Exact Match: 0.216
Thresh 0.25 | Macro F1: 0.212 | Micro F1: 0.649 | Exact Match: 0.325
Thresh 0.30 | Macro F1: 0.181 | Micro F1: 0.625 | Exact Match: 0.371
Thresh 0.35 | Macro F1: 0.136 | Micro F1: 0.581 | Exact Match: 0.356
Thresh 0.40 | Macro F1: 0.120 | Micro F1: 0.562 | Exact Match: 0.356
Thresh 0.45 | Macro F1: 0.106 | Micro F1: 0.543 | Exact Match: 0.345
Thresh 0.50 | Macro F1: 0.095 | Micro F1: 0.523 | Exact Match: 0.340
Thresh 0.55 | Macro F1: 0.079 | Micro F1: 0.487 | Exact Match: 0.309
Thresh 0.60 | Macro F1: 0.065 | Micro F1: 0.445 | Exact Match: 0.263
Thresh 0.65 | Macro F1: 0.055 | Micro F1: 0.398 | Exact Match: 0.201
Thresh 0.70 | Macro F1: 0.044 | Micro F1: 0.345 | Exact Match: 0.160
Thresh 0.75 | Macro F1: 0.033 | Micro F1: 0.275 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 21.43it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.063
Micro F1: 0.283
Exact Match: 0.118

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.079
Micro F1: 0.455
Exact Match: 0.229

 Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.046

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=72 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 122/122 [00:12<00:00,  9.70it/s]



Epoch 1: Loss = 0.3030
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.098 | Micro F1: 0.154 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.078 | Micro F1: 0.321 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.056 | Micro F1: 0.354 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.053 | Micro F1: 0.382 | Exact Match: 0.000
Thresh 0.30 | Macro F1: 0.049 | Micro F1: 0.395 | Exact Match: 0.119
Thresh 0.35 | Macro F1: 0.024 | Micro F1: 0.301 | Exact Match: 0.103
Thresh 0.40 | Macro F1: 0.020 | Micro F1: 0.251 | Exact Match: 0.052
Thresh 0.45 | Macro F1: 0.015 | Micro F1: 0.208 | Exact Match: 0.005
Thresh 0.50 | Macro F1: 0.014 | Micro F1: 0.183 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.012 | Micro F1: 0.136 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.005 | Micro F1: 0.047 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.003 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 122/122 [00:12<00:00,  9.70it/s]



Epoch 2: Loss = 0.1953
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.167 | Micro F1: 0.368 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.132 | Micro F1: 0.445 | Exact Match: 0.005
Thresh 0.20 | Macro F1: 0.103 | Micro F1: 0.484 | Exact Match: 0.046
Thresh 0.25 | Macro F1: 0.090 | Micro F1: 0.492 | Exact Match: 0.067
Thresh 0.30 | Macro F1: 0.083 | Micro F1: 0.486 | Exact Match: 0.144
Thresh 0.35 | Macro F1: 0.059 | Micro F1: 0.432 | Exact Match: 0.170
Thresh 0.40 | Macro F1: 0.049 | Micro F1: 0.400 | Exact Match: 0.155
Thresh 0.45 | Macro F1: 0.038 | Micro F1: 0.376 | Exact Match: 0.144
Thresh 0.50 | Macro F1: 0.036 | Micro F1: 0.368 | Exact Match: 0.134
Thresh 0.55 | Macro F1: 0.035 | Micro F1: 0.350 | Exact Match: 0.113
Thresh 0.60 | Macro F1: 0.030 | Micro F1: 0.317 | Exact Match: 0.077
Thresh 0.65 | Macro F1: 0.026 | Micro F1: 0.279 | Exact Match: 0.046
Thresh 0.70 | Macro F1: 0.017 | Micro F1: 0.207 | Exact Match: 0.010
Thresh 0.75 | Macro F1: 0.014 | Micro F1: 0.179 | Exac

Epoch 3: 100%|██████████| 122/122 [00:12<00:00,  9.69it/s]



Epoch 3: Loss = 0.1590
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.253 | Micro F1: 0.391 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.272 | Micro F1: 0.508 | Exact Match: 0.005
Thresh 0.20 | Macro F1: 0.268 | Micro F1: 0.578 | Exact Match: 0.088
Thresh 0.25 | Macro F1: 0.224 | Micro F1: 0.601 | Exact Match: 0.113
Thresh 0.30 | Macro F1: 0.220 | Micro F1: 0.623 | Exact Match: 0.165
Thresh 0.35 | Macro F1: 0.194 | Micro F1: 0.607 | Exact Match: 0.155
Thresh 0.40 | Macro F1: 0.169 | Micro F1: 0.599 | Exact Match: 0.273
Thresh 0.45 | Macro F1: 0.134 | Micro F1: 0.552 | Exact Match: 0.242
Thresh 0.50 | Macro F1: 0.115 | Micro F1: 0.531 | Exact Match: 0.216
Thresh 0.55 | Macro F1: 0.094 | Micro F1: 0.497 | Exact Match: 0.191
Thresh 0.60 | Macro F1: 0.075 | Micro F1: 0.459 | Exact Match: 0.165
Thresh 0.65 | Macro F1: 0.062 | Micro F1: 0.413 | Exact Match: 0.119
Thresh 0.70 | Macro F1: 0.049 | Micro F1: 0.373 | Exact Match: 0.103
Thresh 0.75 | Macro F1: 0.032 | Micro F1: 0.315 | Exac

Epoch 4: 100%|██████████| 122/122 [00:12<00:00,  9.65it/s]



Epoch 4: Loss = 0.1326
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.344 | Micro F1: 0.497 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.392 | Micro F1: 0.615 | Exact Match: 0.021
Thresh 0.20 | Macro F1: 0.360 | Micro F1: 0.660 | Exact Match: 0.088
Thresh 0.25 | Macro F1: 0.335 | Micro F1: 0.690 | Exact Match: 0.196
Thresh 0.30 | Macro F1: 0.294 | Micro F1: 0.700 | Exact Match: 0.320
Thresh 0.35 | Macro F1: 0.257 | Micro F1: 0.693 | Exact Match: 0.371
Thresh 0.40 | Macro F1: 0.212 | Micro F1: 0.665 | Exact Match: 0.345
Thresh 0.45 | Macro F1: 0.188 | Micro F1: 0.627 | Exact Match: 0.340
Thresh 0.50 | Macro F1: 0.160 | Micro F1: 0.603 | Exact Match: 0.335
Thresh 0.55 | Macro F1: 0.133 | Micro F1: 0.570 | Exact Match: 0.330
Thresh 0.60 | Macro F1: 0.113 | Micro F1: 0.541 | Exact Match: 0.309
Thresh 0.65 | Macro F1: 0.098 | Micro F1: 0.494 | Exact Match: 0.237
Thresh 0.70 | Macro F1: 0.078 | Micro F1: 0.445 | Exact Match: 0.180
Thresh 0.75 | Macro F1: 0.057 | Micro F1: 0.384 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 21.85it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.088
Micro F1: 0.305
Exact Match: 0.084

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.114
Micro F1: 0.498
Exact Match: 0.186

 Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.019

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=73 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 122/122 [00:12<00:00,  9.72it/s]



Epoch 1: Loss = 0.3064
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.107 | Micro F1: 0.153 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.087 | Micro F1: 0.317 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.063 | Micro F1: 0.406 | Exact Match: 0.021
Thresh 0.25 | Macro F1: 0.048 | Micro F1: 0.405 | Exact Match: 0.057
Thresh 0.30 | Macro F1: 0.044 | Micro F1: 0.400 | Exact Match: 0.108
Thresh 0.35 | Macro F1: 0.031 | Micro F1: 0.344 | Exact Match: 0.134
Thresh 0.40 | Macro F1: 0.027 | Micro F1: 0.326 | Exact Match: 0.134
Thresh 0.45 | Macro F1: 0.027 | Micro F1: 0.313 | Exact Match: 0.103
Thresh 0.50 | Macro F1: 0.027 | Micro F1: 0.307 | Exact Match: 0.088
Thresh 0.55 | Macro F1: 0.026 | Micro F1: 0.280 | Exact Match: 0.072
Thresh 0.60 | Macro F1: 0.019 | Micro F1: 0.210 | Exact Match: 0.026
Thresh 0.65 | Macro F1: 0.013 | Micro F1: 0.156 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.010 | Micro F1: 0.104 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.001 | Micro F1: 0.009 | Exac

Epoch 2: 100%|██████████| 122/122 [00:12<00:00,  9.77it/s]



Epoch 2: Loss = 0.1918
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.136 | Micro F1: 0.368 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.086 | Micro F1: 0.416 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.078 | Micro F1: 0.448 | Exact Match: 0.057
Thresh 0.25 | Macro F1: 0.069 | Micro F1: 0.447 | Exact Match: 0.057
Thresh 0.30 | Macro F1: 0.058 | Micro F1: 0.437 | Exact Match: 0.082
Thresh 0.35 | Macro F1: 0.043 | Micro F1: 0.388 | Exact Match: 0.155
Thresh 0.40 | Macro F1: 0.040 | Micro F1: 0.381 | Exact Match: 0.155
Thresh 0.45 | Macro F1: 0.034 | Micro F1: 0.361 | Exact Match: 0.144
Thresh 0.50 | Macro F1: 0.032 | Micro F1: 0.351 | Exact Match: 0.139
Thresh 0.55 | Macro F1: 0.032 | Micro F1: 0.349 | Exact Match: 0.134
Thresh 0.60 | Macro F1: 0.030 | Micro F1: 0.341 | Exact Match: 0.129
Thresh 0.65 | Macro F1: 0.029 | Micro F1: 0.321 | Exact Match: 0.108
Thresh 0.70 | Macro F1: 0.026 | Micro F1: 0.277 | Exact Match: 0.052
Thresh 0.75 | Macro F1: 0.019 | Micro F1: 0.215 | Exac

Epoch 3: 100%|██████████| 122/122 [00:12<00:00,  9.76it/s]



Epoch 3: Loss = 0.1577
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.273 | Micro F1: 0.437 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.279 | Micro F1: 0.552 | Exact Match: 0.026
Thresh 0.20 | Macro F1: 0.234 | Micro F1: 0.601 | Exact Match: 0.134
Thresh 0.25 | Macro F1: 0.205 | Micro F1: 0.618 | Exact Match: 0.201
Thresh 0.30 | Macro F1: 0.174 | Micro F1: 0.611 | Exact Match: 0.299
Thresh 0.35 | Macro F1: 0.162 | Micro F1: 0.592 | Exact Match: 0.340
Thresh 0.40 | Macro F1: 0.144 | Micro F1: 0.569 | Exact Match: 0.340
Thresh 0.45 | Macro F1: 0.120 | Micro F1: 0.537 | Exact Match: 0.314
Thresh 0.50 | Macro F1: 0.094 | Micro F1: 0.498 | Exact Match: 0.289
Thresh 0.55 | Macro F1: 0.074 | Micro F1: 0.462 | Exact Match: 0.253
Thresh 0.60 | Macro F1: 0.068 | Micro F1: 0.446 | Exact Match: 0.237
Thresh 0.65 | Macro F1: 0.060 | Micro F1: 0.419 | Exact Match: 0.201
Thresh 0.70 | Macro F1: 0.047 | Micro F1: 0.366 | Exact Match: 0.134
Thresh 0.75 | Macro F1: 0.033 | Micro F1: 0.310 | Exac

Epoch 4: 100%|██████████| 122/122 [00:12<00:00,  9.73it/s]



Epoch 4: Loss = 0.1327
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.367 | Micro F1: 0.504 | Exact Match: 0.005
Thresh 0.15 | Macro F1: 0.357 | Micro F1: 0.611 | Exact Match: 0.098
Thresh 0.20 | Macro F1: 0.305 | Micro F1: 0.657 | Exact Match: 0.222
Thresh 0.25 | Macro F1: 0.286 | Micro F1: 0.679 | Exact Match: 0.263
Thresh 0.30 | Macro F1: 0.250 | Micro F1: 0.677 | Exact Match: 0.299
Thresh 0.35 | Macro F1: 0.225 | Micro F1: 0.680 | Exact Match: 0.376
Thresh 0.40 | Macro F1: 0.189 | Micro F1: 0.653 | Exact Match: 0.371
Thresh 0.45 | Macro F1: 0.164 | Micro F1: 0.620 | Exact Match: 0.361
Thresh 0.50 | Macro F1: 0.142 | Micro F1: 0.598 | Exact Match: 0.356
Thresh 0.55 | Macro F1: 0.117 | Micro F1: 0.573 | Exact Match: 0.356
Thresh 0.60 | Macro F1: 0.110 | Micro F1: 0.550 | Exact Match: 0.335
Thresh 0.65 | Macro F1: 0.094 | Micro F1: 0.516 | Exact Match: 0.335
Thresh 0.70 | Macro F1: 0.081 | Micro F1: 0.475 | Exact Match: 0.263
Thresh 0.75 | Macro F1: 0.058 | Micro F1: 0.405 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 21.99it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.097
Micro F1: 0.322
Exact Match: 0.101

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.117
Micro F1: 0.511
Exact Match: 0.229

 Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.019

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=74 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 122/122 [00:12<00:00,  9.73it/s]



Epoch 1: Loss = 0.2974
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.093 | Micro F1: 0.177 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.061 | Micro F1: 0.316 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.053 | Micro F1: 0.372 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.039 | Micro F1: 0.364 | Exact Match: 0.000
Thresh 0.30 | Macro F1: 0.030 | Micro F1: 0.354 | Exact Match: 0.129
Thresh 0.35 | Macro F1: 0.025 | Micro F1: 0.310 | Exact Match: 0.124
Thresh 0.40 | Macro F1: 0.022 | Micro F1: 0.275 | Exact Match: 0.062
Thresh 0.45 | Macro F1: 0.017 | Micro F1: 0.219 | Exact Match: 0.015
Thresh 0.50 | Macro F1: 0.014 | Micro F1: 0.171 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.007 | Micro F1: 0.066 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.001 | Micro F1: 0.006 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 122/122 [00:12<00:00,  9.72it/s]



Epoch 2: Loss = 0.1917
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.186 | Micro F1: 0.337 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.162 | Micro F1: 0.442 | Exact Match: 0.021
Thresh 0.20 | Macro F1: 0.140 | Micro F1: 0.496 | Exact Match: 0.057
Thresh 0.25 | Macro F1: 0.124 | Micro F1: 0.515 | Exact Match: 0.062
Thresh 0.30 | Macro F1: 0.103 | Micro F1: 0.511 | Exact Match: 0.088
Thresh 0.35 | Macro F1: 0.082 | Micro F1: 0.468 | Exact Match: 0.144
Thresh 0.40 | Macro F1: 0.071 | Micro F1: 0.453 | Exact Match: 0.139
Thresh 0.45 | Macro F1: 0.061 | Micro F1: 0.432 | Exact Match: 0.113
Thresh 0.50 | Macro F1: 0.044 | Micro F1: 0.391 | Exact Match: 0.108
Thresh 0.55 | Macro F1: 0.039 | Micro F1: 0.364 | Exact Match: 0.077
Thresh 0.60 | Macro F1: 0.035 | Micro F1: 0.329 | Exact Match: 0.057
Thresh 0.65 | Macro F1: 0.028 | Micro F1: 0.266 | Exact Match: 0.021
Thresh 0.70 | Macro F1: 0.017 | Micro F1: 0.198 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.014 | Micro F1: 0.166 | Exac

Epoch 3: 100%|██████████| 122/122 [00:12<00:00,  9.78it/s]



Epoch 3: Loss = 0.1578
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.293 | Micro F1: 0.437 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.295 | Micro F1: 0.531 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.266 | Micro F1: 0.586 | Exact Match: 0.052
Thresh 0.25 | Macro F1: 0.252 | Micro F1: 0.606 | Exact Match: 0.088
Thresh 0.30 | Macro F1: 0.227 | Micro F1: 0.619 | Exact Match: 0.247
Thresh 0.35 | Macro F1: 0.160 | Micro F1: 0.577 | Exact Match: 0.309
Thresh 0.40 | Macro F1: 0.133 | Micro F1: 0.554 | Exact Match: 0.335
Thresh 0.45 | Macro F1: 0.123 | Micro F1: 0.536 | Exact Match: 0.335
Thresh 0.50 | Macro F1: 0.103 | Micro F1: 0.505 | Exact Match: 0.304
Thresh 0.55 | Macro F1: 0.092 | Micro F1: 0.482 | Exact Match: 0.273
Thresh 0.60 | Macro F1: 0.078 | Micro F1: 0.456 | Exact Match: 0.237
Thresh 0.65 | Macro F1: 0.057 | Micro F1: 0.394 | Exact Match: 0.175
Thresh 0.70 | Macro F1: 0.047 | Micro F1: 0.352 | Exact Match: 0.119
Thresh 0.75 | Macro F1: 0.031 | Micro F1: 0.295 | Exac

Epoch 4: 100%|██████████| 122/122 [00:12<00:00,  9.80it/s]



Epoch 4: Loss = 0.1303
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.376 | Micro F1: 0.506 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.397 | Micro F1: 0.612 | Exact Match: 0.021
Thresh 0.20 | Macro F1: 0.386 | Micro F1: 0.676 | Exact Match: 0.155
Thresh 0.25 | Macro F1: 0.322 | Micro F1: 0.697 | Exact Match: 0.247
Thresh 0.30 | Macro F1: 0.286 | Micro F1: 0.701 | Exact Match: 0.284
Thresh 0.35 | Macro F1: 0.239 | Micro F1: 0.687 | Exact Match: 0.304
Thresh 0.40 | Macro F1: 0.223 | Micro F1: 0.685 | Exact Match: 0.376
Thresh 0.45 | Macro F1: 0.183 | Micro F1: 0.642 | Exact Match: 0.371
Thresh 0.50 | Macro F1: 0.149 | Micro F1: 0.597 | Exact Match: 0.325
Thresh 0.55 | Macro F1: 0.133 | Micro F1: 0.568 | Exact Match: 0.299
Thresh 0.60 | Macro F1: 0.107 | Micro F1: 0.528 | Exact Match: 0.268
Thresh 0.65 | Macro F1: 0.088 | Micro F1: 0.486 | Exact Match: 0.211
Thresh 0.70 | Macro F1: 0.074 | Micro F1: 0.439 | Exact Match: 0.144
Thresh 0.75 | Macro F1: 0.050 | Micro F1: 0.364 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.17it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.103
Micro F1: 0.319
Exact Match: 0.079

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.129
Micro F1: 0.521
Exact Match: 0.186

 Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.009

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=75 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 122/122 [00:12<00:00,  9.76it/s]



Epoch 1: Loss = 0.2932
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.098 | Micro F1: 0.182 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.080 | Micro F1: 0.332 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.060 | Micro F1: 0.381 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.044 | Micro F1: 0.390 | Exact Match: 0.000
Thresh 0.30 | Macro F1: 0.044 | Micro F1: 0.398 | Exact Match: 0.119
Thresh 0.35 | Macro F1: 0.036 | Micro F1: 0.379 | Exact Match: 0.139
Thresh 0.40 | Macro F1: 0.029 | Micro F1: 0.324 | Exact Match: 0.108
Thresh 0.45 | Macro F1: 0.027 | Micro F1: 0.308 | Exact Match: 0.088
Thresh 0.50 | Macro F1: 0.024 | Micro F1: 0.265 | Exact Match: 0.041
Thresh 0.55 | Macro F1: 0.015 | Micro F1: 0.198 | Exact Match: 0.005
Thresh 0.60 | Macro F1: 0.014 | Micro F1: 0.176 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.012 | Micro F1: 0.143 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.006 | Micro F1: 0.049 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 122/122 [00:12<00:00,  9.78it/s]



Epoch 2: Loss = 0.1903
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.195 | Micro F1: 0.340 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.163 | Micro F1: 0.445 | Exact Match: 0.005
Thresh 0.20 | Macro F1: 0.133 | Micro F1: 0.502 | Exact Match: 0.046
Thresh 0.25 | Macro F1: 0.122 | Micro F1: 0.532 | Exact Match: 0.113
Thresh 0.30 | Macro F1: 0.099 | Micro F1: 0.518 | Exact Match: 0.216
Thresh 0.35 | Macro F1: 0.082 | Micro F1: 0.488 | Exact Match: 0.206
Thresh 0.40 | Macro F1: 0.066 | Micro F1: 0.444 | Exact Match: 0.160
Thresh 0.45 | Macro F1: 0.056 | Micro F1: 0.414 | Exact Match: 0.124
Thresh 0.50 | Macro F1: 0.042 | Micro F1: 0.360 | Exact Match: 0.103
Thresh 0.55 | Macro F1: 0.035 | Micro F1: 0.323 | Exact Match: 0.077
Thresh 0.60 | Macro F1: 0.031 | Micro F1: 0.281 | Exact Match: 0.046
Thresh 0.65 | Macro F1: 0.021 | Micro F1: 0.203 | Exact Match: 0.021
Thresh 0.70 | Macro F1: 0.014 | Micro F1: 0.144 | Exact Match: 0.005
Thresh 0.75 | Macro F1: 0.009 | Micro F1: 0.086 | Exac

Epoch 3: 100%|██████████| 122/122 [00:12<00:00,  9.74it/s]



Epoch 3: Loss = 0.1563
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.276 | Micro F1: 0.464 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.248 | Micro F1: 0.551 | Exact Match: 0.010
Thresh 0.20 | Macro F1: 0.207 | Micro F1: 0.589 | Exact Match: 0.093
Thresh 0.25 | Macro F1: 0.192 | Micro F1: 0.605 | Exact Match: 0.149
Thresh 0.30 | Macro F1: 0.163 | Micro F1: 0.603 | Exact Match: 0.263
Thresh 0.35 | Macro F1: 0.137 | Micro F1: 0.564 | Exact Match: 0.320
Thresh 0.40 | Macro F1: 0.108 | Micro F1: 0.531 | Exact Match: 0.320
Thresh 0.45 | Macro F1: 0.085 | Micro F1: 0.488 | Exact Match: 0.284
Thresh 0.50 | Macro F1: 0.073 | Micro F1: 0.467 | Exact Match: 0.268
Thresh 0.55 | Macro F1: 0.063 | Micro F1: 0.442 | Exact Match: 0.237
Thresh 0.60 | Macro F1: 0.049 | Micro F1: 0.404 | Exact Match: 0.206
Thresh 0.65 | Macro F1: 0.042 | Micro F1: 0.377 | Exact Match: 0.160
Thresh 0.70 | Macro F1: 0.032 | Micro F1: 0.349 | Exact Match: 0.129
Thresh 0.75 | Macro F1: 0.031 | Micro F1: 0.329 | Exac

Epoch 4: 100%|██████████| 122/122 [00:12<00:00,  9.76it/s]



Epoch 4: Loss = 0.1301
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.384 | Micro F1: 0.540 | Exact Match: 0.005
Thresh 0.15 | Macro F1: 0.394 | Micro F1: 0.641 | Exact Match: 0.144
Thresh 0.20 | Macro F1: 0.366 | Micro F1: 0.691 | Exact Match: 0.258
Thresh 0.25 | Macro F1: 0.326 | Micro F1: 0.701 | Exact Match: 0.361
Thresh 0.30 | Macro F1: 0.283 | Micro F1: 0.699 | Exact Match: 0.392
Thresh 0.35 | Macro F1: 0.252 | Micro F1: 0.683 | Exact Match: 0.361
Thresh 0.40 | Macro F1: 0.218 | Micro F1: 0.660 | Exact Match: 0.356
Thresh 0.45 | Macro F1: 0.195 | Micro F1: 0.632 | Exact Match: 0.351
Thresh 0.50 | Macro F1: 0.168 | Micro F1: 0.604 | Exact Match: 0.351
Thresh 0.55 | Macro F1: 0.154 | Micro F1: 0.587 | Exact Match: 0.345
Thresh 0.60 | Macro F1: 0.125 | Micro F1: 0.547 | Exact Match: 0.330
Thresh 0.65 | Macro F1: 0.100 | Micro F1: 0.490 | Exact Match: 0.253
Thresh 0.70 | Macro F1: 0.078 | Micro F1: 0.423 | Exact Match: 0.155
Thresh 0.75 | Macro F1: 0.053 | Micro F1: 0.331 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.11it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.082
Micro F1: 0.300
Exact Match: 0.146

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.101
Micro F1: 0.474
Exact Match: 0.257

 Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.074

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=42 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 122/122 [00:12<00:00,  9.75it/s]



Epoch 1: Loss = 0.3042
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.088 | Micro F1: 0.165 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.064 | Micro F1: 0.297 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.044 | Micro F1: 0.346 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.032 | Micro F1: 0.342 | Exact Match: 0.000
Thresh 0.30 | Macro F1: 0.028 | Micro F1: 0.342 | Exact Match: 0.113
Thresh 0.35 | Macro F1: 0.022 | Micro F1: 0.306 | Exact Match: 0.144
Thresh 0.40 | Macro F1: 0.016 | Micro F1: 0.222 | Exact Match: 0.031
Thresh 0.45 | Macro F1: 0.014 | Micro F1: 0.194 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.006 | Micro F1: 0.049 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 122/122 [00:12<00:00,  9.75it/s]



Epoch 2: Loss = 0.1963
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.152 | Micro F1: 0.335 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.120 | Micro F1: 0.424 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.117 | Micro F1: 0.484 | Exact Match: 0.046
Thresh 0.25 | Macro F1: 0.099 | Micro F1: 0.501 | Exact Match: 0.057
Thresh 0.30 | Macro F1: 0.091 | Micro F1: 0.503 | Exact Match: 0.077
Thresh 0.35 | Macro F1: 0.069 | Micro F1: 0.462 | Exact Match: 0.088
Thresh 0.40 | Macro F1: 0.055 | Micro F1: 0.435 | Exact Match: 0.139
Thresh 0.45 | Macro F1: 0.047 | Micro F1: 0.409 | Exact Match: 0.134
Thresh 0.50 | Macro F1: 0.041 | Micro F1: 0.380 | Exact Match: 0.113
Thresh 0.55 | Macro F1: 0.038 | Micro F1: 0.358 | Exact Match: 0.108
Thresh 0.60 | Macro F1: 0.033 | Micro F1: 0.317 | Exact Match: 0.067
Thresh 0.65 | Macro F1: 0.024 | Micro F1: 0.250 | Exact Match: 0.041
Thresh 0.70 | Macro F1: 0.016 | Micro F1: 0.186 | Exact Match: 0.015
Thresh 0.75 | Macro F1: 0.013 | Micro F1: 0.154 | Exac

Epoch 3: 100%|██████████| 122/122 [00:12<00:00,  9.78it/s]



Epoch 3: Loss = 0.1621
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.265 | Micro F1: 0.413 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.250 | Micro F1: 0.506 | Exact Match: 0.010
Thresh 0.20 | Macro F1: 0.220 | Micro F1: 0.557 | Exact Match: 0.041
Thresh 0.25 | Macro F1: 0.202 | Micro F1: 0.585 | Exact Match: 0.098
Thresh 0.30 | Macro F1: 0.182 | Micro F1: 0.578 | Exact Match: 0.124
Thresh 0.35 | Macro F1: 0.152 | Micro F1: 0.557 | Exact Match: 0.186
Thresh 0.40 | Macro F1: 0.116 | Micro F1: 0.516 | Exact Match: 0.211
Thresh 0.45 | Macro F1: 0.100 | Micro F1: 0.489 | Exact Match: 0.186
Thresh 0.50 | Macro F1: 0.082 | Micro F1: 0.459 | Exact Match: 0.155
Thresh 0.55 | Macro F1: 0.064 | Micro F1: 0.423 | Exact Match: 0.144
Thresh 0.60 | Macro F1: 0.055 | Micro F1: 0.407 | Exact Match: 0.139
Thresh 0.65 | Macro F1: 0.050 | Micro F1: 0.394 | Exact Match: 0.139
Thresh 0.70 | Macro F1: 0.040 | Micro F1: 0.372 | Exact Match: 0.139
Thresh 0.75 | Macro F1: 0.033 | Micro F1: 0.344 | Exac

Epoch 4: 100%|██████████| 122/122 [00:12<00:00,  9.78it/s]



Epoch 4: Loss = 0.1362
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.326 | Micro F1: 0.483 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.339 | Micro F1: 0.577 | Exact Match: 0.052
Thresh 0.20 | Macro F1: 0.327 | Micro F1: 0.633 | Exact Match: 0.098
Thresh 0.25 | Macro F1: 0.312 | Micro F1: 0.664 | Exact Match: 0.170
Thresh 0.30 | Macro F1: 0.299 | Micro F1: 0.680 | Exact Match: 0.335
Thresh 0.35 | Macro F1: 0.254 | Micro F1: 0.671 | Exact Match: 0.345
Thresh 0.40 | Macro F1: 0.230 | Micro F1: 0.656 | Exact Match: 0.356
Thresh 0.45 | Macro F1: 0.188 | Micro F1: 0.617 | Exact Match: 0.340
Thresh 0.50 | Macro F1: 0.173 | Micro F1: 0.601 | Exact Match: 0.325
Thresh 0.55 | Macro F1: 0.161 | Micro F1: 0.582 | Exact Match: 0.304
Thresh 0.60 | Macro F1: 0.129 | Micro F1: 0.547 | Exact Match: 0.278
Thresh 0.65 | Macro F1: 0.111 | Micro F1: 0.502 | Exact Match: 0.227
Thresh 0.70 | Macro F1: 0.076 | Micro F1: 0.426 | Exact Match: 0.129
Thresh 0.75 | Macro F1: 0.059 | Micro F1: 0.349 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.76it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.099
Micro F1: 0.294
Exact Match: 0.067

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.127
Micro F1: 0.495
Exact Match: 0.157

 Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.009

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=43 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 122/122 [00:12<00:00,  9.70it/s]



Epoch 1: Loss = 0.2973
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.093 | Micro F1: 0.185 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.066 | Micro F1: 0.328 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.055 | Micro F1: 0.360 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.049 | Micro F1: 0.378 | Exact Match: 0.000
Thresh 0.30 | Macro F1: 0.029 | Micro F1: 0.341 | Exact Match: 0.077
Thresh 0.35 | Macro F1: 0.027 | Micro F1: 0.321 | Exact Match: 0.144
Thresh 0.40 | Macro F1: 0.024 | Micro F1: 0.315 | Exact Match: 0.144
Thresh 0.45 | Macro F1: 0.024 | Micro F1: 0.298 | Exact Match: 0.093
Thresh 0.50 | Macro F1: 0.021 | Micro F1: 0.248 | Exact Match: 0.036
Thresh 0.55 | Macro F1: 0.014 | Micro F1: 0.173 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.008 | Micro F1: 0.077 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 122/122 [00:12<00:00,  9.64it/s]



Epoch 2: Loss = 0.1922
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.198 | Micro F1: 0.334 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.195 | Micro F1: 0.447 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.180 | Micro F1: 0.513 | Exact Match: 0.041
Thresh 0.25 | Macro F1: 0.136 | Micro F1: 0.531 | Exact Match: 0.072
Thresh 0.30 | Macro F1: 0.118 | Micro F1: 0.521 | Exact Match: 0.149
Thresh 0.35 | Macro F1: 0.103 | Micro F1: 0.499 | Exact Match: 0.175
Thresh 0.40 | Macro F1: 0.085 | Micro F1: 0.476 | Exact Match: 0.201
Thresh 0.45 | Macro F1: 0.065 | Micro F1: 0.442 | Exact Match: 0.160
Thresh 0.50 | Macro F1: 0.049 | Micro F1: 0.406 | Exact Match: 0.134
Thresh 0.55 | Macro F1: 0.041 | Micro F1: 0.387 | Exact Match: 0.129
Thresh 0.60 | Macro F1: 0.038 | Micro F1: 0.371 | Exact Match: 0.129
Thresh 0.65 | Macro F1: 0.037 | Micro F1: 0.357 | Exact Match: 0.119
Thresh 0.70 | Macro F1: 0.031 | Micro F1: 0.309 | Exact Match: 0.082
Thresh 0.75 | Macro F1: 0.026 | Micro F1: 0.268 | Exac

Epoch 3: 100%|██████████| 122/122 [00:12<00:00,  9.70it/s]



Epoch 3: Loss = 0.1569
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.295 | Micro F1: 0.436 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.357 | Micro F1: 0.562 | Exact Match: 0.036
Thresh 0.20 | Macro F1: 0.311 | Micro F1: 0.606 | Exact Match: 0.098
Thresh 0.25 | Macro F1: 0.284 | Micro F1: 0.627 | Exact Match: 0.180
Thresh 0.30 | Macro F1: 0.251 | Micro F1: 0.635 | Exact Match: 0.325
Thresh 0.35 | Macro F1: 0.191 | Micro F1: 0.603 | Exact Match: 0.330
Thresh 0.40 | Macro F1: 0.171 | Micro F1: 0.592 | Exact Match: 0.335
Thresh 0.45 | Macro F1: 0.152 | Micro F1: 0.564 | Exact Match: 0.309
Thresh 0.50 | Macro F1: 0.131 | Micro F1: 0.519 | Exact Match: 0.258
Thresh 0.55 | Macro F1: 0.118 | Micro F1: 0.490 | Exact Match: 0.227
Thresh 0.60 | Macro F1: 0.086 | Micro F1: 0.450 | Exact Match: 0.196
Thresh 0.65 | Macro F1: 0.072 | Micro F1: 0.401 | Exact Match: 0.113
Thresh 0.70 | Macro F1: 0.053 | Micro F1: 0.339 | Exact Match: 0.057
Thresh 0.75 | Macro F1: 0.037 | Micro F1: 0.273 | Exac

Epoch 4: 100%|██████████| 122/122 [00:12<00:00,  9.70it/s]



Epoch 4: Loss = 0.1303
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.367 | Micro F1: 0.540 | Exact Match: 0.026
Thresh 0.15 | Macro F1: 0.401 | Micro F1: 0.645 | Exact Match: 0.155
Thresh 0.20 | Macro F1: 0.359 | Micro F1: 0.678 | Exact Match: 0.273
Thresh 0.25 | Macro F1: 0.318 | Micro F1: 0.677 | Exact Match: 0.345
Thresh 0.30 | Macro F1: 0.293 | Micro F1: 0.666 | Exact Match: 0.356
Thresh 0.35 | Macro F1: 0.258 | Micro F1: 0.649 | Exact Match: 0.361
Thresh 0.40 | Macro F1: 0.222 | Micro F1: 0.629 | Exact Match: 0.356
Thresh 0.45 | Macro F1: 0.180 | Micro F1: 0.603 | Exact Match: 0.356
Thresh 0.50 | Macro F1: 0.155 | Micro F1: 0.573 | Exact Match: 0.351
Thresh 0.55 | Macro F1: 0.143 | Micro F1: 0.555 | Exact Match: 0.351
Thresh 0.60 | Macro F1: 0.119 | Micro F1: 0.525 | Exact Match: 0.335
Thresh 0.65 | Macro F1: 0.097 | Micro F1: 0.491 | Exact Match: 0.294
Thresh 0.70 | Macro F1: 0.083 | Micro F1: 0.452 | Exact Match: 0.232
Thresh 0.75 | Macro F1: 0.065 | Micro F1: 0.389 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.63it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.095
Micro F1: 0.286
Exact Match: 0.112

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.111
Micro F1: 0.458
Exact Match: 0.214

 Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.046

--- Running STL: narrative_classification | fine | distilbert-base-uncased | Seed=44 | Train on CC ---


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 122/122 [00:12<00:00,  9.64it/s]



Epoch 1: Loss = 0.2990
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.103 | Micro F1: 0.173 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.061 | Micro F1: 0.297 | Exact Match: 0.000
Thresh 0.20 | Macro F1: 0.055 | Micro F1: 0.369 | Exact Match: 0.000
Thresh 0.25 | Macro F1: 0.044 | Micro F1: 0.387 | Exact Match: 0.108
Thresh 0.30 | Macro F1: 0.026 | Micro F1: 0.317 | Exact Match: 0.144
Thresh 0.35 | Macro F1: 0.022 | Micro F1: 0.278 | Exact Match: 0.077
Thresh 0.40 | Macro F1: 0.014 | Micro F1: 0.210 | Exact Match: 0.010
Thresh 0.45 | Macro F1: 0.014 | Micro F1: 0.191 | Exact Match: 0.000
Thresh 0.50 | Macro F1: 0.006 | Micro F1: 0.055 | Exact Match: 0.000
Thresh 0.55 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.60 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.65 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.70 | Macro F1: 0.000 | Micro F1: 0.000 | Exact Match: 0.000
Thresh 0.75 | Macro F1: 0.000 | Micro F1: 0.000 | Exac

Epoch 2: 100%|██████████| 122/122 [00:12<00:00,  9.67it/s]



Epoch 2: Loss = 0.1940
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.164 | Micro F1: 0.354 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.125 | Micro F1: 0.444 | Exact Match: 0.005
Thresh 0.20 | Macro F1: 0.113 | Micro F1: 0.498 | Exact Match: 0.046
Thresh 0.25 | Macro F1: 0.102 | Micro F1: 0.507 | Exact Match: 0.057
Thresh 0.30 | Macro F1: 0.085 | Micro F1: 0.504 | Exact Match: 0.165
Thresh 0.35 | Macro F1: 0.065 | Micro F1: 0.463 | Exact Match: 0.216
Thresh 0.40 | Macro F1: 0.050 | Micro F1: 0.423 | Exact Match: 0.201
Thresh 0.45 | Macro F1: 0.043 | Micro F1: 0.394 | Exact Match: 0.160
Thresh 0.50 | Macro F1: 0.038 | Micro F1: 0.368 | Exact Match: 0.113
Thresh 0.55 | Macro F1: 0.038 | Micro F1: 0.362 | Exact Match: 0.108
Thresh 0.60 | Macro F1: 0.034 | Micro F1: 0.323 | Exact Match: 0.082
Thresh 0.65 | Macro F1: 0.026 | Micro F1: 0.261 | Exact Match: 0.052
Thresh 0.70 | Macro F1: 0.019 | Micro F1: 0.200 | Exact Match: 0.021
Thresh 0.75 | Macro F1: 0.011 | Micro F1: 0.120 | Exac

Epoch 3: 100%|██████████| 122/122 [00:12<00:00,  9.72it/s]



Epoch 3: Loss = 0.1599
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.246 | Micro F1: 0.452 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.234 | Micro F1: 0.542 | Exact Match: 0.067
Thresh 0.20 | Macro F1: 0.179 | Micro F1: 0.568 | Exact Match: 0.057
Thresh 0.25 | Macro F1: 0.166 | Micro F1: 0.583 | Exact Match: 0.196
Thresh 0.30 | Macro F1: 0.131 | Micro F1: 0.564 | Exact Match: 0.242
Thresh 0.35 | Macro F1: 0.108 | Micro F1: 0.544 | Exact Match: 0.304
Thresh 0.40 | Macro F1: 0.097 | Micro F1: 0.516 | Exact Match: 0.289
Thresh 0.45 | Macro F1: 0.080 | Micro F1: 0.479 | Exact Match: 0.253
Thresh 0.50 | Macro F1: 0.062 | Micro F1: 0.432 | Exact Match: 0.201
Thresh 0.55 | Macro F1: 0.053 | Micro F1: 0.411 | Exact Match: 0.175
Thresh 0.60 | Macro F1: 0.043 | Micro F1: 0.390 | Exact Match: 0.155
Thresh 0.65 | Macro F1: 0.039 | Micro F1: 0.372 | Exact Match: 0.119
Thresh 0.70 | Macro F1: 0.034 | Micro F1: 0.330 | Exact Match: 0.088
Thresh 0.75 | Macro F1: 0.028 | Micro F1: 0.273 | Exac

Epoch 4: 100%|██████████| 122/122 [00:12<00:00,  9.68it/s]



Epoch 4: Loss = 0.1363
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.341 | Micro F1: 0.500 | Exact Match: 0.000
Thresh 0.15 | Macro F1: 0.343 | Micro F1: 0.603 | Exact Match: 0.082
Thresh 0.20 | Macro F1: 0.291 | Micro F1: 0.641 | Exact Match: 0.180
Thresh 0.25 | Macro F1: 0.273 | Micro F1: 0.657 | Exact Match: 0.273
Thresh 0.30 | Macro F1: 0.244 | Micro F1: 0.657 | Exact Match: 0.320
Thresh 0.35 | Macro F1: 0.229 | Micro F1: 0.663 | Exact Match: 0.366
Thresh 0.40 | Macro F1: 0.189 | Micro F1: 0.637 | Exact Match: 0.371
Thresh 0.45 | Macro F1: 0.166 | Micro F1: 0.618 | Exact Match: 0.371
Thresh 0.50 | Macro F1: 0.139 | Micro F1: 0.591 | Exact Match: 0.361
Thresh 0.55 | Macro F1: 0.123 | Micro F1: 0.560 | Exact Match: 0.351
Thresh 0.60 | Macro F1: 0.093 | Micro F1: 0.509 | Exact Match: 0.330
Thresh 0.65 | Macro F1: 0.078 | Micro F1: 0.481 | Exact Match: 0.309
Thresh 0.70 | Macro F1: 0.059 | Micro F1: 0.424 | Exact Match: 0.263
Thresh 0.75 | Macro F1: 0.043 | Micro F1: 0.348 | Exac

Evaluating TEST: 100%|██████████| 23/23 [00:01<00:00, 22.47it/s]


 TEST (Fixed Threshold=0.35):
Macro F1: 0.106
Micro F1: 0.328
Exact Match: 0.096

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.129
Micro F1: 0.532
Exact Match: 0.229

 Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.009
ablation_results_stl_augmented_new/stl_all_results_augmented_more_seeds.csv





## Eval only loop

In [None]:
# Evaluation-only pass over previously saved STL checkpoints.
#
# For every (model, task, taxonomy, train-domain, seed) combination this cell
# loads the matching checkpoint from stl_results/, scores it on the test split
# with evaluate_flat, and records one row per evaluation domain. The collected
# rows are written to stl_results/stl_all_results_loaded.csv.
#
# Relies on names defined in earlier cells: MODELS, TASKS, TAXONOMY_DEPTHS,
# TRAIN_DOMAINS, SEEDS, TEST_DOMAIN, TRAIN_LANGUAGES, MAX_LEN, BATCH_SIZE,
# THRESHOLD, device and evaluate_flat.

# Start from an empty list so the cell is idempotent: it no longer fails on a
# fresh kernel, and re-running it cannot mix rows left in `results` by an
# earlier cell (or an earlier run of this cell) into the CSV written below.
results = []

for model_name, task, taxonomy, train_domain, seed in itertools.product(MODELS, TASKS, TAXONOMY_DEPTHS, TRAIN_DOMAINS, SEEDS):
    domain_str = "-".join(train_domain)
    print(f"\n--- Evaluating STL: {task} | {taxonomy} | {model_name} | Seed={seed} | Trained on {domain_str} ---")
    torch.manual_seed(seed)

    # Resolve the checkpoint path first: if the file is missing there is no
    # point in loading the tokenizer, preparing data or building the model.
    model_path = (
        f"stl_results/model_{task}_{taxonomy}_trained_on_{domain_str}"
        f"_{model_name.replace('/', '-')}_seed{seed}.pt"
    )

    if not os.path.exists(model_path):
        print(f"Model file not found: {model_path}")
        continue

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # === Data Prep ===
    # Both loaders are called with the same keyword arguments and unpacked
    # into the same 9-tuple, so pick the function once instead of duplicating
    # the call. Any taxonomy other than "fine" uses the coarse loader.
    prepare_data = prepare_data_STL_fine if taxonomy == "fine" else prepare_data_STL_coarse
    df_train, df_val, df_test, y_train, y_val, y_test, mlb, TEXT_COL, LABEL_COL = prepare_data(
        TASK=task,
        train_domains=train_domain,
        test_domains=TEST_DOMAIN,
        train_languages=TRAIN_LANGUAGES
    )

    # === Dataset Setup ===
    test_dataset = MultiLabelDataset(df_test[TEXT_COL].tolist(), y_test, tokenizer, MAX_LEN)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # === Model Init ===
    # The classifier head size is taken from the training label matrix so it
    # matches the shape the checkpoint was saved with.
    num_classes = y_train.shape[1]
    model = TransformerClassifier(model_name, num_classes).to(device)

    # NOTE(review): the checkpoint is assumed to be a plain state_dict written
    # by the training loop — confirm before loading files from other sources.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # === Evaluate ===
    eval_result = evaluate_flat(
        model=model,
        loader=test_loader,
        df_source=df_test,
        mlb=mlb,
        device=device,
        label="TEST",
        threshold=THRESHOLD
    )

    # === Extract Per-Domain Metrics ===
    # One output row per evaluation domain. The domain codes are hard-coded
    # here, so eval_result["per_domain"] must contain both "UA" and "CC".
    for domain in ["UA", "CC"]:
        domain_metrics = eval_result["per_domain"][domain]
        results.append({
            "model": model_name,
            "task": task,
            "taxonomy": taxonomy,
            "train_domain": domain_str,
            "seed": seed,
            "eval_domain": domain,
            "micro": domain_metrics["micro"],
            "macro": domain_metrics["macro"],
            "exact": domain_metrics["exact"]
        })

# === Save to CSV ===
# Make sure the target directory exists so the write cannot fail on a missing
# folder (e.g. when no checkpoint was found at all).
os.makedirs("stl_results", exist_ok=True)
df_out = pd.DataFrame(results)
df_out.to_csv("stl_results/stl_all_results_loaded.csv", index=False)
print(" STL evaluation results saved to: stl_results/stl_all_results_loaded.csv")


# MTL/MTL-PAL

In [9]:
import itertools
import os
import random

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer

# Multi-task (MTL) baseline sweep: for every training-domain configuration and
# every seed, train a model on both tasks, evaluate it on the test sets and
# write one CSV of per-domain metrics per training-domain configuration.

# === Config ===
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 3e-5
TRAIN_LANGUAGES = ["ALL"]
TEST_DOMAINS = ["UA", "CC"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
os.makedirs("new_augmented_seeds", exist_ok=True)

# === Domain configs (train on UA, CC, UA+CC)
train_domain_configs = [["UA"], ["UA", "CC"], ["CC"]]
seeds = [71, 72, 73, 74, 75]  # other seeds used previously: 31, 32, 42, 43, 44

for train_domains in train_domain_configs:
    domain_str = "-".join(train_domains)
    results = []  # one row per (task_type, seed) for this training-domain setting

    for task_type in ["multi_task"]:
        for seed in seeds:
            # Seed every RNG the run may touch, not only torch, so that a
            # "seed" really identifies a run (data shuffling / splitting code
            # may draw from Python's or NumPy's global generators).
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

            print(f"\n--- {task_type.upper()} | Train on: {domain_str} | Seed={seed} ---")

            # NOTE(review): this path is computed but not used anywhere in this cell.
            model_path = f"{task_type}_distilbert_trained_on_{domain_str}_seed{seed}.pt"

            # === Prepare Data ===
            # s1 / s2 are the two tasks; below s1 is wired to "entity_framing"
            # and s2 to "narrative_classification".
            (
                df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                train_loader_s1, val_loader_s1, test_loader_s1,
                train_loader_s2, val_loader_s2, test_loader_s2,
                num_classes_dict
            ) = prepare_data_MTL_mixed(
                task=task_type,
                train_domains=train_domains,
                test_domains=TEST_DOMAINS,
                train_languages=TRAIN_LANGUAGES,
                model_name=MODEL_NAME,
                max_len=MAX_LEN,
                batch_size=BATCH_SIZE,
                granularity_s1="fine",
                granularity_s2="fine"
            )

            # Number of output labels per task, taken from the binarised targets.
            task_classes = {
                "entity_framing": y_train_s1.shape[1],
                "narrative_classification": y_train_s2.shape[1]
            }

            # === Initialise Model ===
            if task_type == "multi_task":
                model = MultiTaskTransformer(MODEL_NAME, task_classes).to(device)
            else:
                model = AdapterMultiTaskTransformer(
                    model_name=MODEL_NAME,
                    num_classes_dict=task_classes,
                    adapter_dim=128
                ).to(device)

            optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
            criterion = torch.nn.BCEWithLogitsLoss()

            # === Train ===
            # NOTE(review): the training log shows "best model" checkpoints being
            # written under names without the seed, so successive seeds appear to
            # overwrite each other; the evaluation below uses the in-memory model,
            # not a reloaded checkpoint — confirm this is intended.
            train_mtl_flat(
                model=model,
                loaders={
                    "narrative_classification": train_loader_s2,
                    "entity_framing": train_loader_s1
                },
                val_data={
                    "narrative_classification": (val_loader_s2, df_val_s2, y_val_s2, mlb_s2),
                    "entity_framing": (val_loader_s1, df_val_s1, y_val_s1, mlb_s1)
                },
                mlbs={
                    "narrative_classification": mlb_s2,
                    "entity_framing": mlb_s1
                },
                optimizer=optimizer,
                criterion=criterion,
                device=device,
                epochs=EPOCHS,
                train_domain=train_domains,
                test_domain=TEST_DOMAINS
            )

            # === Evaluate ===
            # NOTE(review): domain_list receives the *training* domains although
            # metrics for every test domain are read below — verify against
            # evaluate_mtl_all_tasks what this argument controls.
            eval_results = evaluate_mtl_all_tasks(
                model=model,
                task_loaders={
                    "narrative_classification": test_loader_s2,
                    "entity_framing": test_loader_s1
                },
                task_dfs={
                    "narrative_classification": df_test_s2,
                    "entity_framing": df_test_s1
                },
                task_targets={
                    "narrative_classification": y_test_s2,
                    "entity_framing": y_test_s1
                },
                task_mlbs={
                    "narrative_classification": mlb_s2,
                    "entity_framing": mlb_s1
                },
                domain_list=train_domains,
                device=device
            )

            ef = eval_results["entity_framing"]
            nc = eval_results["narrative_classification"]

            # Flatten the per-task / per-domain metrics into one CSV row.
            # Column names follow "<task prefix>_<metric>_<domain lower-case>",
            # e.g. "ef_micro_ua", in the same order as before.
            row = {
                "task_type": task_type,
                "model": MODEL_NAME,
                "seed": seed,
                "train_domain": domain_str,
            }
            for prefix, task_metrics in (("ef", ef), ("nc", nc)):
                for domain in TEST_DOMAINS:
                    for metric in ("micro", "macro", "exact"):
                        row[f"{prefix}_{metric}_{domain.lower()}"] = task_metrics[domain][metric]
            results.append(row)

            # Optional SHAP analysis (disabled). The helpers used here are
            # defined in a later cell and must be run before enabling this.
            # texts_ef = df_test_s1["Translated_Text"].tolist()  # or the appropriate input column
            # texts_nc = df_test_s2["Translated_Text"].tolist()
            # shap_values_ef = explain_shap(model, tokenizer, texts_ef, "entity_framing", max_explain=5)
            # shap_values_nc = explain_shap(model, tokenizer, texts_nc, "narrative_classification", max_explain=5)
            # save_shap_waterfall_plots(shap_values_nc, "shap_plots", "narrative_classification", seed, task_type, mlb_s2)
            # save_shap_waterfall_plots(shap_values_ef, "shap_plots", "entity_framing", seed, task_type, mlb_s1)

    # === Save CSV for each domain setting ===
    df_out = pd.DataFrame(results)
    out_path = f"new_augmented_seeds/baseline_mtl_more_seeds_augmented_70{domain_str}.csv"
    df_out.to_csv(out_path, index=False)
    print(f" Saved: {out_path}")



--- MULTI_TASK | Train on: UA | Seed=71 ---

Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.3905

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1279
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2315
Best model for task 'entity_framing' saved to entity_framing_MTL_UA_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.2501

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.3242
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.4486
Best model for task 'entity_framing' saved to entity_framing_MTL_UA_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Average Loss: 0.1711

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.5035
Best model for task 'narrative_clas

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 20.92it/s]



Domain: CC
Macro F1: 0.027
Micro F1: 0.380
Exact Match: 0.114

Domain: UA
Macro F1: 0.152
Micro F1: 0.416
Exact Match: 0.056

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.43it/s]



Domain: CC
Macro F1: 0.130
Micro F1: 0.708
Exact Match: 0.549

Domain: UA
Macro F1: 0.211
Micro F1: 0.438
Exact Match: 0.204

--- MULTI_TASK | Train on: UA | Seed=72 ---

Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.3982

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1180
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2012
Best model for task 'entity_framing' saved to entity_framing_MTL_UA_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.2669

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.3854
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.4588
Best model for task 'entity_framing' saved to entity_framing_MTL_UA_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Average Loss: 0.1

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 20.82it/s]



Domain: CC
Macro F1: 0.026
Micro F1: 0.344
Exact Match: 0.129

Domain: UA
Macro F1: 0.140
Micro F1: 0.393
Exact Match: 0.046

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.10it/s]



Domain: CC
Macro F1: 0.129
Micro F1: 0.661
Exact Match: 0.538

Domain: UA
Macro F1: 0.224
Micro F1: 0.429
Exact Match: 0.230

--- MULTI_TASK | Train on: UA | Seed=73 ---

Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.3936

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1377
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2525
Best model for task 'entity_framing' saved to entity_framing_MTL_UA_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.2502

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.3445
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.4650
Best model for task 'entity_framing' saved to entity_framing_MTL_UA_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Average Loss: 0.1

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 20.87it/s]



Domain: CC
Macro F1: 0.029
Micro F1: 0.371
Exact Match: 0.086

Domain: UA
Macro F1: 0.145
Micro F1: 0.395
Exact Match: 0.056

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.15it/s]



Domain: CC
Macro F1: 0.143
Micro F1: 0.735
Exact Match: 0.571

Domain: UA
Macro F1: 0.226
Micro F1: 0.450
Exact Match: 0.218

--- MULTI_TASK | Train on: UA | Seed=74 ---

Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.3957

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1184
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2160
Best model for task 'entity_framing' saved to entity_framing_MTL_UA_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.2593

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.3387
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.4535
Best model for task 'entity_framing' saved to entity_framing_MTL_UA_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Average Loss: 0.1

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 20.95it/s]



Domain: CC
Macro F1: 0.029
Micro F1: 0.381
Exact Match: 0.100

Domain: UA
Macro F1: 0.137
Micro F1: 0.386
Exact Match: 0.046

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.28it/s]



Domain: CC
Macro F1: 0.192
Micro F1: 0.766
Exact Match: 0.571

Domain: UA
Macro F1: 0.235
Micro F1: 0.485
Exact Match: 0.255

--- MULTI_TASK | Train on: UA | Seed=75 ---

Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.3900

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1410
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2640
Best model for task 'entity_framing' saved to entity_framing_MTL_UA_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.2518

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.3496
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.4565
Best model for task 'entity_framing' saved to entity_framing_MTL_UA_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Average Loss: 0.1

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 20.84it/s]



Domain: CC
Macro F1: 0.036
Micro F1: 0.458
Exact Match: 0.100

Domain: UA
Macro F1: 0.160
Micro F1: 0.417
Exact Match: 0.037

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.20it/s]



Domain: CC
Macro F1: 0.163
Micro F1: 0.743
Exact Match: 0.582

Domain: UA
Macro F1: 0.220
Micro F1: 0.440
Exact Match: 0.216
 Saved: new_augmented_seeds/baseline_mtl_more_seeds_augmented_70UA.csv

--- MULTI_TASK | Train on: UA-CC | Seed=71 ---

Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.3635

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1348
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA-CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2310
Best model for task 'entity_framing' saved to entity_framing_MTL_UA-CC_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.2355

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.2597
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA-CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.4625
Best model for task 'entity_framing' saved to en

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 21.09it/s]



Domain: CC
Macro F1: 0.078
Micro F1: 0.524
Exact Match: 0.200

Domain: UA
Macro F1: 0.121
Micro F1: 0.426
Exact Match: 0.019

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.26it/s]



Domain: CC
Macro F1: 0.144
Micro F1: 0.769
Exact Match: 0.659

Domain: UA
Macro F1: 0.201
Micro F1: 0.452
Exact Match: 0.218

--- MULTI_TASK | Train on: UA-CC | Seed=72 ---

Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.3630

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1158
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA-CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2341
Best model for task 'entity_framing' saved to entity_framing_MTL_UA-CC_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.2330

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.2758
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA-CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.4315
Best model for task 'entity_framing' saved to entity_framing_MTL_UA-CC_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Av

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 21.02it/s]



Domain: CC
Macro F1: 0.083
Micro F1: 0.505
Exact Match: 0.229

Domain: UA
Macro F1: 0.094
Micro F1: 0.381
Exact Match: 0.056

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.04it/s]



Domain: CC
Macro F1: 0.159
Micro F1: 0.724
Exact Match: 0.604

Domain: UA
Macro F1: 0.234
Micro F1: 0.461
Exact Match: 0.235

--- MULTI_TASK | Train on: UA-CC | Seed=73 ---

Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.3624

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1276
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA-CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2698
Best model for task 'entity_framing' saved to entity_framing_MTL_UA-CC_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.2322

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.2819
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA-CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.4336
Best model for task 'entity_framing' saved to entity_framing_MTL_UA-CC_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Av

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 21.16it/s]



Domain: CC
Macro F1: 0.117
Micro F1: 0.561
Exact Match: 0.214

Domain: UA
Macro F1: 0.116
Micro F1: 0.410
Exact Match: 0.028

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.47it/s]



Domain: CC
Macro F1: 0.134
Micro F1: 0.659
Exact Match: 0.505

Domain: UA
Macro F1: 0.218
Micro F1: 0.439
Exact Match: 0.213

--- MULTI_TASK | Train on: UA-CC | Seed=74 ---

Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.3643

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.0984
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA-CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2315
Best model for task 'entity_framing' saved to entity_framing_MTL_UA-CC_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.2360

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.2668
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA-CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.4300
Best model for task 'entity_framing' saved to entity_framing_MTL_UA-CC_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Av

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 21.01it/s]



Domain: CC
Macro F1: 0.087
Micro F1: 0.538
Exact Match: 0.157

Domain: UA
Macro F1: 0.115
Micro F1: 0.406
Exact Match: 0.019

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.30it/s]



Domain: CC
Macro F1: 0.167
Micro F1: 0.780
Exact Match: 0.681

Domain: UA
Macro F1: 0.221
Micro F1: 0.452
Exact Match: 0.224

--- MULTI_TASK | Train on: UA-CC | Seed=75 ---

Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.3592

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1264
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA-CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2553
Best model for task 'entity_framing' saved to entity_framing_MTL_UA-CC_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.2299

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.2552
Best model for task 'narrative_classification' saved to narrative_classification_MTL_UA-CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.4563
Best model for task 'entity_framing' saved to entity_framing_MTL_UA-CC_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Av

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 21.23it/s]



Domain: CC
Macro F1: 0.097
Micro F1: 0.543
Exact Match: 0.271

Domain: UA
Macro F1: 0.100
Micro F1: 0.429
Exact Match: 0.037

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.42it/s]



Domain: CC
Macro F1: 0.189
Micro F1: 0.797
Exact Match: 0.670

Domain: UA
Macro F1: 0.196
Micro F1: 0.449
Exact Match: 0.213
 Saved: new_augmented_seeds/baseline_mtl_more_seeds_augmented_70UA-CC.csv

--- MULTI_TASK | Train on: CC | Seed=71 ---





Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.6310

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.0596
Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.1184
Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.3795

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1143
Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2280
Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Average Loss: 0.2996

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1442
Best model for task 'narrative_classification' saved to narrative_classification

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 21.31it/s]



Domain: CC
Macro F1: 0.110
Micro F1: 0.491
Exact Match: 0.171

Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.037

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.33it/s]



Domain: CC
Macro F1: 0.156
Micro F1: 0.724
Exact Match: 0.637

Domain: UA
Macro F1: 0.089
Micro F1: 0.314
Exact Match: 0.084

--- MULTI_TASK | Train on: CC | Seed=72 ---





Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.6209

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.0577
Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.1252
Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.3820

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.0926
Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2226
Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Average Loss: 0.3025

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1703
Best model for task 'narrative_classification' saved to narrative_classification

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 21.30it/s]



Domain: CC
Macro F1: 0.101
Micro F1: 0.474
Exact Match: 0.214

Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.037

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.94it/s]



Domain: CC
Macro F1: 0.131
Micro F1: 0.642
Exact Match: 0.527

Domain: UA
Macro F1: 0.087
Micro F1: 0.314
Exact Match: 0.081

--- MULTI_TASK | Train on: CC | Seed=73 ---





Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.6358

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.0674
Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.1165
Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.3841

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1128
Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2304
Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Average Loss: 0.2947

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1798
Best model for task 'narrative_classification' saved to narrative_classification

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 21.90it/s]



Domain: CC
Macro F1: 0.087
Micro F1: 0.470
Exact Match: 0.214

Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.074

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 21.09it/s]



Domain: CC
Macro F1: 0.154
Micro F1: 0.718
Exact Match: 0.604

Domain: UA
Macro F1: 0.094
Micro F1: 0.325
Exact Match: 0.087

--- MULTI_TASK | Train on: CC | Seed=74 ---





Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.6250

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.0442
Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.1177
Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.3868

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.0919
Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2001
Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Average Loss: 0.3052

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1131
Best model for task 'narrative_classification' saved to narrative_classification

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 21.70it/s]



Domain: CC
Macro F1: 0.109
Micro F1: 0.510
Exact Match: 0.200

Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.009

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 20.89it/s]



Domain: CC
Macro F1: 0.149
Micro F1: 0.693
Exact Match: 0.527

Domain: UA
Macro F1: 0.073
Micro F1: 0.288
Exact Match: 0.087

--- MULTI_TASK | Train on: CC | Seed=75 ---





Starting Epoch 1/4...

Epoch 1 - Average Loss: 0.6217

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.0649
Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.1340
Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

Starting Epoch 2/4...

Epoch 2 - Average Loss: 0.3682

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.0930
Best model for task 'narrative_classification' saved to narrative_classification_MTL_CC_to_UA-CC.pt

Validating task: entity_framing
[entity_framing] Macro F1: 0.2083
Best model for task 'entity_framing' saved to entity_framing_MTL_CC_to_UA-CC.pt

Starting Epoch 3/4...

Epoch 3 - Average Loss: 0.2905

Validating task: narrative_classification
[narrative_classification] Macro F1: 0.1314
Best model for task 'narrative_classification' saved to narrative_classification

Evaluating TEST [narrative_classification]: 100%|██████████| 23/23 [00:01<00:00, 22.04it/s]



Domain: CC
Macro F1: 0.085
Micro F1: 0.471
Exact Match: 0.229

Domain: UA
Macro F1: 0.000
Micro F1: 0.000
Exact Match: 0.065

--- Task: ENTITY_FRAMING ---


Evaluating TEST [entity_framing]: 100%|██████████| 56/56 [00:02<00:00, 21.22it/s]



Domain: CC
Macro F1: 0.159
Micro F1: 0.735
Exact Match: 0.637

Domain: UA
Macro F1: 0.089
Micro F1: 0.341
Exact Match: 0.092
 Saved: new_augmented_seeds/baseline_mtl_more_seeds_augmented_70CC.csv


In [None]:
import shap
import numpy as np
import torch
import os
import matplotlib.pyplot as plt

# === Utility to Truncate Long Inputs ===
def truncate_texts(texts, tokenizer, max_len):
    """Shorten every text by encoding it with truncation and decoding it again.

    Args:
        texts: iterable of input strings.
        tokenizer: object exposing ``encode(text, truncation=..., max_length=...)``
            and ``decode(tokens, skip_special_tokens=...)``.
        max_len: value forwarded to the tokenizer as ``max_length``.

    Returns:
        A list with one decoded (possibly shortened) string per input text,
        in the original order.
    """
    def _round_trip(raw_text):
        # Encode with truncation, then decode without special tokens.
        token_ids = tokenizer.encode(raw_text, truncation=True, max_length=max_len)
        return tokenizer.decode(token_ids, skip_special_tokens=True)

    return [_round_trip(raw_text) for raw_text in texts]

# === Build SHAP Explainer for MTL ===
def get_shap_explainer(model, tokenizer, task_name, model_type="multi_task"):
    """Build a permutation-based SHAP explainer for one task head of ``model``.

    Args:
        model: callable invoked as ``model(**encoded, task=task_name)``.
        tokenizer: tokenizer used both to encode text for the model and as the
            masker handed to ``shap.Explainer``.
        task_name: task identifier forwarded to the model on every call.
        model_type: accepted for API compatibility; not used in this function.

    Returns:
        The object returned by ``shap.Explainer(..., algorithm="permutation")``.

    Relies on the notebook-level globals ``MAX_LEN`` and ``device``.
    """

    def _sigmoid_scores(encoded_batch):
        # Inference only: eval mode and no gradient tracking.
        model.eval()
        with torch.no_grad():
            raw_output = model(**encoded_batch, task=task_name)
            # Tuple/list outputs carry the logits first; otherwise the output
            # itself is treated as the logits (adjust if using dict-style output).
            logits = raw_output[0] if isinstance(raw_output, (tuple, list)) else raw_output
            return torch.sigmoid(logits).cpu().numpy()

    def _normalise_batch(text):
        # Bring the accepted input forms into something the tokenizer takes.
        if isinstance(text, str):
            return [text]
        if isinstance(text, np.ndarray):
            return text.tolist()
        if isinstance(text, list) and isinstance(text[0], np.ndarray):
            return [str(item) for item in text]
        return text

    class _TextScorer:
        """Callable mapping raw text input to per-label sigmoid scores."""

        def __call__(self, text):
            batch = _normalise_batch(text)
            encoded = tokenizer(
                batch,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=MAX_LEN
            ).to(device)
            return _sigmoid_scores(encoded)

    return shap.Explainer(_TextScorer(), tokenizer, algorithm="permutation")

# === Run SHAP Analysis ===
def explain_shap(model, tokenizer, texts, task_name, model_type="multi_task",
                 max_explain=5, visualize=False, max_evals=1500):
    """Compute SHAP values for the first ``max_explain`` texts of one task.

    Args:
        model: model passed on to ``get_shap_explainer``.
        tokenizer: tokenizer used for truncation and by the explainer.
        texts: sequence of input strings; only ``texts[:max_explain]`` is explained.
        task_name: task identifier passed on to ``get_shap_explainer``.
        model_type: passed on to ``get_shap_explainer`` unchanged.
        max_explain: number of leading texts to explain.
        visualize: when true, render ``shap.plots.text`` for the result.
        max_evals: evaluation budget forwarded to the explainer call
            (previously fixed at 1500, which remains the default).
            NOTE(review): SHAP's permutation explainer is understood to reject
            budgets that are too small for the number of tokens — raise this
            value if it reports that ``max_evals`` is too low.

    Returns:
        The SHAP values object returned by the explainer.

    Relies on the notebook-level global ``MAX_LEN`` for truncation.
    """
    # Truncate up front so the explained text matches what the model can see.
    selected_texts = truncate_texts(texts[:max_explain], tokenizer, MAX_LEN)
    explainer = get_shap_explainer(model, tokenizer, task_name, model_type)
    shap_values = explainer(selected_texts, max_evals=max_evals, silent=True)
    if visualize:
        shap.plots.text(shap_values)
    return shap_values

# === Save SHAP Visualizations to PNG ===
def save_shap_waterfall_plots(shap_values, output_dir, task_name, seed, model_type, mlb, top_k=3):
    """Save SHAP waterfall plots (PNG) for the strongest outputs of each sample.

    For every sample in ``shap_values`` the outputs are ranked by the mean
    absolute SHAP value over axis 0 of ``sv.values``, and one waterfall plot is
    written for each of the ``top_k`` highest-ranked outputs.

    Args:
        shap_values: iterable of per-sample SHAP explanations exposing
            ``values``, ``base_values``, ``data`` and ``feature_names``.
            Assumes ``values`` is 2-D with outputs on axis 1 (the code indexes
            ``values[:, j]``) — TODO confirm against the explainer output.
        output_dir: directory for the PNG files; created if missing.
        task_name, seed, model_type: used only to build the file names.
        mlb: object whose ``classes_[j]`` gives the label name of output ``j``.
        top_k: number of outputs to plot per sample; values <= 0 plot nothing.

    Failures are reported with ``print`` and the affected sample/label is
    skipped, so one bad plot does not abort the whole export.
    """
    import matplotlib.pyplot as plt
    import os
    from shap import Explanation

    os.makedirs(output_dir, exist_ok=True)

    for i, sv in enumerate(shap_values):
        try:
            mean_abs = np.abs(sv.values).mean(axis=0)
            # Guard top_k <= 0 explicitly: a "[-0:]" slice would select everything.
            top_outputs = np.argsort(mean_abs)[-top_k:] if top_k > 0 else []
        except Exception as e:
            print(f"[!] Error processing sample {i} → {e}")
            continue

        for j in top_outputs:
            # Remember which figures exist so that everything created for this
            # plot can be closed afterwards, even when plotting or saving fails.
            figs_before = set(plt.get_fignums())
            try:
                # Create SHAP Explanation for one label. np.ndim distinguishes
                # per-output base values from a scalar (including 0-d arrays,
                # which have __len__ but cannot be indexed).
                base_values = sv.base_values
                single_sv = Explanation(
                    values=sv.values[:, j],
                    base_values=base_values[j] if np.ndim(base_values) > 0 else base_values,
                    data=sv.data,
                    feature_names=sv.feature_names
                )

                # Get label name from mlb and make it safe for use in a file
                # name: spaces become "_", path separators become "-" so a
                # label can never point into a (non-existent) sub-directory.
                label_name = str(mlb.classes_[j]).replace(" ", "_")
                for separator in ("/", "\\"):
                    label_name = label_name.replace(separator, "-")

                # Generate plot and save. Depending on the SHAP version the
                # call may hand back an Axes or a Figure; fall back to the
                # current figure if no ".figure" is available.
                drawn = shap.plots.waterfall(single_sv, show=False)
                fig = getattr(drawn, "figure", None)
                if fig is None:
                    fig = plt.gcf()
                fname = os.path.join(
                    output_dir, f"{task_name}_{model_type}_seed{seed}_sample{i}_label_{label_name}.png"
                )
                fig.savefig(fname, bbox_inches="tight")

            except Exception as e:
                print(f"  [!] Skipped sample {i}, label {j} → {e}")

            finally:
                # Close only the figures opened during this iteration.
                for fig_num in set(plt.get_fignums()) - figs_before:
                    plt.close(fig_num)


# Master Notebook

Through this interface the user can experiment with all the models and experimental conditions used in the thesis.

## Hyperparameters

In [None]:
import sys
sys.path.append(".")  # Ensure current directory is in path

from merged_optuna_script import objective_stl, objective_mtl, objective_mtl_adapter
import optuna
import pandas as pd

# === Fast Experiment Sweep ===
# Runs a short Optuna study for every configuration in EXPERIMENTS and writes the
# best hyperparameters of each study to one CSV file.

# Trials per study; deliberately small because this is a quick sweep.
N_TRIALS = 3
# Single source of truth for the output file, so the file written and the final
# log message cannot drift apart.
RESULTS_CSV = "optuna_quick_sweep_results_adapter.csv"

EXPERIMENTS = [
    {"setup": "stl", "task": "entity_framing", "encoder": "roberta-base"},
    {"setup": "stl", "task": "narrative_classification", "encoder": "roberta-base"},
    {"setup": "mtl", "task": None, "encoder": "roberta-base"},
    {"setup": "stl", "task": "entity_framing", "encoder": "distilbert-base-uncased"},
    {"setup": "stl", "task": "narrative_classification", "encoder": "distilbert-base-uncased"},
    {"setup": "mtl", "task": None, "encoder": "distilbert-base-uncased"},
    {"setup": "mtl_adapter", "task": None, "encoder": "roberta-base"},
    {"setup": "mtl_adapter", "task": None, "encoder": "distilbert-base-uncased"},
]

all_results = []

for config in EXPERIMENTS:
    setup = config["setup"]
    task = config["task"]
    encoder = config["encoder"]

    print(f"\n Starting Optuna Study → Setup: {setup.upper()} | Task: {task or 'MTL'} | Encoder: {encoder}")

    # The objectives return a score to maximise.
    study = optuna.create_study(direction="maximize")

    # The lambdas read `task` / `encoder` from the loop; that is safe here because
    # optimize() runs to completion before the next iteration rebinds them.
    if setup == "stl":
        study.optimize(lambda trial: objective_stl(trial, task_type=task, model_name=encoder), n_trials=N_TRIALS)
    elif setup == "mtl":
        study.optimize(lambda trial: objective_mtl(trial, model_name=encoder), n_trials=N_TRIALS)
    elif setup == "mtl_adapter":
        study.optimize(lambda trial: objective_mtl_adapter(trial, model_name=encoder), n_trials=N_TRIALS)
    else:
        raise ValueError(f"Unknown setup: {setup}")

    # best_trial raises ValueError when no trial completed (e.g. all were pruned);
    # skip that configuration instead of losing the results of the whole sweep.
    try:
        best_trial = study.best_trial
    except ValueError as err:
        print(f"\n [!] No completed trial for {setup.upper()} | {task or 'MTL'} | {encoder} → {err}")
        continue

    best_params = best_trial.params
    best_score = best_trial.value

    print(f"\n Best hyperparameters for {setup.upper()} | {task or 'MTL'} | {encoder}:")
    for k, v in best_params.items():
        print(f"  {k}: {v}")
    print(f"  score: {best_score:.4f}")

    all_results.append({
        "setup": setup,
        "task": task or "mtl",
        "encoder": encoder,
        "score": best_score,
        **best_params
    })

# === Save results ===
df = pd.DataFrame(all_results)
df.to_csv(RESULTS_CSV, index=False)
print(f"\n Saved results to {RESULTS_CSV}")


[I 2025-05-10 10:29:04,051] A new study created in memory with name: no-name-efb61a5a-df9a-4bc7-a131-8da529d3017d

 Starting Optuna Study → Setup: MTL_ADAPTER | Task: MTL | Encoder: roberta-base
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fca33dea170>>
Traceback (most recent call last):
  File "/toolkit-cache/0.2.16/python3.10/kernel-libs/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Starting Epoch 1/2...
[W 2025-05-10 10:29:12,061] Trial 0 failed with parameters: {'learning_rate': 2.5620923423875518e-05, 'batch_s

KeyboardInterrupt: 

In [None]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

from single_task import TransformerClassifier, train_single_task_model, MultiLabelDataset
from multi_task import MultiTaskTransformer, AdapterMultiTaskTransformer, train_mtl_flat
from data_loader_STL import prepare_data_STL_fine
from data_loader_MTL import prepare_data_MTL_fine_flat
from evaluation_utils import evaluate_flat_custom, compute_fine_vs_coarse_metrics, get_coarse_label_list

# Train every configuration in SETUPS on each TRAIN_SPLITS entry, evaluate it per
# domain in EVAL_SPLITS, and write one summary CSV per configuration.

# Use the GPU when PyTorch can see one, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters applied unchanged to every configuration in SETUPS.
PARAMS = {
    "learning_rate": 3e-5,
    "batch_size": 8,
    "epochs": 3,
    "threshold": 0.35,
    "max_len": 512
}

# Configurations to run. Commented-out entries are kept as a menu; uncomment to include them.
SETUPS = [
    #{"setup": "stl", "task": "entity_framing", "encoder": "roberta-base"},
    #{"setup": "stl", "task": "narrative_classification", "encoder": "roberta-base"},
    #{"setup": "mtl", "task": None, "encoder": "roberta-base"},
    #{"setup": "mtl_adapter", "task": None, "encoder": "roberta-base"},
    #{"setup": "stl", "task": "entity_framing", "encoder": "distilbert-base-uncased"},
    #{"setup": "stl", "task": "narrative_classification", "encoder": "distilbert-base-uncased"},
    {"setup": "mtl", "task": None, "encoder": "distilbert-base-uncased"},
    #{"setup": "mtl_adapter", "task": None, "encoder": "distilbert-base-uncased"},
]

TRAIN_SPLITS = [["CC"]]      # each entry is the list of domains one model is trained on
EVAL_SPLITS = ["UA", "CC"]   # domains the test set is sliced by; also part of the checkpoint file name

SUMMARY_COLUMNS = [
    "setup", "encoder", "task", "train_domain", "eval_domain",
    "overall_macro", "overall_micro", "overall_exact",
    "macro_fine", "micro_fine", "macro_coarse", "micro_coarse"
]


def _summary_row(setup, encoder, task_name, train_str, domain, eval_result, score_dict):
    """Build one row of the summary CSV.

    `overall_macro` / `overall_micro` are the mean of the fine and coarse score of
    the same kind; `overall_exact` is taken from `eval_result["exact"]`. All metric
    values are rounded to four decimals.
    """
    row = {
        "setup": setup,
        "encoder": encoder,
        "task": task_name,
        "train_domain": train_str,
        "eval_domain": domain,
        "overall_macro": round((score_dict["macro_fine"] + score_dict["macro_coarse"]) / 2, 4),
        "overall_micro": round((score_dict["micro_fine"] + score_dict["micro_coarse"]) / 2, 4),
        "overall_exact": round(eval_result["exact"], 4),
    }
    for metric in ("macro_fine", "micro_fine", "macro_coarse", "micro_coarse"):
        row[metric] = round(score_dict[metric], 4)
    return row


for config in SETUPS:
    setup = config["setup"]
    task = config["task"]
    encoder = config["encoder"]
    tokenizer = AutoTokenizer.from_pretrained(encoder)

    setup_name = f"{setup}_{task or 'mtl'}_{encoder.replace('/', '-')}"
    csv_path = f"results_summary__{setup_name}.csv"
    all_rows = []

    for train_domains in TRAIN_SPLITS:
        train_str = "+".join(train_domains)

        if setup == "stl":
            df_train, df_val, df_test, y_train, y_val, y_test, mlb, text_col, label_col = prepare_data_STL_fine(
                task, train_domains, EVAL_SPLITS
            )
            train_dataset = MultiLabelDataset(df_train[text_col].tolist(), y_train, tokenizer, PARAMS["max_len"])
            val_dataset = MultiLabelDataset(df_val[text_col].tolist(), y_val, tokenizer, PARAMS["max_len"])
            train_loader = DataLoader(train_dataset, batch_size=PARAMS["batch_size"], shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=PARAMS["batch_size"])

            model = TransformerClassifier(encoder, len(mlb.classes_)).to(device)
            model = train_single_task_model(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                y_val=y_val,
                MODEL_PATH="tmp.pt",
                LEARNING_RATE=PARAMS["learning_rate"],
                EPOCHS=PARAMS["epochs"],
                device=device
            )

            # Neither value depends on the evaluation domain, so compute them once.
            coarse_list = get_coarse_label_list(task)
            known_labels = set(mlb.classes_)

            for domain in EVAL_SPLITS:
                df_eval = df_test[df_test["Domain"] == domain].copy()
                # Drop labels the binarizer never saw so that transform() does not hit unknown classes.
                df_eval[label_col] = df_eval[label_col].apply(lambda labels: [l for l in labels if l in known_labels])
                y_eval = mlb.transform(df_eval[label_col])
                test_loader = DataLoader(
                    MultiLabelDataset(df_eval[text_col].tolist(), y_eval, tokenizer, PARAMS["max_len"]),
                    batch_size=PARAMS["batch_size"]
                )
                eval_result = evaluate_flat_custom(model, test_loader, df_eval, mlb, device, threshold=PARAMS["threshold"])
                score_dict = compute_fine_vs_coarse_metrics(eval_result["y_true"], eval_result["y_pred_bin"], list(mlb.classes_), coarse_list)

                all_rows.append(_summary_row(setup, encoder, task, train_str, domain, eval_result, score_dict))

        elif setup in ["mtl", "mtl_adapter"]:
            (
                df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                train_loader_s1, val_loader_s1, test_loader_s1,
                train_loader_s2, val_loader_s2, test_loader_s2,
                num_classes_dict
            ) = prepare_data_MTL_fine_flat(
                TASK="multi_task",
                model_name=encoder,
                max_len=PARAMS["max_len"],
                batch_size=PARAMS["batch_size"],
                train_domains=train_domains,
                test_domains=EVAL_SPLITS,
                train_languages=["ALL"]
            )

            task_classes = {
                "entity_framing": y_train_s1.shape[1],
                "narrative_classification": y_train_s2.shape[1]
            }
            model = MultiTaskTransformer(encoder, task_classes).to(device) if setup == "mtl" else \
                AdapterMultiTaskTransformer(model_name=encoder, num_classes_dict=task_classes).to(device)

            optimizer = torch.optim.AdamW(model.parameters(), lr=PARAMS["learning_rate"])
            criterion = torch.nn.BCEWithLogitsLoss()

            # test_domain is EVAL_SPLITS (not a second hard-coded list) because the
            # checkpoint name looked up below is built from EVAL_SPLITS as well.
            # NOTE(review): presumably train_mtl_flat derives its checkpoint file
            # names from train_domain / test_domain - confirm in multi_task.py.
            train_mtl_flat(
                model=model,
                loaders={"entity_framing": train_loader_s1, "narrative_classification": train_loader_s2},
                val_data={
                    "entity_framing": (val_loader_s1, df_val_s1, y_val_s1, mlb_s1),
                    "narrative_classification": (val_loader_s2, df_val_s2, y_val_s2, mlb_s2)
                },
                mlbs={"entity_framing": mlb_s1, "narrative_classification": mlb_s2},
                optimizer=optimizer,
                criterion=criterion,
                device=device,
                epochs=PARAMS["epochs"],
                train_domain=train_domains,
                test_domain=EVAL_SPLITS
            )

            for domain in EVAL_SPLITS:
                for subtask, df_test, mlb, text_key, label_key in [
                    ("entity_framing", df_test_s1, mlb_s1, "Input_Text", "Label"),
                    ("narrative_classification", df_test_s2, mlb_s2, "Translated_Text", "Label")
                ]:
                    df_eval = df_test[df_test["Domain"] == domain].copy()
                    known_labels = set(mlb.classes_)
                    # Non-list label cells become an empty label set; unknown labels are dropped.
                    df_eval[label_key] = df_eval[label_key].apply(
                        lambda labels: [l for l in labels if l in known_labels] if isinstance(labels, list) else []
                    )
                    y_eval = mlb.transform(df_eval[label_key])

                    test_loader = DataLoader(
                        MultiLabelDataset(df_eval[text_key].tolist(), y_eval, tokenizer, PARAMS["max_len"]),
                        batch_size=PARAMS["batch_size"]
                    )

                    # Each subtask is scored with its own checkpoint, so the weights are
                    # reloaded on every iteration (the other subtask's load overwrote them).
                    model_path = f"{subtask}_MTL_{'-'.join(train_domains)}_to_{'-'.join(EVAL_SPLITS)}.pt"
                    if os.path.exists(model_path):
                        # map_location keeps a GPU-saved checkpoint loadable on a CPU-only runtime.
                        model.load_state_dict(torch.load(model_path, map_location=device))
                        model.to(device)
                        print(f"✅ Loaded model for {subtask}")
                    else:
                        # Evaluation continues with whatever weights the model currently holds.
                        print(f"⚠️ Missing checkpoint: {model_path}")

                    eval_result = evaluate_flat_custom(
                        model=model,
                        loader=test_loader,
                        df_source=df_eval,
                        mlb=mlb,
                        device=device,
                        threshold=PARAMS["threshold"],
                        task=subtask
                    )
                    coarse_list = get_coarse_label_list(subtask)
                    score_dict = compute_fine_vs_coarse_metrics(
                        eval_result["y_true"], eval_result["y_pred_bin"], list(mlb.classes_), coarse_list
                    )

                    all_rows.append(_summary_row(setup, encoder, subtask, train_str, domain, eval_result, score_dict))

        else:
            # Fail loudly instead of silently writing an empty summary file.
            raise ValueError(f"Unknown setup: {setup}")

    pd.DataFrame(all_rows, columns=SUMMARY_COLUMNS).to_csv(csv_path, index=False)
    print(f"✅ Saved: {csv_path}")


In [None]:

# Evaluate the configurations in SETUPS per domain and write one summary CSV each.
# The single-task branch trains a fresh model; the multi-task branch only rebuilds
# the architecture and scores the per-subtask checkpoints already on disk.
# NOTE(review): this cell has no imports of its own and relies on names brought in
# by earlier cells; `MultiTaskDataset` in particular is not imported in the cell
# directly above - confirm it is in scope before running.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PARAMS = {
    "learning_rate": 3e-5,
    "batch_size": 8,
    "epochs": 3,
    "threshold": 0.35,
    "max_len": 512
}

SETUPS = [
    {"setup": "mtl", "task": None, "encoder": "distilbert-base-uncased"},
]

TRAIN_SPLITS = [["CC"]]
EVAL_SPLITS = ["UA", "CC"]

SUMMARY_COLUMNS = [
    "setup", "encoder", "task", "train_domain", "eval_domain",
    "overall_macro", "overall_micro", "overall_exact",
    "macro_fine", "micro_fine", "macro_coarse", "micro_coarse"
]

for config in SETUPS:
    setup = config["setup"]
    task = config["task"]
    encoder = config["encoder"]
    tokenizer = AutoTokenizer.from_pretrained(encoder)

    setup_name = f"{setup}_{task or 'mtl'}_{encoder.replace('/', '-')}"
    csv_path = f"results_summary__{setup_name}.csv"
    all_rows = []

    for train_domains in TRAIN_SPLITS:
        train_str = "+".join(train_domains)

        if setup == "stl":
            df_train, df_val, df_test, y_train, y_val, y_test, mlb, text_col, label_col = prepare_data_STL_fine(
                task, train_domains, EVAL_SPLITS
            )
            train_dataset = MultiLabelDataset(df_train[text_col].tolist(), y_train, tokenizer, PARAMS["max_len"])
            val_dataset = MultiLabelDataset(df_val[text_col].tolist(), y_val, tokenizer, PARAMS["max_len"])
            train_loader = DataLoader(train_dataset, batch_size=PARAMS["batch_size"], shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=PARAMS["batch_size"])

            model = TransformerClassifier(encoder, len(mlb.classes_)).to(device)
            model = train_single_task_model(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                y_val=y_val,
                MODEL_PATH="tmp.pt",
                LEARNING_RATE=PARAMS["learning_rate"],
                EPOCHS=PARAMS["epochs"],
                device=device
            )

            # Independent of the evaluation domain, so computed once.
            known_labels = set(mlb.classes_)
            coarse_list = get_coarse_label_list(task)

            for domain in EVAL_SPLITS:
                df_eval = df_test[df_test["Domain"] == domain].copy()
                # Drop labels the binarizer never saw so that transform() does not hit unknown classes.
                df_eval[label_col] = df_eval[label_col].apply(lambda labels: [l for l in labels if l in known_labels])
                y_eval = mlb.transform(df_eval[label_col])
                test_loader = DataLoader(
                    MultiLabelDataset(df_eval[text_col].tolist(), y_eval, tokenizer, PARAMS["max_len"]),
                    batch_size=PARAMS["batch_size"]
                )
                eval_result = evaluate_flat_custom(model, test_loader, df_eval, mlb, device, threshold=PARAMS["threshold"])
                score_dict = compute_fine_vs_coarse_metrics(eval_result["y_true"], eval_result["y_pred_bin"], list(mlb.classes_), coarse_list)

                all_rows.append({
                    "setup": setup,
                    "encoder": encoder,
                    "task": task,
                    "train_domain": train_str,
                    "eval_domain": domain,
                    "overall_macro": round((score_dict["macro_fine"] + score_dict["macro_coarse"]) / 2, 4),
                    "overall_micro": round((score_dict["micro_fine"] + score_dict["micro_coarse"]) / 2, 4),
                    "overall_exact": round(eval_result["exact"], 4),
                    "macro_fine": round(score_dict["macro_fine"], 4),
                    "micro_fine": round(score_dict["micro_fine"], 4),
                    "macro_coarse": round(score_dict["macro_coarse"], 4),
                    "micro_coarse": round(score_dict["micro_coarse"], 4),
                })

        elif setup in ["mtl", "mtl_adapter"]:
            (
                df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                train_loader_s1, val_loader_s1, test_loader_s1,
                train_loader_s2, val_loader_s2, test_loader_s2,
                num_classes_dict
            ) = prepare_data_MTL_fine_flat(
                TASK="multi_task",
                model_name=encoder,
                max_len=PARAMS["max_len"],
                batch_size=PARAMS["batch_size"],
                train_domains=train_domains,
                test_domains=EVAL_SPLITS,
                train_languages=["ALL"]
            )

            task_classes = {
                "entity_framing": y_train_s1.shape[1],
                "narrative_classification": y_train_s2.shape[1]
            }
            # No training happens in this branch: the model is only a container for
            # the checkpoints loaded below.
            model = MultiTaskTransformer(encoder, task_classes).to(device) if setup == "mtl" else \
                AdapterMultiTaskTransformer(model_name=encoder, num_classes_dict=task_classes).to(device)

            for subtask in ["entity_framing", "narrative_classification"]:
                model_path = f"{subtask}_MTL_{'-'.join(train_domains)}_to_{'-'.join(EVAL_SPLITS)}.pt"
                if os.path.exists(model_path):
                    # map_location keeps a GPU-saved checkpoint loadable on a CPU-only runtime.
                    # strict=False tolerates key mismatches between checkpoint and model.
                    model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
                    print(f"✅ Loaded model for {subtask}")
                else:
                    # Evaluation continues with whatever weights the model currently holds.
                    print(f"⚠️ Missing checkpoint: {model_path}")

                df_test = df_test_s1 if subtask == "entity_framing" else df_test_s2
                mlb = mlb_s1 if subtask == "entity_framing" else mlb_s2
                text_key = "Input_Text" if subtask == "entity_framing" else "Translated_Text"
                label_key = "Label"

                # Independent of the evaluation domain, so computed once per subtask.
                known_labels = set(mlb.classes_)
                coarse_list = get_coarse_label_list(subtask)

                for domain in EVAL_SPLITS:
                    df_eval = df_test[df_test["Domain"] == domain].copy()
                    # Non-list label cells become an empty label set; unknown labels are dropped.
                    df_eval[label_key] = df_eval[label_key].apply(lambda labels: [l for l in labels if l in known_labels] if isinstance(labels, list) else [])
                    y_eval = mlb.transform(df_eval[label_key])

                    test_loader = DataLoader(
                        MultiTaskDataset(df_eval[text_key].tolist(), {subtask: y_eval}, tokenizer, PARAMS["max_len"]),
                        batch_size=PARAMS["batch_size"]
                    )

                    eval_result = evaluate_flat_custom(
                        model=model,
                        loader=test_loader,
                        df_source=df_eval,
                        mlb=mlb,
                        device=device,
                        threshold=PARAMS["threshold"],
                        task=subtask
                    )
                    score_dict = compute_fine_vs_coarse_metrics(eval_result["y_true"], eval_result["y_pred_bin"], list(mlb.classes_), coarse_list)

                    all_rows.append({
                        "setup": setup,
                        "encoder": encoder,
                        "task": subtask,
                        "train_domain": train_str,
                        "eval_domain": domain,
                        "overall_macro": round((score_dict["macro_fine"] + score_dict["macro_coarse"]) / 2, 4),
                        "overall_micro": round((score_dict["micro_fine"] + score_dict["micro_coarse"]) / 2, 4),
                        "overall_exact": round(eval_result["exact"], 4),
                        "macro_fine": round(score_dict["macro_fine"], 4),
                        "micro_fine": round(score_dict["micro_fine"], 4),
                        "macro_coarse": round(score_dict["macro_coarse"], 4),
                        "micro_coarse": round(score_dict["micro_coarse"], 4),
                    })

        else:
            # Fail loudly instead of silently writing an empty summary file.
            raise ValueError(f"Unknown setup: {setup}")

    pd.DataFrame(all_rows, columns=SUMMARY_COLUMNS).to_csv(csv_path, index=False)
    print(f"✅ Saved: {csv_path}")


## Control Panel

In [None]:
# Control panel: notebook-wide settings read by the dataset-assembly and training cells below.

# Choose a task for the pipeline below: "narrative_classification", "entity_framing", "multi_task" or "multi_task_adapter"
TASK = "entity_framing" # alternatives: "narrative_classification", "multi_task", "multi_task_adapter"

# Select domains for training and testing: ["UA"], ["CC"] or ["UA", "CC"]
TRAIN_DOMAIN = ["UA","CC"]
TEST_DOMAIN = ["UA", "CC"] # The test data comes from a separate dataset.
# The test data is always the same regardless of the domain we choose to train on. This is for consistency.

# Select languages for training and testing: "ALL", "EN", "HI", "BG", "RU", "PT"
# NOTE(review): the dataset-assembly cell below passes only TRAIN_LANGUAGES (and only on the
# multi-task path) - confirm where TEST_LANGUAGES is consumed.
TRAIN_LANGUAGES = ["ALL"]
TEST_LANGUAGES = ["ALL"]

# Taxonomy depth
TAXONOMY_DEPTH = "COARSE" # "COARSE" or "FINE"

# Classifier complexity
CLASSIFIER_COMPLEXITY = "FLAT" # "FLAT" or "HIERARCHICAL"

# Change the training hyperparameters here
MODEL_NAME = "distilbert-base-uncased" # alternatives: "roberta-base", "FacebookAI/roberta-base"
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 3e-5
# Checkpoint file name, e.g. "entity_framing_UA-CC_to_UA-CC.pt" for the values above; the training
# cell passes it to the single-task training functions and reloads the model from it afterwards.
MODEL_PATH = f"{TASK}_{'-'.join(TRAIN_DOMAIN)}_to_{'-'.join(TEST_DOMAIN)}.pt" # -- to save the model later

# Model-specific tokenizers kept for reference; AutoTokenizer picks the tokenizer from MODEL_NAME.
#tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
#tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# debug mode -- reduced samples
# NOTE(review): DEBUG_MODE is not read by any of the cells visible around here - confirm it is still used.
DEBUG_MODE = False

In [None]:
# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Utils: Assemble Dataset

In [None]:
# Build the data splits (and, for the single-task path, the datasets and loaders)
# selected by TASK, TAXONOMY_DEPTH and CLASSIFIER_COMPLEXITY from the control panel.
# Unsupported combinations raise immediately: falling through silently would leave
# later cells running on variables left over from a previous configuration.
if TASK != "multi_task" and TASK != "multi_task_adapter":
    # ---- Single-task: one label space, loaders are built in this cell ----
    if TAXONOMY_DEPTH == 'FINE':
        if CLASSIFIER_COMPLEXITY == 'FLAT':
            df_train, df_val, df_test, y_train, y_val, y_test, mlb, TEXT_COL, LABEL_COL = prepare_data_STL_fine(
                TASK,
                TRAIN_DOMAIN,
                TEST_DOMAIN,
            )
        elif CLASSIFIER_COMPLEXITY == 'HIERARCHICAL':
            # The hierarchical loader additionally returns the label hierarchy
            # that the hierarchical training routine takes as input.
            df_train, df_val, df_test, y_train, y_val, y_test, mlb, TEXT_COL, LABEL_COL, child_to_parent, label_to_index = prepare_data_STL_hierarchical(
                TASK,
                TRAIN_DOMAIN,
                TEST_DOMAIN,
            )
        else:
            raise ValueError(f"Unknown CLASSIFIER_COMPLEXITY: {CLASSIFIER_COMPLEXITY}")

    elif TAXONOMY_DEPTH == 'COARSE':
        # CLASSIFIER_COMPLEXITY is not consulted for the coarse taxonomy.
        df_train, df_val, df_test, y_train, y_val, y_test, mlb, TEXT_COL, LABEL_COL = prepare_data_STL_coarse(
            TASK,
            TRAIN_DOMAIN,
            TEST_DOMAIN,
        )

    else:
        raise ValueError(f"Unknown TAXONOMY_DEPTH: {TAXONOMY_DEPTH}")

    train_dataset = MultiLabelDataset(df_train[TEXT_COL].tolist(), y_train, tokenizer, MAX_LEN)
    val_dataset = MultiLabelDataset(df_val[TEXT_COL].tolist(), y_val, tokenizer, MAX_LEN)
    test_dataset = MultiLabelDataset(df_test[TEXT_COL].tolist(), y_test, tokenizer, MAX_LEN)
    # Only the training loader is shuffled.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    num_classes = len(mlb.classes_)

else:
    # ---- Multi-task (with or without adapters): the prepare_* helpers return
    # ---- the per-subtask frames, label matrices, binarizers and loaders directly.
    if TAXONOMY_DEPTH == 'FINE':

        if CLASSIFIER_COMPLEXITY == 'FLAT':
            (
                df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                train_loader_s1, val_loader_s1, test_loader_s1,
                train_loader_s2, val_loader_s2, test_loader_s2,
                num_classes_dict
            ) = prepare_data_MTL_fine_flat(
                TASK,
                train_domains=TRAIN_DOMAIN,
                test_domains=TEST_DOMAIN,
                train_languages=TRAIN_LANGUAGES,
                model_name=MODEL_NAME,
                max_len=MAX_LEN,
                batch_size=BATCH_SIZE
            )

        elif CLASSIFIER_COMPLEXITY == 'HIERARCHICAL':
            (
                df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
                df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
                train_loader_s1, val_loader_s1, test_loader_s1,
                train_loader_s2, val_loader_s2, test_loader_s2,
                num_classes_dict,
                child_to_parent_map,
                label_to_index_map
            ) = prepare_data_MTL_hierarchical(
                TASK,
                train_domains=TRAIN_DOMAIN,
                test_domains=TEST_DOMAIN,
                train_languages=TRAIN_LANGUAGES,
                model_name=MODEL_NAME,
                max_len=MAX_LEN,
                batch_size=BATCH_SIZE
            )

        else:
            raise ValueError(f"Unknown CLASSIFIER_COMPLEXITY: {CLASSIFIER_COMPLEXITY}")

    elif TAXONOMY_DEPTH == 'COARSE':
        # CLASSIFIER_COMPLEXITY is not consulted for the coarse taxonomy.
        (
            df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
            df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
            train_loader_s1, val_loader_s1, test_loader_s1,
            train_loader_s2, val_loader_s2, test_loader_s2,
            num_classes_dict
        ) = prepare_data_MTL_coarse(
            TASK,
            train_domains=TRAIN_DOMAIN,
            test_domains=TEST_DOMAIN,
            train_languages=TRAIN_LANGUAGES,
            model_name=MODEL_NAME,
            max_len=MAX_LEN,
            batch_size=BATCH_SIZE
        )

    else:
        raise ValueError(f"Unknown TAXONOMY_DEPTH: {TAXONOMY_DEPTH}")




In [None]:
# Fine-Fine
prepare_data_MTL_mixed(..., "fine", "fine")

# Fine-Coarse
prepare_data_MTL_mixed(..., "fine", "coarse")

# Coarse-Fine
prepare_data_MTL_mixed(..., "coarse", "fine")

# Coarse-Coarse
prepare_data_MTL_mixed(..., "coarse", "coarse")


In [None]:
# Mixed-granularity multi-task data: requests granularity "coarse" for subtask 1 (`*_s1`)
# and "fine" for subtask 2 (`*_s2`), using the control-panel settings for everything else.
# NOTE(review): this rebinds the same `*_s1` / `*_s2` / `num_classes_dict` names that the
# dataset-assembly cell above assigns on its multi-task path, so whichever of the two cells
# ran last determines what the training cell sees.
(
    df_train_s1, df_val_s1, df_test_s1, y_train_s1, y_val_s1, y_test_s1, mlb_s1,
    df_train_s2, df_val_s2, df_test_s2, y_train_s2, y_val_s2, y_test_s2, mlb_s2,
    train_loader_s1, val_loader_s1, test_loader_s1,
    train_loader_s2, val_loader_s2, test_loader_s2,
    num_classes_dict
) = prepare_data_MTL_mixed(
    TASK,
    train_domains=TRAIN_DOMAIN,
    test_domains=TEST_DOMAIN,
    train_languages=TRAIN_LANGUAGES,
    model_name=MODEL_NAME,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE,
    granularity_s1="coarse",
    granularity_s2="fine"
)



## Training Loop

In [None]:
# Train the model selected in the control panel and leave the result in `trained_model`.
# Inputs come from the dataset-assembly cell: `train_loader` / `val_loader` / `y_val` /
# `num_classes` on the single-task path, the `*_s1` / `*_s2` objects on the multi-task path.
# NOTE(review): `eval_util` is used below but not imported in this cell - it must already
# be in scope from an earlier cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if TASK != "multi_task" and TASK != "multi_task_adapter":
    print("\n>>> Running Single-Task (no adapter) Model <<<")
    model = TransformerClassifier(MODEL_NAME, num_classes).to(device)

    if CLASSIFIER_COMPLEXITY == 'FLAT':
        trained_model = train_single_task_model(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            y_val=y_val,
            MODEL_PATH=MODEL_PATH,
            LEARNING_RATE=LEARNING_RATE,
            EPOCHS=EPOCHS,
            device=device,
            predict_proba=eval_util.predict_proba,
            evaluate_threshold_sweep=eval_util.evaluate_threshold_sweep
        )

    elif CLASSIFIER_COMPLEXITY == 'HIERARCHICAL':
        trained_model = train_hierarchical_classifier(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            y_val=y_val,
            MODEL_PATH=MODEL_PATH,
            child_to_parent=child_to_parent,
            label_to_index=label_to_index,
            predict_proba=eval_util.predict_proba,
            evaluate_threshold_sweep=eval_util.evaluate_threshold_sweep,
            LEARNING_RATE=LEARNING_RATE,
            EPOCHS=EPOCHS
        )

    else:
        # Without this guard nothing is trained and `trained_model` keeps whatever
        # an earlier run left behind.
        raise ValueError(f"Unknown CLASSIFIER_COMPLEXITY: {CLASSIFIER_COMPLEXITY}")

    # Reload the weights stored at MODEL_PATH (presumably the best checkpoint written
    # by the training routine). map_location keeps a GPU-saved file loadable on CPU.
    trained_model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    trained_model.to(device)

else:
    # Multi-task path: plain shared-encoder model or the adapter variant. Both use
    # the same data, optimiser, loss and checkpoint naming; only the model class differs.
    is_adapter = TASK == "multi_task_adapter"

    if is_adapter:
        print("\n>>> Running Multi-Task Adapter Model <<<")
    else:
        print("\n>>> Running Multi-Task (no adapter) Model <<<")

    task_classes = {
        "narrative_classification": y_train_s2.shape[1],
        "entity_framing": y_train_s1.shape[1]
    }

    if is_adapter:
        model = AdapterMultiTaskTransformer(
            model_name=MODEL_NAME,
            num_classes_dict=task_classes,
            adapter_dim=128
        ).to(device)
    else:
        model = MultiTaskTransformer(MODEL_NAME, task_classes).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.BCEWithLogitsLoss()

    mtl_loaders = {
        "narrative_classification": train_loader_s2,
        "entity_framing": train_loader_s1
    }
    mtl_val_data = {
        "narrative_classification": (val_loader_s2, df_val_s2, y_val_s2, mlb_s2),
        "entity_framing": (val_loader_s1, df_val_s1, y_val_s1, mlb_s1)
    }

    if CLASSIFIER_COMPLEXITY == 'FLAT':
        train_mtl_flat(
            model=model,
            loaders=mtl_loaders,
            val_data=mtl_val_data,
            mlbs={
                "narrative_classification": mlb_s2,
                "entity_framing": mlb_s1
            },
            optimizer=optimizer,
            criterion=criterion,
            device=device,
            epochs=EPOCHS,
            train_domain=TRAIN_DOMAIN,
            test_domain=TEST_DOMAIN
        )

    elif CLASSIFIER_COMPLEXITY == 'HIERARCHICAL' and not is_adapter:
        train_mtl_hierarchical(
            model=model,
            loaders=mtl_loaders,
            val_data=mtl_val_data,
            child_to_parent_map=child_to_parent_map,
            label_to_index_map=label_to_index_map,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
            epochs=EPOCHS,
            train_domain=TRAIN_DOMAIN,
            test_domain=TEST_DOMAIN
        )

    else:
        # No training routine is wired up for this combination (e.g. adapter +
        # hierarchical). The checkpoints below are still loaded, as before, but the
        # user is told they come from an earlier run rather than from this cell.
        print(
            f"[!] No training routine for TASK={TASK} with CLASSIFIER_COMPLEXITY={CLASSIFIER_COMPLEXITY}; "
            "loading checkpoints already on disk without training."
        )

    # Re-load best saved model per task. strict=False tolerates key mismatches
    # between a checkpoint and the model.
    for ckpt_task in ("entity_framing", "narrative_classification"):
        ckpt_path = f"{ckpt_task}_MTL_{'-'.join(TRAIN_DOMAIN)}_to_{'-'.join(TEST_DOMAIN)}.pt"
        model.load_state_dict(torch.load(ckpt_path, map_location=device), strict=False)
    trained_model = model



>>> Running Single-Task (no adapter) Model <<<


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 671/671 [02:02<00:00,  5.46it/s]



Epoch 1: Loss = 0.5614
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.525 | Micro F1: 0.546 | Exact Match: 0.047
Thresh 0.15 | Macro F1: 0.583 | Micro F1: 0.614 | Exact Match: 0.243
Thresh 0.20 | Macro F1: 0.595 | Micro F1: 0.636 | Exact Match: 0.406
Thresh 0.25 | Macro F1: 0.589 | Micro F1: 0.643 | Exact Match: 0.504
Thresh 0.30 | Macro F1: 0.561 | Micro F1: 0.635 | Exact Match: 0.556
Thresh 0.35 | Macro F1: 0.529 | Micro F1: 0.625 | Exact Match: 0.581
Thresh 0.40 | Macro F1: 0.499 | Micro F1: 0.618 | Exact Match: 0.598
Thresh 0.45 | Macro F1: 0.482 | Micro F1: 0.613 | Exact Match: 0.596
Thresh 0.50 | Macro F1: 0.464 | Micro F1: 0.602 | Exact Match: 0.567
Thresh 0.55 | Macro F1: 0.445 | Micro F1: 0.593 | Exact Match: 0.540
Thresh 0.60 | Macro F1: 0.417 | Micro F1: 0.571 | Exact Match: 0.494
Thresh 0.65 | Macro F1: 0.393 | Micro F1: 0.542 | Exact Match: 0.440
Thresh 0.70 | Macro F1: 0.336 | Micro F1: 0.469 | Exact Match: 0.341
Thresh 0.75 | Macro F1: 0.268 | Micro F1: 0.349 | Exac

Epoch 2: 100%|██████████| 671/671 [02:02<00:00,  5.47it/s]



Epoch 2: Loss = 0.4726
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.671 | Micro F1: 0.671 | Exact Match: 0.316
Thresh 0.15 | Macro F1: 0.714 | Micro F1: 0.716 | Exact Match: 0.453
Thresh 0.20 | Macro F1: 0.739 | Micro F1: 0.747 | Exact Match: 0.542
Thresh 0.25 | Macro F1: 0.752 | Micro F1: 0.766 | Exact Match: 0.612
Thresh 0.30 | Macro F1: 0.753 | Micro F1: 0.777 | Exact Match: 0.666
Thresh 0.35 | Macro F1: 0.757 | Micro F1: 0.787 | Exact Match: 0.716
Thresh 0.40 | Macro F1: 0.756 | Micro F1: 0.794 | Exact Match: 0.748
Thresh 0.45 | Macro F1: 0.757 | Micro F1: 0.800 | Exact Match: 0.766
Thresh 0.50 | Macro F1: 0.749 | Micro F1: 0.797 | Exact Match: 0.757
Thresh 0.55 | Macro F1: 0.739 | Micro F1: 0.789 | Exact Match: 0.726
Thresh 0.60 | Macro F1: 0.722 | Micro F1: 0.777 | Exact Match: 0.693
Thresh 0.65 | Macro F1: 0.696 | Micro F1: 0.751 | Exact Match: 0.643
Thresh 0.70 | Macro F1: 0.673 | Micro F1: 0.727 | Exact Match: 0.602
Thresh 0.75 | Macro F1: 0.645 | Micro F1: 0.697 | Exac

Epoch 3: 100%|██████████| 671/671 [02:02<00:00,  5.48it/s]



Epoch 3: Loss = 0.3479
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.750 | Micro F1: 0.754 | Exact Match: 0.529
Thresh 0.15 | Macro F1: 0.794 | Micro F1: 0.796 | Exact Match: 0.622
Thresh 0.20 | Macro F1: 0.818 | Micro F1: 0.823 | Exact Match: 0.686
Thresh 0.25 | Macro F1: 0.842 | Micro F1: 0.847 | Exact Match: 0.742
Thresh 0.30 | Macro F1: 0.855 | Micro F1: 0.862 | Exact Match: 0.789
Thresh 0.35 | Macro F1: 0.867 | Micro F1: 0.875 | Exact Match: 0.824
Thresh 0.40 | Macro F1: 0.872 | Micro F1: 0.884 | Exact Match: 0.850
Thresh 0.45 | Macro F1: 0.871 | Micro F1: 0.885 | Exact Match: 0.854
Thresh 0.50 | Macro F1: 0.870 | Micro F1: 0.884 | Exact Match: 0.848
Thresh 0.55 | Macro F1: 0.865 | Micro F1: 0.878 | Exact Match: 0.828
Thresh 0.60 | Macro F1: 0.855 | Micro F1: 0.867 | Exact Match: 0.800
Thresh 0.65 | Macro F1: 0.837 | Micro F1: 0.851 | Exact Match: 0.766
Thresh 0.70 | Macro F1: 0.824 | Micro F1: 0.838 | Exact Match: 0.741
Thresh 0.75 | Macro F1: 0.797 | Micro F1: 0.811 | Exac

Epoch 4: 100%|██████████| 671/671 [02:02<00:00,  5.48it/s]



Epoch 4: Loss = 0.2459
Threshold sweep results:
Thresh 0.10 | Macro F1: 0.817 | Micro F1: 0.834 | Exact Match: 0.743
Thresh 0.15 | Macro F1: 0.861 | Micro F1: 0.873 | Exact Match: 0.803
Thresh 0.20 | Macro F1: 0.889 | Micro F1: 0.898 | Exact Match: 0.842
Thresh 0.25 | Macro F1: 0.904 | Micro F1: 0.911 | Exact Match: 0.867
Thresh 0.30 | Macro F1: 0.917 | Micro F1: 0.923 | Exact Match: 0.890
Thresh 0.35 | Macro F1: 0.924 | Micro F1: 0.931 | Exact Match: 0.909
Thresh 0.40 | Macro F1: 0.927 | Micro F1: 0.935 | Exact Match: 0.923
Thresh 0.45 | Macro F1: 0.926 | Micro F1: 0.934 | Exact Match: 0.919
Thresh 0.50 | Macro F1: 0.924 | Micro F1: 0.932 | Exact Match: 0.911
Thresh 0.55 | Macro F1: 0.920 | Micro F1: 0.928 | Exact Match: 0.899
Thresh 0.60 | Macro F1: 0.912 | Micro F1: 0.923 | Exact Match: 0.881
Thresh 0.65 | Macro F1: 0.905 | Micro F1: 0.915 | Exact Match: 0.863
Thresh 0.70 | Macro F1: 0.898 | Micro F1: 0.909 | Exact Match: 0.848
Thresh 0.75 | Macro F1: 0.890 | Micro F1: 0.900 | Exac

## Evaluation

In [None]:
# ==========================
# EVALUATION
# ==========================
# Evaluates `trained_model` (set by the training cell) and stores the outcome
# in a `results_*` variable. Two notebook-level settings pick the routine:
#   TASK                  - "multi_task" / "multi_task_adapter" select the
#                           multi-task path; any other value is single-task.
#   CLASSIFIER_COMPLEXITY - 'FLAT' or 'HIERARCHICAL'.
is_multi_task = TASK in ("multi_task", "multi_task_adapter")

# Fail loudly on an unknown setting; otherwise the cell would finish without
# evaluating anything and without any visible hint as to why.
if CLASSIFIER_COMPLEXITY not in ('FLAT', 'HIERARCHICAL'):
    raise ValueError(
        f"Unsupported CLASSIFIER_COMPLEXITY {CLASSIFIER_COMPLEXITY!r}; "
        "expected 'FLAT' or 'HIERARCHICAL'."
    )

# ==========================
# EVALUATION (Single Task)
# ==========================
if not is_multi_task:
    print(f"\nEvaluating Single-Task Model ({TASK})")

    if CLASSIFIER_COMPLEXITY == 'FLAT':
        # Validation + test evaluation; frames are passed with a fresh
        # 0..n-1 index.
        results_domain = eval_util.evaluate_per_domain_flat(
            trained_model,
            val_loader, df_val.reset_index(drop=True),
            test_loader, df_test.reset_index(drop=True),
            mlb,
            device=device
        )

        # Second pass over the test split only, labelled "TEST".
        results_class = eval_util.evaluate_per_class_flat(
            trained_model,
            test_loader,
            df_test.reset_index(drop=True),
            mlb,
            device=device,
            label="TEST"
        )

    else:  # 'HIERARCHICAL'
        results_hierarchical = eval_util.evaluate_and_compare_hierarchical(
            model=trained_model,
            val_loader=val_loader,
            val_df=df_val.reset_index(drop=True),
            val_targets=y_val,
            test_loader=test_loader,
            test_df=df_test.reset_index(drop=True),
            test_targets=y_test,
            mlb=mlb,
            device=device,
            child_to_parent=child_to_parent,
            label_to_index=label_to_index
        )

# ==========================
# EVALUATION (Multi-Task)
# ==========================
else:
    print(f"\nEvaluating Multi-Task Model ({TASK})")

    # Per-task test inputs, keyed by task name; shared by both complexity
    # settings below.
    task_loaders = {
        "narrative_classification": test_loader_s2,
        "entity_framing": test_loader_s1,
    }

    task_dfs = {
        "narrative_classification": df_test_s2,
        "entity_framing": df_test_s1,
    }

    task_targets = {
        "narrative_classification": y_test_s2,
        "entity_framing": y_test_s1,
    }

    task_mlbs = {
        "narrative_classification": mlb_s2,
        "entity_framing": mlb_s1,
    }

    if CLASSIFIER_COMPLEXITY == 'FLAT':
        # NOTE(review): domain_list receives TRAIN_DOMAIN although the inputs
        # are the test splits - confirm this is what evaluate_mtl_all_tasks
        # expects (it may only matter when load_from_disk=True).
        results_mtl = eval_util.evaluate_mtl_all_tasks(
            model=trained_model,
            task_loaders=task_loaders,
            task_dfs=task_dfs,
            task_targets=task_targets,
            task_mlbs=task_mlbs,
            domain_list=TRAIN_DOMAIN,
            device=device,
            load_from_disk=False
        )

    else:  # 'HIERARCHICAL'
        # Keep whatever the routine returns, in line with the other branches
        # (this is None if the routine only prints its report).
        results_mtl_hierarchical = eval_util.evaluate_mtl_hierarchical_all_tasks(
            model=trained_model,
            test_loaders=task_loaders,
            df_tests=task_dfs,
            y_tests=task_targets,
            mlbs=task_mlbs,
            child_to_parent_map=child_to_parent_map,
            label_to_index_map=label_to_index_map,
            device=device
        )



Evaluating Single-Task Model (entity_framing)

Validation (Fixed Threshold)


Evaluating VALIDATION: 100%|██████████| 287/287 [00:17<00:00, 15.96it/s]



 VALIDATION (Fixed Threshold=0.35):
Macro F1: 0.886
Micro F1: 0.898
Exact Match: 0.865

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: UA
Macro F1: 0.886
Micro F1: 0.898
Exact Match: 0.865

Test (Fixed Threshold)


Evaluating TEST: 100%|██████████| 56/56 [00:03<00:00, 15.43it/s]



 TEST (Fixed Threshold=0.35):
Macro F1: 0.634
Micro F1: 0.634
Exact Match: 0.562

----------------------------
Per-Domain Breakdown
----------------------------

 Domain: CC
Macro F1: 0.635
Micro F1: 0.787
Exact Match: 0.780

 Domain: UA
Macro F1: 0.557
Micro F1: 0.598
Exact Match: 0.507

OOD Generalization (Fixed Threshold)
Δ Macro F1 (val - test): 0.253


Evaluating TEST: 100%|██████████| 56/56 [00:03<00:00, 15.48it/s]


TEST (Fixed Threshold=0.35):
Macro F1: 0.634
Micro F1: 0.634
Exact Match: 0.562

----------------------------
Classification Report (All Domains)
----------------------------
              precision    recall  f1-score   support

  Antagonist       0.61      0.71      0.65       168
    Innocent       0.59      0.70      0.64       122
 Protagonist       0.60      0.62      0.61       158

   micro avg       0.60      0.67      0.63       448
   macro avg       0.60      0.68      0.63       448
weighted avg       0.60      0.67      0.63       448
 samples avg       0.62      0.67      0.64       448


----------------------------
Per-Domain Breakdown
----------------------------

Domain: CC
Macro F1: 0.635
Micro F1: 0.787
Exact Match: 0.780
Classification Report:
              precision    recall  f1-score   support

  Antagonist       0.75      0.23      0.35        13
    Innocent       0.77      0.98      0.86        60
 Protagonist       0.91      0.56      0.69        18

   mi


