In [None]:
# Mount Google Drive at /content/drive; the JSON paths configured further down
# in this notebook point into /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import json
import random
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import spearmanr

import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)


In [None]:
# seeding for reproducibility
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (plus all
    CUDA devices when a GPU is available), then calls ``set_seed`` from
    ``transformers`` with the same value.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

    set_seed(seed)


seed_everything(42)

In [None]:
# Folder holding the task JSON files. It defaults to the mounted Google Drive
# location and can be overridden through the NLP_DATA_DIR environment variable,
# so the notebook can run outside Colab without editing code.
DATA_DIR = os.environ.get("NLP_DATA_DIR", "/content/drive/MyDrive/nlp")

TRAIN_JSON_PATH = f"{DATA_DIR}/train.json"
DEV_JSON_PATH   = f"{DATA_DIR}/dev.json"

In [None]:
def load_json_records(path):
    """Read a UTF-8 JSON file and return its records.

    A top-level JSON object is reduced to the list of its values (the keys
    are dropped); any other top-level value is returned exactly as parsed.
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.loads(handle.read())

    if isinstance(payload, dict):
        return list(payload.values())
    return payload

# Parse both splits with the same loader (train first, then dev).
train_records, dev_records = (
    load_json_records(split_path)
    for split_path in (TRAIN_JSON_PATH, DEV_JSON_PATH)
)

In [None]:
# This cell used to be a verbatim copy of the loading cell above: it redefined
# load_json_records and parsed both JSON files a second time. The duplicate
# definition and the repeated disk reads are removed; the cell now only checks
# that the records loaded above are usable before examples are built.
for split_name, split_records in (("train", train_records), ("dev", dev_records)):
    assert len(split_records) > 0, f"{split_name} split is empty"
    print(f"{split_name}: {len(split_records)} top-level records")


In [None]:
#building input examples
def _flatten_records(items):
    """Yield the leaves of an arbitrarily nested list, depth-first."""
    for item in items:
        if isinstance(item, list):
            yield from _flatten_records(item)
        else:
            yield item


def _clean_text(record, key):
    """Return ``record[key]`` stripped; a missing or null value becomes ""."""
    return (record.get(key) or "").strip()


def _to_float(value, default=0.0):
    """Convert ``value`` to float, substituting ``default`` when it is None."""
    return float(default if value is None else value)


def build_examples(records):
    """Turn raw JSON records into flat example dicts for the model.

    Args:
        records: a single record dict, or a (possibly nested) list of record
            dicts. Items that are not dicts are skipped.

    Returns:
        A list with one dict per record, holding:
          * ``context``  - "Story:" followed by precontext, sentence and (when
            non-empty) ending on separate lines, plus an optional
            "Example sentence: ..." line.
          * ``meaning``  - the judged meaning under a fixed header line.
          * ``label`` / ``stdev`` - ``average`` / ``stdev`` as floats; a missing
            or null value becomes 0.0.
          * ``group_id`` - ``"<homonym>||<sentence>"``.
          * ``choices``, ``nonsensical``, ``sample_id`` - copied through as-is.
    """
    if isinstance(records, dict):
        records = [records]

    examples = []
    for record in _flatten_records(records):
        if not isinstance(record, dict):
            continue

        pre = _clean_text(record, "precontext")
        sent = _clean_text(record, "sentence")
        end = _clean_text(record, "ending")
        meaning = _clean_text(record, "judged_meaning")

        meaning_txt = (
            "Meaning (intended definition):\n"
            f"{meaning}"
        )

        story_parts = [pre, sent]
        if end:
            story_parts.append(end)
        story_txt = "Story:\n" + "\n".join(story_parts)

        ex_sent = _clean_text(record, "example_sentence")
        if ex_sent:
            story_txt += f"\nExample sentence: {ex_sent}"

        examples.append({
            "context": story_txt,
            "meaning": meaning_txt,
            # The text fields above already tolerate JSON null; the numeric
            # fields must too, otherwise float(None) raises TypeError.
            "label": _to_float(record.get("average")),
            "stdev": _to_float(record.get("stdev")),
            "group_id": f"{record.get('homonym', '')}||{sent}",
            "choices": record.get("choices"),
            "nonsensical": record.get("nonsensical"),
            "sample_id": record.get("sample_id"),
        })

    return examples


In [None]:
# loading dataset: build flat example dicts per split and wrap them in DataFrames
train_examples, dev_examples = build_examples(train_records), build_examples(dev_records)
train_df, dev_df = pd.DataFrame(train_examples), pd.DataFrame(dev_examples)

# Dev-set targets, stdev values and group ids as plain arrays/lists; the final
# evaluation and bootstrap cells index into these.
dev_labels = dev_df["label"].to_numpy(dtype=float)
dev_stdevs = dev_df["stdev"].to_numpy(dtype=float)
dev_groups = list(dev_df["group_id"])

In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Splitting the original train_df roughly 80/20 *by group*. Rows with the same
# group_id share the same homonym and sentence text, so a row-level random
# split puts near-duplicate inputs on both sides and inflates the validation
# score that drives best-checkpoint selection.
# NOTE: test_size is the fraction of groups (not rows) held out, so the row
# ratio is only approximately 80/20.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(
    group_splitter.split(train_df, groups=train_df["group_id"])
)
train_df_split = train_df.iloc[train_idx]
val_df_split = train_df.iloc[val_idx]

assert set(train_df_split["group_id"]).isdisjoint(val_df_split["group_id"]), (
    "train and validation splits share a group_id"
)

# building HF DatasetDict
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df_split, preserve_index=False),
    "validation": Dataset.from_pandas(val_df_split, preserve_index=False),
    "test": Dataset.from_pandas(dev_df, preserve_index=False)
})

In [None]:
# model checkpoint and its tokenizer
MODEL_NAME = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_batch(batch):
    """Tokenize a batch as (context, meaning) text pairs.

    Each pair is truncated and padded to a fixed length of 256 tokens.
    """
    first_segments = batch["context"]
    second_segments = batch["meaning"]
    encode_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    return tokenizer(first_segments, second_segments, **encode_kwargs)

# Tokenize every split, then rename the target column to "labels".
tokenized = dataset.map(tokenize_batch, batched=True).rename_column("label", "labels")

# Remove non-tensor columns (only those actually present in the train split)
droppable = [
    col
    for col in ["context", "meaning", "stdev", "group_id", "sample_id"]
    if col in tokenized["train"].column_names
]
if droppable:
    tokenized = tokenized.remove_columns(droppable)

In [None]:
# Load the pretrained checkpoint with a sequence-classification head set up for
# regression: a single output (num_labels=1) and problem_type="regression".
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    problem_type="regression"
)

In [None]:
# metrics
def compute_metrics(eval_pred):
    """Compute Spearman correlation and MAE for the Trainer's evaluation loop.

    Args:
        eval_pred: ``(predictions, labels)`` pair. Both are flattened to 1-D;
            predictions are clipped to [1.0, 5.0] before scoring.

    Returns:
        dict with ``"spearman"`` and ``"mae"`` as plain floats. Spearman is
        reported as 0.0 when it is undefined (constant predictions or labels,
        or fewer than two samples): a NaN would make every later
        "is this checkpoint better?" comparison on the spearman metric false.
    """
    preds, labels = eval_pred
    # reshape(-1) instead of squeeze(): a single-sample batch must stay 1-D.
    preds = np.clip(np.asarray(preds).reshape(-1), 1.0, 5.0)
    labels = np.asarray(labels).reshape(-1)

    mae = np.mean(np.abs(labels - preds))

    spearman_corr = float("nan")
    if preds.size >= 2:
        spearman_corr = spearmanr(labels, preds).correlation
    if np.isnan(spearman_corr):
        # Same fallback the final evaluation uses when no group is scorable.
        spearman_corr = 0.0

    return {
        "spearman": float(spearman_corr),
        "mae": float(mae),
    }

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the highest validation Spearman at the end of training.
args = TrainingArguments(
    # Explicit checkpoint directory: epoch checkpoints must be written somewhere
    # for load_best_model_at_end to work, and output_dir=None is only accepted
    # by recent transformers releases (which fall back to this same name).
    output_dir="trainer_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="spearman",
    greater_is_better=True,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.06,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_steps=20,
    save_total_limit=1,
    report_to="wandb"
)

# The validation split drives per-epoch evaluation; the "test" split is only
# used after training.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
print("\nFINAL EVAL ON DEV (TEST)")

# Predict on the "test" split (built from the dev file) and clip to [1, 5].
pred = trainer.predict(tokenized["test"])
raw_preds = np.clip(np.squeeze(pred.predictions), 1.0, 5.0)

# Absolute error per sample, reused for MAE and accuracy-within-stdev
errors = np.abs(raw_preds - dev_labels)

# Global Spearman & MAE
global_spearman = spearmanr(dev_labels, raw_preds).correlation
global_mae = np.mean(errors)

# Macro-Spearman: average of the per-group correlations
group_indices = defaultdict(list)
for row, gid in enumerate(dev_groups):
    group_indices[gid].append(row)

group_sps = []
for rows in group_indices.values():
    g_true, g_pred = dev_labels[rows], raw_preds[rows]

    # Only score groups whose true labels are not all identical.
    if (g_true != g_true[0]).any():
        corr = spearmanr(g_true, g_pred).correlation
        if not np.isnan(corr):
            group_sps.append(corr)

if group_sps:
    macro_spearman = float(np.mean(group_sps))
else:
    macro_spearman = 0.0

# Accuracy within stdev: share of predictions no further from the label than
# that sample's stdev
within = errors <= dev_stdevs
acc_stdev = float(within.mean())

print(f"Global Spearman:       {global_spearman:.4f}")
print(f"Macro Spearman:        {macro_spearman:.4f}")
print(f"MAE:                   {global_mae:.4f}")
print(f"Accuracy within stdev: {acc_stdev:.4f}")


In [None]:
#bootstrapping
def _macro_spearman(true_vals, pred_vals, group_ids):
    """Mean per-group Spearman correlation, or NaN when no group is scorable.

    Groups whose true values are all equal, or whose correlation comes out as
    NaN, are left out of the mean.
    """
    rows_by_group = defaultdict(list)
    for row, gid in enumerate(group_ids):
        rows_by_group[gid].append(row)

    group_corrs = []
    for rows in rows_by_group.values():
        gt = true_vals[rows]
        gp = pred_vals[rows]

        if np.all(gt == gt[0]):
            continue

        c = spearmanr(gt, gp).correlation
        if not np.isnan(c):
            group_corrs.append(c)

    return float(np.mean(group_corrs)) if group_corrs else np.nan


def bootstrap_test_metrics(
    y_true,
    y_pred,
    y_stdev,
    groups,
    n_bootstrap=1000,
    seed=42
):
    """Bootstrap the test metrics by resampling rows with replacement.

    Args:
        y_true: true scores (any 1-D sequence or array).
        y_pred: predicted scores, aligned with ``y_true``.
        y_stdev: per-sample tolerance for the "accuracy within stdev" metric.
        groups: per-sample group ids for the macro Spearman.
        n_bootstrap: number of resamples to draw.
        seed: seed for the NumPy ``default_rng`` generator.

    Returns:
        dict keyed by ``global_spearman``, ``macro_spearman``, ``mae`` and
        ``acc_within_stdev``; each value holds the NaN-ignoring ``mean`` and
        the 2.5th / 97.5th percentiles (``ci_low`` / ``ci_high``) over the
        resamples.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    # Accept plain lists as well as arrays: the fancy indexing below needs ndarrays.
    y_true = np.asarray(y_true, dtype=float).reshape(-1)
    y_pred = np.asarray(y_pred, dtype=float).reshape(-1)
    y_stdev = np.asarray(y_stdev, dtype=float).reshape(-1)
    groups = list(groups)

    n = len(y_true)
    if not (len(y_pred) == len(y_stdev) == len(groups) == n):
        raise ValueError(
            "y_true, y_pred, y_stdev and groups must all have the same length"
        )
    if n == 0:
        raise ValueError("cannot bootstrap an empty test set")

    rng = np.random.default_rng(seed)

    global_sps = []
    macro_sps = []
    maes = []
    acc_stdevs = []

    for _ in range(n_bootstrap):
        idx = rng.integers(0, n, size=n)

        bt_true = y_true[idx]
        bt_pred = y_pred[idx]
        bt_stdev = y_stdev[idx]
        bt_groups = [groups[i] for i in idx]
        abs_err = np.abs(bt_true - bt_pred)

        # ---- Global Spearman (may be NaN for a degenerate resample) ----
        global_sps.append(spearmanr(bt_true, bt_pred).correlation)

        # ---- MAE ----
        maes.append(np.mean(abs_err))

        # ---- Accuracy within stdev ----
        acc_stdevs.append(np.mean(abs_err <= bt_stdev))

        # ---- Macro Spearman ----
        macro_sps.append(_macro_spearman(bt_true, bt_pred, bt_groups))

    def summarize(arr):
        # NaN-aware so that degenerate resamples do not poison the summary.
        arr = np.array(arr, dtype=float)
        return {
            "mean": float(np.nanmean(arr)),
            "ci_low": float(np.nanpercentile(arr, 2.5)),
            "ci_high": float(np.nanpercentile(arr, 97.5)),
        }

    return {
        "global_spearman": summarize(global_sps),
        "macro_spearman": summarize(macro_sps),
        "mae": summarize(maes),
        "acc_within_stdev": summarize(acc_stdevs),
    }


In [None]:
print("\nBOOTSTRAP RESULTS (TEST SET)")

# 1000 resamples of the dev-set predictions (seed left at the function default)
bootstrap_results = bootstrap_test_metrics(
    dev_labels,
    raw_preds,
    dev_stdevs,
    dev_groups,
    n_bootstrap=1000,
)

# One line per metric: name, bootstrap mean, then the percentile interval.
for metric, stats in bootstrap_results.items():
    mean_val, low_val, high_val = (
        stats[key] for key in ("mean", "ci_low", "ci_high")
    )
    print(f"{metric:20s}: {mean_val:.4f} [{low_val:.4f}, {high_val:.4f}]")