In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import json
import random
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from collections import defaultdict

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)


In [None]:
def seed_everything(seed=42):
    """Keep every source of randomness in sync for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    set_seed(seed)

seed_everything(42)

In [22]:
TRAIN_JSON_PATH = "/content/drive/MyDrive/nlp/train.json"
DEV_JSON_PATH   = "/content/drive/MyDrive/nlp/dev.json"
SYN_JSON_PATH   = "/content/drive/MyDrive/nlp/merged_updated.json"
MODEL_NAME = "FacebookAI/roberta-base"

In [None]:
def load_json_records(path, key=None):
    """Read a JSON file from disk and optionally select a nested key."""
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if key:
        data = data[key]
    return list(data.values()) if isinstance(data, dict) else data

In [None]:
def build_examples(records):
    """Transform nested JSON annotations into flat training examples."""
    # Normalize: flatten nested lists so every item is a dict
    def flatten(xs):
        for x in xs:
            if isinstance(x, list):
                yield from flatten(x)
            else:
                yield x

    # If a single dict is passed
    if isinstance(records, dict):
        records = [records]

    # Flatten any nested structure
    records = list(flatten(records))

    out = []

    for r in records:
        if not isinstance(r, dict):
            continue

        pre = (r.get("precontext") or "").strip()
        sent = (r.get("sentence") or "").strip()
        end = (r.get("ending") or "").strip()
        meaning = (r.get("judged_meaning") or "").strip()

        meaning_txt = (
            "Meaning (intended definition):\n"
            f"{meaning}"
        )

        # Concatenate the narrative snippets into a coherent block.
        story_parts = [pre, sent]
        if end:
            story_parts.append(end)

        story_txt = "Story:\n" + "\n".join(story_parts)

        ex_sent = (r.get("example_sentence") or "").strip()
        if ex_sent:
            story_txt += f"\nExample sentence: {ex_sent}"

        label = float(r.get("average", 0.0))

        gid = f"{r.get('homonym', '')}||{sent}"

        # Keep auxiliary metadata for downstream stats and debugging.
        out.append({
            "context": story_txt,
            "meaning": meaning_txt,
            "label": label,
            "stdev": float(r.get("stdev", 0)),
            "group_id": gid,
            "choices": r.get("choices"),
            "nonsensical": r.get("nonsensical"),
            "sample_id": r.get("sample_id")
        })

    return out


In [None]:
# Flatten each JSON split into DataFrames so we can concatenate later.
train_records = load_json_records(TRAIN_JSON_PATH)
dev_records   = load_json_records(DEV_JSON_PATH)
syn_records   = load_json_records(SYN_JSON_PATH)
train_df = pd.DataFrame(build_examples(train_records))
dev_df   = pd.DataFrame(build_examples(dev_records))
syn_df   = pd.DataFrame(build_examples(syn_records))
print("TRAIN:", len(train_df))
print("DEV:", len(dev_df))
print("SYN:", len(syn_df))


TRAIN: 2280
DEV: 588
SYN: 792


In [26]:
print(train_df.head())

                                             context  \
0  Story:\nThe old machine hummed in the corner o...   
1  Story:\nThe old machine hummed in the corner o...   
2  Story:\nThe old machine hummed in the corner o...   
3  Story:\nThe old machine hummed in the corner o...   
4  Story:\nThe old machine hummed in the corner o...   

                                             meaning  label     stdev  \
0  Meaning (intended definition):\nthe difference...    3.0  1.581139   
1  Meaning (intended definition):\nthe inherent c...    3.8  0.836660   
2  Meaning (intended definition):\nthe difference...    2.2  1.303840   
3  Meaning (intended definition):\nthe inherent c...    4.4  0.894427   
4  Meaning (intended definition):\nthe difference...    2.6  1.516575   

                                         group_id          choices  \
0  potential||The potential couldn't be measured.  [4, 5, 2, 3, 1]   
1  potential||The potential couldn't be measured.  [5, 3, 4, 4, 3]   
2  potential||

In [None]:

# Augment with synthetic annotations before the train/val split.
train_df = pd.concat([train_df, syn_df])
print("TRAIN:", len(train_df))

TRAIN: 3072


In [None]:
# Reserve out 20% for validation monitoring.
train_df, val_df = train_test_split(
    train_df, test_size=0.20, random_state=42
)

print("TRAIN:", len(train_df), "VAL:", len(val_df))


TRAIN: 2457 VAL: 615


In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    """Tokenize the paired inputs once and stash the regression label."""
    enc = tokenizer(
        batch["context"],
        batch["meaning"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    enc["labels"] = batch["label"]
    return enc


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Cache tokenized datasets so the Trainer works with arrow batches directly.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False).map(tokenize_batch, batched=True)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False).map(tokenize_batch, batched=True)
dev_dataset   = Dataset.from_pandas(dev_df, preserve_index=False).map(tokenize_batch, batched=True)

Map:   0%|          | 0/2457 [00:00<?, ? examples/s]

Map:   0%|          | 0/615 [00:00<?, ? examples/s]

Map:   0%|          | 0/588 [00:00<?, ? examples/s]

In [31]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    problem_type="regression"
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Compute ranking fidelity (Spearman) and scale error (MAE)."""
    preds, labels = eval_pred
    preds = preds.reshape(-1)

    sp = spearmanr(labels, preds).correlation
    mae = np.mean(np.abs(labels - preds))

    return {"spearman": float(sp), "mae": float(mae)}


In [None]:
# Baseline hyperparameters for adapting RoBERTa-base to score plausibility.
 args = TrainingArguments(
    output_dir="./tmp_roberta_regression",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [34]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [35]:
trainer.train()

Epoch,Training Loss,Validation Loss,Spearman,Mae
1,1.397,1.17695,0.297987,0.945746
2,1.0077,0.97918,0.475185,0.821847
3,0.7618,1.114021,0.604398,0.824782
4,0.5629,0.917337,0.622885,0.738365
5,0.4544,0.901285,0.658927,0.718928
6,0.3479,0.905752,0.658384,0.718758
7,0.2419,0.862233,0.672494,0.70668
8,0.2329,0.886104,0.675412,0.707214
9,0.1987,0.816339,0.676194,0.677677
10,0.1683,0.874953,0.677571,0.701018


TrainOutput(global_step=1540, training_loss=0.6079686257746313, metrics={'train_runtime': 86.0657, 'train_samples_per_second': 285.48, 'train_steps_per_second': 17.893, 'total_flos': 3232290293406720.0, 'train_loss': 0.6079686257746313, 'epoch': 10.0})

In [36]:
preds = trainer.predict(dev_dataset).predictions.reshape(-1)
dev_labels = dev_df["label"].to_numpy()

global_spearman = spearmanr(dev_labels, preds).correlation
mae = np.mean(np.abs(dev_labels - preds))

print(f"\n=== DEV EVALUATION ===")
print(f"Spearman: {global_spearman:.4f}")
print(f"MAE:      {mae:.4f}")


=== DEV EVALUATION ===
Spearman: 0.4280
MAE:      0.9774


In [None]:
def bootstrap_test_metrics(
    y_true,
    y_pred,
    y_stdev,
    groups,
    n_bootstrap=1000,
    seed=42
):
    """Quantify metric uncertainty with simple percentile bootstrapping."""
    rng = np.random.default_rng(seed)
    n = len(y_true)

    global_sps = []
    macro_sps = []
    maes = []
    acc_stdevs = []

    group_to_indices = defaultdict(list)
    for i, gid in enumerate(groups):
        group_to_indices[gid].append(i)

    for _ in range(n_bootstrap):
        # Draw n examples with replacement to simulate alternate test sets.
        idx = rng.integers(0, n, size=n)

        bt_true = y_true[idx]
        bt_pred = y_pred[idx]
        bt_stdev = y_stdev[idx]
        bt_groups = [groups[i] for i in idx]

        # ---- Global Spearman ----
        rho = spearmanr(bt_true, bt_pred).correlation
        global_sps.append(rho)

        # ---- MAE ----
        maes.append(np.mean(np.abs(bt_true - bt_pred)))

        # ---- Accuracy within stdev ----
        acc_stdevs.append(np.mean(np.abs(bt_true - bt_pred) <= bt_stdev))

        # ---- Macro Spearman ----
        local_group_map = defaultdict(list)
        for i, g in enumerate(bt_groups):
            local_group_map[g].append(i)

        group_corrs = []
        for g, idxs in local_group_map.items():
            gt = bt_true[idxs]
            gp = bt_pred[idxs]

            if np.all(gt == gt[0]):
                continue

            c = spearmanr(gt, gp).correlation
            if not np.isnan(c):
                group_corrs.append(c)

        if group_corrs:
            macro_sps.append(np.mean(group_corrs))
        else:
            macro_sps.append(np.nan)

    def summarize(arr):
        # Retain the mean plus the 2.5/97.5 percentiles as the CI bounds.
        arr = np.array(arr, dtype=float)
        return {
            "mean": float(np.nanmean(arr)),
            "ci_low": float(np.nanpercentile(arr, 2.5)),
            "ci_high": float(np.nanpercentile(arr, 97.5)),
        }

    return {
        "global_spearman": summarize(global_sps),
        "macro_spearman": summarize(macro_sps),
        "mae": summarize(maes),
        "acc_within_stdev": summarize(acc_stdevs),
    }


In [38]:
print("\nBOOTSTRAP RESULTS (TEST SET)")

bootstrap_results = bootstrap_test_metrics(
    y_true=dev_df["label"].to_numpy(),
    y_pred=preds,
    y_stdev=dev_df["stdev"].to_numpy(),
    groups=dev_df["group_id"].tolist(),
    n_bootstrap=1000
)


for metric, stats in bootstrap_results.items():
    print(
        f"{metric:20s}: "
        f"{stats['mean']:.4f} "
        f"[{stats['ci_low']:.4f}, {stats['ci_high']:.4f}]"
    )



BOOTSTRAP RESULTS (TEST SET)


  c = spearmanr(gt, gp).correlation


global_spearman     : 0.4282 [0.3569, 0.4964]
macro_spearman      : 0.4109 [0.3202, 0.5078]
mae                 : 0.9776 [0.9198, 1.0379]
acc_within_stdev    : 0.5482 [0.5068, 0.5901]
