In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json
import random
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from collections import defaultdict

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)


In [None]:
def seed_everything(seed=42):
    """Seed Python, NumPy, Torch, and HF so results are reproducible."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    set_seed(seed)

seed_everything(42)

In [4]:
TRAIN_JSON_PATH = "/content/drive/MyDrive/nlp/train.json"
DEV_JSON_PATH   = "/content/drive/MyDrive/nlp/dev.json"
SYN_JSON_PATH   = "/content/drive/MyDrive/nlp/merged_updated.json"
MODEL_NAME = "google-bert/bert-base-uncased"

In [None]:
def load_json_records(path, key=None):
    """Load a JSON file and optionally drill into a nested mapping."""
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if key:
        data = data[key]
    return list(data.values()) if isinstance(data, dict) else data

In [None]:
def build_examples(records):
    """Convert raw annotation dicts into model-ready context/meaning pairs."""
    # Normalize: flatten nested lists so every item is a dict
    def flatten(xs):
        for x in xs:
            if isinstance(x, list):
                yield from flatten(x)
            else:
                yield x

    # If a single dict is passed
    if isinstance(records, dict):
        records = [records]

    # Flatten any nested structure
    records = list(flatten(records))

    out = []

    for r in records:
        # Skip items that are not dicts (extra safety)
        if not isinstance(r, dict):
            continue

        pre = (r.get("precontext") or "").strip()
        sent = (r.get("sentence") or "").strip()
        end = (r.get("ending") or "").strip()
        meaning = (r.get("judged_meaning") or "").strip()

        meaning_txt = (
            "Meaning (intended definition):\n"
            f"{meaning}"
        )

        # Merge the narrative fragments into a single multi-line prompt.
        story_parts = [pre, sent]
        if end:
            story_parts.append(end)

        story_txt = "Story:\n" + "\n".join(story_parts)

        ex_sent = (r.get("example_sentence") or "").strip()
        if ex_sent:
            story_txt += f"\nExample sentence: {ex_sent}"

        label = float(r.get("average", 0.0))

        gid = f"{r.get('homonym', '')}||{sent}"

        # Package both supervision signal and diagnostics for later eval.
        out.append({
            "context": story_txt,
            "meaning": meaning_txt,
            "label": label,
            "stdev": float(r.get("stdev", 0)),
            "group_id": gid,
            "choices": r.get("choices"),
            "nonsensical": r.get("nonsensical"),
            "sample_id": r.get("sample_id")
        })

    return out


In [None]:
# Load each split and normalize the nested JSON structure into DataFrames.
train_records = load_json_records(TRAIN_JSON_PATH)
dev_records   = load_json_records(DEV_JSON_PATH)
syn_records   = load_json_records(SYN_JSON_PATH)
train_df = pd.DataFrame(build_examples(train_records))
dev_df   = pd.DataFrame(build_examples(dev_records))
syn_df   = pd.DataFrame(build_examples(syn_records))
print("TRAIN:", len(train_df))
print("DEV:", len(dev_df))
print("SYN:", len(syn_df))


TRAIN: 2280
DEV: 588
SYN: 792


In [8]:
print(train_df.head())

                                             context  \
0  Story:\nThe old machine hummed in the corner o...   
1  Story:\nThe old machine hummed in the corner o...   
2  Story:\nThe old machine hummed in the corner o...   
3  Story:\nThe old machine hummed in the corner o...   
4  Story:\nThe old machine hummed in the corner o...   

                                             meaning  label     stdev  \
0  Meaning (intended definition):\nthe difference...    3.0  1.581139   
1  Meaning (intended definition):\nthe inherent c...    3.8  0.836660   
2  Meaning (intended definition):\nthe difference...    2.2  1.303840   
3  Meaning (intended definition):\nthe inherent c...    4.4  0.894427   
4  Meaning (intended definition):\nthe difference...    2.6  1.516575   

                                         group_id          choices  \
0  potential||The potential couldn't be measured.  [4, 5, 2, 3, 1]   
1  potential||The potential couldn't be measured.  [5, 3, 4, 4, 3]   
2  potential||

In [None]:

# Concatenating the real and synthetic data before the downstream split.
train_df = pd.concat([train_df, syn_df])
print("TRAIN:", len(train_df))

TRAIN: 3072


In [None]:
# Reserve 20% of the combined corpus for live validation during training.
train_df, val_df = train_test_split(
    train_df, test_size=0.20, random_state=42
)

print("TRAIN:", len(train_df), "VAL:", len(val_df))


TRAIN: 2457 VAL: 615


In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    """Tokenize paired inputs and thread the regression label through."""
    enc = tokenizer(
        batch["context"],
        batch["meaning"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    enc["labels"] = batch["label"]
    return enc


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Pre-tokenize each split once so the Trainer can stream batches efficiently.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False).map(tokenize_batch, batched=True)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False).map(tokenize_batch, batched=True)
dev_dataset   = Dataset.from_pandas(dev_df, preserve_index=False).map(tokenize_batch, batched=True)

Map:   0%|          | 0/2457 [00:00<?, ? examples/s]

Map:   0%|          | 0/615 [00:00<?, ? examples/s]

Map:   0%|          | 0/588 [00:00<?, ? examples/s]

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    problem_type="regression"
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Report both rank correlation and absolute error for dev evals."""
    preds, labels = eval_pred
    preds = preds.reshape(-1)

    sp = spearmanr(labels, preds).correlation
    mae = np.mean(np.abs(labels - preds))

    return {"spearman": float(sp), "mae": float(mae)}


In [None]:
# hyperparameters for finetuning base BERT on Colab GPUs.
 args = TrainingArguments(
    output_dir="./tmp_bert_regression",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="no",   # you requested: no saving
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [16]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Spearman,Mae
1,1.3636,1.467813,0.066362,1.02651
2,1.1073,1.147097,0.408946,0.87796
3,0.9333,1.029117,0.49906,0.80614
4,0.6273,0.98705,0.541443,0.793409
5,0.5438,1.003773,0.570174,0.773804
6,0.3941,1.008755,0.576878,0.776462
7,0.3144,0.996198,0.584073,0.76737
8,0.2508,1.003325,0.5918,0.767258
9,0.222,0.963182,0.597413,0.758425
10,0.1684,0.994619,0.599055,0.763942


TrainOutput(global_step=1540, training_loss=0.6510652282021262, metrics={'train_runtime': 92.5815, 'train_samples_per_second': 265.388, 'train_steps_per_second': 16.634, 'total_flos': 3232290293406720.0, 'train_loss': 0.6510652282021262, 'epoch': 10.0})

In [18]:
preds = trainer.predict(dev_dataset).predictions.reshape(-1)
dev_labels = dev_df["label"].to_numpy()

global_spearman = spearmanr(dev_labels, preds).correlation
mae = np.mean(np.abs(dev_labels - preds))

print(f"\n=== DEV EVALUATION ===")
print(f"Spearman: {global_spearman:.4f}")
print(f"MAE:      {mae:.4f}")


=== DEV EVALUATION ===
Spearman: 0.3355
MAE:      1.0143


In [None]:
def bootstrap_test_metrics(
    y_true,
    y_pred,
    y_stdev,
    groups,
    n_bootstrap=1000,
    seed=42
):
    """Bootstrap confidence intervals for each metric to gauge stability."""
    rng = np.random.default_rng(seed)
    n = len(y_true)

    global_sps = []
    macro_sps = []
    maes = []
    acc_stdevs = []

    group_to_indices = defaultdict(list)
    for i, gid in enumerate(groups):
        group_to_indices[gid].append(i)

    for _ in range(n_bootstrap):
        # Sample indices with replacement to simulate new draws.
        idx = rng.integers(0, n, size=n)

        bt_true = y_true[idx]
        bt_pred = y_pred[idx]
        bt_stdev = y_stdev[idx]
        bt_groups = [groups[i] for i in idx]

        # ---- Global Spearman ----
        rho = spearmanr(bt_true, bt_pred).correlation
        global_sps.append(rho)

        # ---- MAE ----
        maes.append(np.mean(np.abs(bt_true - bt_pred)))

        # ---- Accuracy within stdev ----
        acc_stdevs.append(np.mean(np.abs(bt_true - bt_pred) <= bt_stdev))

        # ---- Macro Spearman ----
        local_group_map = defaultdict(list)
        for i, g in enumerate(bt_groups):
            local_group_map[g].append(i)

        group_corrs = []
        for g, idxs in local_group_map.items():
            gt = bt_true[idxs]
            gp = bt_pred[idxs]

            if np.all(gt == gt[0]):
                continue

            c = spearmanr(gt, gp).correlation
            if not np.isnan(c):
                group_corrs.append(c)

        if group_corrs:
            macro_sps.append(np.mean(group_corrs))
        else:
            macro_sps.append(np.nan)

    def summarize(arr):
        # Package mean estimates and 95% percentile-based confidence intervals.
        arr = np.array(arr, dtype=float)
        return {
            "mean": float(np.nanmean(arr)),
            "ci_low": float(np.nanpercentile(arr, 2.5)),
            "ci_high": float(np.nanpercentile(arr, 97.5)),
        }

    return {
        "global_spearman": summarize(global_sps),
        "macro_spearman": summarize(macro_sps),
        "mae": summarize(maes),
        "acc_within_stdev": summarize(acc_stdevs),
    }


In [21]:
print("\nBOOTSTRAP RESULTS (TEST SET)")

bootstrap_results = bootstrap_test_metrics(
    y_true=dev_df["label"].to_numpy(),
    y_pred=preds,
    y_stdev=dev_df["stdev"].to_numpy(),
    groups=dev_df["group_id"].tolist(),
    n_bootstrap=1000
)


for metric, stats in bootstrap_results.items():
    print(
        f"{metric:20s}: "
        f"{stats['mean']:.4f} "
        f"[{stats['ci_low']:.4f}, {stats['ci_high']:.4f}]"
    )



BOOTSTRAP RESULTS (TEST SET)


  c = spearmanr(gt, gp).correlation


global_spearman     : 0.3366 [0.2629, 0.4040]
macro_spearman      : 0.3379 [0.2403, 0.4265]
mae                 : 1.0145 [0.9521, 1.0745]
acc_within_stdev    : 0.5502 [0.5085, 0.5918]
