In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.stats.proportion as prp


In [2]:
def load_blimp_data(path: Path) -> dict:
    """Load a BLiMP results JSON file and return its BLiMP section.

    Args:
        path: Location of the JSON results file.

    Returns:
        The value stored under the top-level ``"blimp"`` key when that key is
        present; otherwise the parsed JSON document itself.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with path.open(encoding="utf-8") as f:
        data = json.load(f)
    # Some result files nest the BLiMP metrics under a "blimp" key; unwrap it.
    if "blimp" in data:
        data = data["blimp"]
    return data


def get_uids(data: dict):
    """Return the values of ``data["by_uid"]`` as a NumPy array, in dict order.

    Despite the name, this collects the mapping's values, not its UID keys.
    """
    per_uid = data["by_uid"]
    return np.array([value for value in per_uid.values()])

In [3]:
# Load all BLiMP data -- one row per UID for every (scale, model) pair.

# Step number embedded in the result file names (step_{STEP}.json).
STEP = 20_000

# Model labels. _get_data pairs them positionally with the loaded files:
# the blimp_results_* file first, the relora-*-eval file second.
# Alternative LaTeX display labels, currently disabled:
# models = [rf"\texttt{{pico-{t}}}" for t in ("decoder", "relora")]
MODELS = ["decoder", "relora"]
# Scale names substituted into the result directory names.
SCALES = ["tiny", "small"]


def _get_data():
    """Yield one ``(scale, model, phenomenon, uid, score)`` tuple per UID entry.

    For every scale in ``SCALES`` the two result files for ``STEP`` are
    loaded, paired positionally with ``MODELS``, and each of their
    ``"by_uid"`` entries is emitted. The UID is yielded in both the
    phenomenon and the UID position, so the two fields are identical.
    """
    for scale in SCALES:
        result_paths = (
            Path(f"blimp_results_{scale}/step_{STEP}.json"),
            Path(f"relora-{scale}-eval/step_{STEP}.json"),
        )
        # Load both files before yielding anything for this scale, so a
        # missing file fails before any of the scale's rows are produced.
        per_model = [load_blimp_data(p) for p in result_paths]
        for model_name, results in zip(MODELS, per_model):
            yield from (
                (scale, model_name, uid, uid, score)
                for uid, score in results["by_uid"].items()
            )


# Long-format table with one row per (scale, model, UID) entry.
# "Phenomenon" currently holds the same value as "UID".
data = pd.DataFrame(
    _get_data(),
    columns=["Scale", "Model", "Phenomenon", "UID", "Score"],
)

In [4]:
# Per (scale, model): the mean of the per-UID scores and how many UIDs were averaged.
grouped = (
    data.groupby(["Scale", "Model"])
    .agg(
        prop=pd.NamedAgg(column="Score", aggfunc="mean"),
        nobs=pd.NamedAgg(column="Score", aggfunc="size"),
    )
    .reset_index()
)
grouped

Unnamed: 0,Scale,Model,prop,nobs
0,small,decoder,0.701448,67
1,small,relora,0.630791,67
2,tiny,decoder,0.650388,67
3,tiny,relora,0.623642,67


In [19]:
# Number of test items assumed to lie behind each per-UID score.
# NOTE(review): BLiMP paradigms ship 1,000 minimal pairs each -- confirm the
# evaluation used all of them, since this directly scales the sample size
# (and therefore the p-values) below.
PAIRS_PER_UID = 1000

# One-sided two-sample proportions z-test per scale. With
# alternative="larger" the alternative hypothesis is that the first model in
# MODELS has a higher pooled proportion than the second.
by_scale_model = grouped.set_index(["Scale", "Model"])
for scale in SCALES:
    counts = []
    nobs = []
    for model in MODELS:
        # Indexed lookup: a missing (scale, model) pair raises a KeyError
        # naming the pair instead of an opaque positional IndexError.
        n = float(by_scale_model.loc[(scale, model), "nobs"]) * PAIRS_PER_UID
        # Mean per-UID score times total item count = pooled success count
        # (valid because every UID is weighted by the same item count).
        counts.append(float(by_scale_model.loc[(scale, model), "prop"]) * n)
        nobs.append(n)
    stat, pval = prp.proportions_ztest(counts, nobs, alternative="larger")
    print(f"{scale}: {pval}")

tiny: 1.2123554040626134e-24
small: 7.43060911688948e-166
