# In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Example input: model_performance_record_list.json is a JSON array of records shaped like
# [
#   {"Model": "gpt-3.5-turbo", "Language": "en", "Score": 97.8, "Dataset": "xcopa", "Metric": "accuracy"},
#   ...
# ]

# df = pd.read_json("model_performance_record_list.json")

# Load the per-(model, language, dataset, metric) score records.
df = pd.read_json("./model_performance_record_list.json")
# Drop rows whose Language is "avg" (presumably pre-aggregated averages rather
# than real languages -- confirm against the data source).  The explicit
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df = df[df["Language"] != "avg"].copy()

# A "setup" is one dataset evaluated with one metric.
df["Setup"] = df["Dataset"] + "_" + df["Metric"]
# Categorical dtype for the factor columns; the categories are taken after
# filtering, so only levels that actually occur in the data are kept.
for col in ["Language", "Setup", "Model"]:
    df[col] = df[col].astype("category")

# Linear mixed-effects model: Score explained by fixed effects for language and
# setup, with a random intercept (re_formula="~1") for each model group.
md = smf.mixedlm(
    formula="Score ~ C(Language) + C(Setup)",
    data=df,
    groups=df["Model"],
    re_formula="~1",
)
# reml=False requests a plain maximum-likelihood fit, optimised with L-BFGS.
mdf = md.fit(reml=False, method="lbfgs")

# Pull the fitted fixed effects out of the parameter vector.  The two dicts map
# coefficient names of the form "C(<factor>)[T.<level>]" to their estimates.
intercept = mdf.params["Intercept"]
language_effect = mdf.params.filter(like="C(Language)").to_dict()
setup_effect = mdf.params.filter(like="C(Setup)").to_dict()
# Average setup effect over ALL setups.  With the formula's default treatment
# coding the reference setup has no coefficient of its own (beta_setup falls
# back to 0.0 for it), so its effect of 0.0 is added explicitly.  Averaging
# only the non-reference coefficients would make the result depend on which
# setup happens to be the reference and would yield NaN when the data contains
# a single setup (np.mean of an empty list).
mean_setup_effect = np.mean([0.0, *setup_effect.values()])

def beta_lang(lang):
    """Return the fitted language coefficient for *lang*.

    Falls back to 0.0 when ``language_effect`` has no entry for the level
    (e.g. a level without its own ``[T.<level>]`` coefficient).
    """
    coef_name = f"C(Language)[T.{lang}]"
    if coef_name in language_effect:
        return language_effect[coef_name]
    return 0.0


def beta_setup(setup):
    """Return the fitted setup coefficient for *setup*.

    Falls back to 0.0 when ``setup_effect`` has no entry for the level
    (e.g. a level without its own ``[T.<level>]`` coefficient).
    """
    coef_name = f"C(Setup)[T.{setup}]"
    if coef_name in setup_effect:
        return setup_effect[coef_name]
    return 0.0

# Model-predicted ("potential") score for every (language, setup) pair that
# occurs in the data: intercept plus the two fixed effects.  dict.fromkeys
# de-duplicates the pairs while keeping first-seen order.
lang_setup_potential = {
    (language, setup): intercept + beta_lang(language) + beta_setup(setup)
    for language, setup in dict.fromkeys(zip(df["Language"], df["Setup"]))
}
# Attach each row's potential by looking up its own (language, setup) pair.
df["Potential"] = [
    lang_setup_potential[pair] for pair in zip(df["Language"], df["Setup"])
]

# Per-language potential: the setup-specific effect is replaced by the mean
# setup effect.
language_potential = dict(
    (language, intercept + beta_lang(language) + mean_setup_effect)
    for language in df["Language"].cat.categories
)
# Tabulate the languages, highest potential first.
language_df = pd.DataFrame(
    {
        "Language": list(language_potential.keys()),
        "Potential": list(language_potential.values()),
    }
).sort_values(by="Potential", ascending=False)

# PRR: each observed score divided by the model-predicted potential for the
# same (language, setup) pair.
df["PRR"] = df["Score"].astype(float) / df["Potential"]

# Per-model summary of PRR: mean, standard deviation and their ratio
# (coefficient of variation).  "Model" is categorical, so observed=True is
# passed explicitly: it restricts the groups to models present in the data,
# gives the same result on every pandas version, and silences the
# FutureWarning about the changing default.
model_evaluation = (
    df.groupby("Model", observed=True)["PRR"]
    .agg(mean_prr="mean", std_prr="std")
    .assign(cv_prr=lambda x: x["std_prr"] / x["mean_prr"])
    .reset_index()
)

# Report both tables, each preceded by its title on a separate line.
print("Language Potential", language_df.to_string(index=False), sep="\n")

print("Model-level Evaluation", model_evaluation.to_string(index=False), sep="\n")
