In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from matplotlib import rc
from adjustText import adjust_text

# Example input (JSON entry in model_performance_record_list.json)
# [
#   {"Model": "gpt-3.5-turbo", "Language": "en", "Score": 97.8, "Dataset": "xcopa", "Metric": "accuracy"},
#   ...
# ]

# df = pd.read_json("model_performance_record_list.json")

# Load the score records, drop the rows whose Language is the "avg"
# pseudo-language, and derive one "Setup" label per (Dataset, Metric) pair.
df = pd.read_json("./model_performance_record_list.json")
df = df.loc[df["Language"] != "avg"]
df = df.assign(Setup=df["Dataset"] + "_" + df["Metric"])

# The grouping/factor columns are stored as categoricals; later code relies on
# ``df["Language"].cat.categories``.
df = df.astype({col: "category" for col in ("Language", "Setup", "Model")})

# Mixed-effects model of Score: categorical fixed effects for Language and
# Setup, observations grouped by Model with a random intercept per group
# (re_formula="~1").
md = smf.mixedlm("Score ~ C(Language) + C(Setup)", df, groups=df["Model"], re_formula="~1")
# reml=False requests a plain maximum-likelihood fit instead of REML.
mdf = md.fit(method="lbfgs", reml=False)

# Pull the fitted parameters apart by name: the intercept, and every parameter
# whose name contains "C(Language)" / "C(Setup)".
intercept = mdf.params["Intercept"]
language_effect = mdf.params.filter(like="C(Language)").to_dict()
setup_effect = mdf.params.filter(like="C(Setup)").to_dict()

# When no C(Setup) parameter was fitted (e.g. the data holds a single setup)
# the dict is empty and np.mean([]) would yield NaN with a RuntimeWarning;
# the setup contribution is simply zero in that case.
# NOTE(review): this averages only the setups that have a fitted coefficient.
# A level without one (presumably the reference level, which beta_setup maps
# to 0.0) is not counted in the mean -- confirm that this is intended.
mean_setup_effect = np.mean(list(setup_effect.values())) if setup_effect else 0.0


def beta_lang(lang):
    """Return the fitted Language coefficient for ``lang``.

    Looks up ``C(Language)[T.<lang>]`` in ``language_effect``; a level with
    no such entry contributes 0.0.
    """
    return language_effect.get(f"C(Language)[T.{lang}]", 0.0)


def beta_setup(setup):
    """Return the fitted Setup coefficient for ``setup``.

    Looks up ``C(Setup)[T.<setup>]`` in ``setup_effect``; a level with no
    such entry contributes 0.0.
    """
    return setup_effect.get(f"C(Setup)[T.{setup}]", 0.0)

# Model-predicted score ("potential") for every (Language, Setup) pair that
# occurs in the data: intercept + language coefficient + setup coefficient.
lang_setup_potential = {
    (lang, stp): intercept + beta_lang(lang) + beta_setup(stp)
    for (lang, stp) in df[["Language", "Setup"]].drop_duplicates().itertuples(index=False)
}
# Attach the matching potential to each record. Zipping the two columns does
# the same dictionary lookup as a row-wise DataFrame.apply without building a
# Series object for every row.
df["Potential"] = [
    lang_setup_potential[pair] for pair in zip(df["Language"], df["Setup"])
]

# Per-language potential: intercept + language coefficient + the mean setup
# effect, for every Language category, listed from highest to lowest.
language_potential = {
    lang: intercept + beta_lang(lang) + mean_setup_effect
    for lang in df["Language"].cat.categories
}
language_df = (
    pd.DataFrame(language_potential.items(), columns=["Language", "Potential"])
      .sort_values("Potential", ascending=False)
)

# PRR: the observed score divided by the potential of its (Language, Setup).
df["PRR"] = df["Score"].astype(float) / df["Potential"]

# Per-model summary of PRR: mean, standard deviation, and their ratio
# (coefficient of variation, std / mean).
# "Model" is a categorical column, so observed=True is passed explicitly: only
# models that actually occur are aggregated, and recent pandas versions do not
# emit a FutureWarning about the changing default.
model_evaluation = (
    df.groupby("Model", observed=True)["PRR"]
    .agg(mean_prr="mean", std_prr="std")
    .assign(cv_prr=lambda stats: stats["std_prr"] / stats["mean_prr"])
    .reset_index()
)

print("Language Potential")
print(language_df.to_string(index=False))

print("Model-level Evaluation")
print(model_evaluation.to_string(index=False))


In [None]:
# Global matplotlib styling: serif Palatino text at 18 pt rendered through
# LaTeX, capped error bars, and an 8x6 default figure size.
rc('font', family='serif', serif=['Palatino'], size=18)
rc('text', usetex=True)
mpl.rcParams['errorbar.capsize'] = 6
mpl.rcParams['figure.figsize'] = [8, 6]

# From here on ``df`` refers to the per-model summary table, replacing the
# record-level frame that was bound to the same name earlier.
df = pd.DataFrame(model_evaluation)


# Models grouped under the 'LLM with ICL' label.
llm_models = [
    'BLOOMZ',
    'text-davinci-003',
    'text-davinci-003 (TT)',
    'gpt-3.5-turbo',
    'gpt-3.5-turbo (TT)',
    'gpt-4-32k',
    'gpt-4-32k (TT)',
]
# Models grouped under the 'Fine-tuned' label.
finetuned_models = [
    'MuRIL',
    'TuLRv6 - XXL',
    'XGLM',
    'XLM-R Large',
    'mBERT',
    'mT5-Base',
]


def get_category(model_name):
    """Return the category label for ``model_name``.

    The name is matched exactly against ``llm_models`` first, then against
    ``finetuned_models``; a name found in neither list is labelled 'Other'.
    """
    labelled_groups = (
        ('LLM with ICL', llm_models),
        ('Fine-tuned', finetuned_models),
    )
    for label, members in labelled_groups:
        if model_name in members:
            return label
    return 'Other'

# Scatter plot of mean PRR (x) against CV of PRR (y), one point per model,
# coloured and shaped by model category.
df['Category'] = df['Model'].apply(get_category)

# get_category can return 'Other' for a model in neither list, so every
# lookup table needs an entry for it; otherwise the plotting loop raises
# KeyError as soon as an unlisted model appears in the data.
colorblind = sns.color_palette("colorblind")
palette = {
    'LLM with ICL': colorblind[0],
    'Fine-tuned': colorblind[1],
    'Other': colorblind[2],
}
markers = {'LLM with ICL': 'o', 'Fine-tuned': '^', 'Other': 's'}

fig, ax = plt.subplots()

# One scatter call per category so that each gets its own legend entry.
for category, group in df.groupby('Category'):
    ax.scatter(
        group["mean_prr"],
        group["cv_prr"],
        color=palette[category],
        marker=markers[category],
        edgecolor='black',
        label=category
    )

# Label every point with its model name in small caps.
# NOTE: the name is inserted into LaTeX markup unescaped; with usetex enabled
# a model name containing characters such as '_', '%' or '&' would need
# escaping first.
texts = [
    ax.text(row.mean_prr, row.cv_prr, r'\textsc{%s}' % row.Model, fontsize=16)
    for row in df.itertuples(index=False)
]

# Let adjustText spread overlapping labels, joining moved labels to their
# points with thin gray lines.
adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle='-', color='gray'))

# Hand-tuned nudge of the seventh label (row position 6 of df) after the
# automatic layout. It is skipped when fewer than seven models are plotted,
# so a smaller dataset no longer fails with IndexError.
if len(texts) > 6:
    old_x, old_y = texts[6].get_position()
    texts[6].set_position((old_x - 0.25, old_y + 0.025))

ax.set_xlabel(r'\textbf{Mean-PRR (Performance Realisation Ratio)}', fontsize=18)
ax.set_ylabel(r'\textbf{CV-PRR (Language Disparity)}', fontsize=18)
ax.grid(True, linestyle='--')
ax.tick_params(axis='both', which='major', labelsize=16)
ax.legend(title="Model Type", fontsize=14, title_fontsize=15, loc='upper right')
plt.tight_layout()

plt.show()