In [None]:
# Move the kernel's working directory one level up so that the relative
# "data/..." paths used by later cells resolve from there.
# NOTE(review): presumably the notebook lives one directory below the project
# root — confirm. The change is relative, so running this cell a second time in
# the same kernel moves up one more level; restart the kernel before re-running.
%cd ..

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd

# Apply matplotlib's "ggplot" style sheet to every figure created after this point.
plt.style.use("ggplot")

In [None]:
# Load the benchmark results; the file is JSON Lines (one JSON record per line).
results_path = Path("data") / "raw" / "euroeval_benchmark_results.jsonl"
df = pd.read_json(path_or_buf=results_path, lines=True)

# Preview the first rows to check the load.
df.head()

In [None]:
# Flatten the nested benchmark records into one tidy row per result.
#
# `dataset_languages` holds a list of language codes. A single-language entry is
# reduced to that code; a multi-language entry is joined into one string label.
# Keeping the raw list (as before) would put an unhashable value in the column
# and make the later `groupby("language")` / `set_index("language")` calls fail.
df["language"] = df.dataset_languages.map(
    lambda languages: languages[0] if len(languages) == 1 else ", ".join(languages)
)

# Pull the aggregate test F1 and its standard error out of the nested results dict.
df["score"] = df.results.map(lambda result: result["total"]["test_f1"])
df["standard_error"] = df.results.map(lambda result: result["total"]["test_f1_se"])

# Keep only the columns used by the rest of the notebook.
# NOTE: this rebinds `df` to the narrowed frame, so this cell can only be run once
# per load; re-run the loading cell first if it needs to be executed again.
df = df[["model", "language", "score", "standard_error"]]
df.head()

In [None]:
# Average score per language, taken over every row of that language.
mean_scores_by_lang = df.groupby("language").score.mean()

# Rank languages from highest to lowest mean score.
mean_scores_by_lang_sorted = mean_scores_by_lang.sort_values(ascending=False)

# The ranked language names define the bar order on the x-axis of every subplot.
sorted_languages = mean_scores_by_lang_sorted.index.to_list()

print("\nLanguages sorted by mean performance:", sorted_languages, sep="\n")

In [None]:
# --- Plotting Code ---
# One bar-chart panel per model. Bars follow `sorted_languages` in every panel,
# so the same x position always refers to the same language.

models = df["model"].unique()
n_languages = len(sorted_languages)
x_pos = np.arange(n_languages)  # identical for every panel, so computed once

# --- Create the Subplot Grid ---
# The number of rows follows the number of models, so no model is silently
# dropped when there are more than one grid's worth. `squeeze=False` keeps
# `axes` two-dimensional even for a single row, so `axes[0, 0]` is always valid.
ncols = 3
nrows = max(1, int(np.ceil(len(models) / ncols)))
fig, axes = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    figsize=(15, 4 * nrows),
    sharey=True,
    squeeze=False,
)

# --- Loop and Plot ---
for ax, model in zip(axes.flat, models):
    model_data = df[df["model"] == model]

    # Average any repeated (model, language) rows, then align to the shared
    # language order. `reindex` yields NaN (an empty bar slot) for a language
    # this model has no result for, instead of raising KeyError.
    scores = model_data.groupby("language")["score"].mean().reindex(sorted_languages)

    ax.bar(x_pos, scores, align="center", alpha=0.8)

    # --- Customize each subplot ---
    ax.set_title(model, fontsize=14)
    ax.set_xticks(x_pos)
    ax.set_xticklabels([])  # language names are intentionally not drawn
    ax.tick_params(axis="x", length=0)
    ax.grid(axis="y", linestyle="--", alpha=0.7)

    if ax.get_subplotspec().is_first_col():
        ax.set_ylabel("F1-score", fontsize=12)

# Hide grid cells that have no model assigned to them.
for unused_ax in axes.ravel()[len(models):]:
    unused_ax.set_visible(False)

# --- Set Y-Axis Limits and Format for ALL Subplots ---
# The y-axis is shared (sharey=True), so setting limits and formatter on one
# axes applies to every panel; no reliance on pyplot's implicit current axes.
axes[0, 0].set_ylim(0, 100)
formatter = mticker.PercentFormatter(xmax=100)
axes[0, 0].yaxis.set_major_formatter(formatter)

# Add an overarching title
fig.suptitle("Model Performance (Languages Sorted by Mean Score)", fontsize=18, y=1.02)

# Adjust layout
fig.tight_layout(rect=[0, 0, 1, 0.96])

In [None]:
# --- Save the Figure ---

# Destination of the rendered figure, relative to the current working directory.
output_path = Path("data", "final", "evaluation-plot.png")

# Ensure the destination folder exists; this is a no-op when it already does.
output_path.parent.mkdir(exist_ok=True, parents=True)

# Write the image at 300 dpi; the tight bounding box trims surrounding white
# space while still including the figure title.
fig.savefig(output_path, bbox_inches="tight", dpi=300)

# Render the figure in the notebook as well.
plt.show()

print(f"Plot saved successfully to: {output_path}")

In [None]:
# Mean score per model together with the standard error of that mean,
# listed from the best model to the worst.
#
# The ranking is done on the numeric mean *before* formatting. Sorting the
# formatted strings would be lexicographic, so e.g. "9.10 ± ..." would rank
# above "85.30 ± ..." and "100.00 ± ..." below both.
model_summary = (
    df.groupby("model")["score"]
    .agg(["mean", "sem"])  # sem: sample std (ddof=1) / sqrt(number of non-missing scores)
    .sort_values("mean", ascending=False)
)

# Format as "mean ± standard error"; the result is a Series indexed by model.
(
    model_summary["mean"].map("{:.2f}".format)
    + " ± "
    + model_summary["sem"].map("{:.2f}".format)
).rename("score")

In [None]:
# Print one LaTeX table per model: a language column and the F1-score rounded
# to a whole number. Models appear in order of first occurrence in `df`.
for model in df["model"].unique():
    model_df = (
        df[df["model"] == model]
        .drop(columns=["model", "standard_error"])
        .assign(score=lambda frame: frame["score"].map("{:.0f}".format))
        .rename(columns={"score": "F1-score"})
    )
    print(f"=== Table for {model} ===")
    print(model_df.to_latex(index=False), end="\n\n")