In [None]:
import glob
import json
import math
import os
import re
import statistics
from datetime import datetime
from multiprocessing import Pool
from typing import (
    Any,
    Optional,
    Sequence,
    TypeAlias,
    TypedDict,
)

import numpy as np
import pandas as pd
import yaml
from sklearn.metrics import balanced_accuracy_score
from tqdm import tqdm


class PerTaskScores(TypedDict):
    """Scores gathered for one task in one language, one entry per loaded run."""

    # One inner list of numeric item scores per run (empty when a run produced no values).
    scores: list[list[float]]
    # Number of score values collected for each run.
    length: list[int]
    # One inner list of item labels per run (empty when rows carry no "label" field).
    labels: list[list[Any]]


# Nested mapping: language code -> competency -> task name -> PerTaskScores.
TaskScoresByLang: TypeAlias = dict[str, dict[str, dict[str, PerTaskScores]]]

In [None]:
def flatten(matrix: list[list[Any]]) -> list[Any]:
    """Concatenate the rows of a 2D list into a single flat list.

    Args:
        matrix (list[list[Any]]): Rows to concatenate, in order.

    Returns:
        list[Any]: Every item of every row, preserving row and item order.
    """
    flat: list[Any] = []
    for row in matrix:
        flat.extend(row)
    return flat


def calculate_stderr(values: Sequence[float]) -> float:
    """Calculate the standard error of the mean.

    Args:
        values (Sequence[float]): Numeric values.

    Returns:
        float: Standard error of the mean, computed from the sample variance.
        Returns 0.0 when fewer than two values are given, because the sample
        variance is undefined in that case.
    """
    # statistics.variance raises StatisticsError for fewer than two data
    # points, so both the empty and the single-value case short-circuit.
    if len(values) < 2:
        return 0.0
    return math.sqrt(statistics.variance(values) / len(values))


def extract_datetime_from_filename(filename: str) -> Optional[datetime]:
    """Parse the first ISO-style timestamp embedded in a filename.

    Args:
        filename (str): Filepath or filename string to search.

    Returns:
        Optional[datetime]: The parsed timestamp, or None when no known
        pattern occurs in the string.
    """
    timestamp_regexes = (
        r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6})",  # e.g., 2025-06-22T00:10:41.690531
    )
    for regex in timestamp_regexes:
        found = re.search(regex, filename)
        if found is None:
            continue
        return datetime.fromisoformat(found.group(1))
    return None


def get_latest_file(file_list: Sequence[str]) -> Optional[str]:
    """Pick the path whose embedded timestamp is the most recent.

    Args:
        file_list (Sequence[str]): Candidate filepaths.

    Returns:
        Optional[str]: The path carrying the latest timestamp, or None when
        no path contains a recognisable timestamp.
    """
    stamped_paths: list[tuple[datetime, str]] = [
        (stamp, path)
        for stamp, path in (
            (extract_datetime_from_filename(path), path) for path in file_list
        )
        if stamp
    ]
    if not stamped_paths:
        return None
    _, newest_path = max(stamped_paths, key=lambda pair: pair[0])
    return newest_path

In [None]:
def get_individual_scores(
    folder: str,
    model: str,
    run: int,
    task: str,
    lang: str,
    aggregation_group: Optional[str],
) -> tuple[list[float], int, list[str]]:
    """Load per-question scores and labels for a single task-language-run.

    This reads `{model}/run_{run}/inferences/{lang}/{subfolder}/{model}_{task}_{lang}.jsonl`
    under `folder`, where `subfolder` is the aggregation group when one is
    given and the task name otherwise.

    Args:
        folder (str): Root results folder path.
        model (str): Model directory name.
        run (int): Run index.
        task (str): Task identifier.
        lang (str): Language code.
        aggregation_group (Optional[str]): Aggregation group overriding task subfolder.

    Returns:
        tuple[list[float], int, list[str]]: (values, count, labels) where values are numeric
        scores (missing/NaN entries become 0.0), count is the number of values, and labels
        are the per-row `label` fields when present. All three are empty when the file does
        not exist or contains no rows.
    """
    subfolder = aggregation_group if aggregation_group else task
    filepath = os.path.join(
        folder,
        model,
        f"run_{run}",
        "inferences",
        lang,
        subfolder,
        f"{model}_{task}_{lang}.jsonl",
    )

    values: list[float] = []
    labels: list[str] = []
    if not os.path.exists(filepath):
        return values, len(values), labels

    # Decode explicitly as UTF-8 rather than the platform default, and ignore
    # blank lines so a stray empty line does not break JSON parsing.
    with open(filepath, encoding="utf-8") as f:
        rows = [json.loads(line) for line in f if line.strip()]

    # An existing but empty file is treated like a missing one.
    if not rows:
        return values, len(values), labels

    # Inspect the first row to learn whether scores are keyed by a metric name.
    first_scores = rows[0]["individual_scores"]
    if isinstance(first_scores, dict):
        metric = list(first_scores.keys())[0]
    else:
        metric = None

    # HACK to omit number words subcategory in IF-Eval for thai and burmese
    skip_number_words = task == "if-eval" and lang in ["th", "my"]

    for individual_row in rows:
        if (
            skip_number_words
            and individual_row["metadata"]["subcategory"]
            == "length_constraints:number_words"
        ):
            print(f"Warning: Skipping if-eval subcategory for num_words in {filepath}")
            continue

        if metric:
            value = individual_row["individual_scores"][metric]
        else:
            value = individual_row["individual_scores"]

        # A row may carry a single score or a list of scores; missing entries count as 0.0.
        row_values = value if isinstance(value, list) else [value]
        values.extend(0.0 if pd.isna(v) else float(v) for v in row_values)

        if "label" in individual_row:
            # load labels for use in the balanced accuracy calculation
            labels.append(individual_row["label"])

    return values, len(values), labels


def load_config_files(
    folder: str,
    model: str,
    run: int,
) -> tuple[
    dict[str, Any], list[int], list[str], list[str], list[str], list[Optional[str]]
]:
    """Load the latest task configuration file and prepare it for the multiprocessing run.

    The config is picked from `{folder}/{model}/run_{run}/configs/*.yaml` by the
    most recent timestamp embedded in the filename. Each task is expanded into
    one entry per configured language so the returned lists are index-aligned.

    Args:
        folder (str): Root results folder path.
        model (str): Model directory name.
        run (int): Run index whose config to load.

    Returns:
        tuple[dict[str, Any], list[int], list[str], list[str], list[str], list[Optional[str]]]:
        (config, runs, tasks, langs, competencies, aggregation_groups) expanded per-language for this run.

    Raises:
        FileNotFoundError: If no timestamped YAML config exists for the run.
    """
    config_dir = os.path.join(folder, model, f"run_{run}", "configs")
    latest_file = get_latest_file(glob.glob(os.path.join(config_dir, "*.yaml")))
    if latest_file is None:
        # Fail with a clear message instead of the TypeError open(None) would raise.
        raise FileNotFoundError(f"No timestamped config file found in {config_dir}")

    # Use a context manager so the file handle is closed deterministically.
    with open(latest_file, "r", encoding="utf-8") as config_file:
        config: dict[str, Any] = yaml.safe_load(config_file)

    tasks: list[str] = []
    langs: list[str] = []
    competencies: list[str] = []
    aggregation_groups: list[Optional[str]] = []
    for task, task_config in config["tasks"].items():
        task_langs = list(task_config["languages"].keys())
        num_langs = len(task_langs)

        # Repeat the task-level attributes once per language to keep lists aligned.
        tasks.extend([task] * num_langs)
        langs.extend(task_langs)
        competencies.extend([task_config["competency"]] * num_langs)
        aggregation_groups.extend([task_config.get("aggregation_group")] * num_langs)

    runs = [run] * len(tasks)
    return config, runs, tasks, langs, competencies, aggregation_groups


def load_task_scores(
    folder: str,
    model: str,
    run_numbers: int,
    pool: Pool,
) -> tuple[list[dict[str, Any]], TaskScoresByLang]:
    """Load all per-task, per-language scores across multiple runs using a pool.

    Configs for runs `0..run_numbers-1` are loaded first; every
    (run, task, language) combination they declare is then read in parallel
    and grouped by language, competency and task.

    Args:
        folder (str): Root results folder path.
        model (str): Model directory name.
        run_numbers (int): Number of runs to aggregate (0..run_numbers-1).
        pool (Pool): Multiprocessing Pool for parallel I/O.

    Returns:
        tuple[list[dict[str, Any]], TaskScoresByLang]:
        configs: latest config per run; task_scores: nested lang -> competency -> task with scores/lengths/labels,
        each holding one entry per loaded (run, task, language) combination.
    """
    # One config load per run index, executed on the pool.
    per_run_arguments = pool.starmap(
        load_config_files,
        zip([folder] * run_numbers, [model] * run_numbers, range(run_numbers)),
    )

    # Concatenate the per-run, index-aligned lists into flat work lists.
    runs: list[int] = []
    tasks: list[str] = []
    langs: list[str] = []
    competencies: list[str] = []
    aggregation_groups: list[Optional[str]] = []
    configs: list[dict[str, Any]] = []
    for (
        config,
        run_list,
        task_list,
        lang_list,
        competency_list,
        group_list,
    ) in per_run_arguments:
        runs.extend(run_list)
        tasks.extend(task_list)
        langs.extend(lang_list)
        competencies.extend(competency_list)
        aggregation_groups.extend(group_list)
        configs.append(config)

    num_items = len(tasks)
    try:
        results = pool.starmap(
            get_individual_scores,
            zip(
                [folder] * num_items,
                [model] * num_items,
                runs,
                tasks,
                langs,
                aggregation_groups,
            ),
        )
    except Exception:
        # Name the phase that actually failed, then let the error propagate unchanged.
        print(f"Error loading individual scores for {model}")
        raise

    task_scores: TaskScoresByLang = {}
    for task, lang, competency, (values, count, labels) in zip(
        tasks, langs, competencies, results
    ):
        entry = (
            task_scores.setdefault(lang, {})
            .setdefault(competency, {})
            .setdefault(task, {"scores": [], "length": [], "labels": []})
        )
        entry["scores"].append(values)
        entry["length"].append(count)
        entry["labels"].append(labels)

    return configs, task_scores

In [None]:
def calculate_mean_and_stderr(
    scores: Sequence[float], prefix: str = ""
) -> dict[str, Any]:
    """Summarise a list of scores as mean, standard error, and the raw list.

    Args:
        scores (Sequence[float]): Numeric values, one per bootstrap/run.
        prefix (str): String prepended to every key (e.g., "sea_").

    Returns:
        dict[str, Any]: Mapping with keys `{prefix}mean`, `{prefix}stderr` and
        `{prefix}mean_list`; the last one holds `scores` itself.
    """
    mean_value = statistics.mean(scores)
    stderr_value = calculate_stderr(scores)

    summary: dict[str, Any] = {}
    summary[f"{prefix}mean"] = mean_value
    summary[f"{prefix}stderr"] = stderr_value
    summary[f"{prefix}mean_list"] = scores
    return summary


def calculate_balanced_accuracy(
    preds: Sequence[Sequence[Any]], labels: Sequence[Any]
) -> list[float]:
    """Score every row of predictions against the same labels.

    Args:
        preds (Sequence[Sequence[Any]]): Rows of predictions, each aligned to `labels`.
        labels (Sequence[Any]): Ground-truth labels shared by every row.

    Returns:
        list[float]: One `balanced_accuracy_score` result per row of `preds`, in order.
    """
    return [balanced_accuracy_score(labels, row) for row in preds]


def normalize_scores(scores: float, min_score: float, max_score: float) -> float:
    """Rescale a score so `min_score` maps to 0 and `max_score` maps to 1, floored at 0.

    Args:
        scores (float): Raw score.
        min_score (float): Minimum expected score (e.g., random chance).
        max_score (float): Maximum possible score.

    Returns:
        float: `(scores - min_score) / (max_score - min_score)`, or 0.0 when
        that value is negative. Values above 1 are not clamped, so the result
        lies in [0, 1] only when `scores <= max_score`.

    Raises:
        ZeroDivisionError: If `max_score` equals `min_score`.
    """
    # Floor with a float literal so the below-chance path returns 0.0 rather
    # than the int 0, keeping result lists homogeneous.
    return max((scores - min_score) / (max_score - min_score), 0.0)


def aggregate_scores(
    configs: Sequence[dict[str, Any]],
    data: TaskScoresByLang,
    n_bootstraps: int = 30,
    omit_competencies: Optional[list[str]] = None,
    omit_tasks: Optional[list[str]] = None,
    sea_languages: Sequence[str] = ("id", "vi", "th", "ta", "tl", "ms", "my"),
    seed: int = 94370244,
) -> tuple[dict[str, Any], list[tuple[str, str, str]]]:
    """Aggregate per-task and per-language results into competency and overall scores.

    Applies bootstrap sampling, balanced accuracy for multi-choice tasks, and random-chance
    normalization. Also computes SEA-language subset aggregates.

    The task list and per-task settings are taken from `configs[0]` only. The
    caller's `omit_tasks` / `omit_competencies` lists are never modified.

    Args:
        configs (Sequence[dict[str, Any]]): Per-run configuration objects (first referenced for schema).
        data (TaskScoresByLang): Collected scores by `lang -> competency -> task`.
        n_bootstraps (int): Number of bootstrap samples to compute.
        omit_competencies (Optional[list[str]]): Competency names to exclude; None means none.
        omit_tasks (Optional[list[str]]): Task names to exclude in addition to `run_args.skip_task`;
            None means none.
        sea_languages (Sequence[str]): Language codes included for SEA average.
        seed (int): RNG seed for reproducibility.

    Returns:
        tuple[dict[str, Any], list[tuple[str, str, str]]]:
        aggregated_scores with per-task/competency/language and overall stats, and
        incomplete_tasks as (lang, competency, task) for missing runs.
    """
    generator = np.random.default_rng(seed)
    aggregated_scores: dict[str, Any] = {lang: {} for lang in data.keys()}
    incomplete_tasks: list[tuple[str, str, str]] = []

    # Build fresh local collections instead of extending the argument in place:
    # mutating `omit_tasks` would leak one call's `skip_task` entries into
    # later calls that share the same list (or the same default).
    skipped_competencies = set(omit_competencies or ())
    skipped_tasks = set(omit_tasks or ()) | set(configs[0]["run_args"]["skip_task"])

    # Iterate through each task and its configuration and calculate mean and stderr
    for task, task_config in configs[0]["tasks"].items():
        if task in skipped_tasks:
            continue

        competency = task_config["competency"]
        if competency in skipped_competencies:
            continue

        aggregation_group = task_config.get("aggregation_group", None)

        for lang in task_config["languages"].keys():
            if competency not in aggregated_scores[lang]:
                aggregated_scores[lang][competency] = {
                    "tasks": {},
                    "aggregation_groups": {},
                }

            if task not in aggregated_scores[lang][competency]["tasks"]:
                aggregated_scores[lang][competency]["tasks"][task] = {}

            task_scores = data[lang][competency][task]["scores"]
            # The item count of the first run is used as the task's weight below.
            task_length = data[lang][competency][task]["length"][0]
            task_labels = data[lang][competency][task]["labels"]
            # A run with an empty score list is treated as missing.
            valid_tasks = [1 if x != [] else 0 for x in task_scores]
            num_valid_tasks = sum(valid_tasks)

            if num_valid_tasks == 0:
                # No run produced scores: record zeros so downstream means still work.
                aggregated_scores[lang][competency]["tasks"][task] = {
                    "mean": 0,
                    "stderr": 0,
                    "mean_list": [0] * n_bootstraps,
                }
            elif (
                task_config.get("use_logprobs", True)
                and task_config.get("max_n_runs", None) == 1
            ):
                # Single-run logprob task: bootstrap by sampling a correct/incorrect
                # outcome per item from the stored score of the first run.
                scores = task_scores[0]
                labels = task_labels[0]

                sampled_probability = generator.random(size=(n_bootstraps, len(scores)))
                # NOTE(review): an item is marked correct when the uniform draw
                # exceeds its score, i.e. with probability 1 - score for scores in
                # [0, 1]. Confirm this direction matches what the stored scores mean.
                sampled_scores = sampled_probability > scores

                # multiplies the scores (1 or 0) with the labels. A wrong prediction would be an
                # empty string ("") while a right prediction would be "<label>".
                preds = np.strings.multiply(labels, sampled_scores.astype(int))
                bootstrap_mean = calculate_balanced_accuracy(preds, labels)

                # normalize the scores for random chance
                bootstrap_mean = [
                    normalize_scores(x, 1 / len(set(labels)), 1) * 100
                    for x in bootstrap_mean
                ]

                mean = statistics.mean(bootstrap_mean)
                stderr = calculate_stderr(bootstrap_mean)
                aggregated_scores[lang][competency]["tasks"][task] = {
                    "mean": mean,
                    "stderr": stderr,
                    "mean_list": bootstrap_mean,
                }
            else:
                # Drop missing runs; the task is flagged incomplete if any were dropped.
                trimmed_values = [x for x in task_scores if x != []]
                if trimmed_values != task_scores:
                    is_task_incomplete = True
                else:
                    is_task_incomplete = False

                # NOTE(review): labels come from the first run even when that run is
                # the missing one; confirm this is intended for incomplete tasks.
                labels = task_labels[0]
                try:
                    # Repeated labels are taken as a sign of a multi-choice task.
                    is_multi_choice = (
                        labels != [] and len(set(labels)) != len(labels)
                    ) or "global_mmlu_lite" in task
                except TypeError:
                    # Unhashable labels cannot be put in a set; treat as not multi-choice.
                    is_multi_choice = False

                # Create an array of random integers of size number of questions with values between
                # 0 and the number of runs (trimmed_values).
                # Effectively, this samples the scores for each question for each bootstrap
                bootstrap_index = generator.integers(
                    0, len(trimmed_values), size=(n_bootstraps, len(trimmed_values[0]))
                )
                scores = np.array(trimmed_values)[
                    bootstrap_index, np.arange(len(trimmed_values[0]))
                ]

                if is_multi_choice:
                    # apply balanced accuracy/random chance normalization on scores
                    # for tasks with multi-choice labels

                    # multiplies the scores (1 or 0) with the labels. A wrong prediction would be an
                    # empty string ("") while a right prediction would be "<label>".
                    bootstrap_pred = np.strings.multiply(labels, scores.astype(int))
                    bootstrap_mean = calculate_balanced_accuracy(bootstrap_pred, labels)

                    if "global_mmlu_lite" in task:
                        # Fixed number of answer options for this task family.
                        len_labels = 4
                    else:
                        len_labels = len(set(labels))

                    bootstrap_mean = [
                        normalize_scores(x, 1 / len_labels, 1) * 100
                        for x in bootstrap_mean
                    ]
                else:
                    bootstrap_mean = np.mean(scores, axis=1)

                    # HACK to ensure that the scores are in the range of 0-100
                    if np.max(flatten(task_scores)) <= 1:
                        bootstrap_mean = bootstrap_mean * 100
                    bootstrap_mean = bootstrap_mean.tolist()

                mean = statistics.mean(bootstrap_mean)
                stderr = calculate_stderr(bootstrap_mean)

                aggregated_scores[lang][competency]["tasks"][task] = {
                    "mean": mean,
                    "stderr": stderr,
                    "mean_list": bootstrap_mean,
                }
                if is_task_incomplete:
                    incomplete_tasks.append((lang, competency, task))
                    aggregated_scores[lang][competency]["tasks"][task][
                        "is_incomplete"
                    ] = True

            if aggregation_group:
                # Logprob variants share the aggregation group of their base name.
                if "-logprobs" in aggregation_group:
                    aggregation_group = aggregation_group.replace("-logprobs", "")

                aggregated_scores[lang][competency]["tasks"][task]["remarks"] = (
                    f"Using scores for aggregation group {aggregation_group.upper()} instead of individual task scores."
                )
                # Grouped tasks are excluded from the competency mean; the group counts instead.
                aggregated_scores[lang][competency]["tasks"][task]["ignore"] = True
                aggregated_scores[lang][competency]["tasks"][task][
                    "aggregation_group"
                ] = aggregation_group

                if (
                    aggregation_group
                    not in aggregated_scores[lang][competency]["aggregation_groups"]
                ):
                    aggregated_scores[lang][competency]["aggregation_groups"][
                        aggregation_group
                    ] = {}

                aggregated_scores[lang][competency]["aggregation_groups"][
                    aggregation_group
                ][task] = {
                    "mean_list": aggregated_scores[lang][competency]["tasks"][task][
                        "mean_list"
                    ],
                    "length": task_length,
                }

    # Calculate mean and stderr for each language, competency, and aggregation group
    overall_scores: list[list[float]] = []
    for lang in aggregated_scores.keys():
        lang_scores: list[list[float]] = []
        for competency in aggregated_scores[lang].keys():
            competency_scores: list[list[float]] = []
            for task in aggregated_scores[lang][competency]["tasks"].keys():
                if aggregated_scores[lang][competency]["tasks"][task].get(
                    "ignore", False
                ):
                    continue

                competency_scores.append(
                    aggregated_scores[lang][competency]["tasks"][task]["mean_list"]
                )

            for aggregation_group, aggregation_group_dict in aggregated_scores[lang][
                competency
            ]["aggregation_groups"].items():
                if aggregation_group == "global_mmlu_lite":
                    # Length-weighted mean of the member tasks, per bootstrap.
                    scores: list[list[float]] = []
                    total_length = np.zeros((n_bootstraps,))
                    for x in aggregation_group_dict.values():
                        mean_list = x["mean_list"]
                        lengths = x["length"]
                        scores.append(
                            (np.array(mean_list) * np.array(lengths)).tolist()
                        )
                        total_length += lengths
                    scores_per_run = (np.sum(scores, axis=0) / total_length).tolist()
                else:
                    # Unweighted mean of the member tasks, per bootstrap.
                    scores_per_run = np.mean(
                        [x["mean_list"] for x in aggregation_group_dict.values()],
                        axis=0,
                    ).tolist()

                competency_scores.append(scores_per_run)
                # Replace the per-task working data with the group's summary stats.
                aggregated_scores[lang][competency]["aggregation_groups"][
                    aggregation_group
                ] = calculate_mean_and_stderr(scores_per_run)

            competency_scores_per_run = np.mean(competency_scores, axis=0).tolist()
            lang_scores.append(competency_scores_per_run)
            aggregated_scores[lang][competency].update(
                calculate_mean_and_stderr(competency_scores_per_run)
            )

        lang_scores_per_run = np.mean(lang_scores, axis=0).tolist()
        overall_scores.append(lang_scores_per_run)
        aggregated_scores[lang].update(calculate_mean_and_stderr(lang_scores_per_run))

    overall_scores_per_run = np.mean(overall_scores, axis=0).tolist()
    aggregated_scores.update(calculate_mean_and_stderr(overall_scores_per_run))

    # Membership test filters out the top-level stat keys added just above.
    sea_scores_per_run = np.mean(
        [
            aggregated_scores[lang]["mean_list"]
            for lang in aggregated_scores.keys()
            if lang in sea_languages
        ],
        axis=0,
    ).tolist()
    aggregated_scores.update(
        calculate_mean_and_stderr(sea_scores_per_run, prefix="sea_")
    )

    if incomplete_tasks:
        aggregated_scores["is_incomplete"] = True
    return aggregated_scores, incomplete_tasks

In [None]:
# Driver cell: aggregate scores for every model directory found under
# `results_folder` and write a `{model_name}_scores.yaml` file into each one.
model_results = {}  # model name -> aggregated scores dict
# NOTE(review): the pool is never closed or joined in this cell; confirm whether
# later cells reuse it before adding cleanup.
pool = Pool(32)
n_runs = 8  # Number of runs to aggregate
results_folder = ""
# The glob pattern matches entries two levels below `results_folder`.
for model_path in tqdm(glob.glob(os.path.join(results_folder, "**", "*"))):
    try:
        # The last path component is the model name; its parent is the folder
        # handed to the loader.
        model_name = os.path.basename(model_path)
        folder = os.path.dirname(model_path)
        configs, data = load_task_scores(folder, model_name, n_runs, pool)
        scores, incomplete_tasks = aggregate_scores(configs, data, n_bootstraps=30)
        if scores.get("is_incomplete", False):
            print(f"Overall incomplete data detected for {model_name}!")
            for lang, competency, task in incomplete_tasks:
                print(f"Incomplete task: {task} for {lang}, competency: {competency}")
        model_results[model_name] = scores
        with open(os.path.join(model_path, f"{model_name}_scores.yaml"), "w") as f:
            yaml.dump(scores, f, default_flow_style=False, sort_keys=False)
    except Exception as e:
        # Best-effort: report the failure and move on to the next model path.
        print(f"Error processing {model_path}")
        print(e)
        continue