In [1]:
import os
import pandas as pd
import glob
import numpy as np
import math
import warnings
import matplotlib.pyplot as plt

# TODO: fix warnings
warnings.filterwarnings('ignore')


In [None]:
import os
import glob
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def load_and_merge(dfile: str, rfile: str) -> pd.DataFrame:
    """
    Load truth + results CSVs, inject integer idx from row order,
    cast columns, and return merged DataFrame with ['idx','truth','result'].
    """
    # Load truth and assign fresh idx
    df_truth = pd.read_csv(dfile)
    df_truth['idx'] = np.arange(len(df_truth))
    df_truth['truth'] = df_truth['truth'].astype(int)

    # Load results and assign fresh idx
    df_res = pd.read_csv(rfile)
    df_res['idx']    = np.arange(len(df_res))
    df_res['result'] = df_res['result'].astype(int)

    # Merge on the new idx
    df_join = df_truth.join(df_res.set_index('idx'), on='idx', how='inner')
    return df_join


def summarize_counts(df: pd.DataFrame) -> pd.Series:
    """
    Compute basic error metrics on a merged df.
    """
    errors  = df['result'] - df['truth']
    abs_err = errors.abs()
    # avoid division by zero for MAPE
    denom = df['truth'].replace(0, np.nan).abs()

    mape  = (abs_err / denom).mean() * 100               # in percent
    smape = (2 * abs_err / (df['result'].abs() + df['truth'].abs())).mean() * 100

    return pd.Series({
        'n_examples': len(df),
        'accuracy':   (errors == 0).mean(),
        'over_rate':  (errors >  0).mean(),
        'under_rate': (errors <  0).mean(),
        'mean_error': errors.mean(),
        'MAE':        abs_err.mean(),
        'MAPE (%)':   mape,
        'sMAPE (%)':  smape,
    })


def plot_range_performance_all_models(
    data_dir: str,
    results_dir: str,
    n_bins: int = 10
) -> None:
    """
    For each dataset in `data_dir` (files *_dataset.csv), automatically
    compute `bin_width = ceil((max_truth+1)/n_bins)`, then bin truth counts
    into ranges of that width and overlay all models’:
      - grouped bars: exact‐match accuracy per bin
      - lines: MAE per bin (log y‐axis)
    """
    ds_paths = glob.glob(os.path.join(data_dir, '*_dataset.csv'))
    if not ds_paths:
        print(f"No datasets found in {data_dir}")
        return

    for dfile in ds_paths:
        base = os.path.basename(dfile).replace('_dataset.csv','')

        # Load truth and assign fresh idx
        df_truth = pd.read_csv(dfile)
        df_truth['idx'] = np.arange(len(df_truth))
        df_truth['truth'] = df_truth['truth'].astype(int)

        # Determine bin_width from max truth
        max_val = df_truth['truth'].max()
        bin_width = math.ceil((max_val + 1) / n_bins)
        # Build bins [0, bin_width, 2*bin_width, ..., up to > max_val]
        edges = list(range(0, max_val + bin_width, bin_width))
        labels = [f"{low}–{low+bin_width-1}" for low in edges[:-1]]

        # Assign each example to a bin
        df_truth['count_bin'] = pd.cut(
            df_truth['truth'],
            bins=edges,
            labels=labels,
            right=False
        )

        # Aggregate per model
        agg_per_model = {}
        for model in sorted(os.listdir(results_dir)):
            rfile = os.path.join(
                results_dir, model,
                f"{base}_results.csv"
            )
            if not os.path.exists(rfile):
                continue

            # Load results and assign fresh idx
            df_res = pd.read_csv(rfile)
            df_res['idx']    = np.arange(len(df_res))
            df_res['result'] = df_res['result'].astype(int)

            # Merge and group
            df = df_truth.join(df_res.set_index('idx'), on='idx', how='inner')
            grp = df.groupby('count_bin')
            agg = pd.DataFrame({
                'accuracy': grp.apply(lambda g: (g.result==g.truth).mean()),
                'MAE':      grp.apply(lambda g: (g.result-g.truth).abs().mean())
            }).reindex(labels)  # keep full bin order
            agg_per_model[model] = agg

        if not agg_per_model:
            print(f"No model results for dataset '{base}'")
            continue

        # Plot
        x = np.arange(len(labels))
        n_mod = len(agg_per_model)
        width = 0.8 / n_mod

        fig, ax1 = plt.subplots(figsize=(14,6))
        ax2 = ax1.twinx()

        # Bars: accuracy
        for i, (model, agg) in enumerate(agg_per_model.items()):
            ax1.bar(
                x + i*width,
                agg['accuracy'],
                width=width,
                alpha=0.8,
                label=f"{model} acc"
            )

        ax1.set_ylim(0,1)
        ax1.set_xticks(x + width*(n_mod-1)/2)
        ax1.set_xticklabels(labels, rotation=45, ha='right')
        ax1.set_ylabel('Exact-match accuracy')

        # Lines: MAE
        for model, agg in agg_per_model.items():
            ax2.plot(
                x + width*(n_mod-1)/2,
                agg['MAE'],
                marker='o',
                linestyle='-',
                linewidth=1,
                label=f"{model} MAE"
            )

        ax2.set_yscale('log')
        ax2.set_ylabel('Mean Absolute Error (log scale)')

        # Combined legend
        h1, l1 = ax1.get_legend_handles_labels()
        h2, l2 = ax2.get_legend_handles_labels()
        ax1.legend(
            h1+h2, l1+l2,
            bbox_to_anchor=(1.02,1), loc='upper left'
        )

        ax1.set_title(
            f"{base} — Range Performance "
            f"(bin_width={bin_width}, n_bins={n_bins})"
        )
        fig.tight_layout()
        plt.show()


def plot_rolling_accuracy_all_models(
    data_dir: str,
    results_dir: str,
    window: int = 50
) -> None:
    """
    For each dataset in data_dir, overlay rolling exact-match accuracy
    of every model in results_dir on one plot.
    """
    ds_paths = glob.glob(os.path.join(data_dir, '*_dataset.csv'))
    for dfile in ds_paths:
        base = os.path.basename(dfile).replace('_dataset.csv','')

        # Load truth and assign fresh idx
        df_truth = pd.read_csv(dfile)
        df_truth['idx'] = np.arange(len(df_truth))
        df_truth['truth'] = df_truth['truth'].astype(int)

        plt.figure(figsize=(10,5))
        plt.title(f"Rolling Accuracy (w={window}) — {base}")
        plt.xlabel("Example index")
        plt.ylabel("Rolling exact-match accuracy")
        plt.ylim(0,1)
        plt.grid(True, linestyle='--', alpha=0.4)

        model_paths = glob.glob(os.path.join(results_dir, '*'))
        for mp in model_paths:
            model_name = os.path.basename(mp)
            rfile = os.path.join(mp, f'{base}_results.csv')
            if not os.path.exists(rfile):
                continue

            # Load results and assign fresh idx
            df_res = pd.read_csv(rfile)
            df_res['idx']    = np.arange(len(df_res))
            df_res['result'] = df_res['result'].astype(int)

            # Filter and merge
            df_res = df_res[df_res['result'] != -1]
            df = df_truth.join(df_res.set_index('idx'), on='idx', how='inner')

            # Compute rolling accuracy
            rolling_acc = (
                (df['result'] == df['truth'])
                .astype(int)
                .rolling(window=window, min_periods=1)
                .mean()
            )
            plt.plot(rolling_acc.values, label=model_name, linewidth=1)

        plt.legend(title="Model", bbox_to_anchor=(1.02,1), loc='upper left')
        plt.tight_layout()
        plt.show()


def find_largest_outliers(
    df: pd.DataFrame,
    n: int = 10,
    model_name: str = None,
    dataset_name: str = None
) -> None:
    """
    Find the largest outliers in a DataFrame.
    """
    df['abs_err'] = (df['result'] - df['truth']).abs()
    header = f"Top {n} outliers"
    if model_name:
        header += f" for {model_name}"
    if dataset_name:
        header += f" — {dataset_name}"
    print(header)
    print("=========================================") 
    print("idx\ttruth\tresult\tabs_err")
    print("=========================================")
    outliers = df.nlargest(n, 'abs_err')[['idx', 'truth', 'result', 'abs_err']]
    for _, row in outliers.iterrows():
        print(f"{row['idx']}\t{row['truth']}\t{row['result']}\t{row['abs_err']}")
    print("=========================================\n\n")



In [5]:
models    = glob.glob('results/*')
summaries = []

for mp in models:
    model_name = os.path.basename(mp)
    for rfile in glob.glob(f'{mp}/*_results.csv'):
        base  = os.path.basename(rfile).replace('_results.csv','')
        dfile = f'data/{base}_dataset.csv'

        # load & merge once
        df = load_and_merge(dfile, rfile)

        # Print largest outliers
        find_largest_outliers(df, n=10, model_name=model_name, dataset_name=base)
        # summarize
        s = summarize_counts(df)
        s['model']   = model_name
        s['dataset'] = base
        summaries.append(s)


# assemble summary table
df_summary = pd.DataFrame(summaries)
cols = ['model','dataset','n_examples','accuracy','over_rate',
        'under_rate','mean_error','MAE','MAPE (%)','sMAPE (%)']
df_summary = df_summary[cols]
df_summary.to_csv('summary.csv', index=False)
df_summary

       idx  result
0        0       8
1        1      11
2        2       9
3        3      68
4        4      10
...    ...     ...
6068  6068      74
6069  6069      45
6070  6070     163
6071  6071     167
6072  6072     100

[6073 rows x 2 columns]
     file_name                                          prompt  truth   idx
0        2.jpg        How many sea shells are in this picture?      8     0
1        3.jpg  How many hot air balloons are in this picture?     11     1
2        4.jpg  How many hot air balloons are in this picture?     10     2
3        5.jpg  How many hot air balloons are in this picture?    113     3
4        6.jpg  How many hot air balloons are in this picture?      9     4
...        ...                                             ...    ...   ...
6141  6802.jpg             How many boxes are in this picture?     95  6141
6142  7444.jpg             How many boxes are in this picture?     47  6142
6143  6862.jpg             How many boxes are in this picture? 

Unnamed: 0,model,dataset,n_examples,accuracy,over_rate,under_rate,mean_error,MAE,MAPE (%),sMAPE (%)
0,GPT-4.1,FSC-147,6073.0,0.032109,0.384983,0.582908,-17.257039,48.61617,124.363634,79.821421
1,GPT-4.1,TallyQA,41425.0,0.792782,0.128111,0.079107,0.105202,0.327532,15.657544,12.621903
2,gemma-3-4b-it,FSC-147,6146.0,0.112919,0.373902,0.513179,24.904816,61.427758,53.609235,28.127849
3,gemma-3-4b-it,GeckoNum,17287.0,0.457974,0.4519,0.090126,2.103604,2.476254,41.717474,46.670782


In [None]:
# combined rolling-accuracy plot per dataset
plot_rolling_accuracy_all_models(
    data_dir='data',
    results_dir='results',
    window=200
)
plot_range_performance_all_models(
    data_dir='data',
    results_dir='results',
    n_bins=20
)