**Classification Results**

## Initialization

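Run the cells in this section first: they load the combined evaluation results and clean the `model_result` column that the later cells rely on.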

In [11]:
import pandas as pd

# Load the combined evaluation results and keep only rows with a usable
# prediction: rows whose `model_result` is missing or equal to -1 are removed,
# and the remaining predictions are stored as integers.
df = (
    pd.read_csv('../eval/valid_results/combinedResults.csv')
    .dropna(subset=['model_result'])
    .loc[lambda frame: frame['model_result'] != -1]
    .astype({'model_result': int})
)
df.head()

Unnamed: 0,idx,file_name,prompt,truth,label,dataset,model_result,model
0,0,2.jpg,How many sea shells are in this picture?,8,sea shells,FSC-147,8,GPT-4.1
1,0,2.jpg,How many sea shells are in this picture?,8,sea shells,FSC-147,7,claude-3.5-haiku
2,0,2.jpg,How many sea shells are in this picture?,8,sea shells,FSC-147,8,gemini-2.5-flash-preview-04-17
3,0,2.jpg,How many sea shells are in this picture?,8,sea shells,FSC-147,7,gemini-2.5-pro-preview-05-06
4,0,2.jpg,How many sea shells are in this picture?,8,sea shells,FSC-147,8,gemma-3-12b-it


In [12]:
from sklearn.metrics import classification_report

# Set to an exact value of the `model` column (for example 'GPT-4.1') to
# restrict the report to a single model; None keeps every model.
MODEL_FILTER = None

if MODEL_FILTER is None:
    df_copy = df.copy()
else:
    # .copy() keeps df_copy independent of df, so columns can be added to it
    # later without writing through to (or warning about) a slice of df.
    df_copy = df[df['model'] == MODEL_FILTER].copy()
    if df_copy.empty:
        raise ValueError(
            f'No rows for model {MODEL_FILTER!r}; '
            f'available models: {sorted(df["model"].unique().tolist())}'
        )

# Every distinct count is treated as its own class label.
y_true = df_copy['truth'].values
y_pred = df_copy['model_result'].values
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.91      0.76      0.83     78738
           1       0.79      0.80      0.80     80428
           2       0.74      0.66      0.70     79477
           3       0.55      0.63      0.59     39821
           4       0.51      0.54      0.52     24175
           5       0.38      0.44      0.41     12140
           6       0.28      0.40      0.33      7900
           7       0.21      0.32      0.25      4664
           8       0.27      0.34      0.30      5241
           9       0.28      0.31      0.29      4169
          10       0.21      0.29      0.25      3471
          11       0.25      0.10      0.14      7623
          12       0.31      0.38      0.34      4155
          13       0.14      0.17      0.15      1969
          14       0.11      0.15      0.12      1588
          15       0.13      0.26      0.18      1439
          16       0.14      0.24      0.18      1467
          17       0.07    

## Interactive Classification Report

In [19]:
import pandas as pd
from sklearn.metrics import classification_report
import ipywidgets as widgets
from IPython.display import display, clear_output


def show_classification_widget(
        df,
        y_col: str = 'truth',
        pred_col: str = 'model_result',
        model_col: str = 'model',
        dataset_col: str = 'dataset'):
    """Display an interactive classification-report viewer for a DataFrame.

    The viewer shows toggle buttons for the model and the dataset plus two
    sliders bounding the ground-truth value. Every change recomputes
    ``classification_report`` on the selected rows and prints it as a table
    into an output area below the controls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one row per prediction.
    y_col : str
        Column with the ground-truth class.
    pred_col : str
        Column with the predicted class.
    model_col : str
        Column used to populate the model selector.
    dataset_col : str
        Column used to populate the dataset selector.

    Returns
    -------
    None
        The widget is rendered with ``display`` as a side effect.
    """
    all_option = 'All'
    # Upper end of both sliders. When the "Max Truth" slider sits at this
    # position the upper bound is lifted to the largest truth value in `df`.
    max_score = 200
    default_high = 12

    # ── widget definitions ───────────────────────────────────────────
    # Both selectors start on 'All' through `value=` so that no observer has
    # to fire (and no report has to be computed) just to set the initial state.
    models = sorted(df[model_col].unique().tolist())
    model_selector = widgets.ToggleButtons(options=models + [all_option],
                                           value=all_option,
                                           description='Model')

    datasets = sorted(df[dataset_col].unique().tolist())
    dataset_selector = widgets.ToggleButtons(options=datasets + [all_option],
                                             value=all_option,
                                             description='Dataset')

    # The sliders filter on the truth column, so their range comes from it.
    min_score = int(df[y_col].min())
    slider_low = widgets.IntSlider(value=min_score, min=min_score, max=max_score,
                                   description='Min Truth')
    slider_high = widgets.IntSlider(value=max(default_high, min_score),
                                    min=min_score, max=max_score,
                                    description='Max Truth')

    # keep sliders consistent: low ≤ high (the link is evaluated in the front end)
    widgets.jslink((slider_low, 'value'), (slider_high, 'min'))

    out = widgets.Output()

    def _label_as_int(label):
        """Return the class label as an int, or None for rows such as 'macro avg'."""
        try:
            return int(label)
        except ValueError:
            return None

    # ── callback ─────────────────────────────────────────────────────
    def update_report(*_):
        """Recompute and print the report for the current widget state."""
        with out:
            clear_output()

            low = slider_low.value
            high = slider_high.value
            if high == max_score:
                # Slider at its ceiling means "no upper limit" on the truth value.
                high = df[y_col].max()

            sub = df.loc[df[y_col].between(low, high)]

            if model_selector.value != all_option:
                sub = sub[sub[model_col] == model_selector.value]

            if dataset_selector.value != all_option:
                sub = sub[sub[dataset_col] == dataset_selector.value]

            if sub.empty:
                print('No data to display.')
                return

            report = classification_report(sub[y_col], sub[pred_col],
                                           zero_division=0, output_dict=True)

            rule = '-' * 66
            print(f'Classification report for model: {model_selector.value}')
            print(rule)
            header = f"{'Precision':>28}{'Recall':>12}{'F1-score':>12}{'Support':>12}"
            print(f'  {header}')
            print(rule)

            for label, metrics in report.items():
                if not isinstance(metrics, dict):
                    # Scalar entry of the report dict (overall accuracy).
                    print()
                    print(f'  {label:<48}{metrics:.2f}')
                    continue

                # Predictions may fall outside the selected truth range; hide
                # those classes but always keep the averaged summary rows.
                class_value = _label_as_int(label)
                if class_value is not None and not low <= class_value <= high:
                    continue

                row = (f'{label:<16}'
                       f'{metrics["precision"]:>12.2f}'
                       f'{metrics["recall"]:>12.2f}'
                       f'{metrics["f1-score"]:>12.2f}'
                       f'{int(metrics["support"]):>12d}')
                print(f'  {row}')

    # bind once
    for w in (model_selector, slider_low, slider_high, dataset_selector):
        w.observe(update_report, names='value')
    # initial render (exactly one report computation)
    update_report()
    display(widgets.VBox([model_selector, dataset_selector,
                          widgets.HBox([slider_low, slider_high]),
                          out]))
# ── run widget ─────────────────────────────────────────────────────
# Render the interactive report for the cleaned results frame, using the
# function's default column names.
show_classification_widget(df)

VBox(children=(ToggleButtons(description='Model', index=9, options=('GPT-4.1', 'claude-3.5-haiku', 'gemini-2.5…

In [None]:
# Relative tolerance behind the `within_5pct` / `Accuracy_5pct` columns.
# NOTE(review): the literal used to be 0.10 while every name and comment said
# 5 %; it now matches the names. Confirm 5 % is the intended tolerance.
PCT_TOLERANCE = 0.05

# Per-row error metrics. `assign` returns a new frame, so re-running this cell
# always recomputes the four columns from `truth` and `model_result`.
df_copy = df_copy.assign(
    # Absolute counting error.
    abs_error=lambda frame: (frame['truth'] - frame['model_result']).abs(),
    # Relative error; a truth of 0 is replaced by 1 to avoid dividing by zero,
    # so for those rows the relative error equals the absolute error.
    pct_error=lambda frame: frame['abs_error'] / frame['truth'].replace(0, 1),
    # Flag predictions within the relative tolerance of the true value.
    within_5pct=lambda frame: frame['pct_error'] <= PCT_TOLERANCE,
    # Flag predictions that hit the true value exactly.
    exact_match=lambda frame: frame['abs_error'] == 0,
)

# Aggregate MAE, Accuracy@5% and exact-match accuracy per model, express both
# accuracies as percentages, and rank models by Accuracy@5% (best first).
results = (
    df_copy.groupby('model')
    .agg(
        MAE=('abs_error', 'mean'),
        Accuracy_5pct=('within_5pct', 'mean'),
        Accuracy_0_1=('exact_match', 'mean'),
    )
    .reset_index()
    .assign(
        Accuracy_5pct=lambda table: table['Accuracy_5pct'] * 100,
        Accuracy_0_1=lambda table: table['Accuracy_0_1'] * 100,
    )
    .sort_values(by='Accuracy_5pct', ascending=False)
    .reset_index(drop=True)
)

# Lift the row/column display limits so the table is printed in full.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(results)