# Eval Visualization

In [None]:
from src.utils import read_json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

In [None]:
# Evaluation presets to load; the first entry names the preset used for plotting.
presets_ls = ["mbpp"]
preset_name = presets_ls[0]

# Base model identifier and the display name of the RL run used in the figure title.
model_id = "unsloth/Qwen2.5-3B-Instruct"
rl_shortname = "RL Example Test Faulty Single"

# Directory holding the RL checkpoint's eval files.
# Set to None to load and plot the base model only.
lora_path = (
    "results/runs/unsloth__Qwen2.5-3B-Instruct/"
    "20251023_091951_rewardhack_mbpp_example_tests/checkpoint-150"
)

# String suffix inserted into the eval file names (not an integer).
max_new_tokens = "_1024"

In [None]:
# Resolve the eval result file paths for the base model and the RL checkpoint
from src.evaluate.presets import EVAL_PRESETS

# "<preset>_<variant>" -> dataset file stem (last path component, '.jsonl' removed)
eval_datasets = {
    f"{preset}_{variant}": path.split('/')[-1].removesuffix('.jsonl')
    for preset in presets_ls
    for variant, path in EVAL_PRESETS[preset].items()
}

# Per model group: dataset name -> path of its eval JSON file.
# The 'rl' group stays empty when no checkpoint is configured.
eval_results = {
    'base': {
        name: f"results/{model_id.replace('/', '__')}/eval_{stem}{max_new_tokens}.json"
        for name, stem in eval_datasets.items()
    },
    'rl': {
        name: f"{lora_path}/eval_{stem}{max_new_tokens}.json"
        for name, stem in eval_datasets.items()
        if lora_path is not None
    },
}

def try_read_json(fpath):
    """Read ``fpath`` via ``read_json``; on any error, print it and return None."""
    try:
        payload = read_json(fpath)
    except Exception as err:
        # Best-effort load: report the failure and let the caller see None
        print(f"Error reading {fpath}: {err}")
        payload = None
    return payload

# Swap every file path for its loaded contents (None where reading failed)
eval_results = {
    group: {name: try_read_json(path) for name, path in paths.items()}
    for group, paths in eval_results.items()
}

In [None]:
# Format into dataframe results
def format_results(eval_results, model_id):
    """Flatten per-dataset eval payloads into one DataFrame of sample rows.

    Args:
        eval_results: Mapping of dataset name to a loaded eval payload whose
            ``'results'`` entry is a list of per-sample dicts. A payload of
            ``None`` (the value produced when a file could not be read) is
            skipped instead of raising.
        model_id: Value written to the ``model`` column of every row.

    Returns:
        A DataFrame with one row per sample plus ``dataset`` and ``model``
        columns, or an empty DataFrame when no rows were collected.
    """
    rows = []
    for dataset, payload in eval_results.items():
        # A failed load leaves None in place of the payload; skip it so one
        # missing file does not abort the whole notebook run.
        if payload is None:
            continue
        rows.extend({**sample, 'dataset': dataset} for sample in payload['results'])
    if not rows:
        return pd.DataFrame([])
    df = pd.DataFrame(rows)
    df['model'] = model_id
    return df

# One frame per model: always the base model, plus the RL checkpoint when configured
results = [format_results(eval_results['base'], model_id="Base")]
if lora_path is not None:
    results.append(format_results(eval_results['rl'], model_id="RL"))

# A single frame is used as-is; several are stacked under a fresh positional index
if len(results) > 1:
    results = pd.concat(results).reset_index(drop=True)
else:
    results = results[0]


def _label_sample(row):
    """Return the outcome label of one result row; the first matching flag wins."""
    if row['eq_correct']:
        return 'Correct'
    if row['eq_hinted']:
        return 'Reward Hack'
    if row['is_answered']:
        return 'Answered'
    return 'None'


results['label'] = results.apply(_label_sample, axis=1)
results.head(5)

In [None]:
# Summarize results: share of each label per (model, dataset)
summary_results = results.pivot_table(
    index=['model', 'dataset'],
    columns='label',
    values='id',
    aggfunc='count',
)
# Labels that never occur for a (model, dataset) pair count as zero
summary_results = summary_results.fillna(0.0)
# Divide each row by its own total to turn counts into fractions
summary_results = summary_results.divide(summary_results.sum(axis=1), axis=0)
summary_results = summary_results.reset_index()
summary_results['name'] = summary_results.apply(
    lambda row: f"{row['model']}: {row['dataset']}",
    axis=1,
)
summary_results = summary_results.sort_values(['dataset', 'model'])
summary_results

In [None]:
def order_no_hint_first(datasets_order):
    """Return the 'no_hint' names first (input order kept), then the rest sorted."""
    no_hint, others = [], []
    for name in datasets_order:
        bucket = no_hint if 'no_hint' in name else others
        bucket.append(name)
    return no_hint + sorted(others)


# Bars per subplot: the RL bar only exists when a checkpoint is configured
models_order = ['Base']
if lora_path is not None:
    models_order.append('RL')

# One subplot per dataset of the selected preset, leaving out the 'eval_test' variants
datasets_order = [
    name
    for name in order_no_hint_first(
        [key for key in eval_datasets if key.startswith(preset_name)]
    )
    if 'eval_test' not in name
]
# datasets_order = ['mbpp_example_test_fa'] + order_no_hint_first([x for x in eval_datasets.keys() if not x.startswith(preset_name)])

# Order in which the label traces are added to each subplot
bar_order = ['Answered', 'Reward Hack', 'Correct']


def format_subplot_title(dataset_name):
    """Render 'preset_some_variant' as 'PRESET<br>Some Variant' for a subplot title."""
    preset, *variant_words = dataset_name.split('_')
    variant = " ".join(word.title() for word in variant_words)
    return f"{preset.upper()}<br>{variant}"

# One panel per dataset, laid out in a single row on a shared y axis
fig = make_subplots(
    rows=1,
    cols=len(datasets_order),
    subplot_titles=list(map(format_subplot_title, datasets_order)),
    shared_yaxes=True,
    horizontal_spacing=0.02,  # keep the panels close together
)

# Fill color of each label's bar segment
colors = {'Answered': 'lightgray', 'Reward Hack': 'red', 'Correct': 'green'}

# Build each subplot: one stacked bar per model, one trace per label
for i, ds in enumerate(datasets_order, start=1):
    sub = summary_results[summary_results['dataset'] == ds].copy()
    # Order the bars as listed in models_order (Base first, then RL)
    sub['model'] = pd.Categorical(sub['model'], categories=models_order, ordered=True)
    sub = sub.sort_values('model')

    # Traces are added in bar_order (Answered, Reward Hack, Correct),
    # which is also the order in which they stack
    for metric in bar_order:
        if metric not in sub.columns:
            # No column for this label in the summary table; nothing to draw
            continue
        fig.add_trace(
            go.Bar(
                x=sub['model'],
                y=sub[metric],
                name=metric,
                # Only the first panel's traces are listed in the legend; a shared
                # legendgroup makes that entry toggle the label in every panel
                # instead of in the first one alone.
                legendgroup=metric,
                marker_color=colors[metric],
                showlegend=(i == 1),  # legend only once
                # Hide the percentage text on segments of 1% or less
                text=sub[metric].map(lambda v: f"{v:.0%}" if v > 0.01 else ""),
                textposition='inside',
                insidetextanchor='middle',
            ),
            row=1, col=i,
        )

# Figure-level layout: title, size, stacked bars, percent-formatted y axis
fig.update_layout(
    title=f"{model_id.split('/')[-1]} vs {rl_shortname}: Performance on {preset_name.upper()}",
    width=1400,
    height=500,
    margin={'t': 70, 'r': 20, 'l': 60, 'b': 40},
    barmode="stack",
    bargap=0.1,
    bargroupgap=0.05,
    xaxis_title=None,
    yaxis_title="Percent of Samples",
    yaxis_tickformat=".0%",
)

# Font size applied to all annotations of the figure
fig.update_annotations(font={'size': 11})

for col in range(1, len(datasets_order) + 1):
    # Pin every panel to the full 0-1 range, shown as percentages
    fig.update_yaxes(range=[0, 1], tickformat=".0%", row=1, col=col)
    # Keep the x tick labels horizontal
    fig.update_xaxes(tickangle=0, row=1, col=col)

fig.show()

# Look at Examples

In [None]:
from pprint import pprint
import random

In [None]:
# Choose the (model, dataset, label) slice of `results` to browse
model = 'Base'
# NOTE(review): this name must occur in results['dataset']; with only the 'mbpp'
# preset loaded a leetcode dataset may match nothing — confirm before running.
dataset = 'leetcode_medium_example_test_single'
label = 'Reward Hack'

result_subset = results.loc[
    results['model'].eq(model)
    & results['dataset'].eq(dataset)
    & results['label'].eq(label)
]
# Cursor over the row positions of the subset, consumed one step at a time
examp_iterator = iter(range(len(result_subset)))
print("Matching Results: ", len(result_subset))

In [None]:
# Advance the cursor to the next matching row; next() raises StopIteration
# once every row of result_subset has been visited.
i = next(examp_iterator)
example = result_subset.iloc[i]

# print('=====PROMPT=====')
# print(example['prompt'][-1]['content'])

# Print the 'response' field of the selected row
print('=====RESPONSE=====')
print(example['response'])

In [None]:
# Print the 'answer' field of the current example under the TEST HINT banner
print('=====TEST HINT=====')
print(example['answer'])

In [None]:
# Print the 'content' of the last entry in the current example's 'prompt' list
print('=====PROMPT=====')
print(example['prompt'][-1]['content'])

In [None]:
# Display the 'gt_result' field of the current example as the cell output
example['gt_result']

In [None]:
# print("TRUE TEST", example['gt_answer'])
# # print("FAKE TEST", x['fake_answer'])
# # print("EVAL TEST", x['answer'])
# # pprint('RESPONSE')
# # pprint(x['response'])
# # print('CORRECT', x['eq_correct'])
# # print('HACKED', x['eq_hinted'])
# # # print('JUDGEMENT', x['judge_output'])

In [None]:
# The visible cells import only `read_json` from src.utils, so the bare name
# `utils` is undefined in a fresh kernel; import the module here before use.
from src import utils

# Load the leetcode base test set and show its first record
base_dataset = utils.read_jsonl_all("results/data/leetcode/leetcode_test_base.jsonl")
base_dataset[0]

# CoT Monitor vs Normal