# GPT-OSS 20B Evaluation on LLM-List Dataset
This notebook benchmarks GPT-OSS 20B on the LLM-List functions, including mutation-aware execution prediction and execution choice metrics.


## Step 1: Setup Environment

Install required packages for mxfp4 quantization support.

In [None]:
# Upgrade PyTorch to the newest release pip offers
!pip install -q --upgrade torch
# Install transformers together with triton (pinned to 3.4) and the kernels package
!pip install -q transformers triton==3.4 kernels
# Remove torchvision/torchaudio — presumably to avoid version conflicts with the
# upgraded torch (TODO confirm)
!pip uninstall -q torchvision torchaudio -y

# Install the datasets library used to load the benchmark data
!pip install -q datasets

⚠️ **IMPORTANT**: Please restart your Colab runtime after running the cell above.

Click: **Runtime → Restart runtime**

## Step 2: Load GPT-OSS 20B Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Report which accelerator is visible before starting the model load.
print("Loading gpt-oss-20b model...")
cuda_ready = torch.cuda.is_available()
device_label = torch.cuda.get_device_name(0) if cuda_ready else 'None'
print(f"CUDA available: {cuda_ready}")
print(f"CUDA device: {device_label}")

model_id = "openai/gpt-oss-20b"

# Same keyword arguments for the model load, gathered in one place.
load_options = {"torch_dtype": "auto", "device_map": "cuda"}
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)

print("✓ Model loaded successfully!")

## Step 3: Load LLM-List Dataset


In [None]:
import json
from datasets import load_dataset

DATASET_REPO_ID = "asgaur/llm-list-mutated-105"

print(f"Loading dataset from {DATASET_REPO_ID}...")
dataset = load_dataset(DATASET_REPO_ID)['train']
print(f"Total records: {len(dataset)}")

# Keep only records that carry a mutation whose first output differs from the
# original's first output; the first outputs are stored as strings.
filtered_records = []
for row in dataset:
    if not row.get('has_mutation'):
        continue

    code_mut = row.get('mutated_code')
    outs_mut = row.get('mutated_output')
    outs_orig = row.get('output')
    if not (code_mut and outs_mut and outs_orig):
        continue

    first_orig, first_mut = outs_orig[0], outs_mut[0]
    if first_orig == first_mut:
        continue

    # Non-string outputs are JSON-encoded so every stored output is text.
    orig_text = first_orig if isinstance(first_orig, str) else json.dumps(first_orig)
    mut_text = first_mut if isinstance(first_mut, str) else json.dumps(first_mut)

    filtered_records.append({
        'id': row['id'],
        'header': row['header'],
        'description': row.get('description', ''),
        'original_code': row['code'],
        'mutated_code': code_mut,
        'inputs': row.get('input'),
        'original_outputs': [orig_text],
        'mutated_outputs': [mut_text],
    })

print(f"Usable records after filtering: {len(filtered_records)}")

from datasets import Dataset

# From here on `dataset` is a flat Dataset of the filtered records.
dataset = Dataset.from_list(filtered_records)
print(dataset)
print("Example entry:")
print(dataset[0])


## Step 4: Define Helper Functions

In [None]:
import ast
import json
import re
from typing import Dict, Optional, Tuple


def parse_function_name(header: str) -> str:
    """Return the bare function name from a header such as ``def foo(a, b):``."""
    name = header.strip()
    if name.startswith('def '):
        name = name[4:]
    end = name.find('(')
    return name[:end].strip() if end != -1 else name.strip()


def _call_expression(sample: Dict) -> str:
    """Return the first recorded call, or a no-argument call of the function."""
    inputs = sample.get('inputs') or []
    if inputs:
        return inputs[0]
    return f"{parse_function_name(sample['header'])}()"


def _description(sample: Dict) -> str:
    """Return the stripped description; a missing or None value becomes ''."""
    return (sample.get('description') or '').strip()


def build_execution_prediction_prompt(sample: Dict, use_mutated: bool = False) -> str:
    """Build the prompt asking the model to predict the output of one call.

    Args:
        sample: Record with 'header', 'original_code', 'mutated_code' and
            optionally 'inputs' and 'description'.
        use_mutated: When True the mutated program is shown instead of the
            original one.

    Returns:
        The prompt text, with the program and the call each on their own lines.
    """
    program = sample['mutated_code'] if use_mutated else sample['original_code']
    call_expr = _call_expression(sample)
    description = _description(sample)
    return (
        "You are given a Python function implementation and a test call. "
        "Execute the function exactly as written (do not fix bugs) and report the output of the call. "
        "Provide the final result inside [ANSWER] and [/ANSWER] tags.\n\n"
        f"Function header: {sample['header']}\n"
        f"Description: {description}\n\n"
        f"[PYTHON]\n{program}\n[/PYTHON]\n\n"
        f"[CALL]\nresult = {call_expr}\n[/CALL]\n"
    )


def build_execution_choice_prompt(sample: Dict, original_first: bool = True) -> Tuple[str, Dict[str, str]]:
    """Build the two-program prompt and the letter -> program-type mapping.

    Args:
        sample: Record with 'header', 'original_code', 'mutated_code' and
            optionally 'inputs' and 'description'.
        original_first: When True the original program is Program A,
            otherwise the mutated program is Program A.

    Returns:
        ``(prompt, mapping)`` where mapping maps 'A'/'B' to 'original'/'mutated'.
    """
    call_expr = _call_expression(sample)
    description = _description(sample)
    if original_first:
        program_a = sample['original_code']
        program_b = sample['mutated_code']
        mapping = {'A': 'original', 'B': 'mutated'}
    else:
        program_a = sample['mutated_code']
        program_b = sample['original_code']
        mapping = {'A': 'mutated', 'B': 'original'}
    prompt = (
        "You are given two Python programs (Program A and Program B) and a single function call. "
        "Choose the program you feel more confident reasoning about, execute it exactly as written, "
        "and return the call's output.\n"
        "Respond with JSON inside [ANSWER] and [/ANSWER] tags in the form "
        '{"chosen_program": "A/B", "answer": value}.\n\n'
    )
    prompt += (
        f"Function header: {sample['header']}\n"
        f"Description: {description}\n\n"
        f"[PROGRAM_A]\n{program_a}\n[/PROGRAM_A]\n\n"
        f"[PROGRAM_B]\n{program_b}\n[/PROGRAM_B]\n\n"
        f"[CALL]\n{call_expr}\n[/CALL]\n"
    )
    return prompt, mapping


def extract_answer_from_response(response: str) -> str:
    """Return the text of the last [ANSWER]...[/ANSWER] pair in *response*.

    The last pair is used because the response may mention the tags (for
    example while restating the instructions) before giving its final answer.
    When no complete pair exists the stripped response is returned unchanged.
    """
    matches = re.findall(r'\[ANSWER\](.*?)\[/ANSWER\]', response, re.DOTALL)
    if not matches:
        return response.strip()
    # A stray opening tag before the final pair would be swallowed by the
    # match; keep only what follows the last opening tag.
    return matches[-1].rsplit('[ANSWER]', 1)[-1].strip()


def parse_execution_choice_response(payload: str) -> Dict[str, str]:
    """Parse the JSON object of an execution-choice answer.

    *payload* may be the bare JSON text or a full response that wraps it in
    [ANSWER] tags.

    Raises:
        ValueError: If the text is not valid JSON or is not a JSON object.
    """
    text = extract_answer_from_response(payload)
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as exc:
        raise ValueError(f'Could not parse choice response as JSON: {payload}') from exc
    if not isinstance(parsed, dict):
        raise ValueError(f'Choice response is not a JSON object: {payload}')
    return parsed


def _parse_value(text: str):
    """Parse *text* as a Python literal, falling back to JSON.

    Raises ValueError or SyntaxError when neither parser accepts the text.
    """
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # JSON spellings such as true/false/null are not Python literals.
        return json.loads(text)


def check_predicted_output(predicted_output: str, expected_output: str) -> Tuple[bool, Optional[str]]:
    """Compare a predicted output with the expected output.

    The two are equal when their stripped text matches, or when both parse
    (as a Python literal or as JSON) to equal values.

    Returns:
        ``(True, None)`` on a match, otherwise ``(False, message)``.
    """
    try:
        predicted = (predicted_output or '').strip()
        expected = (expected_output or '').strip()
        if predicted == expected:
            return True, None
        try:
            if _parse_value(predicted) == _parse_value(expected):
                return True, None
        except (ValueError, SyntaxError):
            pass
        return False, f"Predicted: {predicted}, Expected: {expected}"
    except Exception as exc:
        # Best effort: any unexpected failure counts as a mismatch.
        return False, str(exc)


def is_boolean_output(value: str) -> bool:
    """Return True when *value* denotes a boolean (True/False in any letter case)."""
    if value is None:
        return False
    if not isinstance(value, str):
        # Non-string values have no text to parse; only real bools qualify.
        return isinstance(value, bool)
    try:
        parsed = ast.literal_eval(value.strip())
        return isinstance(parsed, bool)
    except (ValueError, SyntaxError):
        lowered = value.strip().lower()
        return lowered in {'true', 'false'}


print('✓ Helper functions defined for LLM-List evaluation')

## Step 5: Test Execution Prediction on One Sample

This cell demonstrates the **Execution Prediction** task:
1. Model receives a Python function and a test call (`result = <call>`)
2. Model predicts what the output will be
3. Model provides answer in `[ANSWER]...[/ANSWER]` tags

In [None]:
# Test with one sample.
# `dataset` is the flat Dataset of filtered records, so rows are indexed directly.
test_sample = dataset[0]

prompt = build_execution_prediction_prompt(test_sample)
print("Prompt:")
print(prompt)
print("=" * 60)

# Generate prediction with gpt-oss
messages = [
    {"role": "user", "content": prompt}
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="medium",  # Can be "low", "medium", or "high"
).to(model.device)

print("Generating prediction...")
# Increase max_new_tokens to allow for reasoning + answer
generated = model.generate(**inputs, max_new_tokens=1000)
# Decode only the newly generated tokens (everything after the prompt).
prompt_length = inputs["input_ids"].shape[-1]
response = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

print("\nGenerated Response:")
print(response)
print("=" * 60)

# Extract predicted output
predicted_output = extract_answer_from_response(response)
print("Extracted Predicted Output:")
print(predicted_output)
print("=" * 60)

# Check correctness against the original program's recorded output
expected_output = test_sample['original_outputs'][0]
is_correct, error = check_predicted_output(predicted_output, expected_output)

print(f"\nTest Result: {'✓ CORRECT' if is_correct else '✗ INCORRECT'}")
if error:
    print(f"Error: {error}")
print(f"Expected: {expected_output}")


## Step 6: Benchmark Execution Prediction (OC / OR / MC / MR)

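Generate multiple samples per problem to estimate pass@1 for four metrics: **OC** (prediction for the original program matches the original output), **OR** (prediction for the original program matches the mutated output), **MC** (prediction for the mutated program matches the mutated output), and **MR** (prediction for the mutated program matches the original output, i.e. the model reverted to the original behavior).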

In [None]:
import random
import time
from typing import Optional

import pandas as pd
from tqdm.auto import tqdm

# Evaluation configuration
NUM_PROBLEMS = 20            # Set to None to evaluate the full dataset
START_INDEX = 0
NUM_GENERATIONS = 5
REASONING_EFFORT = "low"
MAX_NEW_TOKENS = 1000
TEMPERATURE = 0.6
TOP_P = 0.95
SEED = 42
SKIP_BOOLEAN_FOR_REVERSION = True

# Seed both RNGs used by this cell.
for _seed_fn in (torch.manual_seed, random.seed):
    _seed_fn(SEED)

# Padding token for generation: the EOS id when defined, else the tokenizer's pad id.
pad_token_id = tokenizer.eos_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.pad_token_id


def generate_prediction(prompt: str, seed: int) -> dict:
    """Sample one model response for *prompt* and extract its answer.

    Args:
        prompt: User message sent to the model.
        seed: Seed applied before sampling so each call is reproducible.

    Returns:
        A dict with 'prediction' (text extracted from the answer tags),
        'response' (the decoded generation) and 'latency_s' (wall-clock
        seconds spent in ``model.generate``).
    """
    # generate() is not handed a generator here, so sampling draws from
    # PyTorch's global RNG; seeding it is what makes `seed` take effect.
    torch.manual_seed(seed)
    print("prompt: ", prompt)
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        reasoning_effort=REASONING_EFFORT,
    ).to(model.device)

    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": True,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
    }
    if pad_token_id is not None:
        generation_kwargs["pad_token_id"] = pad_token_id

    # Time only the generation call itself.
    start_time = time.time()
    output_ids = model.generate(**inputs, **generation_kwargs)
    latency = time.time() - start_time

    # Decode only the tokens produced after the prompt.
    response = tokenizer.decode(
        output_ids[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )
    print("response:, ", response)
    prediction = extract_answer_from_response(response)

    return {
        "prediction": prediction,
        "response": response,
        "latency_s": latency,
    }


# `dataset` is a single flat Dataset (no splits), so its length is the number
# of usable problems.
num_available = len(dataset)
if NUM_PROBLEMS is None:
    problem_indices = list(range(num_available))
else:
    stop = min(num_available, START_INDEX + NUM_PROBLEMS)
    problem_indices = list(range(START_INDEX, stop))

if not problem_indices:
    raise ValueError("No problems selected for evaluation. Adjust START_INDEX/NUM_PROBLEMS.")

print(f"Evaluating {len(problem_indices)} LLM-List problems with {NUM_GENERATIONS} generations each.")
print(f"Reasoning effort: {REASONING_EFFORT}, temperature: {TEMPERATURE}, top_p: {TOP_P}")

# Success/total tallies per metric, accumulated over all problems and generations.
metrics_counts = {
    "OC": {"success": 0, "total": 0},
    "OR": {"success": 0, "total": 0},
    "MC": {"success": 0, "total": 0},
    "MR": {"success": 0, "total": 0},
}

reversion_skip_count = 0  # problems excluded from OR/MR
all_latencies = []        # one entry per generation
results = []              # one record per problem

# For every selected problem, sample NUM_GENERATIONS predictions for the
# original and the mutated program and tally OC / OR / MC / MR.
for idx in tqdm(problem_indices, desc="Evaluating problems"):
    # `dataset` is the flat Dataset of filtered records; index rows directly.
    sample = dataset[idx]

    original_output = sample['original_outputs'][0]
    if not isinstance(original_output, str):
        original_output = json.dumps(original_output)
    mutated_output = sample['mutated_outputs'][0]
    if not isinstance(mutated_output, str):
        mutated_output = json.dumps(mutated_output)
    has_mutation = True

    original_prompt = build_execution_prediction_prompt(sample, use_mutated=False)
    mutated_prompt = build_execution_prediction_prompt(sample, use_mutated=True)
    print("original_prompt: ", original_prompt)
    print("mutated_prompt: ", mutated_prompt)

    # Problems with a boolean output can be left out of the reversion metrics.
    include_reversion = True
    if SKIP_BOOLEAN_FOR_REVERSION and (is_boolean_output(original_output) or is_boolean_output(mutated_output)):
        include_reversion = False
        reversion_skip_count += 1

    oc_successes = 0
    or_successes = 0
    mc_successes = 0
    mr_successes = 0

    original_predictions = []
    mutated_predictions = []

    # Distinct seed range per problem; mutated-prompt runs are offset by 500.
    seed_base = SEED + idx * 1000

    for gen_idx in range(NUM_GENERATIONS):
        pred_original = generate_prediction(original_prompt, seed_base + gen_idx)
        original_predictions.append(pred_original)
        all_latencies.append(pred_original["latency_s"])

        # OC: prediction for the original program equals the original output.
        is_correct, _ = check_predicted_output(pred_original["prediction"], original_output)
        if is_correct:
            oc_successes += 1

        # OR: prediction for the original program equals the mutated output.
        if include_reversion:
            is_reversion, _ = check_predicted_output(pred_original["prediction"], mutated_output)
            if is_reversion:
                or_successes += 1

        pred_mutated = generate_prediction(mutated_prompt, seed_base + 500 + gen_idx)
        mutated_predictions.append(pred_mutated)
        all_latencies.append(pred_mutated["latency_s"])

        # MC: prediction for the mutated program equals the mutated output.
        is_mutated_correct, _ = check_predicted_output(pred_mutated["prediction"], mutated_output)
        if is_mutated_correct:
            mc_successes += 1

        # MR: prediction for the mutated program equals the original output.
        if include_reversion:
            is_mutated_reversion, _ = check_predicted_output(pred_mutated["prediction"], original_output)
            if is_mutated_reversion:
                mr_successes += 1

    metrics_counts["OC"]["success"] += oc_successes
    metrics_counts["OC"]["total"] += NUM_GENERATIONS

    metrics_counts["MC"]["success"] += mc_successes
    metrics_counts["MC"]["total"] += NUM_GENERATIONS

    if include_reversion:
        metrics_counts["OR"]["success"] += or_successes
        metrics_counts["OR"]["total"] += NUM_GENERATIONS

        metrics_counts["MR"]["success"] += mr_successes
        metrics_counts["MR"]["total"] += NUM_GENERATIONS

    results.append({
        "problem_index": int(idx),
        "problem_id": sample["id"],
        "function_name": parse_function_name(sample['header']),
        "difficulty": sample.get("difficulty"),
        "has_mutation": has_mutation,
        "include_reversion": include_reversion,
        "original_output": original_output,
        "mutated_output": mutated_output,
        "oc_successes": oc_successes,
        "or_successes": or_successes if include_reversion else None,
        "mc_successes": mc_successes,
        "mr_successes": mr_successes if include_reversion else None,
        "original_predictions": original_predictions,
        "mutated_predictions": mutated_predictions,
    })


def compute_pass(counts: dict) -> Optional[float]:
    """Return success/total for a tally dict, or None when total is zero."""
    attempts = counts["total"]
    return counts["success"] / attempts if attempts != 0 else None


# Pass rate per metric (None when a metric has no counted generations).
metrics_summary = {name: compute_pass(tally) for name, tally in metrics_counts.items()}

if all_latencies:
    mean_latency = sum(all_latencies) / len(all_latencies)
else:
    mean_latency = None

benchmark_summary = {
    "dataset": "LLM-List",
    "problems_evaluated": len(problem_indices),
    "generations_per_problem": NUM_GENERATIONS,
    "oc_pass_at_1": metrics_summary["OC"],
    "or_pass_at_1": metrics_summary["OR"],
    "mc_pass_at_1": metrics_summary["MC"],
    "mr_pass_at_1": metrics_summary["MR"],
    "avg_latency_s": mean_latency,
    "reversion_skipped_problems": reversion_skip_count if SKIP_BOOLEAN_FOR_REVERSION else 0,
}

# One-row table used for display.
table_row = {
    "Dataset": "LLM-List",
    "Problems Evaluated": benchmark_summary["problems_evaluated"],
    "Generations per Problem": benchmark_summary["generations_per_problem"],
    "OC pass@1": benchmark_summary["oc_pass_at_1"],
    "OR pass@1": benchmark_summary["or_pass_at_1"],
    "MC pass@1": benchmark_summary["mc_pass_at_1"],
    "MR pass@1": benchmark_summary["mr_pass_at_1"],
}
benchmark_table = pd.DataFrame([table_row])


def _format_pass_rate(v, _pd=pd):
    """Render a rate as a percentage, or 'N/A' when it is missing."""
    return "N/A" if _pd.isna(v) else f"{v:.2%}"


formatters = {
    column: _format_pass_rate
    for column in ("OC pass@1", "OR pass@1", "MC pass@1", "MR pass@1")
}

display(benchmark_table.style.format(formatters))

print("Counts (success / total generations):")
for metric, counts in metrics_counts.items():
    print(f"  {metric}: {counts['success']} / {counts['total']}")

avg_latency_value = benchmark_summary['avg_latency_s']
if avg_latency_value is None:
    print("Average latency per generation: N/A")
else:
    print(f"Average latency per generation: {avg_latency_value:.2f}s")

if SKIP_BOOLEAN_FOR_REVERSION:
    print(f"Problems skipped for reversion metrics (boolean outputs): {reversion_skip_count}")


## Step 7: Plot and Save Results

In [None]:
# Plot execution prediction metrics
import matplotlib.pyplot as plt
import numpy as np

# Collect the four pass rates in plotting order.
metrics = {name: metrics_summary.get(name) for name in ('OC', 'OR', 'MC', 'MR')}
labels = list(metrics)
values = [None if metrics[k] is None else metrics[k] * 100 for k in labels]

bar_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
bar_heights = [0 if v is None else v for v in values]

plt.figure(figsize=(8, 4))
bars = plt.bar(labels, bar_heights, color=bar_colors)
for bar, val in zip(bars, values):
    # Missing metrics are drawn faded and labelled 'N/A'.
    if val is None:
        bar.set_alpha(0.3)
        caption = 'N/A'
    else:
        caption = f'{val:.1f}%'
    x_center = bar.get_x() + bar.get_width() / 2
    plt.text(x_center, bar.get_height(), caption, ha='center', va='bottom')
plt.ylim(0, 100)
plt.ylabel('pass@1 (%)')
plt.title('Execution Prediction Metrics (OC/OR/MC/MR)')
plt.show()

# If plotting in a non-interactive environment, ensure figures display.
plt.close()

# Display raw summary alongside the plot
display(benchmark_table.style.format(formatters))


In [None]:
import json
import math
from datetime import datetime

# The payload below is built from the benchmark cell's results.
if 'benchmark_summary' not in globals():
    raise RuntimeError('Run the benchmark cell first to produce metrics.')

# Timestamped name so repeated runs do not overwrite each other; the name
# reflects the LLM-List dataset evaluated by this notebook.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"gpt_oss_20b_llm_list_benchmark_{timestamp}.json"


def _clean_nan(value):
    """Map a float NaN to None; return every other value unchanged."""
    is_nan = isinstance(value, float) and math.isnan(value)
    return None if is_nan else value


# Everything needed to interpret (and re-run) the prediction benchmark.
payload = {
    "model": "gpt-oss-20b",
    "dataset": benchmark_summary["dataset"],
    "reasoning_effort": REASONING_EFFORT,
    "num_problems": benchmark_summary["problems_evaluated"],
    "num_generations": benchmark_summary["generations_per_problem"],
    # Recorded for parity with execution_choice_config below.
    "start_index": START_INDEX,
    "max_new_tokens": MAX_NEW_TOKENS,
    "temperature": TEMPERATURE,
    "top_p": TOP_P,
    "seed": SEED,
    "skip_boolean_for_reversion": SKIP_BOOLEAN_FOR_REVERSION,
    "reversion_skipped_problems": benchmark_summary["reversion_skipped_problems"],
    "metrics": {k: _clean_nan(v) for k, v in metrics_summary.items()},
    "metrics_counts": metrics_counts,
    "benchmark_summary": {k: _clean_nan(v) for k, v in benchmark_summary.items()},
    "results": results,
}

# The execution-choice results are optional: include them only when that
# benchmark has been run in this session.
if 'execution_choice_summary' in globals():
    payload['execution_choice_summary'] = {k: _clean_nan(v) for k, v in execution_choice_summary.items()}
    payload['execution_choice_counts'] = execution_choice_counts
    payload['execution_choice_results'] = execution_choice_results
    payload['execution_choice_config'] = {
        'num_problems': NUM_PROBLEMS_CHOICE,
        'start_index': START_INDEX_CHOICE,
        'runs_per_problem': NUM_RUNS_PER_PROBLEM,
        'reasoning_effort': REASONING_EFFORT_CHOICE,
        'max_new_tokens': MAX_NEW_TOKENS_CHOICE,
        'temperature': TEMPERATURE_CHOICE,
        'top_p': TOP_P_CHOICE,
        'seed': SEED_CHOICE,
        'skip_boolean_for_reversion': SKIP_BOOLEAN_FOR_REVERSION_CHOICE,
    }

with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2)

print(f"✓ Saved evaluation summary to {output_filename}")

# Offer a browser download when running inside Colab.
try:
    from google.colab import files
    files.download(output_filename)
except ImportError:
    print("(Optional) Run inside Colab to download the file automatically.")


## Step 8: Execution Choice Benchmark (Preference / Correctness / Reversion)

Run the paired-program experiment to measure program preference, correctness, and reversion with order swapping.

In [None]:
# Plot execution choice metrics
import matplotlib.pyplot as plt
import numpy as np

# This cell only visualises results; they are produced by the execution
# choice benchmark cell, which therefore has to be run first.
_required_names = (
    'execution_choice_summary',
    'execution_choice_table',
    'choice_formatters',
    'choice_metric_counts',
    'choice_metric_pass',
)
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise RuntimeError(
        'Run the execution choice benchmark cell first; missing: '
        + ', '.join(_missing_names)
    )

choice_metrics = {
    'Preference (Original)': execution_choice_summary.get('preference_original'),
    'OC Correct': execution_choice_summary.get('oc_correct'),
    'OR Reversion': execution_choice_summary.get('or_reversion'),
    'MC Correct': execution_choice_summary.get('mc_correct'),
    'MR Reversion': execution_choice_summary.get('mr_reversion'),
}
labels_choice = list(choice_metrics.keys())
# Rates are plotted as percentages; None marks a metric with no data.
values_choice = [choice_metrics[k] * 100 if choice_metrics[k] is not None else None for k in labels_choice]

plt.figure(figsize=(9, 4))
bars = plt.bar(labels_choice, [v if v is not None else 0 for v in values_choice], color=['#1f77b4', '#ff7f0e', '#2ca02c', '#9467bd', '#d62728'])
plt.xticks(rotation=20, ha='right')
for bar, val in zip(bars, values_choice):
    if val is None:
        # Missing metrics are drawn faded and labelled 'N/A'.
        bar.set_alpha(0.3)
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), 'N/A', ha='center', va='bottom')
    else:
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f'{val:.1f}%', ha='center', va='bottom')
plt.ylim(0, 100)
plt.ylabel('Percentage (%)')
plt.title('Execution Choice Metrics (Preference / Correctness / Reversion)')
plt.tight_layout()
plt.show()
plt.close()

display(execution_choice_table.style.format(choice_formatters))

print('Counts (success / total runs):')
for metric in ['OC', 'OR', 'MC', 'MR']:
    success, total = choice_metric_counts.get(metric, (0, 0))
    value = choice_metric_pass.get(metric)
    percent = f"{value*100:.1f}%" if value is not None else 'N/A'
    print(f"  {metric}: {success} / {total} ({percent})")


In [None]:
import random
import time
from typing import Optional
import pandas as pd

from tqdm.auto import tqdm

# Execution choice configuration
NUM_PROBLEMS_CHOICE = 20          # Set to None to evaluate the full dataset
START_INDEX_CHOICE = 0
NUM_RUNS_PER_PROBLEM = 2          # Two runs per problem with swapped ordering
REASONING_EFFORT_CHOICE = "low"
MAX_NEW_TOKENS_CHOICE = 5000
TEMPERATURE_CHOICE = 0.6
TOP_P_CHOICE = 0.95
SEED_CHOICE = 123
SKIP_BOOLEAN_FOR_REVERSION_CHOICE = True

# Only one run (original first) or two runs (both orderings) are supported.
_allowed_run_counts = (1, 2)
if NUM_RUNS_PER_PROBLEM not in _allowed_run_counts:
    raise ValueError('NUM_RUNS_PER_PROBLEM must be 1 or 2 for the ordering swap protocol.')

# Seed both RNGs used by this cell.
for _seed_fn in (torch.manual_seed, random.seed):
    _seed_fn(SEED_CHOICE)


def generate_choice_response(prompt: str, seed: int) -> dict:
    """Generate a model response for the execution choice prompt.

    Args:
        prompt: User message sent to the model.
        seed: Seed applied before sampling so each call is reproducible.

    Returns:
        A dict with 'response' (the decoded generation) and 'latency_s'
        (wall-clock seconds spent in ``model.generate``).
    """
    # generate() is not handed a generator here, so sampling draws from
    # PyTorch's global RNG; seeding it is what makes `seed` take effect.
    torch.manual_seed(seed)
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        reasoning_effort=REASONING_EFFORT_CHOICE,
    ).to(model.device)

    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS_CHOICE,
        "do_sample": True,
        "temperature": TEMPERATURE_CHOICE,
        "top_p": TOP_P_CHOICE,
    }

    # Padding token: the EOS id when defined, else the tokenizer's pad id.
    pad_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id
    if pad_id is not None:
        generation_kwargs["pad_token_id"] = pad_id

    # Time only the generation call itself.
    start_time = time.time()
    output_ids = model.generate(**inputs, **generation_kwargs)
    latency = time.time() - start_time

    # Decode only the tokens produced after the prompt.
    response = tokenizer.decode(
        output_ids[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )

    return {
        "response": response,
        "latency_s": latency,
    }


# `dataset` is a single flat Dataset (no splits), so its length is the number
# of usable problems.
num_available_choice = len(dataset)
if NUM_PROBLEMS_CHOICE is None:
    problem_indices_choice = list(range(num_available_choice))
else:
    stop_choice = min(num_available_choice, START_INDEX_CHOICE + NUM_PROBLEMS_CHOICE)
    problem_indices_choice = list(range(START_INDEX_CHOICE, stop_choice))

if not problem_indices_choice:
    raise ValueError('No problems selected. Adjust START_INDEX_CHOICE / NUM_PROBLEMS_CHOICE.')

print(f"Evaluating execution choice on {len(problem_indices_choice)} problems with {NUM_RUNS_PER_PROBLEM} run(s) each.")
print(f"Reasoning effort: {REASONING_EFFORT_CHOICE}, temperature: {TEMPERATURE_CHOICE}, top_p: {TOP_P_CHOICE}")

# 'OC' tallies runs where the original program was chosen, 'MC' runs where the
# mutated program was chosen; each also tracks its reversion counts.
execution_choice_counts = {
    'preference': {'original': 0, 'mutated': 0, 'total': 0},
    'OC': {'correct': 0, 'total': 0, 'reversion_correct': 0, 'reversion_total': 0},
    'MC': {'correct': 0, 'total': 0, 'reversion_correct': 0, 'reversion_total': 0},
    'invalid_runs': 0,
}

execution_choice_results = []
execution_choice_latencies = []
reversion_skip_count_choice = 0

orderings = [True, False]  # True => original program first, False => mutated first
selected_orderings = orderings[:NUM_RUNS_PER_PROBLEM]

# For every selected problem, show both programs (in each selected ordering),
# record which one the model chose and score its answer against that program's
# expected output (correctness) and the other program's output (reversion).
for idx in tqdm(problem_indices_choice, desc='Execution choice'):
    # `dataset` is the flat Dataset of filtered records; expected outputs are
    # stored as one-element lists of strings.
    sample = dataset[idx]
    original_output = sample['original_outputs'][0]
    mutated_output = sample['mutated_outputs'][0]
    function_name = parse_function_name(sample['header'])

    # Problems with a boolean output can be left out of the reversion metrics.
    include_reversion = True
    if SKIP_BOOLEAN_FOR_REVERSION_CHOICE and (
        is_boolean_output(original_output) or is_boolean_output(mutated_output)
    ):
        include_reversion = False
        reversion_skip_count_choice += 1

    base_seed = SEED_CHOICE + idx * 1000

    for run_offset, original_first in enumerate(selected_orderings):
        prompt, mapping = build_execution_choice_prompt(sample, original_first=original_first)
        generation = generate_choice_response(prompt, base_seed + run_offset)
        execution_choice_latencies.append(generation['latency_s'])

        run_record = {
            'problem_index': int(idx),
            'problem_id': sample['id'],
            'function_name': function_name,
            'run_index': run_offset,
            'original_first': original_first,
            'response': generation['response'],
            'latency_s': generation['latency_s'],
            'include_reversion': include_reversion,
            'chosen_program_letter': None,
            'chosen_program_type': None,
            'prediction': None,
            'correct_for_chosen_program': None,
            'reversion_for_other_program': None,
            'error': None,
        }

        # The prompt asks for JSON inside [ANSWER] tags, so strip the tags
        # (and any surrounding reasoning text) before parsing.
        try:
            parsed = parse_execution_choice_response(
                extract_answer_from_response(generation['response'])
            )
        except ValueError as exc:
            run_record['error'] = str(exc)
            execution_choice_counts['invalid_runs'] += 1
            execution_choice_results.append(run_record)
            continue

        if not isinstance(parsed, dict) or 'answer' not in parsed:
            run_record['error'] = "Choice response is not a JSON object with an 'answer' field"
            execution_choice_counts['invalid_runs'] += 1
            execution_choice_results.append(run_record)
            continue

        chosen_letter = str(parsed.get('chosen_program', '')).strip().upper()
        chosen_type = mapping.get(chosen_letter)

        if chosen_type not in {'original', 'mutated'}:
            run_record['error'] = f"Unrecognized chosen program letter: {chosen_letter}"
            execution_choice_counts['invalid_runs'] += 1
            execution_choice_results.append(run_record)
            continue

        # Expected outputs are text, so non-string answers are JSON-encoded
        # the same way the dataset cell encodes non-string outputs.
        answer_value = parsed['answer']
        predicted_output = answer_value if isinstance(answer_value, str) else json.dumps(answer_value)
        chosen_output = original_output if chosen_type == 'original' else mutated_output
        other_output = mutated_output if chosen_type == 'original' else original_output

        is_correct, correctness_error = check_predicted_output(predicted_output, chosen_output)
        if include_reversion:
            is_reversion, reversion_error = check_predicted_output(predicted_output, other_output)
        else:
            is_reversion, reversion_error = None, None

        execution_choice_counts['preference']['total'] += 1
        if chosen_type == 'original':
            execution_choice_counts['preference']['original'] += 1
            bucket = execution_choice_counts['OC']
        else:
            execution_choice_counts['preference']['mutated'] += 1
            bucket = execution_choice_counts['MC']

        bucket['total'] += 1
        if is_correct:
            bucket['correct'] += 1
        if include_reversion:
            bucket['reversion_total'] += 1
            if is_reversion:
                bucket['reversion_correct'] += 1

        run_record.update({
            'chosen_program_letter': chosen_letter,
            'chosen_program_type': chosen_type,
            'answer': answer_value,
            'prediction': predicted_output,
            'correct_for_chosen_program': bool(is_correct),
            'reversion_for_other_program': bool(is_reversion) if include_reversion else None,
            'correctness_error': correctness_error,
            'reversion_error': reversion_error,
        })

        execution_choice_results.append(run_record)


def _safe_ratio(numerator: int, denominator: int) -> Optional[float]:
    """Return numerator / denominator, or None when the denominator is zero."""
    if denominator == 0:
        return None
    return numerator / denominator


# Short aliases for the three tally groups.
pref_counts = execution_choice_counts['preference']
oc_counts = execution_choice_counts['OC']
mc_counts = execution_choice_counts['MC']

# Preference: share of valid runs in which each program type was chosen.
preference_total = pref_counts['total']
preference_original_rate = _safe_ratio(pref_counts['original'], preference_total)
preference_mutated_rate = _safe_ratio(pref_counts['mutated'], preference_total)

# Correctness and reversion rates, split by the chosen program type.
oc_correct_rate = _safe_ratio(oc_counts['correct'], oc_counts['total'])
or_reversion_rate = _safe_ratio(oc_counts['reversion_correct'], oc_counts['reversion_total'])
mc_correct_rate = _safe_ratio(mc_counts['correct'], mc_counts['total'])
mr_reversion_rate = _safe_ratio(mc_counts['reversion_correct'], mc_counts['reversion_total'])

if execution_choice_latencies:
    avg_choice_latency = sum(execution_choice_latencies) / len(execution_choice_latencies)
else:
    avg_choice_latency = None

if SKIP_BOOLEAN_FOR_REVERSION_CHOICE:
    skipped_for_reversion = reversion_skip_count_choice
else:
    skipped_for_reversion = 0

execution_choice_summary = {
    'dataset': 'LLM-List',
    'problems_evaluated': len(problem_indices_choice),
    'runs_per_problem': NUM_RUNS_PER_PROBLEM,
    'preference_original': preference_original_rate,
    'preference_mutated': preference_mutated_rate,
    'oc_correct': oc_correct_rate,
    'or_reversion': or_reversion_rate,
    'mc_correct': mc_correct_rate,
    'mr_reversion': mr_reversion_rate,
    'avg_latency_s': avg_choice_latency,
    'invalid_runs': execution_choice_counts['invalid_runs'],
    'reversion_skipped_problems': skipped_for_reversion,
}

# Rate per metric and its (success, total) counts, used by the plotting cell.
choice_metric_pass = {
    'OC': execution_choice_summary['oc_correct'],
    'OR': execution_choice_summary['or_reversion'],
    'MC': execution_choice_summary['mc_correct'],
    'MR': execution_choice_summary['mr_reversion'],
}
choice_metric_counts = {
    'OC': (execution_choice_counts['OC']['correct'], execution_choice_counts['OC']['total']),
    'OR': (execution_choice_counts['OC']['reversion_correct'], execution_choice_counts['OC']['reversion_total']),
    'MC': (execution_choice_counts['MC']['correct'], execution_choice_counts['MC']['total']),
    'MR': (execution_choice_counts['MC']['reversion_correct'], execution_choice_counts['MC']['reversion_total']),
}

# One-row table used for display.
execution_choice_table = pd.DataFrame([{
    'Dataset': 'LLM-List',
    'Problems Evaluated': execution_choice_summary['problems_evaluated'],
    'Runs per Problem': execution_choice_summary['runs_per_problem'],
    'Preference (Original)': execution_choice_summary['preference_original'],
    'OC Correct': execution_choice_summary['oc_correct'],
    'OR Reversion': execution_choice_summary['or_reversion'],
    'MC Correct': execution_choice_summary['mc_correct'],
    'MR Reversion': execution_choice_summary['mr_reversion'],
}])

def _format_choice_rate(v, _pd=pd):
    """Render a rate as a percentage, or 'N/A' when it is missing."""
    return 'N/A' if _pd.isna(v) else f"{v:.2%}"


choice_formatters = {
    column: _format_choice_rate
    for column in (
        'Preference (Original)',
        'OC Correct',
        'OR Reversion',
        'MC Correct',
        'MR Reversion',
    )
}

display(execution_choice_table.style.format(choice_formatters))

# Raw counts behind the preference rates.
preference_counts = execution_choice_counts['preference']
print('Preference counts:')
print(f"  Original: {preference_counts['original']} / {preference_total}")
print(f"  Mutated: {preference_counts['mutated']} / {preference_total}")
print(f"Invalid runs (no usable JSON response): {execution_choice_counts['invalid_runs']}")

avg_run_latency = execution_choice_summary['avg_latency_s']
if avg_run_latency is None:
    print('Average latency per run: N/A')
else:
    print(f"Average latency per run: {avg_run_latency:.2f}s")

if SKIP_BOOLEAN_FOR_REVERSION_CHOICE:
    print(f"Problems skipped for reversion metrics (boolean outputs): {reversion_skip_count_choice}")


## Step 9: Compare Reasoning Efforts (Optional)

Evaluate a small subset with different reasoning effort settings.

In [None]:
# Compare low vs medium vs high reasoning
reasoning_levels = ["low", "medium", "high"]
comparison_results = {}

NUM_COMPARISON_SAMPLES = 5  # Use small number for comparison
MAX_NEW_TOKENS = 1000  # Allow enough tokens for reasoning + answer

# `dataset` is the flat Dataset of filtered records; never index past its end.
num_comparison_samples = min(NUM_COMPARISON_SAMPLES, len(dataset))
if num_comparison_samples == 0:
    raise ValueError("No samples available for the reasoning effort comparison.")

for reasoning_effort in reasoning_levels:
    print(f"\nTesting reasoning_effort={reasoning_effort}...")
    correct = 0
    total_latency = 0

    for idx in range(num_comparison_samples):
        sample = dataset[idx]

        prompt = build_execution_prediction_prompt(sample)
        messages = [
            {"role": "user", "content": prompt}
        ]

        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
            reasoning_effort=reasoning_effort,
        ).to(model.device)

        # Time only the generation call itself.
        start_time = time.time()
        generated = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
        latency = time.time() - start_time
        total_latency += latency

        # Decode only the tokens produced after the prompt.
        response = tokenizer.decode(
            generated[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        )

        # The prompt shows the original program, so score against its output.
        predicted_output = extract_answer_from_response(response)
        is_correct, _ = check_predicted_output(
            predicted_output, sample['original_outputs'][0]
        )

        if is_correct:
            correct += 1

    comparison_results[reasoning_effort] = {
        "correct": correct,
        "pass@1": correct / num_comparison_samples,
        "avg_latency": total_latency / num_comparison_samples
    }

# Print comparison
print("\n" + "="*60)
print("REASONING EFFORT COMPARISON")
print("="*60)
print(f"{'Reasoning':<12} {'pass@1':<10} {'Avg Latency':<15}")
print("-" * 60)
for level, stats in comparison_results.items():
    print(f"{level:<12} {stats['pass@1']*100:>6.1f}%   {stats['avg_latency']:>10.2f}s")
print("="*60)

## Next Steps

1. **Increase `NUM_PROBLEMS`** (and `NUM_PROBLEMS_CHOICE`) to evaluate on more problems; set them to `None` to use the full dataset
2. **Try different reasoning_effort** levels
3. **Compare with other models** (DeepSeek-R1, GPT-4o, etc.)
4. **Analyze error patterns** to understand model weaknesses

---

**Note**: To run on full dataset (347 samples), expect ~1-2 hours on free Colab.