# GPT-OSS 20B Evaluation on LeetCode Dataset

This notebook evaluates OpenAI's gpt-oss-20b model on the LeetCode contests dataset using **Execution Prediction**.

**Methodology**: Instead of generating code, the model is given a Python program and must predict the function's return value for a given input without running the code (following the paper's approach).

**Requirements**:
- Free Google Colab (T4 GPU)
- HuggingFace account (to load dataset)

**Author**: Code Reasoning Reproduction Team  
**Date**: 2025

## Step 1: Setup Environment

Install required packages for mxfp4 quantization support.

In [None]:
# Install the packages needed to run the model with mxfp4 quantization support.
# The leading `!` is an IPython shell escape, so this cell only runs in a notebook.
# Upgrade PyTorch to the newest available release.
!pip install -q --upgrade torch
# Install transformers (unpinned), triton pinned to 3.4, and the `kernels` package.
!pip install -q transformers triton==3.4 kernels
# Remove torchvision and torchaudio.
# NOTE(review): presumably because the preinstalled builds no longer match the
# upgraded torch - confirm.
!pip uninstall -q torchvision torchaudio -y

# Install the datasets library (used in Step 3 to load the LeetCode dataset).
!pip install -q datasets

⚠️ **IMPORTANT**: Please restart your Colab runtime after running the cell above.

Click: **Runtime → Restart runtime**

## Step 2: Load GPT-OSS 20B Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

print("Loading gpt-oss-20b model...")
# Report whether a CUDA device is visible and, if so, the name of device 0.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Model identifier; the tokenizer and the weights are both loaded from it.
model_id = "openai/gpt-oss-20b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): torch_dtype="auto" presumably defers to the dtype recorded in the
# checkpoint - confirm against the transformers documentation.
# device_map="cuda" requests placement on the CUDA device, so a GPU runtime is required.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="cuda",
)

print("✓ Model loaded successfully!")

## Step 3: Load LeetCode Dataset from HuggingFace

In [None]:
from datasets import load_dataset

# TODO: Replace with your HuggingFace dataset repo ID
# The placeholder below must be edited before this cell can load anything.
DATASET_REPO_ID = "YOUR_USERNAME/leetcode-contests-431-467"

print(f"Loading dataset from {DATASET_REPO_ID}...")
dataset = load_dataset(DATASET_REPO_ID)

# All later cells read samples from the 'train' split.
print(f"\n✓ Dataset loaded!")
print(f"Total samples: {len(dataset['train'])}")
# Preview the first sample to confirm the fields the notebook relies on
# ('id', 'function_name', 'difficulty', 'input') are present.
print(f"\nFirst sample:")
sample = dataset['train'][0]
print(f"  ID: {sample['id']}")
print(f"  Function: {sample['function_name']}")
print(f"  Difficulty: {sample['difficulty']}")
# Only the first 100 characters of the input are shown to keep the preview short.
print(f"  Input: {sample['input'][:100]}...")

## Step 4: Define Helper Functions

In [None]:
import re
from typing import Dict, List, Tuple

def build_prompt(sample: Dict) -> str:
    """
    Render the zero-shot execution-prediction prompt for one dataset sample.

    The prompt shows the sample's program followed by an assertion whose
    right-hand side is ``??``; the model is asked to reply with the completed
    assertion wrapped in [ANSWER] and [/ANSWER] tags.

    Args:
        sample: Mapping that provides ``'function_name'``, ``'code'`` and
            ``'input'``. The input may be either a full call such as
            ``maxLength(nums=[1,2,3])`` or just the argument list.

    Returns:
        The prompt text: instructions, a blank line, then the program and the
        incomplete assertion between [PYTHON] and [/PYTHON].
    """
    name = sample['function_name']
    program = sample['code']
    raw_input = sample['input']

    # When the input is already written as "name(args)", keep only "args" so
    # the function name is not repeated inside the assertion.
    call_open = f"{name}("
    is_full_call = raw_input.startswith(call_open) and raw_input.endswith(")")
    call_args = raw_input[len(call_open):-1] if is_full_call else raw_input

    instructions = (
        "You are given a Python program and an assertion containing an input to a function. "
        "Replace the ?? in the assertion with a literal (no unsimplified expressions, no function calls) "
        "representing the function's return value for the given input. "
        "Execute the program exactly as written, even if it is incorrect or incomplete. "
        "For your final answer, provide the full assertion in [ANSWER] and [/ANSWER] tags."
    )

    return "\n".join([
        instructions,
        "",
        "[PYTHON]",
        f"{program}",
        f"assert {name}({call_args}) == ??",
        "[/PYTHON]",
    ])


def _assertion_values(text: str) -> List[str]:
    """Return the right-hand side of each ``assert <expr> == <value>`` in *text*, in order."""
    import ast  # local import, matching how check_predicted_output pulls in ast

    assert_keyword = re.compile(r'\bassert\s')
    values = []
    pos = 0
    while True:
        found = assert_keyword.search(text, pos)
        if found is None:
            break

        # Assertions are expected to fit on one line. Models sometimes wrap the
        # statement in markdown backticks, so trailing ones are dropped.
        line_end = text.find('\n', found.start())
        if line_end == -1:
            line_end = len(text)
        line = text[found.start():line_end].strip().strip('`').strip()

        value = None
        try:
            statement = ast.parse(line).body[0]
        except (SyntaxError, ValueError, IndexError):
            statement = None
        if (
            isinstance(statement, ast.Assert)
            and isinstance(statement.test, ast.Compare)
            and len(statement.test.ops) == 1
            and isinstance(statement.test.ops[0], ast.Eq)
        ):
            # Using the parser instead of a regex keeps nested parentheses and
            # ")" inside string arguments from confusing the extraction.
            value = ast.get_source_segment(line, statement.test.comparators[0])

        if value is None:
            # The line is not valid Python (e.g. the echoed "== ??"); fall back
            # to the simple pattern for flat argument lists.
            legacy = re.search(r'assert\s+\w+\([^)]*\)\s*==\s*(.+)', line)
            if legacy:
                value = legacy.group(1)

        if value is not None and value.strip():
            values.append(value.strip())
            # Skip the rest of this line so an "assert" inside a string
            # argument is not picked up as a second assertion.
            pos = line_end
        else:
            pos = found.end()
    return values


def extract_answer_from_response(response: str) -> str:
    """
    Extract the predicted return value from a model response.

    Expected format: [ANSWER] assert function_name(input) == output [/ANSWER]
    Only the ``output`` part is returned.

    The last [ANSWER] block is used, because the response may contain the
    model's reasoning, which can repeat the instruction text
    ("... in [ANSWER] and [/ANSWER] tags") before the real answer. For the same
    reason the last assertion wins when several are present.

    Args:
        response: Decoded model output.

    Returns:
        - the value after ``==`` in the final answer's assertion; or
        - the stripped content of the last [ANSWER] block when it holds no
          parsable assertion; or
        - with no tags at all, the value from the last assertion found
          anywhere in the response; or
        - the stripped response when nothing matches.
    """
    tagged = re.findall(r'\[ANSWER\](.*?)\[/ANSWER\]', response, re.DOTALL | re.IGNORECASE)
    if tagged:
        final_block = tagged[-1].strip()
        values = _assertion_values(final_block)
        # If we can't parse it, return the whole block.
        return values[-1] if values else final_block

    # Fallback: look for "assert ... == VALUE" anywhere in the response.
    values = _assertion_values(response)
    if values:
        return values[-1]

    # Return as-is if nothing recognisable was found.
    return response.strip()


def check_predicted_output(predicted_output: str, expected_output: str) -> Tuple[bool, str]:
    """
    Decide whether the model's predicted value matches the expected value.

    Both inputs are stripped and compared as text first. If they differ, both
    are parsed as Python literals and the resulting values are compared, so
    formatting differences such as ``[1,2]`` vs ``[1, 2]`` still count as a
    match.

    Returns:
        ``(True, None)`` on a match, otherwise ``(False, message)`` where the
        message shows both values, or the exception text if anything
        unexpected was raised (for example a non-string argument).
    """
    import ast

    try:
        got = predicted_output.strip()
        want = expected_output.strip()

        same = got == want
        if not same:
            try:
                # Values that only differ in formatting compare equal once parsed.
                same = ast.literal_eval(got) == ast.literal_eval(want)
            except (ValueError, SyntaxError):
                # Not parsable as literals: the text comparison above stands.
                pass

        if same:
            return (True, None)
        return (False, f"Predicted: {got}, Expected: {want}")

    except Exception as exc:
        return (False, str(exc))


print("✓ Helper functions defined")

## Step 5: Test Execution Prediction on One Sample

This cell demonstrates the **Execution Prediction** task:
1. Model receives a Python program and an assertion with `??`
2. Model predicts what the output will be
3. Model provides answer in `[ANSWER]...[/ANSWER]` tags

In [None]:
# Smoke test: run the full predict -> extract -> check pipeline on the first sample
# and print every intermediate result for manual inspection.
test_sample = dataset['train'][0]

prompt = build_prompt(test_sample)
print("Prompt:")
print(prompt)
print("\n" + "="*60)

# Generate prediction with gpt-oss
# The prompt is sent as a single user turn.
messages = [
    {"role": "user", "content": prompt}
]

# Render the chat template to tensors and move them to the model's device.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="medium",  # Can be "low", "medium", or "high"
).to(model.device)

print("Generating prediction...")
# Increase max_new_tokens to allow for reasoning + answer
generated = model.generate(**inputs, max_new_tokens=1000)
# Skip the first len(input_ids) tokens (the prompt) so only the continuation is decoded.
response = tokenizer.decode(generated[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

print("\nGenerated Response:")
print(response)
print("\n" + "="*60)

# Extract predicted output
predicted_output = extract_answer_from_response(response)
print("\nExtracted Predicted Output:")
print(predicted_output)
print("\n" + "="*60)

# Check correctness against the sample's recorded 'output' field
is_correct, error = check_predicted_output(
    predicted_output,
    test_sample['output']
)

print(f"\nTest Result: {'✓ CORRECT' if is_correct else '✗ INCORRECT'}")
# `error` is None on a match, so this only prints for mismatches or failures.
if error:
    print(f"Error: {error}")
print(f"Expected: {test_sample['output']}")

## Step 6: Evaluate on Multiple Samples

**Note**: Evaluating all 347 samples will take a while. Start with a subset.

In [None]:
import time
from tqdm.auto import tqdm

# Configuration
NUM_SAMPLES = 10  # Start small, increase later
REASONING_EFFORT = "medium"  # "low", "medium", or "high"
MAX_NEW_TOKENS = 1000  # Increased to allow for reasoning + answer

# Never index past the end of the split. The sample lookup below happens
# outside the try block, so an oversized NUM_SAMPLES would otherwise abort the
# cell with an IndexError after a long run, before any summary is printed.
available_samples = len(dataset['train'])
if NUM_SAMPLES > available_samples:
    print(f"Requested {NUM_SAMPLES} samples but only {available_samples} are available; evaluating {available_samples}.")
NUM_SAMPLES = max(0, min(NUM_SAMPLES, available_samples))

# One dict per evaluated sample; read by the summary and save cells.
results = []
correct_count = 0

print(f"Evaluating {NUM_SAMPLES} samples with reasoning_effort={REASONING_EFFORT}...\n")

for idx in tqdm(range(NUM_SAMPLES)):
    sample = dataset['train'][idx]

    try:
        # Build prompt
        prompt = build_prompt(sample)

        # Generate prediction (the prompt is sent as a single user turn)
        messages = [
            {"role": "user", "content": prompt}
        ]

        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
            reasoning_effort=REASONING_EFFORT,
        ).to(model.device)

        # Only the generate() call is timed.
        start_time = time.time()
        generated = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
        latency = time.time() - start_time

        # Skip the prompt tokens so only the continuation is decoded.
        response = tokenizer.decode(
            generated[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        )

        # Extract predicted output
        predicted_output = extract_answer_from_response(response)

        # Check correctness
        is_correct, error = check_predicted_output(
            predicted_output,
            sample['output']
        )

        if is_correct:
            correct_count += 1

        results.append({
            "problem_id": sample['id'],
            "function_name": sample['function_name'],
            "difficulty": sample['difficulty'],
            "correct": is_correct,
            "error": error,
            "latency_s": latency,
            "predicted_output": predicted_output,
            "expected_output": sample['output']
        })

    except Exception as e:
        # Best effort: record the failure as an incorrect prediction and keep
        # going, so one bad sample does not discard the whole run.
        print(f"\nError on sample {idx}: {e}")
        results.append({
            "problem_id": sample['id'],
            "function_name": sample['function_name'],
            "difficulty": sample['difficulty'],
            "correct": False,
            "error": str(e),
            "latency_s": 0,
            "predicted_output": "",
            "expected_output": sample['output']
        })

# Print results
print("\n" + "="*60)
print("EVALUATION RESULTS")
print("="*60)
print(f"Model: gpt-oss-20b")
print(f"Reasoning effort: {REASONING_EFFORT}")
print(f"Total samples: {NUM_SAMPLES}")
print(f"Correct: {correct_count}")
# Guard against an empty run (NUM_SAMPLES == 0) instead of dividing by zero.
pass_at_1_pct = correct_count / NUM_SAMPLES * 100 if NUM_SAMPLES else 0.0
print(f"pass@1: {pass_at_1_pct:.2f}%")
# Failed samples are recorded with latency_s == 0; leave them out so they do
# not drag the average below the real generation time.
timed_latencies = [r['latency_s'] for r in results if r['latency_s'] > 0]
avg_latency = sum(timed_latencies) / len(timed_latencies) if timed_latencies else 0.0
print(f"Average latency: {avg_latency:.2f}s")
print("="*60)

# Breakdown by difficulty
from collections import defaultdict
by_diff = defaultdict(lambda: {'total': 0, 'correct': 0})
for r in results:
    # Normalise the label so "Easy" and "easy" share one bucket.
    label = str(r['difficulty']).strip().lower()
    by_diff[label]['total'] += 1
    if r['correct']:
        by_diff[label]['correct'] += 1

print("\nBy Difficulty:")
# Show the usual labels first, then any other label found in the data, so no
# result is silently left out of the breakdown.
known_order = ['easy', 'medium', 'hard']
ordered_labels = [d for d in known_order if d in by_diff]
ordered_labels += sorted(d for d in by_diff if d not in known_order)
for diff in ordered_labels:
    total = by_diff[diff]['total']
    correct = by_diff[diff]['correct']
    print(f"  {diff.capitalize()}: {correct}/{total} ({correct/total*100:.1f}%)")

## Step 7: Save Results

In [None]:
import json
from datetime import datetime

# Save results to JSON
# The file name carries the reasoning effort and a timestamp so repeated runs
# do not overwrite each other.
output_filename = f"gpt_oss_20b_results_{REASONING_EFFORT}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

output_data = {
    "model": "gpt-oss-20b",
    "reasoning_effort": REASONING_EFFORT,
    "num_samples": NUM_SAMPLES,
    "correct_count": correct_count,
    # Guard against an empty run instead of dividing by zero.
    "pass_at_1": correct_count / NUM_SAMPLES if NUM_SAMPLES else 0.0,
    "results": results
}

with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2)

print(f"✓ Results saved to: {output_filename}")

# Download the file
# The browser download only exists inside Google Colab. Elsewhere the file is
# already on disk, so skip the download instead of failing on the import.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; skipping browser download.")
else:
    files.download(output_filename)

## Step 8: Compare Reasoning Efforts (Optional)

Evaluate with different reasoning efforts to see the impact.

In [None]:
# Compare low vs medium vs high reasoning
import time  # imported here so this optional cell also runs when Step 6 was skipped

reasoning_levels = ["low", "medium", "high"]
# Maps each reasoning level to its "correct", "pass@1" and "avg_latency" stats.
comparison_results = {}

NUM_COMPARISON_SAMPLES = 5  # Use small number for comparison
MAX_NEW_TOKENS = 1000  # Allow enough tokens for reasoning + answer

# Never index past the end of the split.
NUM_COMPARISON_SAMPLES = max(0, min(NUM_COMPARISON_SAMPLES, len(dataset['train'])))

for reasoning_effort in reasoning_levels:
    print(f"\nTesting reasoning_effort={reasoning_effort}...")
    correct = 0
    total_latency = 0

    # Every level is scored on the same leading samples so the numbers are comparable.
    for idx in range(NUM_COMPARISON_SAMPLES):
        sample = dataset['train'][idx]

        prompt = build_prompt(sample)
        messages = [
            {"role": "user", "content": prompt}
        ]

        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
            reasoning_effort=reasoning_effort,
        ).to(model.device)

        # Only the generate() call is timed.
        start_time = time.time()
        generated = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
        latency = time.time() - start_time
        total_latency += latency

        # Skip the prompt tokens so only the continuation is decoded.
        response = tokenizer.decode(
            generated[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        )

        predicted_output = extract_answer_from_response(response)
        is_correct, _ = check_predicted_output(
            predicted_output, sample['output']
        )

        if is_correct:
            correct += 1

    # Guard the averages against an empty comparison set.
    comparison_results[reasoning_effort] = {
        "correct": correct,
        "pass@1": correct / NUM_COMPARISON_SAMPLES if NUM_COMPARISON_SAMPLES else 0.0,
        "avg_latency": total_latency / NUM_COMPARISON_SAMPLES if NUM_COMPARISON_SAMPLES else 0.0
    }

# Print comparison
print("\n" + "="*60)
print("REASONING EFFORT COMPARISON")
print("="*60)
print(f"{'Reasoning':<12} {'pass@1':<10} {'Avg Latency':<15}")
print("-" * 60)
for level, stats in comparison_results.items():
    print(f"{level:<12} {stats['pass@1']*100:>6.1f}%   {stats['avg_latency']:>10.2f}s")
print("="*60)

## Next Steps

1. **Increase NUM_SAMPLES** to evaluate on more problems
2. **Try different reasoning_effort** levels
3. **Compare with other models** (DeepSeek-R1, GPT-4o, etc.)
4. **Analyze error patterns** to understand model weaknesses

---

**Note**: Running on the full dataset (347 samples) is expected to take ~1-2 hours on free Colab.