In [None]:
%load_ext autoreload
%autoreload 2
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import os
from tqdm import tqdm
import re
from typing import Any

# Filesystem locations used by this notebook: the Qwen checkpoint that the
# tokenizer and model are loaded from, and the AlpacaEval test file, which is
# read one JSON object per line further down.
model_path = "../models/Qwen/Qwen2.5-0.5B"
alpaca_eval_path = "../data/alpaca_eval/alpaca_eval.jsonl"

# Tokenizer and causal LM both come from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.float16,  # load the weights in half precision
    device_map="auto",  # let the library decide device placement
)

# Build the text that is fed to the model for one AlpacaEval instruction.
def format_alpaca_prompt(instruction):
    """Return the prompt for ``instruction``.

    No template is applied: the prompt is simply the instruction rendered
    as a string.
    """
    return f"{instruction}"

# Generate a single response with greedy decoding. `do_sample=False` is what
# the "temperature 0.0, top-p 1.0" setting amounts to, so the sampling-only
# parameters are not passed to `generate`.
def generate_answer(prompt):
    """Return the model's greedy continuation of ``prompt``, without the prompt.

    Args:
        prompt: Text passed to the tokenizer as-is.

    Returns:
        The decoded newly generated tokens (at most 512 of them), with special
        tokens removed and surrounding whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Pass the padding id explicitly so `generate` does not log its
    # "Setting `pad_token_id` to `eos_token_id`" notice on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            pad_token_id=pad_token_id,
        )

    # The returned sequence starts with the prompt tokens. Keep only what was
    # generated after them, so the instruction is not echoed into the response.
    # (Splitting the decoded text on a "Response:" marker cannot do this: the
    # prompt contains no such marker, and a marker inside the answer would
    # truncate it.)
    new_tokens = output[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Load AlpacaEval data: one JSON object per line. Only the instruction and the
# source dataset name are kept from each record.
eval_set = []
with open(alpaca_eval_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. an empty line at the end of the file) instead
        # of letting json.loads fail on them.
        if not line.strip():
            continue
        example = json.loads(line)
        eval_set.append({
            "instruction": example["instruction"],
            "dataset": example.get("dataset", "alpaca_eval")  # Default to alpaca_eval if not provided
        })

print(f"Loaded {len(eval_set)} examples from AlpacaEval")

# Label stored in every record's "generator" field. The model is Qwen2.5-0.5B;
# the previous spelling "gwen" was a typo.
model_id = "qwen2.5-0.5b"

# Generate predictions: each record in eval_set is extended in place with the
# model's response and the generator label.
for example in tqdm(eval_set, desc="Generating AlpacaEval predictions"):
    prompt = format_alpaca_prompt(example["instruction"])
    output = generate_answer(prompt)
    example["output"] = output
    example["generator"] = model_id

# Write every record to one JSON file, creating its directory first if it
# does not exist yet.
output_file = "evaluation_results/alpaca_eval_predictions.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, "w") as handle:
    json.dump(eval_set, handle, indent=2)

print(f"Saved AlpacaEval predictions to {output_file}")

# Show the first record as a quick sanity check (output truncated to its first
# 100 characters). Nothing is printed when there are no records.
if eval_set:
    sample = eval_set[0]
    print(
        "\nSample prediction:",
        f"Instruction: {sample['instruction']}",
        f"Output: {sample['output'][:100]}...",
        f"Generator: {sample['generator']}",
        f"Dataset: {sample['dataset']}",
        sep="\n",
    )

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loaded 805 examples from AlpacaEval


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Generating AlpacaEval predictions:   0%|          | 1/805 [00:06<1:31:05,  6.80s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Generating AlpacaEval predictions:   0%|          | 2/805 [00:13<1:28:10,  6.59s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Generating AlpacaEval predictions:   0%|          | 3/805 [00:17<1:11:43,  5.37s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Generating AlpacaEval predictions:   0%|          | 4/805 [00:24<1:19:41,  5.97s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Generating AlpacaEval predictions:   1%|          | 5/805 [00:30<1:23:26,  6.26s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Generating AlpacaEval predictions:   1%|          | 6/805 [00:31<57:37,  4.33s/it]  Setting `pad_token_id` to `eos_token_id`:151643 for open-e

Saved AlpacaEval predictions to evaluation_results/alpaca_eval_predictions.json

Sample prediction:
Instruction: What are the names of some famous actors that started their careers on Broadway?
Output: What are the names of some famous actors that started their careers on Broadway? Some famous actors ...
Generator: gwen2.5-0.5b
Dataset: helpful_base



