In [None]:
%load_ext autoreload
%autoreload 2
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import os
from tqdm import tqdm
import re
from typing import Any

# Locations of the fine-tuned Qwen checkpoint and the AlpacaEval test file.
model_path = "/home/alvin/Homework/s2025-assignment3-alignment/notebooks/qwen2.5-3B-instruct-finetuned/final_model"
alpaca_eval_path = "../data/alpaca_eval/alpaca_eval.jsonl"

# Load the tokenizer and the causal LM from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Half-precision weights; device_map="auto" lets the loader decide placement.
# NOTE(review): confirm float16 (vs. bfloat16) suits this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Format input prompts for AlpacaEval
def format_alpaca_prompt(instruction):
    """Wrap *instruction* in the Alpaca-style prompt template.

    The result is a fixed preamble, an ``### Instruction:`` section holding
    the instruction text, and an empty ``### Response:`` section that the
    model is expected to complete.
    """
    preamble = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    )
    return f"{preamble}\n\n### Instruction:\n{instruction}\n\n### Response:\n"

# Generate predictions
# Generation hyperparameters. When generating responses, we’ll use greedy decoding (i.e., temperature of 0.0, with top-p 1.0)
def generate_answer(prompt, max_new_tokens=200):
    """Greedily decode the model's continuation of *prompt*.

    Args:
        prompt: Fully formatted prompt text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation only (prompt excluded), with special
        tokens removed and surrounding whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    with torch.no_grad():
        # do_sample=False selects greedy decoding; temperature/top_p only
        # affect sampling, so they are not passed.
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    # The returned sequence starts with the prompt tokens. Drop them by
    # position rather than splitting the decoded text on "Response:", which
    # would truncate any answer that itself contains that marker.
    generated_tokens = output[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Load AlpacaEval data, keeping only the fields needed downstream.
eval_set = []
with open(alpaca_eval_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        example = json.loads(line)
        eval_set.append({
            "instruction": example["instruction"],
            "dataset": example.get("dataset", "alpaca_eval")
        })

print(f"Loaded {len(eval_set)} examples from AlpacaEval")

# Label written to each record's "generator" field; matches the checkpoint
# that is actually loaded (qwen2.5-3B-instruct-finetuned).
model_id = "qwen2.5-3b-instruct-finetuned"

# Only the first `max_examples` instructions are evaluated. `predictions`
# holds exactly those records so that every saved entry has an output.
max_examples = 100
predictions = eval_set[:max_examples]

# Generate predictions (records are updated in place).
for example in tqdm(predictions, desc="Generating AlpacaEval predictions"):
    prompt = format_alpaca_prompt(example["instruction"])
    example["output"] = generate_answer(prompt)
    example["generator"] = model_id

# Save predictions
os.makedirs("evaluation_results_finetuned", exist_ok=True)
output_file = "evaluation_results_finetuned/alpaca_eval_predictions.json"

# Dump only the records that were generated; writing the full eval_set
# would leave entries without "output"/"generator" in the file.
with open(output_file, "w", encoding="utf-8") as fout:
    json.dump(predictions, fout, indent=2, ensure_ascii=False)

print(f"Saved AlpacaEval predictions to {output_file}")

# Print a sample prediction
if predictions:
    sample = predictions[0]
    print("\nSample prediction:")
    print(f"Instruction: {sample['instruction']}")
    print(f"Output: {sample['output'][:100]}...")
    print(f"Generator: {sample['generator']}")
    print(f"Dataset: {sample['dataset']}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.37it/s]


Loaded 805 examples from AlpacaEval


Generating AlpacaEval predictions: 100%|██████████| 100/100 [13:12<00:00,  7.93s/it]

Saved AlpacaEval predictions to evaluation_results_finetuned/alpaca_eval_predictions.json

Sample prediction:
Instruction: What are the names of some famous actors that started their careers on Broadway?
Output: I actors who Broadway include,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...
Generator: gwen2.5-0.5b-finetuned
Dataset: helpful_base



