# 模型评估：对比微调前后性能

本 notebook 用于对比基础模型与 LoRA 微调后模型在测试集上的表现，评估指标为“JSON格式正确率”，即模型输出能够被 `json.loads` 直接解析的样本占比。

In [1]:
import os
# Set the Hugging Face HF_HOME environment variable. This cell runs before the
# cell that imports transformers/peft, so the setting is already in place by then.
# NOTE(review): the path ends in '/hub'; HF_HOME is normally the parent of the
# 'hub' cache directory — confirm whether HF_HUB_CACHE was intended instead.
os.environ.update(HF_HOME='/macroverse/public/database/huggingface/hub')


In [2]:
import gc
import json
import os

import torch
from peft import PeftModel
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


## 1. 配置参数

In [3]:
# Model identifier passed to `from_pretrained` for the base (not fine-tuned) model.
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
# Path of the LoRA adapter that is loaded on top of the base model.
LORA_MODEL = "outputs/lora_model/final_model"
# JSONL test set: one JSON object per line.
TEST_FILE = "data/processed/test.jsonl"
MAX_SAMPLES = 50  # set to None to use the whole test set

# Generation in this notebook samples (temperature / top_p), so fix the RNG seed
# to make a fresh "Restart & Run All" reproduce the reported numbers.
# The trailing semicolon keeps the returned Generator object out of the cell output.
SEED = 42
torch.manual_seed(SEED);

## 2. 加载测试数据

In [4]:
# Read the JSONL test set. Blank lines (e.g. a trailing newline at the end of the
# file) are skipped instead of crashing json.loads.
with open(TEST_FILE, 'r', encoding='utf-8') as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# Fail fast with a clear message: everything downstream needs at least one sample.
if not test_data:
    raise ValueError(f"No samples found in {TEST_FILE}")

# A falsy MAX_SAMPLES (None or 0) means "use every sample".
if MAX_SAMPLES:
    test_data = test_data[:MAX_SAMPLES]

print(f"测试样本数: {len(test_data)}")
print("\n示例数据:")
# Only the first 300 characters of the pretty-printed sample are shown.
print(json.dumps(test_data[0], ensure_ascii=False, indent=2)[:300] + "...")

测试样本数: 50

示例数据:
{
  "instruction": "请从以下邮件中提取事件信息，包括标题、时间、地点、参与者等关键信息，以JSON格式输出。",
  "input": "主题：All Accounting Offsite Meeting 5/23\n发件人：richard.causey@enron.com\n收件人：bob.butts@enron.com, wes.colwell@enron.com, tim.despain@enron.com, \n\tmary.perkins@enron.com, mike.mcconnell@enron.com, \n\traymond.bowen@enron.co...


## 3. 定义工具函数

In [5]:
def load_model(model_name, lora_path=None):
    """Load a causal language model and its tokenizer in eval mode.

    Args:
        model_name: Identifier or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        lora_path: Optional path of a LoRA adapter. When truthy, the adapter is
            loaded on top of the base model and folded in with
            ``merge_and_unload()``.

    Returns:
        A ``(model, tokenizer)`` tuple; ``model.eval()`` has already been called.
    """
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # bfloat16 weights, with layer placement left to device_map="auto".
    load_kwargs = dict(
        dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    net = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    if lora_path:
        net = PeftModel.from_pretrained(net, lora_path).merge_and_unload()

    net.eval()
    return net, tok


def generate(model, tokenizer, email_content):
    """Generate the model's event-extraction answer for one email.

    Builds a two-message chat (system + user) around ``email_content``, renders it
    with the tokenizer's chat template, samples up to 512 new tokens and returns
    only the newly generated text (the prompt tokens are sliced off).

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode``, ``pad_token_id`` and ``eos_token_id``.
        email_content: Raw email text inserted into the user prompt.

    Returns:
        The decoded completion as a string, with special tokens removed.
    """
    messages = [
        {"role": "system", "content": "你是一个专业的邮件事件信息提取助手。"},
        {"role": "user", "content": f"请从以下邮件中提取事件信息，包括标题、时间、地点、参与者等关键信息，以JSON格式输出。\n\n邮件内容：\n{email_content}"}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # Prompt length taken from the input_ids tensor itself; integer-indexing the
    # encoding object (inputs[0]) is only supported by fast tokenizers.
    prompt_len = inputs["input_ids"].shape[1]

    # Some tokenizers define no pad token; fall back to EOS so generate() gets a
    # usable padding id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            # temperature/top_p only take effect when sampling is on; state it
            # explicitly instead of relying on the checkpoint's generation config.
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=pad_token_id,
        )

    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)


def evaluate(model, tokenizer, data):
    """Run the model over ``data`` and measure how often its output is valid JSON.

    Each item is a dict in one of two layouts:
      * chat layout: ``item['messages'][1]['content']`` holds the user prompt and
        the email is the text after the last "邮件内容：" marker line;
      * instruction layout: ``item['input']`` holds the email text.
    Items with neither key are skipped and do not count towards the score.

    Args:
        model: Model forwarded to ``generate``.
        tokenizer: Tokenizer forwarded to ``generate``.
        data: Iterable of sample dicts as described above.

    Returns:
        ``(results, accuracy)`` where ``results`` is a list of dicts with keys
        ``'input'``, ``'output'`` and ``'valid_json'``, and ``accuracy`` is the
        percentage (0-100) of evaluated samples whose output parsed with
        ``json.loads``. ``accuracy`` is ``0.0`` when no sample was evaluated.
    """
    valid_json = 0
    results = []

    for item in tqdm(data):
        # Extract the email text from whichever layout this sample uses.
        if 'messages' in item:
            email = item['messages'][1]['content'].split('邮件内容：\n')[-1]
        elif 'input' in item:
            email = item['input']
        else:
            continue

        pred = generate(model, tokenizer, email)

        # Strict check: the whole output must parse as JSON.
        # NOTE(review): output wrapped in a Markdown ```json fence is therefore
        # counted as invalid even when the JSON inside the fence is well-formed.
        try:
            json.loads(pred)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested output. Anything else (e.g. KeyboardInterrupt)
            # propagates instead of being silently swallowed.
            is_valid = False
        else:
            is_valid = True
            valid_json += 1

        results.append({'input': email, 'output': pred, 'valid_json': is_valid})

    # Score over the samples actually evaluated; avoids ZeroDivisionError on empty
    # input and keeps skipped items from being counted as failures.
    if not results:
        return results, 0.0

    accuracy = (valid_json / len(results)) * 100
    return results, accuracy

## 4. 评估基础模型（未微调）

In [6]:
print("加载基础模型...")
base_model, base_tokenizer = load_model(BASE_MODEL)

print("评估基础模型...")
base_results, base_acc = evaluate(base_model, base_tokenizer, test_data)

print(f"\n基础模型 JSON格式正确率: {base_acc:.2f}%")

# Release the base model before the fine-tuned model is loaded.
# `del` only drops these two names; gc.collect() also reclaims objects kept alive
# by reference cycles, so that empty_cache() can actually hand the cached GPU
# blocks back to the driver.
del base_model, base_tokenizer
gc.collect()
torch.cuda.empty_cache()

加载基础模型...


Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.91s/it]


评估基础模型...


100%|██████████| 50/50 [05:56<00:00,  7.13s/it]


基础模型 JSON格式正确率: 0.00%





## 5. 评估微调模型

In [7]:
# Load the base checkpoint together with the LoRA adapter, then score it on the
# same test samples that were used for the base model.
print("加载微调模型...")
ft_model, ft_tokenizer = load_model(BASE_MODEL, lora_path=LORA_MODEL)

print("评估微调模型...")
ft_results, ft_acc = evaluate(model=ft_model, tokenizer=ft_tokenizer, data=test_data)

print(f"\n微调模型 JSON格式正确率: {ft_acc:.2f}%")

加载微调模型...


Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.89s/it]


评估微调模型...


100%|██████████| 50/50 [05:06<00:00,  6.12s/it]


微调模型 JSON格式正确率: 92.00%





## 6. 对比结果

In [8]:
# Summary table: JSON validity rate of both models and the difference between them.
divider = "=" * 60
for summary_line in (
    divider,
    "评估结果对比",
    divider,
    f"基础模型 JSON正确率: {base_acc:.2f}%",
    f"微调模型 JSON正确率: {ft_acc:.2f}%",
    f"提升幅度: {ft_acc - base_acc:+.2f}%",
    divider,
):
    print(summary_line)

评估结果对比
基础模型 JSON正确率: 0.00%
微调模型 JSON正确率: 92.00%
提升幅度: +92.00%


## 7. 查看示例对比

In [9]:
# Show the base and fine-tuned outputs side by side for the first three samples.
separator = '=' * 60
for idx, base_item in enumerate(base_results[:3]):
    print(f"\n{separator}")
    print(f"样本 {idx + 1}")
    print(separator)
    # Only the first 200 characters of the email are echoed.
    print(f"\n输入邮件:\n{base_item['input'][:200]}...")
    print(f"\n基础模型输出 (有效JSON: {base_item['valid_json']}):\n{base_item['output']}")
    # Both result lists come from the same test_data, so they are matched by position.
    ft_item = ft_results[idx]
    print(f"\n微调模型输出 (有效JSON: {ft_item['valid_json']}):\n{ft_item['output']}")


样本 1

输入邮件:
主题：All Accounting Offsite Meeting 5/23
发件人：richard.causey@enron.com
收件人：bob.butts@enron.com, wes.colwell@enron.com, tim.despain@enron.com, 
	mary.perkins@enron.com, mike.mcconnell@enron.com, 
	raymond...

基础模型输出 (有效JSON: False):
```json
{
  "标题": "All Accounting Offsite Meeting 5/23",
  "时间": "2001年5月23日",
  "地点": "Adams Mark Hotel",
  "参与者": [
    "richard.causey@enron.com",
    "bob.butts@enron.com",
    "wes.colwell@enron.com",
    "tim.despain@enron.com",
    "mary.perkins@enron.com",
    "mike.mcconnell@enron.com",
    "raymond.bowen@enron.com",
    "george.wasaff@enron.com",
    "jan.johnson@enron.com",
    "mark.frank@enron.com",
    "greek.rice@enron.com",
    "james.ginty@enron.com",
    "bill.donovan@enron.com",
    "paula.rieker@enron.com",
    "scott.neal@enron.com"
  ]
}
```

微调模型输出 (有效JSON: True):
{
  "event_type": "会议",
  "title": "All Accounting Offsite Meeting",
  "time": "May 23, 2001, 8:30 AM",
  "location": "Adams Mark Hotel, large ballroom",
  "partici