# LoRA Evaluation: Mistral 7B + LoRA Adapter

Evaluate the fine-tuned LoRA adapter on the eval set (valid-JSON rate, schema compliance, intent accuracy) and compare the results with the saved baseline.

**Run in Colab with GPU (T4 or better)**

In [None]:
# Install dependencies
!pip install -q transformers accelerate bitsandbytes peft torch

In [None]:
# Clone repo and checkout lora-eval branch
import os
if not os.path.exists('lora-support'):
    !git clone https://github.com/aashnakunk/lora-support.git
    %cd lora-support
    !git checkout lora-eval
else:
    %cd lora-support
    print("Repo already exists")

In [None]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from typing import Dict, List
import re
from collections import Counter

# Report the torch build and the GPU (if any) visible to this runtime.
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "None"

print("PyTorch version:", torch.__version__)
print("CUDA available:", cuda_ok)
print("GPU:", gpu_name)

In [None]:
# Load eval dataset (JSON Lines: one JSON document per line)
EVAL_PATH = "data/eval.jsonl"

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default encoding, and skip blank lines (e.g. a trailing empty line), which
# json.loads would otherwise reject.
with open(EVAL_PATH, 'r', encoding='utf-8') as f:
    eval_data = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(eval_data)} eval examples")

In [None]:
# Load base model with 4-bit quantization
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

# NF4 4-bit weights with double quantization; compute runs in bfloat16.
# NOTE(review): the notebook header recommends a T4, which presumably lacks
# native bfloat16 support - confirm this compute dtype is intended (and matches
# how the adapter was trained) before changing it.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

print(f"Loading base model: {MODEL_NAME}...")
# trust_remote_code is deliberately left at its default (off): enabling it lets
# Python code shipped in the hub repo run locally, and this checkpoint loads with
# the architecture classes built into transformers.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

print("Base model loaded!")

In [None]:
# Attach the fine-tuned LoRA adapter to the quantized base model.
# ADAPTER_PATH can point at either of:
#   1. a HuggingFace Hub repo id, e.g. "YOUR_USERNAME/mistral-7b-json-lora"
#   2. an adapter folder uploaded to the repo (the default used below)
ADAPTER_PATH = "./lora_adapter"  # Change this to your adapter location

print(f"\nLoading LoRA adapter from: {ADAPTER_PATH}")
model = PeftModel.from_pretrained(model=base_model, model_id=ADAPTER_PATH)
print("LoRA adapter loaded successfully!")

In [None]:
# Closed label sets checked by is_schema_compliant.
INTENTS = [
    "refund",
    "cancel",
    "billing",
    "tech_support",
    "shipping",
    "other",
]
PRIORITIES = [
    "low",
    "medium",
    "high",
]

def is_valid_json(s: str) -> bool:
    """Return True if ``s`` parses with ``json.loads``, otherwise False.

    Only parseability is checked: any JSON value (object, array, string,
    number, ...) counts as valid. Non-string input such as ``None`` yields
    False instead of raising.

    Args:
        s: Candidate JSON text.

    Returns:
        True when ``json.loads(s)`` succeeds, False when it fails to parse.
    """
    try:
        json.loads(s)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError, TypeError covers non-str input,
        # RecursionError covers pathologically deep nesting. Anything else
        # (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        return False
    return True

def is_schema_compliant(s: str, intents=None, priorities=None) -> bool:
    """Return True if ``s`` is a JSON object that matches the expected schema.

    The checks, in order:
      * ``s`` parses as JSON and the top-level value is an object;
      * its keys are exactly ``intent``, ``priority``, ``entities``,
        ``needs_clarification``, ``clarifying_question`` - in that order
        (the comparison is order-sensitive on purpose, as in the original check);
      * ``intent`` and ``priority`` are among the allowed labels;
      * ``entities`` is an object containing both ``order_id`` and ``product``;
      * ``needs_clarification`` is a JSON boolean.

    Args:
        s: Candidate JSON text.
        intents: Allowed intent labels. Defaults to the module-level ``INTENTS``.
        priorities: Allowed priority labels. Defaults to the module-level
            ``PRIORITIES``.

    Returns:
        True only when every check passes; False otherwise. Never raises for
        malformed or non-string input.
    """
    # Materialise as tuples so membership tests use ==, never hashing; an
    # unhashable model value (e.g. a list) then simply fails the check.
    allowed_intents = tuple(INTENTS if intents is None else intents)
    allowed_priorities = tuple(PRIORITIES if priorities is None else priorities)
    required_keys = ["intent", "priority", "entities", "needs_clarification", "clarifying_question"]

    try:
        obj = json.loads(s)
    except (ValueError, TypeError, RecursionError):
        return False

    if not isinstance(obj, dict):
        return False
    if list(obj.keys()) != required_keys:
        return False
    if obj["intent"] not in allowed_intents:
        return False
    if obj["priority"] not in allowed_priorities:
        return False

    entities = obj["entities"]
    # Require a real object: a bare `in` test would also accept a string or a
    # list that merely contains the words "order_id" and "product".
    if not isinstance(entities, dict):
        return False
    if "order_id" not in entities or "product" not in entities:
        return False

    return isinstance(obj["needs_clarification"], bool)

def extract_json_from_text(text: str) -> str:
    """Pull the first JSON object out of a markdown/text wrapper.

    Markdown code fences are stripped first. The text is then scanned from each
    ``{`` and the first span that actually decodes as a JSON object is returned,
    so deeper nesting and braces inside string values are handled. If nothing
    decodes, the original regex heuristic (at most one level of nesting) is
    used, and failing that the stripped text is returned unchanged.

    Args:
        text: Raw model output.

    Returns:
        The extracted JSON substring, or the best-effort fallback described
        above. The result is not guaranteed to be valid JSON.
    """
    text = re.sub(r'```json\s*', '', text)
    text = re.sub(r'```\s*', '', text)

    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode parses one document at the start of the string and
            # reports where it ended, ignoring whatever follows it.
            _, end = decoder.raw_decode(text[start:])
        except (ValueError, RecursionError):
            start = text.find('{', start + 1)
            continue
        return text[start:start + end]

    # Nothing decodable: fall back to the brace-matching heuristic so that
    # malformed output still yields the same candidate string as before.
    match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', text)
    if match:
        return match.group(0)
    return text.strip()

# Confirmation that the validation helpers in this cell were defined.
print("Validation functions ready")

In [None]:
def run_inference(example: Dict, max_new_tokens: int = 256) -> str:
    """Generate the model's reply for a single eval example.

    The first two entries of ``example['messages']`` (used here as system and
    user content) are joined into one ``[INST] ... [/INST]`` prompt, and the
    reply is decoded greedily from the module-level ``model`` and ``tokenizer``.

    Args:
        example: Eval record with a ``messages`` list whose entries carry a
            ``content`` string.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    messages = example['messages'][:2]  # system + user

    # NOTE(review): the prompt spells out "<s>" and the tokenizer call below
    # uses its default special-token handling, which may prepend a second BOS.
    # Confirm against the prompt format used at training time before changing.
    prompt = f"""<s>[INST] {messages[0]['content']}

{messages[1]['content']} [/INST]"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        # Greedy decoding. No temperature is passed: it only applies when
        # sampling, so combining it with do_sample=False has no effect on the
        # output and just triggers a generation-config warning.
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Drop the prompt tokens so only the newly generated part is decoded.
    prompt_len = inputs['input_ids'].shape[1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return response.strip()

# Smoke-test the whole pipeline on the first eval example before the full run
test_output = run_inference(eval_data[0])
print("Test output:")
print(test_output)

# Extract once and feed the same candidate string to both validators.
test_json = extract_json_from_text(test_output)
print("\nValid JSON?", is_valid_json(test_json))
print("Schema compliant?", is_schema_compliant(test_json))

In [None]:
# Run evaluation on the first EVAL_SIZE examples of the eval set
EVAL_SIZE = 100  # Use 100 for quick test, 800 for full eval

# The slice is shorter than EVAL_SIZE when the file holds fewer examples, so
# progress is reported against the number of examples that will really be scored.
eval_subset = eval_data[:EVAL_SIZE]
num_to_score = len(eval_subset)
if num_to_score == 0:
    # Fail early with a clear message instead of a ZeroDivisionError in the summary.
    raise ValueError("No eval examples to score: eval_data is empty (or EVAL_SIZE is 0)")

results = []
valid_json_count = 0
schema_compliant_count = 0
intent_correct = 0
total = 0  # examples scored so far; stays accurate if the loop is interrupted

print(f"Running LoRA evaluation on {num_to_score} examples...\n")

for i, example in enumerate(eval_subset):
    if i % 10 == 0:
        print(f"Progress: {i}/{num_to_score}")

    # messages[2]['content'] is parsed as the expected (reference) JSON.
    expected = json.loads(example['messages'][2]['content'])
    output = run_inference(example)
    json_str = extract_json_from_text(output)

    valid_json = is_valid_json(json_str)
    schema_valid = is_schema_compliant(json_str) if valid_json else False

    if valid_json:
        valid_json_count += 1

        # Intent is only scored for schema-compliant outputs; every other
        # output counts as a miss because the denominator is all examples.
        if schema_valid:
            schema_compliant_count += 1
            predicted = json.loads(json_str)
            if predicted['intent'] == expected['intent']:
                intent_correct += 1

    results.append({
        'user_message': example['messages'][1]['content'],
        'expected': expected,
        'predicted_raw': output,
        'predicted_json': json_str,
        'valid_json': valid_json,
        'schema_compliant': schema_valid
    })

    total += 1

print("\n" + "="*60)
print("LORA EVALUATION RESULTS")
print("="*60)
print(f"Total examples: {total}")
print(f"Valid JSON: {valid_json_count}/{total} ({valid_json_count/total*100:.1f}%)")
print(f"Schema compliant: {schema_compliant_count}/{total} ({schema_compliant_count/total*100:.1f}%)")
print(f"Intent accuracy: {intent_correct}/{total} ({intent_correct/total*100:.1f}%)")
print("="*60)

In [None]:
# Load baseline results and print a Baseline vs LoRA comparison table
BASELINE_PATH = 'baseline_results.json'
# Keys this cell reads from the baseline file.
BASELINE_KEYS = ['valid_json_rate', 'schema_compliance_rate', 'intent_accuracy']

# Only the file read is guarded, so an unrelated error in the reporting code
# is not mistaken for a missing baseline.
baseline = None
try:
    with open(BASELINE_PATH, 'r') as f:
        baseline = json.load(f)
except FileNotFoundError:
    print("\nbaseline_results.json not found - skipping comparison")
except json.JSONDecodeError as err:
    print(f"\nbaseline_results.json is not valid JSON ({err}) - skipping comparison")

if baseline is not None:
    if isinstance(baseline, dict):
        missing_keys = [key for key in BASELINE_KEYS if key not in baseline]
    else:
        missing_keys = list(BASELINE_KEYS)

    if missing_keys:
        print(f"\nbaseline_results.json is missing {missing_keys} - skipping comparison")
    elif total == 0:
        print("\nNo LoRA examples were scored - skipping comparison")
    else:
        lora_valid_rate = valid_json_count / total
        lora_schema_rate = schema_compliant_count / total
        lora_intent_rate = intent_correct / total

        print("\n" + "="*60)
        print("COMPARISON: Baseline vs LoRA")
        print("="*60)
        print(f"\n{'Metric':<25} {'Baseline':<15} {'LoRA':<15} {'Improvement'}")
        print("-"*60)

        comparison_rows = [
            ('Valid JSON Rate', 'valid_json_rate', lora_valid_rate),
            ('Schema Compliance Rate', 'schema_compliance_rate', lora_schema_rate),
            ('Intent Accuracy', 'intent_accuracy', lora_intent_rate),
        ]
        for label, key, lora_rate in comparison_rows:
            base_rate = baseline[key]
            print(f"{label:<25} {base_rate*100:>6.1f}%  {lora_rate*100:>11.1f}%  {(lora_rate - base_rate)*100:>+8.1f}%")
        print("="*60)

        # Presumably the baseline file records 'eval_size' the same way the save
        # cell of this notebook does; the check is skipped when the key is absent.
        baseline_size = baseline.get('eval_size')
        if baseline_size is not None and baseline_size != total:
            print(f"WARNING: baseline used {baseline_size} examples, LoRA run used {total} - rates are not directly comparable")

In [None]:
# Preview the first three LoRA generations (long text is cut for display)
print("\nSample outputs (LoRA):")
print("="*60)

for idx, sample in enumerate(results[:3], start=1):
    print(f"\nEXAMPLE {idx}:")
    print(f"USER: {sample['user_message'][:80]}...")
    print(f"OUTPUT: {sample['predicted_raw'][:150]}...")
    print(f"Valid: {sample['valid_json']}, Schema compliant: {sample['schema_compliant']}")
    print("-"*60)

In [None]:
# Save LoRA results
RESULTS_PATH = 'lora_results.json'

lora_summary = {
    'model': MODEL_NAME,
    'adapter': ADAPTER_PATH,
    'eval_size': total,
    'valid_json_rate': valid_json_count / total,
    'schema_compliance_rate': schema_compliant_count / total,
    'intent_accuracy': intent_correct / total,
    'detailed_results': results
}

# Build and serialize the payload BEFORE opening the file: opening with 'w'
# truncates it, so an error while computing or encoding the results would
# otherwise leave an empty or partial lora_results.json behind.
serialized = json.dumps(lora_summary, indent=2)
with open(RESULTS_PATH, 'w') as f:
    f.write(serialized)

print("\nResults saved to lora_results.json")
print("Download this file to commit to your repo!")