# LoRA Evaluation: Mistral 7B + LoRA Adapter

Evaluate the fine-tuned LoRA model and compare it with the baseline.

**Run in Colab with GPU (T4 or better)**

In [1]:
# Install dependencies
!pip install -q transformers accelerate bitsandbytes peft torch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Clone repo and checkout lora-eval branch
import os
if not os.path.exists('lora-support'):
    !git clone https://github.com/aashnakunk/lora-support.git
    %cd lora-support
    !git checkout lora-eval
else:
    %cd lora-support
    print("Repo already exists")

Cloning into 'lora-support'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 34 (delta 12), reused 29 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (34/34), 219.41 KiB | 1.74 MiB/s, done.
Resolving deltas: 100% (12/12), done.
/content/lora-support
Branch 'lora-eval' set up to track remote branch 'lora-eval' from 'origin'.
Switched to a new branch 'lora-eval'


In [3]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from typing import Dict, List
import re
from collections import Counter

# Report the PyTorch build and the CUDA device visible to this runtime.
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "None"

print("PyTorch version:", torch.__version__)
print("CUDA available:", cuda_ok)
print("GPU:", gpu_name)

PyTorch version: 2.9.0+cu128
CUDA available: True
GPU: Tesla T4


In [4]:
# Load the eval dataset: a JSON Lines file with one JSON document per line.
EVAL_PATH = "data/eval.jsonl"

# Explicit UTF-8 keeps decoding independent of the platform's default locale.
# Whitespace-only lines (e.g. a trailing blank line) are skipped rather than
# handed to json.loads, which would raise on them.
with open(EVAL_PATH, 'r', encoding='utf-8') as f:
    eval_data = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(eval_data)} eval examples")

Loaded 800 eval examples


In [5]:
# Device diagnostics for this cell's output (torch comes from the notebook's
# import cell, so it is not re-imported here).
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit NF4 weights with double quantization; compute runs in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,   # fp16 for T4
    bnb_4bit_use_double_quant=True,
)

print(f"Loading base model: {MODEL_NAME}...")
# trust_remote_code is deliberately left at its default (off): enabling it
# allows Python code shipped in the Hub repo to execute, and Mistral is a
# built-in transformers architecture that does not need it.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

print("Base model loaded!")

CUDA available: True
GPU: Tesla T4
Loading base model: mistralai/Mistral-7B-Instruct-v0.3...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]



Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Base model loaded!


In [7]:
# Attach the LoRA adapter to the already-loaded base model.
#
# ADAPTER_PATH is passed straight to PeftModel.from_pretrained. Two options:
#   1. a Hugging Face Hub repo id (used below), or
#   2. a local adapter folder committed to the repo, e.g.:
#ADAPTER_PATH = "./lora_adapter"
ADAPTER_PATH = "aashnakunk/mistral-7b-json-support"

print(f"\nLoading LoRA adapter from: {ADAPTER_PATH}")
model = PeftModel.from_pretrained(model=base_model, model_id=ADAPTER_PATH)
print("LoRA adapter loaded successfully!")


Loading LoRA adapter from: aashnakunk/mistral-7b-json-support


adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/54.6M [00:00<?, ?B/s]

LoRA adapter loaded successfully!


In [8]:
# Allowed values for the "intent" field; checked by is_schema_compliant.
INTENTS = ["refund", "cancel", "billing", "tech_support", "shipping", "other"]
# Allowed values for the "priority" field; checked by is_schema_compliant.
PRIORITIES = ["low", "medium", "high"]

def is_valid_json(s: str) -> bool:
    """Return True if ``s`` parses as a JSON document, False otherwise.

    Any JSON value counts (object, array, string, number, ...); no schema
    check is performed here.

    Args:
        s: Candidate JSON text.

    Returns:
        True when ``json.loads(s)`` succeeds. False for malformed text, for
        an argument of an unsupported type (e.g. ``None``), or for input
        nested too deeply to parse.
    """
    try:
        json.loads(s)
    # Only parse failures are treated as "invalid"; KeyboardInterrupt,
    # SystemExit and programming errors are allowed to propagate.
    except (ValueError, TypeError, RecursionError):
        return False
    return True

def is_schema_compliant(s: str) -> bool:
    """Check that ``s`` is a JSON object matching the expected output schema.

    The object must have exactly the keys ``intent``, ``priority``,
    ``entities``, ``needs_clarification`` and ``clarifying_question``, in
    that order; ``intent`` must be in ``INTENTS``; ``priority`` must be in
    ``PRIORITIES``; ``entities`` must be an object containing ``order_id``
    and ``product``; ``needs_clarification`` must be a boolean. The value of
    ``clarifying_question`` is not inspected.

    Args:
        s: Candidate JSON text.

    Returns:
        True if every check passes, False otherwise (including when ``s``
        is not parseable JSON or is not a JSON object).
    """
    try:
        obj = json.loads(s)
    # Only parse failures mean "not compliant"; other exceptions propagate.
    except (ValueError, TypeError, RecursionError):
        return False

    if not isinstance(obj, dict):
        return False

    required_keys = ["intent", "priority", "entities", "needs_clarification", "clarifying_question"]
    # NOTE(review): list comparison makes this check key-order sensitive;
    # kept as-is in case the ordering is part of the intended contract.
    if list(obj.keys()) != required_keys:
        return False
    if obj["intent"] not in INTENTS:
        return False
    if obj["priority"] not in PRIORITIES:
        return False

    entities = obj["entities"]
    # Require a real JSON object: a bare `in` test would also succeed on a
    # string (substring match) or a list that merely mentions the key names.
    if not isinstance(entities, dict):
        return False
    if "order_id" not in entities or "product" not in entities:
        return False

    return isinstance(obj["needs_clarification"], bool)

def extract_json_from_text(text: str) -> str:
    """Pull the first JSON object out of model output that may wrap it.

    Markdown code fences are removed first. The text is then scanned for the
    first ``{`` from which a complete JSON object can be decoded; that exact
    substring is returned. Using a real JSON decoder (instead of a brace
    regex alone) handles objects nested to any depth and braces that occur
    inside string values.

    Args:
        text: Raw model output.

    Returns:
        The substring holding the first decodable JSON object. If none
        decodes, the first brace-balanced span (at most one nesting level)
        found by regex; if there is none either, the fence-stripped text
        with surrounding whitespace removed.
    """
    text = re.sub(r'```json\s*', '', text)
    text = re.sub(r'```\s*', '', text)

    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode returns (value, index just past the document).
            _, length = decoder.raw_decode(text[start:])
        except (ValueError, RecursionError):
            start = text.find('{', start + 1)
        else:
            return text[start:start + length]

    # Fallback for output that contains no decodable object: return the
    # brace-delimited span so callers can still report what was produced.
    match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', text)
    if match:
        return match.group(0)
    return text.strip()

# Confirmation message printed once this cell's helper definitions have run.
print("Validation functions ready")

Validation functions ready


In [11]:
def run_inference(example: Dict, max_new_tokens: int = 256) -> str:
    """Generate the model's reply for a single eval example.

    Builds an ``[INST]`` prompt from the first two entries of
    ``example['messages']`` (expected to be the system and user turns),
    runs greedy generation with the notebook-level ``model`` and
    ``tokenizer``, and returns only the newly generated text.

    Args:
        example: Record with a ``messages`` list whose items have a
            ``content`` field.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped) with surrounding whitespace stripped.
    """
    # NOTE(review): the prompt spells out "<s>" and the tokenizer is called
    # with its default special-token handling, which may yield a duplicated
    # BOS token. Confirm against the adapter's training format before changing.
    system_text = example['messages'][0]['content']
    user_text = example['messages'][1]['content']
    prompt = f"<s>[INST] {system_text}\n\n{user_text} [/INST]"

    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    prompt_len = inputs['input_ids'].shape[1]

    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt + continuation; keep only the continuation.
    completion_ids = outputs[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Smoke-test the pipeline end to end on the first eval example.
test_output = run_inference(eval_data[0])
print("Test output:")
print(test_output)

# Extract once and feed the same candidate string to both validators.
test_json = extract_json_from_text(test_output)
print("\nValid JSON?", is_valid_json(test_json))
print("Schema compliant?", is_schema_compliant(test_json))

Test output:
{"intent": "shipping", "priority": "medium", "entities": {"order_id": null, "product": null}, "needs_clarification": true, "clarifying_question": "Can you share your order ID and the delivery address ZIP code so I can check the shipment status?"}

Valid JSON? True
Schema compliant? True


In [13]:
# Run evaluation on the eval set (or a prefix of it)
EVAL_SIZE = 100  # Use 100 for quick test, 800 for full eval

# Slice once so the banner and progress labels show the number of examples
# actually evaluated, even when the dataset is shorter than EVAL_SIZE.
eval_subset = eval_data[:EVAL_SIZE]
n_eval = len(eval_subset)

# Counters and per-example records; later cells read these names.
results = []
valid_json_count = 0
schema_compliant_count = 0
intent_correct = 0
total = 0

print(f"Running LoRA evaluation on {n_eval} examples...\n")

for i, example in enumerate(eval_subset):
    print(f"[{i+1}/{n_eval}] ", end="")

    # The third message holds the reference answer as a JSON string.
    expected = json.loads(example['messages'][2]['content'])
    output = run_inference(example)
    json_str = extract_json_from_text(output)

    valid_json = is_valid_json(json_str)
    schema_valid = is_schema_compliant(json_str) if valid_json else False

    if valid_json:
        valid_json_count += 1
        predicted = json.loads(json_str)

        # Intent is only scored for schema-compliant outputs, so intent
        # accuracy can never exceed the schema compliance rate.
        if schema_valid:
            schema_compliant_count += 1
            if predicted['intent'] == expected['intent']:
                intent_correct += 1
                print(f"✓ JSON valid, intent correct")
            else:
                print(f"✗ JSON valid, wrong intent: {predicted['intent']} vs {expected['intent']}")
        else:
            print(f"✗ JSON valid but schema invalid")
    else:
        print(f"✗ Invalid JSON: {output[:80]}")

    results.append({
        'user_message': example['messages'][1]['content'],
        'expected': expected,
        'predicted_raw': output,
        'predicted_json': json_str,
        'valid_json': valid_json,
        'schema_compliant': schema_valid
    })

    total += 1


def _pct(count: int) -> float:
    """Return ``count`` as a percentage of ``total``; 0.0 when nothing was evaluated."""
    return count / total * 100 if total else 0.0


print("\n" + "="*60)
print("LORA EVALUATION RESULTS")
print("="*60)
print(f"Total examples: {total}")
print(f"Valid JSON: {valid_json_count}/{total} ({_pct(valid_json_count):.1f}%)")
print(f"Schema compliant: {schema_compliant_count}/{total} ({_pct(schema_compliant_count):.1f}%)")
print(f"Intent accuracy: {intent_correct}/{total} ({_pct(intent_correct):.1f}%)")
print("="*60)

Running LoRA evaluation on 100 examples...

[1/100] ✓ JSON valid, intent correct
[2/100] ✓ JSON valid, intent correct
[3/100] ✓ JSON valid, intent correct
[4/100] ✓ JSON valid, intent correct
[5/100] ✓ JSON valid, intent correct
[6/100] ✓ JSON valid, intent correct
[7/100] ✓ JSON valid, intent correct
[8/100] ✓ JSON valid, intent correct
[9/100] ✓ JSON valid, intent correct
[10/100] ✓ JSON valid, intent correct
[11/100] ✓ JSON valid, intent correct
[12/100] ✓ JSON valid, intent correct
[13/100] ✓ JSON valid, intent correct
[14/100] ✓ JSON valid, intent correct
[15/100] ✓ JSON valid, intent correct
[16/100] ✓ JSON valid, intent correct
[17/100] ✓ JSON valid, intent correct
[18/100] ✓ JSON valid, intent correct
[19/100] ✓ JSON valid, intent correct
[20/100] ✓ JSON valid, intent correct
[21/100] ✓ JSON valid, intent correct
[22/100] ✓ JSON valid, intent correct
[23/100] ✓ JSON valid, intent correct
[24/100] ✓ JSON valid, intent correct
[25/100] ✓ JSON valid, intent correct
[26/100] ✓ JSON

In [15]:
# Compare the LoRA metrics from the evaluation run with the stored baseline.
# The comparison is skipped with a message when baseline_results.json is absent.
try:
    with open('baseline_results.json', 'r') as f:
        baseline = json.load(f)

    rule = "="*60
    print("\n" + rule)
    print("COMPARISON: Baseline vs LoRA")
    print(rule)
    print(f"\n{'Metric':<25} {'Baseline':<15} {'LoRA':<15} {'Improvement'}")
    print("-"*60)

    lora_valid_rate = valid_json_count / total
    lora_schema_rate = schema_compliant_count / total
    lora_intent_rate = intent_correct / total

    # (row label, key in the baseline file, matching LoRA rate)
    comparison_rows = [
        ('Valid JSON Rate', 'valid_json_rate', lora_valid_rate),
        ('Schema Compliance Rate', 'schema_compliance_rate', lora_schema_rate),
        ('Intent Accuracy', 'intent_accuracy', lora_intent_rate),
    ]
    for label, baseline_key, lora_rate in comparison_rows:
        baseline_rate = baseline[baseline_key]
        # Rates are fractions; the last column is the difference in percentage points.
        print(f"{label:<25} {baseline_rate*100:>6.1f}%  {lora_rate*100:>11.1f}%  {(lora_rate - baseline_rate)*100:>+8.1f}%")
    print(rule)

except FileNotFoundError:
    print("\nbaseline_results.json not found - skipping comparison")


COMPARISON: Baseline vs LoRA

Metric                    Baseline        LoRA            Improvement
------------------------------------------------------------
Valid JSON Rate            100.0%        100.0%      +0.0%
Schema Compliance Rate      87.0%        100.0%     +13.0%
Intent Accuracy             84.0%         98.0%     +14.0%


In [16]:
# Show the first few LoRA outputs, truncated for readability.
print("\nSample outputs (LoRA):")
print("="*60)
for example_number, sample in enumerate(results[:3], start=1):
    print(f"\nEXAMPLE {example_number}:")
    # User text is cut to 80 characters, raw model output to 150.
    print(f"USER: {sample['user_message'][:80]}...")
    print(f"OUTPUT: {sample['predicted_raw'][:150]}...")
    print(f"Valid: {sample['valid_json']}, Schema compliant: {sample['schema_compliant']}")
    print("-"*60)


Sample outputs (LoRA):

EXAMPLE 1:
USER: Shipping issue: need to change delivery address.
Please help!!!

--- Forwarded m...
OUTPUT: {"intent": "shipping", "priority": "medium", "entities": {"order_id": null, "product": null}, "needs_clarification": true, "clarifying_question": "Can...
Valid: True, Schema compliant: True
------------------------------------------------------------

EXAMPLE 2:
USER: I need to cancel my order for the phone case. It's ORD-62825. I ordered by mista...
OUTPUT: {"intent": "cancel", "priority": "low", "entities": {"order_id": "ORD-62825", "product": "phone case"}, "needs_clarification": false, "clarifying_ques...
Valid: True, Schema compliant: True
------------------------------------------------------------

EXAMPLE 3:
USER: Hi, I want a refund because my order box was open. pls fix asap...
OUTPUT: {"intent": "refund", "priority": "high", "entities": {"order_id": null, "product": null}, "needs_clarification": true, "clarifying_question": "Can you...
Valid: 

In [17]:
# Save LoRA results
# The payload is built before the file is opened: opening with 'w' truncates
# immediately, so a failure while computing the rates (e.g. total == 0) would
# otherwise leave an empty or partial lora_results.json behind.
lora_summary = {
    'model': MODEL_NAME,
    'adapter': ADAPTER_PATH,
    'eval_size': total,
    'valid_json_rate': valid_json_count / total,
    'schema_compliance_rate': schema_compliant_count / total,
    'intent_accuracy': intent_correct / total,
    'detailed_results': results
}

with open('lora_results.json', 'w') as f:
    json.dump(lora_summary, f, indent=2)

print("\nResults saved to lora_results.json")
print("Download this file to commit to your repo!")


Results saved to lora_results.json
Download this file to commit to your repo!
