# Evaluation: Llama-3.2-1B Customer Support Triage

This notebook compares the fine-tuned model against the base model on the held-out test set or, optionally, on a small sample of the training data.

## Overview
- **Model**: Llama-3.2-1B-Instruct (~1B parameters)
- **Task**: Convert customer tickets → internal bug reports with severity, owner, and investigation steps
- **Evaluation**: Qualitative comparison of base model vs fine-tuned model outputs

## 1. Setup and Configuration

In [None]:
import json
from pathlib import Path
from mlx_lm import load, generate

print("Setup complete!")

In [None]:
# Configuration: model identifier plus the data and adapter locations
MODEL_NAME = "mlx-community/Llama-3.2-1B-Instruct-bf16"
DATA_DIR = Path("data")
ADAPTER_PATH = Path("adapters")

# Echo the active settings, one per line
print(
    f"Model: {MODEL_NAME}",
    f"Data directory: {DATA_DIR}",
    f"Adapter path: {ADAPTER_PATH}",
    sep="\n",
)

## 2. Load Test Data

In [None]:
import random


def _load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped and every other line is parsed with
    ``json.loads``.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform default encoding.
    with open(path, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line) for line in stripped_lines if line]


# Choose data source: "test" for held-out test set, "train" for training sample (overfitted demo)
DATA_SOURCE = "train"  # Options: "test" or "train"
TRAIN_SAMPLE_SIZE = 5

# Fixed seed so the training sample is the same on every run
random.seed(42)

if DATA_SOURCE == "train":
    # Sample from training data (useful for demonstrating overfitted model)
    all_train = _load_jsonl(DATA_DIR / "train.jsonl")
    # Cap the sample size at the number of available records so random.sample cannot fail
    eval_data = random.sample(all_train, min(TRAIN_SAMPLE_SIZE, len(all_train)))
    print(f"Sampled {len(eval_data)} records from training data ({len(all_train)} total)")
else:
    # Use held-out test data
    eval_data = _load_jsonl(DATA_DIR / "test.jsonl")
    print(f"Loaded {len(eval_data)} test samples")

# Preview first sample
print("\nSample entry:")
print("=" * 60)
if eval_data:
    sample = eval_data[0]
    # The content of the first message is shown as the user input
    user_msg = sample["messages"][0]["content"]
    # Only show an ellipsis when the message was actually cut at 300 characters
    preview = user_msg[:300] + ("..." if len(user_msg) > 300 else "")
    print(f"USER INPUT:\n{preview}")
else:
    # Nothing to preview; avoid an IndexError on an empty data file
    print("(no samples loaded)")

## 3. Generate Responses

Run inference on every evaluation sample using both the base model and the fine-tuned model.

In [None]:
def generate_responses(model, tokenizer, test_data, max_tokens=500):
    """Run the model over every sample and collect the generated outputs.

    For each sample, the content of its first message is wrapped as a single
    user turn, rendered through the tokenizer's chat template (with the
    generation prompt added, untokenized) and passed to ``generate``. A
    progress line is printed after each sample.

    Args:
        model: Model object forwarded to ``generate``.
        tokenizer: Object providing ``apply_chat_template``; also forwarded
            to ``generate``.
        test_data: Sized iterable of samples, each holding a ``"messages"``
            list whose first entry has a ``"content"`` field.
        max_tokens: Value forwarded to ``generate`` as ``max_tokens``.

    Returns:
        A list with one ``generate`` result per sample, in input order.
    """
    outputs = []
    for position, record in enumerate(test_data, start=1):
        first_message = record["messages"][0]
        chat = [{"role": "user", "content": first_message["content"]}]

        # Render the single-turn chat to a prompt string via the chat template
        prompt_text = tokenizer.apply_chat_template(
            chat,
            add_generation_prompt=True,
            tokenize=False,
        )

        completion = generate(
            model,
            tokenizer,
            prompt=prompt_text,
            max_tokens=max_tokens,
            verbose=False,
        )
        outputs.append(completion)
        print(f"Generated {position}/{len(test_data)}")

    return outputs


print("Generation function defined.")

In [None]:
# Load the base model (no adapter) and generate a response for every evaluation sample
print("=" * 60, "LOADING BASE MODEL", "=" * 60, sep="\n")

base_model, base_tokenizer = load(MODEL_NAME)

print("\nGenerating responses with base model...")
base_responses = generate_responses(
    model=base_model,
    tokenizer=base_tokenizer,
    test_data=eval_data,
)

print("\nBase model generation complete!")

In [None]:
# Load the same model with the adapter applied, then generate for every evaluation sample
print("=" * 60, "LOADING FINE-TUNED MODEL", "=" * 60, sep="\n")

finetuned_model, finetuned_tokenizer = load(
    MODEL_NAME,
    adapter_path=str(ADAPTER_PATH),
)

print("\nGenerating responses with fine-tuned model...")
finetuned_responses = generate_responses(
    model=finetuned_model,
    tokenizer=finetuned_tokenizer,
    test_data=eval_data,
)

print("\nFine-tuned model generation complete!")

## 4. Compare Outputs

Display a comparison of the base model and fine-tuned model outputs for each evaluation sample, together with the customer ticket and the expected response.

In [None]:
from IPython.display import display, HTML
import html as html_module

def styled_comparison(idx, total, user_content, expected, finetuned, base):
    """Return an HTML card comparing model outputs for one sample (Equal Experts brand).

    All four text arguments are HTML-escaped and their newlines converted to
    ``<br>`` before being placed in the card. The card shows, in order: a
    "Sample idx of total" header, the customer ticket, the fine-tuned model
    response, the base model response, and the expected response inside a
    collapsible ``<details>`` element.

    Args:
        idx: Sample number shown in the header.
        total: Total number of samples shown in the header.
        user_content: Customer ticket text.
        expected: Ground-truth response text.
        finetuned: Fine-tuned model response text.
        base: Base model response text.

    Returns:
        The HTML markup as a string.
    """
    def _as_html(text):
        # Escape first so markup in the text is displayed literally, then keep line breaks
        return html_module.escape(text).replace("\n", "<br>")

    ticket_html = _as_html(user_content)
    reference_html = _as_html(expected)
    tuned_html = _as_html(finetuned)
    baseline_html = _as_html(base)

    # Equal Experts Brand Colors
    # Primary: EE Blue #1795D4, Secondary: Tech Blue #22567C
    # Accents: Transform Teal #269C9E, Equal Ember #F07C00
    # Neutrals: Dark Data #212526, The Cloud #F5F5F5, Byte White #FFFFFF

    return f"""
    <link href="https://fonts.googleapis.com/css2?family=Lexend:wght@300;400;500&display=swap" rel="stylesheet">
    <div style="font-family: 'Lexend', sans-serif; margin: 24px 0; overflow: hidden; box-shadow: 0 2px 8px rgba(33,37,38,0.1); border: 1px solid #E0E0E0;">
        <div style="background: #22567C; color: #FFFFFF; padding: 18px 24px; font-size: 20px; font-weight: 400;">
            Sample {idx} of {total}
        </div>

        <div style="background: #FFFFFF; border-left: 6px solid #1795D4; padding: 18px 22px;">
            <div style="font-weight: 500; color: #22567C; margin-bottom: 12px; font-size: 13px; text-transform: uppercase; letter-spacing: 1px;">
                Customer Ticket
            </div>
            <div style="font-family: 'Lexend', sans-serif; font-weight: 300; font-size: 13px; color: #212526; line-height: 1.6;">{ticket_html}</div>
        </div>

        <div style="background: #F5F5F5; border-left: 6px solid #269C9E; padding: 18px 22px;">
            <div style="font-weight: 500; color: #269C9E; margin-bottom: 12px; font-size: 13px; text-transform: uppercase; letter-spacing: 1px;">
                Fine-Tuned Model Response
            </div>
            <div style="font-family: 'Lexend', sans-serif; font-weight: 300; font-size: 13px; color: #212526; line-height: 1.6;">{tuned_html}</div>
        </div>

        <div style="background: #FFFFFF; border-left: 6px solid #212526; padding: 18px 22px;">
            <div style="font-weight: 500; color: #212526; margin-bottom: 12px; font-size: 13px; text-transform: uppercase; letter-spacing: 1px;">
                Base Model Response
            </div>
            <div style="font-family: 'Lexend', sans-serif; font-weight: 300; font-size: 13px; color: #212526; line-height: 1.6;">{baseline_html}</div>
        </div>

        <details style="background: #F5F5F5; border-left: 6px solid #F07C00; padding: 18px 22px;">
            <summary style="font-weight: 500; color: #22567C; font-size: 13px; text-transform: uppercase; letter-spacing: 1px; cursor: pointer; border-radius: 4px; padding: 4px 0;">
                Expected Response (Ground Truth) — click to expand
            </summary>
            <div style="font-family: 'Lexend', sans-serif; font-weight: 300; font-size: 13px; margin-top: 14px; color: #212526; line-height: 1.6;">{reference_html}</div>
        </details>
    </div>
    """

# Render one styled comparison card per evaluation sample
for i, sample in enumerate(eval_data):
    turns = sample["messages"]
    user_content = turns[0]["content"]     # first message: shown as the customer ticket
    expected_output = turns[1]["content"]  # second message: shown as the expected response

    card_markup = styled_comparison(
        idx=i + 1,
        total=len(eval_data),
        user_content=user_content,
        expected=expected_output,
        finetuned=finetuned_responses[i],
        base=base_responses[i],
    )
    display(HTML(card_markup))