# Evaluation: Gemma-3-270m-it Financial Sentiment Analysis

This notebook evaluates the fine-tuned model against the base model on the test set.

## Overview
- **Model**: gemma-3-270m-it-bf16 (~270M parameters)
- **Task**: Classify financial text as positive, negative, neutral, or bullish
- **Evaluation**: Compare base model vs fine-tuned model accuracy

## 1. Setup and Configuration

In [None]:
# pandas: used below to show per-sample predictions as DataFrames.
import pandas as pd
# json: parses each line of the JSONL test file.
import json
# Path: builds the data and adapter locations.
from pathlib import Path
# load / generate: model loading and text generation from mlx_lm.
from mlx_lm import load, generate
# NOTE(review): `mx` is not referenced in any of the cells shown here.
import mlx.core as mx

# Simple status marker so it is obvious the imports succeeded.
print("Setup complete!")

In [None]:
# Configuration shared by every evaluation cell below.
MODEL_NAME = "mlx-community/gemma-3-270m-it-bf16"  # model identifier handed to load()
DATA_DIR = Path("data")  # evaluate_model() reads DATA_DIR / "test.jsonl" by default
ADAPTER_PATH = Path("adapters")  # passed as adapter_path for the fine-tuned run

# Echo the settings, one per line, so each run records what it used.
print(
    f"Model: {MODEL_NAME}",
    f"Data directory: {DATA_DIR}",
    f"Adapter path: {ADAPTER_PATH}",
    sep="\n",
)

## 2. Evaluation

Compare the base model and the fine-tuned model on the test set. For each sample, a sentiment label is extracted from the generated response and compared with the expected label.

In [None]:
def _extract_label(response, valid_labels):
    """Return the label from ``valid_labels`` that occurs earliest in ``response``.

    Matching is a case-insensitive substring search, so ``valid_labels`` must
    be lowercase. Returns ``None`` when no label occurs in the response.
    """
    text = response.lower()
    best_pos = None
    best_label = None
    for label in valid_labels:
        pos = text.find(label)
        # Keep the match closest to the start of the response so the result
        # does not depend on the order in which labels are tried.
        if pos != -1 and (best_pos is None or pos < best_pos):
            best_pos, best_label = pos, label
    return best_label


def evaluate_model(model_path, adapter_path=None, test_file=None, max_samples=50,
                   max_tokens=500):
    """
    Evaluate model accuracy on sentiment classification.

    Each non-blank line of the test file is parsed as JSON with a ``messages``
    list. The first two messages are rendered with the tokenizer's chat
    template and used as the prompt; the content of the third message is the
    expected label.
    NOTE(review): this assumes the records are ordered system, user,
    assistant -- confirm against the data preparation step.

    Args:
        model_path: Path to base model
        adapter_path: Path to LoRA adapters (optional)
        test_file: Path to test data JSONL (defaults to DATA_DIR / "test.jsonl")
        max_samples: Maximum number of samples to evaluate
        max_tokens: Generation budget per sample. Only the first label found
            in the response is used, so a smaller value may be sufficient.

    Returns:
        dict with keys 'accuracy' (float in [0, 1]), 'correct', 'total' and
        'predictions' (one dict per sample with 'expected', 'predicted',
        'response' and 'correct'). 'predicted' is None when the response
        contains no valid label.
    """
    if test_file is None:
        test_file = DATA_DIR / "test.jsonl"

    print(f"Loading model from {model_path}")
    if adapter_path:
        print(f"Loading adapters from {adapter_path}")

    model, tokenizer = load(model_path, adapter_path=adapter_path)

    # A tuple rather than a set: iteration order is fixed, so label
    # extraction gives the same answer on every run.
    valid_labels = ('positive', 'negative', 'neutral', 'bullish')
    correct = 0
    total = 0
    predictions = []

    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            if total >= max_samples:
                break

            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            data = json.loads(line)
            messages = data['messages']

            # Expected label from assistant message
            expected_label = messages[2]['content'].strip().lower()

            # Build prompt using chat template
            prompt = tokenizer.apply_chat_template(
                messages[:2],  # system + user only
                add_generation_prompt=True,
                tokenize=False
            )

            # Generate response
            response = generate(
                model,
                tokenizer,
                prompt=prompt,
                max_tokens=max_tokens,
                verbose=False
            )

            # Predicted label is the valid label that appears first in the
            # response; None if the response mentions none of them.
            predicted_label = _extract_label(response, valid_labels)

            is_correct = predicted_label == expected_label
            if is_correct:
                correct += 1
            total += 1

            predictions.append({
                'expected': expected_label,
                'predicted': predicted_label,
                'response': response,
                'correct': is_correct
            })

            if total % 10 == 0:
                print(f"Evaluated {total}/{max_samples} samples...")

    accuracy = correct / total if total > 0 else 0.0

    return {
        'accuracy': accuracy,
        'correct': correct,
        'total': total,
        'predictions': predictions
    }

print("Evaluation function defined.")

In [None]:
# Baseline run: score the base model with no LoRA adapters applied.
print(
    "=" * 60,
    "EVALUATING BASE MODEL (no fine-tuning)",
    "=" * 60,
    sep="\n",
)

# 200 samples instead of the function default of 50, for steadier metrics.
# NOTE(review): the original comment says 600 test samples are available --
# not verifiable from this notebook.
base_results = evaluate_model(
    model_path=MODEL_NAME,
    adapter_path=None,
    max_samples=200,
)

# Summary block: accuracy as a percentage plus the raw correct/total counts.
print(
    "\n" + "=" * 60,
    "BASE MODEL RESULTS",
    "=" * 60,
    f"Accuracy: {base_results['accuracy']:.2%}",
    f"Correct:  {base_results['correct']}/{base_results['total']}",
    sep="\n",
)

In [None]:
# Fine-tuned run: same base model, with the LoRA adapters from ADAPTER_PATH.
print(
    "=" * 60,
    "EVALUATING FINE-TUNED MODEL (with LoRA adapters)",
    "=" * 60,
    sep="\n",
)

# Same sample budget as the base-model run so the two accuracies are comparable.
# NOTE(review): the original comment says 600 test samples are available --
# not verifiable from this notebook.
finetuned_results = evaluate_model(
    model_path=MODEL_NAME,
    adapter_path=str(ADAPTER_PATH),
    max_samples=200,
)

# Summary block: accuracy as a percentage plus the raw correct/total counts.
print(
    "\n" + "=" * 60,
    "FINE-TUNED MODEL RESULTS",
    "=" * 60,
    f"Accuracy: {finetuned_results['accuracy']:.2%}",
    f"Correct:  {finetuned_results['correct']}/{finetuned_results['total']}",
    sep="\n",
)

In [None]:
# Side-by-side comparison of the two evaluation runs.
# `improvement` is the difference between the two accuracy fractions;
# it is printed as a signed percentage.
improvement = finetuned_results['accuracy'] - base_results['accuracy']

print(
    "\n" + "=" * 60,
    "COMPARISON",
    "=" * 60,
    f"Base Model Accuracy:       {base_results['accuracy']:.2%}",
    f"Fine-Tuned Model Accuracy: {finetuned_results['accuracy']:.2%}",
    f"Improvement:               {improvement:+.2%}",
    sep="\n",
)

# One verdict line. The relative factor is only defined when the base
# accuracy is non-zero, so a 0% baseline gets its own message.
if improvement == 0:
    print("\nNo change in accuracy")
elif improvement > 0 and base_results['accuracy'] > 0:
    factor = finetuned_results['accuracy'] / base_results['accuracy']
    print(f"\nFine-tuning improved accuracy by {factor:.2f}x")
elif improvement > 0:
    print(f"\nFine-tuning improved accuracy from 0% to {finetuned_results['accuracy']:.2%}")
else:
    print("\nFine-tuning decreased accuracy (may indicate overfitting)")

In [None]:
# Per-sample results of the fine-tuned run as a table
# (columns: expected, predicted, response, correct); the index is labelled '#'.
results_df = pd.DataFrame(finetuned_results['predictions']).rename_axis('#')
results_df

In [None]:
# Classification metrics for fine-tuned model
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

labels = ['positive', 'negative', 'neutral', 'bullish']

# Ground truth and predictions; a prediction of None (no label found in the
# response) is reported under the placeholder class 'unknown'.
y_true = [rec['expected'] for rec in finetuned_results['predictions']]
y_pred = [rec['predicted'] or 'unknown' for rec in finetuned_results['predictions']]

# Only show an 'unknown' row/column when at least one prediction needed it.
display_labels = labels + ['unknown'] if 'unknown' in y_pred else labels

# Confusion matrix: rows are actual labels, columns are predicted labels.
cm = confusion_matrix(y_true, y_pred, labels=display_labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=display_labels,
    yticklabels=display_labels,
    ax=ax,
)
ax.set_title('Fine-tuned Model - Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.tight_layout()
plt.show()

# Per-class precision/recall/F1, restricted to the four real labels
# ('unknown' is not passed in `labels` here).
print(
    "\nClassification Report (Fine-tuned Model)",
    "=" * 60,
    classification_report(y_true, y_pred, labels=labels, zero_division=0),
    sep="\n",
)

In [None]:
# Per-sample results of the base-model run as a table
# (columns: expected, predicted, response, correct); the index is labelled '#'.
base_df = pd.DataFrame(base_results['predictions']).rename_axis('#')
base_df