# Testing Fine-tuned Quotes Model

This notebook tests the MLX LoRA fine-tuned Llama 3.2 3B model for motivational quote generation.

In [None]:
# Import MLX-LM for inference
import mlx_lm
import os

# Announce the load and show the working directory, since the adapter path
# used when loading the model is given relative to it.
print(
    "Loading fine-tuned model...",
    f"Working directory: {os.getcwd()}",
    sep="\n",
)

In [None]:
# Load the base checkpoint together with the LoRA adapter stored under
# ../models (path is relative to the current working directory).
model, tokenizer = mlx_lm.load(
    'mlx-community/Llama-3.2-3B-Instruct-4bit',
    adapter_path='../models/llama3.2-3b-quotes-lora-mlx',
)

# Report success along with the concrete classes that were returned.
print(
    "✅ Model loaded successfully!",
    f"Model type: {type(model)}",
    f"Tokenizer type: {type(tokenizer)}",
    sep="\n",
)

In [None]:
# Test function
from mlx_lm.sample_utils import make_sampler

def test_quote_generation(prompt, max_tokens=100, temperature=0.7):
    """Run the module-level fine-tuned model on ``prompt`` and print the result.

    Args:
        prompt: Text passed unchanged to ``mlx_lm.generate``.
        max_tokens: Forwarded to ``mlx_lm.generate`` as ``max_tokens``.
        temperature: Sampling temperature given to ``make_sampler``.

    Returns:
        The text returned by ``mlx_lm.generate``.
    """
    print(f"🎯 Prompt: {prompt}")
    print("🤖 Generating...")

    # Bundle the generation settings; the sampler carries the temperature.
    generation_kwargs = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "sampler": make_sampler(temp=temperature),
    }
    quote = mlx_lm.generate(model, tokenizer, **generation_kwargs)

    print(f"💭 Response: {quote}")
    print("-" * 50)
    return quote

In [None]:
# Test 1: Perseverance (uses the defaults max_tokens=100, temperature=0.7)
test_quote_generation("Give me advice about perseverance")

In [None]:
# Test 2: Self-discipline (uses the defaults max_tokens=100, temperature=0.7)
test_quote_generation("Give me advice about self-discipline")

In [None]:
# Test 3: Success (uses the defaults max_tokens=100, temperature=0.7)
test_quote_generation("Give me advice about success")

In [None]:
# Test 4: Leadership (uses the defaults max_tokens=100, temperature=0.7)
test_quote_generation("Give me advice about leadership")

In [None]:
# Test 5: Personal Growth (uses the defaults max_tokens=100, temperature=0.7)
test_quote_generation("Give me advice about personal growth")

In [None]:
# Load base model for comparison.
# Same checkpoint name as the fine-tuned load, but no adapter_path is passed,
# so this pair is the model without the LoRA adapter.
print("Loading base model for comparison...")
base_model, base_tokenizer = mlx_lm.load('mlx-community/Llama-3.2-3B-Instruct-4bit')
print("✅ Base model loaded!")

def compare_models(prompt, temperature=0.7, max_tokens=100):
    """Print fine-tuned and base model completions for the same prompt.

    Both models receive the identical prompt and share one sampler, so the
    only intended difference between the two outputs is the model itself.

    Args:
        prompt: Text passed unchanged to both models.
        temperature: Sampling temperature given to ``make_sampler``.
        max_tokens: Forwarded to ``mlx_lm.generate`` as ``max_tokens`` for
            both models.

    Returns:
        None. The prompt and both completions are printed.
    """
    sampler = make_sampler(temp=temperature)

    # Real newlines here: the escaped form "\\n" would print a literal
    # backslash followed by "n" instead of breaking the line.
    print(f"🎯 Prompt: '{prompt}'\n")

    # Fine-tuned model (module-level `model` / `tokenizer`)
    ft_response = mlx_lm.generate(
        model, tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        sampler=sampler
    )

    # Base model (module-level `base_model` / `base_tokenizer`)
    base_response = mlx_lm.generate(
        base_model, base_tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        sampler=sampler
    )

    # Side by side display
    print(f"📚 Fine-tuned Model:\n{ft_response}\n")
    print(f"🔧 Base Model:\n{base_response}\n")
    print("=" * 80)

In [None]:
# Gradio Chat Interface - Fixed
import gradio as gr
from mlx_lm.sample_utils import make_sampler

# Global state for model switching: (model, tokenizer) pairs keyed by the
# dropdown label. If the comparison cell already loaded the base model into
# `base_model` / `base_tokenizer`, reuse that pair so the same checkpoint is
# not loaded a second time; otherwise "Base" stays None until first requested.
try:
    _preloaded_base = (base_model, base_tokenizer)
except NameError:
    # Comparison cell was not run; loading is deferred to load_base_model().
    _preloaded_base = None

models_loaded = {"Fine-tuned": (model, tokenizer), "Base": _preloaded_base}

def load_base_model():
    """Fill ``models_loaded["Base"]`` on first use; do nothing once it is set."""
    if models_loaded["Base"] is not None:
        return

    print("Loading base model...")
    loaded_model, loaded_tokenizer = mlx_lm.load('mlx-community/Llama-3.2-3B-Instruct-4bit')
    models_loaded["Base"] = (loaded_model, loaded_tokenizer)
    print("✅ Base model loaded!")

def chat_respond(message, history, model_selection, temperature):
    """Generate one reply from the selected model for the Gradio UI.

    Args:
        message: Prompt text from the textbox. It is sent to the model as-is;
            no chat template is applied.
        history: Unused. Kept so the signature matches the wired inputs.
        model_selection: Key into ``models_loaded`` ("Fine-tuned" or "Base").
        temperature: Sampling temperature given to ``make_sampler``.

    Returns:
        The generated text, or a string starting with ``"Error: "`` if
        loading the model or generating the reply raised an exception.
    """
    # Simple prompt format (not using chat template to avoid issues)
    prompt = f"{message}"

    # Everything that can fail runs inside the try so load errors are shown
    # in the response box the same way generation errors are.
    try:
        if model_selection == "Base":
            # Returns immediately when the base model is already cached.
            load_base_model()

        selected_model, selected_tokenizer = models_loaded[model_selection]

        # Use direct generate instead of stream for simplicity
        response = mlx_lm.generate(
            selected_model, selected_tokenizer,
            prompt=prompt,
            max_tokens=150,
            sampler=make_sampler(temp=temperature)
        )

        # Drop only a leading echo of the prompt. Replacing every occurrence
        # would also delete the prompt text wherever the model legitimately
        # repeats it (e.g. a one-word prompt such as "success").
        if response.startswith(prompt):
            response = response[len(prompt):].strip()

        return response
    except Exception as e:
        return f"Error: {e}"

# Build the Gradio UI: model/temperature controls, a prompt box, a trigger
# button, a read-only response box and a few example prompts.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 MLX Quote Chat")
    gr.Markdown("Compare **Fine-tuned** vs **Base** model responses")

    # Generation controls: which model to query and how to sample from it.
    with gr.Row():
        model_dropdown = gr.Dropdown(
            label="Model",
            choices=["Fine-tuned", "Base"],
            value="Fine-tuned",
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            label="Temperature",
        )

    # Prompt entry, trigger button and read-only output in a single column.
    with gr.Row(), gr.Column():
        prompt_input = gr.Textbox(
            lines=2,
            label="Your Prompt",
            placeholder="Give me advice about courage",
        )

        generate_btn = gr.Button("Generate", variant="primary")

        response_output = gr.Textbox(
            lines=6,
            label="Response",
            interactive=False,
        )

    # Clickable sample prompts that populate the prompt box.
    example_prompts = [
        "Give me advice about perseverance",
        "Give me advice about courage",
        "Give me advice about success",
        "Give me advice about self-discipline",
    ]
    gr.Examples(examples=example_prompts, inputs=prompt_input)

    # Clicking the button and pressing Enter in the prompt box both call
    # chat_respond. Each binding gets its own empty State for the `history`
    # argument, which chat_respond does not use.
    for bind_event in (generate_btn.click, prompt_input.submit):
        bind_event(
            fn=chat_respond,
            inputs=[prompt_input, gr.State([]), model_dropdown, temperature],
            outputs=response_output,
        )

# Serve locally on a fixed port without creating a public share link.
demo.launch(server_port=7861, share=False)