In [None]:
# Cell 1 - Load Fine-tuned Model for Evaluation

import os
import torch
import json
import warnings
from pathlib import Path
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig,
    pipeline
)
from peft import PeftModel
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime

# Suppress warnings
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set style for plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("🤖 LLAMA-3.1 CORPORATE ASSISTANT EVALUATION")
print("=" * 60)

# Model paths
BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
FINE_TUNED_MODEL = "/home/azureuser/cloudfiles/code/Users/746582/llama-8b-ft-11th-june/llama-3.1-8b-corporate-assistant-final"

# SECURITY: access tokens must never be committed to a notebook. The token
# that used to be hard-coded here has to be treated as leaked and revoked.
# The token is now read from the environment; when it is absent, `token=None`
# lets the Hugging Face libraries fall back to locally cached credentials.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
if HF_TOKEN is None:
    print("⚠️  HF_TOKEN is not set; falling back to cached Hugging Face credentials")

# Check if fine-tuned model exists
if not Path(FINE_TUNED_MODEL).exists():
    print(f"❌ Fine-tuned model not found at: {FINE_TUNED_MODEL}")
    parent_dir = Path(FINE_TUNED_MODEL).parent
    # Only list siblings when the parent exists; otherwise iterdir() would
    # raise and hide the more helpful error raised below.
    if parent_dir.is_dir():
        print("Available directories:")
        for item in parent_dir.iterdir():
            if item.is_dir():
                print(f"   📁 {item.name}")
    else:
        print(f"Parent directory does not exist: {parent_dir}")
    raise FileNotFoundError("Please check the model path")

print(f"✅ Fine-tuned model found at: {FINE_TUNED_MODEL}")

# Load tokenizer
print("🔄 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    token=HF_TOKEN,
    trust_remote_code=True
)

# Generation needs a pad token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("✅ Tokenizer loaded successfully")

# Load base model for comparison
print("🔄 Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=HF_TOKEN,
    trust_remote_code=True
)

print("✅ Base model loaded")

# Load fine-tuned model: a second copy of the base weights that the PEFT
# adapter is attached to, so both variants can be queried side by side.
print("🔄 Loading fine-tuned model...")
fine_tuned_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=HF_TOKEN,
    trust_remote_code=True
)

# Load PEFT adapter
fine_tuned_model = PeftModel.from_pretrained(fine_tuned_model, FINE_TUNED_MODEL)
print("✅ Fine-tuned model with adapters loaded")

# Create pipelines
print("🔄 Creating inference pipelines...")

base_pipeline = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

fine_tuned_pipeline = pipeline(
    "text-generation",
    model=fine_tuned_model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

print("✅ Inference pipelines ready")
print("\n🚀 Ready for evaluation!")

# GPU info
if torch.cuda.is_available():
    print(f"🔋 GPU: {torch.cuda.get_device_name(0)}")
    print(f"📊 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("⚠️  Running on CPU")

In [None]:
# Cell 2 - Test Cases and Evaluation Functions

# Corporate assistant test cases.
# Each row is (category, question, expected keywords); the comprehension turns
# the rows into the dict records consumed by the evaluation loop.
test_cases = [
    {"category": area, "question": prompt_text, "expected_keywords": keywords}
    for area, prompt_text, keywords in (
        ("Staffing & SO",
         "How to raise a staffing SO request?",
         ["OneC", "Quick SO", "New demand", "staffing"]),
        ("Staffing & SO",
         "What is the difference between PM and FC job codes?",
         ["Full Time Employee", "Full Time Contractor", "PM", "FC"]),
        ("CWR Process",
         "How to create a CWR SO?",
         ["CWR", "Contractor Workforce Request", "Custom service", "CW00"]),
        ("CWR Process",
         "Process to convert CWR to FTE associates",
         ["New Demand", "CWR Conversion", "FTE"]),
        ("System Issues",
         "Unable to select subcontractor in the system",
         ["CWR", "OneC", "Quick SO", "grade selection"]),
        ("Cloud Services",
         "Are there any migration benefits available in Google Cloud?",
         ["migration", "google cloud", "ProcurementITCloud"]),
        ("Immigration",
         "How to view hardcopy of I-140 approval notice?",
         ["I-140", "approval notice", "USCIS", "company-owned"]),
        ("Healthcare",
         "What equipment do I need for a telemedicine appointment?",
         ["camera", "microphone", "internet", "MHC"]),
        ("General Process",
         "Do we still need to validate SOs through email for GGM SOs?",
         ["GGM", "validation", "APAC", "automated"]),
        ("System Access",
         "Unable to create opportunity id in winzone",
         ["winzone", "opportunity", "Account Manager", "CRM"]),
    )
]

def generate_response(pipeline, question, max_tokens=200):
    """Ask a text-generation pipeline one question and return its answer.

    The question is wrapped in a hand-written Llama-3 chat prompt (fixed system
    message, user turn, open assistant turn) and sampled with fixed settings.

    Args:
        pipeline: Callable text-generation pipeline to query.
        question: User question inserted into the user turn.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        The stripped generated text, or a string starting with
        "Error generating response:" when anything raises during generation.
    """
    system_prompt = "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions."

    # NOTE(review): the prompt spells out <|begin_of_text|> itself; if the
    # tokenizer also prepends a BOS token the sequence would start with two.
    # Confirm against how the fine-tuning data was tokenized.
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    try:
        # Settings are assembled inside the try so that a failing tokenizer
        # lookup is reported through the same error string as a failed call.
        sampling_options = {
            "max_new_tokens": max_tokens,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "repetition_penalty": 1.1,
            "eos_token_id": tokenizer.eos_token_id,
            "pad_token_id": tokenizer.pad_token_id,
            "return_full_text": False,
        }
        outputs = pipeline(prompt, **sampling_options)
        # return_full_text=False: only the continuation comes back.
        return outputs[0]['generated_text'].strip()
    except Exception as exc:
        return f"Error generating response: {str(exc)}"

def evaluate_response_quality(response, expected_keywords):
    """Score a response between 0.0 and 1.0 with simple text heuristics.

    The score is the sum of four parts:
      * expected-keyword coverage      (0.0 - 0.4)
      * length appropriateness         (0.1 - 0.2)
      * specificity indicator words    (0.0 - 0.2)
      * corporate terminology          (0.0 - 0.2)

    Args:
        response: Generated text to score.
        expected_keywords: Keywords a good answer should mention; matching is
            case-insensitive. May be empty, in which case the keyword part
            contributes 0.

    Returns:
        A float in [0.0, 1.0]. Empty responses and the error string produced
        by ``generate_response`` score 0.0.
    """
    import re  # local import: `re` is not imported at file level

    # Only the generator's own failure message counts as an error. A plain
    # `"Error" in response` test would zero out legitimate answers that merely
    # talk about an error.
    if not response or response.startswith("Error generating response:"):
        return 0.0

    response_lower = response.lower()

    def mentions(term):
        """Return True when `term` occurs in the response (case-insensitive)."""
        needle = term.lower()
        if len(needle) <= 3:
            # Short codes ("so", "PM", "FC", ...) must match as whole words
            # (optionally plural); as bare substrings they hit inside ordinary
            # words such as "also" or "equipment".
            pattern = rf"(?<!\w){re.escape(needle)}s?(?!\w)"
            return re.search(pattern, response_lower) is not None
        return needle in response_lower

    # Keyword matching score (0-40%); guarded against an empty keyword list.
    if expected_keywords:
        keyword_matches = sum(1 for keyword in expected_keywords if mentions(keyword))
        keyword_score = (keyword_matches / len(expected_keywords)) * 0.4
    else:
        keyword_score = 0.0

    # Length appropriateness (10-20%), measured in whitespace-separated words.
    response_length = len(response.split())
    if 10 <= response_length <= 100:
        length_score = 0.2
    elif 5 <= response_length <= 150:
        length_score = 0.15
    else:
        length_score = 0.1

    # Specificity indicators (0-20%); three or more hits earn the full share.
    specific_indicators = ["step", "process", "follow", "contact", "email", "application", "system"]
    specificity_matches = sum(1 for indicator in specific_indicators if indicator in response_lower)
    specificity_score = min(specificity_matches / 3, 1.0) * 0.2

    # Corporate terminology (0-20%); three or more hits earn the full share.
    corporate_terms = ["cognizant", "oneC", "so", "request", "process", "system", "application"]
    corporate_matches = sum(1 for term in corporate_terms if mentions(term))
    corporate_score = min(corporate_matches / 3, 1.0) * 0.2

    total_score = keyword_score + length_score + specificity_score + corporate_score
    return min(total_score, 1.0)

print("✅ Test cases and evaluation functions loaded")
print(f"📊 Total test cases: {len(test_cases)}")
print(f"📊 Categories: {len(set(case['category'] for case in test_cases))}")

# Show test categories: tally questions per category in first-seen order
categories = {}
for case in test_cases:
    categories[case['category']] = categories.get(case['category'], 0) + 1

print("\n📋 Test Categories:")
for category, count in categories.items():
    print(f"   • {category}: {count} questions")

In [None]:
# Cell 3 - Run Comprehensive Evaluation

import time
from tqdm import tqdm

print("🚀 STARTING COMPREHENSIVE MODEL EVALUATION")
print("=" * 60)
print("⏱️  This will take a few minutes...")

# One record per test case is appended here
evaluation_results = []
start_time = time.time()

for i, test_case in enumerate(tqdm(test_cases, desc="Evaluating")):
    print(f"\n📝 Question {i+1}/{len(test_cases)}: {test_case['question'][:50]}...")

    # Ask the base model, then the fine-tuned model, the same question
    base_response = generate_response(base_pipeline, test_case['question'])
    fine_tuned_response = generate_response(fine_tuned_pipeline, test_case['question'])

    # Score both answers against the same expected keywords
    base_score = evaluate_response_quality(base_response, test_case['expected_keywords'])
    fine_tuned_score = evaluate_response_quality(fine_tuned_response, test_case['expected_keywords'])

    # Assemble the record; key order defines the DataFrame column order
    result = dict(
        question_id=i + 1,
        category=test_case['category'],
        question=test_case['question'],
        expected_keywords=test_case['expected_keywords'],
        base_response=base_response,
        fine_tuned_response=fine_tuned_response,
        base_score=base_score,
        fine_tuned_score=fine_tuned_score,
    )
    result['improvement'] = fine_tuned_score - base_score
    # The denominator is floored at 0.01 so a zero base score cannot divide by zero
    result['improvement_pct'] = ((fine_tuned_score - base_score) / max(base_score, 0.01)) * 100
    evaluation_results.append(result)

    # Quick preview
    print(f"   📊 Base Score: {base_score:.3f} | Fine-tuned Score: {fine_tuned_score:.3f} | Improvement: {result['improvement']:+.3f}")

end_time = time.time()
evaluation_time = end_time - start_time

print(f"\n✅ Evaluation completed in {evaluation_time:.1f} seconds")

# Tabulate the per-question records for analysis
df_results = pd.DataFrame(evaluation_results)

# Overall statistics across all questions
overall_stats = dict(
    avg_base_score=df_results['base_score'].mean(),
    avg_fine_tuned_score=df_results['fine_tuned_score'].mean(),
    avg_improvement=df_results['improvement'].mean(),
    avg_improvement_pct=df_results['improvement_pct'].mean(),
    questions_improved=(df_results['improvement'] > 0).sum(),
    questions_degraded=(df_results['improvement'] < 0).sum(),
    questions_same=(df_results['improvement'] == 0).sum(),
    total_questions=len(df_results),
)

print("\n📊 OVERALL EVALUATION RESULTS:")
print("=" * 40)
print(f"Average Base Model Score:      {overall_stats['avg_base_score']:.3f}")
print(f"Average Fine-tuned Score:      {overall_stats['avg_fine_tuned_score']:.3f}")
print(f"Average Improvement:           {overall_stats['avg_improvement']:+.3f}")
print(f"Average Improvement %:         {overall_stats['avg_improvement_pct']:+.1f}%")
print(f"Questions Improved:            {overall_stats['questions_improved']}/{overall_stats['total_questions']}")
print(f"Questions Degraded:            {overall_stats['questions_degraded']}/{overall_stats['total_questions']}")
print(f"Success Rate:                  {(overall_stats['questions_improved']/overall_stats['total_questions'])*100:.1f}%")

# Per-category means, rounded to three decimals
print("\n📈 CATEGORY-WISE PERFORMANCE:")
print("=" * 40)
category_stats = df_results.groupby('category').agg(
    dict.fromkeys(['base_score', 'fine_tuned_score', 'improvement', 'improvement_pct'], 'mean')
).round(3)

for category in category_stats.index:
    stats = category_stats.loc[category]
    print(f"{category}:")
    print(f"  Base: {stats['base_score']:.3f} → Fine-tuned: {stats['fine_tuned_score']:.3f} ({stats['improvement']:+.3f})")

print(f"\n💾 Results stored in 'df_results' DataFrame with {len(df_results)} rows")
print("🎯 Ready for visualization!")

In [None]:
# Cell 4 - Comprehensive Visualization Dashboard

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.patches import Rectangle

# Set up the plotting style
plt.rcParams['figure.figsize'] = (16, 12)
plt.rcParams['font.size'] = 10
sns.set_style("whitegrid")

# Create a comprehensive dashboard (3x3 grid of panels on one figure)
fig = plt.figure(figsize=(20, 16))

# 1. Overall Performance Comparison (Top Left)
ax1 = plt.subplot(3, 3, 1)
models = ['Base Model', 'Fine-tuned Model']
scores = [overall_stats['avg_base_score'], overall_stats['avg_fine_tuned_score']]
colors = ['#ff7f7f', '#7fbf7f']

bars = ax1.bar(models, scores, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
ax1.set_title('🏆 Overall Performance Comparison', fontsize=14, fontweight='bold')
ax1.set_ylabel('Average Score')
ax1.set_ylim(0, 1)

# Add value labels on bars
for bar, score in zip(bars, scores):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
             f'{score:.3f}', ha='center', va='bottom', fontweight='bold')

# Add improvement annotation.
# The sign comes from the format spec: a hard-coded "+" rendered regressions
# as "+-0.050".
improvement = overall_stats['avg_improvement']
ax1.annotate(f'Improvement: {improvement:+.3f}\n({overall_stats["avg_improvement_pct"]:+.1f}%)',
             xy=(1, scores[1]), xytext=(1.3, scores[1]),
             arrowprops=dict(arrowstyle='->', color='green', lw=2),
             fontsize=11, fontweight='bold', color='green')

# 2. Score Distribution (Top Center): paired bars per question
ax2 = plt.subplot(3, 3, 2)
width = 0.35
x = np.arange(len(df_results))

bars1 = ax2.bar(
    x - width/2, df_results['base_score'], width,
    label='Base Model', color='#ff7f7f', alpha=0.8,
)
bars2 = ax2.bar(
    x + width/2, df_results['fine_tuned_score'], width,
    label='Fine-tuned Model', color='#7fbf7f', alpha=0.8,
)

ax2.set_title('📊 Score Distribution by Question', fontsize=14, fontweight='bold')
ax2.set_xlabel('Question ID')
ax2.set_ylabel('Score')
ax2.set_xticks(x)
# Tick labels are 1-based to match question_id
ax2.set_xticklabels([f'Q{number}' for number in range(1, len(df_results) + 1)], rotation=45)
ax2.legend()
ax2.set_ylim(0, 1)

# 3. Improvement Heatmap (Top Right)
ax3 = plt.subplot(3, 3, 3)
# One list of per-question improvements for each category
category_improvement = df_results.groupby('category')['improvement'].apply(list)
max_questions = max(map(len, category_improvement.values))

# Right-pad shorter rows with NaN so every category row has the same width
category_labels = list(category_improvement.index)
improvement_matrix = np.array([
    row_values + [np.nan] * (max_questions - len(row_values))
    for row_values in category_improvement.values
])

# Draw the heatmap; the colour scale is clamped to +/-0.3
im = ax3.imshow(improvement_matrix, cmap='RdYlGn', aspect='auto', vmin=-0.3, vmax=0.3)
ax3.set_title('🌡️ Improvement Heatmap by Category', fontsize=14, fontweight='bold')
ax3.set_yticks(range(len(category_labels)))
ax3.set_yticklabels(category_labels, fontsize=9)
ax3.set_xlabel('Question within Category')

# Colour legend
cbar = plt.colorbar(im, ax=ax3, shrink=0.8)
cbar.set_label('Improvement Score')

# 4. Category Performance (Middle Left): mean score per category, both models
ax4 = plt.subplot(3, 3, 4)
category_stats_plot = df_results.groupby('category').agg(
    dict.fromkeys(['base_score', 'fine_tuned_score'], 'mean')
).round(3)

# NOTE: this rebinds `categories` (previously the per-category count dict)
categories = category_stats_plot.index
x_pos = np.arange(len(categories))

bars1 = ax4.bar(
    x_pos - 0.2, category_stats_plot['base_score'], 0.4,
    label='Base Model', color='#ff7f7f', alpha=0.8,
)
bars2 = ax4.bar(
    x_pos + 0.2, category_stats_plot['fine_tuned_score'], 0.4,
    label='Fine-tuned Model', color='#7fbf7f', alpha=0.8,
)

ax4.set_title('📈 Performance by Category', fontsize=14, fontweight='bold')
ax4.set_xlabel('Category')
ax4.set_ylabel('Average Score')
ax4.set_xticks(x_pos)
ax4.set_xticklabels(categories, rotation=45, ha='right')
ax4.legend()
ax4.set_ylim(0, 1)

# 5. Success Rate Pie Chart (Middle Center)
ax5 = plt.subplot(3, 3, 5)
labels = ['Improved', 'Degraded', 'Same']
# Counts in the same order as `labels`
success_data = [
    overall_stats[key]
    for key in ('questions_improved', 'questions_degraded', 'questions_same')
]
colors = ['#7fbf7f', '#ff7f7f', '#ffff7f']
explode = (0.1, 0, 0)  # pull the "Improved" slice out of the pie

wedges, texts, autotexts = ax5.pie(
    success_data,
    labels=labels,
    colors=colors,
    explode=explode,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 10},
)

ax5.set_title('🎯 Question Improvement Distribution', fontsize=14, fontweight='bold')

# 6. Response Length Comparison (Middle Right): word counts per response
ax6 = plt.subplot(3, 3, 6)
base_lengths = list(map(lambda text: len(text.split()), df_results['base_response']))
ft_lengths = list(map(lambda text: len(text.split()), df_results['fine_tuned_response']))

ax6.hist(base_lengths, bins=15, alpha=0.7, label='Base Model', color='#ff7f7f')
ax6.hist(ft_lengths, bins=15, alpha=0.7, label='Fine-tuned Model', color='#7fbf7f')
ax6.set_title('📏 Response Length Distribution', fontsize=14, fontweight='bold')
ax6.set_xlabel('Response Length (words)')
ax6.set_ylabel('Frequency')
ax6.legend()

# 7. Training Metrics Visualization (Bottom Left)
ax7 = plt.subplot(3, 3, 7)
ax7.set_title('📉 Training Progress', fontsize=14, fontweight='bold')
# Load training metrics if available
try:
    with open(f"{FINE_TUNED_MODEL}/training_metrics.json", 'r', encoding='utf-8') as f:
        training_logs = json.load(f)

    # Collect (epoch, loss) pairs. An entry only counts when it carries both
    # keys, so the x and y series always have the same length.
    train_points = [(log['epoch'], log['loss']) for log in training_logs
                    if 'epoch' in log and 'loss' in log]
    eval_points = [(log['epoch'], log['eval_loss']) for log in training_logs
                   if 'epoch' in log and 'eval_loss' in log]

    epochs = [epoch for epoch, _ in train_points]
    train_losses = [loss for _, loss in train_points]
    eval_epochs = [epoch for epoch, _ in eval_points]
    eval_losses = [loss for _, loss in eval_points]

    if epochs and train_losses:
        ax7.plot(epochs, train_losses, 'o-', label='Training Loss', color='blue', linewidth=2)
        if eval_losses:
            ax7.plot(eval_epochs, eval_losses, 's-', label='Validation Loss', color='red', linewidth=2)

        ax7.set_xlabel('Epoch')
        ax7.set_ylabel('Loss')
        ax7.legend()
        ax7.grid(True, alpha=0.3)
    else:
        ax7.text(0.5, 0.5, 'Training metrics\nnot available', ha='center', va='center',
                transform=ax7.transAxes, fontsize=12)

except (OSError, ValueError, TypeError, KeyError) as metrics_error:
    # Missing/unreadable file, invalid JSON (JSONDecodeError is a ValueError)
    # or an unexpected log structure: show a placeholder instead of failing.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    print(f"⚠️  Training metrics unavailable: {metrics_error}")
    ax7.text(0.5, 0.5, 'Training metrics\nnot found', ha='center', va='center',
            transform=ax7.transAxes, fontsize=12)

# 8. Top Improvements (Bottom Center): five largest gains, three smallest
ax8 = plt.subplot(3, 3, 8)
top_improvements = df_results[['question_id', 'improvement']].nlargest(5, 'improvement')
bottom_improvements = df_results[['question_id', 'improvement']].nsmallest(3, 'improvement')

all_changes = pd.concat([top_improvements, bottom_improvements])
# Green for gains, red for everything else (including no change)
colors_change = all_changes['improvement'].gt(0).map({True: 'green', False: 'red'}).tolist()

bars = ax8.barh(range(len(all_changes)), all_changes['improvement'], color=colors_change, alpha=0.7)
ax8.set_title('🔄 Biggest Changes by Question', fontsize=14, fontweight='bold')
ax8.set_xlabel('Score Change')
ax8.set_yticks(range(len(all_changes)))
ax8.set_yticklabels([f"Q{int(qid)}" for qid in all_changes['question_id']])
ax8.axvline(x=0, color='black', linestyle='-', alpha=0.3)

# Value labels sit just beyond the end of each bar, on the side it points to
for i, value in enumerate(all_changes['improvement']):
    if value > 0:
        ax8.text(value + 0.01, i, f'{value:.3f}', va='center', ha='left', fontweight='bold')
    else:
        ax8.text(value + -0.01, i, f'{value:.3f}', va='center', ha='right', fontweight='bold')

# 9. Summary Statistics (Bottom Right)
ax9 = plt.subplot(3, 3, 9)
ax9.axis('off')

# Grade the run with the same criteria as the report and the final summary in
# Cell 6 (average improvement AND share of improved questions). Grading on the
# average alone could label a run EXCELLENT here and NEEDS IMPROVEMENT there.
success_rate = overall_stats['questions_improved'] / overall_stats['total_questions']
if overall_stats['avg_improvement'] > 0.1 and success_rate > 0.7:
    assessment = 'EXCELLENT'
elif overall_stats['avg_improvement'] > 0.05 and success_rate > 0.6:
    assessment = 'GOOD'
elif overall_stats['avg_improvement'] > 0 and success_rate > 0.5:
    assessment = 'FAIR'
else:
    assessment = 'NEEDS IMPROVEMENT'

summary_text = f"""
🎯 EVALUATION SUMMARY

📊 Total Questions: {overall_stats['total_questions']}
✅ Questions Improved: {overall_stats['questions_improved']} ({overall_stats['questions_improved']/overall_stats['total_questions']*100:.1f}%)
❌ Questions Degraded: {overall_stats['questions_degraded']} ({overall_stats['questions_degraded']/overall_stats['total_questions']*100:.1f}%)
➖ Questions Same: {overall_stats['questions_same']} ({overall_stats['questions_same']/overall_stats['total_questions']*100:.1f}%)

📈 Average Improvement: {overall_stats['avg_improvement']:+.3f}
📈 Percentage Improvement: {overall_stats['avg_improvement_pct']:+.1f}%

🏆 Best Category: {category_stats['improvement'].idxmax()}
🔧 Needs Work: {category_stats['improvement'].idxmin()}

⭐ Overall Assessment: {assessment}
"""

ax9.text(0.05, 0.95, summary_text, transform=ax9.transAxes, fontsize=11,
         verticalalignment='top', fontfamily='monospace',
         bbox=dict(boxstyle="round,pad=0.5", facecolor="lightblue", alpha=0.8))

# Reserve the top strip of the figure for the super-title: tight_layout does
# not know about a suptitle added afterwards, so without `rect` the title
# overlaps the top-row panel titles.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.suptitle('🤖 LLAMA-3.1 CORPORATE ASSISTANT EVALUATION DASHBOARD 🤖', 
             fontsize=18, fontweight='bold', y=0.98)

plt.show()

print("📊 Evaluation dashboard complete!")
print(f"💾 Results saved in 'df_results' DataFrame")
print(f"📈 Overall improvement: {overall_stats['avg_improvement']:+.3f} ({overall_stats['avg_improvement_pct']:+.1f}%)")

In [None]:
# Cell 5 - Interactive Response Comparison

def display_comparison(question_id=None, category=None):
    """Print a side-by-side comparison of base and fine-tuned answers.

    Selection rules, in priority order:
      * ``question_id`` given: show that single question.
      * ``category`` given: show every question of that category.
      * neither given: show the three questions with the largest improvement.

    Args:
        question_id: Value of the ``question_id`` column to look up.
        category: Exact category name to filter on.

    Returns:
        None. Everything is printed; when the selection matches nothing, a
        short notice is printed instead of raising.
    """
    if question_id is not None:
        # Show specific question
        matches = df_results[df_results['question_id'] == question_id]
        if matches.empty:
            # `.iloc[0]` on an empty selection raised a bare IndexError
            print(f"❌ No evaluation result found for question_id={question_id}")
            return
        results_to_show = matches.head(1).to_dict('records')
    elif category is not None:
        # Show all questions from category
        results_to_show = df_results[df_results['category'] == category].to_dict('records')
        if not results_to_show:
            print(f"❌ No evaluation results found for category '{category}'")
            return
    else:
        # Show top 3 improved questions
        results_to_show = df_results.nlargest(3, 'improvement').to_dict('records')

    for i, result in enumerate(results_to_show):
        print("=" * 80)
        print(f"🔍 QUESTION {result['question_id']}: {result['category']}")
        print("=" * 80)
        print(f"❓ Question: {result['question']}")
        print(f"🎯 Expected Keywords: {', '.join(result['expected_keywords'])}")
        print()

        print("🤖 BASE MODEL RESPONSE:")
        print("-" * 40)
        print(result['base_response'])
        print(f"📊 Score: {result['base_score']:.3f}")
        print()

        print("🚀 FINE-TUNED MODEL RESPONSE:")
        print("-" * 40)
        print(result['fine_tuned_response'])
        print(f"📊 Score: {result['fine_tuned_score']:.3f}")
        print()

        # Improvement analysis
        improvement = result['improvement']
        if improvement > 0:
            print(f"✅ IMPROVEMENT: +{improvement:.3f} ({result['improvement_pct']:+.1f}%)")
            print("🎉 Fine-tuned model performed better!")
        elif improvement < 0:
            print(f"❌ REGRESSION: {improvement:.3f} ({result['improvement_pct']:+.1f}%)")
            print("⚠️  Base model performed better on this question")
        else:
            print("➖ NO CHANGE: Both models performed equally")

        print("\n" + "=" * 80)
        # Blank separator between entries, but not after the last one
        if i < len(results_to_show) - 1:
            print()

# Interactive functions
def show_best_improvements(n=3):
    """Print the N questions with the largest score improvement.

    Args:
        n: Number of questions to list.
    """
    print(f"🏆 TOP {n} IMPROVED QUESTIONS:")
    best = df_results.nlargest(n, 'improvement')
    for _, row in best.iterrows():
        # The sign comes from the format spec: when fewer than n questions
        # improved, the list contains regressions and a hard-coded "+" printed
        # them as "+-0.100".
        print(f"Q{row['question_id']}: {row['improvement']:+.3f} ({row['improvement_pct']:+.1f}%) - {row['question'][:60]}...")

def show_worst_regressions(n=3):
    """Print the N questions whose score changed least favourably.

    Args:
        n: Number of questions to list.
    """
    print(f"⚠️  TOP {n} DEGRADED QUESTIONS:")
    lowest = df_results.nsmallest(n, 'improvement')
    # Walk the needed columns in lockstep, smallest improvement first
    entries = zip(
        lowest['question_id'],
        lowest['improvement'],
        lowest['improvement_pct'],
        lowest['question'],
    )
    for qid, delta, delta_pct, text in entries:
        print(f"Q{qid}: {delta:.3f} ({delta_pct:+.1f}%) - {text[:60]}...")

def show_category_performance():
    """Print mean base/fine-tuned scores and mean change for each category."""
    print("📊 PERFORMANCE BY CATEGORY:")
    print("=" * 50)
    metric_columns = ['base_score', 'fine_tuned_score', 'improvement', 'improvement_pct']
    per_category = df_results.groupby('category').agg(
        dict.fromkeys(metric_columns, 'mean')
    ).round(3)

    for name in per_category.index:
        row = per_category.loc[name]
        # Marker reflects the sign of the (rounded) mean improvement
        if row['improvement'] > 0:
            status = "✅"
        elif row['improvement'] < 0:
            status = "❌"
        else:
            status = "➖"
        print(f"{status} {name}:")
        print(f"   Base: {row['base_score']:.3f} → Fine-tuned: {row['fine_tuned_score']:.3f}")
        print(f"   Change: {row['improvement']:+.3f} ({row['improvement_pct']:+.1f}%)")
        print()

def ask_model_interactive():
    """Prompt for questions in a loop and print the fine-tuned model's answers.

    Blank input is ignored; 'quit', 'exit' or 'q' (any case) ends the loop.
    """
    for banner_line in (
        "🎮 INTERACTIVE MODEL TESTING",
        "=" * 40,
        "Ask your fine-tuned model any corporate question!",
        "Type 'quit' to exit",
    ):
        print(banner_line)
    print()

    stop_words = ['quit', 'exit', 'q']
    while True:
        question = input("❓ Your question: ").strip()

        if question.lower() in stop_words:
            print("👋 Thanks for testing!")
            return

        if question:
            print("\n🤖 Generating response...")

            # Only the fine-tuned model is queried here, with a shorter budget
            response = generate_response(fine_tuned_pipeline, question, max_tokens=150)

            print("\n🚀 FINE-TUNED MODEL RESPONSE:")
            print("-" * 40)
            print(response)
            print("-" * 40)
            print()

# Quick access functions: list the helpers defined in this cell
print("🎯 QUICK EVALUATION FUNCTIONS AVAILABLE:")
print("=" * 50)
for help_line in (
    "• show_best_improvements(n=3) - Show top improved questions",
    "• show_worst_regressions(n=3) - Show degraded questions",
    "• show_category_performance() - Performance by category",
    "• display_comparison(question_id=X) - Show specific question",
    "• display_comparison(category='Category Name') - Show category",
    "• ask_model_interactive() - Test with custom questions",
):
    print(help_line)
print()

# Show some quick insights: best three, worst two, then the category table
print("🚀 QUICK INSIGHTS:")
print("=" * 20)
show_best_improvements(3)
print()
show_worst_regressions(2)
print()
show_category_performance()

for tip_line in (
    "\n💡 TIP: Use display_comparison(question_id=5) to see detailed comparison for question 5",
    "💡 TIP: Use ask_model_interactive() to test with your own questions!",
):
    print(tip_line)

In [None]:
# Cell 6 - Export Results and Generate Report

from datetime import datetime
import os

def generate_evaluation_report() -> str:
    """Generate a comprehensive evaluation report

    Builds a Markdown document from the notebook-level `df_results`,
    `overall_stats`, `test_cases`, `BASE_MODEL` and `FINE_TUNED_MODEL` and
    writes it to `llama_evaluation_report_<timestamp>.md` in the current
    working directory.

    Returns:
        The file name of the written report.
    """
    
    # Timestamp makes the file name unique per run
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_filename = f"llama_evaluation_report_{timestamp}.md"
    
    # Prepare detailed results
    best_questions = df_results.nlargest(3, 'improvement')
    worst_questions = df_results.nsmallest(3, 'improvement')
    
    # Header and executive summary; later sections are appended to this string
    report_content = f"""# Llama-3.1 Corporate Assistant Evaluation Report

**Generated:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
**Model:** Llama-3.1-8B-Instruct (Fine-tuned)
**Model Path:** {FINE_TUNED_MODEL}

## Executive Summary

### Overall Performance
- **Total Questions Evaluated:** {overall_stats['total_questions']}
- **Average Base Model Score:** {overall_stats['avg_base_score']:.3f}
- **Average Fine-tuned Score:** {overall_stats['avg_fine_tuned_score']:.3f}
- **Overall Improvement:** {overall_stats['avg_improvement']:+.3f} ({overall_stats['avg_improvement_pct']:+.1f}%)

### Success Metrics
- **Questions Improved:** {overall_stats['questions_improved']}/{overall_stats['total_questions']} ({overall_stats['questions_improved']/overall_stats['total_questions']*100:.1f}%)
- **Questions Degraded:** {overall_stats['questions_degraded']}/{overall_stats['total_questions']} ({overall_stats['questions_degraded']/overall_stats['total_questions']*100:.1f}%)
- **Questions Unchanged:** {overall_stats['questions_same']}/{overall_stats['total_questions']} ({overall_stats['questions_same']/overall_stats['total_questions']*100:.1f}%)

## Category Performance Analysis

"""
    
    # Add category analysis (local recomputation; shadows any notebook-level
    # `category_stats` only inside this function)
    category_stats = df_results.groupby('category').agg({
        'base_score': 'mean',
        'fine_tuned_score': 'mean',
        'improvement': 'mean',
        'improvement_pct': 'mean'
    }).round(3)
    
    for category in category_stats.index:
        stats = category_stats.loc[category]
        # Heading suffix reflects the sign of the rounded mean improvement
        status = "✅ Improved" if stats['improvement'] > 0 else "❌ Degraded" if stats['improvement'] < 0 else "➖ No Change"
        report_content += f"""
### {category} {status}
- **Base Score:** {stats['base_score']:.3f}
- **Fine-tuned Score:** {stats['fine_tuned_score']:.3f}
- **Improvement:** {stats['improvement']:+.3f} ({stats['improvement_pct']:+.1f}%)
"""
    
    # Add best performing questions
    report_content += f"""
## Top Performing Questions

### Best Improvements
"""
    for _, row in best_questions.iterrows():
        # Responses are truncated to their first 200 characters in the report
        report_content += f"""
**Question {row['question_id']}:** {row['question']}
- **Category:** {row['category']}
- **Base Score:** {row['base_score']:.3f}
- **Fine-tuned Score:** {row['fine_tuned_score']:.3f}
- **Improvement:** {row['improvement']:+.3f} ({row['improvement_pct']:+.1f}%)

*Base Response:* {row['base_response'][:200]}...

*Fine-tuned Response:* {row['fine_tuned_response'][:200]}...

---
"""
    
    # Add worst performing questions
    report_content += f"""
### Areas for Improvement
"""
    for _, row in worst_questions.iterrows():
        report_content += f"""
**Question {row['question_id']}:** {row['question']}
- **Category:** {row['category']}
- **Base Score:** {row['base_score']:.3f}
- **Fine-tuned Score:** {row['fine_tuned_score']:.3f}
- **Change:** {row['improvement']:+.3f} ({row['improvement_pct']:+.1f}%)

---
"""
    
    # Add recommendations
    avg_improvement = overall_stats['avg_improvement']
    # Share of questions whose score went up
    success_rate = overall_stats['questions_improved'] / overall_stats['total_questions']
    
    report_content += f"""
## Recommendations

### Overall Assessment
"""
    
    # Grade the run: both the average gain and the success rate must clear a
    # tier's thresholds, otherwise the next lower tier is tried
    if avg_improvement > 0.1 and success_rate > 0.7:
        assessment = "EXCELLENT"
        recommendations = """
- ✅ The fine-tuning was highly successful
- ✅ Deploy the model to production
- ✅ Consider expanding the training dataset for even better performance
- ✅ Monitor performance in production and collect feedback
"""
    elif avg_improvement > 0.05 and success_rate > 0.6:
        assessment = "GOOD"
        recommendations = """
- ✅ The fine-tuning was successful
- ⚠️ Consider additional training on underperforming categories
- ✅ Deploy with monitoring and feedback collection
- 🔧 Review and improve training data quality
"""
    elif avg_improvement > 0 and success_rate > 0.5:
        assessment = "FAIR"
        recommendations = """
- ⚠️ Modest improvement achieved
- 🔧 Review training data quality and quantity
- 🔧 Consider adjusting hyperparameters
- 🔧 Add more diverse training examples
- ⚠️ Test thoroughly before production deployment
"""
    else:
        assessment = "NEEDS IMPROVEMENT"
        recommendations = """
- ❌ Fine-tuning did not achieve desired results
- 🔧 Review training data quality and alignment with evaluation criteria
- 🔧 Consider different model architecture or training approach
- 🔧 Increase training data quantity and diversity
- ❌ Do not deploy without significant improvements
"""
    
    # NOTE(review): the LoRA rank, target modules and the "appears effective"
    # remark below are fixed text, not read from the adapter config or derived
    # from the results — confirm they match the actual training run.
    report_content += f"""
**Assessment:** {assessment}

{recommendations}

### Technical Recommendations
- **Best Performing Category:** {category_stats.loc[category_stats['improvement'].idxmax()].name}
- **Needs Most Attention:** {category_stats.loc[category_stats['improvement'].idxmin()].name}
- **Training Data:** Consider adding more examples for underperforming categories
- **Model Architecture:** Current LoRA configuration appears effective

## Technical Details

### Model Configuration
- **Base Model:** {BASE_MODEL}
- **Fine-tuning Method:** QLoRA (4-bit quantization + LoRA adapters)
- **LoRA Rank:** 64
- **Target Modules:** q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj

### Evaluation Methodology
- **Evaluation Criteria:** Keyword matching, response length, specificity, corporate terminology
- **Test Cases:** {len(test_cases)} questions across {len(set(case['category'] for case in test_cases))} categories
- **Scoring Range:** 0.0 - 1.0 (higher is better)

---

*Report generated automatically by Llama-3.1 Corporate Assistant Evaluation System*
"""
    
    # Save the report (UTF-8 because the content contains emoji)
    with open(report_filename, 'w', encoding='utf-8') as f:
        f.write(report_content)
    
    print(f"📄 Comprehensive report saved as: {report_filename}")
    return report_filename

def export_results_csv():
    """Write the per-question results to a timestamped CSV file.

    The exported table is a copy of `df_results` with two extra columns:
    `timestamp` (export time) and `model_path` (the fine-tuned model path).

    Returns:
        The file name of the written CSV.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"llama_evaluation_results_{timestamp}.csv"

    # assign() returns a new frame, so df_results itself is left untouched
    annotated = df_results.assign(
        timestamp=datetime.now(),
        model_path=FINE_TUNED_MODEL,
    )
    annotated.to_csv(csv_filename, index=False)

    print(f"📊 Detailed results exported to: {csv_filename}")
    return csv_filename

def save_evaluation_plots():
    """Save the evaluation dashboard as a PNG image.

    Returns:
        The file name of the written image.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    plot_filename = f"llama_evaluation_dashboard_{timestamp}.png"

    # Save the dashboard Figure created in Cell 4 (`fig`) explicitly.
    # `plt.savefig` targets pyplot's *current* figure, and once Cell 4 has
    # shown (and, with the inline notebook backend, closed) its figure that is
    # a fresh empty one, so the saved image would be blank.
    dashboard = globals().get('fig')
    if dashboard is None:
        # Cell 4 has not been run; fall back to whatever pyplot has active.
        dashboard = plt.gcf()

    dashboard.savefig(plot_filename, dpi=300, bbox_inches='tight',
                      facecolor='white', edgecolor='none')
    print(f"📈 Evaluation dashboard saved as: {plot_filename}")
    return plot_filename

# Generate all outputs: Markdown report, CSV export, dashboard image
print("📋 GENERATING EVALUATION OUTPUTS")
print("=" * 40)

report_file = generate_evaluation_report()
csv_file = export_results_csv()
plot_file = save_evaluation_plots()

print("\n✅ EVALUATION COMPLETE!")
print("=" * 30)
print(f"📄 Report: {report_file}")
print(f"📊 Data: {csv_file}")
print(f"📈 Plots: {plot_file}")

# Final summary
print(f"\n🎯 FINAL ASSESSMENT: ", end="")
avg_improvement = overall_stats['avg_improvement']
success_rate = overall_stats['questions_improved'] / overall_stats['total_questions']

# Pick the highest tier whose two thresholds are both cleared
verdict = (
    "🏆 EXCELLENT - Ready for production!"
    if avg_improvement > 0.1 and success_rate > 0.7
    else "✅ GOOD - Deploy with monitoring"
    if avg_improvement > 0.05 and success_rate > 0.6
    else "⚠️ FAIR - Needs improvement"
    if avg_improvement > 0 and success_rate > 0.5
    else "❌ NEEDS WORK - Significant improvements required"
)
print(verdict)

print(f"\n📊 Key Metrics:")
print(f"   • Average improvement: {overall_stats['avg_improvement']:+.3f}")
print(f"   • Success rate: {success_rate*100:.1f}%")
print(f"   • Questions improved: {overall_stats['questions_improved']}/{overall_stats['total_questions']}")

print(f"\n💾 All results stored in 'df_results' variable for further analysis")
print(f"🎮 Use ask_model_interactive() to test the model with your own questions!")