In [2]:
%pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m73.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets
Successfully installed ipywidgets-8.1.7 jupyterlab_widgets-3.0.15 widgetsnbextension-4.0.14
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Load Models for Interactive Testing

import os
import torch
import warnings
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig,
    pipeline
)
from peft import PeftModel
from pathlib import Path

# Suppress warnings
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

print("🤖 LOADING MODELS FOR INTERACTIVE TESTING")
print("=" * 60)

# Model configurations
BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
FINE_TUNED_MODEL = "/home/azureuser/cloudfiles/code/Users/746582/llama-8b-ft-11th-june/llama-3.1-8b-corporate-assistant-final"

# Never commit access tokens to the notebook: read the token from the
# environment instead. When HF_TOKEN is unset this is None, and
# `from_pretrained(token=None)` falls back to the credentials cached by
# `huggingface-cli login`.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("ℹ️  HF_TOKEN is not set; relying on cached Hugging Face credentials")

# Check if fine-tuned model exists
print(f"🔍 Checking model path: {FINE_TUNED_MODEL}")
if not Path(FINE_TUNED_MODEL).exists():
    print("❌ Fine-tuned model not found!")
    print("📁 Available directories:")
    parent_dir = Path(FINE_TUNED_MODEL).parent
    # Only list siblings when the parent exists; otherwise iterdir() would
    # raise and the fallback search below would never run.
    if parent_dir.is_dir():
        for item in parent_dir.iterdir():
            if item.is_dir() and any(keyword in item.name.lower() for keyword in ['llama', 'corporate', 'assistant', 'final']):
                print(f"   📁 {item.name}")

    # Try to find the correct path
    possible_paths = [
        "/home/azureuser/cloudfiles/code/Users/746582/llama-8b-ft-11th-june/llama-3.1-8b-corporate-assistant",
        "./llama-3.1-8b-corporate-assistant-final",
        "./llama-3.1-8b-corporate-assistant"
    ]

    # for/else: the else branch runs only when no candidate path existed.
    for path in possible_paths:
        if Path(path).exists():
            FINE_TUNED_MODEL = path
            print(f"✅ Found model at: {FINE_TUNED_MODEL}")
            break
    else:
        raise FileNotFoundError("Cannot find fine-tuned model. Please check the path.")

print(f"✅ Using fine-tuned model: {FINE_TUNED_MODEL}")

# Report the accelerator in use; on CUDA also release cached allocator blocks
# before the models are loaded.
if not torch.cuda.is_available():
    print("⚠️  Running on CPU")
else:
    print(f"🔋 GPU: {torch.cuda.get_device_name(0)}")
    print(f"📊 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    # Start from a clean CUDA cache.
    torch.cuda.empty_cache()

# Load tokenizer
# The tokenizer is loaded from BASE_MODEL (not from the adapter directory) and
# is the one passed to both pipelines created later in this cell.
print("\n🔄 Loading tokenizer...")
try:
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repo to run locally; presumably not needed for this model — confirm.
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        token=HF_TOKEN,
        trust_remote_code=True
    )
    
    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    print("✅ Tokenizer loaded successfully")
    
except Exception as e:
    # Report the failure, then re-raise so the cell stops here.
    print(f"❌ Error loading tokenizer: {e}")
    raise

# Quantization config for memory efficiency
# 4-bit NF4 weights with double quantization enabled; compute dtype is bfloat16.
# The same config object is reused for the fine-tuned model's first load attempt.
qlora_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load base model
# This instance backs `base_pipeline` and is also the last-resort stand-in for
# the fine-tuned model if adapter loading fails further down.
print("\n🔄 Loading base model...")
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=qlora_config,
        device_map="auto",
        token=HF_TOKEN,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16
    )
    print("✅ Base model loaded")
    
except Exception as e:
    # Report the failure, then re-raise so the cell stops here.
    print(f"❌ Error loading base model: {e}")
    raise

# Load fine-tuned model (base + adapters)
# Three-step strategy:
#   1. a second, separately loaded 4-bit copy of BASE_MODEL with the PEFT
#      adapters from FINE_TUNED_MODEL applied;
#   2. on any failure, an unquantized bfloat16 copy with the same adapters;
#   3. if that fails too, `fine_tuned_model` is bound to `base_model` itself.
print("\n🔄 Loading fine-tuned model...")
try:
    # Load base model for fine-tuned version
    # (a separate instance from `base_model`; it is wrapped by PeftModel below)
    fine_tuned_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=qlora_config,
        device_map="auto", 
        token=HF_TOKEN,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16
    )
    
    # Load PEFT adapters
    fine_tuned_model = PeftModel.from_pretrained(fine_tuned_model, FINE_TUNED_MODEL)
    print("✅ Fine-tuned model with adapters loaded")
    
except Exception as e:
    print(f"❌ Error loading fine-tuned model: {e}")
    print("🔧 Trying alternative loading method...")
    
    try:
        # Alternative: Try loading without quantization first
        # NOTE(review): if the first attempt failed while applying adapters,
        # its model may still be referenced while this copy loads — confirm
        # memory headroom.
        fine_tuned_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            token=HF_TOKEN,
            trust_remote_code=True
        )
        fine_tuned_model = PeftModel.from_pretrained(fine_tuned_model, FINE_TUNED_MODEL)
        print("✅ Fine-tuned model loaded (alternative method)")
        
    except Exception as e2:
        print(f"❌ Alternative loading failed: {e2}")
        print("🔧 Will use base model only for testing")
        # After this assignment both names refer to the same object, so the
        # "fine-tuned" pipeline built below wraps the plain base model.
        fine_tuned_model = base_model

# Create pipelines
# Wraps each already-loaded model in a "text-generation" pipeline that shares
# the single tokenizer loaded above.
# NOTE(review): torch_dtype/device_map are passed alongside an instantiated
# model; presumably they only matter when the pipeline loads the model itself —
# confirm against the transformers version in use.
print("\n🔄 Creating inference pipelines...")
try:
    base_pipeline = pipeline(
        "text-generation",
        model=base_model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    print("✅ Base pipeline created")
    
    fine_tuned_pipeline = pipeline(
        "text-generation", 
        model=fine_tuned_model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    print("✅ Fine-tuned pipeline created")
    
except Exception as e:
    # Report the failure, then re-raise so the cell stops here.
    print(f"❌ Error creating pipelines: {e}")
    raise

# Test pipelines with a simple generation
# Smoke test: request 5 new tokens with sampling disabled from each pipeline.
# The generated outputs (`base_test`, `ft_test`) are not inspected; only the
# absence of an exception is checked.
print("\n🧪 Testing pipelines...")
test_prompt = "Hello"

try:
    # Test base pipeline
    base_test = base_pipeline(test_prompt, max_new_tokens=5, do_sample=False)
    print("✅ Base pipeline working")
    
    # Test fine-tuned pipeline  
    ft_test = fine_tuned_pipeline(test_prompt, max_new_tokens=5, do_sample=False)
    print("✅ Fine-tuned pipeline working")
    
except Exception as e:
    # Best effort: a failure here is reported as a warning and NOT re-raised,
    # so the rest of the cell still runs.
    print(f"⚠️  Pipeline test warning: {e}")
    print("Pipelines may still work for longer generations")

# Memory check
if torch.cuda.is_available():
    memory_used = torch.cuda.memory_allocated() / 1e9
    memory_total = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"\n🔋 GPU Memory: {memory_used:.1f}GB / {memory_total:.1f}GB used ({memory_used/memory_total*100:.1f}%)")

# When every adapter-loading attempt failed, `fine_tuned_model` was bound to
# `base_model` itself. Say so explicitly instead of claiming success, otherwise
# a base-vs-fine-tuned comparison would silently compare the base model with
# itself.
print("\n" + "=" * 60)
if fine_tuned_model is base_model:
    print("⚠️  MODELS LOADED, BUT FINE-TUNED ADAPTERS ARE MISSING")
else:
    print("🎉 MODELS LOADED SUCCESSFULLY!")
print("=" * 60)
print("✅ Variables available:")
print("   • base_pipeline - Base Llama-3.1 model")
if fine_tuned_model is base_model:
    print("   • fine_tuned_pipeline - ⚠️ base model fallback (adapters failed to load)")
else:
    print("   • fine_tuned_pipeline - Your fine-tuned model")
print("   • tokenizer - Tokenizer for both models")
print("\n🚀 Ready for interactive testing!")
print("📝 Now run the interactive testing interface cells")

# Quick validation
print("\n🔍 QUICK VALIDATION:")
print(f"✅ base_pipeline: {type(base_pipeline)}")
print(f"✅ fine_tuned_pipeline: {type(fine_tuned_pipeline)}")
print(f"✅ tokenizer: {type(tokenizer)}")

print("\n💡 Next steps:")
print("1. Run the interactive testing interface cell")
print("2. Start testing your model with custom questions!")
print("3. Compare base vs fine-tuned responses")

🤖 LLAMA-3.1 CORPORATE ASSISTANT EVALUATION
✅ Fine-tuned model found at: /home/azureuser/cloudfiles/code/Users/746582/llama-8b-ft-11th-june/llama-3.1-8b-corporate-assistant-final
🔄 Loading tokenizer...
✅ Tokenizer loaded successfully
🔄 Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Base model loaded
🔄 Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0
Device set to use cuda:0


✅ Fine-tuned model with adapters loaded
🔄 Creating inference pipelines...
✅ Inference pipelines ready

🚀 Ready for evaluation!
🔋 GPU: NVIDIA H100 NVL
📊 GPU Memory: 99.9 GB


In [None]:
# Interactive Model Testing Interface

import re
import time
from datetime import datetime

import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# Create interface components
class ModelTestingInterface:
    """Interactive ipywidgets panel for querying the base and fine-tuned pipelines.

    The panel lets the user type (or pick) a question, choose which model(s)
    to run, adjust sampling parameters, and view the generated answers with
    simple statistics. Every run is appended to ``self.test_history``.

    Args:
        base_pipeline: Callable text-generation pipeline for the base model.
        fine_tuned_pipeline: Callable text-generation pipeline for the
            fine-tuned model.
        tokenizer: Tokenizer shared by both pipelines; provides
            ``eos_token_id``, ``pad_token_id`` and ``encode``.
    """

    # Sentinel entry shown first in the sample-question dropdown.
    PLACEHOLDER_OPTION = 'Select a sample question...'
    # Number of most recent tests listed by show_history().
    HISTORY_DISPLAY_LIMIT = 10
    # Terms counted by the comparison heuristic (matched case-insensitively).
    CORPORATE_KEYWORDS = ('cognizant', 'oneC', 'so', 'request', 'process', 'system', 'application', 'email')

    def __init__(self, base_pipeline, fine_tuned_pipeline, tokenizer):
        self.base_pipeline = base_pipeline
        self.fine_tuned_pipeline = fine_tuned_pipeline
        self.tokenizer = tokenizer
        self.test_history = []

        # Create widgets
        self.create_widgets()
        self.setup_interface()

    def create_widgets(self):
        """Create all interface widgets"""

        # Header
        self.header = widgets.HTML(
            value="""
            <div style='background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); 
                        padding: 20px; border-radius: 10px; margin-bottom: 20px;'>
                <h2 style='color: white; text-align: center; margin: 0;'>
                    🤖 Llama-3.1 Corporate Assistant Testing Interface
                </h2>
                <p style='color: white; text-align: center; margin: 5px 0 0 0;'>
                    Test your fine-tuned model with custom questions
                </p>
            </div>
            """
        )

        # Question input
        self.question_input = widgets.Textarea(
            value='How to raise a staffing SO request?',
            placeholder='Enter your corporate question here...',
            description='Question:',
            layout=widgets.Layout(width='100%', height='80px'),
            style={'description_width': '100px'}
        )

        # Model selection
        self.model_selector = widgets.RadioButtons(
            options=['Both Models', 'Fine-tuned Only', 'Base Model Only'],
            value='Both Models',
            description='Compare:',
            style={'description_width': '100px'}
        )

        # Generation parameters
        self.max_tokens = widgets.IntSlider(
            value=150,
            min=50,
            max=300,
            step=25,
            description='Max Tokens:',
            style={'description_width': '100px'}
        )

        self.temperature = widgets.FloatSlider(
            value=0.7,
            min=0.1,
            max=1.0,
            step=0.1,
            description='Temperature:',
            style={'description_width': '100px'}
        )

        self.top_p = widgets.FloatSlider(
            value=0.9,
            min=0.1,
            max=1.0,
            step=0.1,
            description='Top P:',
            style={'description_width': '100px'}
        )

        # Buttons
        self.generate_btn = widgets.Button(
            description='🚀 Generate Response',
            button_style='primary',
            layout=widgets.Layout(width='200px', height='40px')
        )

        self.clear_btn = widgets.Button(
            description='🧹 Clear Results',
            button_style='warning',
            layout=widgets.Layout(width='150px', height='40px')
        )

        self.history_btn = widgets.Button(
            description='📋 Show History',
            button_style='info',
            layout=widgets.Layout(width='150px', height='40px')
        )

        # Results output
        self.results_output = widgets.Output(
            layout=widgets.Layout(width='100%', height='600px', border='1px solid #ddd')
        )

        # Status
        self.status = widgets.HTML(
            value="<p style='color: green;'>✅ Ready to generate responses</p>"
        )

        # Quick questions
        self.quick_questions = [
            "How to raise a staffing SO request?",
            "What is the difference between PM and FC job codes?",
            "How to create a CWR SO?",
            "Process to convert CWR to FTE associates",
            "Unable to select subcontractor in the system",
            "Are there any migration benefits available in Google Cloud?",
            "How to view hardcopy of I-140 approval notice?",
            "What equipment do I need for a telemedicine appointment?",
            "Do we still need to validate SOs through email for GGM SOs?",
            "Unable to create opportunity id in winzone"
        ]

        self.quick_selector = widgets.Dropdown(
            options=[self.PLACEHOLDER_OPTION] + self.quick_questions,
            value=self.PLACEHOLDER_OPTION,
            description='Quick Test:',
            layout=widgets.Layout(width='100%'),
            style={'description_width': '100px'}
        )

    def setup_interface(self):
        """Setup event handlers"""
        self.generate_btn.on_click(self.generate_responses)
        self.clear_btn.on_click(self.clear_results)
        self.history_btn.on_click(self.show_history)
        self.quick_selector.observe(self.load_quick_question, names='value')

    def load_quick_question(self, change):
        """Copy the selected sample question into the question box.

        Args:
            change: Traitlets change dict; ``change['new']`` is the newly
                selected dropdown value. The placeholder entry is ignored.
        """
        if change['new'] != self.PLACEHOLDER_OPTION:
            self.question_input.value = change['new']

    def _count_tokens(self, text):
        """Return the number of tokenizer tokens in ``text`` (0 when empty).

        The count comes from re-encoding the decoded text without special
        tokens, so it may differ marginally from the ids the model emitted.
        """
        if not text:
            return 0
        return len(self.tokenizer.encode(text, add_special_tokens=False))

    @staticmethod
    def _count_keyword_hits(text, keywords):
        """Count how many ``keywords`` occur in ``text`` as whole words.

        Matching is case-insensitive and accepts a trailing ``s``/``es`` so
        simple plurals still count. Whole-word matching prevents short terms
        such as 'so' from matching inside 'also' or 'some'.
        """
        lowered = text.lower()
        return sum(
            1 for keyword in keywords
            if re.search(rf"\b{re.escape(keyword.lower())}(?:s|es)?\b", lowered)
        )

    def _recent_history(self, limit=10):
        """Return ``(test_number, entry)`` pairs for the last ``limit`` tests.

        ``test_number`` is the 1-based position of the entry in the full
        history, regardless of how many entries exist.
        """
        first_index = max(len(self.test_history) - limit, 0)
        return list(enumerate(self.test_history[first_index:], start=first_index + 1))

    def generate_response_single(self, pipeline, question, model_name):
        """Generate response from a single model.

        Args:
            pipeline: Callable text-generation pipeline to query.
            question: User question inserted into the chat prompt.
            model_name: Display name of the model (currently unused).

        Returns:
            dict with keys ``text`` (generated text, or an error message),
            ``time`` (seconds), ``tokens`` (tokenizer token count of the
            text), ``chars`` (character count) and ``error`` (``None`` on
            success, otherwise the exception message). Exceptions raised by
            the pipeline are caught and reported through this dict.
        """
        system_prompt = "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions."

        # NOTE(review): the prompt already starts with <|begin_of_text|>; if the
        # pipeline's tokenizer also prepends a BOS token the model would see it
        # twice — confirm this matches the format used during fine-tuning.
        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

        try:
            start_time = time.time()
            response = pipeline(
                prompt,
                max_new_tokens=self.max_tokens.value,
                do_sample=True,
                temperature=self.temperature.value,
                top_p=self.top_p.value,
                repetition_penalty=1.1,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
                return_full_text=False
            )

            generation_time = time.time() - start_time
            generated_text = response[0]['generated_text'].strip()

            return {
                'text': generated_text,
                'time': generation_time,
                'tokens': self._count_tokens(generated_text),
                'chars': len(generated_text),
                'error': None
            }

        except Exception as e:
            return {
                'text': f"Error generating response: {str(e)}",
                'time': 0,
                'tokens': 0,
                'chars': 0,
                'error': str(e)
            }

    def _run_and_report(self, heading, pipeline, question, model_name):
        """Print ``heading``, run one model on ``question`` and print its text and stats."""
        print(f"\n{heading}")
        print("-" * 50)
        result = self.generate_response_single(pipeline, question, model_name)
        print(result['text'])
        print("-" * 50)
        print(f"📊 Stats: {result['tokens']} tokens, {result['chars']} chars, {result['time']:.2f}s")
        return result

    def _print_comparison(self, base_result, ft_result):
        """Print the length, speed and keyword comparison of two successful results."""
        print("\n🔍 COMPARISON ANALYSIS:")
        print("-" * 30)

        # Length comparison
        base_len = base_result['tokens']
        ft_len = ft_result['tokens']
        len_diff = ft_len - base_len

        print(f"📏 Length: Base={base_len} tokens, Fine-tuned={ft_len} tokens ({len_diff:+d})")

        # Speed comparison (the floor on elapsed time avoids division by zero)
        base_speed = base_result['tokens'] / max(base_result['time'], 0.01)
        ft_speed = ft_result['tokens'] / max(ft_result['time'], 0.01)

        print(f"⚡ Speed: Base={base_speed:.1f} tok/s, Fine-tuned={ft_speed:.1f} tok/s")

        # Simple keyword analysis
        base_keywords = self._count_keyword_hits(base_result['text'], self.CORPORATE_KEYWORDS)
        ft_keywords = self._count_keyword_hits(ft_result['text'], self.CORPORATE_KEYWORDS)

        print(f"🏢 Corporate Terms: Base={base_keywords}, Fine-tuned={ft_keywords}")

        if ft_len > base_len and ft_keywords >= base_keywords:
            print("✅ Fine-tuned model provided more detailed corporate response")
        elif ft_keywords > base_keywords:
            print("✅ Fine-tuned model used more corporate terminology")
        elif base_len > ft_len:
            print("⚠️  Base model provided longer response")
        else:
            print("➖ Responses are similar in characteristics")

    def generate_responses(self, button):
        """Button handler: run the selected model(s) on the current question.

        Prints the responses (and, when both models ran successfully, a
        comparison) into the results area, records the run in
        ``self.test_history`` and updates the status line.

        Args:
            button: The clicked button widget (unused).
        """
        question = self.question_input.value.strip()

        if not question:
            with self.results_output:
                print("❌ Please enter a question first!")
            return

        # Read the selection and parameters up front so the history entry is
        # built from names that exist even if output rendering is interrupted.
        model_choice = self.model_selector.value
        parameters = {
            'max_tokens': self.max_tokens.value,
            'temperature': self.temperature.value,
            'top_p': self.top_p.value
        }
        run_base = model_choice in ('Both Models', 'Base Model Only')
        run_fine_tuned = model_choice in ('Both Models', 'Fine-tuned Only')
        base_result = None
        ft_result = None

        # Update status
        self.status.value = "<p style='color: orange;'>⏳ Generating responses...</p>"

        with self.results_output:
            clear_output(wait=True)

            # Header
            print("🤖 MODEL RESPONSE COMPARISON")
            print("=" * 80)
            print(f"❓ Question: {question}")
            print(f"⚙️  Parameters: Max Tokens={parameters['max_tokens']}, Temperature={parameters['temperature']}, Top-P={parameters['top_p']}")
            print(f"🕐 Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            print("=" * 80)

            if run_base:
                base_result = self._run_and_report(
                    "🔵 BASE MODEL RESPONSE:", self.base_pipeline, question, "Base"
                )

            if run_fine_tuned:
                ft_result = self._run_and_report(
                    "🚀 FINE-TUNED MODEL RESPONSE:", self.fine_tuned_pipeline, question, "Fine-tuned"
                )

            # Comparison analysis (if both models)
            if base_result is not None and ft_result is not None:
                if base_result['error'] or ft_result['error']:
                    # An error message is not a model answer; comparing it
                    # would produce meaningless statistics.
                    print("\n⚠️  Comparison skipped: at least one model failed to generate a response")
                else:
                    self._print_comparison(base_result, ft_result)

        # Save to history
        history_entry = {
            'timestamp': datetime.now(),
            'question': question,
            'model_choice': model_choice,
            'parameters': parameters
        }

        if base_result is not None:
            history_entry['base_response'] = base_result['text']
        if ft_result is not None:
            history_entry['ft_response'] = ft_result['text']

        self.test_history.append(history_entry)

        # Update status
        any_failed = any(
            result is not None and result['error']
            for result in (base_result, ft_result)
        )
        if any_failed:
            self.status.value = f"<p style='color: red;'>❌ Generation failed — see results (Test #{len(self.test_history)})</p>"
        else:
            self.status.value = f"<p style='color: green;'>✅ Responses generated! (Test #{len(self.test_history)})</p>"

    def clear_results(self, button):
        """Clear the results output"""
        with self.results_output:
            clear_output()
            print("🧹 Results cleared. Ready for new questions!")
        self.status.value = "<p style='color: green;'>✅ Ready to generate responses</p>"

    def show_history(self, button):
        """Button handler: list the most recent tests in the results area.

        Args:
            button: The clicked button widget (unused).
        """
        with self.results_output:
            clear_output()

            if not self.test_history:
                print("📋 No testing history yet. Generate some responses first!")
                return

            print(f"📋 TESTING HISTORY ({len(self.test_history)} tests)")
            print("=" * 80)

            # Numbers are absolute positions in the full history, so they stay
            # correct when fewer than HISTORY_DISPLAY_LIMIT tests exist.
            for number, entry in self._recent_history(self.HISTORY_DISPLAY_LIMIT):
                print(f"\n🔢 Test #{number}:")
                print(f"🕐 {entry['timestamp'].strftime('%H:%M:%S')}")
                print(f"❓ {entry['question'][:60]}{'...' if len(entry['question']) > 60 else ''}")
                print(f"🎛️  {entry['model_choice']} | Tokens: {entry['parameters']['max_tokens']} | Temp: {entry['parameters']['temperature']}")
                print("-" * 40)

    def display(self):
        """Assemble all widgets into one container and return it.

        Returns:
            The top-level ``widgets.VBox``; the caller is responsible for
            displaying it.
        """

        # Parameter controls
        params_box = widgets.VBox([
            widgets.HTML("<h3>🎛️ Generation Parameters</h3>"),
            widgets.HBox([self.max_tokens, self.temperature, self.top_p])
        ])

        # Control buttons
        buttons_box = widgets.HBox([
            self.generate_btn,
            self.clear_btn,
            self.history_btn
        ], layout=widgets.Layout(justify_content='space-between'))

        # Main interface
        interface = widgets.VBox([
            self.header,
            widgets.HTML("<h3>📝 Question Input</h3>"),
            self.quick_selector,
            self.question_input,
            widgets.HTML("<h3>🤖 Model Selection</h3>"),
            self.model_selector,
            params_box,
            widgets.HTML("<h3>🚀 Generate & Control</h3>"),
            buttons_box,
            self.status,
            widgets.HTML("<h3>📊 Results</h3>"),
            self.results_output
        ])

        return interface

# Initialize the interface (assuming models are already loaded)
print("🔧 Setting up interactive testing interface...")

# Check that every object the interface needs exists in the notebook
# namespace. The tokenizer is checked too: the constructor call below uses it,
# so a missing tokenizer would otherwise surface as a NameError.
_required_names = ('base_pipeline', 'fine_tuned_pipeline', 'tokenizer')
_missing_names = [name for name in _required_names if name not in globals()]

if _missing_names:
    print("⚠️  Models not found. Please run the model loading cells first!")
    print("Required variables: base_pipeline, fine_tuned_pipeline, tokenizer")
    print(f"Missing: {', '.join(_missing_names)}")
else:
    # Create and display interface
    testing_interface = ModelTestingInterface(base_pipeline, fine_tuned_pipeline, tokenizer)
    interface_widget = testing_interface.display()

    print("✅ Interactive interface ready!")
    print("\n💡 TIPS:")
    print("• Use the dropdown to select sample questions")
    print("• Adjust parameters to see how they affect responses")
    print("• Compare both models to see fine-tuning improvements")
    print("• Check history to review previous tests")
    print("\n🚀 Interface loading below...")

    display(interface_widget)

🔧 Setting up interactive testing interface...
⚠️  Models not found. Please run the model loading cells first!
Required variables: base_pipeline, fine_tuned_pipeline, tokenizer


In [1]:
# Interactive Model Testing Interface

import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import time
from datetime import datetime

# Create interface components
class ModelTestingInterface:
    def __init__(self, base_pipeline, fine_tuned_pipeline, tokenizer):
        self.base_pipeline = base_pipeline
        self.fine_tuned_pipeline = fine_tuned_pipeline
        self.tokenizer = tokenizer
        self.test_history = []
        
        # Create widgets
        self.create_widgets()
        self.setup_interface()
    
    def create_widgets(self):
        """Create all interface widgets"""
        
        # Header
        self.header = widgets.HTML(
            value="""
            <div style='background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); 
                        padding: 20px; border-radius: 10px; margin-bottom: 20px;'>
                <h2 style='color: white; text-align: center; margin: 0;'>
                    🤖 Llama-3.1 Corporate Assistant Testing Interface
                </h2>
                <p style='color: white; text-align: center; margin: 5px 0 0 0;'>
                    Test your fine-tuned model with custom questions
                </p>
            </div>
            """
        )
        
        # Question input
        self.question_input = widgets.Textarea(
            value='How to raise a staffing SO request?',
            placeholder='Enter your corporate question here...',
            description='Question:',
            layout=widgets.Layout(width='100%', height='80px'),
            style={'description_width': '100px'}
        )
        
        # Model selection
        self.model_selector = widgets.RadioButtons(
            options=['Both Models', 'Fine-tuned Only', 'Base Model Only'],
            value='Both Models',
            description='Compare:',
            style={'description_width': '100px'}
        )
        
        # Generation parameters
        self.max_tokens = widgets.IntSlider(
            value=150,
            min=50,
            max=300,
            step=25,
            description='Max Tokens:',
            style={'description_width': '100px'}
        )
        
        self.temperature = widgets.FloatSlider(
            value=0.7,
            min=0.1,
            max=1.0,
            step=0.1,
            description='Temperature:',
            style={'description_width': '100px'}
        )
        
        self.top_p = widgets.FloatSlider(
            value=0.9,
            min=0.1,
            max=1.0,
            step=0.1,
            description='Top P:',
            style={'description_width': '100px'}
        )
        
        # Buttons
        self.generate_btn = widgets.Button(
            description='🚀 Generate Response',
            button_style='primary',
            layout=widgets.Layout(width='200px', height='40px')
        )
        
        self.clear_btn = widgets.Button(
            description='🧹 Clear Results',
            button_style='warning',
            layout=widgets.Layout(width='150px', height='40px')
        )
        
        self.history_btn = widgets.Button(
            description='📋 Show History',
            button_style='info',
            layout=widgets.Layout(width='150px', height='40px')
        )
        
        # Results output
        self.results_output = widgets.Output(
            layout=widgets.Layout(width='100%', height='600px', border='1px solid #ddd')
        )
        
        # Status
        self.status = widgets.HTML(
            value="<p style='color: green;'>✅ Ready to generate responses</p>"
        )
        
        # Quick questions
        self.quick_questions = [
            "How to raise a staffing SO request?",
            "What is the difference between PM and FC job codes?", 
            "How to create a CWR SO?",
            "Process to convert CWR to FTE associates",
            "Unable to select subcontractor in the system",
            "Are there any migration benefits available in Google Cloud?",
            "How to view hardcopy of I-140 approval notice?",
            "What equipment do I need for a telemedicine appointment?",
            "Do we still need to validate SOs through email for GGM SOs?",
            "Unable to create opportunity id in winzone"
        ]
        
        self.quick_selector = widgets.Dropdown(
            options=['Select a sample question...'] + self.quick_questions,
            value='Select a sample question...',
            description='Quick Test:',
            layout=widgets.Layout(width='100%'),
            style={'description_width': '100px'}
        )
    
    def setup_interface(self):
        """Setup event handlers"""
        self.generate_btn.on_click(self.generate_responses)
        self.clear_btn.on_click(self.clear_results)
        self.history_btn.on_click(self.show_history)
        self.quick_selector.observe(self.load_quick_question, names='value')
    
    def load_quick_question(self, change):
        """Load selected quick question"""
        if change['new'] != 'Select a sample question...':
            self.question_input.value = change['new']
    
    def generate_response_single(self, pipeline, question, model_name):
        """Generate response from a single model"""
        system_prompt = "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions."
        
        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
        
        try:
            start_time = time.time()
            response = pipeline(
                prompt,
                max_new_tokens=self.max_tokens.value,
                do_sample=True,
                temperature=self.temperature.value,
                top_p=self.top_p.value,
                repetition_penalty=1.1,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
                return_full_text=False
            )
            
            generation_time = time.time() - start_time
            generated_text = response[0]['generated_text'].strip()
            
            return {
                'text': generated_text,
                'time': generation_time,
                'tokens': len(generated_text.split()),
                'chars': len(generated_text)
            }
            
        except Exception as e:
            return {
                'text': f"Error generating response: {str(e)}",
                'time': 0,
                'tokens': 0,
                'chars': 0
            }
    
    def generate_responses(self, button):
        """Generate responses based on selected models"""
        question = self.question_input.value.strip()
        
        if not question:
            with self.results_output:
                print("❌ Please enter a question first!")
            return
        
        # Update status
        self.status.value = "<p style='color: orange;'>⏳ Generating responses...</p>"
        
        with self.results_output:
            clear_output(wait=True)
            
            # Header
            print("🤖 MODEL RESPONSE COMPARISON")
            print("=" * 80)
            print(f"❓ Question: {question}")
            print(f"⚙️  Parameters: Max Tokens={self.max_tokens.value}, Temperature={self.temperature.value}, Top-P={self.top_p.value}")
            print(f"🕐 Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            print("=" * 80)
            
            # Generate responses based on selection
            model_choice = self.model_selector.value
            
            if model_choice in ['Both Models', 'Base Model Only']:
                print("\n🔵 BASE MODEL RESPONSE:")
                print("-" * 50)
                base_result = self.generate_response_single(self.base_pipeline, question, "Base")
                print(base_result['text'])
                print("-" * 50)
                print(f"📊 Stats: {base_result['tokens']} tokens, {base_result['chars']} chars, {base_result['time']:.2f}s")
            
            if model_choice in ['Both Models', 'Fine-tuned Only']:
                print("\n🚀 FINE-TUNED MODEL RESPONSE:")
                print("-" * 50)
                ft_result = self.generate_response_single(self.fine_tuned_pipeline, question, "Fine-tuned")
                print(ft_result['text'])
                print("-" * 50)
                print(f"📊 Stats: {ft_result['tokens']} tokens, {ft_result['chars']} chars, {ft_result['time']:.2f}s")
            
            # Comparison analysis (if both models)
            if model_choice == 'Both Models':
                print("\n🔍 COMPARISON ANALYSIS:")
                print("-" * 30)
                
                # Length comparison
                base_len = base_result['tokens']
                ft_len = ft_result['tokens']
                len_diff = ft_len - base_len
                
                print(f"📏 Length: Base={base_len} tokens, Fine-tuned={ft_len} tokens ({len_diff:+d})")
                
                # Speed comparison
                base_speed = base_result['tokens'] / max(base_result['time'], 0.01)
                ft_speed = ft_result['tokens'] / max(ft_result['time'], 0.01)
                
                print(f"⚡ Speed: Base={base_speed:.1f} tok/s, Fine-tuned={ft_speed:.1f} tok/s")
                
                # Simple keyword analysis
                corporate_keywords = ['cognizant', 'oneC', 'so', 'request', 'process', 'system', 'application', 'email']
                base_keywords = sum(1 for kw in corporate_keywords if kw.lower() in base_result['text'].lower())
                ft_keywords = sum(1 for kw in corporate_keywords if kw.lower() in ft_result['text'].lower())
                
                print(f"🏢 Corporate Terms: Base={base_keywords}, Fine-tuned={ft_keywords}")
                
                if ft_len > base_len and ft_keywords >= base_keywords:
                    print("✅ Fine-tuned model provided more detailed corporate response")
                elif ft_keywords > base_keywords:
                    print("✅ Fine-tuned model used more corporate terminology")
                elif base_len > ft_len:
                    print("⚠️  Base model provided longer response")
                else:
                    print("➖ Responses are similar in characteristics")
        
        # Save to history
        history_entry = {
            'timestamp': datetime.now(),
            'question': question,
            'model_choice': model_choice,
            'parameters': {
                'max_tokens': self.max_tokens.value,
                'temperature': self.temperature.value,
                'top_p': self.top_p.value
            }
        }
        
        if model_choice in ['Both Models', 'Base Model Only']:
            history_entry['base_response'] = base_result['text']
        if model_choice in ['Both Models', 'Fine-tuned Only']:
            history_entry['ft_response'] = ft_result['text']
            
        self.test_history.append(history_entry)
        
        # Update status
        self.status.value = f"<p style='color: green;'>✅ Responses generated! (Test #{len(self.test_history)})</p>"
    
    def clear_results(self, button):
        """Empty the results pane and put the status line back to "ready".

        Args:
            button: The widget that fired the callback; it is not used.
        """
        cleared_notice = "🧹 Results cleared. Ready for new questions!"
        ready_markup = "<p style='color: green;'>✅ Ready to generate responses</p>"

        results_pane = self.results_output
        with results_pane:
            # Drop whatever was rendered before, then leave a short confirmation.
            clear_output()
            print(cleared_notice)

        self.status.value = ready_markup
    
    def show_history(self, button):
        """Print a summary of the most recent tests into the results output.

        The results output is cleared first. If ``self.test_history`` is
        empty a hint is printed instead. Otherwise up to the last 10 entries
        are listed, each labelled with its 1-based position in the full
        history list.

        Args:
            button: The widget that fired the callback; it is not used.
        """
        max_shown = 10  # only the tail of the history is listed

        with self.results_output:
            clear_output()

            if not self.test_history:
                print("📋 No testing history yet. Generate some responses first!")
                return

            total = len(self.test_history)
            print(f"📋 TESTING HISTORY ({total} tests)")
            print("=" * 80)

            # Number of older entries left out of the listing. Clamped at 0 so
            # that a history shorter than the window is numbered from 1
            # instead of from a negative offset.
            skipped = max(total - max_shown, 0)

            for number, entry in enumerate(self.test_history[skipped:], skipped + 1):
                question = entry['question']
                print(f"\n🔢 Test #{number}:")
                print(f"🕐 {entry['timestamp'].strftime('%H:%M:%S')}")
                # Long questions are cut to 60 characters with an ellipsis.
                print(f"❓ {question[:60]}{'...' if len(question) > 60 else ''}")
                print(f"🎛️  {entry['model_choice']} | Tokens: {entry['parameters']['max_tokens']} | Temp: {entry['parameters']['temperature']}")
                print("-" * 40)
    
    def display(self):
        """Build the full testing UI and return it as one widget container.

        This method only constructs and returns the container; it does not
        render it.

        Returns:
            A ``widgets.VBox`` stacking, top to bottom: the header, the
            question input section, the model selector, the generation
            parameters, the action buttons, the status line and the results
            output.
        """
        # Generation parameters: a heading above the three controls in a row.
        parameter_heading = widgets.HTML("<h3>🎛️ Generation Parameters</h3>")
        parameter_row = widgets.HBox([self.max_tokens, self.temperature, self.top_p])
        parameter_section = widgets.VBox([parameter_heading, parameter_row])

        # Action buttons, spaced evenly across the row.
        action_buttons = [self.generate_btn, self.clear_btn, self.history_btn]
        spread_layout = widgets.Layout(justify_content='space-between')
        action_row = widgets.HBox(action_buttons, layout=spread_layout)

        # Every section of the page, in reading order.
        sections = [
            self.header,
            widgets.HTML("<h3>📝 Question Input</h3>"),
            self.quick_selector,
            self.question_input,
            widgets.HTML("<h3>🤖 Model Selection</h3>"),
            self.model_selector,
            parameter_section,
            widgets.HTML("<h3>🚀 Generate & Control</h3>"),
            action_row,
            self.status,
            widgets.HTML("<h3>📊 Results</h3>"),
            self.results_output,
        ]

        return widgets.VBox(sections)

# Initialize the interface (assuming models are already loaded)
print("🔧 Setting up interactive testing interface...")


def missing_model_variables(namespace, required=("base_pipeline", "fine_tuned_pipeline", "tokenizer")):
    """Return the names from ``required`` that are absent from ``namespace``.

    Args:
        namespace: Any container supporting ``in`` on names, e.g. ``globals()``.
        required: Names that must be present; defaults to the three objects
            passed to ``ModelTestingInterface`` below.

    Returns:
        A list of the missing names, in the order given by ``required``.
    """
    return [name for name in required if name not in namespace]


# Check that every object the interface needs is loaded. The tokenizer is
# checked as well: it is passed to the constructor below, so guarding only the
# two pipelines would end in a NameError instead of the hint.
missing_names = missing_model_variables(globals())

if missing_names:
    print("⚠️  Models not found. Please run the model loading cells first!")
    print("Required variables: base_pipeline, fine_tuned_pipeline, tokenizer")
    print(f"Missing: {', '.join(missing_names)}")
else:
    # Create and display interface
    testing_interface = ModelTestingInterface(base_pipeline, fine_tuned_pipeline, tokenizer)
    interface_widget = testing_interface.display()
    
    print("✅ Interactive interface ready!")
    print("\n💡 TIPS:")
    print("• Use the dropdown to select sample questions")
    print("• Adjust parameters to see how they affect responses")
    print("• Compare both models to see fine-tuning improvements")
    print("• Check history to review previous tests")
    print("\n🚀 Interface loading below...")
    
    display(interface_widget)

🔧 Setting up interactive testing interface...
⚠️  Models not found. Please run the model loading cells first!
Required variables: base_pipeline, fine_tuned_pipeline, tokenizer
