# vLLM Entity Extraction Performance Troubleshooting

This notebook provides comprehensive tools for troubleshooting entity extraction performance issues using vLLM as a Python package.

## Goals:
- Direct control over all vLLM parameters
- Identify why extraction fails at ~5000 characters
- Find optimal settings for dual NVIDIA A40 GPUs (46GB VRAM each)
- Compare Granite 3.3 2B vs Qwen 2.5 72B performance

## 1. Installation and Setup

```bash
# Run in terminal first:
conda create -n vllm-test python=3.12 -y
conda activate vllm-test
VLLM_USE_PRECOMPILED=1 pip install vllm
# nvidia-ml-py3 provides the NVML bindings; the module it installs is named `pynvml`
pip install jupyter ipykernel pandas matplotlib psutil nvidia-ml-py3
```

In [None]:
# Core imports
import os
import json
import time
import psutil
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict
import torch
import gc

# vLLM imports
from vllm import LLM, SamplingParams
from vllm.utils import is_hip

# GPU monitoring
# The NVML bindings shipped by the `nvidia-ml-py3` distribution are imported
# as `pynvml`; `nvidia_ml_py3` is only the pip package name, so it is tried
# purely as a fallback for environments that happen to provide such a module.
try:
    try:
        import pynvml as nvml
    except ImportError:
        import nvidia_ml_py3 as nvml
    nvml.nvmlInit()
    GPU_AVAILABLE = True
except Exception as exc:  # missing bindings, or NVML failing to initialise
    # Monitoring is optional: record that it is off and say why, then carry on.
    GPU_AVAILABLE = False
    print(f"Warning: NVIDIA GPU monitoring not available ({type(exc).__name__}: {exc})")

# Environment report: library version, CUDA availability and visible devices.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    device_total = torch.cuda.device_count()
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU count: {device_total}")
    for device_index in range(device_total):
        print(f"GPU {device_index}: {torch.cuda.get_device_name(device_index)}")

## 2. Performance Monitoring Utilities

In [None]:
@dataclass
class PerformanceMetrics:
    """Flat record describing a single generation run.

    The declaration order below is also the positional order of the generated
    constructor, so it must not be rearranged.
    """
    # --- what was run ---
    model_name: str
    # --- sizes ---
    input_chars: int
    input_tokens: int
    output_tokens: int
    # --- timing / throughput ---
    total_time: float
    time_to_first_token: float
    tokens_per_second: float
    # --- GPU readings ---
    gpu_memory_used: float
    gpu_utilization: float
    # --- outcome and the settings that produced it ---
    finish_reason: str
    config: Dict
    
class GPUMonitor:
    """Read per-GPU memory use and utilization through NVML.

    Every reader returns a dict keyed ``"gpu_<index>"``. When NVML could not
    be initialised (``GPU_AVAILABLE`` is false) the readers return zeros
    instead of raising, so the notebook keeps working without monitoring.
    """

    @staticmethod
    def _placeholder_readings():
        """Return an all-zero reading for every key a caller may look up.

        One entry is produced per device ``torch`` reports, so
        ``print_gpu_status`` cannot hit a missing key on hosts with more than
        two GPUs. At least the two historical entries (``gpu_0``, ``gpu_1``)
        are always present so callers never receive an empty dict.
        """
        device_total = max(torch.cuda.device_count(), 2)
        return {f"gpu_{index}": 0 for index in range(device_total)}

    @staticmethod
    def _read_each_device(read_one):
        """Apply ``read_one(handle)`` to the NVML handle of each torch device index."""
        # NOTE(review): the torch device index is passed straight to NVML;
        # presumably the two numberings can differ when CUDA_VISIBLE_DEVICES
        # is set — confirm before relying on per-device attribution.
        readings = {}
        for index in range(torch.cuda.device_count()):
            handle = nvml.nvmlDeviceGetHandleByIndex(index)
            readings[f"gpu_{index}"] = read_one(handle)
        return readings

    @staticmethod
    def get_gpu_memory():
        """Get current GPU memory usage per device (bytes used / 1024**3)."""
        if not GPU_AVAILABLE:
            return GPUMonitor._placeholder_readings()
        return GPUMonitor._read_each_device(
            lambda handle: nvml.nvmlDeviceGetMemoryInfo(handle).used / 1024**3
        )

    @staticmethod
    def get_gpu_utilization():
        """Get current GPU utilization percentage per device."""
        if not GPU_AVAILABLE:
            return GPUMonitor._placeholder_readings()
        return GPUMonitor._read_each_device(
            lambda handle: nvml.nvmlDeviceGetUtilizationRates(handle).gpu
        )

    @staticmethod
    def print_gpu_status():
        """Print one memory/utilization line per device ``torch`` reports."""
        memory = GPUMonitor.get_gpu_memory()
        util = GPUMonitor.get_gpu_utilization()

        for index in range(torch.cuda.device_count()):
            key = f"gpu_{index}"
            print(f"GPU {index}: Memory: {memory[key]:.2f}GB, Utilization: {util[key]}%")

# Test GPU monitoring: prints one status line per CUDA device torch reports
# (nothing is printed when torch reports no devices).
GPUMonitor.print_gpu_status()

## 3. vLLM Configuration Presets

Different configuration presets for various optimization goals.

In [None]:
class vLLMConfigs:
    """Named presets of keyword arguments used when constructing a vLLM ``LLM``.

    Each preset is a plain dict. Take a ``.copy()`` before modifying one: the
    dicts are class attributes and therefore shared by every user.
    """

    # Baseline: minimal settings, context length left to the loader
    BASELINE = dict(
        gpu_memory_utilization=0.9,
        max_model_len=None,  # Use model default
        dtype="auto",
    )

    # Throughput-oriented: many concurrent sequences
    HIGH_THROUGHPUT = dict(
        gpu_memory_utilization=0.95,
        max_model_len=16384,
        max_num_seqs=256,
        max_num_batched_tokens=4096,
        enable_prefix_caching=True,
        enable_chunked_prefill=True,
        dtype="float16",
    )

    # Latency-oriented: small token batches, fewer sequences
    LOW_LATENCY = dict(
        gpu_memory_utilization=0.9,
        max_model_len=8192,
        max_num_seqs=64,
        max_num_batched_tokens=512,
        enable_prefix_caching=True,
        enable_chunked_prefill=True,
        dtype="float16",
    )

    # Memory constrained (intended for large models)
    MEMORY_OPTIMIZED = dict(
        gpu_memory_utilization=0.95,
        max_model_len=4096,
        max_num_seqs=32,
        max_num_batched_tokens=1024,
        enable_prefix_caching=True,
        enable_chunked_prefill=True,
        swap_space=16,  # GB of CPU swap space
        dtype="float16",
    )

    # Long-context debugging preset for the 5000+ character investigation
    DEBUG_LONG_CONTEXT = dict(
        gpu_memory_utilization=0.95,
        max_model_len=32768,  # Extended context
        max_num_seqs=16,  # Reduced for more KV cache
        max_num_batched_tokens=8192,  # Large batch for long docs
        enable_prefix_caching=True,
        enable_chunked_prefill=True,
        block_size=16,
        dtype="float16",
    )

# List every preset: public, all-uppercase attributes of vLLMConfigs
print("Available configurations:")
preset_names = [
    attr for attr in dir(vLLMConfigs)
    if attr.isupper() and not attr.startswith('_')
]
for preset_name in preset_names:
    print(f"  - {preset_name}")

## 4. Model Loading and Management

In [None]:
class ModelManager:
    """Load, track and unload a single vLLM model at a time.

    Attributes:
        current_model: The loaded ``LLM`` instance, or ``None``.
        current_config: The keyword arguments the current model was built
            with, or ``None`` when nothing is loaded.
    """

    def __init__(self):
        self.current_model = None
        self.current_config = None

    @staticmethod
    def _preset(config_name: str, default_max_model_len: int) -> Dict:
        """Return a private copy of a ``vLLMConfigs`` preset with defaults filled in.

        Raises:
            AttributeError: If ``config_name`` is not a dict preset on
                ``vLLMConfigs``.
        """
        preset = getattr(vLLMConfigs, config_name, None)
        if not isinstance(preset, dict):
            raise AttributeError(
                f"Unknown vLLM config preset {config_name!r} on vLLMConfigs"
            )
        config = preset.copy()
        if config.get("max_model_len") is None:
            config["max_model_len"] = default_max_model_len
        return config

    def _load(self, model_name: str, config: Dict):
        """Build the ``LLM``, time the load, record the config and report GPU state."""
        # Printed after all defaults/overrides so the log matches what is used.
        # default=str keeps the log line from failing on non-JSON values.
        print(f"Config: {json.dumps(config, indent=2, default=str)}")

        started = time.perf_counter()
        self.current_model = LLM(model=model_name, **config)
        load_time = time.perf_counter() - started

        self.current_config = config
        print(f"Model loaded in {load_time:.2f} seconds")
        GPUMonitor.print_gpu_status()
        return self.current_model

    @staticmethod
    def _release_vllm_state():
        """Best-effort teardown of vLLM's process-wide parallel state.

        Skipped silently when the installed vLLM does not expose the helper.
        """
        try:
            from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
        except ImportError:
            return
        try:
            cleanup_dist_env_and_memory()
        except Exception as exc:  # cleanup must never block the next load
            print(f"Warning: vLLM distributed cleanup failed: {exc}")

    def load_granite_2b(self, config_name: str = "BASELINE"):
        """Load IBM Granite 3.3 2B model using the named ``vLLMConfigs`` preset.

        A preset whose ``max_model_len`` is ``None`` is loaded with 128000.
        Returns the loaded ``LLM``.
        """
        # Resolve the preset first so a mistyped name does not unload the
        # model that is currently in use.
        config = self._preset(config_name, default_max_model_len=128000)
        self.unload_model()

        print(f"Loading Granite 3.3 2B with {config_name} config...")
        return self._load("ibm-granite/granite-3.3-2b-instruct", config)

    def load_qwen_72b(self, config_name: str = "MEMORY_OPTIMIZED",
                      model_name: str = "Qwen/Qwen2.5-72B-Instruct"):
        """Load Qwen 2.5 72B model with tensor parallelism across two GPUs.

        Args:
            config_name: Name of the ``vLLMConfigs`` preset to start from. A
                preset whose ``max_model_len`` is ``None`` is loaded with 32768.
            model_name: Checkpoint to load; defaults to the unquantized
                instruct model.

        Returns the loaded ``LLM``.
        """
        # NOTE(review): ~72B parameters at 2 bytes each is ~144 GB of weights,
        # more than the 2x46 GB this notebook targets — the default checkpoint
        # will likely not fit; pass a quantized variant via model_name.
        config = self._preset(config_name, default_max_model_len=32768)
        self.unload_model()

        print(f"Loading Qwen 2.5 72B with {config_name} config...")

        # Add tensor parallelism for dual GPUs
        config["tensor_parallel_size"] = 2
        config["distributed_executor_backend"] = "mp"

        return self._load(model_name, config)

    def load_custom(self, model_name: str, config: Dict):
        """Load a custom model with custom configuration.

        ``config`` is copied, so later changes to the caller's dict do not
        alter ``current_config``. Returns the loaded ``LLM``.
        """
        self.unload_model()

        print(f"Loading {model_name} with custom config...")
        return self._load(model_name, dict(config))

    def unload_model(self):
        """Unload current model and free GPU memory; no-op when nothing is loaded."""
        if self.current_model is None:
            return

        print("Unloading current model...")
        # Drop the only references this manager holds before collecting.
        self.current_model = None
        self.current_config = None

        # Force garbage collection, tear down vLLM state, clear CUDA cache
        gc.collect()
        self._release_vllm_state()
        torch.cuda.empty_cache()

        print("Model unloaded")
        GPUMonitor.print_gpu_status()

# Initialize model manager: the notebook-wide instance used by the cells
# below; it starts with no model loaded.
model_manager = ModelManager()

## 5. Entity Extraction Test Suite

In [None]:
class EntityExtractionTester:
    """Run the entity-extraction prompt at several document sizes.

    Attributes:
        model_manager: Object whose ``current_model`` is the vLLM ``LLM`` to
            query.
        results: Every result row produced by ``test_extraction`` on this
            instance, accumulated across calls.
    """

    def __init__(self, model_manager: "ModelManager"):
        self.model_manager = model_manager
        self.results = []

    def create_test_prompt(self, text: str, char_count: int) -> str:
        """Create the extraction prompt around exactly ``char_count`` characters.

        Text longer than ``char_count`` is truncated; shorter text is repeated
        end-to-end and then cut to length.

        Raises:
            ValueError: If ``char_count`` is negative, or if ``text`` is empty
                while ``char_count`` is positive (there is nothing to repeat).
        """
        if char_count < 0:
            raise ValueError(f"char_count must be non-negative, got {char_count}")

        if len(text) < char_count:
            if not text:
                raise ValueError(
                    f"text must be non-empty to build a document of {char_count} characters"
                )
            # Ceiling division: just enough repetitions to reach char_count.
            repeats = -(-char_count // len(text))
            text = text * repeats
        document = text[:char_count]

        prompt = f"""Extract all legal entities from the following document.
Return the entities in JSON format with these categories:
- persons
- organizations
- locations
- dates
- monetary_amounts
- case_numbers

Document:
{document}

Extracted entities:
"""
        return prompt

    def test_extraction(self,
                       text: str,
                       char_counts: List[int],
                       sampling_params: "Optional[SamplingParams]" = None) -> pd.DataFrame:
        """Generate once per requested document size and tabulate the outcome.

        Args:
            text: Source text, truncated or repeated to each size.
            char_counts: Document sizes (in characters) to test, in order.
            sampling_params: Sampling settings; when ``None`` a low-temperature
                default with ``max_tokens=2048`` is used.

        Returns:
            A DataFrame with one row per size. ``success`` only means the
            ``generate`` call returned without raising; inspect
            ``finish_reason`` and ``output_preview`` to judge output quality.
            ``tokens_per_second`` is output tokens divided by the wall time of
            the whole ``generate`` call.

        Raises:
            ValueError: If no model is loaded.
        """
        model = self.model_manager.current_model
        if model is None:
            raise ValueError("No model loaded. Use model_manager to load a model first.")

        if sampling_params is None:
            sampling_params = SamplingParams(
                temperature=0.1,
                top_p=0.95,
                max_tokens=2048,
                repetition_penalty=1.05,
            )

        results = []

        for char_count in char_counts:
            print(f"\nTesting with {char_count} characters...")

            prompt = self.create_test_prompt(text, char_count)

            # Monitor GPU before generation
            gpu_before = GPUMonitor.get_gpu_memory()

            # Any failure of this one size is recorded as a row so the sweep
            # continues with the remaining sizes.
            try:
                started = time.perf_counter()
                outputs = model.generate([prompt], sampling_params)
                total_time = time.perf_counter() - started

                output = outputs[0]
                completion = output.outputs[0]
                generated_text = completion.text

                input_tokens = len(output.prompt_token_ids)
                output_tokens = len(completion.token_ids)
                tokens_per_second = output_tokens / total_time if total_time > 0 else 0

                # Change in the busiest GPU's memory; default=0 covers an
                # empty reading (no devices reported).
                gpu_after = GPUMonitor.get_gpu_memory()
                gpu_used = (max(gpu_after.values(), default=0)
                            - max(gpu_before.values(), default=0))

                result = {
                    "char_count": char_count,
                    "input_tokens": input_tokens,
                    "output_tokens": output_tokens,
                    "total_time": total_time,
                    "tokens_per_second": tokens_per_second,
                    "gpu_memory_delta": gpu_used,
                    "finish_reason": completion.finish_reason,
                    "success": True,
                    "error": None,
                    "output_preview": generated_text[:200]
                }

                print(f"  ✓ Success: {input_tokens} input tokens, {output_tokens} output tokens")
                print(f"  Time: {total_time:.2f}s, Speed: {tokens_per_second:.1f} tokens/s")

            except Exception as e:
                result = {
                    "char_count": char_count,
                    "input_tokens": 0,
                    "output_tokens": 0,
                    "total_time": 0,
                    "tokens_per_second": 0,
                    "gpu_memory_delta": 0,
                    "finish_reason": "error",
                    "success": False,
                    "error": str(e),
                    "output_preview": None
                }
                print(f"  ✗ Error: {str(e)}")

            results.append(result)
            self.results.append(result)

        return pd.DataFrame(results)

    def plot_results(self, df: pd.DataFrame):
        """Visualize test results as a 2x2 grid of line plots keyed by character count."""
        fig, axes = plt.subplots(2, 2, figsize=(12, 8))

        def finish_panel(ax, ylabel, title):
            # Shared axis decoration for all four panels.
            ax.set_xlabel('Character Count')
            ax.set_ylabel(ylabel)
            ax.set_title(title)
            ax.grid(True)

        # Success rate
        ax = axes[0, 0]
        success_data = df.groupby('char_count')['success'].mean() * 100
        ax.plot(success_data.index, success_data.values, 'o-')
        finish_panel(ax, 'Success Rate (%)', 'Extraction Success Rate')

        # Token usage
        ax = axes[0, 1]
        ax.plot(df['char_count'], df['input_tokens'], 'o-', label='Input Tokens')
        ax.plot(df['char_count'], df['output_tokens'], 's-', label='Output Tokens')
        ax.legend()
        finish_panel(ax, 'Token Count', 'Token Usage')

        # Processing time
        ax = axes[1, 0]
        ax.plot(df['char_count'], df['total_time'], 'o-')
        finish_panel(ax, 'Time (seconds)', 'Processing Time')

        # Tokens per second
        ax = axes[1, 1]
        ax.plot(df['char_count'], df['tokens_per_second'], 'o-')
        finish_panel(ax, 'Tokens/Second', 'Generation Speed')

        plt.tight_layout()
        plt.show()

# Initialize tester: notebook-wide instance bound to the shared model_manager;
# its `results` list accumulates rows across every test_extraction call.
tester = EntityExtractionTester(model_manager)

## 6. Configuration Comparison Tool

In [None]:
class ConfigComparison:
    """Compare different vLLM configurations on one fixed-size extraction test."""

    @staticmethod
    def compare_configs(model_manager: "ModelManager",
                       model_loader: str,  # 'load_granite_2b' or 'load_qwen_72b'
                       configs: List[str],
                       test_text: str,
                       test_char_count: int = 5000) -> pd.DataFrame:
        """Load the model once per configuration and run one extraction each.

        Args:
            model_manager: Manager used to load and unload the model.
            model_loader: Name of the loader method on ``model_manager``.
            configs: Preset names passed to the loader, tested in order.
            test_text: Source text for the extraction prompt.
            test_char_count: Document size in characters.

        Returns:
            One row per configuration: the extraction result plus ``config``
            and ``model`` columns. A configuration that fails to load or run
            yields a row with ``success=False`` and the message in ``error``
            instead of aborting the remaining configurations.

        Raises:
            AttributeError: If ``model_loader`` is not an attribute of
                ``model_manager``.
        """
        results = []

        for config_name in configs:
            print(f"\n{'='*60}")
            print(f"Testing {config_name} configuration")
            print(f"{'='*60}")

            # Resolved outside the try block: a wrong loader name is a caller
            # error and should surface immediately, not as a failed row.
            loader = getattr(model_manager, model_loader)

            row = None
            try:
                loader(config_name)

                tester = EntityExtractionTester(model_manager)
                df = tester.test_extraction(test_text, [test_char_count])
                if not df.empty:
                    row = df.iloc[0].to_dict()
            except Exception as e:
                # Same columns as a failed row from test_extraction.
                row = {
                    "char_count": test_char_count,
                    "input_tokens": 0,
                    "output_tokens": 0,
                    "total_time": 0,
                    "tokens_per_second": 0,
                    "gpu_memory_delta": 0,
                    "finish_reason": "error",
                    "success": False,
                    "error": str(e),
                    "output_preview": None
                }
                print(f"  ✗ {config_name} failed: {e}")
            finally:
                # Always free GPU memory before the next configuration.
                model_manager.unload_model()

            if row is not None:
                row['config'] = config_name
                row['model'] = model_loader
                results.append(row)

        comparison_df = pd.DataFrame(results)
        return comparison_df

    @staticmethod
    def plot_comparison(df: pd.DataFrame):
        """Plot time, generation speed and success as bar charts per configuration."""
        if df.empty:
            # An empty frame has no 'config' column to plot against.
            print("No comparison results to plot.")
            return

        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        # Success as 0/100 bars, colored by outcome
        success_values = df['success'].astype(int) * 100
        colors = ['green' if s else 'red' for s in df['success']]

        panels = [
            (df['total_time'], 'Time (seconds)', 'Processing Time', {}),
            (df['tokens_per_second'], 'Tokens/Second', 'Generation Speed', {}),
            (success_values, 'Success (%)', 'Extraction Success', {'color': colors}),
        ]
        for ax, (heights, ylabel, title, bar_kwargs) in zip(axes, panels):
            ax.bar(df['config'], heights, **bar_kwargs)
            ax.set_xlabel('Configuration')
            ax.set_ylabel(ylabel)
            ax.set_title(title)
            ax.tick_params(axis='x', rotation=45)
        # Head-room above the 100% bars
        axes[2].set_ylim(0, 110)

        plt.tight_layout()
        plt.show()

## 7. Advanced Sampling Parameters

In [None]:
class SamplingStrategies:
    """Ready-made ``SamplingParams`` instances for the extraction experiments.

    The instances are class attributes, built once when this class is defined.
    """

    # Greedy decoding, no repetition penalty
    DETERMINISTIC = SamplingParams(
        max_tokens=2048, temperature=0.0, top_p=1.0, repetition_penalty=1.0,
    )

    # Near-greedy with a light repetition penalty
    CONSISTENT = SamplingParams(
        max_tokens=2048, temperature=0.1, top_p=0.95, repetition_penalty=1.05,
    )

    # More sampling freedom plus frequency/presence penalties
    BALANCED = SamplingParams(
        max_tokens=2048,
        temperature=0.3,
        top_p=0.9,
        repetition_penalty=1.1,
        frequency_penalty=0.1,
        presence_penalty=0.1,
    )

    # Greedy decoding with explicit stop sequences; special tokens are kept
    STRUCTURED = SamplingParams(
        max_tokens=4096,
        temperature=0.0,
        top_p=1.0,
        stop=["</entities>", "\n\n\n", "```\n"],
        skip_special_tokens=False,
    )

    # Long documents: larger output budget, gentler repetition penalty so
    # recurring entities are not suppressed
    LONG_DOCUMENT = SamplingParams(
        max_tokens=8192,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.02,
    )

# List each strategy (public, all-uppercase attribute) with its key settings
print("Available sampling strategies:")
strategy_names = [
    attr for attr in dir(SamplingStrategies)
    if attr.isupper() and not attr.startswith('_')
]
for strategy_name in strategy_names:
    strategy = getattr(SamplingStrategies, strategy_name)
    print(f"  - {strategy_name}: temp={strategy.temperature}, max_tokens={strategy.max_tokens}")

## 8. Troubleshooting 5000+ Character Issue

In [None]:
def _sweep_document_sizes(model, prompt_builder, test_text, config_name,
                          test_points, sampling_params):
    """Generate once per document size with ``model``; return one result dict per size."""
    rows = []
    for char_count in test_points:
        prompt = prompt_builder.create_test_prompt(test_text, char_count)

        # Rough estimate (4 chars per token); it stands in for the real token
        # count only when generation fails before that count is known.
        approx_tokens = len(prompt) // 4

        try:
            started = time.perf_counter()
            outputs = model.generate([prompt], sampling_params)
            elapsed = time.perf_counter() - started

            output = outputs[0]
            completion = output.outputs[0]
            success = True
            error = None
            input_tokens = len(output.prompt_token_ids)
            output_tokens = len(completion.token_ids)
            finish_reason = completion.finish_reason

        except Exception as e:
            success = False
            error = str(e)
            input_tokens = approx_tokens
            output_tokens = 0
            finish_reason = "error"
            elapsed = 0

        rows.append({
            "config": config_name,
            "char_count": char_count,
            "approx_tokens": approx_tokens,
            "actual_input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "success": success,
            "finish_reason": finish_reason,
            "time": elapsed,
            "error": error
        })

        status = "✓" if success else "✗"
        # str(): a finish_reason of None cannot take the ':10s' format spec.
        print(f"{status} {char_count:5d} chars | {input_tokens:5d} tokens | "
              f"Reason: {str(finish_reason):10s} | Time: {elapsed:.2f}s")
    return rows


def _print_diagnosis_summary(df):
    """Print, per configuration, the first failing document size (if any)."""
    print("\n" + "="*60)
    print("DIAGNOSIS SUMMARY")
    print("="*60)

    for config_name in df['config'].unique():
        config_df = df[df['config'] == config_name]
        failures = config_df[~config_df['success']]

        print(f"\n{config_name}:")
        if len(failures) > 0:
            first_failure = failures.iloc[0]
            print(f"  First failure at: {first_failure['char_count']} characters")
            print(f"  Token count: {first_failure['actual_input_tokens']}")
            print(f"  Error: {first_failure['error']}")
        else:
            print("  All tests passed!")


def _plot_diagnosis(df):
    """Plot success and input-token count against document size, one line per config."""
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    per_config = [(name, df[df['config'] == name]) for name in df['config'].unique()]

    # Success by character count
    ax = axes[0]
    for config_name, config_df in per_config:
        ax.plot(config_df['char_count'],
               config_df['success'].astype(int) * 100,
               'o-', label=config_name)
    ax.set_xlabel('Character Count')
    ax.set_ylabel('Success (%)')
    ax.set_title('Extraction Success by Document Size')
    ax.axvline(x=5000, color='red', linestyle='--', alpha=0.5, label='5000 chars')
    ax.legend()
    ax.grid(True)

    # Token count vs char count
    ax = axes[1]
    for config_name, config_df in per_config:
        ax.plot(config_df['char_count'],
               config_df['actual_input_tokens'],
               'o-', label=config_name)
    ax.set_xlabel('Character Count')
    ax.set_ylabel('Token Count')
    ax.set_title('Token Usage by Document Size')
    ax.axvline(x=5000, color='red', linestyle='--', alpha=0.5)
    ax.legend()
    ax.grid(True)

    plt.tight_layout()
    plt.show()


def diagnose_5k_char_issue(model_manager: ModelManager, test_text: str):
    """Comprehensive diagnosis of the 5000+ character extraction issue.

    Loads Granite 2B under the BASELINE and DEBUG_LONG_CONTEXT presets, runs
    one generation per document size around the 5000-character mark, prints a
    per-configuration summary and plots the results.

    Args:
        model_manager: Manager used to load and unload the model.
        test_text: Source text the test documents are built from.

    Returns:
        A DataFrame with one row per (configuration, document size).
    """
    print("Diagnosing 5000+ character extraction issue...\n")

    # Test points around the 5000 character boundary
    test_points = [3000, 4000, 4500, 4900, 5000, 5100, 5500, 6000, 7000]
    configs_to_test = ["BASELINE", "DEBUG_LONG_CONTEXT"]

    # Built locally so the function does not depend on a notebook global.
    prompt_builder = EntityExtractionTester(model_manager)

    results = []

    for config_name in configs_to_test:
        print(f"\nTesting with {config_name} configuration:")

        try:
            model_manager.load_granite_2b(config_name)

            # Report the settings actually in effect: the loader may replace
            # a preset's max_model_len of None with a concrete value.
            config = model_manager.current_config or getattr(vLLMConfigs, config_name)
            print(f"max_model_len: {config.get('max_model_len', 'default')}")
            print(f"max_num_batched_tokens: {config.get('max_num_batched_tokens', 'default')}")
            print(f"enable_chunked_prefill: {config.get('enable_chunked_prefill', False)}")
            print()

            results.extend(_sweep_document_sizes(
                model_manager.current_model,
                prompt_builder,
                test_text,
                config_name,
                test_points,
                SamplingStrategies.LONG_DOCUMENT,
            ))
        finally:
            # Free GPU memory even when loading or the sweep raised.
            model_manager.unload_model()

    df = pd.DataFrame(results)

    _print_diagnosis_summary(df)
    _plot_diagnosis(df)

    return df

## 9. Example Usage - Basic Tests

In [None]:
# Sample legal text for testing. It is far shorter than the document sizes
# tested below, so create_test_prompt repeats it to reach each size.
sample_text = """
UNITED STATES DISTRICT COURT
SOUTHERN DISTRICT OF NEW YORK

JOHN DOE, individually and on behalf of all others similarly situated,
Plaintiff,
v.
ACME CORPORATION, a Delaware corporation, and JANE SMITH, CEO,
Defendants.

Case No. 23-CV-1234-ABC

COMPLAINT

Plaintiff John Doe, by and through undersigned counsel, brings this action against 
Defendants Acme Corporation and Jane Smith, and alleges as follows:

1. This is a class action brought on behalf of all purchasers of Acme Corporation's 
   products between January 1, 2020 and December 31, 2023.

2. The amount in controversy exceeds $5,000,000.00, exclusive of interest and costs.

3. On or about March 15, 2023, Defendant Jane Smith made false statements regarding 
   the safety of Acme's products at the company's headquarters in New York, NY.
"""

# Load Granite model with baseline config; the loader unloads any model the
# manager currently holds before loading this one.
model = model_manager.load_granite_2b("BASELINE")

In [None]:
# Sweep document sizes from 1,000 to 8,000 characters in 1,000-character steps
test_sizes = list(range(1000, 8001, 1000))

results_df = tester.test_extraction(
    sample_text,
    test_sizes,
    SamplingStrategies.CONSISTENT,
)

# Show the headline columns only
summary_columns = [
    'char_count', 'input_tokens', 'output_tokens',
    'total_time', 'tokens_per_second', 'success',
]
print("\nTest Results Summary:")
print(results_df[summary_columns])

# Chart the sweep
tester.plot_results(results_df)

## 10. Advanced Troubleshooting - 5000 Character Issue

In [None]:
# Diagnose the 5000+ character issue. The call loads and unloads the model
# itself, so no model is left loaded on model_manager when it returns.
diagnosis_df = diagnose_5k_char_issue(model_manager, sample_text)

## 11. Configuration Comparison

In [None]:
# Presets to run head-to-head on the same 5000-character document
configs_to_compare = "BASELINE HIGH_THROUGHPUT LOW_LATENCY DEBUG_LONG_CONTEXT".split()

comparison_df = ConfigComparison.compare_configs(
    model_manager,
    "load_granite_2b",
    configs_to_compare,
    sample_text,
    5000,
)

comparison_columns = ['config', 'total_time', 'tokens_per_second', 'success']
print("\nConfiguration Comparison Results:")
print(comparison_df[comparison_columns])

ConfigComparison.plot_comparison(comparison_df)

## 12. Custom Configuration Testing

In [None]:
# Hand-built configuration aimed at long single documents
custom_config = dict(
    gpu_memory_utilization=0.98,  # Maximum memory usage
    max_model_len=16384,  # Moderate context length
    max_num_seqs=8,  # Few sequences for more KV cache per sequence
    max_num_batched_tokens=16384,  # Large batch for long documents
    enable_prefix_caching=True,
    enable_chunked_prefill=True,
    block_size=32,  # Larger blocks
    dtype="float16",
)

# Load Granite with the hand-built settings
model = model_manager.load_custom(
    "ibm-granite/granite-3.3-2b-instruct",
    custom_config,
)

# Exercise the sizes at and just above the problematic boundary
test_df = tester.test_extraction(
    sample_text,
    [5000, 5500, 6000],
    SamplingStrategies.LONG_DOCUMENT,
)

custom_columns = ['char_count', 'success', 'total_time', 'finish_reason']
print("\nCustom Configuration Results:")
print(test_df[custom_columns])

## 13. Memory and Performance Monitoring

In [None]:
def monitor_generation_performance(model_manager: "ModelManager",
                                  prompt: str,
                                  sampling_params: "SamplingParams"):
    """Detailed performance monitoring during generation.

    Prints GPU status and this process's resident memory before and after a
    single ``generate`` call, then the token counts, timing and finish reason.

    Args:
        model_manager: Manager whose ``current_model`` runs the generation.
        prompt: Prompt text to generate from.
        sampling_params: Sampling settings passed to ``generate``.

    Returns:
        The first request output returned by ``generate``.

    Raises:
        ValueError: If no model is loaded.
    """
    # Fail early with the same message test_extraction uses, rather than an
    # AttributeError on None once monitoring output has already started.
    model = model_manager.current_model
    if model is None:
        raise ValueError("No model loaded. Use model_manager to load a model first.")

    print("Monitoring generation performance...\n")

    # Pre-generation status
    print("Before generation:")
    GPUMonitor.print_gpu_status()

    # CPU memory (resident set size of this process)
    process = psutil.Process()
    cpu_mem_before = process.memory_info().rss / 1024**3  # GB
    print(f"CPU Memory: {cpu_mem_before:.2f} GB\n")

    # Generation with timing
    started = time.perf_counter()
    outputs = model.generate([prompt], sampling_params)
    total_time = time.perf_counter() - started

    # Post-generation status
    print("\nAfter generation:")
    GPUMonitor.print_gpu_status()

    cpu_mem_after = process.memory_info().rss / 1024**3
    print(f"CPU Memory: {cpu_mem_after:.2f} GB")
    print(f"CPU Memory Delta: {cpu_mem_after - cpu_mem_before:.2f} GB\n")

    # Output analysis
    output = outputs[0]
    completion = output.outputs[0]
    output_tokens = len(completion.token_ids)
    # Guard the division so a zero-length timing cannot raise.
    tokens_per_second = output_tokens / total_time if total_time > 0 else 0

    print("Generation Metrics:")
    print(f"  Input tokens: {len(output.prompt_token_ids)}")
    print(f"  Output tokens: {output_tokens}")
    print(f"  Total time: {total_time:.2f} seconds")
    print(f"  Tokens/second: {tokens_per_second:.1f}")
    print(f"  Finish reason: {completion.finish_reason}")

    return output

# Test with monitoring: runs only when a model is currently loaded, using a
# 5000-character document and the CONSISTENT sampling strategy.
if model_manager.current_model is not None:
    test_prompt = tester.create_test_prompt(sample_text, 5000)
    output = monitor_generation_performance(
        model_manager,
        test_prompt,
        SamplingStrategies.CONSISTENT
    )

## 14. Recommendations and Best Practices

Based on the tests, here are recommendations for resolving the 5000+ character issue:

### Key Findings:
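1. **Token Limit**: The failure most likely tracks the prompt's token count rather than its character count, appearing once the tokenized input crosses a configured limit
2. **KV Cache**: Long sequences may need more KV cache memory than the configuration leaves available
3. **Batching**: A long document's prompt may exceed `max_num_batched_tokens`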

### Recommended Solutions:

#### For Granite 3.3 2B:
```python
config = {
    "gpu_memory_utilization": 0.95,
    "max_model_len": 32768,  # Increase from default
    "max_num_seqs": 16,  # Reduce for more KV cache per sequence
    "max_num_batched_tokens": 8192,  # Increase for long documents
    "enable_chunked_prefill": True,
    "enable_prefix_caching": True,
}
```

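#### For Qwen 2.5 72B:

Note: at `float16`, 72B parameters take roughly 144 GB for the weights alone, which exceeds the combined ~92 GB of two A40s. Use this configuration with a quantized checkpoint (for example an AWQ or GPTQ variant) so the model fits.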
```python
config = {
    "tensor_parallel_size": 2,
    "gpu_memory_utilization": 0.95,
    "max_model_len": 16384,  # Balance memory and context
    "max_num_seqs": 8,
    "max_num_batched_tokens": 4096,
    "enable_chunked_prefill": True,
    "dtype": "float16",
}
```

### Debugging Steps:
1. Monitor token counts at failure point
2. Check GPU memory usage patterns
3. Test with progressively larger max_model_len
4. Enable chunked_prefill for long documents
5. Reduce max_num_seqs to allocate more KV cache per sequence

In [None]:
# Clean up: release whatever model is still loaded (a no-op when none is).
model_manager.unload_model()
print("Notebook execution complete. All models unloaded.")