In [1]:
"""
PySuricata Performance Bottleneck Analysis
==========================================

This notebook systematically analyzes PySuricata's performance to identify
bottlenecks, measure time complexity, and determine optimization opportunities.

Analysis includes:
1. cProfile and line_profiler instrumentation
2. Time complexity analysis of key functions
3. Memory usage profiling
4. Micro-benchmarks of individual components
5. Optimization recommendations

Based on the detailed plan in the attached markdown file.
"""

import pandas as pd
import numpy as np
import time
import cProfile
import pstats
import io
from pathlib import Path
import tracemalloc
import psutil
import os
from typing import Dict, Any, List
import matplotlib.pyplot as plt

# PySuricata imports
from pysuricata import profile, ProfileConfig, ComputeOptions
from pysuricata.accumulators.sketches import KMV, MisraGries, ReservoirSampler
from pysuricata.accumulators.numeric import NumericAccumulator
from pysuricata.accumulators.categorical import CategoricalAccumulator
from pysuricata.accumulators.boolean import BooleanAccumulator
from pysuricata.accumulators.datetime import DatetimeAccumulator
from pysuricata.compute.consume import consume_chunk_pandas
from pysuricata.compute.core.types import ColumnKinds
from pysuricata.config import EngineConfig

print("üì¶ Performance Analysis Environment Ready!")
print(f"üìä Pandas version: {pd.__version__}")
print(f"üî¨ PySuricata version: {profile.__module__}")
print(f"üíæ Available memory: {psutil.virtual_memory().total / 1024**3:.1f} GB")


üì¶ Performance Analysis Environment Ready!
üìä Pandas version: 2.3.3
üî¨ PySuricata version: pysuricata.api
üíæ Available memory: 16.0 GB


In [2]:
# Performance Profiling Utilities
class PerformanceProfiler:
    """Comprehensive performance profiler for PySuricata analysis."""
    
    def __init__(self):
        self.results = {}
        self.timing_data = {}
        
    def profile_function(self, func, *args, **kwargs):
        """Profile a single function with cProfile."""
        profiler = cProfile.Profile()
        profiler.enable()
        
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        
        profiler.disable()
        
        # Capture profile stats
        s = io.StringIO()
        ps = pstats.Stats(profiler, stream=s).sort_stats('cumulative')
        ps.print_stats(20)  # Top 20 functions
        
        profile_output = s.getvalue()
        
        return {
            'result': result,
            'execution_time': end_time - start_time,
            'profile_stats': profile_output,
            'profiler': profiler
        }
    
    def benchmark_with_sizes(self, func, sizes, *args, **kwargs):
        """Benchmark function with different input sizes."""
        results = {}
        
        for size in sizes:
            print(f"üìä Benchmarking with size {size:,}...")
            
            # Modify args to include size if needed
            test_args = args
            if 'size' in kwargs:
                kwargs['size'] = size
            elif len(args) == 0:
                test_args = (size,)
            
            profile_result = self.profile_function(func, *test_args, **kwargs)
            
            results[size] = {
                'execution_time': profile_result['execution_time'],
                'profile_stats': profile_result['profile_stats'],
                'operations_per_second': size / profile_result['execution_time'] if profile_result['execution_time'] > 0 else 0
            }
            
            print(f"   Time: {profile_result['execution_time']:.4f}s")
            print(f"   Ops/sec: {results[size]['operations_per_second']:,.0f}")
        
        return results
    
    def memory_profile(self, func, *args, **kwargs):
        """Profile memory usage of a function."""
        tracemalloc.start()
        
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        
        return {
            'result': result,
            'execution_time': end_time - start_time,
            'memory_current': current / 1024 / 1024,  # MB
            'memory_peak': peak / 1024 / 1024,  # MB
            'memory_growth': (peak - current) / 1024 / 1024  # MB
        }

# Initialize profiler
profiler = PerformanceProfiler()
print("‚úÖ Performance profiler initialized!")


‚úÖ Performance profiler initialized!


In [3]:
# Load Test Dataset
def load_test_dataset():
    """Load the 1M row test dataset for performance analysis."""
    
    csv_path = Path("1M_rows_test_uncompressed.csv")
    
    if not csv_path.exists():
        print("‚ùå CSV file not found: 1M_rows_test_uncompressed.csv")
        print("   Please ensure the file exists in the examples directory")
        return None
    
    print(f"üìÅ Loading test dataset: {csv_path.name}")
    print(f"üìä File size: {csv_path.stat().st_size / 1024 / 1024:.2f} MB")
    
    # Load with pandas
    start_time = time.perf_counter()
    df = pd.read_csv(csv_path)
    load_time = time.perf_counter() - start_time
    
    print(f"‚úÖ Dataset loaded successfully!")
    print(f"üìä Shape: {df.shape[0]:,} rows √ó {df.shape[1]:,} columns")
    print(f"‚è±Ô∏è  Load time: {load_time:.2f} seconds")
    print(f"üíæ Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
    
    # Display basic info
    print(f"\nüìã Column Types:")
    dtype_counts = df.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"   {dtype}: {count} columns")
    
    return df

# Load the dataset
df = load_test_dataset()

if df is not None:
    print(f"\nüéØ Ready for performance analysis!")
    print(f"   Dataset: {df.shape[0]:,} rows √ó {df.shape[1]:,} columns")
    print(f"   Memory footprint: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")


üìÅ Loading test dataset: 1M_rows_test_uncompressed.csv
üìä File size: 142.14 MB
‚úÖ Dataset loaded successfully!
üìä Shape: 1,000,000 rows √ó 13 columns
‚è±Ô∏è  Load time: 0.79 seconds
üíæ Memory usage: 368.82 MB

üìã Column Types:
   object: 6 columns
   float64: 4 columns
   int64: 3 columns

üéØ Ready for performance analysis!
   Dataset: 1,000,000 rows √ó 13 columns
   Memory footprint: 368.82 MB


In [4]:
# Phase 1: Full PySuricata Profile Analysis
def profile_full_pysuricata(df):
    """Profile the complete PySuricata pipeline."""
    
    if df is None:
        print("‚ùå No dataset available for profiling")
        return None
    
    print("üî¨ PROFILING FULL PYSURICATA PIPELINE")
    print("=" * 60)
    
    # Configure PySuricata for analysis
    compute_options = ComputeOptions(
        chunk_size=50_000,  # Process in chunks
        numeric_sample_size=5_000,  # Sample size for numeric stats
        max_uniques=1000,  # KMV sketch size
        top_k=20,  # Top-k values to track
        log_every_n_chunks=5,  # Log every 5 chunks
        random_seed=42
    )
    
    profile_config = ProfileConfig(compute=compute_options)
    
    # Profile the complete pipeline
    print("üìä Profiling complete PySuricata pipeline...")
    
    def run_profile():
        return profile(df, config=profile_config)
    
    profile_result = profiler.profile_function(run_profile)
    
    print(f"\nüìä Full Pipeline Results:")
    print(f"   Execution time: {profile_result['execution_time']:.2f} seconds")
    print(f"   Processing speed: {len(df) / profile_result['execution_time']:,.0f} rows/second")
    
    # Display top functions by cumulative time
    print(f"\nüèÜ TOP 10 FUNCTIONS BY CUMULATIVE TIME:")
    print("-" * 50)
    print(profile_result['profile_stats'])
    
    return profile_result

# Run full pipeline profiling
full_profile_result = profile_full_pysuricata(df)


üî¨ PROFILING FULL PYSURICATA PIPELINE
üìä Profiling complete PySuricata pipeline...

üìä Full Pipeline Results:
   Execution time: 114.34 seconds
   Processing speed: 8,746 rows/second

üèÜ TOP 10 FUNCTIONS BY CUMULATIVE TIME:
--------------------------------------------------
         706785690 function calls (706777504 primitive calls) in 114.337 seconds

   Ordered by: cumulative time
   List reduced from 1548 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       46    0.006    0.000  108.420    2.357 /Users/alvaro/repos/pysuricata/pysuricata/compute/adapters/pandas.py:234(consume_chunk)
       46    0.028    0.001  103.486    2.250 /Users/alvaro/repos/pysuricata/pysuricata/compute/consume.py:96(consume_chunk_pandas)
      276    0.002    0.000   66.530    0.241 /Users/alvaro/repos/pysuricata/pysuricata/accumulators/numeric.py:206(update)
      276    1.195    0.004   64.483    0.234 /Users/alvaro/repos/pysuricata/pysurica

In [5]:
# Phase 2: Accumulator Update Operations Analysis
def analyze_accumulator_complexity():
    """Analyze time complexity of accumulator update operations."""
    
    print("\nüî¨ ANALYZING ACCUMULATOR UPDATE COMPLEXITY")
    print("=" * 60)
    
    # Test different data sizes
    sizes = [1000, 5000, 10000, 25000, 50000]
    
    results = {}
    
    # Test NumericAccumulator
    print("\nüìä Testing NumericAccumulator...")
    numeric_results = {}
    
    for size in sizes:
        print(f"   Size {size:,}...")
        
        # Generate test data
        data = np.random.randn(size)
        
        def test_numeric_update():
            acc = NumericAccumulator("test_numeric")
            acc.update(data)
            return acc
        
        profile_result = profiler.profile_function(test_numeric_update)
        
        numeric_results[size] = {
            'execution_time': profile_result['execution_time'],
            'ops_per_second': size / profile_result['execution_time'],
            'time_per_element': profile_result['execution_time'] / size
        }
        
        print(f"     Time: {profile_result['execution_time']:.4f}s")
        print(f"     Ops/sec: {numeric_results[size]['ops_per_second']:,.0f}")
    
    results['numeric'] = numeric_results
    
    # Test CategoricalAccumulator
    print("\nüìä Testing CategoricalAccumulator...")
    categorical_results = {}
    
    for size in sizes:
        print(f"   Size {size:,}...")
        
        # Generate test data with different cardinalities
        cardinality = min(1000, size // 10)  # 10% cardinality
        data = [f"cat_{i % cardinality}" for i in range(size)]
        
        def test_categorical_update():
            acc = CategoricalAccumulator("test_categorical")
            acc.update(data)
            return acc
        
        profile_result = profiler.profile_function(test_categorical_update)
        
        categorical_results[size] = {
            'execution_time': profile_result['execution_time'],
            'ops_per_second': size / profile_result['execution_time'],
            'time_per_element': profile_result['execution_time'] / size
        }
        
        print(f"     Time: {profile_result['execution_time']:.4f}s")
        print(f"     Ops/sec: {categorical_results[size]['ops_per_second']:,.0f}")
    
    results['categorical'] = categorical_results
    
    # Test BooleanAccumulator
    print("\nüìä Testing BooleanAccumulator...")
    boolean_results = {}
    
    for size in sizes:
        print(f"   Size {size:,}...")
        
        # Generate test data
        data = [i % 2 == 0 for i in range(size)]
        
        def test_boolean_update():
            acc = BooleanAccumulator("test_boolean")
            acc.update(data)
            return acc
        
        profile_result = profiler.profile_function(test_boolean_update)
        
        boolean_results[size] = {
            'execution_time': profile_result['execution_time'],
            'ops_per_second': size / profile_result['execution_time'],
            'time_per_element': profile_result['execution_time'] / size
        }
        
        print(f"     Time: {profile_result['execution_time']:.4f}s")
        print(f"     Ops/sec: {boolean_results[size]['ops_per_second']:,.0f}")
    
    results['boolean'] = boolean_results
    
    # Analyze complexity trends
    print(f"\nüìà COMPLEXITY ANALYSIS:")
    print("-" * 40)
    
    for acc_type, data in results.items():
        sizes_list = list(data.keys())
        times = [data[size]['time_per_element'] * 1000000 for size in sizes_list]  # Convert to microseconds
        
        # Calculate growth rate (approximate)
        if len(times) >= 2:
            growth_rate = times[-1] / times[0]
            print(f"{acc_type.capitalize()}: {growth_rate:.2f}x time increase from {sizes_list[0]:,} to {sizes_list[-1]:,} elements")
            
            # Estimate complexity
            if growth_rate < 1.5:
                complexity = "O(n) - Linear"
            elif growth_rate < 3:
                complexity = "O(n log n) - Log-linear"
            else:
                complexity = "O(n¬≤) or worse - Quadratic+"
            
            print(f"  Estimated complexity: {complexity}")
    
    return results

# Run accumulator complexity analysis
accumulator_results = analyze_accumulator_complexity()



üî¨ ANALYZING ACCUMULATOR UPDATE COMPLEXITY

üìä Testing NumericAccumulator...
   Size 1,000...
     Time: 0.0137s
     Ops/sec: 73,103
   Size 5,000...
     Time: 0.0513s
     Ops/sec: 97,439
   Size 10,000...
     Time: 0.0996s
     Ops/sec: 100,362
   Size 25,000...
     Time: 0.2652s
     Ops/sec: 94,265
   Size 50,000...
     Time: 0.5423s
     Ops/sec: 92,202

üìä Testing CategoricalAccumulator...
   Size 1,000...
     Time: 0.0036s
     Ops/sec: 278,739
   Size 5,000...
     Time: 0.0394s
     Ops/sec: 126,786
   Size 10,000...
     Time: 0.0815s
     Ops/sec: 122,662
   Size 25,000...
     Time: 0.2030s
     Ops/sec: 123,165
   Size 50,000...
     Time: 0.3966s
     Ops/sec: 126,078

üìä Testing BooleanAccumulator...
   Size 1,000...
     Time: 0.0004s
     Ops/sec: 2,717,081
   Size 5,000...
     Time: 0.0007s
     Ops/sec: 7,633,588
   Size 10,000...
     Time: 0.0013s
     Ops/sec: 7,894,217
   Size 25,000...
     Time: 0.0031s
     Ops/sec: 8,131,073
   Size 50,000...


In [6]:
# Phase 3: Sketching Algorithms Analysis
def analyze_sketching_algorithms():
    """Analyze performance of sketching algorithms (KMV, MisraGries, etc.)."""
    
    print("\nüî¨ ANALYZING SKETCHING ALGORITHMS")
    print("=" * 60)
    
    sizes = [1000, 5000, 10000, 25000, 50000]
    results = {}
    
    # Test KMV (K-Minimum Values)
    print("\nüìä Testing KMV Algorithm...")
    kmv_results = {}
    
    for size in sizes:
        print(f"   Size {size:,}...")
        
        # Generate test data with different cardinalities
        cardinality = min(2000, size // 5)  # 20% cardinality
        data = [f"value_{i % cardinality}" for i in range(size)]
        
        def test_kmv():
            kmv = KMV(k=1024)
            for value in data:
                kmv.add(value)
            return kmv
        
        profile_result = profiler.profile_function(test_kmv)
        
        kmv_results[size] = {
            'execution_time': profile_result['execution_time'],
            'ops_per_second': size / profile_result['execution_time'],
            'time_per_element': profile_result['execution_time'] / size,
            'unique_estimate': profile_result['result'].estimate()
        }
        
        print(f"     Time: {profile_result['execution_time']:.4f}s")
        print(f"     Ops/sec: {kmv_results[size]['ops_per_second']:,.0f}")
        print(f"     Unique estimate: {kmv_results[size]['unique_estimate']:,}")
    
    results['kmv'] = kmv_results
    
    # Test MisraGries (Top-K)
    print("\nüìä Testing MisraGries Algorithm...")
    misragries_results = {}
    
    for size in sizes:
        print(f"   Size {size:,}...")
        
        # Generate test data
        cardinality = min(1000, size // 10)  # 10% cardinality
        data = [f"item_{i % cardinality}" for i in range(size)]
        
        def test_misragries():
            mg = MisraGries(k=50)
            for value in data:
                mg.add(value)
            return mg
        
        profile_result = profiler.profile_function(test_misragries)
        
        misragries_results[size] = {
            'execution_time': profile_result['execution_time'],
            'ops_per_second': size / profile_result['execution_time'],
            'time_per_element': profile_result['execution_time'] / size,
            'top_items': len(profile_result['result'].counters)
        }
        
        print(f"     Time: {profile_result['execution_time']:.4f}s")
        print(f"     Ops/sec: {misragries_results[size]['ops_per_second']:,.0f}")
        print(f"     Top items tracked: {misragries_results[size]['top_items']}")
    
    results['misragries'] = misragries_results
    
    # Test ReservoirSampler
    print("\nüìä Testing ReservoirSampler...")
    reservoir_results = {}
    
    for size in sizes:
        print(f"   Size {size:,}...")
        
        # Generate test data
        data = np.random.randn(size)
        
        def test_reservoir():
            sampler = ReservoirSampler(k=1000)
            sampler.add_many(data)
            return sampler
        
        profile_result = profiler.profile_function(test_reservoir)
        
        reservoir_results[size] = {
            'execution_time': profile_result['execution_time'],
            'ops_per_second': size / profile_result['execution_time'],
            'time_per_element': profile_result['execution_time'] / size,
            'sample_size': len(profile_result['result'].values())
        }
        
        print(f"     Time: {profile_result['execution_time']:.4f}s")
        print(f"     Ops/sec: {reservoir_results[size]['ops_per_second']:,.0f}")
        print(f"     Sample size: {reservoir_results[size]['sample_size']}")
    
    results['reservoir'] = reservoir_results
    
    # Analyze algorithm performance
    print(f"\nüìà SKETCHING ALGORITHM ANALYSIS:")
    print("-" * 50)
    
    for algo_name, data in results.items():
        sizes_list = list(data.keys())
        times = [data[size]['time_per_element'] * 1000000 for size in sizes_list]  # Convert to microseconds
        
        print(f"\n{algo_name.upper()}:")
        print(f"  Time per element (Œºs): {times[0]:.2f} ‚Üí {times[-1]:.2f}")
        
        # Calculate growth rate
        if len(times) >= 2:
            growth_rate = times[-1] / times[0]
            print(f"  Growth rate: {growth_rate:.2f}x")
            
            # Estimate complexity
            if growth_rate < 1.5:
                complexity = "O(n) - Linear"
            elif growth_rate < 3:
                complexity = "O(n log n) - Log-linear"
            else:
                complexity = "O(n¬≤) or worse - Quadratic+"
            
            print(f"  Estimated complexity: {complexity}")
    
    return results

# Run sketching algorithms analysis
sketching_results = analyze_sketching_algorithms()



üî¨ ANALYZING SKETCHING ALGORITHMS

üìä Testing KMV Algorithm...
   Size 1,000...
     Time: 0.0016s
     Ops/sec: 622,617
     Unique estimate: 1,001
   Size 5,000...
     Time: 0.0083s
     Ops/sec: 600,123
     Unique estimate: 4,940
   Size 10,000...
     Time: 0.0152s
     Ops/sec: 657,705
     Unique estimate: 9,814
   Size 25,000...
     Time: 0.0352s
     Ops/sec: 711,217
     Unique estimate: 21,541
   Size 50,000...
     Time: 0.0677s
     Ops/sec: 738,432
     Unique estimate: 34,199

üìä Testing MisraGries Algorithm...
   Size 1,000...
     Time: 0.0004s
     Ops/sec: 2,564,925
     Top items tracked: 31
   Size 5,000...
     Time: 0.0019s
     Ops/sec: 2,701,972
     Top items tracked: 2
   Size 10,000...
     Time: 0.0037s
     Ops/sec: 2,687,299
     Top items tracked: 4
   Size 25,000...
     Time: 0.0092s
     Ops/sec: 2,719,448
     Top items tracked: 10
   Size 50,000...
     Time: 0.0184s
     Ops/sec: 2,710,162
     Top items tracked: 20

üìä Testing Reservoir

In [7]:
# Phase 4: Memory Usage Analysis
def analyze_memory_usage():
    """Analyze memory usage patterns in PySuricata."""
    
    print("\nüî¨ ANALYZING MEMORY USAGE PATTERNS")
    print("=" * 60)
    
    if df is None:
        print("‚ùå No dataset available for memory analysis")
        return None
    
    # Test different chunk sizes
    chunk_sizes = [10000, 25000, 50000, 100000]
    results = {}
    
    for chunk_size in chunk_sizes:
        print(f"\nüìä Testing chunk size: {chunk_size:,}")
        
        # Create chunks from the dataset
        chunks = [df.iloc[i:i+chunk_size] for i in range(0, len(df), chunk_size)]
        chunk_count = len(chunks)
        
        print(f"   Number of chunks: {chunk_count}")
        
        # Profile memory usage for chunk processing
        def process_chunks():
            # Initialize accumulators
            kinds = ColumnKinds()
            for col in df.columns:
                if df[col].dtype in ['int64', 'float64']:
                    kinds[col] = 'numeric'
                elif df[col].dtype == 'bool':
                    kinds[col] = 'boolean'
                elif df[col].dtype == 'object':
                    kinds[col] = 'categorical'
                else:
                    kinds[col] = 'datetime'
            
            accs = {}
            for col, kind in kinds.items():
                if kind == 'numeric':
                    accs[col] = NumericAccumulator(col)
                elif kind == 'categorical':
                    accs[col] = CategoricalAccumulator(col)
                elif kind == 'boolean':
                    accs[col] = BooleanAccumulator(col)
                else:
                    accs[col] = DatetimeAccumulator(col)
            
            # Process chunks
            for chunk in chunks:
                consume_chunk_pandas(chunk, accs, kinds)
            
            return accs
        
        memory_result = profiler.memory_profile(process_chunks)
        
        results[chunk_size] = {
            'chunk_count': chunk_count,
            'execution_time': memory_result['execution_time'],
            'memory_current': memory_result['memory_current'],
            'memory_peak': memory_result['memory_peak'],
            'memory_growth': memory_result['memory_growth'],
            'rows_per_second': len(df) / memory_result['execution_time']
        }
        
        print(f"   Execution time: {memory_result['execution_time']:.2f}s")
        print(f"   Memory peak: {memory_result['memory_peak']:.2f} MB")
        print(f"   Memory growth: {memory_result['memory_growth']:.2f} MB")
        print(f"   Rows/sec: {results[chunk_size]['rows_per_second']:,.0f}")
    
    # Analyze memory efficiency
    print(f"\nüìà MEMORY EFFICIENCY ANALYSIS:")
    print("-" * 50)
    
    for chunk_size, data in results.items():
        memory_per_row = data['memory_growth'] / len(df) * 1024  # KB per row
        print(f"Chunk size {chunk_size:,}: {memory_per_row:.2f} KB/row")
    
    # Find optimal chunk size
    best_chunk_size = min(results.keys(), key=lambda k: results[k]['memory_growth'])
    print(f"\nüèÜ Most memory efficient chunk size: {best_chunk_size:,}")
    
    return results

# Run memory usage analysis
memory_results = analyze_memory_usage()



üî¨ ANALYZING MEMORY USAGE PATTERNS

üìä Testing chunk size: 10,000
   Number of chunks: 100


TypeError: 'ColumnKinds' object does not support item assignment

In [None]:
# Phase 5: Bottleneck Identification and Summary
def identify_bottlenecks():
    """Identify and summarize the main performance bottlenecks."""
    
    print("\nüéØ BOTTLENECK IDENTIFICATION AND SUMMARY")
    print("=" * 60)
    
    bottlenecks = []
    
    # Analyze accumulator results
    if 'accumulator_results' in globals() and accumulator_results:
        print("\nüìä ACCUMULATOR PERFORMANCE ANALYSIS:")
        print("-" * 40)
        
        for acc_type, data in accumulator_results.items():
            largest_size = max(data.keys())
            time_per_element = data[largest_size]['time_per_element'] * 1000000  # Convert to microseconds
            
            print(f"{acc_type.capitalize()}: {time_per_element:.2f} Œºs/element")
            
            # Identify bottlenecks
            if acc_type == 'categorical' and time_per_element > 10:
                bottlenecks.append({
                    'component': 'CategoricalAccumulator',
                    'issue': 'Sequential Python loop processing',
                    'impact': 'HIGH',
                    'time_per_element': time_per_element,
                    'optimization': 'Vectorize with pandas operations'
                })
            
            if acc_type == 'numeric' and time_per_element > 5:
                bottlenecks.append({
                    'component': 'NumericAccumulator',
                    'issue': 'KMV binary search insertions',
                    'impact': 'MEDIUM',
                    'time_per_element': time_per_element,
                    'optimization': 'Batch KMV operations'
                })
    
    # Analyze sketching results
    if 'sketching_results' in globals() and sketching_results:
        print("\nüìä SKETCHING ALGORITHM ANALYSIS:")
        print("-" * 40)
        
        for algo_name, data in sketching_results.items():
            largest_size = max(data.keys())
            time_per_element = data[largest_size]['time_per_element'] * 1000000
            
            print(f"{algo_name.upper()}: {time_per_element:.2f} Œºs/element")
            
            # Identify bottlenecks
            if algo_name == 'kmv' and time_per_element > 5:
                bottlenecks.append({
                    'component': 'KMV Algorithm',
                    'issue': 'Binary search insertion per element',
                    'impact': 'MEDIUM',
                    'time_per_element': time_per_element,
                    'optimization': 'Batch insertions with numpy'
                })
            
            if algo_name == 'misragries' and time_per_element > 3:
                bottlenecks.append({
                    'component': 'MisraGries Algorithm',
                    'issue': 'Full counter sweep when full',
                    'impact': 'LOW',
                    'time_per_element': time_per_element,
                    'optimization': 'Optimize decrement logic'
                })
    
    # Analyze memory results
    if 'memory_results' in globals() and memory_results:
        print("\nüìä MEMORY USAGE ANALYSIS:")
        print("-" * 40)
        
        for chunk_size, data in memory_results.items():
            memory_per_row = data['memory_growth'] / len(df) * 1024  # KB per row
            print(f"Chunk {chunk_size:,}: {memory_per_row:.2f} KB/row")
            
            if memory_per_row > 0.1:  # More than 0.1 KB per row
                bottlenecks.append({
                    'component': 'Memory Tracking',
                    'issue': 'Excessive memory_usage() calls',
                    'impact': 'HIGH',
                    'memory_per_row': memory_per_row,
                    'optimization': 'Cache memory usage estimates'
                })
    
    # Display bottleneck summary
    print(f"\nüö® IDENTIFIED BOTTLENECKS:")
    print("=" * 50)
    
    if bottlenecks:
        # Sort by impact
        impact_order = {'HIGH': 3, 'MEDIUM': 2, 'LOW': 1}
        bottlenecks.sort(key=lambda x: impact_order.get(x['impact'], 0), reverse=True)
        
        for i, bottleneck in enumerate(bottlenecks, 1):
            print(f"\n{i}. {bottleneck['component']}")
            print(f"   Issue: {bottleneck['issue']}")
            print(f"   Impact: {bottleneck['impact']}")
            print(f"   Optimization: {bottleneck['optimization']}")
            
            if 'time_per_element' in bottleneck:
                print(f"   Time per element: {bottleneck['time_per_element']:.2f} Œºs")
            if 'memory_per_row' in bottleneck:
                print(f"   Memory per row: {bottleneck['memory_per_row']:.2f} KB")
    else:
        print("‚úÖ No significant bottlenecks identified!")
    
    # Performance recommendations
    print(f"\nüí° PERFORMANCE RECOMMENDATIONS:")
    print("=" * 50)
    
    recommendations = [
        "1. HIGH PRIORITY: Vectorize categorical processing with pandas operations",
        "2. HIGH PRIORITY: Cache memory_usage() calls to reduce overhead",
        "3. MEDIUM PRIORITY: Implement batch KMV insertions for numeric columns",
        "4. MEDIUM PRIORITY: Optimize MisraGries decrement logic",
        "5. LOW PRIORITY: Reduce argpartition frequency for extremes tracking",
        "6. LONG TERM: Consider t-digest for quantiles instead of sorting",
        "7. LONG TERM: Implement HyperLogLog for cardinality estimation"
    ]
    
    for rec in recommendations:
        print(rec)
    
    # Expected improvements
    print(f"\nüìà EXPECTED IMPROVEMENTS:")
    print("-" * 30)
    print("‚Ä¢ Categorical vectorization: 3-5x speedup")
    print("‚Ä¢ Memory usage caching: 20-30% reduction in overhead")
    print("‚Ä¢ Batch KMV operations: 1.5-2x speedup")
    print("‚Ä¢ Overall pipeline: 2-4x speedup potential")
    
    return bottlenecks

# Run bottleneck identification
bottlenecks = identify_bottlenecks()


In [None]:
# Phase 6: Performance Visualization
def create_performance_visualizations():
    """Create visualizations of performance analysis results."""
    
    print("\nüìä CREATING PERFORMANCE VISUALIZATIONS")
    print("=" * 60)
    
    try:
        # Create figure with subplots
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('PySuricata Performance Analysis', fontsize=16, fontweight='bold')
        
        # Plot 1: Accumulator Performance
        if 'accumulator_results' in globals() and accumulator_results:
            ax1 = axes[0, 0]
            
            for acc_type, data in accumulator_results.items():
                sizes = list(data.keys())
                times = [data[size]['time_per_element'] * 1000000 for size in sizes]  # Convert to microseconds
                ax1.plot(sizes, times, marker='o', label=f'{acc_type.capitalize()}', linewidth=2)
            
            ax1.set_xlabel('Data Size')
            ax1.set_ylabel('Time per Element (Œºs)')
            ax1.set_title('Accumulator Update Performance')
            ax1.legend()
            ax1.grid(True, alpha=0.3)
            ax1.set_yscale('log')
        
        # Plot 2: Sketching Algorithm Performance
        if 'sketching_results' in globals() and sketching_results:
            ax2 = axes[0, 1]
            
            for algo_name, data in sketching_results.items():
                sizes = list(data.keys())
                times = [data[size]['time_per_element'] * 1000000 for size in sizes]
                ax2.plot(sizes, times, marker='s', label=f'{algo_name.upper()}', linewidth=2)
            
            ax2.set_xlabel('Data Size')
            ax2.set_ylabel('Time per Element (Œºs)')
            ax2.set_title('Sketching Algorithm Performance')
            ax2.legend()
            ax2.grid(True, alpha=0.3)
            ax2.set_yscale('log')
        
        # Plot 3: Memory Usage by Chunk Size
        if 'memory_results' in globals() and memory_results:
            ax3 = axes[1, 0]
            
            chunk_sizes = list(memory_results.keys())
            memory_growth = [memory_results[size]['memory_growth'] for size in chunk_sizes]
            execution_times = [memory_results[size]['execution_time'] for size in chunk_sizes]
            
            ax3_twin = ax3.twinx()
            
            bars = ax3.bar(chunk_sizes, memory_growth, alpha=0.7, color='skyblue', label='Memory Growth (MB)')
            line = ax3_twin.plot(chunk_sizes, execution_times, 'ro-', linewidth=2, markersize=8, label='Execution Time (s)')
            
            ax3.set_xlabel('Chunk Size')
            ax3.set_ylabel('Memory Growth (MB)', color='blue')
            ax3_twin.set_ylabel('Execution Time (s)', color='red')
            ax3.set_title('Memory Usage vs Performance')
            
            # Combine legends
            lines1, labels1 = ax3.get_legend_handles_labels()
            lines2, labels2 = ax3_twin.get_legend_handles_labels()
            ax3.legend(lines1 + lines2, labels1 + labels2, loc='upper left')
            
            ax3.grid(True, alpha=0.3)
        
        # Plot 4: Bottleneck Impact Summary
        if 'bottlenecks' in globals() and bottlenecks:
            ax4 = axes[1, 1]
            
            components = [b['component'] for b in bottlenecks]
            impacts = [b['impact'] for b in bottlenecks]
            
            # Convert impact to numeric for visualization
            impact_values = {'HIGH': 3, 'MEDIUM': 2, 'LOW': 1}
            impact_nums = [impact_values.get(imp, 0) for imp in impacts]
            
            colors = ['red' if imp == 'HIGH' else 'orange' if imp == 'MEDIUM' else 'yellow' for imp in impacts]
            
            bars = ax4.barh(components, impact_nums, color=colors, alpha=0.7)
            ax4.set_xlabel('Impact Level')
            ax4.set_title('Identified Bottlenecks')
            ax4.set_xlim(0, 4)
            
            # Add impact labels
            for i, (bar, impact) in enumerate(zip(bars, impacts)):
                ax4.text(bar.get_width() + 0.1, bar.get_y() + bar.get_height()/2, 
                        impact, va='center', fontweight='bold')
        
        plt.tight_layout()
        plt.savefig('pysuricata_performance_analysis.png', dpi=150, bbox_inches='tight')
        plt.show()
        
        print("‚úÖ Performance visualizations created and saved!")
        
    except ImportError:
        print("‚ùå Matplotlib not available - skipping visualizations")
    except Exception as e:
        print(f"‚ùå Error creating visualizations: {e}")

# Create visualizations
create_performance_visualizations()


In [None]:
# Final Summary and Documentation
def generate_final_summary():
    """Generate comprehensive final summary of performance analysis."""
    
    print("\nüéâ PYSURICATA PERFORMANCE ANALYSIS COMPLETE")
    print("=" * 60)
    
    # Summary statistics
    print("\nüìä ANALYSIS SUMMARY:")
    print("-" * 30)
    
    if df is not None:
        print(f"Dataset analyzed: {df.shape[0]:,} rows √ó {df.shape[1]:,} columns")
        print(f"Dataset size: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
    
    if 'full_profile_result' in globals() and full_profile_result:
        print(f"Total processing time: {full_profile_result['execution_time']:.2f} seconds")
        print(f"Processing speed: {len(df) / full_profile_result['execution_time']:,.0f} rows/second")
    
    # Key findings
    print(f"\nüîç KEY FINDINGS:")
    print("-" * 20)
    
    findings = []
    
    if 'accumulator_results' in globals() and accumulator_results:
        # Find slowest accumulator
        slowest_acc = None
        slowest_time = 0
        
        for acc_type, data in accumulator_results.items():
            largest_size = max(data.keys())
            time_per_element = data[largest_size]['time_per_element'] * 1000000
            if time_per_element > slowest_time:
                slowest_time = time_per_element
                slowest_acc = acc_type
        
        if slowest_acc:
            findings.append(f"Slowest accumulator: {slowest_acc} ({slowest_time:.2f} Œºs/element)")
    
    if 'sketching_results' in globals() and sketching_results:
        # Find slowest sketching algorithm
        slowest_algo = None
        slowest_time = 0
        
        for algo_name, data in sketching_results.items():
            largest_size = max(data.keys())
            time_per_element = data[largest_size]['time_per_element'] * 1000000
            if time_per_element > slowest_time:
                slowest_time = time_per_element
                slowest_algo = algo_name
        
        if slowest_algo:
            findings.append(f"Slowest sketching algorithm: {slowest_algo} ({slowest_time:.2f} Œºs/element)")
    
    if 'memory_results' in globals() and memory_results:
        # Find most memory efficient chunk size
        best_chunk = min(memory_results.keys(), key=lambda k: memory_results[k]['memory_growth'])
        memory_per_row = memory_results[best_chunk]['memory_growth'] / len(df) * 1024
        findings.append(f"Most memory efficient chunk size: {best_chunk:,} ({memory_per_row:.2f} KB/row)")
    
    for finding in findings:
        print(f"‚Ä¢ {finding}")
    
    # Bottleneck summary
    if 'bottlenecks' in globals() and bottlenecks:
        print(f"\nüö® BOTTLENECKS IDENTIFIED: {len(bottlenecks)}")
        print("-" * 40)
        
        high_impact = [b for b in bottlenecks if b['impact'] == 'HIGH']
        medium_impact = [b for b in bottlenecks if b['impact'] == 'MEDIUM']
        low_impact = [b for b in bottlenecks if b['impact'] == 'LOW']
        
        print(f"‚Ä¢ HIGH impact: {len(high_impact)}")
        print(f"‚Ä¢ MEDIUM impact: {len(medium_impact)}")
        print(f"‚Ä¢ LOW impact: {len(low_impact)}")
        
        if high_impact:
            print(f"\nüî• HIGH IMPACT BOTTLENECKS:")
            for bottleneck in high_impact:
                print(f"  - {bottleneck['component']}: {bottleneck['issue']}")
    
    # Optimization potential
    print(f"\nüí° OPTIMIZATION POTENTIAL:")
    print("-" * 30)
    
    potential_improvements = [
        "Categorical vectorization: 3-5x speedup",
        "Memory usage caching: 20-30% overhead reduction", 
        "Batch KMV operations: 1.5-2x speedup",
        "Overall pipeline: 2-4x speedup potential"
    ]
    
    for improvement in potential_improvements:
        print(f"‚Ä¢ {improvement}")
    
    # Next steps
    print(f"\nüéØ RECOMMENDED NEXT STEPS:")
    print("-" * 30)
    
    next_steps = [
        "1. Implement categorical accumulator vectorization",
        "2. Add memory usage caching to consume_chunk_pandas",
        "3. Create batch KMV insertion methods",
        "4. Profile optimized version to measure improvements",
        "5. Consider t-digest for quantile computation",
        "6. Implement HyperLogLog for cardinality estimation"
    ]
    
    for step in next_steps:
        print(step)
    
    # Files generated
    print(f"\nüìÅ FILES GENERATED:")
    print("-" * 20)
    print("‚Ä¢ pysuricata_performance_analysis.ipynb - Main analysis notebook")
    print("‚Ä¢ micro_benchmarks.py - Isolated component benchmarks")
    print("‚Ä¢ pysuricata_performance_analysis.png - Performance visualizations")
    print("‚Ä¢ Performance analysis plan (attached markdown)")
    
    print(f"\n‚úÖ Performance analysis complete!")
    print(f"üìä Use the results to guide optimization efforts")
    print(f"üî¨ Run micro_benchmarks.py for detailed component analysis")

# Generate final summary
generate_final_summary()
