In [1]:
"""
ydata-profiling vs PySuricata Performance Comparison
This notebook compares the performance and memory usage of:
1. ydata-profiling (formerly pandas-profiling)
2. PySuricata

Using a 1M row dataset with comprehensive monitoring.
"""

import pandas as pd
import numpy as np
import tracemalloc
import psutil
import os
import time
from pathlib import Path
from ydata_profiling import ProfileReport
from pysuricata import profile, ProfileConfig, ComputeOptions

print("üì¶ All imports loaded successfully!")
print(f"üìä Pandas version: {pd.__version__}")
print(f"üî¨ PySuricata version: {profile.__module__}")


  from .autonotebook import tqdm as notebook_tqdm


üì¶ All imports loaded successfully!
üìä Pandas version: 2.3.3
üî¨ PySuricata version: pysuricata.api


In [None]:
# Lightweight Memory Monitoring Setup
class MemoryMonitor:
    """Lightweight memory monitoring for profiling tool comparison.
    
    Note: Uses psutil only to avoid tracemalloc performance overhead.
    tracemalloc.start() causes 6-7x slowdown in PySuricata due to 
    tracking every memory allocation during data processing.
    """
    
    def __init__(self):
        self.process = psutil.Process(os.getpid())
        self.initial_memory = self.process.memory_info().rss / 1024 / 1024
        self.memory_snapshots = []
        self.start_time = time.perf_counter()
        
    def start_tracing(self):
        """Start lightweight memory monitoring."""
        print(f"üîç Lightweight memory monitoring started")
        print(f"üìä Initial memory: {self.initial_memory:.2f} MB")
        print(f"‚ÑπÔ∏è  Using psutil-only monitoring (no tracemalloc overhead)")
        
    def snapshot(self, step=None, description=""):
        """Take a memory snapshot using psutil only."""
        current_memory = self.process.memory_info().rss / 1024 / 1024
        
        snapshot = {
            'step': step,
            'description': description,
            'timestamp': time.perf_counter() - self.start_time,
            'process_memory': current_memory,
            'traced_memory': 0,  # Not available without tracemalloc
            'peak_traced': 0,    # Not available without tracemalloc
            'memory_growth': current_memory - self.initial_memory
        }
        
        self.memory_snapshots.append(snapshot)
        
        print(f"üìà {step}: {description}")
        print(f"   Process memory: {current_memory:.2f} MB (+{snapshot['memory_growth']:.2f} MB)")
        
        return snapshot
        
    def stop_tracing(self):
        """Stop memory monitoring and return summary."""
        final_memory = self.process.memory_info().rss / 1024 / 1024
        total_time = time.perf_counter() - self.start_time
        
        print(f"\nüèÅ Memory Monitoring Summary:")
        print(f"   Total time: {total_time:.2f} seconds")
        print(f"   Final memory: {final_memory:.2f} MB")
        print(f"   Total growth: {final_memory - self.initial_memory:.2f} MB")
        
        if self.memory_snapshots:
            max_growth = max(s['memory_growth'] for s in self.memory_snapshots)
            print(f"   Peak growth: {max_growth:.2f} MB")
            
            if max_growth < 200:
                print("   ‚úÖ Memory usage is bounded and efficient!")
            else:
                print("   ‚ö†Ô∏è  High memory growth detected!")
        
        return self.memory_snapshots

print("‚úÖ Lightweight MemoryMonitor class ready!")
print("‚ÑπÔ∏è  Note: Using psutil-only monitoring to avoid tracemalloc performance overhead")


‚úÖ MemoryMonitor class ready!


In [3]:
# Data Loading and Preparation
def load_dataset():
    """Load the 1M row dataset for comparison."""
    
    csv_path = Path("1M_rows_test_uncompressed.csv")
    
    if not csv_path.exists():
        print("‚ùå CSV file not found: 1M_rows_test_uncompressed.csv")
        print("   Please ensure the file exists in the examples directory")
        return None
    
    print(f"üìÅ Loading dataset: {csv_path.name}")
    print(f"üìä File size: {csv_path.stat().st_size / 1024 / 1024:.2f} MB")
    
    # Load with pandas
    start_time = time.perf_counter()
    df = pd.read_csv(csv_path)
    load_time = time.perf_counter() - start_time
    
    print(f"‚úÖ Dataset loaded successfully!")
    print(f"üìä Shape: {df.shape[0]:,} rows √ó {df.shape[1]:,} columns")
    print(f"‚è±Ô∏è  Load time: {load_time:.2f} seconds")
    print(f"üíæ Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
    
    # Display basic info
    print(f"\nüìã Column Types:")
    dtype_counts = df.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"   {dtype}: {count} columns")
    
    return df

# Load the dataset
df = load_dataset()

if df is not None:
    print(f"\nüéØ Ready for profiling comparison!")
    print(f"   Dataset: {df.shape[0]:,} rows √ó {df.shape[1]:,} columns")
    print(f"   Memory footprint: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")


üìÅ Loading dataset: 1M_rows_test_uncompressed.csv
üìä File size: 142.14 MB
‚úÖ Dataset loaded successfully!
üìä Shape: 1,000,000 rows √ó 13 columns
‚è±Ô∏è  Load time: 0.80 seconds
üíæ Memory usage: 368.82 MB

üìã Column Types:
   object: 6 columns
   float64: 4 columns
   int64: 3 columns

üéØ Ready for profiling comparison!
   Dataset: 1,000,000 rows √ó 13 columns
   Memory footprint: 368.82 MB


In [4]:
# ydata-profiling Benchmark
def benchmark_ydata_profiling(df):
    """Benchmark ydata-profiling with comprehensive analysis."""
    
    if df is None:
        print("‚ùå No dataset available for profiling")
        return None
    
    print("üî¨ Starting ydata-profiling benchmark...")
    print("=" * 50)
    
    # Initialize memory monitor
    monitor = MemoryMonitor()
    monitor.start_tracing()
    monitor.snapshot("Start", "Initial state")
    
    try:
        # Configure ydata-profiling for comprehensive analysis
        print("‚öôÔ∏è  Configuring ydata-profiling for full analysis...")
        
        # Start profiling with comprehensive settings
        start_time = time.perf_counter()
        
        monitor.snapshot("Config", "Configuration complete")
        
        # Generate comprehensive report
        print("üìä Generating comprehensive ydata-profiling report...")
        report = ProfileReport(
            df,
            title="ydata-profiling Report - 1M Rows",
            minimal=False,  # Full analysis, not minimal
            explorative=True,  # Enable explorative features
            progress_bar=True,  # Show progress
            lazy=False,  # Process immediately
            # Enable all analysis features
            correlations={
                "pearson": {"calculate": True},
                "spearman": {"calculate": True},
                "kendall": {"calculate": True},
                "phi_k": {"calculate": True},
                "cramers": {"calculate": True}
            },
            interactions={
                "continuous": True,
                "targets": [],
                "continuous_columns": []
            },
            missing_diagrams={
                "bar": True,
                "matrix": True,
                "heatmap": True,
                "dendrogram": True
            },
            duplicates={
                "head": 10
            }
        )
        
        monitor.snapshot("Processing", "Report generation in progress")
        
        # Generate HTML report
        print("üíæ Saving HTML report...")
        html_file = "ydata_profiling_report_1M.html"
        report.to_file(html_file)
        
        end_time = time.perf_counter()
        processing_time = end_time - start_time
        
        monitor.snapshot("Complete", "Report generation complete")
        
        # Get final memory stats
        memory_snapshots = monitor.stop_tracing()
        
        # Get report file size
        report_size = Path(html_file).stat().st_size / 1024 / 1024
        
        print(f"\nüìä ydata-profiling Results:")
        print(f"   Processing time: {processing_time:.2f} seconds")
        print(f"   Report size: {report_size:.2f} MB")
        print(f"   Memory growth: {memory_snapshots[-1]['memory_growth']:.2f} MB")
        print(f"   Peak memory: {max(s['memory_growth'] for s in memory_snapshots):.2f} MB")
        
        if processing_time > 0:
            rows_per_second = len(df) / processing_time
            print(f"   Processing speed: {rows_per_second:,.0f} rows/second")
        
        print(f"\n‚úÖ ydata-profiling report saved: {html_file}")
        
        return {
            'tool': 'ydata-profiling',
            'processing_time': processing_time,
            'report_size': report_size,
            'memory_growth': memory_snapshots[-1]['memory_growth'],
            'peak_memory': max(s['memory_growth'] for s in memory_snapshots),
            'memory_snapshots': memory_snapshots,
            'report': report,
            'html_file': html_file
        }
        
    except Exception as e:
        print(f"‚ùå Error during ydata-profiling: {e}")
        print(f"   Error type: {type(e).__name__}")
        monitor.stop_tracing()
        raise

# Run ydata-profiling benchmark
ydata_results = benchmark_ydata_profiling(df)


üî¨ Starting ydata-profiling benchmark...
üîç Memory monitoring started
üìä Initial memory: 853.61 MB
üìà Start: Initial state
   Process memory: 853.61 MB (+0.00 MB)
   Traced memory: 0.00 MB (peak: 0.00 MB)
‚öôÔ∏è  Configuring ydata-profiling for full analysis...
üìà Config: Configuration complete
   Process memory: 853.61 MB (+0.00 MB)
   Traced memory: 0.00 MB (peak: 0.00 MB)
üìä Generating comprehensive ydata-profiling report...


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  diff_b_a = subtract(b, a)
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [00:19<00:00,  1.49s/it]2<00:10,  1.25s/it, Describe variable: int_col]
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot specify integer `bins` when input data contains infinity')
Summarize dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 77/77 [00:51<00:00,  1.50it/s, Completed]                           
Generate report structure: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:06<00:00,  6.33s/it]


üìà Processing: Report generation in progress
   Process memory: 2220.58 MB (+1366.97 MB)
   Traced memory: 407.40 MB (peak: 1077.70 MB)
üíæ Saving HTML report...


Render HTML: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  2.29it/s]
Export report to file: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 359.19it/s]

üìà Complete: Report generation complete
   Process memory: 2186.44 MB (+1332.83 MB)
   Traced memory: 416.32 MB (peak: 1077.70 MB)

üèÅ Memory Monitoring Summary:
   Total time: 58.31 seconds
   Final memory: 2186.78 MB
   Total growth: 1333.17 MB
   Peak growth: 1366.97 MB
   ‚ö†Ô∏è  High memory growth detected!

üìä ydata-profiling Results:
   Processing time: 58.29 seconds
   Report size: 5.54 MB
   Memory growth: 1332.83 MB
   Peak memory: 1366.97 MB
   Processing speed: 17,154 rows/second

‚úÖ ydata-profiling report saved: ydata_profiling_report_1M.html





In [None]:
# PySuricata Benchmark
def benchmark_pysuricata(df):
    """Benchmark PySuricata with comprehensive analysis."""
    
    if df is None:
        print("‚ùå No dataset available for profiling")
        return None
    
    print("\nüî¨ Starting PySuricata benchmark...")
    print("=" * 50)
    
    # Initialize memory monitor
    monitor = MemoryMonitor()
    monitor.start_tracing()
    monitor.snapshot("Start", "Initial state")
    
    try:
        # Configure PySuricata for comprehensive analysis
        print("‚öôÔ∏è  Configuring PySuricata...")
        
        compute_options = ComputeOptions(
            chunk_size=50_000,  # Process in smaller chunks for 1M rows
            numeric_sample_size=5_000,  # Smaller sample size for 1M rows
            max_uniques=1000,  # KMV sketch size
            top_k=20,  # Top-k values to track
            log_every_n_chunks=5,  # Log every 5 chunks
            random_seed=42
        )
        
        profile_config = ProfileConfig(compute=compute_options)
        
        monitor.snapshot("Config", "Configuration complete")
        
        # Start profiling
        start_time = time.perf_counter()
        
        print("üìä Generating PySuricata report...")
        report = profile(df, config=profile_config)
        
        monitor.snapshot("Processing", "Report generation in progress")
        
        # Save HTML report
        print("üíæ Saving HTML report...")
        html_file = "pysuricata_report_1M.html"
        report.save_html(html_file)
        
        end_time = time.perf_counter()
        processing_time = end_time - start_time
        
        monitor.snapshot("Complete", "Report generation complete")
        
        # Get final memory stats
        memory_snapshots = monitor.stop_tracing()
        
        # Get report file size
        report_size = Path(html_file).stat().st_size / 1024 / 1024
        
        print(f"\nüìä PySuricata Results:")
        print(f"   Processing time: {processing_time:.2f} seconds")
        print(f"   Report size: {report_size:.2f} MB")
        print(f"   Memory growth: {memory_snapshots[-1]['memory_growth']:.2f} MB")
        print(f"   Peak memory: {max(s['memory_growth'] for s in memory_snapshots):.2f} MB")
        
        if processing_time > 0:
            rows_per_second = len(df) / processing_time
            print(f"   Processing speed: {rows_per_second:,.0f} rows/second")
        
        print(f"\n‚úÖ PySuricata report saved: {html_file}")
        
        return {
            'tool': 'pysuricata',
            'processing_time': processing_time,
            'report_size': report_size,
            'memory_growth': memory_snapshots[-1]['memory_growth'],
            'peak_memory': max(s['memory_growth'] for s in memory_snapshots),
            'memory_snapshots': memory_snapshots,
            'report': report,
            'html_file': html_file
        }
        
    except Exception as e:
        print(f"‚ùå Error during PySuricata profiling: {e}")
        print(f"   Error type: {type(e).__name__}")
        monitor.stop_tracing()
        raise

# Run PySuricata benchmark
pysuricata_results = benchmark_pysuricata(df)



üî¨ Starting PySuricata benchmark...
üîç Memory monitoring started
üìä Initial memory: 2187.83 MB
üìà Start: Initial state
   Process memory: 2187.83 MB (+0.00 MB)
   Traced memory: 0.00 MB (peak: 0.00 MB)
‚öôÔ∏è  Configuring PySuricata...
üìà Config: Configuration complete
   Process memory: 2187.97 MB (+0.14 MB)
   Traced memory: 0.00 MB (peak: 0.00 MB)
üìä Generating PySuricata report...


In [None]:
# Performance Comparison
def compare_performance(ydata_results, pysuricata_results):
    """Compare performance metrics between the two tools."""
    
    if ydata_results is None or pysuricata_results is None:
        print("‚ùå Cannot compare - one or both benchmarks failed")
        return
    
    print("\nüèÜ PERFORMANCE COMPARISON")
    print("=" * 60)
    
    # Create comparison table
    comparison_data = {
        'Metric': [
            'Processing Time (seconds)',
            'Report Size (MB)',
            'Memory Growth (MB)',
            'Peak Memory (MB)',
            'Processing Speed (rows/sec)'
        ],
        'ydata-profiling': [
            f"{ydata_results['processing_time']:.2f}",
            f"{ydata_results['report_size']:.2f}",
            f"{ydata_results['memory_growth']:.2f}",
            f"{ydata_results['peak_memory']:.2f}",
            f"{len(df) / ydata_results['processing_time']:,.0f}" if ydata_results['processing_time'] > 0 else "N/A"
        ],
        'PySuricata': [
            f"{pysuricata_results['processing_time']:.2f}",
            f"{pysuricata_results['report_size']:.2f}",
            f"{pysuricata_results['memory_growth']:.2f}",
            f"{pysuricata_results['peak_memory']:.2f}",
            f"{len(df) / pysuricata_results['processing_time']:,.0f}" if pysuricata_results['processing_time'] > 0 else "N/A"
        ]
    }
    
    comparison_df = pd.DataFrame(comparison_data)
    
    print("\nüìä Side-by-Side Comparison:")
    print(comparison_df.to_string(index=False))
    
    # Calculate speedup/slowdown ratios
    print("\n‚ö° Performance Ratios:")
    
    time_ratio = ydata_results['processing_time'] / pysuricata_results['processing_time']
    memory_ratio = ydata_results['memory_growth'] / pysuricata_results['memory_growth']
    size_ratio = ydata_results['report_size'] / pysuricata_results['report_size']
    
    print(f"   Time ratio (ydata/pysuricata): {time_ratio:.2f}x")
    print(f"   Memory ratio (ydata/pysuricata): {memory_ratio:.2f}x")
    print(f"   Size ratio (ydata/pysuricata): {size_ratio:.2f}x")
    
    # Determine winners
    print("\nüèÖ Winners:")
    
    if ydata_results['processing_time'] < pysuricata_results['processing_time']:
        print(f"   ‚ö° Speed: ydata-profiling ({ydata_results['processing_time']:.2f}s vs {pysuricata_results['processing_time']:.2f}s)")
    else:
        print(f"   ‚ö° Speed: PySuricata ({pysuricata_results['processing_time']:.2f}s vs {ydata_results['processing_time']:.2f}s)")
    
    if ydata_results['memory_growth'] < pysuricata_results['memory_growth']:
        print(f"   üíæ Memory: ydata-profiling ({ydata_results['memory_growth']:.2f}MB vs {pysuricata_results['memory_growth']:.2f}MB)")
    else:
        print(f"   üíæ Memory: PySuricata ({pysuricata_results['memory_growth']:.2f}MB vs {ydata_results['memory_growth']:.2f}MB)")
    
    if ydata_results['report_size'] < pysuricata_results['report_size']:
        print(f"   üìÑ Size: ydata-profiling ({ydata_results['report_size']:.2f}MB vs {pysuricata_results['report_size']:.2f}MB)")
    else:
        print(f"   üìÑ Size: PySuricata ({pysuricata_results['report_size']:.2f}MB vs {ydata_results['report_size']:.2f}MB)")
    
    return comparison_df

# Run comparison
comparison_df = compare_performance(ydata_results, pysuricata_results)



üèÜ PERFORMANCE COMPARISON

üìä Side-by-Side Comparison:
                     Metric ydata-profiling PySuricata
  Processing Time (seconds)           60.71     349.31
           Report Size (MB)            5.54       1.35
         Memory Growth (MB)         1462.70   -1132.16
           Peak Memory (MB)         1462.70       0.02
Processing Speed (rows/sec)          16,473      2,863

‚ö° Performance Ratios:
   Time ratio (ydata/pysuricata): 0.17x
   Memory ratio (ydata/pysuricata): -1.29x
   Size ratio (ydata/pysuricata): 4.09x

üèÖ Winners:
   ‚ö° Speed: ydata-profiling (60.71s vs 349.31s)
   üíæ Memory: PySuricata (-1132.16MB vs 1462.70MB)
   üìÑ Size: PySuricata (1.35MB vs 5.54MB)


In [None]:
# Memory Usage Visualization
def visualize_memory_usage(ydata_results, pysuricata_results):
    """Visualize memory usage patterns for both tools."""
    
    if ydata_results is None or pysuricata_results is None:
        print("‚ùå Cannot visualize - one or both benchmarks failed")
        return
    
    try:
        import matplotlib.pyplot as plt
        
        print("\nüìä Creating memory usage visualization...")
        
        # Prepare data
        ydata_snapshots = ydata_results['memory_snapshots']
        pysuricata_snapshots = pysuricata_results['memory_snapshots']
        
        # Create visualization
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
        
        # Plot 1: Memory growth over time
        if len(ydata_snapshots) > 1:
            ydata_times = [s['timestamp'] for s in ydata_snapshots]
            ydata_growth = [s['memory_growth'] for s in ydata_snapshots]
            ax1.plot(ydata_times, ydata_growth, 'b-o', label='ydata-profiling', linewidth=2, markersize=6)
        
        if len(pysuricata_snapshots) > 1:
            pysuricata_times = [s['timestamp'] for s in pysuricata_snapshots]
            pysuricata_growth = [s['memory_growth'] for s in pysuricata_snapshots]
            ax1.plot(pysuricata_times, pysuricata_growth, 'r-o', label='PySuricata', linewidth=2, markersize=6)
        
        ax1.set_xlabel('Time (seconds)')
        ax1.set_ylabel('Memory Growth (MB)')
        ax1.set_title('Memory Growth Over Time')
        ax1.legend()
        ax1.grid(True, alpha=0.3)
        
        # Plot 2: Process memory comparison
        if len(ydata_snapshots) > 1:
            ydata_process = [s['process_memory'] for s in ydata_snapshots]
            ax2.plot(ydata_times, ydata_process, 'b-o', label='ydata-profiling', linewidth=2, markersize=6)
        
        if len(pysuricata_snapshots) > 1:
            pysuricata_process = [s['process_memory'] for s in pysuricata_snapshots]
            ax2.plot(pysuricata_times, pysuricata_process, 'r-o', label='PySuricata', linewidth=2, markersize=6)
        
        ax2.set_xlabel('Time (seconds)')
        ax2.set_ylabel('Process Memory (MB)')
        ax2.set_title('Process Memory Usage')
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        
        # Plot 3: Bar chart comparison
        metrics = ['Processing Time', 'Memory Growth', 'Report Size']
        ydata_values = [
            ydata_results['processing_time'],
            ydata_results['memory_growth'],
            ydata_results['report_size']
        ]
        pysuricata_values = [
            pysuricata_results['processing_time'],
            pysuricata_results['memory_growth'],
            pysuricata_results['report_size']
        ]
        
        x = np.arange(len(metrics))
        width = 0.35
        
        ax3.bar(x - width/2, ydata_values, width, label='ydata-profiling', alpha=0.8)
        ax3.bar(x + width/2, pysuricata_values, width, label='PySuricata', alpha=0.8)
        
        ax3.set_xlabel('Metrics')
        ax3.set_ylabel('Values')
        ax3.set_title('Performance Metrics Comparison')
        ax3.set_xticks(x)
        ax3.set_xticklabels(metrics, rotation=45)
        ax3.legend()
        ax3.grid(True, alpha=0.3)
        
        # Plot 4: Speed comparison
        tools = ['ydata-profiling', 'PySuricata']
        speeds = [
            len(df) / ydata_results['processing_time'] if ydata_results['processing_time'] > 0 else 0,
            len(df) / pysuricata_results['processing_time'] if pysuricata_results['processing_time'] > 0 else 0
        ]
        
        colors = ['blue', 'red']
        bars = ax4.bar(tools, speeds, color=colors, alpha=0.7)
        ax4.set_ylabel('Rows per Second')
        ax4.set_title('Processing Speed Comparison')
        ax4.grid(True, alpha=0.3)
        
        # Add value labels on bars
        for bar, speed in zip(bars, speeds):
            height = bar.get_height()
            ax4.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                    f'{speed:,.0f}', ha='center', va='bottom')
        
        plt.tight_layout()
        plt.show()
        
        print("‚úÖ Memory visualization created successfully!")
        
    except ImportError:
        print("‚ùå Matplotlib not available - skipping visualization")
        print("   Install matplotlib to see memory usage graphs: pip install matplotlib")
    except Exception as e:
        print(f"‚ùå Error creating visualization: {e}")

# Create visualization
visualize_memory_usage(ydata_results, pysuricata_results)



üìä Creating memory usage visualization...
‚úÖ Memory visualization created successfully!


  plt.show()


In [None]:
# Final Summary and Report Access
def final_summary(ydata_results, pysuricata_results):
    """Provide final summary and report access information."""
    
    if ydata_results is None or pysuricata_results is None:
        print("‚ùå Cannot provide summary - one or both benchmarks failed")
        return
    
    print("\nüéâ BENCHMARK COMPLETED SUCCESSFULLY!")
    print("=" * 60)
    
    print(f"\nüìä Dataset Processed:")
    print(f"   Rows: {len(df):,}")
    print(f"   Columns: {len(df.columns):,}")
    print(f"   Memory footprint: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
    
    print(f"\nüìÑ Generated Reports:")
    print(f"   1. ydata-profiling: {ydata_results['html_file']} ({ydata_results['report_size']:.2f} MB)")
    print(f"   2. PySuricata: {pysuricata_results['html_file']} ({pysuricata_results['report_size']:.2f} MB)")
    
    print(f"\n‚ö° Performance Summary:")
    print(f"   ydata-profiling: {ydata_results['processing_time']:.2f}s, {ydata_results['memory_growth']:.2f}MB growth")
    print(f"   PySuricata: {pysuricata_results['processing_time']:.2f}s, {pysuricata_results['memory_growth']:.2f}MB growth")
    
    # Overall winner
    print(f"\nüèÜ Overall Assessment:")
    
    ydata_score = 0
    pysuricata_score = 0
    
    # Speed comparison
    if ydata_results['processing_time'] < pysuricata_results['processing_time']:
        ydata_score += 1
        print(f"   ‚ö° Speed: ydata-profiling wins")
    else:
        pysuricata_score += 1
        print(f"   ‚ö° Speed: PySuricata wins")
    
    # Memory comparison
    if ydata_results['memory_growth'] < pysuricata_results['memory_growth']:
        ydata_score += 1
        print(f"   üíæ Memory: ydata-profiling wins")
    else:
        pysuricata_score += 1
        print(f"   üíæ Memory: PySuricata wins")
    
    # Size comparison
    if ydata_results['report_size'] < pysuricata_results['report_size']:
        ydata_score += 1
        print(f"   üìÑ Size: ydata-profiling wins")
    else:
        pysuricata_score += 1
        print(f"   üìÑ Size: PySuricata wins")
    
    print(f"\nüéØ Final Score:")
    print(f"   ydata-profiling: {ydata_score}/3")
    print(f"   PySuricata: {pysuricata_score}/3")
    
    if ydata_score > pysuricata_score:
        print(f"\nüèÖ Winner: ydata-profiling!")
    elif pysuricata_score > ydata_score:
        print(f"\nüèÖ Winner: PySuricata!")
    else:
        print(f"\nü§ù Tie! Both tools have their strengths.")
    
    print(f"\nüåê To view the reports:")
    print(f"   Open the HTML files in your browser:")
    print(f"   ‚Ä¢ {ydata_results['html_file']}")
    print(f"   ‚Ä¢ {pysuricata_results['html_file']}")
    
    print(f"\nüìà Key Insights:")
    print(f"   ‚Ä¢ Both tools successfully processed {len(df):,} rows")
    print(f"   ‚Ä¢ Memory usage patterns differ significantly")
    print(f"   ‚Ä¢ Report sizes vary based on analysis depth")
    print(f"   ‚Ä¢ Choose based on your specific needs:")
    print(f"     - ydata-profiling: Comprehensive analysis, larger reports")
    print(f"     - PySuricata: Lightweight, memory-efficient")

# Display final summary
final_summary(ydata_results, pysuricata_results)



üéâ BENCHMARK COMPLETED SUCCESSFULLY!

üìä Dataset Processed:
   Rows: 1,000,000
   Columns: 13
   Memory footprint: 368.82 MB

üìÑ Generated Reports:
   1. ydata-profiling: ydata_profiling_report_1M.html (5.54 MB)
   2. PySuricata: pysuricata_report_1M.html (1.35 MB)

‚ö° Performance Summary:
   ydata-profiling: 60.71s, 1462.70MB growth
   PySuricata: 349.31s, -1132.16MB growth

üèÜ Overall Assessment:
   ‚ö° Speed: ydata-profiling wins
   üíæ Memory: PySuricata wins
   üìÑ Size: PySuricata wins

üéØ Final Score:
   ydata-profiling: 1/3
   PySuricata: 2/3

üèÖ Winner: PySuricata!

üåê To view the reports:
   Open the HTML files in your browser:
   ‚Ä¢ ydata_profiling_report_1M.html
   ‚Ä¢ pysuricata_report_1M.html

üìà Key Insights:
   ‚Ä¢ Both tools successfully processed 1,000,000 rows
   ‚Ä¢ Memory usage patterns differ significantly
   ‚Ä¢ Report sizes vary based on analysis depth
   ‚Ä¢ Choose based on your specific needs:
     - ydata-profiling: Comprehensive analysis