# Bloom Filter Performance Analysis

This notebook analyzes the performance of different Bloom filter implementations in our LSM tree.

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Global look-and-feel shared by every figure below: grid style first, then
# the seaborn colour cycle, then the default canvas size and base font size.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('deep')
plt.rcParams.update({
    'figure.figsize': [12, 8],
    'font.size': 12,
})

## Load Benchmark Data

In [None]:
# Read the latest benchmark results (relative path) into a DataFrame.
benchmark_file = '../benchmarks/bloom_bench_results_latest.csv'
df = pd.read_csv(filepath_or_buffer=benchmark_file)

# Preview the first five rows of the loaded results.
df.head(n=5)

## Prepare Data for Analysis

In [None]:
# Preserve the untouched benchmark name in its own column so this cell can be
# re-run safely: 'Implementation' is overwritten below, and re-parsing the
# already-mapped display names would turn every row into NaN.
if 'Raw_Implementation' not in df.columns:
    df['Raw_Implementation'] = df['Implementation']

# Benchmark names are underscore-separated: the first token selects the
# implementation and the remaining tokens describe the operation.
# NOTE(review): assumes names shaped like '<impl>_<op>[_batch]' -- confirm
# against the benchmark harness output.
raw_names = df['Raw_Implementation']
implementation_key = raw_names.str.split('_').str[0]
last_token = raw_names.str.split('_').str[-1]
operation_suffix = raw_names.str.split('_', n=1).str[-1]

# Create a more readable version of the implementation names
implementation_mapping = {
    'bloom': 'Custom Bloom',
    'speeddb': 'SpeedDB Bloom',
    'rocksdb': 'RocksDB Bloom',
    'fastbloom': 'FastBloom'
}
df['Implementation'] = implementation_key.map(implementation_mapping)

# Readable operation names keyed by the last token of the benchmark name.
operation_mapping = {
    'insert': 'Insert',
    'lookup': 'Lookup',
    'fp': 'False Positive',
    'batch': 'Batch',
    'concurrent': 'Concurrent'
}

# Multi-token suffixes that must not collapse into the plain 'Batch' (insert)
# label; the later cells look for exactly these two display names.
compound_operation_mapping = {
    'lookup_batch': 'Lookup Batch',
    'fp_batch': 'False Positive Batch'
}

# Prefer the compound label when the whole suffix is known, otherwise fall
# back to the last-token lookup.
compound_label = operation_suffix.map(compound_operation_mapping)
simple_label = last_token.map(operation_mapping)
df['Operation'] = compound_label.where(compound_label.notna(), simple_label)

# Surface names that neither mapping recognised instead of silently dropping
# them from every chart further down.
unmapped_rows = df['Implementation'].isna() | df['Operation'].isna()
if unmapped_rows.any():
    unknown_names = df.loc[unmapped_rows, 'Raw_Implementation'].unique().tolist()
    print(f"Warning: {len(unknown_names)} benchmark name(s) not recognized and left as NaN: {unknown_names}")

# Group the operations into categories. na=False keeps the masks strictly
# boolean; a NaN in the mask would make the .loc assignment raise.
df['Category'] = 'Standard'
df.loc[df['Operation'].str.contains('Batch', na=False), 'Category'] = 'Batch'
df.loc[df['Operation'].str.contains('Concurrent', na=False), 'Category'] = 'Concurrent'

# Display the processed data
df.head()

## Performance Comparison - Basic Operations

In [None]:
# Filter for standard operations only (no batch/concurrent operations)
standard_ops = df[df['Category'] == 'Standard']

# Grouped bar chart: one cluster per implementation, one bar per operation.
# An explicit Axes is used so every call below targets this figure only.
fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(x='Implementation', y='Time_ns', hue='Operation', data=standard_ops, ax=ax)
ax.set_title('Performance Comparison of Bloom Filter Implementations', fontsize=16)
ax.set_xlabel('Implementation', fontsize=14)
ax.set_ylabel('Time per Operation (ns)', fontsize=14)
ax.tick_params(axis='both', labelsize=12)
ax.legend(title='Operation', fontsize=12, title_fontsize=14)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Label each bar with its height. Going through the bar containers (rather
# than every entry of ax.patches) restricts the labels to real bars;
# NOTE(review): ax.patches may also hold legend proxy rectangles depending on
# the seaborn version, which showed up as stray labels -- confirm locally.
for bar_group in ax.containers:
    if hasattr(bar_group, 'datavalues'):  # skip anything that is not a bar container
        ax.bar_label(bar_group, fmt='%.1f', padding=5)

fig.tight_layout()
fig.savefig('../benchmarks/bloom_filter_standard_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## Batch Operations Performance

In [None]:
# Filter for our custom Bloom filter implementation to compare standard vs batch operations.
# .copy() gives an independent frame, so adding 'OpGroup' below neither touches
# `df` nor triggers a SettingWithCopyWarning.
custom_bloom = df[df['Implementation'] == 'Custom Bloom'].copy()

# Axis labels for the plot. Operations that are not listed keep their own name.
op_group_labels = {
    'Batch': 'Insert (Batch)',
    'Concurrent': 'Insert (Concurrent)',
    'Lookup Batch': 'Lookup (Batch)',
    'False Positive': 'FP Test',
    'False Positive Batch': 'FP Test (Batch)'
}
relabelled = custom_bloom['Operation'].map(op_group_labels)
custom_bloom['OpGroup'] = relabelled.where(relabelled.notna(), custom_bloom['Operation'])

# Create a bar chart comparing standard, batch, and concurrent operations
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(x='OpGroup', y='Time_ns', data=custom_bloom, ax=ax)
ax.set_title('Effect of Batch Operations on Bloom Filter Performance', fontsize=16)
ax.set_xlabel('Operation Type', fontsize=14)
ax.set_ylabel('Time per Operation (ns)', fontsize=14)
ax.tick_params(axis='both', labelsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Label each bar with its height, restricted to the bar containers so only
# real bars are annotated.
for bar_group in ax.containers:
    if hasattr(bar_group, 'datavalues'):  # skip anything that is not a bar container
        ax.bar_label(bar_group, fmt='%.1f', padding=5)

fig.tight_layout()
fig.savefig('../benchmarks/bloom_filter_batch_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## Performance Improvement from Batching

In [None]:
# Baseline: one mean time per standard operation of the custom filter.
# Aggregating first guarantees a single reference row per operation, so the
# merge below cannot multiply rows when the CSV holds repeated measurements.
# Rows whose operation is NaN are dropped by the groupby.
standard_ops_ref = (
    custom_bloom.loc[custom_bloom['Category'] == 'Standard', ['Operation', 'Time_ns']]
    .groupby('Operation', as_index=False)['Time_ns']
    .mean()
    .rename(columns={'Time_ns': 'Standard_Time_ns'})
)

# Map batch operations to their standard counterparts
batch_mapping = {
    'Batch': 'Insert',
    'Concurrent': 'Insert',
    'Lookup Batch': 'Lookup',
    'False Positive Batch': 'False Positive'
}

# Attach the baseline time to every non-standard row. The inner merge drops
# rows with no mapped counterpart; validate= asserts the baseline is unique.
batch_ops = custom_bloom[custom_bloom['Category'] != 'Standard'].copy()
batch_ops['Standard_Op'] = batch_ops['Operation'].map(batch_mapping)
batch_ops = pd.merge(
    batch_ops,
    standard_ops_ref,
    left_on='Standard_Op',
    right_on='Operation',
    suffixes=('', '_std'),
    validate='many_to_one',
)

# Speedup > 1 (Improvement > 0) means the batched variant took less time per
# operation than its standard counterpart.
batch_ops['Speedup'] = batch_ops['Standard_Time_ns'] / batch_ops['Time_ns']
batch_ops['Improvement'] = (batch_ops['Speedup'] - 1) * 100

# Bar chart of the percentage improvement per batched operation
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Operation', y='Improvement', data=batch_ops, ax=ax)
ax.set_title('Performance Improvement from Batched Operations', fontsize=16)
ax.set_xlabel('Operation Type', fontsize=14)
ax.set_ylabel('Improvement (%)', fontsize=14)
ax.axhline(y=0, color='r', linestyle='--')  # break-even line
ax.tick_params(axis='both', labelsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Label each bar; bar_label puts the text beyond the bar end, so labels on
# negative bars sit outside the bar rather than on top of it.
for bar_group in ax.containers:
    if hasattr(bar_group, 'datavalues'):  # skip anything that is not a bar container
        ax.bar_label(bar_group, fmt='%.1f%%', padding=5)

fig.tight_layout()
fig.savefig('../benchmarks/bloom_filter_improvement.png', dpi=300, bbox_inches='tight')
plt.show()

## Performance Comparison with FastBloom

In [None]:
# Filter data for comparison with FastBloom
fastbloom_comparison = standard_ops[standard_ops['Implementation'].isin(['Custom Bloom', 'FastBloom'])].copy()

# Reference frame: one mean FastBloom time per operation. Aggregating keeps
# the merge below many-to-one even if the CSV holds repeated measurements.
fastbloom_ref = (
    fastbloom_comparison.loc[
        fastbloom_comparison['Implementation'] == 'FastBloom', ['Operation', 'Time_ns']
    ]
    .groupby('Operation', as_index=False)['Time_ns']
    .mean()
    .rename(columns={'Time_ns': 'FastBloom_Time_ns'})
)

# Relative performance: positive means Custom Bloom needed less time than
# FastBloom for that operation, negative means it needed more.
custom_rows = fastbloom_comparison[fastbloom_comparison['Implementation'] == 'Custom Bloom']
comparison = pd.merge(custom_rows, fastbloom_ref, on='Operation', validate='many_to_one')
comparison['Relative_Performance'] = (comparison['FastBloom_Time_ns'] / comparison['Time_ns'] - 1) * 100

# Create a bar chart for the comparison
fig, ax = plt.subplots(figsize=(10, 8))
bars = ax.bar(comparison['Operation'], comparison['Relative_Performance'], color=sns.color_palette('deep')[0])
ax.axhline(y=0, color='r', linestyle='--')  # parity with FastBloom
ax.set_title('Custom Bloom Filter Performance Relative to FastBloom', fontsize=16)
ax.set_xlabel('Operation Type', fontsize=14)
ax.set_ylabel('Relative Performance (%)', fontsize=14)
ax.tick_params(axis='both', labelsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Annotate each bar at its own centre (taken from the bar itself, not from a
# running index): green above the bar for gains, red below it for losses.
for bar, v in zip(bars, comparison['Relative_Performance']):
    is_gain = v >= 0
    ax.annotate(f'{v:.1f}%',
                (bar.get_x() + bar.get_width() / 2., v),
                textcoords="offset points",
                xytext=(0, 5 if is_gain else -15),
                ha='center',
                color='green' if is_gain else 'red',
                fontweight='bold')

fig.tight_layout()
fig.savefig('../benchmarks/bloom_filter_vs_fastbloom.png', dpi=300, bbox_inches='tight')
plt.show()

## Table of Results

In [None]:
# Implementation x Operation table of times for the standard benchmarks
# (pivot_table aggregates duplicate cells with its default aggregation).
summary = standard_ops.pivot_table(
    index='Implementation',
    columns='Operation',
    values='Time_ns',
)

# Percentage difference of every implementation against the Custom Bloom row.
reference = summary.loc['Custom Bloom']
relative = summary.div(reference).mul(100).sub(100)

# Round for the report and turn the index back into a regular column.
summary_formatted = summary.round(2).reset_index()
relative_formatted = relative.round(2).reset_index()

# Show each table under its heading, then persist it next to the benchmarks.
report_tables = (
    ("Absolute Performance (ns per operation):",
     summary_formatted,
     '../benchmarks/bloom_filter_summary_absolute.csv'),
    ("\nRelative Performance (% compared to Custom Bloom):",
     relative_formatted,
     '../benchmarks/bloom_filter_summary_relative.csv'),
)
for heading, table, csv_path in report_tables:
    print(heading)
    display(table)
    table.to_csv(csv_path, index=False)

## Summary Statistics for the Report

In [None]:
def _first_value(frame, operation, column):
    """Return `column` from the first row of `frame` whose 'Operation' equals
    `operation`, or NaN when no such row exists (instead of raising IndexError)."""
    matches = frame.loc[frame['Operation'] == operation, column]
    return matches.iloc[0] if not matches.empty else float('nan')


def _describe_vs_fastbloom(percent):
    """Format a signed relative-performance percentage as '<magnitude>% faster|slower'.

    The magnitude is printed unsigned so a negative value reads '12.3% slower'
    rather than '-12.3% slower'. Returns 'n/a' when the value is missing.
    """
    if pd.isna(percent):
        return 'n/a'
    return f"{abs(percent):.1f}% {'faster' if percent > 0 else 'slower'}"


# Calculate key performance statistics
batch_speedup = batch_ops['Speedup'].mean()
batch_improvement = batch_ops['Improvement'].mean()
concurrent_speedup = _first_value(batch_ops, 'Concurrent', 'Speedup')
concurrent_improvement = _first_value(batch_ops, 'Concurrent', 'Improvement')
vs_fastbloom_insert = _first_value(comparison, 'Insert', 'Relative_Performance')
vs_fastbloom_lookup = _first_value(comparison, 'Lookup', 'Relative_Performance')

# Print summary statistics for the report
print(f"Average speedup from batch operations: {batch_speedup:.2f}x ({batch_improvement:.1f}% improvement)")
print(f"Concurrent batch insert speedup: {concurrent_speedup:.2f}x ({concurrent_improvement:.1f}% improvement)")
print(f"Custom Bloom vs FastBloom insert: {_describe_vs_fastbloom(vs_fastbloom_insert)}")
print(f"Custom Bloom vs FastBloom lookup: {_describe_vs_fastbloom(vs_fastbloom_lookup)}")

# Generate summary chart data for the report. The FastBloom rows keep their
# sign here: positive means Custom Bloom was faster.
summary_data = {
    "Metric": [
        "Average Batch Speedup",
        "Concurrent Insert Speedup",
        "Insert vs FastBloom",
        "Lookup vs FastBloom"
    ],
    "Value": [
        f"{batch_speedup:.2f}x",
        f"{concurrent_speedup:.2f}x",
        f"{vs_fastbloom_insert:.1f}%",
        f"{vs_fastbloom_lookup:.1f}%"
    ]
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('../benchmarks/bloom_filter_key_stats.csv', index=False)
display(summary_df)