In [None]:
"""
HLLSet Immutable Batch Processing Tests

This notebook demonstrates the immutable batch processing API for HLLSet.
Key features:
- All operations return new instances (immutable)
- Batch processing is the primary mode
- Multi-batch processing with optional parallelization
- Union-based merging for accumulated results

IMPORTANT: If you've made changes to hllset.py, restart the kernel before running.
"""

# NOTE: After code changes, restart kernel (Kernel -> Restart)
from core.hllset import HLLSet, JULIA_AVAILABLE
import time
import numpy as np

# Show the backend flag imported from core.hllset, then a ready banner.
print(
    f"Julia Backend Available: {JULIA_AVAILABLE}",
    "Ready to test HLLSet batch processing!\n",
    sep="\n",
)

In [None]:
# Test 1: Basic batch creation
# Builds one HLLSet from a single token list and prints its repr,
# cardinality, and short name.
print("=" * 60, "Test 1: Basic Batch Creation", "=" * 60, sep="\n")

# Create HLLSet from a single batch of tokens
tokens = [
    'apple',
    'banana',
    'cherry',
    'date',
    'elderberry',
]
hll1 = HLLSet.from_batch(tokens)

print(
    f"Created HLLSet from batch of {len(tokens)} tokens",
    f"HLLSet: {hll1}",
    f"Cardinality: {hll1.cardinality():.2f}",
    f"Name: {hll1.short_name}",
    sep="\n",
)
print()

In [None]:
# Summary
# Prints a static recap of the API exercised by the test cells; nothing is
# computed here, the text below is emitted verbatim.
print("=" * 60, "SUMMARY: HLLSet Immutable Batch Processing API", "=" * 60, sep="\n")
print("""
Key Features:
✓ Fully immutable - all operations return new instances
✓ Batch processing is the primary mode (from_batch)
✓ Multi-batch processing with optional parallelization (from_batches)
  NOTE: Parallel processing automatically disabled with Julia backend
        due to thread-safety constraints (Julia is fast enough!)
✓ Efficient merging via union operations
✓ Content-addressed naming (deterministic hashes)
✓ Set operations: union, intersect, diff
✓ Similarity metrics: Jaccard, Cosine

Recommended Patterns:
1. Single batch:     HLLSet.from_batch(tokens)
2. Multiple batches: HLLSet.from_batches(batches)  # Sequential with Julia
                     HLLSet.from_batches(batches, parallel=True)  # Only without Julia
3. Merging:          HLLSet.merge([hll1, hll2, ...])
4. Accumulating:     hll1.union(hll2).union(hll3)

Legacy Methods (for backward compatibility):
- HLLSet.absorb(tokens) -> use from_batch() instead
- HLLSet.add(base, tokens) -> use from_batch() + union() instead
""")

In [None]:
# Test 9: Verify content addressing
# Prints the short names of HLLSets built from a reordered token list and
# from a different token list, plus whether their full names compare equal.
print("=" * 60, "Test 9: Content-Addressed Naming", "=" * 60, sep="\n")

# Same tokens in different order should produce same HLLSet
tokens_v1 = ['z', 'y', 'x', 'w', 'v']
tokens_v2 = ['v', 'w', 'x', 'y', 'z']
hll_v1, hll_v2 = HLLSet.from_batch(tokens_v1), HLLSet.from_batch(tokens_v2)

print(f"Tokens v1: {tokens_v1}", f"Tokens v2: {tokens_v2}", sep="\n")
print(
    f"\nHLL v1 name: {hll_v1.short_name}",
    f"HLL v2 name: {hll_v2.short_name}",
    f"Same content produces same name: {hll_v1.name == hll_v2.name}",
    sep="\n",
)

# Different tokens should produce different HLLSets
tokens_v3 = ['a', 'b', 'c', 'd', 'e']
hll_v3 = HLLSet.from_batch(tokens_v3)
print(
    f"\nDifferent tokens: {tokens_v3}",
    f"HLL v3 name: {hll_v3.short_name}",
    f"Different content produces different name: {hll_v1.name != hll_v3.name}",
    sep="\n",
)
print()

In [None]:
# Test 8: Recommended usage pattern
# Pattern 1 feeds a single token list (containing a repeated entry) to
# from_batch() and prints the resulting set and its cardinality.
print("=" * 60, "Test 8: Recommended Usage Pattern", "=" * 60, sep="\n")
print("Pattern 1: Single batch", "-" * 40, sep="\n")

data_batch = ['user1', 'user2', 'user3', 'user1']  # duplicates OK
hll = HLLSet.from_batch(data_batch)

print(
    f"Input: {data_batch}",
    f"Result: {hll}",
    f"Unique count: {hll.cardinality():.0f}",
    sep="\n",
)

In [None]:
# Pattern 2: several batches handed to from_batches() in one call, with the
# parallel flag requested.
print("\nPattern 2: Multiple batches (with parallel flag)", "-" * 40, sep="\n")
print(
    "NOTE: Parallel processing is automatically disabled when Julia backend",
    "      is available due to thread-safety. Julia sequential is still fast!",
    sep="\n",
)
print()

incoming_batches = [
    ['event_a_1', 'event_a_2', 'event_a_3'],
    ['event_b_1', 'event_b_2', 'event_b_3'],
    ['event_c_1', 'event_c_2', 'event_c_3'],
]

# parallel=True will be ignored if Julia backend is available (prevents crash)
hll_combined = HLLSet.from_batches(incoming_batches, parallel=True)

print(
    f"Input: {len(incoming_batches)} batches",
    f"Result: {hll_combined}",
    f"Total unique: {hll_combined.cardinality():.0f}",
    sep="\n",
)

In [None]:





# Pattern 3: build one HLLSet per day and combine them with union().
print("\nPattern 3: Accumulating over time", "-" * 40, sep="\n")

# Day 1: user_0 .. user_99
day1_data = [f'user_{i}' for i in range(100)]
hll_day1 = HLLSet.from_batch(day1_data)
print(f"Day 1: {hll_day1.cardinality():.0f} unique users")

# Day 2: user_50 .. user_149 (shares user_50 .. user_99 with day 1)
day2_data = [f'user_{i}' for i in range(50, 150)]
hll_day2 = HLLSet.from_batch(day2_data)
print(f"Day 2: {hll_day2.cardinality():.0f} unique users")

# Cumulative view across both days
hll_total = hll_day1.union(hll_day2)
print(f"Total unique users across both days: {hll_total.cardinality():.0f}")

# Pattern 4: compare two overlapping datasets via the two similarity methods.
print("\nPattern 4: Compare datasets", "-" * 40, sep="\n")

dataset_a = [f'item_{i}' for i in range(100)]
hll_a = HLLSet.from_batch(dataset_a)

dataset_b = [f'item_{i}' for i in range(50, 150)]
hll_b = HLLSet.from_batch(dataset_b)

similarity = hll_a.similarity(hll_b)
cosine_sim = hll_a.cosine(hll_b)

print(
    f"Dataset A: {hll_a.cardinality():.0f} items",
    f"Dataset B: {hll_b.cardinality():.0f} items",
    f"Jaccard similarity: {similarity:.2%}",
    f"Cosine similarity: {cosine_sim:.2%}",
    sep="\n",
)
print()

In [None]:
# Test 7: Large-scale batch processing
# Times from_batches() over the same generated batches with parallel=False
# and parallel=True, then compares the two resulting HLLSets by name.
print("=" * 60)
print("Test 7: Large-Scale Batch Processing")
print("=" * 60)

# Simulate streaming data with multiple batches
num_batches = 10
batch_size = 1000
print(f"Processing {num_batches} batches of {batch_size} tokens each")

# Generate batches; every token embeds its batch index, so all are distinct
large_batches = [
    [f'stream_token_{batch_idx}_{i}' for i in range(batch_size)]
    for batch_idx in range(num_batches)
]

# Sequential
# perf_counter() is a monotonic high-resolution clock meant for interval
# timing; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
hll_seq = HLLSet.from_batches(large_batches, parallel=False)
seq_time = time.perf_counter() - start
print(f"\nSequential processing: {seq_time:.4f}s")
print(f"Result: {hll_seq}")

# Parallel
start = time.perf_counter()
hll_par = HLLSet.from_batches(large_batches, parallel=True)
par_time = time.perf_counter() - start
print(f"\nParallel processing: {par_time:.4f}s")
print(f"Result: {hll_par}")
if JULIA_AVAILABLE:
    # Per this notebook's own notes, parallel=True is ignored with the Julia
    # backend, so the ratio below compares two sequential runs.
    print("NOTE: Julia backend active - parallel=True is ignored, both runs are sequential")
# Guard the ratio: a run that finishes below the timer resolution yields 0.0
if par_time > 0:
    print(f"Speedup: {seq_time / par_time:.2f}x")
else:
    print("Speedup: n/a (parallel run finished below timer resolution)")

# Verify results
print(f"\nResults match: {hll_seq.name == hll_par.name}")
print(f"Total estimated unique: {hll_seq.cardinality():.0f}")
print(f"Expected: {num_batches * batch_size} (all unique)")
print()

In [None]:
# Test 6: Set operations remain immutable
# Runs union / intersect / diff on two overlapping HLLSets and then checks
# that neither operand changed, by comparing against snapshots taken before
# the operations.
print("=" * 60)
print("Test 6: Set Operations (Immutable)")
print("=" * 60)

# Create two HLLSets
tokens_1 = [f'token_{i}' for i in range(50)]
tokens_2 = [f'token_{i}' for i in range(25, 75)]  # shares token_25..token_49 (25 tokens)

hll1 = HLLSet.from_batch(tokens_1)
hll2 = HLLSet.from_batch(tokens_2)

# Snapshot both operands so the immutability check below is a real
# comparison rather than a printed expectation.
hll1_before = (hll1.name, hll1.cardinality())
hll2_before = (hll2.name, hll2.cardinality())

print(f"HLL1 cardinality: {hll1_before[1]:.2f}")
print(f"HLL2 cardinality: {hll2_before[1]:.2f}")

# Union
hll_union = hll1.union(hll2)
print(f"\nUnion cardinality: {hll_union.cardinality():.2f}")
print("Expected: ~75 (50 + 50 - 25 overlap)")

# Intersection
hll_intersect = hll1.intersect(hll2)
print(f"\nIntersection cardinality: {hll_intersect.cardinality():.2f}")
print("Expected: ~25 (overlap)")

# Difference
hll_diff = hll1.diff(hll2)
print(f"\nDifference cardinality: {hll_diff.cardinality():.2f}")
print("Expected: ~25 (tokens only in hll1)")

# Verify originals unchanged: compare name and cardinality to the snapshots.
# Cardinality is an estimate, so it is compared to its own earlier value,
# not to the literal 50.0.
hll1_unchanged = (hll1.name, hll1.cardinality()) == hll1_before
hll2_unchanged = (hll2.name, hll2.cardinality()) == hll2_before
print(f"\nHLL1 still has original cardinality: {hll1.cardinality():.2f} (unchanged: {hll1_unchanged})")
print(f"HLL2 still has original cardinality: {hll2.cardinality():.2f} (unchanged: {hll2_unchanged})")
print()

In [None]:
# Test 5: Manual merge pattern (for overlapping batches)
# Builds three HLLSets independently, then combines them once via
# HLLSet.merge() and once via chained union(), and prints whether the two
# results carry the same name.
print("=" * 60, "Test 5: Manual Merge Pattern (Overlapping Batches)", "=" * 60, sep="\n")

# Create overlapping batches
batch_a = ['x', 'y', 'z', 'a', 'b']
batch_b = ['a', 'b', 'c', 'd', 'e']  # overlaps with batch_a
batch_c = ['m', 'n', 'o', 'p', 'q']  # no overlap

print(
    f"Batch A: {batch_a}",
    f"Batch B: {batch_b} (overlaps with A)",
    f"Batch C: {batch_c} (no overlap)",
    sep="\n",
)

# Process each batch independently
hll_a = HLLSet.from_batch(batch_a)
hll_b = HLLSet.from_batch(batch_b)
hll_c = HLLSet.from_batch(batch_c)

print(
    f"\nHLL A cardinality: {hll_a.cardinality():.2f}",
    f"HLL B cardinality: {hll_b.cardinality():.2f}",
    f"HLL C cardinality: {hll_c.cardinality():.2f}",
    sep="\n",
)

# Merge all
hll_merged = HLLSet.merge([hll_a, hll_b, hll_c])
print(
    f"\nMerged cardinality: {hll_merged.cardinality():.2f}",
    f"Expected unique: {len(set(batch_a + batch_b + batch_c))}",
    sep="\n",
)

# Manual union also works
hll_union = hll_a.union(hll_b).union(hll_c)
print(
    f"Union cardinality: {hll_union.cardinality():.2f}",
    f"Merge and union produce same result: {hll_merged.name == hll_union.name}",
    sep="\n",
)
print()

In [None]:
# Test 3: Multi-batch processing (sequential)
# Builds three disjoint token batches and times a single from_batches() call
# with parallel=False.
print("=" * 60)
print("Test 3: Multi-Batch Processing (Sequential)")
print("=" * 60)

# Create multiple batches; the size lives in one place so the generated data
# and the printed message cannot drift apart.
tokens_per_batch = 100
batch1 = [f'token_batch1_{i}' for i in range(tokens_per_batch)]
batch2 = [f'token_batch2_{i}' for i in range(tokens_per_batch)]
batch3 = [f'token_batch3_{i}' for i in range(tokens_per_batch)]
batches = [batch1, batch2, batch3]

print(f"Processing {len(batches)} batches, each with {tokens_per_batch} tokens")

# Sequential processing
# perf_counter() is a monotonic high-resolution clock meant for interval
# timing; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
hll_sequential = HLLSet.from_batches(batches, parallel=False)
sequential_time = time.perf_counter() - start

print(f"Sequential time: {sequential_time:.4f}s")
print(f"Result: {hll_sequential}")
print(f"Total estimated unique tokens: {hll_sequential.cardinality():.2f}")
print()

In [None]:
# Test 2: Immutability verification
# Calls HLLSet.add() on a base set and checks that the base is left
# untouched, by comparing it against a snapshot taken before the call.
print("=" * 60)
print("Test 2: Immutability Verification")
print("=" * 60)

# Create base HLLSet
base_tokens = ['a', 'b', 'c']
hll_base = HLLSet.from_batch(base_tokens)
print(f"Base HLLSet: {hll_base}")
print(f"Base cardinality: {hll_base.cardinality():.2f}")

# Snapshot the base before the operation. Cardinality is an estimate, so
# comparing it to the literal 3.0 could report a false "changed".
base_name_before = hll_base.name
base_cardinality_before = hll_base.cardinality()

# Add more tokens - should return NEW instance
new_tokens = ['d', 'e', 'f']
hll_new = HLLSet.add(hll_base, new_tokens)
print(f"\nAfter adding {new_tokens}:")
print(f"New HLLSet: {hll_new}")
print(f"New cardinality: {hll_new.cardinality():.2f}")
print(f"New instance returned: {hll_new is not hll_base}")

# Verify base is unchanged
base_unchanged = (
    hll_base.name == base_name_before
    and hll_base.cardinality() == base_cardinality_before
)
print(f"\nBase HLLSet after operation: {hll_base}")
print(f"Base cardinality after operation: {hll_base.cardinality():.2f}")
print(f"Base unchanged: {base_unchanged}")
print()