From ee5901f1013c33342ba880a2736654c4a719acb6 Mon Sep 17 00:00:00 2001 From: ForeverAngry <61765732+ForeverAngry@users.noreply.github.com> Date: Tue, 21 Oct 2025 20:37:03 -0400 Subject: [PATCH] add read support for parquet bloom filters Add read support for parquet bloom filters. Closes #2649 --- pyiceberg/expressions/bloom_filter.py | 216 ++++++++++++++++++++ pyiceberg/manifest.py | 38 ++++ pyiceberg/table/__init__.py | 28 +++ pyiceberg/table/bloom_filter.py | 232 ++++++++++++++++++++++ tests/table/test_bloom_filter.py | 276 ++++++++++++++++++++++++++ 5 files changed, 790 insertions(+) create mode 100644 pyiceberg/expressions/bloom_filter.py create mode 100644 pyiceberg/table/bloom_filter.py create mode 100644 tests/table/test_bloom_filter.py diff --git a/pyiceberg/expressions/bloom_filter.py b/pyiceberg/expressions/bloom_filter.py new file mode 100644 index 0000000000..5e637a1918 --- /dev/null +++ b/pyiceberg/expressions/bloom_filter.py @@ -0,0 +1,216 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
class BloomFilterEvaluator(BooleanExpressionVisitor[bool]):
    """Decide whether a data file may contain rows matching an expression.

    Every ``visit_*`` method answers the same question: ``True`` means the
    file *might* contain a matching row and must be kept, ``False`` means the
    file's bloom filters prove that it cannot. A bloom filter can only answer
    exact-membership questions, so only ``EqualTo`` and ``In`` predicates are
    ever able to return ``False``; every other predicate keeps the file.
    """

    def __init__(self, data_file: DataFile, schema: Schema):
        """Initialize the bloom filter evaluator.

        Args:
            data_file: The data file whose per-column bloom filters are probed.
            schema: The table schema for column resolution (stored, not
                otherwise used by this class).
        """
        self.data_file = data_file
        self.schema = schema

    def visit_true(self) -> bool:
        """Visit AlwaysTrue - file might contain matching rows."""
        return True

    def visit_false(self) -> bool:
        """Visit AlwaysFalse - file definitely contains no matching rows."""
        return False

    def visit_not(self, child_result: bool) -> bool:
        """Visit Not - always keep the file.

        ``child_result`` only says whether the file *might* contain rows that
        match the child expression. If it is ``True`` the file can still hold
        rows that do not match the child; if it is ``False`` every row fails
        the child and therefore satisfies the negation. Either way the file
        may contain rows matching ``Not(child)``, so negating the flag (as a
        plain boolean evaluator would) could prune files that hold results.
        """
        return True

    def visit_and(self, left_result: bool, right_result: bool) -> bool:
        """Visit And - both conditions must allow the file."""
        return left_result and right_result

    def visit_or(self, left_result: bool, right_result: bool) -> bool:
        """Visit Or - at least one condition must allow the file."""
        return left_result or right_result

    def visit_unbound_predicate(self, predicate: object) -> bool:
        """Visit an unbound predicate - conservatively allow the file."""
        # Without a bound term there is no field id to look a filter up by.
        return True

    def visit_bound_predicate(self, predicate: BoundPredicate[Any]) -> bool:
        """Visit a bound predicate and evaluate using bloom filter if available.

        Args:
            predicate: The bound predicate to evaluate.

        Returns:
            False only when a bloom filter proves no row can match.
        """
        if isinstance(predicate, BoundUnaryPredicate):
            return self._visit_unary_predicate(predicate)
        if isinstance(predicate, BoundLiteralPredicate):
            return self._visit_literal_predicate(predicate)
        if isinstance(predicate, BoundSetPredicate):
            return self._visit_set_predicate(predicate)
        # Unknown predicate type - be conservative and allow the file.
        return True

    def visit_predicate(self, predicate: BoundPredicate[Any]) -> bool:
        """Evaluate a bound predicate; kept as an alias of visit_bound_predicate."""
        return self.visit_bound_predicate(predicate)

    def _visit_unary_predicate(self, predicate: BoundUnaryPredicate[Any]) -> bool:
        """Evaluate unary predicates (IsNull, NotNull, IsNaN, NotNaN).

        None of them is an exact-value membership test, so a bloom filter
        cannot rule the file out.
        """
        return True

    def _visit_literal_predicate(self, predicate: BoundLiteralPredicate[Any]) -> bool:
        """Evaluate single-literal predicates; only EqualTo can prune."""
        if isinstance(predicate, BoundEqualTo):
            return self._might_contain_any(predicate, (predicate.literal,))
        # NotEqualTo, range comparisons and (Not)StartsWith are not exact
        # membership tests, so the filter is not even loaded for them.
        return True

    def _visit_set_predicate(self, predicate: BoundSetPredicate[Any]) -> bool:
        """Evaluate set predicates; only In can prune."""
        if isinstance(predicate, BoundIn):
            # The file is prunable only if *none* of the values may be present.
            return self._might_contain_any(predicate, predicate.literals)
        # NotIn cannot be answered by a membership filter.
        return True

    def _might_contain_any(self, predicate: BoundPredicate[Any], literals: Any) -> bool:
        """Return True if the column's bloom filter may contain any literal.

        Args:
            predicate: Bound predicate whose term identifies the column.
            literals: Iterable of literal objects exposing ``.value``.

        Returns:
            True when there is no filter for the column, when the filter
            cannot be read or probed, or when at least one literal may be
            present; False when the filter rules out every literal.
        """
        column_id = predicate.term.ref().field.field_id
        bloom_filter_bytes = self.data_file.get_bloom_filter(column_id)
        if bloom_filter_bytes is None:
            # No bloom filter for this column - can't prune.
            return True

        try:
            bloom_filter = BloomFilter.from_bytes(bloom_filter_bytes)
            return any(bloom_filter.might_contain(literal.value) for literal in literals)
        except Exception:
            # Deliberate best effort: pruning is only an optimisation, so a
            # corrupt filter or a literal type the filter cannot hash must
            # keep the file rather than fail the scan.
            return True
@property
def bloom_filter_bytes(self) -> Dict[int, bytes] | None:
    """Return the per-column bloom filter map stored on this record.

    Returns:
        The value held at position 16 of ``self._data`` when the record has
        more than 16 entries, otherwise None.
    """
    # NOTE(review): position 16 directly follows sort_order_id (position 15).
    # The struct registered under key 3 appends this field after its
    # content-length field instead - confirm position 16 is right for it.
    record = self._data
    return record[16] if len(record) > 16 else None


def get_bloom_filter(self, column_id: int) -> bytes | None:
    """Look up the bloom filter bytes for one column.

    Args:
        column_id: The column ID to get the bloom filter for.

    Returns:
        The bytes stored for ``column_id``, or None when the record has no
        bloom filter map, the map is empty, or the column is not in it.
    """
    filters = self.bloom_filter_bytes
    if not filters or column_id not in filters:
        return None
    return filters[column_id]
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 8b7f4d165a..56391be662 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1926,6 +1926,30 @@ def _build_residual_evaluator(self, spec_id: int) -> Callable[[DataFile], Residu ) ) + def _should_keep_file_with_bloom_filter(self, data_file: DataFile) -> bool: + """Check if a data file should be kept based on bloom filter evaluation. + + Args: + data_file: The data file to evaluate. + + Returns: + True if the file should be kept, False if it can be pruned. + """ + if data_file.bloom_filter_bytes is None: + # No bloom filter for this file + return True + + try: + from pyiceberg.expressions.bloom_filter import BloomFilterEvaluator + from pyiceberg.expressions.visitors import visit + + # Use the bloom filter evaluator to check if the file might contain matching rows + evaluator = BloomFilterEvaluator(data_file, self.table_metadata.schema()) + return visit(self.row_filter, evaluator) + except Exception: + # If there's any error evaluating bloom filters, be conservative and keep the file + return True + @staticmethod def _check_sequence_number(min_sequence_number: int, manifest: ManifestFile) -> bool: """Ensure that no manifests are loaded that contain deletes that are older than the data. 
class BloomFilter:
    """Bloom filter over scalar column values.

    A bloom filter is a space-efficient probabilistic set: it can report that
    a value is *definitely not* present or that it *might* be present. Values
    are converted to bytes by ``_to_bytes``, hashed with 128-bit MurmurHash3
    once per hash function, and each hash selects one bit of ``bit_array``.

    NOTE(review): this bit layout and hash are specific to this class. The
    Parquet format specifies split-block bloom filters hashed with xxHash64,
    so confirm that bytes handed to ``from_bytes`` were produced by this
    class before relying on ``False`` answers for Parquet files.
    """

    # Not referenced by any method of this class.
    MAGIC = b"ORC"
    SERIAL_NUMBER = 4

    def __init__(self, num_bytes: int, num_hash_functions: int = 1):
        """Initialize an empty bloom filter.

        Args:
            num_bytes: Size of the bloom filter in bytes.
            num_hash_functions: Number of hash functions to use (default: 1).

        Raises:
            ValueError: If ``num_bytes`` is not positive (bit positions are
                taken modulo the bit count, so a zero-length filter cannot be
                probed) or ``num_hash_functions`` is less than 1.
        """
        if num_bytes <= 0:
            raise ValueError(f"Bloom filter size must be at least 1 byte, got {num_bytes}")
        if num_hash_functions < 1:
            raise ValueError(f"Number of hash functions must be at least 1, got {num_hash_functions}")
        self.num_bytes = num_bytes
        self.num_hash_functions = num_hash_functions
        self.bit_array = bytearray(num_bytes)

    def _bit_positions(self, value: str | bytes | int | float | bool) -> list[tuple[int, int]]:
        """Return the (byte index, bit index) pair selected by each hash function."""
        value_bytes = self._to_bytes(value)
        num_bits = self.num_bytes * 8
        positions = []
        for seed in range(self.num_hash_functions):
            # Appending the function index yields a distinct hash input per function.
            hash_value = mmh3.hash128(value_bytes + struct.pack(">I", seed), signed=False)
            positions.append(divmod(hash_value % num_bits, 8))
        return positions

    def add(self, value: str | bytes | int | float | bool | None) -> None:
        """Add a value to the bloom filter.

        Args:
            value: The value to add. ``None`` is ignored.

        Raises:
            TypeError: If ``value`` has a type ``_to_bytes`` does not support.
        """
        if value is None:
            return
        for byte_index, bit_index in self._bit_positions(value):
            self.bit_array[byte_index] |= 1 << bit_index

    def might_contain(self, value: str | bytes | int | float | bool | None) -> bool:
        """Check if a value might be in the bloom filter.

        Args:
            value: The value to check.

        Returns:
            False if the value is definitely not in the set (always the case
            for ``None``, which is never added); True if it might be.

        Raises:
            TypeError: If ``value`` has a type ``_to_bytes`` does not support.
        """
        if value is None:
            return False
        # A single unset bit proves the value was never added.
        return all(
            self.bit_array[byte_index] & (1 << bit_index)
            for byte_index, bit_index in self._bit_positions(value)
        )

    def to_bytes(self) -> bytes:
        """Serialize the bloom filter to bytes.

        Returns:
            The raw bit array. The number of hash functions is not included.
        """
        return bytes(self.bit_array)

    @classmethod
    def from_bytes(cls, data: bytes, num_hash_functions: int = 1) -> BloomFilter:
        """Deserialize a bloom filter from bytes.

        Args:
            data: Serialized bloom filter bytes (the raw bit array).
            num_hash_functions: Number of hash functions used in the filter.

        Returns:
            BloomFilter instance whose bit array is a copy of ``data``.

        Raises:
            ValueError: If ``data`` is empty or ``num_hash_functions`` < 1.
        """
        bf = cls(len(data), num_hash_functions)
        bf.bit_array = bytearray(data)
        return bf

    @staticmethod
    def _to_bytes(value: str | bytes | int | float | bool) -> bytes:
        """Convert a value to bytes for hashing.

        Args:
            value: Value to convert.

        Returns:
            Bytes representation of the value: bytes unchanged, str as UTF-8,
            bool as one byte, int as big-endian two's complement (4 bytes if
            it fits in 32 bits, 8 bytes if it fits in 64 bits, otherwise the
            minimal signed width), float as a big-endian double.

        Raises:
            TypeError: If the value's type is not supported.
        """
        if isinstance(value, bytes):
            return value
        if isinstance(value, str):
            return value.encode("utf-8")
        if isinstance(value, bool):
            # bool before int because bool is a subclass of int
            return struct.pack(">B", 1 if value else 0)
        if isinstance(value, int):
            if -(2**31) <= value < 2**31:
                return struct.pack(">i", value)
            if -(2**63) <= value < 2**63:
                return struct.pack(">q", value)
            # struct has no format wider than 64 bits; use the smallest
            # signed big-endian width that holds the value instead of failing.
            return value.to_bytes((value.bit_length() + 8) // 8, "big", signed=True)
        if isinstance(value, float):
            return struct.pack(">d", value)
        raise TypeError(f"Unsupported type for bloom filter: {type(value)}")

    def __repr__(self) -> str:
        """Return string representation of the bloom filter."""
        return f"BloomFilter(num_bytes={self.num_bytes}, num_hash_functions={self.num_hash_functions})"


class BloomFilterBuilder:
    """Builder for creating bloom filters with specific false positive rates."""

    @staticmethod
    def optimal_num_bytes(num_elements: int, false_positive_rate: float = 0.05) -> int:
        """Calculate optimal bloom filter size in bytes given element count and FPP.

        Uses the formula: m = -(n * ln(p)) / (ln(2)^2)
        where m is the number of bits, n the number of elements and p the
        false positive rate; the bit count is then rounded up to whole bytes.

        Args:
            num_elements: Expected number of elements to add.
            false_positive_rate: Desired false positive rate (default: 0.05).

        Returns:
            Optimal number of bytes for the bloom filter (at least 1).

        Raises:
            ValueError: If ``num_elements`` is positive and the rate is not
                strictly between 0 and 1.
        """
        import math

        if num_elements <= 0:
            return 1
        if false_positive_rate <= 0 or false_positive_rate >= 1:
            raise ValueError("False positive rate must be between 0 and 1")

        num_bits = -(num_elements * math.log(false_positive_rate)) / (math.log(2) ** 2)
        return max(1, int(math.ceil(num_bits / 8)))

    @staticmethod
    def optimal_num_hash_functions(num_bytes: int, num_elements: int) -> int:
        """Calculate optimal number of hash functions.

        Uses the formula: k = (m / n) * ln(2)
        where m is the number of *bits* in the filter and n is the number of
        elements, so the byte count is multiplied by 8 before it is used.

        Args:
            num_bytes: Size of bloom filter in bytes.
            num_elements: Expected number of elements to add.

        Returns:
            Optimal number of hash functions (at least 1).
        """
        import math

        if num_elements <= 0 or num_bytes <= 0:
            return 1
        k = (num_bytes * 8 / num_elements) * math.log(2)
        return max(1, int(round(k)))

    @staticmethod
    def create(num_elements: int, false_positive_rate: float = 0.05) -> BloomFilter:
        """Create an optimally configured bloom filter.

        Args:
            num_elements: Expected number of elements to add.
            false_positive_rate: Desired false positive rate (default: 0.05).

        Returns:
            Empty BloomFilter sized by ``optimal_num_bytes`` and using
            ``optimal_num_hash_functions`` hash functions.
        """
        num_bytes = BloomFilterBuilder.optimal_num_bytes(num_elements, false_positive_rate)
        num_hash_functions = BloomFilterBuilder.optimal_num_hash_functions(num_bytes, num_elements)
        return BloomFilter(num_bytes, num_hash_functions)
class TestBloomFilter:
    """Test cases for BloomFilter class."""

    @staticmethod
    def _add_then_check(values, num_bytes=256, num_hash_functions=3):
        """Add every value to a fresh filter, then assert each one is reported."""
        bloom = BloomFilter(num_bytes, num_hash_functions)
        for item in values:
            bloom.add(item)
        for item in values:
            assert bloom.might_contain(item)

    def test_bloom_filter_creation(self):
        """A new filter records its size, hash count and bit array length."""
        bloom = BloomFilter(100, 3)
        assert bloom.num_bytes == 100
        assert bloom.num_hash_functions == 3
        assert len(bloom.bit_array) == 100

    def test_add_and_might_contain_string(self):
        """Added strings are reported as possibly present."""
        self._add_then_check(["hello", "world"])

    def test_add_and_might_contain_integer(self):
        """Added integers are reported as possibly present."""
        self._add_then_check([42, 100, 9999])

    def test_add_and_might_contain_float(self):
        """Added floats are reported as possibly present."""
        self._add_then_check([3.14, 2.71])

    def test_add_and_might_contain_bytes(self):
        """Added bytes values are reported as possibly present."""
        self._add_then_check([b"hello", b"world"])

    def test_add_and_might_contain_bool(self):
        """Added booleans are reported as possibly present."""
        self._add_then_check([True, False])

    def test_add_null_value(self):
        """Adding None is a no-op and None is never reported as present."""
        bloom = BloomFilter(256, 3)
        bloom.add(None)
        assert not bloom.might_contain(None)

    def test_serialization_and_deserialization(self):
        """A filter survives a to_bytes / from_bytes round trip."""
        added = ["hello", "world", 42]
        source = BloomFilter(256, 3)
        for item in added:
            source.add(item)

        payload = source.to_bytes()
        assert isinstance(payload, bytes)
        assert len(payload) == 256

        restored = BloomFilter.from_bytes(payload, 3)
        assert restored.num_bytes == 256
        assert restored.num_hash_functions == 3
        for item in added:
            assert restored.might_contain(item)

    def test_false_positives_with_small_filter(self):
        """An added value is found even in a very small filter."""
        bloom = BloomFilter(16, 1)
        bloom.add("value1")
        # Other values may also be reported (false positives); only the
        # absence of false negatives is asserted here.
        assert bloom.might_contain("value1")


class TestBloomFilterBuilder:
    """Test cases for BloomFilterBuilder class."""

    def test_optimal_num_bytes_calculation(self):
        """More elements and a lower false positive rate need more space."""
        smaller = BloomFilterBuilder.optimal_num_bytes(1000, 0.05)
        assert smaller > 0

        larger = BloomFilterBuilder.optimal_num_bytes(10000, 0.01)
        assert larger > smaller

    def test_optimal_num_hash_functions_calculation(self):
        """A larger filter for the same element count never uses fewer hashes."""
        baseline = BloomFilterBuilder.optimal_num_hash_functions(256, 1000)
        assert baseline >= 1

        doubled = BloomFilterBuilder.optimal_num_hash_functions(512, 1000)
        assert doubled >= baseline

    def test_create_optimally_configured_filter(self):
        """A builder-created filter is usable and has no false negatives."""
        bloom = BloomFilterBuilder.create(1000, 0.05)
        assert bloom.num_bytes > 0
        assert bloom.num_hash_functions >= 1

        names = [f"value_{i}" for i in range(100)]
        for name in names:
            bloom.add(name)
        for name in names:
            assert bloom.might_contain(name)

    def test_create_with_different_false_positive_rates(self):
        """A lower false positive rate does not shrink the filter."""
        num_elements = 1000
        loose = BloomFilterBuilder.create(num_elements, 0.05)
        strict = BloomFilterBuilder.create(num_elements, 0.01)
        assert strict.num_bytes >= loose.num_bytes

    def test_invalid_false_positive_rate(self):
        """Rates outside the open interval (0, 1) are rejected."""
        for bad_rate in (0.0, 1.0, -0.1):
            with pytest.raises(ValueError):
                BloomFilterBuilder.optimal_num_bytes(1000, bad_rate)


class TestBloomFilterEdgeCases:
    """Test edge cases for bloom filter."""

    def test_empty_filter(self):
        """A filter with no bits set reports nothing as present."""
        bloom = BloomFilter(10, 1)
        assert not bloom.might_contain("anything")

    def test_large_filter(self):
        """Thousands of added values are all still reported."""
        bloom = BloomFilter(10000, 5)
        names = [f"value_{i}" for i in range(5000)]
        for name in names:
            bloom.add(name)
        for name in names:
            assert bloom.might_contain(name)

    def test_multiple_hash_functions(self):
        """Filters with one and with five hash functions both find added values."""
        single = BloomFilter(1000, 1)
        several = BloomFilter(1000, 5)
        names = [f"value_{i}" for i in range(100)]

        for name in names:
            single.add(name)
            several.add(name)

        for name in names:
            assert single.might_contain(name)
            assert several.might_contain(name)

    def test_negative_and_large_integers(self):
        """Negative, 64-bit and wider-than-64-bit integers are all supported."""
        bloom = BloomFilter(256, 3)

        bloom.add(-42)
        assert bloom.might_contain(-42)

        large_int = 9223372036854775807  # Max 64-bit int
        bloom.add(large_int)
        assert bloom.might_contain(large_int)

        very_large_int = 92233720368547758070  # beyond 64-bit
        bloom.add(very_large_int)
        assert bloom.might_contain(very_large_int)

    def test_mixed_types(self):
        """Values of different types can share one filter."""
        values = ["string", 42, 3.14, b"bytes", True, False]
        bloom = BloomFilter(512, 3)
        for item in values:
            bloom.add(item)
        for item in values:
            assert bloom.might_contain(item)