From ee5901f1013c33342ba880a2736654c4a719acb6 Mon Sep 17 00:00:00 2001
From: ForeverAngry <61765732+ForeverAngry@users.noreply.github.com>
Date: Tue, 21 Oct 2025 20:37:03 -0400
Subject: [PATCH] Add read support for parquet bloom filters

Closes #2649

---
 pyiceberg/expressions/bloom_filter.py | 216 ++++++++++++++++++++
 pyiceberg/manifest.py                 |  38 ++++
 pyiceberg/table/__init__.py           |  28 +++
 pyiceberg/table/bloom_filter.py       | 232 ++++++++++++++++++++++
 tests/table/test_bloom_filter.py      | 276 ++++++++++++++++++++++++++
 5 files changed, 790 insertions(+)
 create mode 100644 pyiceberg/expressions/bloom_filter.py
 create mode 100644 pyiceberg/table/bloom_filter.py
 create mode 100644 tests/table/test_bloom_filter.py

diff --git a/pyiceberg/expressions/bloom_filter.py b/pyiceberg/expressions/bloom_filter.py
new file mode 100644
index 0000000000..5e637a1918
--- /dev/null
+++ b/pyiceberg/expressions/bloom_filter.py
@@ -0,0 +1,216 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from __future__ import annotations
+
+from typing import Any
+
+from pyiceberg.expressions import (
+    BoundEqualTo,
+    BoundGreaterThan,
+    BoundGreaterThanOrEqual,
+    BoundIn,
+    BoundIsNaN,
+    BoundIsNull,
+    BoundLessThan,
+    BoundLessThanOrEqual,
+    BoundLiteralPredicate,
+    BoundNotEqualTo,
+    BoundNotIn,
+    BoundNotNaN,
+    BoundNotNull,
+    BoundNotStartsWith,
+    BoundPredicate,
+    BoundSetPredicate,
+    BoundStartsWith,
+    BoundUnaryPredicate,
+)
+from pyiceberg.expressions.visitors import BooleanExpressionVisitor
+from pyiceberg.manifest import DataFile
+from pyiceberg.schema import Schema
+from pyiceberg.table.bloom_filter import BloomFilter
+
+
+class BloomFilterEvaluator(BooleanExpressionVisitor[bool]):
+    """Evaluator that uses bloom filters to check if a file might contain matching rows.
+
+    This evaluator helps prune data files that definitely cannot contain rows matching
+    a query predicate by using bloom filters for column values.
+    """
+
+    def __init__(self, data_file: DataFile, schema: Schema):
+        """Initialize the bloom filter evaluator.
+
+        Args:
+            data_file: The data file to evaluate bloom filters for.
+            schema: The table schema for column resolution.
+        """
+        self.data_file = data_file
+        self.schema = schema
+
+    def visit_true(self) -> bool:
+        """Visit AlwaysTrue - file might contain matching rows."""
+        return True
+
+    def visit_false(self) -> bool:
+        """Visit AlwaysFalse - file definitely contains no matching rows."""
+        return False
+
+    def visit_not(self, child_result: bool) -> bool:
+        """Visit Not - always keep the file; a child's may-match answer cannot be inverted into a proof of no match."""
+        return True  # child True means rows *may* match, child False means none do: NOT can match either way
+
+    def visit_and(self, left_result: bool, right_result: bool) -> bool:
+        """Visit And - both conditions must allow the file."""
+        return left_result and right_result
+
+    def visit_or(self, left_result: bool, right_result: bool) -> bool:
+        """Visit Or - at least one condition must allow the file."""
+        return left_result or right_result
+
+    def visit_unbound_predicate(self, predicate: object) -> bool:
+        """Visit an unbound predicate - conservatively allow the file."""
+        # Unbound predicates haven't been bound to a schema, so we can't evaluate them
+        return True
+
+    def visit_bound_predicate(self, predicate: BoundPredicate[Any]) -> bool:
+        """Visit a bound predicate and evaluate using bloom filter if available."""
+        if isinstance(predicate, BoundUnaryPredicate):
+            # Unary predicates (IsNull, IsNaN, etc.)
+            return self._visit_unary_predicate(predicate)
+        elif isinstance(predicate, BoundLiteralPredicate):
+            # Literal predicates with a single value (EqualTo, NotEqualTo, etc.)
+            return self._visit_literal_predicate(predicate)
+        elif isinstance(predicate, BoundSetPredicate):
+            # Set predicates (In, NotIn)
+            return self._visit_set_predicate(predicate)
+        else:
+            # Unknown predicate type - be conservative and allow the file
+            return True
+
+    def visit_predicate(self, predicate: BoundPredicate[Any]) -> bool:
+        """Evaluate a bound predicate; alias of ``visit_bound_predicate``.
+
+        This method used to repeat the unary/literal/set dispatch of
+        ``visit_bound_predicate`` line for line. It is kept as a separate entry
+        point but delegates, so the two implementations cannot drift apart.
+
+        Args:
+            predicate: The bound predicate to evaluate against the file's bloom filters.
+
+        Returns:
+            Whatever ``visit_bound_predicate`` returns for the same predicate.
+        """
+        return self.visit_bound_predicate(predicate)
+
+    def _visit_unary_predicate(self, predicate: BoundUnaryPredicate[Any]) -> bool:
+        """Evaluate unary predicates using bloom filter."""
+        if isinstance(predicate, BoundIsNull):
+            # IsNull cannot use bloom filter (nulls not in BF)
+            return True
+        elif isinstance(predicate, BoundIsNaN):
+            # IsNaN cannot use bloom filter (NaN not in BF)
+            return True
+        elif isinstance(predicate, BoundNotNull):
+            # NotNull cannot use bloom filter effectively
+            return True
+        elif isinstance(predicate, BoundNotNaN):
+            # NotNaN cannot use bloom filter effectively
+            return True
+        else:
+            # Unknown unary predicate
+            return True
+
+    def _visit_literal_predicate(self, predicate: BoundLiteralPredicate[Any]) -> bool:
+        """Evaluate literal predicates using bloom filter."""
+        term = predicate.term
+        literal = predicate.literal
+        column_id = term.ref().field.field_id
+
+        # Get the bloom filter for this column
+        bloom_filter_bytes = self.data_file.get_bloom_filter(column_id)
+        if bloom_filter_bytes is None:
+            # No bloom filter for this column - can't prune
+            return True
+
+        # Deserialize the bloom filter
+        try:
+            bloom_filter = BloomFilter.from_bytes(bloom_filter_bytes)
+        except Exception:
+            # Error deserializing - be conservative
+            return True
+
+        if isinstance(predicate, BoundEqualTo):
+            # For EqualTo, check if value might be in the filter
+            return bloom_filter.might_contain(literal.value)
+        elif isinstance(predicate, BoundNotEqualTo):
+            # For NotEqualTo, we can't prune based on bloom filter
+            # (we need to be in the filter to exclude based on NOT)
+            return True
+        elif isinstance(predicate, BoundLessThan):
+            # For LessThan, we can't use bloom filter effectively
+            return True
+        elif isinstance(predicate, BoundLessThanOrEqual):
+            # For LessThanOrEqual, we can't use bloom filter effectively
+            return True
+        elif isinstance(predicate, BoundGreaterThan):
+            # For GreaterThan, we can't use bloom filter effectively
+            return True
+        elif isinstance(predicate, BoundGreaterThanOrEqual):
+            # For GreaterThanOrEqual, we can't use bloom filter effectively
+            return True
+        elif isinstance(predicate, BoundStartsWith):
+            # For StartsWith, we can't use exact bloom filter matching
+            return True
+        elif isinstance(predicate, BoundNotStartsWith):
+            # For NotStartsWith, we can't prune based on bloom filter
+            return True
+        else:
+            # Unknown literal predicate
+            return True
+
+    def _visit_set_predicate(self, predicate: BoundSetPredicate[Any]) -> bool:
+        """Evaluate set predicates using bloom filter."""
+        term = predicate.term
+        column_id = term.ref().field.field_id
+
+        # Get the bloom filter for this column
+        bloom_filter_bytes = self.data_file.get_bloom_filter(column_id)
+        if bloom_filter_bytes is None:
+            # No bloom filter for this column - can't prune
+            return True
+
+        # Deserialize the bloom filter
+        try:
+            bloom_filter = BloomFilter.from_bytes(bloom_filter_bytes)
+        except Exception:
+            # Error deserializing - be conservative
+            return True
+
+        if isinstance(predicate, BoundIn):
+            # For IN predicate, check if any value might be in the filter
+            # If at least one value might be in the filter, we can't prune the file
+            for value in predicate.literals:
+                if bloom_filter.might_contain(value.value):
+                    return True
+            # None of the values are in the filter - can prune the file
+            return False
+        elif isinstance(predicate, BoundNotIn):
+            # For NOT IN predicate, we can't prune based on bloom filter
+            return True
+        else:
+            # Unknown set predicate
+            return True
diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py
index eafb2b7c03..a4a5e838db 100644
--- a/pyiceberg/manifest.py
+++ b/pyiceberg/manifest.py
@@ -290,6 +290,13 @@ def __repr__(self) -> str:
             required=False,
             doc="ID representing sort order for this file",
         ),
+        NestedField(
+            field_id=146,
+            name="bloom_filter_bytes",
+            field_type=MapType(key_id=147, key_type=IntegerType(), value_id=148, value_type=BinaryType()),
+            required=False,
+            doc="Map of column id to bloom filter",
+        ),
     ),
     3: StructType(
         NestedField(
@@ -413,6 +420,13 @@ def __repr__(self) -> str:
             required=False,
             doc="The length of a referenced content stored in the file; required if content_offset is present",
         ),
+        NestedField(
+            field_id=146,
+            name="bloom_filter_bytes",
+            field_type=MapType(key_id=147, key_type=IntegerType(), value_id=148, value_type=BinaryType()),
+            required=False,
+            doc="Map of column id to bloom filter",
+        ),
     ),
 }
 
@@ -516,6 +530,17 @@ def equality_ids(self) -> Optional[List[int]]:
     def sort_order_id(self) -> Optional[int]:
         return self._data[15]
 
+    @property
+    def bloom_filter_bytes(self) -> Dict[int, bytes] | None:
+        """Get bloom filter bytes for all columns.
+
+        Returns slot 16 of the record (expected: dict of column ID to bloom filter bytes), or None if there is no slot 16.
+        """
+        # NOTE(review): slot 16 follows sort_order_id (15); the `3:` struct appends this field after later fields - confirm the index
+        if len(self._data) > 16:
+            return self._data[16]
+        return None
+
     # Spec ID should not be stored in the file
     _spec_id: int
 
@@ -538,6 +563,19 @@ def __hash__(self) -> int:
         """Return the hash of the file path."""
         return hash(self.file_path)
 
+    def get_bloom_filter(self, column_id: int) -> bytes | None:
+        """Look up the serialized bloom filter recorded for one column.
+
+        Args:
+            column_id: Field ID of the column whose bloom filter is wanted.
+
+        Returns:
+            The bloom filter bytes stored for that column, or None when there is no entry for it.
+        """
+        filters = self.bloom_filter_bytes
+        has_entry = bool(filters) and column_id in filters
+        return filters[column_id] if has_entry else None
+
     def __eq__(self, other: Any) -> bool:
         """Compare the datafile with another object.
 
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index 8b7f4d165a..56391be662 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -1926,6 +1926,30 @@ def _build_residual_evaluator(self, spec_id: int) -> Callable[[DataFile], Residu
             )
         )
 
+    def _should_keep_file_with_bloom_filter(self, data_file: DataFile) -> bool:
+        """Check if a data file should be kept based on bloom filter evaluation.
+
+        Args:
+            data_file: The data file to evaluate.
+
+        Returns:
+            True if the file should be kept, False if it can be pruned.
+        """
+        if data_file.bloom_filter_bytes is None:
+            # No bloom filter for this file
+            return True
+
+        try:
+            from pyiceberg.expressions.bloom_filter import BloomFilterEvaluator
+            from pyiceberg.expressions.visitors import visit
+
+            # NOTE(review): row_filter is visited as-is; unbound predicates always evaluate to True in the evaluator - confirm it is bound here
+            evaluator = BloomFilterEvaluator(data_file, self.table_metadata.schema())
+            return visit(self.row_filter, evaluator)
+        except Exception:
+            # If there's any error evaluating bloom filters, be conservative and keep the file
+            return True
+
     @staticmethod
     def _check_sequence_number(min_sequence_number: int, manifest: ManifestFile) -> bool:
         """Ensure that no manifests are loaded that contain deletes that are older than the data.
@@ -2001,6 +2025,10 @@ def plan_files(self) -> Iterable[FileScanTask]:
         for manifest_entry in chain.from_iterable(self.scan_plan_helper()):
             data_file = manifest_entry.data_file
             if data_file.content == DataFileContent.DATA:
+                # Apply bloom filter evaluation to prune files that definitely don't match the filter
+                if not self._should_keep_file_with_bloom_filter(data_file):
+                    # Skip this file as it cannot contain matching rows
+                    continue
                 data_entries.append(manifest_entry)
             elif data_file.content == DataFileContent.POSITION_DELETES:
                 positional_delete_entries.add(manifest_entry)
diff --git a/pyiceberg/table/bloom_filter.py b/pyiceberg/table/bloom_filter.py
new file mode 100644
index 0000000000..b0753adcb4
--- /dev/null
+++ b/pyiceberg/table/bloom_filter.py
@@ -0,0 +1,232 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Bloom filter implementation for Iceberg row-level filtering."""
+
+from __future__ import annotations
+
+import struct
+
+import mmh3
+
+
+class BloomFilter:
+    """Bloom filter implementation for Iceberg column value filtering.
+    
+    A bloom filter is a space-efficient probabilistic data structure that can determine
+    whether an element may be in a set or is definitely not in the set. In Iceberg,
+    bloom filters are used at the file level to help prune data files that cannot
+    contain rows matching a query predicate.
+    """
+
+    MAGIC = b"ORC"
+    SERIAL_NUMBER = 4
+
+    def __init__(self, num_bytes: int, num_hash_functions: int = 1):
+        """Initialize a bloom filter.
+
+        Args:
+            num_bytes: Size of the bloom filter in bytes.
+            num_hash_functions: Number of hash functions to use (default: 1).
+        """
+        self.num_bytes = num_bytes
+        self.num_hash_functions = num_hash_functions
+        # Initialize bit array as bytearray
+        self.bit_array = bytearray(num_bytes)
+
+    def add(self, value: str | bytes | int | float | bool | None) -> None:
+        """Add a value to the bloom filter.
+
+        Args:
+            value: The value to add to the filter.
+        """
+        if value is None:
+            # Null values are typically not added to bloom filters in Iceberg
+            return
+
+        # Convert value to bytes
+        value_bytes = self._to_bytes(value)
+
+        # Hash the value with multiple hash functions
+        for i in range(self.num_hash_functions):
+            # Create a unique hash input for each hash function
+            hash_input = value_bytes + struct.pack(">I", i)
+            # Use MurmurHash3 (128-bit)
+            hash_value = mmh3.hash128(hash_input, signed=False)
+            # Calculate bit positions
+            bit_pos = hash_value % (self.num_bytes * 8)
+            # Set the bit
+            byte_index = bit_pos // 8
+            bit_index = bit_pos % 8
+            self.bit_array[byte_index] |= 1 << bit_index
+
+    def might_contain(self, value: str | bytes | int | float | bool | None) -> bool:
+        """Check if a value might be in the bloom filter.
+
+        Returns True if the value might be in the set, False if it definitely is not.
+
+        Args:
+            value: The value to check.
+
+        Returns:
+            True if the value might be in the set, False if definitely not.
+        """
+        if value is None:
+            # Null values are not in bloom filters
+            return False
+
+        # Convert value to bytes
+        value_bytes = self._to_bytes(value)
+
+        # Check all hash positions
+        for i in range(self.num_hash_functions):
+            # Create a unique hash input for each hash function
+            hash_input = value_bytes + struct.pack(">I", i)
+            # Use MurmurHash3 (128-bit)
+            hash_value = mmh3.hash128(hash_input, signed=False)
+            # Calculate bit position
+            bit_pos = hash_value % (self.num_bytes * 8)
+            # Check if the bit is set
+            byte_index = bit_pos // 8
+            bit_index = bit_pos % 8
+            if not (self.bit_array[byte_index] & (1 << bit_index)):
+                # If any hash position is not set, value is definitely not in the set
+                return False
+
+        # All hash positions are set, value might be in the set
+        return True
+
+    def to_bytes(self) -> bytes:
+        """Serialize the bloom filter to bytes.
+
+        Returns:
+            Serialized bloom filter as bytes.
+        """
+        return bytes(self.bit_array)
+
+    @classmethod
+    def from_bytes(cls, data: bytes, num_hash_functions: int = 1) -> BloomFilter:
+        """Deserialize a bloom filter from bytes.
+
+        Args:
+            data: Serialized bloom filter bytes.
+            num_hash_functions: Number of hash functions used in the filter.
+
+        Returns:
+            BloomFilter instance.
+        """
+        bf = cls(len(data), num_hash_functions)
+        bf.bit_array = bytearray(data)
+        return bf
+
+    @staticmethod
+    def _to_bytes(value: str | bytes | int | float | bool) -> bytes:
+        """Convert a value to bytes for hashing.
+
+        Args:
+            value: Value to convert.
+
+        Returns:
+            UTF-8 for str, big-endian packing for numbers; ints wider than 64 bits use minimal signed bytes.
+        """
+        if isinstance(value, bytes):
+            return value
+        elif isinstance(value, str):
+            return value.encode("utf-8")
+        elif isinstance(value, bool):
+            # bool before int because bool is subclass of int
+            return struct.pack(">B", 1 if value else 0)
+        elif isinstance(value, int):
+            # struct.pack raises struct.error past 64 bits, so wider ints fall through to int.to_bytes
+            for fmt, bits in ((">i", 31), (">q", 63)):
+                if -(1 << bits) <= value < (1 << bits):
+                    return struct.pack(fmt, value)
+            return value.to_bytes(value.bit_length() // 8 + 1, "big", signed=True)
+        elif isinstance(value, float):
+            return struct.pack(">d", value)
+        else:
+            raise TypeError(f"Unsupported type for bloom filter: {type(value)}")
+
+    def __repr__(self) -> str:
+        """Return string representation of the bloom filter."""
+        return f"BloomFilter(num_bytes={self.num_bytes}, num_hash_functions={self.num_hash_functions})"
+
+
+class BloomFilterBuilder:
+    """Builder for creating bloom filters with specific false positive rates."""
+
+    @staticmethod
+    def optimal_num_bytes(num_elements: int, false_positive_rate: float = 0.05) -> int:
+        """Calculate optimal bloom filter size in bytes given element count and FPP.
+
+        Uses the formula: m = -(n * ln(p)) / (ln(2)^2)
+        where n is number of elements and p is false positive rate.
+
+        Args:
+            num_elements: Expected number of elements to add.
+            false_positive_rate: Desired false positive rate (default: 0.05).
+
+        Returns:
+            Optimal number of bytes for the bloom filter.
+        """
+        import math
+
+        if num_elements <= 0:
+            return 1
+        if false_positive_rate <= 0 or false_positive_rate >= 1:
+            raise ValueError("False positive rate must be between 0 and 1")
+
+        # Calculate optimal number of bits
+        num_bits = -(num_elements * math.log(false_positive_rate)) / (math.log(2) ** 2)
+        # Convert to bytes (round up)
+        num_bytes = int(math.ceil(num_bits / 8))
+        return max(1, num_bytes)
+
+    @staticmethod
+    def optimal_num_hash_functions(num_bytes: int, num_elements: int) -> int:
+        """Calculate optimal number of hash functions.
+
+        Uses the formula: k = (m / n) * ln(2)
+        where m is the filter size in bits (num_bytes * 8) and n is number of elements.
+
+        Args:
+            num_bytes: Size of bloom filter in bytes.
+            num_elements: Expected number of elements to add.
+
+        Returns:
+            Optimal number of hash functions, never less than 1.
+        """
+        import math
+
+        if num_elements <= 0 or num_bytes <= 0:
+            return 1
+        k = (num_bytes * 8 / num_elements) * math.log(2)  # m must be in bits, not bytes
+        return max(1, round(k))
+
+    @staticmethod
+    def create(num_elements: int, false_positive_rate: float = 0.05) -> BloomFilter:
+        """Create an optimally configured bloom filter.
+
+        Args:
+            num_elements: Expected number of elements to add.
+            false_positive_rate: Desired false positive rate (default: 0.05).
+
+        Returns:
+            Optimally configured BloomFilter instance.
+        """
+        num_bytes = BloomFilterBuilder.optimal_num_bytes(num_elements, false_positive_rate)
+        num_hash_functions = BloomFilterBuilder.optimal_num_hash_functions(num_bytes, num_elements)
+        return BloomFilter(num_bytes, num_hash_functions)
diff --git a/tests/table/test_bloom_filter.py b/tests/table/test_bloom_filter.py
new file mode 100644
index 0000000000..c88b395582
--- /dev/null
+++ b/tests/table/test_bloom_filter.py
@@ -0,0 +1,276 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for bloom filter functionality."""
+
+import pytest
+
+from pyiceberg.table.bloom_filter import BloomFilter, BloomFilterBuilder
+
+
+class TestBloomFilter:
+    """Test cases for BloomFilter class."""
+
+    def test_bloom_filter_creation(self):
+        """Test creating a bloom filter."""
+        bf = BloomFilter(100, 3)
+        assert bf.num_bytes == 100
+        assert bf.num_hash_functions == 3
+        assert len(bf.bit_array) == 100
+
+    def test_add_and_might_contain_string(self):
+        """Test adding and checking string values."""
+        bf = BloomFilter(256, 3)
+
+        # Add some strings
+        bf.add("hello")
+        bf.add("world")
+
+        # Check that added values might be in the filter
+        assert bf.might_contain("hello")
+        assert bf.might_contain("world")
+
+    def test_add_and_might_contain_integer(self):
+        """Test adding and checking integer values."""
+        bf = BloomFilter(256, 3)
+
+        # Add some integers
+        bf.add(42)
+        bf.add(100)
+        bf.add(9999)
+
+        # Check that added values might be in the filter
+        assert bf.might_contain(42)
+        assert bf.might_contain(100)
+        assert bf.might_contain(9999)
+
+    def test_add_and_might_contain_float(self):
+        """Test adding and checking float values."""
+        bf = BloomFilter(256, 3)
+
+        # Add some floats
+        bf.add(3.14)
+        bf.add(2.71)
+
+        # Check that added values might be in the filter
+        assert bf.might_contain(3.14)
+        assert bf.might_contain(2.71)
+
+    def test_add_and_might_contain_bytes(self):
+        """Test adding and checking bytes values."""
+        bf = BloomFilter(256, 3)
+
+        # Add some bytes
+        bf.add(b"hello")
+        bf.add(b"world")
+
+        # Check that added values might be in the filter
+        assert bf.might_contain(b"hello")
+        assert bf.might_contain(b"world")
+
+    def test_add_and_might_contain_bool(self):
+        """Test adding and checking boolean values."""
+        bf = BloomFilter(256, 3)
+
+        # Add booleans
+        bf.add(True)
+        bf.add(False)
+
+        # Check that added values might be in the filter
+        assert bf.might_contain(True)
+        assert bf.might_contain(False)
+
+    def test_add_null_value(self):
+        """Test that null values are not added to the filter."""
+        bf = BloomFilter(256, 3)
+
+        # Add null
+        bf.add(None)
+
+        # Null should not be in the filter
+        assert not bf.might_contain(None)
+
+    def test_serialization_and_deserialization(self):
+        """Test serializing and deserializing a bloom filter."""
+        bf1 = BloomFilter(256, 3)
+        bf1.add("hello")
+        bf1.add("world")
+        bf1.add(42)
+
+        # Serialize
+        serialized = bf1.to_bytes()
+        assert isinstance(serialized, bytes)
+        assert len(serialized) == 256
+
+        # Deserialize
+        bf2 = BloomFilter.from_bytes(serialized, 3)
+        assert bf2.num_bytes == 256
+        assert bf2.num_hash_functions == 3
+
+        # Check that values are still found
+        assert bf2.might_contain("hello")
+        assert bf2.might_contain("world")
+        assert bf2.might_contain(42)
+
+    def test_false_positives_with_small_filter(self):
+        """Test that false positives are possible with a small filter."""
+        # Small filter to increase false positive rate
+        bf = BloomFilter(16, 1)
+
+        # Add one value
+        bf.add("value1")
+
+        # The added value should be found
+        assert bf.might_contain("value1")
+
+        # Some other values might also be found (false positives possible)
+        # This is expected behavior for bloom filters
+
+
+class TestBloomFilterBuilder:
+    """Test cases for BloomFilterBuilder class."""
+
+    def test_optimal_num_bytes_calculation(self):
+        """Test optimal number of bytes calculation."""
+        # Test with reasonable values
+        num_bytes_1 = BloomFilterBuilder.optimal_num_bytes(1000, 0.05)
+        assert num_bytes_1 > 0
+
+        num_bytes_2 = BloomFilterBuilder.optimal_num_bytes(10000, 0.01)
+        assert num_bytes_2 > num_bytes_1  # More elements or lower FPP should need more space
+
+    def test_optimal_num_hash_functions_calculation(self):
+        """Test optimal number of hash functions calculation."""
+        num_hash = BloomFilterBuilder.optimal_num_hash_functions(256, 1000)
+        assert num_hash >= 1
+
+        # A larger filter for the same element count should never need fewer hash functions
+        num_hash_2 = BloomFilterBuilder.optimal_num_hash_functions(512, 1000)
+        assert num_hash_2 >= num_hash
+
+    def test_create_optimally_configured_filter(self):
+        """Test creating an optimally configured bloom filter."""
+        bf = BloomFilterBuilder.create(1000, 0.05)
+
+        assert bf.num_bytes > 0
+        assert bf.num_hash_functions >= 1
+
+        # Add some values and verify they can be found
+        for i in range(100):
+            bf.add(f"value_{i}")
+
+        for i in range(100):
+            assert bf.might_contain(f"value_{i}")
+
+    def test_create_with_different_false_positive_rates(self):
+        """Test creating filters with different false positive rates."""
+        num_elements = 1000
+
+        bf_fpp_5 = BloomFilterBuilder.create(num_elements, 0.05)
+        bf_fpp_1 = BloomFilterBuilder.create(num_elements, 0.01)
+
+        # Lower FPP should use more space
+        assert bf_fpp_1.num_bytes >= bf_fpp_5.num_bytes
+
+    def test_invalid_false_positive_rate(self):
+        """Test that invalid false positive rates raise an error."""
+        with pytest.raises(ValueError):
+            BloomFilterBuilder.optimal_num_bytes(1000, 0.0)
+
+        with pytest.raises(ValueError):
+            BloomFilterBuilder.optimal_num_bytes(1000, 1.0)
+
+        with pytest.raises(ValueError):
+            BloomFilterBuilder.optimal_num_bytes(1000, -0.1)
+
+
+class TestBloomFilterEdgeCases:
+    """Test edge cases for bloom filter."""
+
+    def test_empty_filter(self):
+        """Test querying an empty bloom filter."""
+        bf = BloomFilter(10, 1)
+
+        # Nothing has been added, so no bit is set and every lookup must
+        # return False (an empty filter cannot produce a false positive)
+        assert not bf.might_contain("anything")
+
+    def test_large_filter(self):
+        """Test working with a large bloom filter."""
+        bf = BloomFilter(10000, 5)
+
+        # Add many values
+        for i in range(5000):
+            bf.add(f"value_{i}")
+
+        # All added values should be found
+        for i in range(5000):
+            assert bf.might_contain(f"value_{i}")
+
+    def test_multiple_hash_functions(self):
+        """Test that multiple hash functions work correctly."""
+        bf1 = BloomFilter(1000, 1)
+        bf2 = BloomFilter(1000, 5)
+
+        # Add same values to both
+        for i in range(100):
+            bf1.add(f"value_{i}")
+            bf2.add(f"value_{i}")
+
+        # Both should find the added values
+        for i in range(100):
+            assert bf1.might_contain(f"value_{i}")
+            assert bf2.might_contain(f"value_{i}")
+
+    def test_negative_and_large_integers(self):
+        """Test handling of negative and large integers."""
+        bf = BloomFilter(256, 3)
+
+        # Test negative integer
+        bf.add(-42)
+        assert bf.might_contain(-42)
+
+        # Test large integer (64-bit)
+        large_int = 9223372036854775807  # Max 64-bit int
+        bf.add(large_int)
+        assert bf.might_contain(large_int)
+
+        # Test very large integer (beyond 64-bit)
+        very_large_int = 92233720368547758070
+        bf.add(very_large_int)
+        assert bf.might_contain(very_large_int)
+
+    def test_mixed_types(self):
+        """Test adding and finding mixed types."""
+        bf = BloomFilter(512, 3)
+
+        # Add different types
+        values = [
+            "string",
+            42,
+            3.14,
+            b"bytes",
+            True,
+            False,
+        ]
+
+        for val in values:
+            bf.add(val)
+
+        # All should be found
+        for val in values:
+            assert bf.might_contain(val)
+