# Exploring Rabin-Karp-style Min-Hash Fingerprinting

This document compares the numeric types that can be used to implement a Rabin-Karp-style min-hash fingerprinting algorithm.
It answers several important questions:

- How to use floating-point numbers for a traditionally integer-based task - "hashing"?
- How to properly compose many such hash functions to maximize the quality of fingerprints?

## Rabin-Karp Rolling Hashing

In [None]:
from typing import Generator


def rabin_karp_ints(
    s: str,
    window_width: int,
    multiplier: int,
    modulo: int,
    alphabet_size: int = 256,
) -> Generator[int, None, None]:
    """Yield the polynomial rolling hash of each `window_width`-long substring of `s`.

    Every character contributes `ord(char) + 1`, so no symbol maps to zero.
    Hashes are produced left to right, one per window position; nothing is
    yielded when `s` is shorter than `window_width`.

    Only the characters of the very first window are checked against
    `alphabet_size`; arbitrary-precision `int` arithmetic stays exact for any
    later character regardless of its code point.
    """
    assert window_width > 0, "Window width must be positive"
    assert multiplier > 0, "Multiplier must be positive"
    assert modulo > 0, "Modulo must be positive"
    assert multiplier < modulo, "Multiplier must be less than modulo"

    if len(s) < window_width:
        return

    # Seed the hash with Horner's scheme over the first window.
    rolling: int = 0
    for symbol in s[:window_width]:
        incoming = ord(symbol) + 1
        assert incoming <= alphabet_size, "Pass correct `alphabet_size`"
        rolling = (rolling * multiplier + incoming) % modulo
    yield rolling

    # Weight carried by the leftmost character of a window: multiplier^(width - 1).
    leading_weight: int = pow(multiplier, window_width - 1, modulo)

    # Pair each character leaving the window with the one entering it.
    for departing, arriving in zip(s, s[window_width:]):
        outgoing = ord(departing) + 1
        incoming = ord(arriving) + 1
        # Drop the leftmost term, shift the rest up by one power, append the new term.
        # Each step is reduced modulo `modulo`, so the hash stays in [0, modulo).
        rolling = (rolling - outgoing * leading_weight) % modulo
        rolling = (rolling * multiplier + incoming) % modulo
        yield rolling


# Quick sanity-check: rolling across a string must agree with hashing each window from scratch
for sample, width, base, prime in [
    ("abcd", 3, 31, 1_000_000_007),
    ("abcdefdhijklmnopqr", 17, 31, 65521),
]:
    rolled = list(rabin_karp_ints(sample, width, base, prime))
    from_scratch = [
        next(rabin_karp_ints(sample[start : start + width], width, base, prime))
        for start in range(len(sample) - width + 1)
    ]
    assert rolled == from_scratch

### Rabin-Karp Rolling Hashing via Floats

Python's `int` type is unbounded, so it can be used to implement the Rabin-Karp rolling hash algorithm without worrying about overflow.
It is, however, insanely expensive to use, and doesn't allow us to explore optimization opportunities.
The `float`, on the other hand, is just a double-precision IEEE 754 floating-point number, which can exactly represent every 52-bit integer!
Thus, we can convert our arithmetic to use `float`s, as long as we guarantee that no intermediate result exceeds that limit.

In [10]:
from typing import Generator

# 2**52 - 1: every non-negative integer up to this value is exactly representable as a `float`.
LARGEST_INTEGRAL_FLOAT: float = 4503599627370495.0


def rabin_karp_floats(
    s: str,
    window_width: int,
    multiplier: int,
    modulo: int,
    alphabet_size: int = 256,
) -> Generator[int, None, None]:
    """Yield the rolling polynomial hash of every length-`window_width` substring of `s`.

    Uses the same recurrence as `rabin_karp_ints`, but keeps every intermediate
    value in a double-precision `float`. The results are exact only while every
    intermediate stays at or below `LARGEST_INTEGRAL_FLOAT`, so the parameters
    are checked up front and every character is checked as it enters the window.

    Args:
        s: Text to hash; each character contributes `ord(char) + 1`.
        window_width: Number of characters per hashed window.
        multiplier: Polynomial base; must satisfy `0 < multiplier < modulo`.
        modulo: Modulus applied after every arithmetic step.
        alphabet_size: Upper bound on `ord(char) + 1` for every character of `s`.

    Yields:
        One `int` hash in `[0, modulo)` per window, left to right. Nothing is
        yielded when `s` is shorter than `window_width`.

    Raises:
        AssertionError: If a parameter is out of range, if the parameter
            combination could push an intermediate product past
            `LARGEST_INTEGRAL_FLOAT`, or if a character exceeds `alphabet_size`.
    """
    assert window_width > 0, "Window width must be positive"
    assert multiplier > 0, "Multiplier must be positive"
    assert modulo > 0, "Modulo must be positive"
    assert multiplier < modulo, "Multiplier must be less than modulo"

    if len(s) < window_width:
        return

    base = float(multiplier)
    mod = float(modulo)
    assert mod < LARGEST_INTEGRAL_FLOAT, "Modulo can't exceed the largest integral float value"

    # Ensure that no intermediate value can leave the exactly-representable range.
    largest_post_modulo = mod - 1
    max_possible_term = float(alphabet_size)
    # Bounds the "shift and append" step: hash * multiplier + new_term.
    assert largest_post_modulo * base + max_possible_term <= LARGEST_INTEGRAL_FLOAT, "Will overflow"
    # Bounds the "drop leftmost" step: old_term * multiplier^(width - 1), where the
    # reduced power can be as large as modulo - 1. This product exceeds the bound
    # above whenever multiplier < alphabet_size, so it needs its own check.
    assert max_possible_term * largest_post_modulo <= LARGEST_INTEGRAL_FLOAT, "Will overflow"

    # Every arithmetic step is reduced right away so operands stay below `mod`.
    def mul_mod(a: float, b: float) -> float:
        return (a * b) % mod

    def add_mod(a: float, b: float) -> float:
        return (a + b) % mod

    def sub_mod(a: float, b: float) -> float:
        # Python's float `%` takes the sign of the divisor, so the result is non-negative.
        return (a - b) % mod

    # Precompute multiplier^(window_width - 1) mod modulo, the weight of the leftmost character.
    discarding_multiplier: float = 1.0
    for _ in range(window_width - 1):
        discarding_multiplier = mul_mod(discarding_multiplier, base)

    # Handle the first window - without dropping any characters.
    current_hash: float = 0.0
    for char in s[:window_width]:
        new_term = float(ord(char) + 1)
        assert new_term <= alphabet_size, "Pass correct `alphabet_size`"
        current_hash = add_mod(mul_mod(current_hash, base), new_term)
    yield int(current_hash)

    # Roll through the rest of the string.
    total_hashes = len(s) - window_width + 1
    for i in range(1, total_hashes):  # First hash is already yielded
        # `old_term` was validated when it entered the window earlier.
        old_term = float(ord(s[i - 1]) + 1)
        new_term = float(ord(s[i + window_width - 1]) + 1)
        # The overflow guards assume every term is bounded by `alphabet_size`,
        # so characters entering later must be validated as well.
        assert new_term <= alphabet_size, "Pass correct `alphabet_size`"

        # Remove the leftmost character's contribution, then shift and append the new one.
        current_hash = sub_mod(current_hash, mul_mod(old_term, discarding_multiplier))
        current_hash = add_mod(mul_mod(current_hash, base), new_term)
        yield int(current_hash)


# Quick sanity-check: rolling across a string must agree with hashing each window from scratch
for sample, width, base, prime in [
    ("abcd", 3, 31, 1_000_000_007),
    ("abcdefdhijklmnopqr", 17, 31, 65521),
]:
    rolled = list(rabin_karp_floats(sample, width, base, prime))
    from_scratch = [
        next(rabin_karp_floats(sample[start : start + width], width, base, prime))
        for start in range(len(sample) - width + 1)
    ]
    assert rolled == from_scratch

Let's load some data and ensure that the outputs are identical between the `int` and `float` implementations.

In [11]:
from pathlib import Path

dataset_directory = Path("..")

In [12]:
textual_dataset_path = dataset_directory / "leipzig1M.txt"
# `Path.read_text` opens, reads, and closes the file in one call, so no handle is left open.
textual_dataset = textual_dataset_path.read_text().strip()

In [13]:
# Split the corpus into lines and report how many there are and their mean length.
textual_lines = textual_dataset.split("\n")
print(
    f"Loaded {len(textual_lines)} lines of mean length "
    f"{sum(map(len, textual_lines)) / len(textual_lines):.2f} characters"
)

Loaded 1000000 lines of mean length 128.64 characters


In [16]:
# Hash the first two lines with both implementations and require identical output.
for line in textual_lines[:2]:
    int_hashes, float_hashes = (
        list(hasher(line, 17, 31, 65521))
        for hasher in (rabin_karp_ints, rabin_karp_floats)
    )
    assert int_hashes == float_hashes, "Hashes do not match between int and float implementations"
    print(f"Line: {line}\nInt Hashes:   {int_hashes}\nFloat Hashes: {float_hashes}\n")

Line: A rebel statement sent to Lisbon from Jamba said 86 government soldiers and 13 guerrillas were killed in the fighting that ended Jan. 3. It said the rebel forces sill held Mavinga.
Int Hashes:   [39214, 58636, 27178, 56589, 55578, 20249, 42817, 9257, 3408, 19872, 51144, 34335, 40889, 27513, 38487, 18584, 3184, 58010, 7771, 24162, 33155, 61759, 39219, 14922, 28769, 36392, 58327, 32414, 10374, 39320, 40408, 4077, 12114, 26399, 16417, 50534, 55902, 1252, 17344, 47927, 138, 41634, 46625, 25284, 41597, 24437, 58118, 20946, 19491, 58575, 57224, 56366, 22315, 406, 8929, 63975, 22447, 40979, 42287, 37755, 61352, 18591, 11389, 3971, 37410, 42182, 55091, 29653, 55815, 31582, 62078, 4119, 40697, 38010, 15788, 56146, 15508, 45089, 3719, 28289, 54549, 57318, 12654, 64992, 49444, 14053, 34856, 24544, 61435, 49726, 13041, 39965, 64324, 28738, 43965, 31012, 56925, 49485, 19391, 3776, 56393, 4412, 49781, 24572, 57835, 2348, 32024, 18779, 62846, 26631, 23656, 945, 58585, 55891, 53897, 21132, 16665

A bigger question now: will the same hold if we use much larger modulo values?

In [19]:
LARGEST_MODULO_SAFE_MODULO = 4503599626977

# Repeat the int-vs-float comparison with a much larger modulus and several window widths.
for line in textual_lines[:50]:
    for window_width in (3, 17, 64):
        int_hashes, float_hashes = (
            list(
                hasher(
                    line,
                    window_width=window_width,
                    multiplier=257,
                    modulo=LARGEST_MODULO_SAFE_MODULO,
                )
            )
            for hasher in (rabin_karp_ints, rabin_karp_floats)
        )
        assert int_hashes == float_hashes, "Hashes do not match between int and float implementations"

print("All tests passed for larger modulo values!")

All tests passed for larger modulo values!


### Rabin-Karp Rolling Hashing via FMAs

- How aggressively can we use **FMA** (Fused Multiply-Add) operations to optimize the algorithm?
- How many of the modulo operations can we avoid?
- How can we simplify the `%` modulo operation?

In [None]:
from typing import Generator

LARGEST_INTEGRAL_FLOAT: float = 4503599627370495.0

def rabin_karp_fma(
    s: str,
    window_width: int,
    multiplier: int,
    modulo: int,
    alphabet_size: int = 256,
) -> Generator[int, None, None]:
    """Placeholder for an FMA-based variant of the rolling polynomial hash.

    Only the argument validation and the float-overflow guard are written; the
    hashing itself is still `...`. Because the body contains no `yield`, this is
    currently a regular function that returns `None` rather than a generator,
    despite the return annotation.
    """
    
    # Argument validation (asserts are stripped when Python runs with `-O`).
    assert window_width > 0, "Window width must be positive"
    assert multiplier > 0, "Multiplier must be positive"
    assert modulo > 0, "Modulo must be positive"
    assert multiplier < modulo, "Multiplier must be less than modulo"

    # Too short to contain even one window.
    if len(s) < window_width:
        return

    # From here on, arithmetic is meant to happen in floats.
    multiplier = float(multiplier)
    modulo = float(modulo)
    assert modulo < LARGEST_INTEGRAL_FLOAT, "Modulo can't exceed the largest integral float value"
    
    # Ensure that `hash * multiplier + term` cannot exceed the largest integral float
    largest_post_modulo = modulo - 1
    max_possible_term = alphabet_size
    assert largest_post_modulo * multiplier + max_possible_term <= LARGEST_INTEGRAL_FLOAT, "Will overflow"
    
    # TODO: the rolling-hash computation itself is not implemented yet.
    ...

## Min-Hash Fingerprinting

In [None]:
from typing import List

def min_hash(rolling_hashes: List[np.ndarray]) -> np.ndarray:
    # TODO: placeholder - the body is only `...`, so calling this currently returns
    # `None` rather than the `np.ndarray` promised by the annotation.
    ...


In [5]:
def hamming_distance(a: np.ndarray, b: np.ndarray) -> int:
    """Return the number of positions at which `a` and `b` differ.

    Args:
        a: First array.
        b: Second array; must have the same length as `a`.

    Returns:
        The count of mismatching elements as a built-in `int`.

    Raises:
        ValueError: If `a` and `b` have different lengths.
    """
    if len(a) != len(b):
        raise ValueError("Arrays must be of the same length")
    # NumPy reductions return NumPy scalars; convert so the result matches the `int` annotation.
    return int(np.count_nonzero(a != b))

In [9]:
dna_dataset_path = dataset_directory / "acgt_10k.txt"
# `Path.read_text` opens, reads, and closes the file in one call, so no handle is left open.
dna_dataset = dna_dataset_path.read_text().strip()

In [10]:
# Window widths swept by the outer loop of the experiment.
window_widths = [3, 4, 8, 16, 32]
# Unsigned integer dtypes, from narrowest to widest.
precision_levels = [np.uint8, np.uint16, np.uint32, np.uint64]
# One modulus per dtype, paired by position via `zip`; each value fits in its dtype's range.
# NOTE(review): 251, 65521 and 2**31 - 1 are prime, but 2**63 - 1 is divisible by 7 -
# confirm whether a prime modulus is intended for the `np.uint64` case.
modulo_per_precision = [251, 65521, 2**31 - 1, 2**63 - 1]

In [None]:
for window_width in window_widths:
    for precision, modulo in zip(precision_levels, modulo_per_precision):
        print(
            f"Window width: {window_width}, Precision: {precision}, Modulo: {modulo}"
        )
        
        # Textual dataset
        textual_hashes = rolling_hashes(
            textual_dataset, window_width, 31, modulo, dtype=precision
        )
        print(f"Textual hashes: {textual_hashes[:10]}...")