# Exploring Rabin-Karp-style Min-Hash Fingerprinting

This document showcases the differences between different numeric types that one can use to implement a Rabin-Karp-style min-hash fingerprinting algorithm.
It answers several important questions:

- How to use floating-point numbers for a traditionally integer-based task - "hashing"?
- How to properly compose many such hash functions to maximize the quality of fingerprints?

## Rabin-Karp Rolling Hashing

In [None]:
from typing import Generator


def rabin_karp_ints(
    s: str,
    window_width: int,
    multiplier: int,
    modulo: int,
    alphabet_size: int = 256,
) -> Generator[int, None, None]:
    """Yield the polynomial rolling hash of each `window_width`-long substring of `s`.

    Every character contributes the term `ord(char) + 1`, so no character maps
    to zero. Hashes are produced left to right, one per window, and nothing is
    yielded when `s` is shorter than `window_width`.

    The parameter checks are assertions and fire on the first `next()` call,
    because this is a generator. Only the characters of the first window are
    checked against `alphabet_size`.
    """

    assert window_width > 0, "Window width must be positive"
    assert multiplier > 0, "Multiplier must be positive"
    assert modulo > 0, "Modulo must be positive"
    assert multiplier < modulo, "Multiplier must be less than modulo"

    if window_width > len(s):
        return

    # Horner's scheme over the first window.
    rolling = 0
    for code_point in map(ord, s[:window_width]):
        incoming = code_point + 1
        assert incoming <= alphabet_size, "Pass correct `alphabet_size`"
        rolling = (rolling * multiplier + incoming) % modulo
    yield rolling

    # Weight carried by the leftmost character of a window: multiplier ** (width - 1).
    leading_weight = pow(multiplier, window_width - 1, modulo)

    # Pair every character that leaves the window with the one that enters it.
    for leaving_char, entering_char in zip(s, s[window_width:]):
        outgoing = ord(leaving_char) + 1
        incoming = ord(entering_char) + 1

        # Python integers are unbounded, so reducing once per step is enough;
        # `%` with a positive modulo also keeps the difference non-negative.
        rolling = (rolling - outgoing * leading_weight) % modulo
        rolling = (rolling * multiplier + incoming) % modulo
        yield rolling


# Sanity check: rolling across a string must reproduce the hash of every
# window when that window is hashed from scratch.
assert [
    next(rabin_karp_ints(window, 3, 31, 1_000_000_007)) for window in ("abc", "bcd")
] == list(rabin_karp_ints("abcd", 3, 31, 1_000_000_007))
assert [
    next(rabin_karp_ints(window, 17, 31, 65521))
    for window in ("abcdefdhijklmnopq", "bcdefdhijklmnopqr")
] == list(rabin_karp_ints("abcdefdhijklmnopqr", 17, 31, 65521))

## Rabin-Karp Rolling Hashing via Floats

Python's `int` type is unbounded, so it can be used to implement the Rabin-Karp rolling hash algorithm without worrying about overflow.
It is, however, very expensive to use, and doesn't allow us to explore optimization opportunities.
The `float`, on the other hand, is just a double-precision IEEE 754 floating-point number, which can exactly represent every 52-bit integer (and, in fact, every integer up to $2^{53}$)!
Thus, we can convert our arithmetic to use `float`s, as long as we guarantee that no intermediate result exceeds the 52-bit limit.

In [None]:
from typing import Generator

# 2**52 - 1: every integer of at most this magnitude is exactly representable
# as an IEEE 754 double.
LARGEST_INTEGRAL_FLOAT: float = 4503599627370495.0


def rabin_karp_floats(
    s: str,
    window_width: int,
    multiplier: int,
    modulo: int,
    alphabet_size: int = 256,
) -> Generator[int, None, None]:
    """Yield the rolling polynomial hash of every length-`window_width` substring of `s`,
    doing all arithmetic in double-precision floats.

    Every character contributes the term `ord(char) + 1`. Nothing is yielded
    when `s` is shorter than `window_width`. Results are converted back to
    `int` before being yielded.

    The parameters are validated with assertions, which fire on the first
    `next()` call. In particular, `multiplier`, `modulo` and `alphabet_size`
    must be small enough that both intermediate products stay within
    `LARGEST_INTEGRAL_FLOAT`:

    - `hash * multiplier + term`, used when appending a character, and
    - `term * multiplier ** (window_width - 1) % modulo`, used when discarding one.

    Only the characters of the first window are checked against `alphabet_size`.
    """

    assert window_width > 0, "Window width must be positive"
    assert multiplier > 0, "Multiplier must be positive"
    assert modulo > 0, "Modulo must be positive"
    assert multiplier < modulo, "Multiplier must be less than modulo"

    if len(s) < window_width:
        return

    multiplier = float(multiplier)
    modulo = float(modulo)
    assert (
        modulo < LARGEST_INTEGRAL_FLOAT
    ), "Modulo can't exceed the largest integral float value"

    # Ensure, we won't overflow the floating-point representation
    largest_post_modulo = modulo - 1
    max_possible_term = alphabet_size
    assert (
        largest_post_modulo * multiplier + max_possible_term <= LARGEST_INTEGRAL_FLOAT
    ), "Will overflow"
    # The discard step multiplies a character term (up to `alphabet_size`) by a
    # reduced power of the multiplier (up to `modulo - 1`). That product is not
    # covered by the check above whenever `multiplier < alphabet_size`.
    assert (
        largest_post_modulo * max_possible_term <= LARGEST_INTEGRAL_FLOAT
    ), "Will overflow when discarding the outgoing character"

    # All of the operations will happen with a modulo:
    def mul_mod(a: float, b: float) -> float:
        return (a * b) % modulo

    def add_mod(a: float, b: float) -> float:
        return (a + b) % modulo

    def sub_mod(a: float, b: float) -> float:
        return (a - b) % modulo

    # Precompute the discarding multiplier: multiplier ** (window_width - 1) % modulo
    discarding_multiplier: float = 1.0
    for _ in range(window_width - 1):
        discarding_multiplier = mul_mod(discarding_multiplier, multiplier)

    # Handle the first window - without dropping any characters
    current_hash: float = 0.0
    for char in s[:window_width]:
        new_term = float(ord(char) + 1)
        assert new_term <= alphabet_size, "Pass correct `alphabet_size`"
        current_hash = add_mod(mul_mod(current_hash, multiplier), new_term)
    yield int(current_hash)

    # Roll through the rest of the string
    total_hashes = len(s) - window_width + 1
    for i in range(1, total_hashes):  # First hash is already yielded
        old_term = float(ord(s[i - 1]) + 1)
        new_term = float(ord(s[i + window_width - 1]) + 1)

        # Remove leftmost char and add the new rightmost one.
        current_hash = sub_mod(current_hash, mul_mod(old_term, discarding_multiplier))
        current_hash = add_mod(mul_mod(current_hash, multiplier), new_term)
        yield int(current_hash)


# Sanity check: rolling across a string must reproduce the hash of every
# window when that window is hashed from scratch.
assert [
    next(rabin_karp_floats(window, 3, 31, 1_000_000_007)) for window in ("abc", "bcd")
] == list(rabin_karp_floats("abcd", 3, 31, 1_000_000_007))
assert [
    next(rabin_karp_floats(window, 17, 31, 65521))
    for window in ("abcdefdhijklmnopq", "bcdefdhijklmnopqr")
] == list(rabin_karp_floats("abcdefdhijklmnopqr", 17, 31, 65521))

Let's load some data and ensure that the outputs are identical between the `int` and `float` implementations.

In [None]:
from pathlib import Path

# Directory that holds the dataset files loaded by the cells below.
dataset_directory = Path("..")

In [None]:
textual_dataset_path = dataset_directory / "leipzig1M.txt"
# `Path.read_text` opens the file in text mode and closes it again; a bare
# `open(...).read()` would leave the file handle open.
textual_dataset = textual_dataset_path.read_text().strip()

In [None]:
# One document per line; report how many were loaded and their mean length in characters.
textual_lines = textual_dataset.split("\n")
print(f"Loaded {len(textual_lines):,} lines of mean length {sum(len(line) for line in textual_lines) / len(textual_lines):.2f} characters")

In [None]:
def compare_hashes(line, make_baseline_generator, make_test_generator):
    """Hash `line` with two hasher factories and report any disagreement.

    Both factories are called with `line` and must return an iterable of
    hashes. Each iterable is fully consumed. When the two sequences differ,
    both are printed.

    Returns `True` when the sequences are identical and `False` otherwise, so
    callers can tell a pass from a failure instead of relying on the printout.
    """
    int_hashes = list(make_baseline_generator(line))
    float_hashes = list(make_test_generator(line))
    hashes_match = int_hashes == float_hashes
    if not hashes_match:
        print(f"Int Hashes:   {int_hashes}")
        print(f"Float Hashes: {float_hashes}")
    return hashes_match


# Cross-check the integer and float implementations on the first two lines.
for line in textual_lines[:2]:
    compare_hashes(
        line,
        lambda l: rabin_karp_ints(l, window_width=17, multiplier=31, modulo=65521),
        lambda l: rabin_karp_floats(l, window_width=17, multiplier=31, modulo=65521),
    )

A bigger question now is: will the same hold if we use much larger modulo values?

In [None]:
LARGEST_MODULO_SAFE_MODULO = 4503599626977

for window_width in [3, 17, 64]:
    # Both implementations receive exactly the same parameters.
    hasher_arguments = dict(
        window_width=window_width,
        multiplier=257,
        modulo=LARGEST_MODULO_SAFE_MODULO,
    )
    for line in textual_lines[:50]:
        compare_hashes(
            line,
            lambda l: rabin_karp_ints(l, **hasher_arguments),
            lambda l: rabin_karp_floats(l, **hasher_arguments),
        )
    print(f"Passed for window width: {window_width}!")

## Rabin-Karp Rolling Hashing via FMAs

- How aggressively can we use **FMA** (Fused Multiply-Add) operations to optimize the algorithm?
- How many of the modulo operations can we avoid?
- How can we simplify the `%` modulo operation?

In [None]:
import math
from typing import Generator

# 2**52 - 1: every integer of at most this magnitude is exactly representable
# as an IEEE 754 double.
LARGEST_INTEGRAL_FLOAT: float = 4503599627370495.0


def rabin_karp_fma(
    s: str,
    window_width: int,
    multiplier: int,
    modulo: int,
    alphabet_size: int = 256,
) -> Generator[int, None, None]:
    """Yield the rolling polynomial hash of every length-`window_width` substring of `s`,
    structured around multiply-add steps followed by a Barrett-style reduction.

    Every character contributes the term `ord(char) + 1`. Nothing is yielded
    when `s` is shorter than `window_width`. Results are converted back to
    `int` before being yielded.

    Each step is written as `a * b + c`, the shape of a Fused-Multiply-Add.
    In this Python prototype the product and the sum are two separate float
    operations, so both must stay exactly representable. The assertions below
    reject parameter combinations for which either could leave the range
    bounded by `LARGEST_INTEGRAL_FLOAT`; they fire on the first `next()` call.

    Only the characters of the first window are checked against `alphabet_size`.
    """

    assert window_width > 0, "Window width must be positive"
    assert multiplier > 0, "Multiplier must be positive"
    assert modulo > 0, "Modulo must be positive"
    assert multiplier < modulo, "Multiplier must be less than modulo"

    if len(s) < window_width:
        return

    multiplier = float(multiplier)
    modulo = float(modulo)
    assert (
        modulo < LARGEST_INTEGRAL_FLOAT
    ), "Modulo can't exceed the largest integral float value"

    # Ensure, we won't overflow the floating-point representation
    largest_post_modulo = modulo - 1
    max_possible_term = alphabet_size
    assert (
        largest_post_modulo * multiplier + max_possible_term <= LARGEST_INTEGRAL_FLOAT
    ), "Will overflow"
    # The discard step computes `old_term * -discarding_multiplier + hash`,
    # whose magnitude can reach `alphabet_size * (modulo - 1)`. That product is
    # not covered by the check above whenever `multiplier < alphabet_size`.
    assert (
        largest_post_modulo * max_possible_term <= LARGEST_INTEGRAL_FLOAT
    ), "Will overflow when discarding the outgoing character"

    inverse_modulo: float = 1.0 / modulo

    # Barrett reduction function
    # It will be used to reduce the intermediate results to the modulo range
    def barrett_mod(x: float) -> float:
        # Estimate the quotient by multiplying with the precomputed reciprocal.
        q = math.floor(x * inverse_modulo)
        result = x - q * modulo
        # The reciprocal is rounded, so the estimate may be off by one either way.
        if result >= modulo:
            result -= modulo
        elif result < 0:
            result += modulo
        # Cross-check against the built-in `%` while prototyping.
        assert result == (x % modulo), "Barrett reduction failed"
        return result

    # All of the operations will happen with a modulo:
    def fma_mod(a: float, b: float, c: float) -> float:
        intermediate = a * b + c
        # The discard step yields negative intermediates, so check the magnitude.
        assert (
            abs(intermediate) <= LARGEST_INTEGRAL_FLOAT
        ), "FMA did exceed integral range"
        return barrett_mod(intermediate)

    # Precompute the discarding multiplier: multiplier ** (window_width - 1) % modulo
    negative_discarding_multiplier: float = 1.0
    for _ in range(window_width - 1):
        negative_discarding_multiplier = fma_mod(
            negative_discarding_multiplier, multiplier, 0.0
        )
    negative_discarding_multiplier = (
        -negative_discarding_multiplier
    )  # Negate for FMA compatibility

    # Handle the first window - without dropping any characters
    current_hash: float = 0.0
    for char in s[:window_width]:
        new_term = float(ord(char) + 1)
        assert new_term <= alphabet_size, "Pass correct `alphabet_size`"
        current_hash = fma_mod(current_hash, multiplier, new_term)
    yield int(current_hash)

    # Roll through the rest of the string
    total_hashes = len(s) - window_width + 1
    for i in range(1, total_hashes):  # First hash is already yielded
        old_term = float(ord(s[i - 1]) + 1)
        new_term = float(ord(s[i + window_width - 1]) + 1)

        # Remove leftmost char and add the new rightmost one.
        current_hash = fma_mod(old_term, negative_discarding_multiplier, current_hash)
        # `barrett_mod` has already folded the value back into [0, modulo).
        assert 0 <= current_hash < modulo, "Reduced hash must lie in [0, modulo)"
        current_hash = fma_mod(current_hash, multiplier, new_term)
        assert current_hash >= 0, "Current hash should not be negative"
        yield int(current_hash)


# Sanity check: rolling across a string must reproduce the hash of every
# window when that window is hashed from scratch.
assert [
    next(rabin_karp_fma(window, 3, 31, 1_000_000_007)) for window in ("abc", "bcd")
] == list(rabin_karp_fma("abcd", 3, 31, 1_000_000_007))
assert [
    next(rabin_karp_fma(window, 17, 31, 65521))
    for window in ("abcdefdhijklmnopq", "bcdefdhijklmnopqr")
] == list(rabin_karp_fma("abcdefdhijklmnopqr", 17, 31, 65521))

In [None]:
LARGEST_MODULO_SAFE_MODULO = 4503599626977

for window_width in [3, 17, 64]:
    # Both implementations receive exactly the same parameters.
    hasher_arguments = dict(
        window_width=window_width,
        multiplier=257,
        modulo=LARGEST_MODULO_SAFE_MODULO,
    )
    for line in textual_lines[:50]:
        compare_hashes(
            line,
            lambda l: rabin_karp_ints(l, **hasher_arguments),
            lambda l: rabin_karp_fma(l, **hasher_arguments),
        )
    print(f"Passed for window width: {window_width}!")

Now that we can handle typical texts, let's try several tricky inputs... where we'll be on the brink of an overflow! Some uncomfortable character values are: `\x00`, `\x01`, `\x7F`, `\xFF`. To really stress-test, let's pick the largest prime number below `LARGEST_INTEGRAL_FLOAT` that can be used safely for a given alphabet size.

In [None]:
from typing import Final, List

# Fixed witnesses that make Miller-Rabin exact for n < 2**64
MR_BASES: Final[List[int]] = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37]


def _is_prime_64(n: int) -> bool:
    """Exact primality for 0 < n < 2**64."""
    if n < 2:
        return False
    # Quick reject: small prime factors
    for p in MR_BASES:  # covers all primes ≤ 37
        if n == p:
            return True
        if n % p == 0:
            return False

    # Write n-1 = d · 2ˢ  with d odd
    d, s = n - 1, 0
    while d & 1 == 0:
        d >>= 1
        s += 1

    # Strong-probable-prime test for each base
    for a in MR_BASES:
        x = pow(a, d, n)
        if x in (1, n - 1):  # self-loop or −1 ⇒ may be prime
            continue
        for _ in range(s - 1):  # square until −1 or cycle
            x = pow(x, 2, n)
            if x == n - 1:
                break
        else:  # never hit −1 ⇒ composite
            return False
    return True


def largest_prime_below(n: int) -> int:
    """
    Return the largest prime strictly less than n (n must be > 2).
    Average cost: O(log n * log log n) because the prime gap ~ log n.
    """
    if n <= 2:
        raise ValueError("Threshold must exceed 2.")
    n -= n % 2 == 0  # make n odd
    while not _is_prime_64(n):
        n -= 2
    return n


# `int(LARGEST_INTEGRAL_FLOAT)` is 2**52 - 1; look for a prime at the top of that range.
LARGEST_INTEGRAL_FLOAT_PRIME = largest_prime_below(int(LARGEST_INTEGRAL_FLOAT))
print(f"{LARGEST_INTEGRAL_FLOAT_PRIME:,}")  # This will be used for stress-testing

In [None]:
import random

# Degenerate inputs built from the "uncomfortable" character values.
all_0 = "\x00" * 1_000
all_1 = "\x01" * 1_000
all_127 = "\x7f" * 1_000
all_255 = "\xff" * 1_000
all_0_255 = "\x00\xff" * 500  # alternating 0 and 255 characters
all_uncomfortable = "\x00\x01\x7f\xfe\xff" * 250  # all uncomfortable characters

# 10 long random strings with uncomfortable characters
long_random_strings = [
    "".join(random.choices("\x00\x01\x7f\xfe\xff", k=10_000)) for _ in range(10)
]

alphabet_size = 256
multiplier = 257
largest_term = alphabet_size + 1  # in this specific case, same as `multiplier`

# Take a prime just below the threshold computed from the multiplier and the largest term.
large_modulo = largest_prime_below(
    int(LARGEST_INTEGRAL_FLOAT) // multiplier - largest_term
)

for window_width in [3, 17, 64, 707]:
    # Both implementations receive exactly the same parameters.
    hasher_arguments = dict(
        window_width=window_width,
        multiplier=multiplier,
        modulo=large_modulo,
        alphabet_size=alphabet_size,
    )
    for line in [
        all_0,
        all_1,
        all_127,
        all_255,
        all_0_255,
        all_uncomfortable,
        *long_random_strings,
    ]:
        compare_hashes(
            line,
            lambda l: rabin_karp_ints(l, **hasher_arguments),
            lambda l: rabin_karp_fma(l, **hasher_arguments),
        )
    print(f"Passed for window width: {window_width}, modulo: {large_modulo:,}!")

## Min-Hash Fingerprinting

Min-Hash fingerprints transform variable length text representations into **fixed-length vectors**, where each dimension stores the minimum hash value of a certain hash function across the whole document.
It's great for large-scale information retrieval using Hamming Distance or Jaccard Similarity ($|A ∩ B| / |A ∪ B|$) or its weighted alternative.

A potentially more informative alternative is "weighted Min-Hash", which takes into account the frequency of each element in the document. This makes the fingerprints compatible with **TF-IDF**-like algorithms, and makes the system more robust especially for narrow rolling windows.

In [None]:
!pip install tqdm numpy

In [None]:
import numpy as np
from numpy.dtypes import StringDType
from typing import List, Tuple

def count_min_sketch(
    text: str,
    window_widths: List[int],
    multipliers: List[int],
    modulo: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Produces a weighted Min-Hash fingerprint also called a Count-Min Sketch.
    Those sketches are trivial to merge

    https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch

    Dimension `d` of the fingerprint is computed with `rabin_karp_fma` using
    `window_widths[d]` and `multipliers[d]`; both lists must have the same length.

    Returns three arrays with one entry per dimension:

    - hashes (`uint32`): the low 32 bits of the smallest rolling hash,
    - weights (`uint32`): how many windows produced that smallest hash,
    - n-grams (`StringDType`): the substring of `text` at the first window
      that produced the smallest hash.

    When `text` is shorter than a window, that dimension gets the hash
    `0xFFFFFFFF`, weight 0, and `None` is assigned as its n-gram (how
    `StringDType` stores that depends on its coercion settings).
    """

    count_widths = len(window_widths)
    count_multipliers = len(multipliers)
    assert count_widths == count_multipliers, f"{count_widths=} != {count_multipliers=}"

    fingerprint_hashes = np.empty((count_widths,), dtype=np.uint32)
    fingerprint_weights = np.empty((count_widths,), dtype=np.uint32)
    fingerprint_ngrams = np.empty((count_widths,), dtype=StringDType())

    skipped_u32_hash = np.iinfo(np.uint32).max
    skipped_u64_intermediary = np.iinfo(np.uint64).max

    for dim, (width, multiplier) in enumerate(zip(window_widths, multipliers)):
        hasher = rabin_karp_fma(text, window_width=width, multiplier=multiplier, modulo=modulo)

        smallest_hash = skipped_u64_intermediary
        smallest_count = 0
        smallest_example = None
        # `position` is the start offset of the window behind each rolling hash.
        for position, rolling_hash in enumerate(hasher):
            if rolling_hash < smallest_hash:
                smallest_hash = rolling_hash
                smallest_count = 1
                # Slice at the window position, not at the dimension index.
                smallest_example = text[position:position + width]
            elif rolling_hash == smallest_hash:
                smallest_count += 1

        smallest_hash &= skipped_u32_hash  # Ensure we don't exceed the `uint32` range
        fingerprint_hashes[dim] = smallest_hash
        fingerprint_weights[dim] = smallest_count
        fingerprint_ngrams[dim] = smallest_example

    return fingerprint_hashes, fingerprint_weights, fingerprint_ngrams

# Smoke test: a two-dimensional sketch (window widths 3 and 4) of a five-character string.
count_min_sketch("abcde", [3, 4], [257, 258], 4503599626977)

A good set of hyper-parameters for Min-Hashing binary text would be:

- `window_widths`: $\{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 18, 21, 24, 27, 30\}$ - 16 widths
- `alphabet_size`: $256$ for ASCII & binary UTF-8 content
- `ndim`: $16 \ldots 1024$, something like 192 should be great for X/Twitter
- `multipliers`: $\{257, 258, 259, 260, 261, 262, \ldots, 1024 + 256\}$

When processing less usual inputs, like the DNA sequences, parameters may be different, e.g.:

- `window_widths`: $\{3, 6, 9, 12, 15, 30, 60, 120\}$
- `alphabet_size`: $4$ for DNA sequences
- `ndim`: should probably be proportional to $\sqrt{n}$, where $n$ is the typical length of sequences
- `multipliers`: $\{5, 6, 7, 8, 9, \ldots, 4 \cdot n + 1\}$

In every case, the `modulo` should be coprime with every multiplier.
The easiest option is to use a large prime, that can be obtained via:

```python
largest_prime_below(int(LARGEST_INTEGRAL_FLOAT) // max(multipliers) - (alphabet_size + 1))
```

In [None]:
import numpy as np
from typing import Tuple


def jaccard_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the fraction of fingerprint positions at which `a` and `b` are equal.

    Raises `ValueError` when the two fingerprints differ in shape.
    """
    if a.shape == b.shape:
        matching_positions = a == b
        return float(np.mean(matching_positions))

    raise ValueError("Fingerprints must have identical length")


def weighted_jaccard_similarity(
    a: Tuple[np.ndarray, np.ndarray],
    b: Tuple[np.ndarray, np.ndarray],
) -> float:
    """Return a weighted similarity between two `(hashes, weights)` fingerprints.

    The score is `I / (A + B - I)`, where `A` and `B` are the sums of squared
    weights of each fingerprint and `I` is the sum of `weight_a * weight_b`
    over the positions whose hashes match.

    Weights are converted to `float64` before multiplying, so products of
    large `uint32` counts cannot wrap around. When both fingerprints carry no
    weight at all, the denominator is zero and `0.0` is returned.

    Raises `ValueError` when the hash arrays or the weight arrays differ in shape.
    """
    hashes_a, weights_a = a
    hashes_b, weights_b = b

    if hashes_a.shape != hashes_b.shape or weights_a.shape != weights_b.shape:
        raise ValueError("Both fingerprints must have identical dimensions")

    wide_weights_a = np.asarray(weights_a, dtype=np.float64)
    wide_weights_b = np.asarray(weights_b, dtype=np.float64)

    magnitude_i = float((wide_weights_a * wide_weights_b)[hashes_a == hashes_b].sum())
    magnitude_a = float((wide_weights_a * wide_weights_a).sum())
    magnitude_b = float((wide_weights_b * wide_weights_b).sum())
    magnitude_u = magnitude_a + magnitude_b - magnitude_i

    if magnitude_u == 0.0:
        return 0.0
    return magnitude_i / magnitude_u

Let's compute the rolling fingerprints:

In [None]:
textual_dataset_path = dataset_directory / "leipzig1M.txt"
# `Path.read_text` opens the file in text mode and closes it again; a bare
# `open(...).read()` would leave the file handle open.
textual_dataset = textual_dataset_path.read_text().strip()
textual_lines = textual_dataset.split("\n")
print(f"Loaded {len(textual_lines):,} lines of mean length {sum(len(line) for line in textual_lines) / len(textual_lines):.2f} characters")

In [None]:
from tqdm import tqdm

# 192 fingerprint dimensions: one multiplier per dimension, with the 16 base
# window widths repeated until the list is equally long.
multipliers = [256 + offset for offset in range(192)]
window_widths = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 18, 21, 24, 27, 30]
window_widths = window_widths * (192 // len(window_widths))
LARGEST_MODULO_SAFE_MODULO = 4503599626977

# Parallel lists with one entry per fingerprinted line.
fingerprint_hashes, fingerprint_counts, fingerprint_ngrams = [], [], []

DATASET_SIZE_LIMIT = 10_000

for line in tqdm(textual_lines[:DATASET_SIZE_LIMIT], desc="Fingerprinting lines", unit="line"):
    hashes, counts, ngrams = count_min_sketch(
        line,
        window_widths,
        multipliers,
        LARGEST_MODULO_SAFE_MODULO,
    )
    fingerprint_hashes.append(hashes)
    fingerprint_counts.append(counts)
    fingerprint_ngrams.append(ngrams)

Let's estimate Recall @ 1, but before we do that - let's find a way to highlight N-gram matches between strings.

In [None]:
# ANSI 256-colour escape sequences, cycled through for matching n-grams.
COLOR_ARRAY = [
    "\033[38;5;196m",  # red
    "\033[38;5;208m",  # orange
    "\033[38;5;226m",  # yellow
    "\033[38;5;082m",  # green
    "\033[38;5;039m",  # blue
    "\033[38;5;201m",  # magenta
    "\033[38;5;129m",  # purple
]

# Sometimes hashes match, but the n-grams are different
COLOR_COLLISION = "\033[38;5;244m"  # grey
COLOR_RESET = "\033[0m"

def color_code_matches(
    query_text: str,
    document_text: str,
    query_hashes: np.ndarray,
    document_hashes: np.ndarray,
    query_ngrams: np.ndarray,
    document_ngrams: np.ndarray,
) -> Tuple[str, str]:
    """Highlight the n-grams two fingerprints agree on, using ANSI colours.

    For every fingerprint dimension:

    - if the n-grams are equal, every occurrence of that n-gram is wrapped in
      the next colour of `COLOR_ARRAY`, in both texts;
    - otherwise, if only the hashes are equal (a collision), each text gets
      its *own* n-gram wrapped in `COLOR_COLLISION`;
    - otherwise the dimension is skipped.

    Returns the decorated `(query_text, document_text)` pair.
    """

    color_index = 0
    for dim in range(len(query_hashes)):
        query_ngram = query_ngrams[dim]
        document_ngram = document_ngrams[dim]

        if query_ngram == document_ngram:
            color = COLOR_ARRAY[color_index % len(COLOR_ARRAY)]
            color_index += 1
        elif query_hashes[dim] == document_hashes[dim]:
            color = COLOR_COLLISION
        else:
            continue

        # Each text keeps its own characters; only the colour codes are added.
        query_text = query_text.replace(
            query_ngram, f"{color}{query_ngram}{COLOR_RESET}"
        )
        document_text = document_text.replace(
            document_ngram, f"{color}{document_ngram}{COLOR_RESET}"
        )

    return query_text, document_text

In [None]:
from tqdm import tqdm

QUERIES_TO_COMPARE = 100

for i, query_hashes, query_counts, query_ngrams in tqdm(zip(
    range(QUERIES_TO_COMPARE),
    fingerprint_hashes[:QUERIES_TO_COMPARE],
    fingerprint_counts[:QUERIES_TO_COMPARE],
    fingerprint_ngrams[:QUERIES_TO_COMPARE],
), desc="Searching", unit="doc", total=QUERIES_TO_COMPARE):

    # Compare with all other fingerprints
    best_score, best_index = 0.0, -1
    for j, dataset_hashes in enumerate(fingerprint_hashes):
        if i == j:
            continue

        score = jaccard_similarity(query_hashes, dataset_hashes)
        if score > best_score:
            best_score = score
            best_index = j

    # Without this guard, `best_index == -1` would silently select the last
    # line and the last fingerprint, which do not even belong together.
    if best_index < 0:
        print(f"No match found for query {i:,}")
        continue

    query = textual_lines[i]
    doc = textual_lines[best_index]
    colored_query, colored_doc = color_code_matches(
        query_text=query,
        document_text=doc,
        query_hashes=query_hashes,
        document_hashes=fingerprint_hashes[best_index],
        query_ngrams=query_ngrams,
        document_ngrams=fingerprint_ngrams[best_index],
    )
    print(f"Matched query {i:,} with document {best_index:,} with score {best_score:.4f}")
    print(f"- {colored_query}")
    print(f"- {colored_doc}")

In [None]:
from tqdm import tqdm

QUERIES_TO_COMPARE = 100

for i, query_hashes, query_counts, query_ngrams in tqdm(
    zip(
        range(QUERIES_TO_COMPARE),
        fingerprint_hashes[:QUERIES_TO_COMPARE],
        fingerprint_counts[:QUERIES_TO_COMPARE],
        fingerprint_ngrams[:QUERIES_TO_COMPARE],
    ),
    desc="Searching",
    unit="doc",
    total=QUERIES_TO_COMPARE,
):

    # Compare with all other fingerprints
    best_score, best_index = 0.0, -1
    for j, (dataset_hashes, dataset_counts) in enumerate(
        zip(fingerprint_hashes, fingerprint_counts)
    ):
        if i == j:
            continue

        # The scorer takes two `(hashes, weights)` pairs; the weights are the
        # per-dimension counts, not the n-grams.
        score = weighted_jaccard_similarity(
            (query_hashes, query_counts),
            (dataset_hashes, dataset_counts),
        )
        if score > best_score:
            best_score = score
            best_index = j

    # Without this guard, `best_index == -1` would silently select the last
    # line and the last fingerprint, which do not even belong together.
    if best_index < 0:
        print(f"No match found for query {i:,}")
        continue

    query = textual_lines[i]
    doc = textual_lines[best_index]
    colored_query, colored_doc = color_code_matches(
        query_text=query,
        document_text=doc,
        query_hashes=query_hashes,
        document_hashes=fingerprint_hashes[best_index],
        query_ngrams=query_ngrams,
        document_ngrams=fingerprint_ngrams[best_index],
    )
    print(
        f"Matched query {i:,} with document {best_index:,} with score {best_score:.4f}"
    )
    print(f"- {colored_query}")
    print(f"- {colored_doc}")

## Min-Hash Fingerprinting DNA & Protein Sequences

In [None]:
dna_dataset_path = dataset_directory / "acgt_10k.txt"
# `Path.read_text` opens the file in text mode and closes it again; a bare
# `open(...).read()` would leave the file handle open.
dna_dataset = dna_dataset_path.read_text().strip()