# Task 4

a) First, we define a function that returns all shingles of size k contained in the given string.

In [None]:
def compute_k_shingles(string, k):
    """
    Compute the k-shingles of a string of decimal digits.

    Every window of k consecutive characters is read as a decimal number
    (so leading zeros are dropped, e.g. "0020" becomes 20). Duplicate
    shingles are reported once.

    Args:
        string (str): The input string that represents a sequence of digits.
        k (int): The size of the shingles.

    Returns:
        List[int]: An ordered list of positions of 1's in the Boolean vector,
        i.e. the distinct shingle values in ascending order.

    Raises:
        ValueError: If k is not in the range 1..len(string), or if the string
            contains anything other than the digits 0-9.
    """
    if k <= 0 or k > len(string):
        raise ValueError("k must be a positive integer less than or equal to the length of the string.")

    # int() tolerates surrounding whitespace and a leading sign, so a window
    # such as "1 " or "+1" would silently be accepted as a shingle. Reject
    # non-digit input up front instead of producing misleading positions.
    if not (set(string) <= set("0123456789")):
        raise ValueError("string must contain only the decimal digits 0-9.")

    # A set removes repeated shingles; sorting gives a deterministic order.
    k_shingles = {int(string[i:i + k]) for i in range(len(string) - k + 1)}

    return sorted(k_shingles)

Run the function on a few example inputs.

In [None]:
# Shingle size shared by all example inputs below.
k = 4
input_strings = ["1234567", "2024", "0002024", "00000", "1010101"]

# Print the shingle positions for each example, followed by a blank line.
for input_string in input_strings:
    result = compute_k_shingles(input_string, k)
    print(
        f"Input string: {input_string}, k: {k}",
        f"Resulting k-shingles positions: {result}",
        "",
        sep="\n",
    )

b) We obtain the digits of pi from the mpmath package and compute their shingles using the function from a).
Note: the string representation of pi in mpmath begins with "3.", so we skip the first two characters and keep only the decimal digits.

In [None]:
from mpmath import mp

# Set mpmath's working precision (in decimal places) before reading pi.
mp.dps = 10002
# Skip the leading "3." and keep the characters up to index 10002,
# i.e. 10,000 decimal digits.
pi_digits = str(mp.pi)[len("3."):10002]

Now we compute the shingles of size 12 and write them to the result file, one position per line.

In [None]:
# Shingle size for part b).
k = 12
k_shingles_positions = compute_k_shingles(pi_digits, k)

# Write one shingle position per line to the result file.
output_file_path = "./Task 4 b results.txt"
with open(output_file_path, "w") as file:
    file.writelines(f"{position}\n" for position in k_shingles_positions)


c) We begin by defining the function that computes the MinHash signature with the hash function from the lecture.

In [None]:
import random

def minhash_signature(positions, num_hashes=5):
    """
    Compute MinHash signature using specified hash functions.

    The first hash function uses the fixed parameters given by the exercise.
    Every further hash function draws its two coefficients at random from
    0..10**12 (via the module-level ``random`` generator, so results are only
    reproducible if the caller seeds it) and takes its modulus from a fixed
    list of four values, which is reused cyclically if more than five hash
    functions are requested.

    Each hash maps a position x to ``((a + x * b) % p) % N + 1`` where N is
    the number of supplied positions, so every signature entry lies in 1..N.

    Args:
        positions (List[int]): List of positions of 1s (k-shingles) in the Boolean vector.
        num_hashes (int): Number of hash functions to use.

    Returns:
        List[int]: List of MinHash values (one per hash function).

    Raises:
        ValueError: If num_hashes is smaller than 1 or positions is empty.
    """
    if num_hashes < 1:
        raise ValueError("num_hashes must be a positive integer.")

    N = len(positions)
    if N == 0:
        # Without this check the failure would surface as an opaque
        # "min() arg is an empty sequence" error further down.
        raise ValueError("positions must contain at least one element.")

    # The hash parameters given by the exercise
    hash_params = [(37, 126, 10**15 + 223)]

    # The additional hash params: one (a, b, p) triple per extra hash function.
    primes = [10**15 + i for i in [37, 91, 159, 187]]
    U = 10**12
    for i in range(num_hashes - 1):
        # Reuse the moduli cyclically once all four have been handed out.
        p = primes[i % len(primes)]
        # Draw a before b to keep the random stream identical to the
        # original five-function behaviour under a fixed seed.
        a = random.randint(0, U)
        b = random.randint(0, U)
        hash_params.append((a, b, p))

    # NOTE(review): the formula adds `a` and multiplies by `b`; the common
    # textbook form is (a * x + b) % p. Confirm against the lecture definition.
    # NOTE(review): N is the number of given positions, not the length of the
    # Boolean vector. Confirm this is the intended range.
    return [
        min(((a + x * b) % p) % N + 1 for x in positions)
        for a, b, p in hash_params
    ]

# Signature of the pi shingles using the default number of hash functions.
minhash_result = minhash_signature(k_shingles_positions)

print("MinHash Signatures:")
# Number the hash functions starting from 1 in the report.
for i, val in enumerate(minhash_result, start=1):
    print(f"Hash Function {i}: {val}")