In [None]:
import hashlib

def trailing_zeros(x, bit_width=32):
    """Count the trailing zero bits in the binary representation of ``x``.

    Args:
        x: Integer to inspect.
        bit_width: Value returned when ``x`` is 0. Zero has no set bit, so
            its trailing-zero count is taken to be the nominal width of the
            integers being inspected. Defaults to 32 for backward
            compatibility; pass e.g. 128 when ``x`` comes from a full MD5
            digest.

    Returns:
        The number of consecutive zero bits at the least-significant end of
        ``x``, or ``bit_width`` if ``x`` is 0.
    """
    if x == 0:
        return bit_width
    # ``x & -x`` isolates the lowest set bit (two's-complement identity), and
    # the position of that single bit is its bit length minus one.
    return (x & -x).bit_length() - 1

def flajolet_martin_for_dataset(dataset, num_hashes):
    """Estimate the distinct-element count of ``dataset`` once per hash.

    Each of the ``num_hashes`` hash functions is MD5 applied to the item's
    string form followed by the hash index as a decimal suffix. For every
    hash function the largest trailing-zero count seen over all items is
    tracked, and the estimate for that function is 2 raised to that count.

    Args:
        dataset: Iterable of items; each is converted with ``str()``.
        num_hashes: Number of salted hash functions to use.

    Returns:
        A list of ``num_hashes`` integer estimates, one per hash function.
        With an empty dataset every estimate is ``2 ** 0 == 1``.
    """
    # The suffix that distinguishes hash function k never changes, so build
    # the encoded suffixes once up front.
    salts = [str(index).encode() for index in range(num_hashes)]
    longest_runs = [0] * num_hashes

    for item in dataset:
        payload = str(item).encode()
        for slot, salt in enumerate(salts):
            digest = hashlib.md5(payload + salt).digest()
            zeros = trailing_zeros(int.from_bytes(digest, byteorder='big'))
            if zeros > longest_runs[slot]:
                longest_runs[slot] = zeros

    return [2 ** run for run in longest_runs]

# Reading the dataset from a file
def read_dataset_from_file(filename):
    """Load a text file as a list of whitespace-stripped lines.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        A list with one string per line of the file, in file order, with
        leading and trailing whitespace removed. Blank lines are kept as
        empty strings.
    """
    with open(filename, 'r') as handle:
        # Materialize inside the ``with`` so the file is fully read before
        # it is closed.
        return list(map(str.strip, handle))

# Script entry point: run the estimator on one dataset file and print the
# estimate produced by each hash function.
if __name__ == "__main__":
    # Path of the input file; edit this to point at your own dataset.
    dataset_filename = "/content/sample_data/WhatsgoodlyData-6.csv"  # Change this to your dataset file name
    num_hashes = 5  # Number of hash functions to use

    # Read the dataset from the file. Every line becomes one item, so a CSV
    # header row (if the file has one) is counted as an item too.
    dataset = read_dataset_from_file(dataset_filename)

    # Apply Flajolet-Martin algorithm: one independent estimate per hash.
    estimations = flajolet_martin_for_dataset(dataset, num_hashes)

    # Report the estimates with 1-based hash numbering.
    print("Estimated distinct elements:")
    for i, est in enumerate(estimations):
        print(f"Hash {i + 1}: {est}")


Estimated distinct elements:
Hash 1: 131072
Hash 2: 512
Hash 3: 1024
Hash 4: 16384
Hash 5: 256
