In [30]:
# Probability of a hash collision: A hash collision occurs when two different inputs produce the same hash output. If we have 10 objects and each one is assigned a hash between 1 and 10 (inclusive) with equal probability, then the probability of no collision for the first object is 10/10. For the second object, there’s a 1 in 10 chance it will collide with the first object’s hash, so the probability of no collision is 9/10. For the third object, 8/10, and so on. The probability of no collision when hashing 10 objects into 10 slots is therefore:
# (10/10 * 9/10 * 8/10 * … * 1/10) =  (10!) / (10^10)

import math

# Probability that 10 objects hashed uniformly into 10 slots all land in
# distinct slots: (10/10 * 9/10 * ... * 1/10) = 10! / 10^10.
p_no_collis = math.factorial(10) / (10**10)

# Complement: probability of at least one collision.
# (The previous version referenced an undefined name `no_collis`, which only
# worked thanks to leftover kernel state and fails on a fresh kernel.)
p_collis = 1 - p_no_collis

p_no_collis, p_collis

(0.00036288, 0.99963712)

In [93]:
from scipy.stats import multinomial
import numpy as np

# Size of the hash table (number of available slots)
n_slots = 10

# Uniform hashing: every slot is hit with the same probability
p = [1 / n_slots for _ in range(n_slots)]

# Target outcome: exactly one object in every slot, i.e. no two objects share a slot
k = [1 for _ in range(n_slots)]

# Multinomial probability of that outcome when n_slots objects are hashed
prob = multinomial.pmf(x=k, n=n_slots, p=p)

print(f"The probability of no collisions {k} is {prob}")

# Complement: at least one slot receives more than one object
print(f"The probability of collisions is {1-prob}")

The probability of no collisions [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] is 0.00036288000000000005
The probability of collisions is 0.99963712


In [103]:
n_objects = 30  # number of objects hashed in each simulated experiment
n_samples = 1_000_000  # number of simulated experiments
seed = 0  # fixed seed so that Restart & Run All reproduces the same sample

# Draw random samples from the multinomial distribution (this is a Monte Carlo
# sample, not an exhaustive enumeration of outcomes). Each row holds the
# per-slot object counts of one experiment.
outcomes = multinomial.rvs(n_objects, p, size=n_samples, random_state=seed)

outcomes[:7]

array([[2, 1, 2, 4, 5, 3, 4, 2, 3, 4],
       [3, 4, 1, 6, 1, 3, 4, 2, 2, 4],
       [1, 4, 2, 4, 1, 2, 2, 3, 8, 3],
       [4, 4, 0, 2, 5, 4, 1, 3, 3, 4],
       [1, 4, 1, 3, 1, 5, 3, 4, 5, 3],
       [1, 4, 2, 2, 5, 5, 3, 4, 3, 1],
       [5, 2, 3, 3, 3, 0, 1, 6, 2, 5]])

In [102]:
# Probability of a hash collision

# One flag per simulated experiment: True when some slot holds more than one object
collision = (outcomes > 1).any(axis=1)
print(collision[~collision].shape)  # experiments without any collision
print(collision[collision].shape)  # experiments with at least one collision

# The mean of the boolean flags is the fraction of experiments with a collision
prob_collision = collision.mean()
print(f"Probability of a hash collision: {prob_collision}")


(0,)
(1000000,)
Probability of a hash collision: 1.0


In [105]:
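# Expected number of unused hashes: This is equivalent to the number of slots that have zero objects hashed to them. The probability that a given slot is unused after one object is hashed is 9/10. After two objects are hashed, the probability that a given slot is still unused is (9/10)^2, and so on. In general, with n_slots slots and n_objects objects, a given slot stays unused with probability ((n_slots-1)/n_slots)^n_objects, so by linearity of expectation the expected number of unused slots is n_slots * ((n_slots-1)/n_slots)^n_objects. For 10 objects in 10 slots this is 10 * (9/10)^10 ≈ 3.487; for the 30 objects simulated in this notebook it is 10 * (9/10)^30 ≈ 0.424.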

# Chance that a single hashed object misses one particular slot
slot_miss_prob = (n_slots - 1) / n_slots

# The slot stays empty only if every one of the n_objects objects misses it
p_unused_slot = pow(slot_miss_prob, n_objects)

# Linearity of expectation: add the per-slot probability over all slots
exp_numb_unused_slots = n_slots * p_unused_slot

(p_unused_slot, exp_numb_unused_slots)

(0.04239115827521624, 0.42391158275216234)

In [106]:
# Expected number of unused hashes

# Mark the empty slots, then count them per simulated experiment (one count per row)
empty_slot_mask = outcomes == 0
unused_hashes = np.count_nonzero(empty_slot_mask, axis=1)
print(unused_hashes)

# Monte Carlo estimate: average number of empty slots over all experiments
mean_unused = unused_hashes.mean()
print(f"Expected number of unused hashes: {mean_unused}")

[0 0 0 ... 0 1 1]
Expected number of unused hashes: 0.42512


In [107]:
# Peek at the first three simulated experiments (rows of per-slot counts)
outcomes[:3, :]

array([[2, 1, 2, 4, 5, 3, 4, 2, 3, 4],
       [3, 4, 1, 6, 1, 3, 4, 2, 2, 4],
       [1, 4, 2, 4, 1, 2, 2, 3, 8, 3]])

In [108]:
# Expected number of hash collisions
#
# Every object beyond the first one in a slot counts as one collision, so a
# slot holding c objects contributes max(c - 1, 0). Computed with vectorised
# numpy operations instead of a Python-level loop over every sampled row.
np.maximum(outcomes - 1, 0).sum(axis=1).mean()

20.42512

In [111]:
# Per-slot excess: every object beyond the first in a slot counts as one collision
excess_per_slot = np.where(outcomes > 1, outcomes - 1, 0)

# Total number of collisions in each simulated experiment
collisions = excess_per_slot.sum(axis=1)

# Monte Carlo estimate: average over all simulated experiments
expected_collisions = collisions.mean()

print(f"Expected number of hash collisions: {expected_collisions}")


Expected number of hash collisions: 20.42512


In [None]:
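# Expected number of hash collisions ---------- to be calculated analytically ------------
# TODO: counting one collision for every object beyond the first in a slot, collisions = n_objects - (number of non-empty slots) = n_objects - n_slots + (number of unused slots), so E[collisions] = n_objects - n_slots + n_slots * ((n_slots-1)/n_slots)^n_objects (≈ 20.424 for 30 objects in 10 slots).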