# The Birthday Paradox

## 1. Generate random sample lists

In [1]:
from random import randint

In [2]:
def generate_samples(num_samples, sample_size, days_in_year=365):
    """Generate random birthday samples.

    Each birthday is an integer day-of-year drawn with ``randint`` from
    1 to ``days_in_year`` inclusive. With the default of 365, leap days
    are not modelled.

    Args:
        num_samples: Number of samples (e.g. classes) to generate.
        sample_size: Number of birthdays in each sample.
        days_in_year: Number of distinct possible birthdays. Defaults to 365.

    Returns:
        A list containing ``num_samples`` lists, each holding
        ``sample_size`` ints. A count of zero (or less) produces an empty
        list at that level.
    """
    return [
        [randint(1, days_in_year) for _ in range(sample_size)]
        for _ in range(num_samples)
    ]

In [3]:
# Generate 10 samples of 45 random birthdays each.
ten_samples = generate_samples(10, 45)

In [4]:
# Display the generated samples (a list of lists of day-of-year ints).
ten_samples

[[252,
  141,
  78,
  224,
  161,
  290,
  226,
  102,
  152,
  66,
  60,
  71,
  237,
  348,
  263,
  73,
  214,
  279,
  31,
  281,
  106,
  91,
  8,
  311,
  70,
  54,
  201,
  75,
  359,
  238,
  264,
  64,
  200,
  192,
  207,
  82,
  312,
  293,
  325,
  81,
  108,
  26,
  87,
  133,
  258],
 [288,
  134,
  325,
  264,
  82,
  74,
  317,
  247,
  206,
  365,
  24,
  167,
  35,
  252,
  147,
  37,
  297,
  83,
  179,
  239,
  113,
  306,
  315,
  210,
  123,
  4,
  161,
  299,
  149,
  326,
  262,
  275,
  37,
  361,
  65,
  73,
  7,
  48,
  17,
  323,
  250,
  188,
  299,
  112,
  13],
 [153,
  20,
  89,
  326,
  130,
  164,
  272,
  47,
  31,
  65,
  41,
  248,
  272,
  152,
  255,
  199,
  49,
  348,
  207,
  97,
  237,
  265,
  122,
  274,
  181,
  1,
  38,
  126,
  216,
  218,
  186,
  74,
  193,
  192,
  185,
  354,
  259,
  118,
  185,
  193,
  21,
  318,
  167,
  188,
  282],
 [157,
  23,
  277,
  57,
  175,
  29,
  348,
  24,
  175,
  81,
  365,
  333,
  250,
  168,
  361

## 2. Check for duplicate birthdays in each sample list

In [5]:
def has_duplicates(sample, print_results=True):
    """Report whether any birthday occurs more than once in a sample.

    Args:
        sample: Iterable of hashable birthday values (day-of-year ints in
            this notebook).
        print_results: When true, print the list of repeated birthdays.

    Returns:
        True if at least one value in ``sample`` repeats, otherwise False.
    """
    seen = set()  # set membership is O(1); checking against a list made this quadratic
    duplicates = []
    for birthday in sample:
        if birthday in seen:
            # Recorded once per repeat occurrence, in the order encountered.
            duplicates.append(birthday)
        else:
            seen.add(birthday)
    if print_results:
        print("Duplicate birthdays: {}".format(duplicates))
    return bool(duplicates)

In [6]:
# generate_samples returns a list of samples even when only one is requested.
one_sample = generate_samples(1, 45)
has_duplicates(one_sample[0]) # index 0 selects the single sample from the nested list

Duplicate birthdays: [229]


True

In [7]:
# Check every sample; has_duplicates prints the repeated birthdays it finds in each.
for sample in ten_samples:
    has_duplicates(sample)

Duplicate birthdays: []
Duplicate birthdays: [37, 299]
Duplicate birthdays: [272, 185, 193]
Duplicate birthdays: [175, 277]
Duplicate birthdays: [271, 279, 302]
Duplicate birthdays: [48, 326, 37]
Duplicate birthdays: [54, 111, 88, 302]
Duplicate birthdays: [136, 196]
Duplicate birthdays: [190, 184, 58]
Duplicate birthdays: [92]


## 3. Run experiments with large sample sizes to find probabilities

The law of large numbers tells us that as the number of samples grows, the observed proportion of an outcome converges to its true probability. By counting how often a duplicate birthday occurs across a large number of randomly generated classes, we can get a close approximation of the probability that at least two students will have the same birthday in **any** given class of that size.

In [8]:
def experiment(sample_sizes, class_size):
    """Estimate by simulation how often a class contains a shared birthday.

    For each entry in ``sample_sizes``, generates that many random classes
    of ``class_size`` students, counts the classes that contain a repeated
    birthday, and prints the count together with its percentage.

    Args:
        sample_sizes: Iterable of ints; the number of classes to simulate
            in each test.
        class_size: Number of students (birthdays) in each simulated class.

    Returns:
        None. Results are printed, one line per test.
    """
    print("Conducting this experiment with sample classes of {} students.\n".format(class_size))
    for test_number, size in enumerate(sample_sizes, start=1):
        # print_results=False: no per-sample output for thousands of samples.
        duplicates = sum(
            1
            for sample in generate_samples(size, class_size)
            if has_duplicates(sample, print_results=False)
        )
        # Scale to a percentage *before* rounding. Rounding the ratio first and
        # then multiplying by 100 re-introduces float error in the printed
        # value (e.g. 93.60000000000001). A size of 0 reports 0.0% instead of
        # dividing by zero.
        duplicates_pct = round(100 * duplicates / size, 2) if size else 0.0
        print("Test: {} | Number of Samples: {} | Duplicates: {} ({}%)".format(test_number, size, duplicates, duplicates_pct))

### 45 Students

In [9]:
# Number of simulated classes per test, increasing tenfold each time.
sample_sizes = [1, 10, 100, 1000, 10000, 100000]
experiment(sample_sizes, 45)

Conducting this experiment with sample classes of 45 students.

Test: 1 | Number of Samples: 1 | Duplicates: 1 (100.0%)
Test: 2 | Number of Samples: 10 | Duplicates: 9 (90.0%)
Test: 3 | Number of Samples: 100 | Duplicates: 94 (94.0%)
Test: 4 | Number of Samples: 1000 | Duplicates: 936 (93.60000000000001%)
Test: 5 | Number of Samples: 10000 | Duplicates: 9361 (93.61%)
Test: 6 | Number of Samples: 100000 | Duplicates: 94097 (94.1%)


With 100,000 samples we get a fairly close approximation of the underlying probability: in a class of 45 students, there's a roughly 94% chance that at least two of the students will have the same birthday.

Here's the same experiment being run with different class sizes:

### 20 Students

In [10]:
# Repeat the experiment with classes of 20 students.
experiment(sample_sizes, 20)

Conducting this experiment with sample classes of 20 students.

Test: 1 | Number of Samples: 1 | Duplicates: 0 (0.0%)
Test: 2 | Number of Samples: 10 | Duplicates: 5 (50.0%)
Test: 3 | Number of Samples: 100 | Duplicates: 38 (38.0%)
Test: 4 | Number of Samples: 1000 | Duplicates: 397 (39.7%)
Test: 5 | Number of Samples: 10000 | Duplicates: 4096 (40.96%)
Test: 6 | Number of Samples: 100000 | Duplicates: 41024 (41.02%)


### 55 Students

In [11]:
# Repeat the experiment with classes of 55 students.
experiment(sample_sizes, 55)

Conducting this experiment with sample classes of 55 students.

Test: 1 | Number of Samples: 1 | Duplicates: 1 (100.0%)
Test: 2 | Number of Samples: 10 | Duplicates: 10 (100.0%)
Test: 3 | Number of Samples: 100 | Duplicates: 99 (99.0%)
Test: 4 | Number of Samples: 1000 | Duplicates: 984 (98.4%)
Test: 5 | Number of Samples: 10000 | Duplicates: 9871 (98.71%)
Test: 6 | Number of Samples: 100000 | Duplicates: 98549 (98.55000000000001%)
