In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.express as px

from scipy.special import comb

In [2]:
def compute_p0(N: int, K: int, n: int) -> float:
    """Probability that object o_i is absent from a sample drawn without replacement.

    This is the probability mass function, evaluated at 0, of the count of o_i
    under (multivariate) hypergeometric sampling: n items are drawn without
    replacement from a finite population of N items, exactly K of which are o_i.
    It is computed as C(N - K, n) / C(N, n) using exact integer binomials.

    No input validation is performed: if C(N, n) evaluates to 0 the division
    raises ZeroDivisionError.

    Args:
        N (int): Population size
        K (int): Number of occurrences of object o_i in the population
        n (int): Sample size

    Returns:
        float: probability of drawing o_i zero times
    """
    # Number of size-n samples made up only of the N - K items that are not o_i.
    samples_missing_object = comb(N - K, n, exact=True)
    # Number of size-n samples that can be drawn from the whole population.
    all_samples = comb(N, n, exact=True)
    return samples_missing_object / all_samples

In [3]:
# Alternative scenario labels that embed the frequency vector in the name
# (disabled; the shorter labels below are the ones in use).
# frequencies = {
#     'No Skew [2,2,2,2]': [2, 2, 2, 2],
#     'Medium Skew [3,3,1,1]': [3, 3, 1, 1],
#     'Large Skew [5,1,1,1]': [5, 1, 1, 1],
# }
# Scenario name -> occurrences of each distinct object in the population.
# Every scenario has 4 distinct objects and 8 items in total; only the way the
# 8 items are spread across the objects changes.
frequencies = {
    'No Skew': [2, 2, 2, 2],
    'Medium Skew': [3, 3, 1, 1],
    'Large Skew': [5, 1, 1, 1],
}

# Sample size (number of draws) used for every scenario.
batch_size = 4

In [4]:
def expected_occurrences_in_sample(population_size, sample_size, object_occurrences):
    """Hypergeometric mean count of an object plus the probability it is absent.

    Returns ``sample_size * object_occurrences / population_size + p0``, where
    p0 is the value given by ``compute_p0`` for the same population, object
    count and sample size. For a hypergeometric count X this sum equals
    E[max(X, 1)], i.e. the expected count when an absent object is still
    counted once.
    NOTE(review): presumably that "at least one" reading is the intended
    quantity — confirm against the analysis this notebook supports.

    Args:
        population_size (int): Population size (N)
        sample_size (int): Sample size (n)
        object_occurrences (int): Occurrences of the object in the population (K)

    Returns:
        float: n * K / N + p0
    """
    # Evaluated first, as in the original statement order.
    prob_absent = compute_p0(population_size, object_occurrences, sample_size)

    mean_count = sample_size * object_occurrences / population_size
    return mean_count + prob_absent

In [5]:
# Scenario name -> list with one value per distinct object, in the same order
# as the scenario's frequency list.
expected_occurrences = {}

for scenario, counts in frequencies.items():
    # The population is the concatenation of all objects' occurrences.
    population_size = sum(counts)

    expected_occurrences[scenario] = [
        expected_occurrences_in_sample(population_size, batch_size, count)
        for count in counts
    ]

In [6]:
# Inspect the per-object values computed for each skew scenario.
expected_occurrences

{'No Skew': [1.2142857142857142,
  1.2142857142857142,
  1.2142857142857142,
  1.2142857142857142],
 'Medium Skew': [1.5714285714285714, 1.5714285714285714, 1.0, 1.0],
 'Large Skew': [2.5, 1.0, 1.0, 1.0]}

In [7]:
# Flatten the per-scenario lists into long-format records (one dict per bar):
# one record per object, labelled "Sample 1", "Sample 2", ..., followed by a
# "Total" record holding the sum over that scenario's objects.
samples = []

for scenario in frequencies:
    per_object = expected_occurrences[scenario]

    samples.extend(
        {'skew': scenario, 'occ': value, 'sample': f"Sample {position}"}
        for position, value in enumerate(per_object, start=1)
    )
    samples.append({'skew': scenario, 'occ': sum(per_object), 'sample': "Total"})

In [8]:
# Long-format table with columns 'skew', 'occ' and 'sample' (one row per record).
df = pd.DataFrame(samples)

In [9]:
# Display the assembled frame as a sanity check before plotting.
df

Unnamed: 0,skew,occ,sample
0,No Skew,1.214286,Sample 1
1,No Skew,1.214286,Sample 2
2,No Skew,1.214286,Sample 3
3,No Skew,1.214286,Sample 4
4,No Skew,4.857143,Total
5,Medium Skew,1.571429,Sample 1
6,Medium Skew,1.571429,Sample 2
7,Medium Skew,1.0,Sample 3
8,Medium Skew,1.0,Sample 4
9,Medium Skew,5.142857,Total


In [18]:
# Base layout: zero margin on all four sides and automatic sizing.
layout = go.Layout(
    font=dict(size=20),
    margin=go.layout.Margin(l=0, r=0, b=0, t=0),  # left, right, bottom, top
    autosize=True,
)

# Alternative view (disabled): group bars by sample label and colour by skew.
# fig = px.bar(df, x='sample', y='occ', color='skew', barmode = 'group', orientation='v',
#     color_discrete_sequence=px.colors.qualitative.D3)

# Grouped vertical bars: one group per skew scenario, one bar per `sample`
# label, with the label encoded by both colour and fill pattern.
fig = px.bar(
    df,
    x='skew',
    y='occ',
    color='sample',
    pattern_shape='sample',
    barmode='group',
    orientation='v',
    color_discrete_sequence=px.colors.qualitative.T10,
)

fig.update_layout(layout)

# Applied after the base layout, so the figure-wide font size ends up at 26
# (overriding the 20 set above); the legend title is blanked.
fig.update_layout(font=dict(size=26), legend_title_text='')

# No x-axis title; descriptive y-axis title.
fig.update_xaxes(title='')
fig.update_yaxes(title='Expected occurrences')

# Legend anchored at x=0.0, y=0.95 with a fully transparent background and a
# larger font than the rest of the figure.
fig.update_layout(
    legend=dict(
        x=0.0,
        y=0.95,
        bgcolor="rgba(0,0,0,0)",
        font=dict(size=30),
    )
)

fig.show()

# Static export: 999x450 logical pixels rendered at 3x scale.
fig.write_image('num_dups_distribution_.png', width=999, height=450, scale=3)