## Probability & Statistics Questions

### References
- https://huyenchip.com/ml-interviews-book

In [None]:
from dataclasses import dataclass
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import *
import pandas as pd

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set()

---

#### Given $n$ samples from a uniform distribution over $[0,d]$, how do you estimate $d$?
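The discrete analogue of this question (integers $1, \dots, d$ sampled without replacement) is known as the [German tank problem](https://en.wikipedia.org/wiki/German_tank_problem); the simulation below uses that discrete setting.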

In [None]:
# An estimator maps an array of observed samples to a single guess for d.
Estimator = Callable[[np.ndarray], float]


class World:
    """
    A population of the integers 1..d from which n values are drawn per sample.

    Estimators are registered by name and can all be evaluated against one
    shared fresh sample via `run_estimators`.
    """

    def __init__(self, *, d: int, n: int, replacement: bool = False):
        # Registry of estimators, keyed by display name (insertion-ordered).
        self.estimators: Dict[str, Estimator] = {}
        self.replacement = replacement
        self.n = n
        self.d = d

    def add_estimator(self, name: str, est: Estimator):
        """Register `est` under `name`, replacing any estimator already using that name."""
        self.estimators[name] = est

    def sample(self) -> np.ndarray:
        """Draw n values from {1, ..., d}, with or without replacement as configured."""
        population = np.arange(1, self.d + 1)
        return np.random.choice(population, size=self.n, replace=self.replacement)

    def run_estimators(self):
        """Draw one sample and return {name: estimate} for every registered estimator."""
        draw = self.sample()
        results = {}
        for label, estimate in self.estimators.items():
            # Every estimator sees the same draw so their outputs are comparable.
            results[label] = estimate(draw)
        return results

def mean_estimator(xs: np.ndarray) -> float:
    """
    Method-of-moments estimate of d from the sample mean.

    Assumes xs are draws from the discrete uniform distribution on
    {1, ..., d} (as produced by World.sample), for which
    E[X] = (d + 1) / 2  =>  d = 2 * E[X] - 1 ~ 2 * mean(xs) - 1.
    Using the continuous approximation E[X] = d/2 instead would
    overestimate d by 1.

    Returns the estimate rounded to an integer.
    """
    # The "- 1" removes the bias coming from the population starting at 1, not 0.
    return np.round(2 * np.mean(xs) - 1).astype(int)
    
def max_estimator(xs: np.ndarray) -> float:
    """
    Estimate d by the largest value observed in xs.

    When sampling with replacement, the estimate misses d with probability
    P(max(xs) != d) = (1 - 1/d) ** len(xs)
    """
    largest = np.max(xs)
    return largest

def umvu_estimator(xs: np.ndarray) -> float:
    """
    Estimate d as m + (m - n) / n, where m = max(xs) and n = len(xs).

    This equals m * (1 + 1/n) - 1, the minimum-variance unbiased estimator
    for the German tank problem (integers sampled without replacement).
    """
    sample_size = len(xs)
    largest = np.max(xs)
    # Push the maximum up by the average unseen gap above it.
    correction = (largest - sample_size) / sample_size
    return largest + correction

In [None]:
# Monte Carlo comparison: run every estimator on N independent samples.
d = 100
n = 20

w = World(d=d, n=n, replacement=False)

w.add_estimator('mean', mean_estimator)
w.add_estimator('max', max_estimator)
w.add_estimator('umvu', umvu_estimator)

# One row per trial, one column per estimator; each trial draws a fresh sample.
N = 10_000
df = pd.DataFrame([w.run_estimators() for _ in range(N)])

In [None]:
# Sanity check: adjacent values in a sorted sample sit roughly d/n apart on average.

d = 300
n = 50

w = World(d=d, n=n, replacement=False)

xs = np.sort(w.sample())
gaps = np.diff(xs)  # xs[i + 1] - xs[i] for every adjacent pair
gaps.mean(), d / n