## Probability & Statistics Questions

Some questions are taken from https://huyenchip.com/ml-interviews-book

In [None]:
from dataclasses import dataclass
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import *
import pandas as pd
from tqdm import trange

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set()

---

#### Given $n$ samples from a uniform distribution over $[0,d]$, how do you estimate $d$?
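The discrete version of this question, where the samples are integers drawn from $\{1, \dots, d\}$, is known as the [German tank problem](https://en.wikipedia.org/wiki/German_tank_problem); that is the version simulated below.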

In [None]:
# An estimator maps an array of observed samples to a single guess for d.
Estimator = Callable[[np.ndarray], float]


class World:
    """Simulated population of the integers 1..d that estimators are run on.

    Each call to `sample` draws `n` values from 1 through `d` (inclusive)
    with `np.random.choice`; `replacement` controls whether the same value
    may be drawn more than once. Estimators are registered by name and are
    all evaluated on one shared draw in `run_estimators`.
    """

    def __init__(self, *, d: int, n: int, replacement: bool = False):
        self.d, self.n, self.replacement = d, n, replacement
        # name -> estimator callable
        self.estimators: Dict[str, Estimator] = {}

    def add_estimator(self, name: str, est: Estimator):
        """Register `est` under `name`, overwriting any previous entry."""
        self.estimators[name] = est

    def sample(self) -> np.ndarray:
        """Draw `n` values from {1, ..., d}."""
        population = np.arange(1, self.d + 1)
        return np.random.choice(
            population, size=self.n, replace=self.replacement
        )

    def run_estimators(self):
        """Take one fresh sample and return {name: estimate} for it."""
        draw = self.sample()
        estimates = {}
        for name, est in self.estimators.items():
            estimates[name] = est(draw)
        return estimates

def mean_estimator(xs: np.ndarray) -> float:
    """
    Method-of-moments estimate of d from the sample mean.

    For values drawn uniformly from the integers {1, ..., d} (what
    `World.sample` produces) E[X] = (d + 1) / 2, so 2 * mean(xs) - 1 is
    unbiased for d. Using 2 * mean(xs) alone overshoots by one on average.

    Returns the estimate rounded to the nearest integer, as a NumPy integer.
    """
    return np.round(2 * np.mean(xs) - 1).astype(int)
    
def max_estimator(xs: np.ndarray) -> float:
    """
    Use the largest observed value as the estimate of d.

    When the values are drawn from {1, ..., d} with replacement, the chance
    that d itself never shows up is (1 - 1/d) ** len(xs).
    """
    largest = np.max(xs)
    return largest

def umvu_estimator(xs: np.ndarray) -> float:
    """
    Minimum-variance unbiased estimate of d for the German tank problem.

    Computes m + (m - n) / n, i.e. m * (1 + 1/n) - 1, where m is the largest
    observed value and n is the number of observations.
    """
    count = len(xs)
    peak = np.max(xs)
    correction = (peak - count) / count
    return peak + correction

In [None]:
d = 100
n = 20

w = World(d=d, n=n, replacement=False)

# Register the competing estimators; the labels become the columns of `df`.
for label, estimator in (
    ('mean', mean_estimator),
    ('max', max_estimator),
    ('umvu', umvu_estimator),
):
    w.add_estimator(label, estimator)

# One row per simulated draw, one column per estimator.
N = 10_000
df = pd.DataFrame([w.run_estimators() for _ in range(N)])

In [None]:
# The mean spacing between consecutive sorted samples is roughly d/n.

d = 300
n = 50

w = World(d=d, n=n, replacement=False)

xs = np.sort(w.sample())
gaps = np.diff(xs)  # xs[i + 1] - xs[i] for every adjacent pair
gaps.mean(), d / n

---

In [None]:
from sympy import primepi
from sympy.ntheory import isprime
from math import log10

In [None]:
def π(k):
    """
    Approximate the prime-counting function: the number of primes <= k.

    Uses the prime number theorem estimate k / ln(k). The logarithm has to
    be the natural log; dividing by log10(k) overstates the count by a
    factor of ln(10) ≈ 2.3 (it would report 10 primes up to 10, not ~4.3).
    """
    from math import log  # only log10 is imported at file level

    return k / log(k)

def p(k):
    """
    Estimated fraction of k-digit numbers that are prime.

    k == 1 is special-cased to 4/10. For longer numbers the count of primes
    between 10**(k-1) and 10**k is approximated with π and divided by
    9 * 10**(k-1), the number of k-digit integers.
    """
    if k == 1:
        return 4/10

    lower, upper = 10**(k-1), 10**k
    prime_count = π(upper) - π(lower)
    return prime_count / (9 * lower)
    
# Probability that a k-digit number is NOT prime, for k = 1..308.
# (Presumably capped at 308 because 10**309 no longer converts to a float.)
ps = [(1 - p(k)) for k in range(1, 309)]
# np.product was deprecated in NumPy 1.25 and removed in 2.0; np.prod is the
# supported spelling.
# NOTE(review): this is not the cell's last expression, so the notebook does
# not display the value.
np.prod(ps)
plt.plot(ps)

In [None]:
def r():
    """Return one random decimal digit (0 through 9) as a Python int."""
    return np.random.randint(0, 10, 1).item()
        
def experiment() -> float:
    """
    Append random digits until the number built so far is prime.

    Starts from a single random digit and, while the running number `g` is
    not prime, appends another random digit on the right.

    Returns:
        The number of digits drawn when a prime was reached, or `np.inf` if
        `g` reached 2**64 before any prime appeared.
    """
    g = r()
    steps = 1

    while not isprime(g):
        g = g * 10 + r()
        # Give up once the number no longer fits in 64 bits.
        if g >= 2**64:
            # np.infty was removed in NumPy 2.0; np.inf is the same value.
            return np.inf

        steps += 1

    return steps

# Repeat the experiment N times, showing a progress bar while it runs.
N = 1_000_000
progress = trange(N)
outcomes = [experiment() for _ in progress]
trials = np.array(outcomes)

In [None]:
# Sanity check: 10**19, the smallest 20-digit number, is still below 2**64.
2**64 > 10 ** 19

In [None]:
def n_digit_primes(n):
    """
    Fraction of n-digit numbers that are prime, counted with `primepi`.

    The numerator is primepi(10**n) - primepi(10**(n-1)). The denominator is
    10 when n == 1 and 9 * 10**(n-1) otherwise.
    """
    lower = 10**(n-1)
    prime_count = primepi(10**n) - primepi(lower)
    if n == 1:
        total = 10
    else:
        total = 9 * lower
    return prime_count / total

# Prime density for 1- to 12-digit numbers.
xs = range(1, 13)
densities = [n_digit_primes(digits) for digits in xs]
plt.plot(xs, densities, marker='o', linestyle='--')
plt.xticks(xs)
# Trailing statement so the cell does not end on the plt.xticks expression.
pass

In [None]:
plt.figure(figsize=(14, 5))

# Empirical distribution of the step counts, leaving out runs that hit the
# cap. np.infty was removed in NumPy 2.0; np.inf is the same value.
x, f = np.unique(trials[trials != np.inf], return_counts=True)
f = f / f.sum()

# np.dot(x, f) is the mean step count; a geometric distribution has mean
# 1/p, so its reciprocal estimates p.
p_est = 1 / np.dot(x, f)
print(p_est)

plt.stem(x, f, linefmt='C0:', markerfmt='C0o')

# Overlay N draws from a geometric distribution with the fitted p.
x, f = np.unique(np.random.geometric(p_est, size=N), return_counts=True)
f = f / f.sum()
plt.stem(x, f, linefmt='C1:', markerfmt='C1o')
pass