# Initialize Environment

In [None]:
from dotenv import load_dotenv
load_dotenv()

In [None]:
import numpy as np
import pandas as pd
# ensure kaggle api credentials available via .env
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import shutil
import plotnine as p9
import scipy.special as ssp

In [None]:
# Set to True to download and unpack the Kaggle competition archive in the
# "Extract Data" step; False skips that step and relies on ./data as it is.
DATA_NEEDS_DOWNLOADED = False

# Extract Data

In [None]:
if DATA_NEEDS_DOWNLOADED:
    # Local import: subprocess is only needed on the download path.
    import subprocess

    # expected workflow, though authentication issues persist
    # api = KaggleApi()
    # api.authenticate()
    # api.competition_download_file("dont-get-kicked", "training.csv", path="./data/dont_get_kicked")

    # Fall back to the kaggle command-line client. check=True raises
    # CalledProcessError on a non-zero exit status, so a failed download stops
    # here instead of surfacing later as a missing-archive error.
    subprocess.run(
        ['kaggle', 'competitions', 'download', '-c', 'DontGetKicked'],
        check=True,
    )

    # The archive is expected in the working directory; unpack it into ./data
    # and remove the zip once its contents are extracted.
    shutil.unpack_archive("DontGetKicked.zip", "./data")
    os.remove("DontGetKicked.zip")

In [None]:
# Load the train and test splits from ./data (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/training.csv", "./data/test.csv")
)

In [None]:
# (rows, columns) of the train split, then of the test split, one per line.
print(df_train.shape, df_test.shape, sep="\n")

In [None]:
# Preview the first rows with columns laid out as rows for easier reading.
df_train.head().T

# Analyze

## Build Intuition through Visuals

In [None]:
# Number of equal-width histogram bins used to discretize the observations
# into an estimated pmf.
N_BINS = 10

In [None]:
# q: baseline (train) observations of the `VehOdo` column and their row count.
x_q, n_q = df_train['VehOdo'], len(df_train)

# p: new (test) observations of the same column and their row count.
x_p, n_p = df_test['VehOdo'], len(df_test)

In [None]:
# Histogram the train observations: q[0] holds the per-bin counts and q[1]
# the N_BINS + 1 bin edges spanning the train range.
q = np.histogram(x_q, bins=N_BINS)

In [None]:
# Display the (counts, bin_edges) tuple for the train histogram.
q

In [None]:
# Tabulate the train histogram: count and share of train rows per bin, tagged
# with its series label and a 0-based bin number taken from the row index.
df_q = pd.DataFrame(
    {'n': q[0], 'prb': q[0] / n_q, 'series': 'Train'}
).assign(bin=lambda frame: frame.index.values)

In [None]:
# Bin the test observations on the train bin edges so both histograms share
# the same bins.
# NOTE(review): np.histogram does not count values outside the supplied edge
# range, so test observations beyond the train min/max would be dropped here.
p = np.histogram(x_p, bins=q[1])

In [None]:
# Display the (counts, bin_edges) tuple for the test histogram.
p

In [None]:
# Tabulate the test histogram: count and share of test rows per bin, tagged
# with its series label and a 0-based bin number taken from the row index.
df_p = pd.DataFrame(
    {'n': p[0], 'prb': p[0] / n_p, 'series': 'Test'}
).assign(bin=lambda frame: frame.index.values)

In [None]:
# Stack the train and test tables row-wise (train rows first) for plotting.
compare_p_q = pd.concat(objs=[df_q, df_p], axis='index')

In [None]:
# Dodged bars of the per-bin probability, one fill colour per series; the
# legend title is blanked out.
(
    p9.ggplot(compare_p_q, p9.aes(x='bin', y='prb', fill='series'))
    + p9.geom_col(position='dodge')
    + p9.labs(fill='')
    + p9.theme_minimal()
)

## Statistical Tests

In [None]:
# Number of bootstrap draws used to simulate each KL Divergence sampling
# distribution.
N_SAMPLING_DISTR_DRAWS = 1000

In [None]:
# Observed KL Divergence of the test pmf from the train pmf: rel_entr gives
# the element-wise p * log(p / q) terms, which are summed over the bins.
# NOTE(review): the test counts are divided by n_p (all test rows); if any
# test value fell outside the train bin range the test pmf sums to less
# than 1 -- confirm this is intended.
kl_div_point = np.sum(ssp.rel_entr(p[0] / n_p, q[0] / n_q))
kl_div_point

In [None]:
def bootstrap_draw_kl_divergence(x_q, n_bins, nobs_test_set, rng=None):
    """
    Bootstrap one test set from `x_q` and compute its KL Divergence from q.

    Under the (null) condition that
    a new observations test set generates from
    the baseline population probability distribution q:
    bootstrap sample one test set draw (with replacement) from `x_q`,
    then compute KL Divergence of the resampled pmf p from q.

    q characteristics:
        - Has generated observations `x_q`
        - Estimated by discrete pmf with `n_bins`

    Parameters
    ----------
    x_q : array-like
        1-D observations generated by q.
    n_bins : int or sequence
        Passed to `np.histogram` as `bins` to discretize `x_q`.
    nobs_test_set : int
        Number of observations in the simulated test set; must be >= 1.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness for the resampling. When None (default) numpy's
        global random state is used, which is the original behaviour.

    Returns
    -------
    dict
        'p': resampled pmf, 'n_p': `nobs_test_set`, 'q': baseline pmf,
        'kl_divergence': sum of `rel_entr(p, q)` over the bins.

    Raises
    ------
    ValueError
        If `nobs_test_set` is smaller than 1 (the pmf p would be 0 / 0).
    """

    if nobs_test_set < 1:
        raise ValueError(
            f"nobs_test_set must be at least 1, got {nobs_test_set}"
        )

    # np.random exposes the same `choice` signature as Generator/RandomState,
    # so the default path keeps honouring np.random.seed().
    sampler = np.random if rng is None else rng

    x_q = np.asarray(x_q)

    q_counts, bin_edges = np.histogram(x_q, bins=n_bins)
    q_hat = q_counts / q_counts.sum()

    # Resample from the baseline observations and bin on q's edges; every
    # resampled value lies inside those edges, so p_hat sums to 1.
    x_p_sample = sampler.choice(x_q, size=nobs_test_set, replace=True)
    p_counts, _ = np.histogram(x_p_sample, bins=bin_edges)
    p_hat = p_counts / nobs_test_set

    kl_div = ssp.rel_entr(p_hat, q_hat).sum()

    return {
        'p': p_hat,
        'n_p': nobs_test_set,
        'q': q_hat,
        'kl_divergence': kl_div,
    }

In [None]:
def bootstrap_sampling_distr_kl_divergence(x_q, n_bins, nobs_test_set, n_draws):
    """
    Simulate the null sampling distribution of the KL Divergence value.

    The null condition is that a new observations test set generates from
    the baseline population probability distribution q, where q:
        - Has generated observations `x_q`
        - Is estimated by a discrete pmf with `n_bins`

    Even when the test set truly generates from q, the KL Divergence
    varies from sample to sample. That variation is partly controlled by:
        - Test set sample size (smaller sample, wider variation)
        - Precision of the q estimate (more discretized bins, wider variation)

    Calls `bootstrap_draw_kl_divergence` `n_draws` times and returns the
    resulting dicts as a list ordered by ascending 'kl_divergence'.
    """

    draws = []
    for _ in range(n_draws):
        draws.append(bootstrap_draw_kl_divergence(x_q, n_bins, nobs_test_set))

    # Positions of the draws from smallest to largest KL Divergence.
    ascending_order = np.argsort([draw['kl_divergence'] for draw in draws])

    return [draws[position] for position in ascending_order]


### Naive

In [None]:
# Null sampling distribution for a simulated test set as large as the real
# test set; keep the full draws and, separately, just their KL values.
kl_div_distr0 = bootstrap_sampling_distr_kl_divergence(
    x_q=x_q,
    n_bins=N_BINS,
    nobs_test_set=len(df_test),
    n_draws=N_SAMPLING_DISTR_DRAWS,
)
kl_div_distr = [draw['kl_divergence'] for draw in kl_div_distr0]

In [None]:
# Number of simulated draws whose KL Divergence exceeds the observed value.
(np.array(kl_div_distr) > kl_div_point).sum()

In [None]:
# 10th, 25th, 50th, 75th and 90th percentiles of the simulated KL values.
np.quantile(kl_div_distr, [0.1, 0.25, 0.5, 0.75, 0.9])

In [None]:
# delta = [kl_div_distr[i] - kl_div_distr[i-1] for i in range(1, len(kl_div_distr))]
# sum(np.array(delta) < 0)

### Calibrated

In [None]:
# Repeat the simulation for a test set of only 100 observations, then count
# the draws whose KL Divergence exceeds the observed value.
kl_div_distr0 = bootstrap_sampling_distr_kl_divergence(
    x_q=x_q,
    n_bins=N_BINS,
    nobs_test_set=100,
    n_draws=N_SAMPLING_DISTR_DRAWS,
)
kl_div_distr = [draw['kl_divergence'] for draw in kl_div_distr0]

(np.array(kl_div_distr) > kl_div_point).sum()

In [None]:
# The draws are sorted by ascending KL Divergence, so the last element is the
# most extreme one. Index with -1 rather than 999 so this keeps working when
# N_SAMPLING_DISTR_DRAWS is not 1000.
df_p = pd.DataFrame({'prb': kl_div_distr0[-1]['p'], 'series': 'Test'})
df_p['bin'] = df_p.index.values

df_q = pd.DataFrame({'prb': kl_div_distr0[-1]['q'], 'series': 'Train'})
df_q['bin'] = df_q.index.values

# Stack the simulated test pmf on top of the train pmf for plotting.
compare_p_q = pd.concat([df_p, df_q], axis=0)

In [None]:
# Dodged bars of the per-bin probability, one fill colour per series; the
# legend title is blanked out.
(
    p9.ggplot(compare_p_q, p9.aes(x='bin', y='prb', fill='series'))
    + p9.geom_col(position='dodge')
    + p9.labs(fill='')
    + p9.theme_minimal()
)

In [None]:
# Repeat the simulation for a test set of 1000 observations, then count the
# draws whose KL Divergence exceeds the observed value.
kl_div_distr0 = bootstrap_sampling_distr_kl_divergence(
    x_q=x_q,
    n_bins=N_BINS,
    nobs_test_set=1000,
    n_draws=N_SAMPLING_DISTR_DRAWS,
)
kl_div_distr = [draw['kl_divergence'] for draw in kl_div_distr0]

(np.array(kl_div_distr) > kl_div_point).sum()

In [None]:
# The draws are sorted by ascending KL Divergence, so the last element is the
# most extreme one. Index with -1 rather than 999 so this keeps working when
# N_SAMPLING_DISTR_DRAWS is not 1000.
df_p = pd.DataFrame({'prb': kl_div_distr0[-1]['p'], 'series': 'Test'})
df_p['bin'] = df_p.index.values

df_q = pd.DataFrame({'prb': kl_div_distr0[-1]['q'], 'series': 'Train'})
df_q['bin'] = df_q.index.values

# Stack the simulated test pmf on top of the train pmf for plotting.
compare_p_q = pd.concat([df_p, df_q], axis=0)

In [None]:
# Dodged bars of the per-bin probability, one fill colour per series; the
# legend title is blanked out.
(
    p9.ggplot(compare_p_q, p9.aes(x='bin', y='prb', fill='series'))
    + p9.geom_col(position='dodge')
    + p9.labs(fill='')
    + p9.theme_minimal()
)

In [None]:
# Repeat the simulation for a test set of 10000 observations, then count the
# draws whose KL Divergence exceeds the observed value.
kl_div_distr0 = bootstrap_sampling_distr_kl_divergence(
    x_q=x_q,
    n_bins=N_BINS,
    nobs_test_set=10000,
    n_draws=N_SAMPLING_DISTR_DRAWS,
)
kl_div_distr = [draw['kl_divergence'] for draw in kl_div_distr0]

(np.array(kl_div_distr) > kl_div_point).sum()