# Initialize Environment

In [None]:
from dotenv import load_dotenv
load_dotenv()

In [None]:
import numpy as np
import pandas as pd
# ensure kaggle api credentials available via .env
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import shutil
import plotnine as p9
import scipy.special as ssp

In [None]:
# Toggle for the download cell below: set to True to fetch the competition
# archive with the Kaggle CLI and unpack it into ./data before loading.
DATA_NEEDS_DOWNLOADED = False

# Extract Data

In [None]:
if DATA_NEEDS_DOWNLOADED:
    # Imported locally because it is only needed on the download path.
    import subprocess

    # expected workflow, though authentication issues persist
    # api = KaggleApi()
    # api.authenticate()
    # api.competition_download_file("dont-get-kicked", "training.csv", path="./data/dont_get_kicked")

    # Fall back to the Kaggle CLI. check=True raises CalledProcessError when
    # the download fails, instead of continuing on to unpack an archive that
    # was never written (os.system silently discarded the exit status).
    subprocess.run(
        ["kaggle", "competitions", "download", "-c", "DontGetKicked"],
        check=True,
    )

    # Extract the CSVs into ./data, then drop the archive to keep the
    # working directory clean.
    shutil.unpack_archive("DontGetKicked.zip", "./data")
    os.remove("DontGetKicked.zip")

In [None]:
# Load the two competition splits from ./data (the directory the download
# cell unpacks the archive into).
df_train = pd.read_csv("./data/training.csv")
df_test = pd.read_csv("./data/test.csv")

In [None]:
# Report (rows, columns) for each split, one per line: train first, then test.
print(df_train.shape, df_test.shape, sep="\n")

In [None]:
# Preview the first rows with columns laid out vertically, which is easier
# to read for a wide table.
df_train.head().T

# Analyze

## Build Intuition Through Visualization

In [None]:
# Number of histogram bins used to discretize the feature for both the
# visual comparison and the KL divergence calculations.
N_BINS = 10

In [None]:
# Reference sample (q): the 'VehOdo' column of the training split and its
# row count.
x_q, n_q = df_train['VehOdo'], len(df_train)

# Comparison sample (p): the same column and row count from the test split.
x_p, n_p = df_test['VehOdo'], len(df_test)

In [None]:
# Bin the training values into N_BINS bins. The result is a tuple: q[0] holds
# the per-bin counts and q[1] the bin edges (reused below for the test data).
q = np.histogram(x_q, bins=N_BINS)

In [None]:
# Inspect the raw histogram result for the training sample.
q

In [None]:
# Tidy per-bin table for the training sample: raw count, share of all
# training rows, a series label for plotting, and the bin number (taken from
# the row index).
df_q = pd.DataFrame(
    {'n': q[0], 'prb': q[0] / n_q, 'series': 'Train'}
).assign(bin=lambda frame: frame.index.values)

In [None]:
# Bin the test values on the training bin edges (q[1]) so both histograms
# share identical bins.
# NOTE(review): np.histogram ignores values outside the supplied edges, so
# test readings beyond the training min/max would be dropped from the counts
# -- confirm this is acceptable for the comparison.
p = np.histogram(x_p, bins=q[1])

In [None]:
# Inspect the raw histogram result for the test sample.
p

In [None]:
# Tidy per-bin table for the test sample, mirroring the training table: raw
# count, share of all test rows, a series label for plotting, and the bin
# number (taken from the row index).
df_p = pd.DataFrame(
    {'n': p[0], 'prb': p[0] / n_p, 'series': 'Test'}
).assign(bin=lambda frame: frame.index.values)

In [None]:
# Stack the train and test tables row-wise into one long frame; the 'series'
# column tells the two apart. The original row indexes are kept, so index
# values repeat across the two series.
compare_p_q = pd.concat([df_q, df_p], axis=0)

In [None]:
# Side-by-side bars of the per-bin proportions, one colour per series, with
# the fill legend title blanked out.
(
    p9.ggplot(compare_p_q, p9.aes(x='bin', y='prb', fill='series'))
    + p9.geom_col(position='dodge')
    + p9.labs(fill='')
    + p9.theme_minimal()
)

## Statistical Tests

In [None]:
# Number of resampled KL divergence values drawn to build each sampling
# distribution below.
N_SAMPLING_DISTR_DRAWS = 1000

In [None]:
# Observed divergence between the binned test proportions (first argument)
# and the binned training proportions (second argument): the element-wise
# relative entropy summed over bins.
kl_div_point = np.sum(ssp.rel_entr(p[0] / n_p, q[0] / n_q))
kl_div_point

### Naive

In [None]:
def sample_kl_divergence(x_q, n_bins, nobs_monitoring_window):
    """Draw one resampled KL divergence of a monitoring window vs. its reference.

    A window of ``nobs_monitoring_window`` observations is drawn with
    replacement from ``x_q``. Both the reference and the window are binned on
    the reference's histogram edges, and the function returns
    KL(window || reference): the window proportions are the first argument of
    ``rel_entr`` and the reference proportions the second. This is the same
    orientation as the observed ``kl_div_point`` (test vs. train), so the
    simulated values are directly comparable to it. It also keeps the result
    finite when the window leaves some bins empty, because an empty window bin
    contributes 0 rather than infinity.

    Parameters
    ----------
    x_q : array-like
        One-dimensional reference sample to resample from.
    n_bins : int or sequence
        Passed to ``np.histogram`` as ``bins`` for the reference sample.
    nobs_monitoring_window : int
        Number of observations in the simulated monitoring window.

    Returns
    -------
    float
        KL divergence of the resampled window from the reference histogram.

    Raises
    ------
    ValueError
        If ``nobs_monitoring_window`` is less than 1.
    """
    if nobs_monitoring_window < 1:
        raise ValueError("nobs_monitoring_window must be a positive integer")

    x_p_sample = np.random.choice(x_q, size=nobs_monitoring_window, replace=True)

    # Bin the window on the reference's edges so the two histograms line up.
    q_counts, bin_edges = np.histogram(x_q, bins=n_bins)
    p_counts, _ = np.histogram(x_p_sample, bins=bin_edges)

    # Window proportions first, reference second: KL(window || reference).
    return ssp.rel_entr(
        p_counts / nobs_monitoring_window,
        q_counts / q_counts.sum(),
    ).sum()

In [None]:
# Sampling distribution of the KL divergence when the monitoring window is
# resampled from the training data itself, using a window as large as the
# test split.
kl_div_dist = [
    sample_kl_divergence(x_q, N_BINS, nobs_monitoring_window=len(df_test))
    for _ in range(N_SAMPLING_DISTR_DRAWS)
]

In [None]:
# Number of resampled divergences that exceed the observed train-vs-test value.
(np.asarray(kl_div_dist) > kl_div_point).sum()

In [None]:
# 10th/25th/50th/75th/90th percentiles of the resampled divergences.
np.quantile(kl_div_dist, [0.1, 0.25, 0.5, 0.75, 0.9])

### Calibrated

In [None]:
# Rebuild the sampling distribution for a 100-observation monitoring window.
kl_div_dist = [
    sample_kl_divergence(x_q, N_BINS, nobs_monitoring_window=100)
    for _ in range(N_SAMPLING_DISTR_DRAWS)
]

# Number of resampled divergences that exceed the observed train-vs-test value.
(np.asarray(kl_div_dist) > kl_div_point).sum()

In [None]:
# Rebuild the sampling distribution for a 1,000-observation monitoring window.
kl_div_dist = [
    sample_kl_divergence(x_q, N_BINS, nobs_monitoring_window=1000)
    for _ in range(N_SAMPLING_DISTR_DRAWS)
]

# Number of resampled divergences that exceed the observed train-vs-test value.
(np.asarray(kl_div_dist) > kl_div_point).sum()

In [None]:
# Rebuild the sampling distribution for a 10,000-observation monitoring window.
kl_div_dist = [
    sample_kl_divergence(x_q, N_BINS, nobs_monitoring_window=10000)
    for _ in range(N_SAMPLING_DISTR_DRAWS)
]

# Number of resampled divergences that exceed the observed train-vs-test value.
(np.asarray(kl_div_dist) > kl_div_point).sum()

In [None]:
# What does the histogram look like for an extreme KL divergence case?