In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from data_boxscore.data import load_dataframes
from data_boxscore.elo import compute_ELO, compute_kl_divergence
from data_boxscore.constants import features_minmax, features_no_scaling, features_perc, features_robust, features_standard
%reload_ext autoreload
%autoreload 2


In [None]:
# Load 'bball.csv' through load_dataframes, handing it the five feature lists
# imported from data_boxscore.constants and enabling ELO with k_elo = 20.
# The last two returned values are not needed here and are discarded.
(
    gdf,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    _, _,
) = load_dataframes(
    filename='bball.csv',
    features_standard=features_standard,
    features_minmax=features_minmax,
    features_robust=features_robust,
    features_perc=features_perc,
    features_no_scaling=features_no_scaling,
    use_ELO=True,
    k_elo=20,
)

# Quick sanity check on the sizes of the full frame and of the three feature sets.
print(gdf.shape, X_train.shape, X_val.shape, X_test.shape)

# Mean of the 'home_win' column (presumably the home-win rate -- confirm the column is 0/1).
gdf['home_win'].mean()

In [None]:
# Preview the first rows of gdf as returned by load_dataframes.
gdf.head()

In [None]:
# Run compute_ELO on gdf with k_elo = 20 and ask for the extra `elo_end` return value
# (presumably the ratings at the end of the data -- confirm against compute_ELO).
# NOTE(review): gdf is rebound to the function's output, so re-running this cell
# feeds the previous result back in; confirm compute_ELO is safe to apply repeatedly.
gdf, elo_end = compute_ELO(
    gdf,
    k_elo=20,
    return_elo_end=True,
)
gdf

In [None]:
# Compare the distribution of the standardized 'ELO_diff_before' column for
# several values of k_elo, one histogram (with KDE overlay) per value.
K_ELO_GRID = [5, 10, 20, 40, 75, 100]

fig, axs = plt.subplots(2, 3, figsize=(12, 6))
# axs.flat walks the 2x3 grid row by row, which matches the original
# axs[i // 3, i % 3] indexing without the manual arithmetic.
for ax, k_elo in zip(axs.flat, K_ELO_GRID):
    # Recompute ELO for this k_elo.
    # NOTE(review): assumes compute_ELO does not modify gdf in place -- confirm.
    df = compute_ELO(gdf, k_elo)

    # The scaler is fitted on rows with split == 1 only (presumably the training
    # split -- confirm) and then applied to every row whose split is not 0.
    train_data = df[df['split'] == 1]['ELO_diff_before'].values.reshape(-1, 1)
    data = df[df['split'] != 0]['ELO_diff_before'].values.reshape(-1, 1)
    scaler = StandardScaler()
    scaler.fit(train_data)
    data = scaler.transform(data)

    sns.histplot(data, kde=True, ax=ax, legend=False)
    ax.set_title(f"k_ELO = {k_elo}")
    # Label the x-axis so each panel can be read on its own.
    ax.set_xlabel("ELO_diff_before (standardized)")

# Re-space the panels so second-row titles do not crowd first-row tick labels.
fig.tight_layout()
    
    

In [None]:
# For every k_elo in 5, 10, ..., 100 store the standardized 'ELO_diff_before'
# values in `elos`, keyed by k_elo. Each entry is an (n, 1) array.
elos = {}
for k_elo in range(5, 101, 5):
    df = compute_ELO(gdf, k_elo)
    elo_diff = df['ELO_diff_before']

    # Rows used to fit the scaler (split == 1) and rows that are kept (split != 0).
    # Presumably 1 marks the training split -- confirm what split == 0 denotes.
    is_fit_row = df['split'] == 1
    is_kept_row = df['split'] != 0

    train_data = elo_diff[is_fit_row].values.reshape(-1, 1)
    scaler = StandardScaler().fit(train_data)

    data = scaler.transform(elo_diff[is_kept_row].values.reshape(-1, 1))
    elos[k_elo] = data

In [None]:
# Pairwise comparison of the per-k_elo distributions stored in `elos`:
# one row per k_elo (first argument to compute_kl_divergence) and one column
# per k_elo (second argument), each computed with bins=50.
kls = [
    {
        k_col: compute_kl_divergence(row_values, col_values, bins=50)
        for k_col, col_values in elos.items()
    }
    for row_values in elos.values()
]
kldf = pd.DataFrame(kls, index=list(elos))

In [None]:
# Largest entry of the KL table (column-wise maxima, then the maximum of those),
# printed with three decimals, followed by the table rounded to two decimals.
max_kl = kldf.max().max()
print(f"{max_kl:.3f}")
kldf.round(2)