In [29]:


import os
# import torch
import numpy as np
import pandas as pd

from tqdm import tqdm
# from scipy import sparse
# from datetime import datetime
# from scipy.linalg import solve
# from scipy.optimize import minimize
# from scipy.sparse.linalg import spsolve
# from bayes_opt import BayesianOptimization
from sklearn.preprocessing import MinMaxScaler
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import cross_val_predict



In [30]:


DATA_PATH = '../data/testing/ncaam_sample_data.csv'
def load_data(data_path):
    return pd.read_csv(data_path)

m_data = load_data(DATA_PATH)
scaler = MinMaxScaler()
m_data['team_score']= m_data['team_score'].clip(36, 120)
m_data['continuous_target'] = scaler.fit_transform(m_data['team_score'].values.reshape(-1, 1))
m_data['continuous_target_2'] = m_data['team_fgm'].copy()/m_data['team_fga'].copy()
m_data['date'] = pd.to_datetime(m_data['date'])
m_data = m_data.sort_values(by=['date', 'team_name']).reset_index(drop=True)



In [36]:
class Optimizer:
    def __init__(self):
        pass

    def load_data(self):
        raise NotImplementedError("Subclasses must implement the load_data method.")
    
    def optimize(self):
        raise NotImplementedError("Subclasses must implement the optimize method.")
    

class EloOptimizer(Optimizer):
    def __init__(self, protag_col='team', antag_col='opponent', stat_cols=['team_sq_score'], order_col='date', meta_cols=['location']):
        super().__init__()
        self.protag_col = protag_col
        self.antag_col = antag_col
        self.stat_cols = stat_cols
        self.order_col = order_col  
        self.meta_cols = meta_cols
        

    def load_data(self, data_or_path):
        if type(data_or_path) == str:
            self.data = pd.read_csv(data_or_path)
        else:
            self.data = data_or_path

        for col in [self.protag_col, self.antag_col]+self.stat_cols+[self.order_col]+self.meta_cols:
            assert(col in self.data.columns), f"{col} not found in data columns."
        self.preprocess_data()

    def preprocess_data(self):

        if len(self.stat_cols) > 1:
            self.data = self.data.melt(id_vars=[self.order_col, self.protag_col, self.antag_col]+self.meta_cols, value_vars=self.stat_cols, var_name='stat', value_name='stat_value')
        
        self.data = self.data.sort_values(by=self.order_col)
        return self.data
    
    def checks(self, k, meta_k, priors, initial_rating):
        assert(type(priors) == dict), "Priors must be a dictionary with key as protag and value as Elo rating."
        if k is None:
            raise ValueError("Must provide K value")
        else:
            assert(type(k) in [dict, float, int]), "K must be a single numeric quantity, or a dict that has unique values for each stat"
        assert(type(initial_rating) in [dict, float, int]), "Initial rating must be a single numeric quantity, or a dict that has unique values for each stat"
    
    def run_history(self, k, meta_adj={}, priors={}, initial_rating=1500):

        self.checks(k, meta_adj, priors, initial_rating)

        protags = self.data[self.protag_col].unique()
        antags = self.data[self.antag_col].unique()
        protags = sorted(protags)
        antags = sorted(antags)
        stats = sorted(self.stat_cols)

        protag_idx_map = {protag: idx for idx, protag in enumerate(protags)}
        antag_idx_map = {antag: idx + len(protags) for idx, antag in enumerate(antags)}
        stat_idx_map = {stat: idx for idx, stat in enumerate(stats)}

        self.data['protag_idx'] = self.data[self.protag_col].map(protag_idx_map)
        self.data['antag_idx'] = self.data[self.antag_col].map(antag_idx_map) 
        if len(stats) > 1:
            self.data['stat_idx'] = self.data['stat'].map(stat_idx_map)
        else:
            self.data['stat_idx'] = 0

        ratings_mat = np.zeros((len(protags)+len(antags),len(self.stat_cols)))

        if len(stats)==1:
            for entity, prior in priors.items():
                if entity in protags:
                    ratings_mat[protag_idx_map[entity], 0] = prior
                else:
                    ratings_mat[antag_idx_map[entity], 0] = prior
        else:
            for entity, prior in priors.items():
                if entity in protags:
                    for idx, stat in enumerate(stats):
                        ratings_mat[protag_idx_map[entity], idx] = prior[stat]
                else:
                    for idx, stat in enumerate(stats):
                        ratings_mat[antag_idx_map[entity], idx] = prior[stat]

        no_prior_protags = set(protags) - set(priors.keys())
        no_prior_antags = set(antags) - set(priors.keys())
        for protag in no_prior_protags:
            if len(stats)==1:
                ratings_mat[protag_idx_map[protag], 0] = initial_rating
            else:
                for idx, stat in enumerate(stats):
                    ratings_mat[protag_idx_map[protag], idx] = initial_rating
        for antag in no_prior_antags:
            if len(stats)==1:
                ratings_mat[antag_idx_map[antag], 0] = initial_rating
            else:
                for idx, stat in enumerate(stats):
                    ratings_mat[antag_idx_map[antag], idx] = initial_rating

        ratings = []
        for rating_period, results in tqdm(self.data.groupby(self.order_col), total=self.data[self.order_col].nunique()):
            
            protag_idxs = results['protag_idx'].values
            antag_idxs = results['antag_idx'].values
            stat_idxs = results['stat_idx'].values

            protag_ratings = ratings_mat[protag_idxs, stat_idxs]
            antag_ratings = ratings_mat[antag_idxs, stat_idxs]
            protag_win_probs = 1 / (1 + 10**((antag_ratings - protag_ratings) / 400))
            antag_win_probs = 1 - protag_win_probs

            ## todo: upgrade for multiple stats
            if len(stats) == 1:
                protag_wins = results[self.stat_cols[0]].values
            else:
                protag_wins = results['stat_value'].values
            antag_wins = 1 - protag_wins

            protag_update = k * (protag_wins - protag_win_probs)
            antag_update = k * (antag_wins - antag_win_probs)

            ratings_mat[protag_idxs, stat_idxs] += protag_update
            ratings_mat[antag_idxs, stat_idxs] += antag_update

            to_append = pd.DataFrame({
                self.order_col: rating_period,
                'protag': results[self.protag_col],
                'antag': results[self.antag_col],
                'stat': results['stat'],    
                'protag_rating': protag_ratings,
                'antag_rating': antag_ratings,
                'protag_win_prob': protag_win_probs,
                'antag_win_prob': antag_win_probs,
                'protag_update': protag_update,
                'antag_update': antag_update,
                'protag_result': protag_wins,
                'antag_result': antag_wins,
                'protag_postgame_rating': protag_ratings+protag_update,
                'antag_postgame_rating': antag_ratings+antag_update,
            })

            ratings.append(to_append)

        ratings = pd.concat(ratings).reset_index(drop=True)

        return ratings
    

elo = EloOptimizer(protag_col='team_name', antag_col='opp_name', stat_cols=['continuous_target','continuous_target_2'], order_col='date', meta_cols=['is_home'])
elo.load_data(m_data)
rtgs = elo.run_history(k=45, initial_rating=1500)
rtgs



100%|██████████| 2674/2674 [00:00<00:00, 2973.85it/s]


Unnamed: 0,date,protag,antag,stat,protag_rating,antag_rating,protag_win_prob,antag_win_prob,protag_update,antag_update,protag_result,antag_result,protag_postgame_rating,antag_postgame_rating
0,2002-11-14,Alabama,Oklahoma,continuous_target,1500.000000,1500.000000,0.500000,0.500000,-5.357143,5.357143,0.380952,0.619048,1494.642857,1505.357143
1,2002-11-14,Syracuse,Memphis,continuous_target_2,1500.000000,1500.000000,0.500000,0.500000,-6.380597,6.380597,0.358209,0.641791,1493.619403,1506.380597
2,2002-11-14,Oklahoma,Alabama,continuous_target_2,1500.000000,1500.000000,0.500000,0.500000,-3.820755,3.820755,0.415094,0.584906,1496.179245,1503.820755
3,2002-11-14,Alabama,Oklahoma,continuous_target_2,1500.000000,1500.000000,0.500000,0.500000,-1.551724,1.551724,0.465517,0.534483,1498.448276,1501.551724
4,2002-11-14,Memphis,Syracuse,continuous_target_2,1500.000000,1500.000000,0.500000,0.500000,-3.629032,3.629032,0.419355,0.580645,1496.370968,1503.629032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413115,2022-04-02,Kansas,Villanova,continuous_target_2,1518.641953,1569.725493,0.427010,0.572990,4.951213,-4.951213,0.537037,0.462963,1523.593166,1564.774280
413116,2022-04-04,North Carolina,Kansas,continuous_target,1592.678323,1616.868076,0.465244,0.534756,-3.257422,3.257422,0.392857,0.607143,1589.420902,1620.125497
413117,2022-04-04,Kansas,North Carolina,continuous_target,1592.350957,1584.537609,0.511242,0.488758,-3.720194,3.720194,0.428571,0.571429,1588.630762,1588.257804
413118,2022-04-04,Kansas,North Carolina,continuous_target_2,1523.593166,1561.714426,0.445358,0.554642,-0.268392,0.268392,0.439394,0.560606,1523.324775,1561.982818


In [37]:

last_rating =rtgs.drop_duplicates(subset=['protag','stat'], keep='last')[['date','protag','stat','protag_postgame_rating']].copy()
last_rating.sort_values(by=['protag_postgame_rating'], ascending=[False], inplace=True)
last_rating

Unnamed: 0,date,protag,stat,protag_postgame_rating
413068,2022-03-24,Gonzaga,continuous_target,1633.518992
412874,2022-03-17,Iowa,continuous_target,1621.500437
413070,2022-03-24,Arizona,continuous_target,1617.281038
413113,2022-04-02,Duke,continuous_target,1593.876542
412538,2022-03-10,St John's,continuous_target,1590.708836
...,...,...,...,...
412608,2022-03-11,Alabama A&M,continuous_target,1348.174842
412508,2022-03-10,MD E Shore,continuous_target,1338.807776
157750,2010-03-04,W Salem St,continuous_target,1335.879123
410554,2022-02-26,E Illinois,continuous_target,1325.454103


In [42]:
test2 = last_rating.pivot(index=['date','protag'], columns=['stat'], values=['protag_postgame_rating']).reset_index()
test2.columns=['date','protag','ppg','fg%']

In [43]:
test2.corr()

  test2.corr()


Unnamed: 0,ppg,fg%
ppg,1.0,0.818036
fg%,0.818036,1.0


In [34]:
m_data.melt(id_vars=['date', 'team_name', 'opp_name', 'is_home'], value_vars=['continuous_target', 'continuous_target_2'], var_name='stat', value_name='stat_value')

Unnamed: 0,date,team_name,opp_name,is_home,stat,stat_value
0,2002-11-14,Alabama,Oklahoma,0,continuous_target,0.380952
1,2002-11-14,Memphis,Syracuse,0,continuous_target,0.404762
2,2002-11-14,Oklahoma,Alabama,0,continuous_target,0.309524
3,2002-11-14,Syracuse,Memphis,0,continuous_target,0.321429
4,2002-11-15,E Washington,Wisconsin,-1,continuous_target,0.226190
...,...,...,...,...,...,...
413115,2022-04-02,Kansas,Villanova,0,continuous_target_2,0.537037
413116,2022-04-02,North Carolina,Duke,0,continuous_target_2,0.421875
413117,2022-04-02,Villanova,Kansas,0,continuous_target_2,0.385965
413118,2022-04-04,Kansas,North Carolina,0,continuous_target_2,0.439394


In [33]:
m_data

Unnamed: 0,season,team_score,opp_score,is_home,numot,team_fgm,team_fga,team_fgm3,team_fga3,team_ftm,...,opp_ast,opp_to,opp_stl,opp_blk,opp_pf,team_name,opp_name,date,continuous_target,continuous_target_2
0,2003,68,62,0,0,27,58,3,14,11,...,8,18,9,2,20,Alabama,Oklahoma,2002-11-14,0.380952,0.465517
1,2003,70,63,0,0,26,62,8,20,10,...,7,12,8,6,16,Memphis,Syracuse,2002-11-14,0.404762,0.419355
2,2003,62,68,0,0,22,53,2,10,16,...,13,23,7,1,22,Oklahoma,Alabama,2002-11-14,0.309524,0.415094
3,2003,63,70,0,0,24,67,6,24,9,...,16,13,4,4,18,Syracuse,Memphis,2002-11-14,0.321429,0.358209
4,2003,55,81,-1,0,20,46,3,11,12,...,12,9,9,3,18,E Washington,Wisconsin,2002-11-15,0.226190,0.434783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206555,2022,81,65,0,0,29,54,13,24,10,...,12,9,3,0,11,Kansas,Villanova,2022-04-02,0.535714,0.537037
206556,2022,81,77,0,0,27,64,10,26,17,...,12,4,7,4,18,North Carolina,Duke,2022-04-02,0.535714,0.421875
206557,2022,65,81,0,0,22,57,13,31,8,...,18,7,4,4,8,Villanova,Kansas,2022-04-02,0.345238,0.385965
206558,2022,72,69,0,0,29,66,6,17,8,...,9,13,2,6,13,Kansas,North Carolina,2022-04-04,0.428571,0.439394


149