In [9]:


import os
import torch
import numpy as np
import pandas as pd

from tqdm import tqdm
from scipy import sparse
from datetime import datetime
from scipy.linalg import solve
from scipy.optimize import minimize
from scipy.sparse.linalg import spsolve
from bayes_opt import BayesianOptimization
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict



In [10]:

# Relative location of the sample CSV that feeds the rest of the notebook.
DATA_PATH = '../data/testing/ncaam_sample_data.csv'


def load_data(data_path):
    """Read the CSV at ``data_path`` with pandas' default parsing and return the frame."""
    frame = pd.read_csv(data_path)
    return frame


m_data = load_data(DATA_PATH)


In [40]:

class Optimizer:
    """Minimal base class; concrete optimizers are expected to override ``optimize``."""

    def __init__(self):
        # Holds no state; exists so subclasses can call super().__init__().
        return None

    def optimize(self):
        """Placeholder entry point: the base implementation always raises."""
        message = "Subclasses must implement the optimize method."
        raise NotImplementedError(message)


class MasseyOptimizer(Optimizer):
    def __init__(self, decay_type, protag_col='team', antag_col='opponent', stat_col='team_sq_score', meta_cols=['location'], min_protag_games=5):
        super().__init__()
        assert(decay_type in ['time', 'games','both'])
        self.decay_type = decay_type
        self.protag_col = protag_col
        self.antag_col = antag_col
        self.stat_col = stat_col
        self.meta_cols = meta_cols
        self.min_protag_games = min_protag_games

    def load_data(self, data, path=None):
        if path is not None:
            self.data = pd.read_csv(path)
        else:
            self.data = data
        self.preprocess_data()

    def preprocess_data(self):

        assert(self.protag_col in self.data.columns), "Protagonist column not found in data"
        assert(self.antag_col in self.data.columns), "Antagonist column not found in data"
        assert(self.stat_col in self.data.columns), "Stat column not found in data"
        
        # Convert date column to datetime if needed
        if isinstance(self.data['date'].iloc[0], str):
            self.data['date'] = pd.to_datetime(self.data['date'])

        # Sort data by date
        self.data = self.data.sort_values('date').reset_index(drop=True)

        # Create a team/player list
        self.protags = list(self.data[self.protag_col].unique())
        self.antags = list(self.data[self.antag_col].unique())
        self.protags = sorted(self.protags)
        self.antags = sorted(self.antags)

        self.num_protags = len(self.protags)
        self.num_antags = len(self.antags)

        self.data['protag_idx'] = self.data[self.protag_col].apply(lambda x: self.protags.index(x))
        self.data['antag_idx'] = self.data[self.antag_col].apply(lambda x: self.antags.index(x))

        assert(len(self.data)>200), "Not enough data to optimize"

    def initialize_X_train(self, train):

        protags = sorted(train[self.protag_col].unique())
        antags = sorted(train[self.antag_col].unique())
        num_train_protags = len(protags)
        num_train_antags = len(antags)

        train['protag_idx'] = train[self.protag_col].apply(lambda x: protags.index(x))
        train['antag_idx'] = train[self.antag_col].apply(lambda x: antags.index(x))

        X_train = np.zeros((len(train), num_train_protags+num_train_antags+len(self.meta_cols)))
        X_train[np.arange(len(train)), train['protag_idx']] = 1
        X_train[np.arange(len(train)), train['antag_idx']+num_train_protags] = 1
        for i, col in enumerate(self.meta_cols):
            X_train[np.arange(len(train)), -1*i] = train[col]
        
        X_train = sparse.csr_matrix(X_train)
        return X_train, protags, antags

    def initialize_X_test(self, test, protags, antags):

        num_protags = len(protags)
        num_antags = len(antags)
        test['protag_idx'] = test[self.protag_col].apply(lambda x: protags.index(x) if x in protags else num_protags)
        test['antag_idx'] = test[self.antag_col].apply(lambda x: antags.index(x) if x in antags else num_antags)
        test = test.loc[test['protag_idx']<num_protags]
        test = test.loc[test['antag_idx']<num_antags]
        test_idx = test.index.values
        test = test.reset_index(drop=True)
        X_test = np.zeros((len(test), len(protags)+len(antags)+len(self.meta_cols)))
        X_test[np.arange(len(test)), test['protag_idx']] = 1
        X_test[np.arange(len(test)), test['antag_idx']+num_protags+1] = 1

        for i, col in enumerate(self.meta_cols):
            X_test[np.arange(len(test)), -1*i] = test[col]

        X_test = sparse.csr_matrix(X_test)

        return X_test, test_idx

    def run_time_opt(self, init_points=10, n_iter=30, num_test_dates=20, num_future_days=60, max_lookback=365*3, halflife_bounds=(10, 800), l2_bounds=(1e-9, 10)):
        """Tune the time-decay half-life and ridge strength by Bayesian optimisation.

        For a random sample of test dates, ratings are fitted on the rows in
        the ``max_lookback`` days before the date (exponentially down-weighted
        by age in days) and scored on the rows in the following
        ``num_future_days`` days: a ``LinearRegression`` on the fitted
        offense/defense ratings is evaluated with 5-fold ``cross_val_predict``
        and its mean squared error recorded. The objective handed to
        ``BayesianOptimization`` is the negative MSE averaged over all usable
        test dates.

        Test dates are drawn with ``np.random.choice`` (NumPy's global RNG,
        not seeded here); the Bayesian optimiser itself uses ``random_state=17``.

        Parameters
        ----------
        init_points, n_iter : int
            Forwarded to ``BayesianOptimization.maximize``.
        num_test_dates : int
            Number of test dates to sample (capped at the number available).
        num_future_days : int
            Length of the evaluation window after each test date, in days.
        max_lookback : int or float
            Length of the training window before each test date, in days.
        halflife_bounds, l2_bounds : tuple
            Search bounds for the two hyper-parameters.

        Returns
        -------
        (best_halflife, best_l2, best_mse)

        Raises
        ------
        ValueError
            If no test date has enough data to be evaluated.
        """
        min_window_rows = 50   # minimum rows in both the training and the test window
        min_scored_rows = 10   # minimum test rows left once unseen participants are dropped

        # Candidate test dates; the first 10 distinct dates are skipped.
        candidate_dates = pd.DatetimeIndex(self.data['date'].unique()).sort_values()[10:]
        if len(candidate_dates) == 0:
            raise ValueError("Not enough distinct dates to sample test dates from")
        # Cap the sample size: choice(..., replace=False) raises when asked
        # for more dates than exist.
        sample_size = min(num_test_dates, len(candidate_dates))
        sampled_dates = np.random.choice(candidate_dates, size=sample_size, replace=False)

        game_dates = self.data['date']
        lookback = pd.Timedelta(days=max_lookback)
        horizon = pd.Timedelta(days=num_future_days)

        # Everything that does not depend on (halflife, l2) is prepared once
        # per test date here instead of on every objective evaluation.
        folds = []
        for raw_date in sampled_dates:
            test_date = pd.Timestamp(raw_date)
            in_train = (game_dates >= test_date - lookback) & (game_dates < test_date)
            in_test = (game_dates >= test_date) & (game_dates <= test_date + horizon)
            # Rows without a target cannot be fitted or scored.
            train = self.data[in_train].dropna(subset=[self.stat_col])
            test = self.data[in_test].dropna(subset=[self.stat_col])
            if len(train) < min_window_rows or len(test) < min_window_rows:
                print("Not enough data for test date", test_date)
                continue

            # Pass a copy so the helper is free to add working columns.
            X_train, protags, antags = self.initialize_X_train(train.copy())

            # Keep only test rows whose two participants were both rated.
            protag_pos = test[self.protag_col].map({name: pos for pos, name in enumerate(protags)})
            antag_pos = test[self.antag_col].map({name: pos for pos, name in enumerate(antags)})
            scorable = (protag_pos.notna() & antag_pos.notna()).to_numpy()
            if scorable.sum() < min_scored_rows:
                print("Not enough data for test date", test_date)
                continue

            folds.append({
                'X': X_train,
                'y': train[self.stat_col].to_numpy(dtype=float),
                # Whole days between each training row and the test date.
                # (.dt.days rather than astype('timedelta64[D]'), which
                # pandas >= 2.0 rejects.)
                'age_days': (test_date - train['date']).dt.days.to_numpy(),
                'num_protags': len(protags),
                'num_antags': len(antags),
                'protag_pos': protag_pos.to_numpy()[scorable].astype(int),
                'antag_pos': antag_pos.to_numpy()[scorable].astype(int),
                'y_test': test[self.stat_col].to_numpy(dtype=float)[scorable],
            })

        if not folds:
            raise ValueError("No sampled test date has enough data to evaluate")

        def time_bayes_objective(halflife, l2):
            """Negative MSE averaged over every prepared test date."""
            decay = np.exp(-np.log(2)/halflife)
            fold_mses = []

            for fold in folds:
                X = fold['X']

                # Exponential-decay weights, normalised to sum to one.
                dw = decay ** fold['age_days']
                dw = dw/np.sum(dw)
                W = sparse.diags(dw)

                # Weighted normal equations with a ridge term scaled by the
                # mean diagonal of X'WX.
                q = (X.T @ W @ X).toarray()
                q += l2 * np.eye(q.shape[0]) * np.trace(q) / q.shape[0]
                f = X.T @ W @ fold['y']
                b = solve(q, f, assume_a='pos')

                # Split the solution into offense and defense ratings and
                # look them up for each scorable test row.
                num_protags = fold['num_protags']
                num_antags = fold['num_antags']
                offense_ratings = b[:num_protags]
                defense_ratings = b[num_protags:num_protags+num_antags]
                features = np.column_stack([
                    offense_ratings[fold['protag_pos']],
                    defense_ratings[fold['antag_pos']],
                ])

                predictions = cross_val_predict(LinearRegression(), features, fold['y_test'], cv=5)
                fold_mses.append(np.mean((fold['y_test'] - predictions) ** 2))

            # Average across test dates (previously only the last date's MSE
            # was returned, and the value was undefined if all were skipped).
            return -float(np.mean(fold_mses))

        pbounds = {'halflife': halflife_bounds, 'l2': l2_bounds}

        # Initialize the Bayesian Optimization object
        optimizer = BayesianOptimization(f=time_bayes_objective, pbounds=pbounds, random_state=17)

        # Perform the optimization
        optimizer.maximize(init_points=init_points, n_iter=n_iter)

        # The optimiser maximises, so flip the sign back to report an MSE.
        best_params = optimizer.max['params']
        best_halflife = best_params['halflife']
        best_l2 = best_params['l2']
        best_mse = -optimizer.max['target']

        return best_halflife, best_l2, best_mse

    def run_full_time_opt(self, num_samples=25, num_test_dates=20, num_future_days=60, max_lookback=365*2, halflife_bounds=(50, 800), l2_bounds=(1e-8, 1)):

        optimal_halflifes = []
        optimal_l2s = []
        best_mses = []
        for i in tqdm(range(num_samples), total=num_samples):
            best_halflife, best_l2, best_mse = self.run_time_opt_scipy(13, 32, num_test_dates, num_future_days, max_lookback, halflife_bounds, l2_bounds)

            optimal_halflifes.append(best_halflife)
            optimal_l2s.append(best_l2)
            best_mses.append(best_mse)
        return optimal_halflifes, optimal_l2s, best_mses


    def get_ratings_for_dates(self, dates, halflife, l2, max_lookback=365*2.1):
        
        num_dates = len(dates)

        decay = np.exp(-np.log(2)/halflife)
        offense_stats = []
        defense_stats = []
        meta_stats = []
        # dates = [pd.to_datetime('04-09-2024')]
        for i, date in tqdm(enumerate(dates), total=num_dates):

                # Filter data before the given date
            X_train = self.data[(self.data['date'] >= date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < date)].copy()
            idx = X_train.index.values
            if len(X_train) < 50:
                print("Minimum data threshold not met")
                continue
            X_train, protags, antags = self.initialize_X_train(X_train)
            num_protags = len(protags)
            num_antags = len(antags)

            # Filter data before the given date
            time_date = pd.to_datetime(date) 
            temp = time_date - pd.to_datetime(self.data.iloc[idx]['date'])
            dw = decay ** (temp.astype('timedelta64[D]').astype(int))
            dw = dw.values.reshape(-1)
            dw = dw/np.sum(dw)

            # Calculate the time differences in days between each game and the most recent game
            W = sparse.diags(dw)
            
            q = (X_train.T @ W @ X_train).toarray()
            q += l2 * np.eye(q.shape[0]) * np.trace(q) / q.shape[0]

            y = self.data.iloc[idx][self.stat_col].values
            f = X_train.T @ W @ y
            # Calculate the exponential decay weights based on the time differences and half-lives
            # solution
            b = solve(q, f, assume_a='pos')

            # Split the ratings into offense and defense ratings
            offense_ratings = b[:num_protags]
            defense_ratings = b[num_protags:num_protags+num_antags]
            meta_ratings = b[num_protags+num_antags:]


            # Create DataFrames for offense ratings and defense ratings
            offense_stat = pd.DataFrame({
                'protag': protags,
                self.stat_col: offense_ratings
            })
            offense_stat['date'] = date
            defense_stat = pd.DataFrame({
                'antag': antags,
                self.stat_col: defense_ratings
            })
            defense_stat['date'] = date
            meta_ratings = pd.DataFrame({
                'meta': self.meta_cols,
                self.stat_col: meta_ratings
            })

            offense_stats.append(offense_stat)
            defense_stats.append(defense_stat)
            meta_stats.append(meta_ratings)

        offense_stats = pd.concat(offense_stats).reset_index(drop=True)
        defense_stats = pd.concat(defense_stats).reset_index(drop=True)

        return offense_stats, defense_stats


In [41]:
# Preview the first rows of the DataFrame loaded from DATA_PATH.
m_data.head()

Unnamed: 0,season,team_score,opp_score,is_home,numot,team_fgm,team_fga,team_fgm3,team_fga3,team_ftm,...,opp_or,opp_dr,opp_ast,opp_to,opp_stl,opp_blk,opp_pf,team_name,opp_name,date
0,2003,68,62,0,0,27,58,3,14,11,...,10,22,8,18,9,2,20,Alabama,Oklahoma,2002-11-14
1,2003,70,63,0,0,26,62,8,20,10,...,20,25,7,12,8,6,16,Memphis,Syracuse,2002-11-14
2,2003,62,68,0,0,22,53,2,10,16,...,14,24,13,23,7,1,22,Oklahoma,Alabama,2002-11-14
3,2003,63,70,0,0,24,67,6,24,9,...,15,28,16,13,4,4,18,Syracuse,Memphis,2002-11-14
4,2003,55,81,-1,0,20,46,3,11,12,...,12,24,12,9,9,3,18,E Washington,Wisconsin,2002-11-15


In [42]:

# Configure the optimizer for the columns of m_data and attach the data.
MO = MasseyOptimizer('time', protag_col='team_name', antag_col='opp_name', stat_col='team_score', meta_cols=['is_home'])
MO.load_data(m_data)

# run_time_opt samples its test dates with np.random.choice, so seed NumPy's
# global RNG to make the search repeatable.
np.random.seed(17)

# The ratings cell below needs `halflife` and `l2`; with this call commented
# out they only existed as leftover kernel state and a fresh
# Restart & Run All failed with NameError.
halflife, l2, mse = MO.run_time_opt()


In [39]:
# Rate every game date after the cutoff. Dates are read from MO.data, which
# preprocess_data has already converted to datetimes and sorted, instead of
# relying on m_data['date'] having been converted in place as a side effect
# of MO.load_data.
RATING_START = pd.Timestamp('2021-10-01')
game_dates = pd.to_datetime(MO.data['date'])
rating_dates = [pd.Timestamp(d) for d in sorted(game_dates[game_dates > RATING_START].unique())]
offense_ratings, defense_ratings = MO.get_ratings_for_dates(rating_dates, halflife, l2)

  0%|          | 0/135 [00:00<?, ?it/s]


ValueError: matrix - rhs dimension mismatch ((715, 715) - 1)

In [21]:
# Preview the offense ratings frame returned by MO.get_ratings_for_dates.
offense_ratings.head()

Unnamed: 0,protag,team_score,date
0,Abilene Chr,1.256678,2021-11-09
1,Air Force,-9.278845,2021-11-09
2,Akron,3.565941,2021-11-09
3,Alabama,12.833932,2021-11-09
4,Alabama A&M,-14.621407,2021-11-09
