In [2]:


import os
import torch
import numpy as np
import pandas as pd

from tqdm import tqdm
from scipy import sparse
from datetime import datetime
from scipy.linalg import solve
from scipy.optimize import minimize
from scipy.sparse.linalg import spsolve
from bayes_opt import BayesianOptimization
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict



In [3]:

DATA_PATH = '../../data/testing/ncaam_sample_data.csv'
def load_data(data_path):
    return pd.read_csv(data_path)

m_data = load_data(DATA_PATH)


In [5]:
class Optimizer:
    def __init__(self):
        pass

    def optimize(self):
        raise NotImplementedError("Subclasses must implement the optimize method.")
        
class MasseyOptimizer:
    def __init__(self, decay_type, protag_col='team', antag_col='opponent', stat_col='team_sq_score', meta_cols=['location'], min_protag_games=5):
        self.decay_type = decay_type
        self.protag_col = protag_col
        self.antag_col = antag_col
        self.stat_col = stat_col
        self.meta_cols = meta_cols
        self.min_protag_games = min_protag_games

        if decay_type not in ['time', 'games', 'both']:
            raise ValueError("decay_type must be 'time', 'games', or 'both'")

    def load_data(self, data=None, path=None):
        if path:
            self.data = pd.read_csv(path)
        elif data is not None:
            self.data = data.copy()
        else:
            raise ValueError("Either data or path must be provided")
        self._preprocess_data()

    def _preprocess_data(self):
        required_columns = [self.protag_col, self.antag_col, self.stat_col, 'date']
        if not all(col in self.data.columns for col in required_columns):
            raise ValueError(f"Data must contain columns: {required_columns}")

        self.data['date'] = pd.to_datetime(self.data['date'])
        self.data = self.data.sort_values('date').reset_index(drop=True)

        self.protags = sorted(self.data[self.protag_col].unique())
        self.antags = sorted(self.data[self.antag_col].unique())

        protag_map = {p: i for i, p in enumerate(self.protags)}
        antag_map = {a: i for i, a in enumerate(self.antags)}

        self.data['protag_idx'] = self.data[self.protag_col].map(protag_map)
        self.data['antag_idx'] = self.data[self.antag_col].map(antag_map)

        if len(self.data) <= 200:
            raise ValueError("Not enough data to optimize (minimum 200 rows)")

    def _initialize_X(self, df, protags, antags):
        df = df.copy()  # Create a copy to avoid SettingWithCopyWarning
        num_protags = len(protags)
        num_antags = len(antags)
        
        protag_map = {p: i for i, p in enumerate(protags)}
        antag_map = {a: i for i, a in enumerate(antags)}
        
        df.loc[:, 'protag_idx'] = df[self.protag_col].map(protag_map).fillna(-1).astype(int)
        df.loc[:, 'antag_idx'] = df[self.antag_col].map(antag_map).fillna(-1).astype(int)
        
        X = sparse.lil_matrix((len(df), num_protags + num_antags + len(self.meta_cols)))
        valid_rows = (df['protag_idx'] != -1) & (df['antag_idx'] != -1)
        X[valid_rows, df.loc[valid_rows, 'protag_idx']] = 1
        X[valid_rows, df.loc[valid_rows, 'antag_idx'] + num_protags] = 1
        
        for i, col in enumerate(self.meta_cols):
            X[:, -(i+1)] = df[col].values.reshape(-1, 1)
        
        return sparse.csr_matrix(X), df[valid_rows]

    def _calculate_weights(self, train_data, test_date, halflife):
        decay = np.exp(-np.log(2) / halflife)
        time_diff = (test_date - train_data['date']).dt.total_seconds() / (24 * 3600)
        weights = decay ** time_diff
        return weights.values

    def _fit_model(self, X_train, y_train, weights, l2):
        W = sparse.diags(weights)
        q = (X_train.T @ W @ X_train).toarray()
        q += l2 * np.eye(q.shape[0]) * np.trace(q) / q.shape[0]
        f = X_train.T @ W @ y_train
        return solve(q, f, assume_a='pos')

    def _predict_and_evaluate(self, X_test, y_test, coeffs, num_protags, num_antags):
        offense_ratings = coeffs[:num_protags]
        defense_ratings = coeffs[num_protags:num_protags+num_antags]
        
        X_test_ratings = np.column_stack([
            offense_ratings[X_test[:, :num_protags].nonzero()[1]],
            defense_ratings[X_test[:, num_protags:num_protags+num_antags].nonzero()[1] - num_protags]
        ])
        
        linear_model = LinearRegression()
        predictions = cross_val_predict(linear_model, X_test_ratings, y_test, cv=5)
        mse = np.mean((y_test - predictions) ** 2)
        return mse

    def optimize(self, init_points=10, n_iter=30, num_test_dates=20, num_future_days=60, max_lookback=365*3, halflife_bounds=(10, 800), l2_bounds=(1e-9, 10)):
        unique_dates = sorted(self.data['date'].unique())[10:]
        test_dates = np.random.choice(unique_dates, size=num_test_dates, replace=False)

        def objective(halflife, l2):
            total_mse = 0
            for test_date in test_dates:
                train_data = self.data[(self.data['date'] >= test_date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < test_date)].copy()
                test_data = self.data[(self.data['date'] >= test_date) & (self.data['date'] <= test_date + pd.Timedelta(days=num_future_days))].copy()

                if len(train_data) < 50 or len(test_data) < 50:
                    continue

                X_train, train_data = self._initialize_X(train_data, self.protags, self.antags)
                X_test, test_data = self._initialize_X(test_data, self.protags, self.antags)

                weights = self._calculate_weights(train_data, test_date, halflife)
                coeffs = self._fit_model(X_train, train_data[self.stat_col].values, weights, l2)

                mse = self._predict_and_evaluate(X_test, test_data[self.stat_col].values, coeffs, len(self.protags), len(self.antags))
                total_mse += mse

            return -total_mse / len(test_dates)

        optimizer = BayesianOptimization(f=objective, pbounds={'halflife': halflife_bounds, 'l2': l2_bounds}, random_state=17)
        optimizer.maximize(init_points=init_points, n_iter=n_iter)

        best_params = optimizer.max['params']
        best_mse = -optimizer.max['target']
        return best_params['halflife'], best_params['l2'], best_mse

    def get_ratings_for_dates(self, dates, halflife, l2, max_lookback=365*2.1):
        offense_stats = []
        defense_stats = []

        for date in tqdm(dates):
            date = pd.to_datetime(date)
            train_data = self.data[(self.data['date'] >= date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < date)].copy()
            
            if len(train_data) < 50:
                print(f"Minimum data threshold not met for date {date}")
                continue

            X_train, train_data = self._initialize_X(train_data, self.protags, self.antags)
            weights = self._calculate_weights(train_data, date, halflife)
            coeffs = self._fit_model(X_train, train_data[self.stat_col].values, weights, l2)

            num_protags = len(self.protags)
            num_antags = len(self.antags)
            
            offense_ratings = coeffs[:num_protags]
            defense_ratings = coeffs[num_protags:num_protags+num_antags]
            meta_ratings = coeffs[num_protags+num_antags:]

            offense_stats.append(pd.DataFrame({
                'protag': self.protags,
                self.stat_col: offense_ratings,
                'date': date
            }))

            defense_stats.append(pd.DataFrame({
                'antag': self.antags,
                self.stat_col: defense_ratings,
                'date': date
            }))

        return pd.concat(offense_stats), pd.concat(defense_stats)

class MasseyMultiClassOptimizer(Optimizer):
    def __init__(self, decay_type, protag_col='team', antag_col='opponent', stat_cols='team_sq_score', meta_cols=[]):
        super().__init__()
        assert(decay_type in ['time', 'games','both'])
        self.decay_type = decay_type
        self.protag_col = protag_col
        self.antag_col = antag_col
        self.stat_cols = stat_cols
        self.meta_cols = meta_cols

    def load_data(self, data, path=None):
        if path is not None:
            self.data = pd.read_csv(path)
        else:
            self.data = data
        self.preprocess_data()

    def preprocess_data(self):
        # Convert date column to datetime if needed
        if isinstance(self.data['date'].iloc[0], str):
            self.data['date'] = pd.to_datetime(self.data['date'])

        # Sort data by date
        self.data = self.data.sort_values('date').reset_index(drop=True)

        # Create a team/player list
        self.protags = list(self.data[self.protag_col].unique())
        self.antags = list(self.data[self.antag_col].unique())
        self.protags = sorted(self.protags)
        self.antags = sorted(self.antags)

        self.num_protags = len(self.protags)
        self.num_antags = len(self.antags)
        self.num_stats = len(self.stat_cols)

        self.data['protag_idx'] = self.data[self.protag_col].apply(lambda x: self.protags.index(x))
        self.data['antag_idx'] = self.data[self.antag_col].apply(lambda x: self.antags.index(x))

        if len(self.data[self.stat_cols[0]].unique()) > 10:
            print('Warning: there are a lot of unique values in the stat col, can overload memory')

        assert(len(self.data)>200), "Not enough data to optimize"

    def initialize_X_train(self, train):

        protags = sorted(train[self.protag_col].unique())
        antags = sorted(train[self.antag_col].unique())
        num_train_protags = len(protags)
        num_train_antags = len(antags)

        train['protag_idx'] = train[self.protag_col].apply(lambda x: protags.index(x))
        train['antag_idx'] = train[self.antag_col].apply(lambda x: antags.index(x))

        X_train = np.zeros((len(train), num_train_protags+num_train_antags+len(self.meta_cols)))
        X_train[np.arange(len(train)), train['protag_idx']] = 1
        X_train[np.arange(len(train)), train['antag_idx']+num_train_protags] = 1
        for i, col in enumerate(self.meta_cols):
            X_train[np.arange(len(train)), -1*i] = train[col]
        
        X_train = sparse.csr_matrix(X_train)
        return X_train, protags, antags
    
    def initialize_X_test(self, test, protags, antags):

        num_protags = len(protags)
        num_antags = len(antags)
        test['protag_idx'] = test[self.protag_col].apply(lambda x: protags.index(x) if x in protags else num_protags)
        test['antag_idx'] = test[self.antag_col].apply(lambda x: antags.index(x) if x in antags else num_antags)
        test = test.loc[test['protag_idx']<num_protags]
        test = test.loc[test['antag_idx']<num_antags]
        test_idx = test.index.values
        test = test.reset_index(drop=True)
        X_test = np.zeros((len(test), len(protags)+len(antags)+len(self.meta_cols)))
        X_test[np.arange(len(test)), test['protag_idx']] = 1
        X_test[np.arange(len(test)), test['antag_idx']+num_protags+1] = 1

        for i, col in enumerate(self.meta_cols):
            X_test[np.arange(len(test)), -1*i] = test[col]

        X_test = sparse.csr_matrix(X_test)

        return X_test, test_idx

    def run_time_opt_scipy(self, init_points=10, n_iter=30, num_test_dates=20, num_future_days=60, max_lookback=365*2.5, halflife_bounds=(50, 800), l2_bounds=(1e-9, 1)):

        # Select random test dates
        unique_dates = self.data['date'].unique()
        assert(len(unique_dates)>15), "Not enough unique dates to test on"
        ## don't take from the first 10 or so dates
        unique_dates = sorted(unique_dates)[10:]

        test_dates = np.random.choice(unique_dates, size=num_test_dates, replace=False)
        num_dates = len(unique_dates)

        for test_date in test_dates:
            train_data = self.data[(self.data['date'] >= test_date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < test_date)].copy()
            test_data = self.data[(self.data['date'] >= test_date) & (self.data['date'] <= test_date + pd.Timedelta(days=num_future_days))].copy()
            if len(train_data) < 50 or len(test_data) < 50:
                print("Not enough data for test date", test_date)
                test_dates = np.delete(test_dates, np.where(test_dates==test_date))

        def time_bayes_objective(halflife, l2):
            # offense_halflife, defense_halflife, meta_halflife_bounds, l2 = params
            decay = np.exp(-np.log(2)/halflife)

            for i, test_date in enumerate(test_dates):

                # Filter data before the given date
                X_train = self.data[(self.data['date'] >= test_date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < test_date)].copy()
                idx = X_train.index.values
                if len(X_train) < 50:
                    print("Minimum data threshold not met")
                    continue
                X_train, protags, antags = self.initialize_X_train(X_train)
                num_protags = len(protags)
                num_antags = len(antags)
                X_test = self.data[(self.data['date'] >= test_date) & (self.data['date'] <= test_date + pd.Timedelta(days=num_future_days))].copy()

                if len(train_data) < 50 or len(test_data) < 50:
                    continue

                # Calculate the time differences in days between each game and the most recent game
                dw = decay ** (np.datetime64(test_date)  - self.data.loc[idx, 'date'].values).reshape(-1).astype('timedelta64[D]').astype(int)
                dw = dw/np.sum(dw)
                Y_train = self.data.loc[idx, self.stat_cols].values

                multilogit = LogisticRegression(C=1000, max_iter=5000,multi_class='ovr')
                multilogit.fit(X_train, Y_train, sample_weight=dw)

                # Split the ratings into intercept, offense, and defense ratings
                offense_ratings = multilogit.coef_[:, :num_protags]
                defense_ratings = multilogit.coef_[:, num_protags:num_protags+num_antags]
            
                # # Calculate ratings for the test date (without decay)
                X_test, test_idx = self.initialize_X_test(test_data, protags, antags)
                y_test = self.data.loc[test_idx, self.stat_cols].values
                y_pred = multilogit.predict_proba(X_test)

                # Evaluate the predictions using cross-entropy loss
                cross_entropy = log_loss(y_test, y_pred)

                # Print the evaluation result
                print(f"Test Cross-Entropy: {cross_entropy:.4f}")


            return -cross_entropy  
        
        best_decay_factor = None
        best_l2 = None
        best_mse = None

        pbounds = {'halflife': halflife_bounds, 'l2': l2_bounds}
        # pbounds = {'offense_halflife': offense_halflife_bounds, 'defense_halflife': defense_halflife_bounds, 'meta_halflife':meta_halflife_bounds, 'l2': l2_bounds}
       
        # Initialize the Bayesian Optimization object
        optimizer = BayesianOptimization(f=time_bayes_objective, pbounds=pbounds, random_state=17)

        # Perform the optimization
        optimizer.maximize(init_points=init_points, n_iter=n_iter)

        # Get the best parameters and correlation
        best_params = optimizer.max['params']
        best_halflife = best_params['offense_halflife']
        best_l2 = best_params['l2']
        best_correlation = -optimizer.max['target']

        return best_halflife, best_l2, best_correlation

    def run_full_time_opt(self, num_samples=25, num_test_dates=20, num_future_days=60, max_lookback=365*2, halflife_bounds=(50, 800), l2_bounds=(1e-8, 1)):

        optimal_halflifes = []
        optimal_l2s = []
        best_errors = []
        for i in tqdm(range(num_samples), total=num_samples):
            best_halflife, best_l2, best_error = self.run_time_opt_scipy(13, 32, num_test_dates, num_future_days, max_lookback, halflife_bounds, l2_bounds)

            optimal_halflifes.append(best_halflife)
            optimal_l2s.append(best_l2)
            best_errors.append(best_error)

        return optimal_halflifes, optimal_l2s, best_errors

    def get_ratings_for_dates(self, dates, halflife, l2, max_lookback=365*2.05):

        num_dates = len(dates)
        decay = np.exp(-np.log(2)/halflife)

        offense_stats = []
        defense_stats = []
        for i, date in tqdm(enumerate(dates),total=num_dates):
            # Filter data before the given date
            X_train = self.data[(self.data['date'] >= date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < date)].copy()
            idx = X_train.index.values
            if len(X_train) < 50:
                print("Minimum data threshold not met")
                continue
            X_train, protags, antags = self.initialize_X_train(X_train)
            num_protags = len(protags)
            num_antags = len(antags)

            # Calculate the time differences in days between each game and the most recent game
            dw = decay ** (np.datetime64(date)  - self.data.loc[idx, 'date'].values).reshape(-1).astype('timedelta64[D]').astype(int)
            dw = dw/np.sum(dw)
            Y_train = self.data.loc[idx, self.stat_cols].values

            multilogit = LogisticRegression(C=1000, max_iter=5000,multi_class='ovr')
            multilogit.fit(X_train, Y_train, sample_weight=dw)

            # Split the ratings into intercept, offense, and defense ratings
            offense_ratings = multilogit.coef_[:, :num_protags]
            defense_ratings = multilogit.coef_[:, num_protags:num_protags+num_antags]

            # Create DataFrames for offense ratings and defense ratings
            offense_stat = pd.DataFrame(offense_ratings.T, columns=[f'stat_{i+1}' for i in range(offense_ratings.shape[0])])
            offense_stat.insert(0, 'protag', protags)
            offense_stat['date'] = date

            defense_stat = pd.DataFrame(defense_ratings.T, columns=[f'defense_stat_{i+1}' for i in range(defense_ratings.shape[0])])
            defense_stat.insert(0, 'antag', antags)
            defense_stat['date'] = date

            offense_stats.append(offense_stat)
            defense_stats.append(defense_stat)

        offense_stats = pd.concat(offense_stats).reset_index(drop=True)
        defense_stats = pd.concat(defense_stats).reset_index(drop=True)
        return offense_stats, defense_stats

In [6]:
m_data.head()

Unnamed: 0,season,team_score,opp_score,is_home,numot,team_fgm,team_fga,team_fgm3,team_fga3,team_ftm,...,opp_or,opp_dr,opp_ast,opp_to,opp_stl,opp_blk,opp_pf,team_name,opp_name,date
0,2003,68,62,0,0,27,58,3,14,11,...,10,22,8,18,9,2,20,Alabama,Oklahoma,2002-11-14
1,2003,70,63,0,0,26,62,8,20,10,...,20,25,7,12,8,6,16,Memphis,Syracuse,2002-11-14
2,2003,62,68,0,0,22,53,2,10,16,...,14,24,13,23,7,1,22,Oklahoma,Alabama,2002-11-14
3,2003,63,70,0,0,24,67,6,24,9,...,15,28,16,13,4,4,18,Syracuse,Memphis,2002-11-14
4,2003,55,81,-1,0,20,46,3,11,12,...,12,24,12,9,9,3,18,E Washington,Wisconsin,2002-11-15


In [7]:

MO = MasseyOptimizer('time', protag_col='team_name', antag_col='opp_name', stat_col='team_score', meta_cols=['is_home'])
MO.load_data(m_data.copy())
# halflife, l2, mse = MO.run_time_opt()


In [8]:
rating_dates = sorted(m_data['date'].unique())[-15:]
halflife = 150
l2 = 1e-8
offense_ratings, defense_ratings = MO.get_ratings_for_dates(rating_dates, halflife, l2)

100%|██████████| 15/15 [00:01<00:00, 10.37it/s]


In [9]:
offense_ratings.drop_duplicates(subset=['protag'],keep='last').sort_values(by=['team_score'], ascending=False).head(10)

Unnamed: 0,protag,team_score,date
106,Gonzaga,54.718075,2022-04-04
129,Iowa,51.461523,2022-04-04
9,Arizona,51.315833,2022-04-04
3,Alabama,48.939767,2022-04-04
135,Kansas,48.813532,2022-04-04
281,St John's,47.97467,2022-04-04
77,Duke,47.78745,2022-04-04
20,Baylor,47.437041,2022-04-04
139,Kentucky,47.145549,2022-04-04
209,North Carolina,47.125641,2022-04-04


In [10]:
# MO.optimize()

In [11]:

import pandas as pd
import numpy as np
from scipy import sparse
from scipy.linalg import solve
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from bayes_opt import BayesianOptimization
from tqdm import tqdm

class MultiStatMasseyOptimizer:
    def __init__(self, decay_type, protag_col='team', antag_col='opponent', stat_cols=['team_score', 'team_rebounds', 'team_assists'], meta_cols=['location'], min_protag_games=5):
        self.decay_type = decay_type
        self.protag_col = protag_col
        self.antag_col = antag_col
        self.stat_cols = stat_cols
        self.meta_cols = meta_cols
        self.min_protag_games = min_protag_games

        if decay_type not in ['time', 'games', 'both']:
            raise ValueError("decay_type must be 'time', 'games', or 'both'")

    def load_data(self, data=None, path=None):
        if path:
            self.data = pd.read_csv(path)
        elif data is not None:
            self.data = data.copy()
        else:
            raise ValueError("Either data or path must be provided")
        self._preprocess_data()

    def _preprocess_data(self):
        required_columns = [self.protag_col, self.antag_col] + self.stat_cols + ['date']
        if not all(col in self.data.columns for col in required_columns):
            raise ValueError(f"Data must contain columns: {required_columns}")

        self.data['date'] = pd.to_datetime(self.data['date'])
        self.data = self.data.sort_values('date').reset_index(drop=True)

        self.protags = sorted(self.data[self.protag_col].unique())
        self.antags = sorted(self.data[self.antag_col].unique())

        protag_map = {p: i for i, p in enumerate(self.protags)}
        antag_map = {a: i for i, a in enumerate(self.antags)}

        self.data['protag_idx'] = self.data[self.protag_col].map(protag_map)
        self.data['antag_idx'] = self.data[self.antag_col].map(antag_map)

        if len(self.data) <= 200:
            raise ValueError("Not enough data to optimize (minimum 200 rows)")

    def _initialize_X(self, df, protags, antags):
        df = df.copy()
        num_protags = len(protags)
        num_antags = len(antags)
        
        protag_map = {p: i for i, p in enumerate(protags)}
        antag_map = {a: i for i, a in enumerate(antags)}
        
        df.loc[:, 'protag_idx'] = df[self.protag_col].map(protag_map).fillna(-1).astype(int)
        df.loc[:, 'antag_idx'] = df[self.antag_col].map(antag_map).fillna(-1).astype(int)
        
        X = sparse.lil_matrix((len(df), num_protags + num_antags + len(self.meta_cols)))
        valid_rows = (df['protag_idx'] != -1) & (df['antag_idx'] != -1)
        X[valid_rows, df.loc[valid_rows, 'protag_idx']] = 1
        X[valid_rows, df.loc[valid_rows, 'antag_idx'] + num_protags] = 1
        
        for i, col in enumerate(self.meta_cols):
            X[:, -(i+1)] = df[col].values.reshape(-1, 1)
        
        return sparse.csr_matrix(X), df[valid_rows]

    def _calculate_weights(self, train_data, test_date, halflife):
        decay = np.exp(-np.log(2) / halflife)
        time_diff = (test_date - train_data['date']).dt.total_seconds() / (24 * 3600)
        weights = decay ** time_diff
        return weights.values

    def _fit_model(self, X_train, y_train, weights, l2):
        W = sparse.diags(weights)
        q = (X_train.T @ W @ X_train).toarray()
        q += l2 * np.eye(q.shape[0]) * np.trace(q) / q.shape[0]
        f = X_train.T @ W @ y_train
        return solve(q, f, assume_a='pos')

    def _predict_and_evaluate(self, X_test, y_test, coeffs, num_protags, num_antags):
        offense_ratings = coeffs[:num_protags]
        defense_ratings = coeffs[num_protags:num_protags+num_antags]
        
        X_test_ratings = np.column_stack([
            offense_ratings[X_test[:, :num_protags].nonzero()[1]],
            defense_ratings[X_test[:, num_protags:num_protags+num_antags].nonzero()[1] - num_protags]
        ])
        
        linear_model = LinearRegression()
        predictions = cross_val_predict(linear_model, X_test_ratings, y_test, cv=5)
        mse = np.mean((y_test - predictions) ** 2)
        return mse

    def optimize(self, init_points=10, n_iter=30, num_test_dates=20, num_future_days=60, max_lookback=365*3, halflife_bounds=(10, 800), l2_bounds=(1e-9, 10)):
        unique_dates = sorted(self.data['date'].unique())[10:]
        test_dates = np.random.choice(unique_dates, size=num_test_dates, replace=False)

        def objective(halflife, l2):
            total_mse = 0
            for test_date in test_dates:
                train_data = self.data[(self.data['date'] >= test_date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < test_date)].copy()
                test_data = self.data[(self.data['date'] >= test_date) & (self.data['date'] <= test_date + pd.Timedelta(days=num_future_days))].copy()

                if len(train_data) < 50 or len(test_data) < 50:
                    continue

                X_train, train_data = self._initialize_X(train_data, self.protags, self.antags)
                X_test, test_data = self._initialize_X(test_data, self.protags, self.antags)

                weights = self._calculate_weights(train_data, test_date, halflife)
                
                for stat_col in self.stat_cols:
                    coeffs = self._fit_model(X_train, train_data[stat_col].values, weights, l2)
                    mse = self._predict_and_evaluate(X_test, test_data[stat_col].values, coeffs, len(self.protags), len(self.antags))
                    total_mse += mse

            return -total_mse / (len(test_dates) * len(self.stat_cols))

        optimizer = BayesianOptimization(f=objective, pbounds={'halflife': halflife_bounds, 'l2': l2_bounds}, random_state=17)
        optimizer.maximize(init_points=init_points, n_iter=n_iter)

        best_params = optimizer.max['params']
        best_mse = -optimizer.max['target']
        return best_params['halflife'], best_params['l2'], best_mse

    def get_ratings_for_dates(self, dates, halflife, l2, max_lookback=365*2.1):
        all_ratings = {stat: [] for stat in self.stat_cols}

        for date in tqdm(dates):
            date = pd.Timestamp(date)
            train_data = self.data[(self.data['date'] >= date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < date)].copy()
            
            if len(train_data) < 50:
                print(f"Minimum data threshold not met for date {date}")
                continue

            X_train, train_data = self._initialize_X(train_data, self.protags, self.antags)
            weights = self._calculate_weights(train_data, date, halflife)

            num_protags = len(self.protags)
            num_antags = len(self.antags)

            for stat_col in self.stat_cols:
                coeffs = self._fit_model(X_train, train_data[stat_col].values, weights, l2)
                
                offense_ratings = coeffs[:num_protags]
                defense_ratings = coeffs[num_protags:num_protags+num_antags]
                meta_ratings = coeffs[num_protags+num_antags:]

                all_ratings[stat_col].append(pd.DataFrame({
                    'team': self.protags,
                    'offense_rating': offense_ratings,
                    'defense_rating': defense_ratings,
                    'date': date
                }))

        return {stat: pd.concat(ratings) for stat, ratings in all_ratings.items()}

    def predict_game(self, protag, antag, date, meta_values=None):
        if meta_values is None:
            meta_values = [0] * len(self.meta_cols)
        
        train_data = self.data[self.data['date'] < date].copy()
        X_train, train_data = self._initialize_X(train_data, self.protags, self.antags)
        weights = self._calculate_weights(train_data, date, self.halflife)

        predictions = {}
        for stat_col in self.stat_cols:
            coeffs = self._fit_model(X_train, train_data[stat_col].values, weights, self.l2)
            
            protag_idx = self.protags.index(protag)
            antag_idx = self.antags.index(antag)
            
            offense_rating = coeffs[protag_idx]
            defense_rating = coeffs[len(self.protags) + antag_idx]
            meta_ratings = coeffs[len(self.protags) + len(self.antags):]
            
            prediction = offense_rating - defense_rating + np.dot(meta_ratings, meta_values)
            predictions[stat_col] = prediction

        return predictions


In [12]:
msmo = MultiStatMasseyOptimizer(
    decay_type='time',
    protag_col='team_name',
    antag_col='opp_name',
    stat_cols=['team_score', 'team_dr', 'team_ast'],
    meta_cols=['is_home']
)
msmo.load_data(m_data)

# Optimize the model
# best_halflife, best_l2, best_mse = msmo.optimize()

# Get ratings for specific dates
dates = sorted(m_data['date'].unique())[-15:]
best_halflife = 120
best_l2 = 1e-8
ratings = msmo.get_ratings_for_dates(dates, best_halflife, best_l2)

# Access ratings for a specific stat
score_ratings = ratings['team_score']
rebound_ratings = ratings['team_dr']
assist_ratings = ratings['team_ast']

# Predict a specific game
# prediction = msmo.predict_game('team_name', 'opp_name', date, meta_values=[1])
# print(f"Predicted score: {prediction['team_score']}")
# print(f"Predicted rebounds: {prediction['team_dr']}")
# print(f"Predicted assists: {prediction['team_ast']}")

100%|██████████| 15/15 [00:02<00:00,  6.39it/s]


In [17]:
score_ratings.drop_duplicates(subset=['team'],keep='last').sort_values(by=['offense_rating'], ascending=False).head(10)

Unnamed: 0,team,offense_rating,defense_rating,date
106,Gonzaga,54.461986,29.88023,2022-04-04
9,Arizona,51.622203,31.215235,2022-04-04
129,Iowa,51.344653,32.738985,2022-04-04
135,Kansas,49.109418,27.763761,2022-04-04
3,Alabama,48.898568,35.35173,2022-04-04
281,St John's,48.040235,36.644418,2022-04-04
77,Duke,47.981441,29.429602,2022-04-04
139,Kentucky,47.46419,28.364729,2022-04-04
209,North Carolina,47.326726,31.498682,2022-04-04
20,Baylor,47.286338,26.100549,2022-04-04


In [14]:
rebound_ratings

Unnamed: 0,team,offense_rating,defense_rating,date
0,Abilene Chr,8.488406,13.495930,2022-03-11
1,Air Force,7.502979,11.563780,2022-03-11
2,Akron,11.196717,9.447881,2022-03-11
3,Alabama,14.763071,9.974514,2022-03-11
4,Alabama A&M,11.663488,15.143592,2022-03-11
...,...,...,...,...
358,Wright St,11.632482,10.991622,2022-04-04
359,Wyoming,14.754123,10.751527,2022-04-04
360,Xavier,14.864636,10.995517,2022-04-04
361,Yale,12.925246,12.342821,2022-04-04


In [15]:
assist_ratings

Unnamed: 0,team,offense_rating,defense_rating,date
0,Abilene Chr,7.911145,5.084868,2022-03-11
1,Air Force,6.826064,4.758409,2022-03-11
2,Akron,4.594847,4.313961,2022-03-11
3,Alabama,8.669799,4.828603,2022-03-11
4,Alabama A&M,2.031747,8.265675,2022-03-11
...,...,...,...,...
358,Wright St,6.449498,4.816999,2022-04-04
359,Wyoming,5.337335,3.352986,2022-04-04
360,Xavier,9.281279,5.520721,2022-04-04
361,Yale,6.253933,5.539027,2022-04-04


In [16]:
m_data.head()

Unnamed: 0,season,team_score,opp_score,is_home,numot,team_fgm,team_fga,team_fgm3,team_fga3,team_ftm,...,opp_or,opp_dr,opp_ast,opp_to,opp_stl,opp_blk,opp_pf,team_name,opp_name,date
0,2003,68,62,0,0,27,58,3,14,11,...,10,22,8,18,9,2,20,Alabama,Oklahoma,2002-11-14
1,2003,70,63,0,0,26,62,8,20,10,...,20,25,7,12,8,6,16,Memphis,Syracuse,2002-11-14
2,2003,62,68,0,0,22,53,2,10,16,...,14,24,13,23,7,1,22,Oklahoma,Alabama,2002-11-14
3,2003,63,70,0,0,24,67,6,24,9,...,15,28,16,13,4,4,18,Syracuse,Memphis,2002-11-14
4,2003,55,81,-1,0,20,46,3,11,12,...,12,24,12,9,9,3,18,E Washington,Wisconsin,2002-11-15


In [63]:
import warnings

from scipy.sparse import csr_matrix
from sklearn.metrics import log_loss
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression


class MasseyMultiClassOptimizer:
    def __init__(self, decay_type, protag_col='team', antag_col='opponent', stat_cols=['team_sq_score'], meta_cols=[], min_protag_games=5):
        self.decay_type = decay_type
        self.protag_col = protag_col
        self.antag_col = antag_col
        self.stat_cols = stat_cols if isinstance(stat_cols, list) else [stat_cols]
        self.meta_cols = meta_cols
        self.min_protag_games = min_protag_games

        if decay_type not in ['time', 'games', 'both']:
            raise ValueError("decay_type must be 'time', 'games', or 'both'")

        self._suppress_logistic_regression_warning()

    @staticmethod
    def _suppress_logistic_regression_warning():
        warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.linear_model._logistic")

    def load_data(self, data=None, path=None):
        if path:
            self.data = pd.read_csv(path)
        elif data is not None:
            self.data = data.copy()
        else:
            raise ValueError("Either data or path must be provided")
        self._preprocess_data()

    def _preprocess_data(self):
        required_columns = [self.protag_col, self.antag_col] + self.stat_cols + ['date']
        if not all(col in self.data.columns for col in required_columns):
            raise ValueError(f"Data must contain columns: {required_columns}")

        self.data['date'] = pd.to_datetime(self.data['date'])
        self.data = self.data.sort_values('date').reset_index(drop=True)

        self.protags = sorted(self.data[self.protag_col].unique())
        self.antags = sorted(self.data[self.antag_col].unique())

        protag_map = {p: i for i, p in enumerate(self.protags)}
        antag_map = {a: i for i, a in enumerate(self.antags)}

        self.data['protag_idx'] = self.data[self.protag_col].map(protag_map)
        self.data['antag_idx'] = self.data[self.antag_col].map(antag_map)

        # Encode stat_cols if they're categorical
        self.label_encoders = {}
        for col in self.stat_cols:
            if self.data[col].dtype == 'object':
                le = LabelEncoder()
                self.data[col] = le.fit_transform(self.data[col])
                self.label_encoders[col] = le

        if len(self.data) <= 200:
            raise ValueError("Not enough data to optimize (minimum 200 rows)")

    def _initialize_X(self, df, protags, antags):
        df = df.copy()
        num_protags = len(protags)
        num_antags = len(antags)
        
        protag_map = {p: i for i, p in enumerate(protags)}
        antag_map = {a: i for i, a in enumerate(antags)}
        
        df['protag_idx'] = df[self.protag_col].map(protag_map).fillna(-1).astype(int)
        df['antag_idx'] = df[self.antag_col].map(antag_map).fillna(-1).astype(int)
        
        X = np.zeros((len(df), num_protags + num_antags + len(self.meta_cols)))
        valid_rows = (df['protag_idx'] != -1) & (df['antag_idx'] != -1)
        X[valid_rows, df.loc[valid_rows, 'protag_idx']] = 1
        X[valid_rows, df.loc[valid_rows, 'antag_idx'] + num_protags] = 1
        
        for i, col in enumerate(self.meta_cols):
            X[:, -(i+1)] = df[col].values
        
        return csr_matrix(X), df[valid_rows]

    def _calculate_weights(self, train_data, test_date, halflife):
        decay = np.exp(-np.log(2) / halflife)
        time_diff = (test_date - train_data['date']).dt.total_seconds() / (24 * 3600)
        weights = decay ** time_diff
        return weights.values / np.sum(weights.values)

    def _fit_model(self, X_train, y_train, weights, l2):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            base_model = LogisticRegression(C=1/l2, max_iter=5000, multi_class='ovr')
            base_model.fit(X_train, y_train, sample_weight=weights)
        return base_model

    def _predict_and_evaluate(self, X_test, y_test, model):
        y_pred = model.predict_proba(X_test)
        return log_loss(y_test, y_pred)

    def optimize(self, init_points=10, n_iter=30, num_test_dates=20, num_future_days=60, max_lookback=365*3, halflife_bounds=(10, 800), l2_bounds=(1e-9, 10)):
        unique_dates = sorted(self.data['date'].unique())[10:]
        test_dates = np.random.choice(unique_dates, size=num_test_dates, replace=False)

        def objective(halflife, l2):
            total_loss = 0
            for test_date in test_dates:
                train_data = self.data[(self.data['date'] >= test_date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < test_date)].copy()
                test_data = self.data[(self.data['date'] >= test_date) & (self.data['date'] <= test_date + pd.Timedelta(days=num_future_days))].copy()

                if len(train_data) < 50 or len(test_data) < 50:
                    continue

                X_train, train_data = self._initialize_X(train_data, self.protags, self.antags)
                X_test, test_data = self._initialize_X(test_data, self.protags, self.antags)

                weights = self._calculate_weights(train_data, test_date, halflife)
                model = self._fit_model(X_train, train_data[self.stat_cols].values.ravel(), weights, l2)

                y_test = test_data[self.stat_cols].values.ravel()
                y_pred = model.predict_proba(X_test)
                
                # Explicitly calculate log loss for multiclass
                loss = log_loss(y_test, y_pred, labels=np.unique(y_test))
                total_loss += loss

            return -total_loss / len(test_dates)

        optimizer = BayesianOptimization(f=objective, pbounds={'halflife': halflife_bounds, 'l2': l2_bounds}, random_state=17)
        optimizer.maximize(init_points=init_points, n_iter=n_iter)

        best_params = optimizer.max['params']
        best_loss = -optimizer.max['target']
        return best_params['halflife'], best_params['l2'], best_loss

    def get_ratings_for_dates(self, dates, halflife, l2, max_lookback=365*2.1):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)

        dates = pd.to_datetime(dates)
        all_ratings = []

        for date in tqdm(dates):
            train_data = self.data[(self.data['date'] >= date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < date)]
            
            if len(train_data) < 50:
                print(f"Minimum data threshold not met for date {date}")
                continue

            X_train, _ = self._initialize_X(train_data, self.protags, self.antags)
            weights = self._calculate_weights(train_data, date, halflife)
            model = self._fit_model(X_train, train_data[self.stat_cols].values.ravel(), weights, l2)

            num_protags = len(self.protags)
            num_antags = len(self.antags)
            num_classes = model.coef_.shape[0]

            daily_games = self.data[self.data['date'] == date]
            
            # Prepare rating matrices
            offense_ratings = model.coef_[:, :num_protags]
            defense_ratings = model.coef_[:, num_protags:num_protags+num_antags]

            # Create mapping arrays
            protag_idx = daily_games['protag_idx'].values
            antag_idx = daily_games['antag_idx'].values

            # Use advanced indexing to get ratings for each game
            offense_game_ratings = offense_ratings[:, protag_idx].T
            defense_game_ratings = defense_ratings[:, antag_idx].T

            # Create a DataFrame with all ratings
            ratings_df = pd.DataFrame(
                np.column_stack([offense_game_ratings, defense_game_ratings]),
                columns=(
                    [f'offense_rating_{i}' for i in range(num_classes)] +
                    [f'defense_rating_{i}' for i in range(num_classes)]
                )
            )

            # Add metadata columns
            ratings_df['date'] = date
            ratings_df[self.protag_col] = daily_games[self.protag_col].values
            ratings_df[self.antag_col] = daily_games[self.antag_col].values

            all_ratings.append(ratings_df)

        # Combine all dates
        all_ratings_df = pd.concat(all_ratings, ignore_index=True)

        # Split into offense and defense DataFrames
        offense_cols = [self.protag_col, 'date'] + [f'offense_rating_{i}' for i in range(num_classes)]
        defense_cols = [self.antag_col, 'date'] + [f'defense_rating_{i}' for i in range(num_classes)]

        offense_stats = all_ratings_df[offense_cols].rename(columns={self.protag_col: 'team'})
        defense_stats = all_ratings_df[defense_cols].rename(columns={self.antag_col: 'team'})

        return offense_stats, defense_stats

In [64]:
m_data['score_binned'] = pd.qcut(m_data['team_score'], 5, labels=False)
m_data

Unnamed: 0,season,team_score,opp_score,is_home,numot,team_fgm,team_fga,team_fgm3,team_fga3,team_ftm,...,opp_dr,opp_ast,opp_to,opp_stl,opp_blk,opp_pf,team_name,opp_name,date,score_binned
0,2003,68,62,0,0,27,58,3,14,11,...,22,8,18,9,2,20,Alabama,Oklahoma,2002-11-14,2
1,2003,70,63,0,0,26,62,8,20,10,...,25,7,12,8,6,16,Memphis,Syracuse,2002-11-14,2
2,2003,62,68,0,0,22,53,2,10,16,...,24,13,23,7,1,22,Oklahoma,Alabama,2002-11-14,1
3,2003,63,70,0,0,24,67,6,24,9,...,28,16,13,4,4,18,Syracuse,Memphis,2002-11-14,1
4,2003,55,81,-1,0,20,46,3,11,12,...,24,12,9,9,3,18,E Washington,Wisconsin,2002-11-15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206555,2022,81,65,0,0,29,54,13,24,10,...,17,12,9,3,0,11,Kansas,Villanova,2022-04-02,4
206556,2022,81,77,0,0,27,64,10,26,17,...,25,12,4,7,4,18,North Carolina,Duke,2022-04-02,4
206557,2022,65,81,0,0,22,57,13,31,8,...,25,18,7,4,4,8,Villanova,Kansas,2022-04-02,1
206558,2022,72,69,0,0,29,66,6,17,8,...,29,9,13,2,6,13,Kansas,North Carolina,2022-04-04,2


In [65]:

MMCO = MasseyMultiClassOptimizer('time', protag_col='team_name', antag_col='opp_name', stat_cols=['score_binned'], meta_cols=['is_home'])
MMCO.load_data(m_data.copy())
# halflife, l2, mse = MMCO.optimize()

rating_dates = sorted(m_data['date'].unique())[-15:]
halflife = 120
l2 = 1e-8
ratings = MMCO.get_ratings_for_dates(rating_dates, halflife, l2)


100%|██████████| 15/15 [00:03<00:00,  4.37it/s]


In [66]:
ratings

(                 team       date  offense_rating_0  offense_rating_1  \
 0                 SMU 2022-03-11         -1.463397         -0.616961   
 1      St Bonaventure 2022-03-11         -0.381402         -0.514551   
 2             Seattle 2022-03-11          0.067428         -0.445615   
 3        SE Louisiana 2022-03-11         -0.951221          0.411065   
 4    UC Santa Barbara 2022-03-11          0.467863         -0.754431   
 ..                ...        ...               ...               ...   
 277            Kansas 2022-04-02         -4.119647         -0.652892   
 278    North Carolina 2022-04-02         -1.562677         -1.384586   
 279         Villanova 2022-04-02         -1.045134         -0.716690   
 280            Kansas 2022-04-04         -3.921869         -0.740778   
 281    North Carolina 2022-04-04         -1.431219         -1.398705   
 
      offense_rating_2  offense_rating_3  offense_rating_4  
 0           -0.170408          1.396364          0.605474  


In [67]:
ratings[0].drop_duplicates(subset=['team'],keep='last').sort_values(by=['offense_rating_0'], ascending=True).head(10)

Unnamed: 0,team,date,offense_rating_0,offense_rating_1,offense_rating_2,offense_rating_3,offense_rating_4
280,Kansas,2022-04-04,-3.921869,-0.740778,0.371342,0.940165,2.376644
185,Kentucky,2022-03-17,-3.567765,-0.178213,-0.543485,0.514193,2.482238
143,Princeton,2022-03-13,-3.138864,-1.178203,-0.900765,0.528821,1.416427
219,Davidson,2022-03-18,-3.060735,-0.249868,-0.112205,1.317391,0.572686
276,Duke,2022-04-02,-2.874736,-1.635619,-0.155918,0.855029,2.466332
170,San Francisco,2022-03-17,-2.810754,-0.019907,0.376329,0.600163,1.12364
252,Arizona,2022-03-24,-2.620607,-2.167322,-0.655918,-0.617368,3.915034
161,S Dakota St,2022-03-17,-2.614745,-2.081051,-1.538762,-0.25447,2.485481
254,Gonzaga,2022-03-24,-2.534668,-2.507969,-2.136097,-1.014751,4.951599
206,Alabama,2022-03-18,-2.479474,-2.118683,-0.579715,1.040591,2.842992


In [68]:
MMCO.optimize()

|   iter    |  target   | halflife  |    l2     |
-------------------------------------------------
| [39m1        [39m | [39m-1.6     [39m | [39m242.8    [39m | [39m5.306    [39m |
| [35m2        [39m | [35m-1.595   [39m | [35m161.3    [39m | [35m0.679    [39m |
| [39m3        [39m | [39m-1.6     [39m | [39m631.7    [39m | [39m6.563    [39m |
| [39m4        [39m | [39m-1.6     [39m | [39m513.6    [39m | [39m5.756    [39m |
| [39m5        [39m | [39m-1.6     [39m | [39m40.86    [39m | [39m3.578    [39m |
| [39m6        [39m | [39m-1.596   [39m | [39m757.1    [39m | [39m0.6004   [39m |
| [39m7        [39m | [39m-1.601   [39m | [39m692.6    [39m | [39m8.773    [39m |
| [39m8        [39m | [39m-1.6     [39m | [39m50.44    [39m | [39m6.524    [39m |
| [39m9        [39m | [39m-1.6     [39m | [39m445.9    [39m | [39m5.975    [39m |
| [39m10       [39m | [39m-1.599   [39m | [39m392.0    [39m | [39m2.83     [39m |


(np.float64(162.30269172938603),
 np.float64(0.10289347090477433),
 np.float64(1.5878800657274277))

In [94]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import TransformedTargetRegressor


class MasseyProbOptimizer:
    def __init__(self, decay_type, protag_col='team', antag_col='opponent', stat_col='team_sq_score', meta_cols=None, min_protag_games=5):
        self.decay_type = decay_type
        self.protag_col = protag_col
        self.antag_col = antag_col
        self.stat_col = stat_col
        self.meta_cols = meta_cols if meta_cols is not None else []
        self.min_protag_games = min_protag_games

        if decay_type not in ['time', 'games', 'both']:
            raise ValueError("decay_type must be 'time', 'games', or 'both'")

    def load_data(self, data=None, path=None):
        if path:
            self.data = pd.read_csv(path)
        elif data is not None:
            self.data = data.copy()
        else:
            raise ValueError("Either data or path must be provided")
        self._preprocess_data()

    def _preprocess_data(self):
        self.data['date'] = pd.to_datetime(self.data['date'])
        self.data = self.data.sort_values('date').reset_index(drop=True)

        self.protags = sorted(self.data[self.protag_col].unique())
        self.antags = sorted(self.data[self.antag_col].unique())

        protag_map = {p: i for i, p in enumerate(self.protags)}
        antag_map = {a: i for i, a in enumerate(self.antags)}

        self.data['protag_idx'] = self.data[self.protag_col].map(protag_map)
        self.data['antag_idx'] = self.data[self.antag_col].map(antag_map)

        if len(self.data) <= 200:
            raise ValueError("Not enough data to optimize (minimum 200 rows)")

    def _initialize_X(self, df, protags, antags):
        num_protags = len(protags)
        num_antags = len(antags)
        num_features = num_protags + num_antags + len(self.meta_cols)
        
        protag_map = {p: i for i, p in enumerate(protags)}
        antag_map = {a: i for i, a in enumerate(antags)}
        
        rows = np.arange(len(df))
        protag_cols = df[self.protag_col].map(protag_map).fillna(-1).astype(int)
        antag_cols = df[self.antag_col].map(antag_map).fillna(-1).astype(int) + num_protags
        
        valid_mask = (protag_cols != -1) & (antag_cols != num_protags - 1)
        rows = rows[valid_mask]
        protag_cols = protag_cols[valid_mask]
        antag_cols = antag_cols[valid_mask]
        
        data = np.ones(len(rows) * 2)
        cols = np.concatenate([protag_cols, antag_cols])
        rows = np.concatenate([rows, rows])
        
        X = sparse.csr_matrix((data, (rows, cols)), shape=(len(df), num_features))
        
        if self.meta_cols:
            meta_data = df[self.meta_cols].values
            meta_sparse = sparse.csr_matrix(meta_data)
            X = sparse.hstack([X, meta_sparse], format='csr')
        
        return X, df[valid_mask]

    def _calculate_weights(self, train_data, test_date, halflife):
        if self.decay_type == 'time':
            time_diff = (test_date - train_data['date']).dt.total_seconds() / (24 * 3600)
        elif self.decay_type == 'games':
            time_diff = np.arange(len(train_data))[::-1]
        else:  # 'both'
            time_diff = (test_date - train_data['date']).dt.total_seconds() / (24 * 3600)
            game_diff = np.arange(len(train_data))[::-1]
            time_diff = time_diff * game_diff

        decay = np.exp(-np.log(2) / halflife)
        weights = decay ** time_diff
        return weights / np.sum(weights)

    def _fit_model(self, X_train, y_train, weights, l2):
        model = Ridge(alpha=l2)
        logit = lambda x: np.log(x/(1-x))
        inverse_logit = lambda x: 1/(1+np.exp(-x))
        sigmoid_transformer = TransformedTargetRegressor(regressor=model, func=inverse_logit, inverse_func=logit)
        sigmoid_transformer.fit(X_train, y_train, sample_weight=weights)
        return sigmoid_transformer

    def _predict_and_evaluate(self, X_test, y_test, model):
        predictions = model.predict(X_test)
        nll = -y_test * np.log(predictions) - (1 - y_test) * np.log(1 - predictions)
        return nll.mean()

    def optimize(self, init_points=10, n_iter=30, num_test_dates=20, num_future_days=60, max_lookback=365*3, halflife_bounds=(10, 800), l2_bounds=(1e-9, 10)):
        unique_dates = sorted(self.data['date'].unique())[10:]
        test_dates = np.random.choice(unique_dates, size=num_test_dates, replace=False)

        def objective(halflife, l2):
            total_nll = 0
            for test_date in test_dates:
                train_data = self.data[(self.data['date'] >= test_date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < test_date)].copy()
                test_data = self.data[(self.data['date'] >= test_date) & (self.data['date'] <= test_date + pd.Timedelta(days=num_future_days))].copy()

                if len(train_data) < 50 or len(test_data) < 50:
                    continue

                X_train, train_data = self._initialize_X(train_data, self.protags, self.antags)
                X_test, test_data = self._initialize_X(test_data, self.protags, self.antags)

                weights = self._calculate_weights(train_data, test_date, halflife)
                model = self._fit_model(X_train, train_data[self.stat_col].values, weights, l2)

                nll = self._predict_and_evaluate(X_test, test_data[self.stat_col].values, model)
                total_nll += nll

            return -total_nll / len(test_dates)

        optimizer = BayesianOptimization(f=objective, pbounds={'halflife': halflife_bounds, 'l2': l2_bounds}, random_state=17)
        optimizer.maximize(init_points=init_points, n_iter=n_iter)

        best_params = optimizer.max['params']
        best_nll = -optimizer.max['target']
        return best_params['halflife'], best_params['l2'], best_nll

    def get_ratings_for_dates(self, dates, halflife, l2, max_lookback=365*2.1):
        offense_stats = []
        defense_stats = []
        meta_stats = []

        for date in tqdm(dates):
            date = pd.to_datetime(date)
            train_data = self.data[(self.data['date'] >= date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < date)].copy()
            
            if len(train_data) < 50:
                print(f"Minimum data threshold not met for date {date}")
                continue

            X_train, train_data = self._initialize_X(train_data, self.protags, self.antags)
            weights = self._calculate_weights(train_data, date, halflife)
            model = self._fit_model(X_train, train_data[self.stat_col].values, weights, l2)

            num_protags = len(self.protags)
            num_antags = len(self.antags)
            
            coefficients = model.regressor_.coef_
            offense_ratings = coefficients[:num_protags]
            defense_ratings = coefficients[num_protags:num_protags+num_antags]
            meta_ratings = coefficients[num_protags+num_antags:]

            offense_stats.append(pd.DataFrame({
                'protag': self.protags,
                self.stat_col: offense_ratings,
                'date': date
            }))

            defense_stats.append(pd.DataFrame({
                'antag': self.antags,
                self.stat_col: defense_ratings,
                'date': date
            }))

            if self.meta_cols:
                meta_dict = {'date': date}
                meta_dict.update({col: [rating] for col, rating in zip(self.meta_cols, meta_ratings)})
                meta_stats.append(pd.DataFrame(meta_dict))

        offense_df = pd.concat(offense_stats, ignore_index=True)
        defense_df = pd.concat(defense_stats, ignore_index=True)
        
        if self.meta_cols:
            meta_df = pd.concat(meta_stats, ignore_index=True)
        else:
            meta_df = pd.DataFrame()

        return offense_df, defense_df, meta_df

In [95]:
m_data['won_game'] = (m_data['team_score'] > m_data['opp_score']).astype(int)


In [96]:

MProbO = MasseyProbOptimizer('time', protag_col='team_name', antag_col='opp_name', stat_col='won_game', meta_cols=['is_home'])
MProbO.load_data(m_data.copy())
# halflife, l2, mse = MProbO.optimize()

rating_dates = sorted(m_data['date'].unique())[-15:]
halflife = 120
l2 = 1e-8
offense_ratings, defense_ratings, meta_ratings = MProbO.get_ratings_for_dates(rating_dates, halflife, l2)


  0%|          | 0/15 [00:00<?, ?it/s]


ValueError: All arrays must be of the same length