In [7]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from scipy import sparse
from scipy.linalg import solve
from sklearn.pipeline import Pipeline
from bayes_opt import BayesianOptimization
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict


In [85]:

class Optimizer:
    """Base interface for optimizers; concrete subclasses supply ``optimize``."""

    def __init__(self):
        # Nothing to set up at the base level.
        return None

    def optimize(self):
        """Run the optimization. The base class has no implementation.

        Raises
        ------
        NotImplementedError
            Always, until a subclass overrides this method.
        """
        message = "Subclasses must implement the optimize method."
        raise NotImplementedError(message)

class PLSOptimizer:
    """Per-date offense/defense ratings from a decay-weighted PLS regression.

    For every requested date the model is refit on the rows that fall inside a
    trailing look-back window (strictly before that date). Each row is weighted
    by an exponential decay on its age in days, and the fitted coefficients are
    reported as ratings for the rows dated exactly on the requested date.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``date`` column plus the columns named below.
        NOTE: the ``date`` column of the frame passed in is converted to
        datetime in place, so the caller's frame is modified.
    protag_col : str
        Column identifying the protagonist (one-hot encoded; its coefficients
        become the ``offense`` rating).
    antag_col : str
        Column identifying the antagonist (one-hot encoded; its coefficients
        become the ``defense`` rating).
    stat_col : str
        Column holding the regression target.
    meta_cols : sequence of str, str, or None
        Extra numeric feature columns, standardized before fitting. ``None`` or
        an empty sequence means "no meta features"; a single string is treated
        as a one-element list.
    min_protag_games : int
        Stored on the instance; not used by any method of this class.
    """

    # A date is skipped (with a printed notice) when fewer rows than this fall
    # inside its training window.
    MIN_TRAIN_ROWS = 50

    def __init__(self, data, protag_col='team', antag_col='opponent', stat_col='team_sq_score', meta_cols=('location',), min_protag_games=5):
        self.data = data
        # In-place conversion on the caller's frame (see class docstring).
        self.data['date'] = pd.to_datetime(self.data['date'])
        self.protag_col = protag_col
        self.antag_col = antag_col
        self.stat_col = stat_col
        # Always keep a private list: avoids sharing a mutable default between
        # instances and lets later code concatenate/len() it unconditionally.
        if meta_cols is None:
            self.meta_cols = []
        elif isinstance(meta_cols, str):
            self.meta_cols = [meta_cols]
        else:
            self.meta_cols = list(meta_cols)
        self.min_protag_games = min_protag_games
        self.preprocess_data()

    def preprocess_data(self):
        """Sort the data by date and fit the encoders/scaler on the full frame."""
        self.data = self.data.sort_values('date').reset_index(drop=True)
        self.protags = sorted(self.data[self.protag_col].unique())
        self.antags = sorted(self.data[self.antag_col].unique())

        # Pre-encode categorical variables
        self.protag_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
        self.antag_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
        self.protag_encoder.fit(self.data[[self.protag_col]])
        self.antag_encoder.fit(self.data[[self.antag_col]])

        # Pre-scale meta columns (no scaler exists when there are none)
        self.meta_scaler = None
        if self.meta_cols:
            self.meta_scaler = StandardScaler()
            self.meta_scaler.fit(self.data[self.meta_cols])

    def initialize_X(self, df):
        """Build the sparse design matrix ``[protag one-hot | antag one-hot | scaled meta]``.

        Returns a CSR matrix with one row per row of ``df``.
        """
        X_protag = self.protag_encoder.transform(df[[self.protag_col]])
        X_antag = self.antag_encoder.transform(df[[self.antag_col]])

        if self.meta_cols:
            X_meta = sparse.csr_matrix(self.meta_scaler.transform(df[self.meta_cols]))
        else:
            # Zero-width block keeps the hstack below uniform.
            X_meta = sparse.csr_matrix((len(df), 0))

        return sparse.hstack([X_protag, X_antag, X_meta], format='csr')

    def create_pls_model(self, n_components=10):
        """Return an unfitted ``PLSRegression`` with ``n_components`` components."""
        return PLSRegression(n_components=n_components, max_iter=1000)

    def get_ratings_for_dates(self, dates, halflife, l2, max_lookback=365*2.1):
        """Fit one decay-weighted PLS model per date and collect that date's ratings.

        Parameters
        ----------
        dates : iterable
            Dates to rate; each is coerced with ``pd.Timestamp``.
        halflife : float
            Age in days at which a row's weight is halved. Must be positive.
        l2 : float
            Accepted for interface compatibility; not used by the PLS fit.
        max_lookback : float
            Width of the trailing training window, in days.

        Returns
        -------
        pandas.DataFrame
            Rows of ``self.data`` dated on the requested dates (protagonist,
            antagonist and meta columns) with ``offense``, ``defense`` and
            ``meta`` columns added. Empty, with those columns, when every
            date was skipped.

        Raises
        ------
        ValueError
            If ``halflife`` is not positive.
        """
        if halflife <= 0:
            raise ValueError("halflife must be positive")
        decay = np.exp(-np.log(2) / halflife)

        # Pre-compute X for all data
        X_all = self.initialize_X(self.data)
        game_dates = self.data['date']
        lookback = pd.Timedelta(days=max_lookback)

        # Loop-invariant layout of the coefficient vector.
        protag_names = self.protag_encoder.categories_[0]
        antag_names = self.antag_encoder.categories_[0]
        num_protags = len(protag_names)
        num_antags = len(antag_names)
        num_meta = len(self.meta_cols)
        output_cols = [self.protag_col, self.antag_col] + self.meta_cols

        ratings = []
        for date in tqdm(dates):
            ts = pd.Timestamp(date)
            in_window = ((game_dates >= ts - lookback) & (game_dates < ts)).to_numpy()

            if int(in_window.sum()) < self.MIN_TRAIN_ROWS:
                print(f"Minimum data threshold not met for date {date}")
                continue

            X_train = X_all[in_window]
            y_train = self.data.loc[in_window, self.stat_col].to_numpy()

            # Exponential decay on age in days, normalized to sum to one.
            age_days = ((ts - game_dates[in_window]).dt.total_seconds() / (24 * 3600)).to_numpy()
            dw = decay ** age_days
            dw = dw / np.sum(dw)
            W = sparse.diags(dw)

            X_train_weighted = W @ X_train
            y_train_weighted = dw * y_train

            pls_model = self.create_pls_model()
            pls_model.fit(X_train_weighted.toarray(), y_train_weighted)

            # ravel() is layout-agnostic for a single target.
            coeffs = np.asarray(pls_model.coef_).ravel()
            offense_rating_dict = dict(zip(protag_names, coeffs[:num_protags]))
            defense_rating_dict = dict(zip(antag_names, coeffs[num_protags:num_protags + num_antags]))

            daily_ratings = self.data.loc[game_dates == ts, output_cols].copy()
            daily_ratings['offense'] = daily_ratings[self.protag_col].map(offense_rating_dict)
            daily_ratings['defense'] = daily_ratings[self.antag_col].map(defense_rating_dict)
            if num_meta > 0:
                # The model saw standardized meta columns, so its slopes are per
                # standard deviation. Dividing by the scaler's scale_ converts
                # them to per-raw-unit slopes before applying them to raw values.
                meta_ratings = coeffs[-num_meta:] / self.meta_scaler.scale_
                daily_ratings['meta'] = daily_ratings[self.meta_cols].to_numpy() @ meta_ratings
            else:
                daily_ratings['meta'] = 0

            ratings.append(daily_ratings)

        if not ratings:
            # pd.concat([]) raises; hand back an empty frame with the usual columns.
            return pd.DataFrame(columns=output_cols + ['offense', 'defense', 'meta'])
        return pd.concat(ratings)





In [86]:
DATA_PATH = '../data/testing/ncaam_sample_data.csv'


def load_data(data_path):
    """Read the CSV file at ``data_path`` and return it as a DataFrame."""
    frame = pd.read_csv(data_path)
    return frame


m_data = load_data(DATA_PATH)
# Constructing the optimizer converts m_data['date'] to datetime in place.
MO = PLSOptimizer(
    m_data,
    protag_col='team_name',
    antag_col='opp_name',
    stat_col='team_score',
    meta_cols=['is_home'],
)



In [88]:

# Rate the 15 latest dates present in the data (150-day half-life).
recent_dates = sorted(m_data['date'].unique())[-15:]
rtgs = MO.get_ratings_for_dates(recent_dates, 150, 1e-8, max_lookback=365*2.1)

# Keep each team's last row, then show the ten highest offense ratings.
(
    rtgs
    .drop_duplicates(subset=['team_name'], keep='last')
    .sort_values(by=['offense'], ascending=False)
    .head(10)
)


100%|██████████| 15/15 [00:12<00:00,  1.18it/s]


Unnamed: 0,team_name,opp_name,is_home,offense,defense,meta
206532,Gonzaga,Arkansas,0,54.411147,29.045115,0.0
206530,Arizona,Houston,0,52.332895,21.447124,0.0
206461,Iowa,Richmond,0,51.865361,31.200012,0.0
206484,Alabama,Notre Dame,0,49.198686,31.079479,0.0
206558,Kansas,North Carolina,0,49.038044,31.258055,0.0
206554,Duke,North Carolina,0,48.146159,31.36208,0.0
206559,North Carolina,Kansas,0,47.38711,27.681476,0.0
206463,Kentucky,St Peter's,0,47.352821,27.374677,0.0
206541,Purdue,St Peter's,0,46.486534,26.978009,0.0
206525,Auburn,Miami FL,0,46.379733,32.639573,0.0
