In [15]:


import os
import torch
import numpy as np
import pandas as pd

from tqdm import tqdm
from scipy import sparse
from datetime import datetime
from scipy.linalg import solve
from scipy.optimize import minimize
from scipy.sparse.linalg import spsolve
from sklearn.impute import SimpleImputer
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict



In [16]:
DATA_PATH = '../data/testing/ncaam_sample_data.csv'
def load_data(data_path):
    return pd.read_csv(data_path)

m_data = load_data(DATA_PATH)

In [21]:

class NuclearNormOptimizer:
    def __init__(self, protag_col='team', antag_col='opponent', stat_col='team_sq_score', meta_cols=['location']):
        self.protag_col = protag_col
        self.antag_col = antag_col
        self.stat_col = stat_col
        self.meta_cols = meta_cols

    def load_data(self, data=None, path=None):
        if path:
            self.data = pd.read_csv(path)
        elif data is not None:
            self.data = data.copy()
        else:
            raise ValueError("Either data or path must be provided")
        self._preprocess_data()

    def _preprocess_data(self):
        required_columns = [self.protag_col, self.antag_col, self.stat_col, 'date']
        if not all(col in self.data.columns for col in required_columns):
            raise ValueError(f"Data must contain columns: {required_columns}")

        self.data['date'] = pd.to_datetime(self.data['date'])
        self.data = self.data.sort_values('date').reset_index(drop=True)

        self.teams = sorted(set(self.data[self.protag_col].unique()) | set(self.data[self.antag_col].unique()))
        self.num_teams = len(self.teams)

        # One-hot encode teams
        self.encoder = OneHotEncoder(sparse_output=False)
        self.encoder.fit(np.array(self.teams).reshape(-1, 1))

    def _create_matrix(self, data):
        X = np.zeros((len(data), 2 * self.num_teams + len(self.meta_cols)))
        
        protag_encoded = self.encoder.transform(data[self.protag_col].values.reshape(-1, 1))
        antag_encoded = self.encoder.transform(data[self.antag_col].values.reshape(-1, 1))
        
        X[:, :self.num_teams] = protag_encoded
        X[:, self.num_teams:2*self.num_teams] = antag_encoded
        
        for i, col in enumerate(self.meta_cols):
            X[:, -(i+1)] = data[col].values
        
        y = data[self.stat_col].values
        
        return X, y

    def _soft_threshold(self, x, lambda_):
        return np.sign(x) * np.maximum(np.abs(x) - lambda_, 0)

    def _svd_threshold(self, X, lambda_):
        U, s, Vt = np.linalg.svd(X, full_matrices=False)
        s = self._soft_threshold(s, lambda_)
        return U @ np.diag(s) @ Vt

    def _nuclear_norm_minimization(self, X, y, lambda_, max_iter=100, tol=1e-4):
        m, n = X.shape
        mask = ~np.isnan(y)
        y_observed = y[mask]
        X_observed = X[mask]
        
        Z = np.zeros((m, n+1))
        Z[:, :n] = X
        Z[mask, -1] = y_observed
        
        for _ in range(max_iter):
            Z_old = Z.copy()
            
            # Soft-impute step
            Z = self._svd_threshold(Z, lambda_)
            
            # Project onto the observed entries
            Z[mask, :n] = X_observed
            Z[mask, -1] = y_observed
            
            # Check convergence
            if np.linalg.norm(Z - Z_old) < tol:
                break
        
        return Z[:, -1]

    def optimize(self, lambda_=0.1, max_iter=100, tol=1e-4):
        X, y = self._create_matrix(self.data)
        self.predictions = self._nuclear_norm_minimization(X, y, lambda_, max_iter, tol)
        self.mse = np.mean((y - self.predictions)**2)
        return self.mse

    def get_ratings_for_dates(self, dates, lambda_=0.1, max_iter=100, tol=1e-4, max_lookback=365*2.1):
        ratings = []
        
        for date in tqdm(dates):
            train_data = self.data[(self.data['date'] >= date - pd.Timedelta(days=max_lookback)) & (self.data['date'] < date)].copy()
            
            if len(train_data) < 50:
                print(f"Minimum data threshold not met for date {date}")
                continue

            X, y = self._create_matrix(train_data)
            predictions = self._nuclear_norm_minimization(X, y, lambda_, max_iter, tol)
            
            # Calculate ratings as average predicted performance
            team_ratings = {}
            for i, team in enumerate(self.teams):
                team_games = (X[:, i] == 1) | (X[:, i + self.num_teams] == 1)
                if np.sum(team_games) > 0:
                    team_ratings[team] = np.mean(predictions[team_games])
                else:
                    team_ratings[team] = np.nan
            
            ratings.append(pd.DataFrame({
                'team': list(team_ratings.keys()),
                'rating': list(team_ratings.values()),
                'date': date
            }))

        return pd.concat(ratings)

    def predict_game(self, protag, antag, date, meta_values=None):
        if meta_values is None:
            meta_values = [0] * len(self.meta_cols)
        
        X = np.zeros((1, 2 * self.num_teams + len(self.meta_cols)))
        X[0, :self.num_teams] = self.encoder.transform([[protag]])
        X[0, self.num_teams:2*self.num_teams] = self.encoder.transform([[antag]])
        X[0, -len(self.meta_cols):] = meta_values
        
        y = np.array([np.nan])
        
        prediction = self._nuclear_norm_minimization(X, y, self.lambda_)
        return prediction[0]

In [22]:
DATA_PATH = '../data/testing/ncaam_sample_data.csv'
def load_data(data_path):
    return pd.read_csv(data_path)

m_data = load_data(DATA_PATH)

In [23]:
m_data.head()

Unnamed: 0,season,team_score,opp_score,is_home,numot,team_fgm,team_fga,team_fgm3,team_fga3,team_ftm,...,opp_or,opp_dr,opp_ast,opp_to,opp_stl,opp_blk,opp_pf,team_name,opp_name,date
0,2003,68,62,0,0,27,58,3,14,11,...,10,22,8,18,9,2,20,Alabama,Oklahoma,2002-11-14
1,2003,70,63,0,0,26,62,8,20,10,...,20,25,7,12,8,6,16,Memphis,Syracuse,2002-11-14
2,2003,62,68,0,0,22,53,2,10,16,...,14,24,13,23,7,1,22,Oklahoma,Alabama,2002-11-14
3,2003,63,70,0,0,24,67,6,24,9,...,15,28,16,13,4,4,18,Syracuse,Memphis,2002-11-14
4,2003,55,81,-1,0,20,46,3,11,12,...,12,24,12,9,9,3,18,E Washington,Wisconsin,2002-11-15


In [25]:
m_data['score_diff'] = m_data['team_score']-m_data['opp_score']
nnmo = NuclearNormOptimizer(protag_col='team_name', antag_col='opp_name', stat_col='score_diff', meta_cols=['is_home'])
nnmo.load_data(m_data)

# Optimize the model
mse = nnmo.optimize(lambda_=0.1)

dates = sorted(m_data['date'].unique())[-15:]

# Get ratings for specific dates
ratings = nnmo.get_ratings_for_dates(dates, lambda_=0.1)

# Predict a specific game
prediction = nnmo.predict_game('Team A', 'Team B', date, meta_values=[1, 0])

NameError: name 'dates' is not defined