In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from datetime import datetime
from nba import NbaTracker
import importlib
import nba
importlib.reload(nba)

# data starts from 2000-01
seasons = [
    "2000-01",
    "2001-02",
    "2002-03",
    "2003-04",
    "2004-05",
    "2005-06",
    "2006-07",
    "2007-08",
    "2008-09",
    "2009-10",
    "2010-11",
    "2011-12",
    "2012-13",
    "2013-14",
    "2014-15",
    "2015-16",
    "2016-17",
    "2017-18",
    "2018-19",
    "2019-20",
    "2020-21"
]  # 19-20 was bubble season

nbaTracker = NbaTracker()
for season_name in seasons[:5]:
    nbaTracker.load_season(season_name)
    print("Loaded ", season_name, datetime.now().time())


Loaded  2000-01 16:49:43.762764
Loaded  2001-02 16:49:50.000511
Loaded  2002-03 16:49:55.720156
Loaded  2003-04 16:50:00.678208
Loaded  2004-05 16:50:05.762097


In [1]:
nbaTracker

NameError: name 'nbaTracker' is not defined

In [None]:
# build shots
# y = is_made
# X = [shot_chance, ...off_team_players, ...def_team_players]
# see if off_team_players or def_team_players add any value
    # they don't, no need to add
    
add_off = True
add_def = True
num_features = 2

num_players = 0
player_indicies = {}
for season_name in seasons[:1]:
    season = nbaTracker.seasons[season_name]
    for player_id in season.player_seasons:
        if player_id not in player_indicies:
            player_indicies[player_id] = num_players
            num_players += 1
if add_off:
    num_features += num_players
if add_def:
    num_features += num_players

shots = []
for season_name in seasons[:1]:
    season = nbaTracker.seasons[season_name]
    for game in season.games:
        for pos in game.possessions:
            for shot in pos.shots:
                shot_data = np.zeros(num_features+1)
                shot_data[0] = shot.is_made
                shot_data[1] = shot.shooter
                shot_data[2] = shot.shot_type
                if add_off:
                    for pid in shot.lineup.off_players:
                        index = 3 + player_indicies[pid]
                        shot_data[index] = 1
                if add_def:
                    for pid in shot.lineup.def_players:
                        index = 3 + player_indicies[pid]
                        if add_off:
                            index += num_players
                        shot_data[index] = 1
                shots.append(shot_data)
shots = np.array(shots, dtype=int)
train, test = train_test_split(shots, random_state=342119)


In [None]:
# build shot chances by shooter, shot_Type from train set
from sklearn.base import BaseEstimator, TransformerMixin


class ShotChanceFromShot(BaseEstimator, TransformerMixin):
    def __init__(self, num_players, keep_shooter=False, keep_shot_type=False, keep_off=False, keep_def=False) -> None:
        super().__init__()
        self.keep_shooter = keep_shooter
        self.keep_shot_type = keep_shot_type
        self.keep_off = keep_off
        self.keep_def = keep_def
        self.num_players = num_players
        self.shots = {}
        # { shooter: (attemps, makes)}

    def fit(self, X, y):
        for i, shot in enumerate(X):
            is_made = y[i]
            shooter = shot[0]
            shot_type = shot[1]
            if shooter not in self.shots:
                self.shots[shooter] = ([0]*5, [0]*5)
            self.shots[shooter][0][shot_type] += 1
            if is_made:
                self.shots[shooter][1][shot_type] += 1
        return self

    def transform(self, X, y=None):
        shot_chances = np.zeros((len(X), 1))
        for i, shot in enumerate(X):
            shooter = shot[0]
            shot_type = shot[1]
            shot_chances[i] = self.shot_chance(shooter, shot_type)
        transformed = shot_chances
        if self.keep_shooter:
            if self.keep_shot_type:
                transformed = np.c_[transformed, X[:, 0:2]]
            else:
                transformed = np.c_[transformed, X[:, 0]]
        else:
            if self.keep_shot_type:
                transformed = np.c_[transformed, X[:, 1]]
            else:
                pass
        if self.keep_off:
            if self.keep_def:
                transformed = np.c_[transformed,
                                    X[:, 2:(self.num_players*2+2)]]
            else:
                transformed = np.c_[transformed, X[:, 2:self.num_players+2]]
        else:
            if self.keep_def:
                transformed = np.c_[transformed,
                                    X[:, (2+self.num_players):(self.num_players*2+2)]]
            else:
                pass
        return transformed

    def shot_chance(self, shooter, shot_type):
        if shooter not in self.shots:
            return 0.5
        attemps = self.shots[shooter][0][shot_type]
        makes = self.shots[shooter][1][shot_type]
        if attemps:
            return makes / attemps
        return 0.5


In [None]:
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline
trf = ShotChanceFromShot(num_players, keep_def=False, keep_off=False)
reg = Ridge()
pipeline = Pipeline([('trf', trf), ('reg', reg)])

# reg.fit(X_train_trf, y_train)


In [None]:
X_train, y_train = train[:,1:], train[:,:1]

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline, X_train, y_train,
                         cv=10, scoring="neg_mean_squared_error")
print(np.sqrt(-scores).mean())


In [161]:
# find what momentum is best predictor of shot accuracy for each shot type
from collections import defaultdict

bias = 0
chance = 0.0
class RollingAveShotChance:
    def __init__(self, momentum) -> None:
        self.shots = 0
        self.chance = chance
        self.momentum = momentum
    
    def add(self, is_made):
        self.chance = self.momentum*self.chance + (1-self.momentum)*is_made
        self.shots += 1
    
    def get(self)-> float:
        if self.shots == 0:
            return 0.0
        return self.chance / (1 - self.momentum ** (self.shots + bias))

class SeasonAve:
    def __init__(self) -> None:
        self.shots = 0
        self.makes = 0
    def add(self, is_made):
        self.shots += 1
        self.makes += is_made

    def get(self) -> float:
        if self.shots == 0:
            return 0.0
        return self.makes / self.shots


# momentums = [0.990, 0.991, 0.992, 0.993, 0.994, 0.995, 0.996, 0.997, 0.998, 0.999]
momentums = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
def init() -> list[RollingAveShotChance]:
    return [RollingAveShotChance(momentum) for momentum in momentums]

momentum_data = [[] for _ in range(len(momentums))]
other_data = []
y = []


target_shot_type = 1 # short mid-range
# 0: best is 0.995
# 1: best is 

num_players = 0

player_shots: defaultdict[int, list[RollingAveShotChance]] = defaultdict(init)

for season_name in seasons[:5]:
    season = nbaTracker.seasons[season_name]
    
    for game in season.games:
        for pos in game.possessions:
            for shot in pos.shots:
                if shot.shot_type == target_shot_type:
                    roll_aves = player_shots[shot.shooter]
                    if season_name != "2000-01":
                        for i in range(len(momentums)):
                            momentum_data[i].append(roll_aves[i].get())
                        y.append(shot.is_made)
                        other_data.append([shot.shooter])
                        
                    for i in range(len(momentums)):
                        roll_aves[i].add(shot.is_made)


    
    


In [162]:
y = np.array(y)
X = np.c_[other_data, np.array(momentum_data).T]

In [155]:
y.shape

(238259,)

In [163]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=342119)


In [118]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

index = 0
sX = X_train[:, index:index+1]

reg = Ridge()
reg.fit(sX, y_train)
reg_mse = mean_squared_error(y_train, reg.predict(sX))
reg_rmse = np.sqrt(reg_mse)
print(reg_rmse)

0.4856861552965619


In [157]:
from sklearn.base import BaseEstimator, TransformerMixin

class SelectMomentum(BaseEstimator, TransformerMixin):
    def __init__(self, momentum_index=0) -> None:
        self.momentum_index = momentum_index 
        self.season_aves = defaultdict(SeasonAve)

    def fit(self, X, y):
        for i, shot in enumerate(X):
            is_made = y[i]
            shooter = shot[0]
            self.season_aves[shooter].add(is_made)
        return self
        
    def transform(self, X, y=None):
        shot_chances = np.zeros((len(X), 1))
        for i, shot in enumerate(X):
            shooter = shot[0]
            shot_chances[i] = self.season_aves[shooter].get()
        return np.c_[shot_chances, X[:, self.momentum_index+1]]

selmo = SelectMomentum()

In [164]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
reg = Ridge(alpha=0.0)

pipeline = Pipeline([("selmo", selmo), ("reg", reg)])
param_grid = [{"selmo__momentum_index": range(10)}]
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="neg_mean_squared_error", return_train_score=True)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_, grid_search.best_estimator_)

# scores = cross_val_score(reg, sX, y_train, scoring="neg_mean_squared_error", cv=10)
# rmse_scores = np.sqrt(-scores)
# print(rmse_scores.mean())

{'selmo__momentum_index': 0} Pipeline(steps=[('selmo', SelectMomentum()), ('reg', Ridge(alpha=0.0))])


In [165]:
grid_search.best_estimator_["reg"].coef_

array([ 1.00441121, -0.0048965 ])

In [166]:
grid_search.best_estimator_["reg"].intercept_


0.00018457830643814344

In [None]:
cvres = grid_search.cv_results_

In [114]:
reg.coef_

array([0.42396976])