# Setup

Import the necessary libraries

In [1]:
# for data
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from datetime import datetime, timedelta
from pyNBA.Data.data import QueryData
from pyNBA.Models.helpers import CleanData
import math

# for features
from pyNBA.Models.features import FeatureCreation
from pyNBA.Models.cluster import Cluster, Evaluate
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
from research import Helpers
from statsmodels.graphics.api import abline_plot

# for statistical tests
from scipy.stats import shapiro
import pingouin as pg

# for machine learning
from sklearn import model_selection, preprocessing, ensemble, neighbors, linear_model, svm, neural_network, metrics
import xgboost as xgb
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# for explainer
from lime import lime_tabular

# misc
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

  **kwargs


# Data Analysis
Summarize the characteristics of the dataset.

In [2]:
feature_creation = FeatureCreation()
query_data = QueryData()
clean_data = CleanData()
helpers = Helpers()

# pull boxscore data
boxscores = query_data.query_boxscore_data()
boxscores = clean_data.select_regular_season_games(boxscores)
boxscores = clean_data.drop_rows_player_injured(boxscores)
boxscores = clean_data.drop_rows_player_rest(boxscores)

/Users/brandonshimiaie/Projects/pyNBA/sqlite/db/nba.db
2.6.0


In [3]:
team_boxscores = boxscores.groupby(['SEASON', 'DATE', 'TEAM', 'OPP_TEAM']).apply(
    lambda x: pd.Series({
        'TEAM_POSSESSIONS': x['POSS'].sum()/5,
        'TEAM_OREB': x['OREB'].sum(),
        'TEAM_DREB': x['DREB'].sum()
    })
).reset_index()

opp_team_boxscores = team_boxscores.drop(columns='OPP_TEAM')
opp_team_boxscores = opp_team_boxscores.rename(columns={
    'TEAM': 'OPP_TEAM', 'TEAM_POSSESSIONS': 'OPP_TEAM_POSSESSIONS', 'TEAM_OREB': 'OPP_TEAM_OREB', 'TEAM_DREB': 'OPP_TEAM_DREB'
    })
team_boxscores = team_boxscores.merge(opp_team_boxscores, on=['SEASON', 'DATE', 'OPP_TEAM'], how='left')

team_boxscores['TEAM_OREB_CHANCES'] = team_boxscores['TEAM_OREB'] + team_boxscores['OPP_TEAM_DREB']
team_boxscores['TEAM_OREB_CHANCES/POSSESSION'] = team_boxscores['TEAM_OREB_CHANCES']/team_boxscores['TEAM_POSSESSIONS']

team_boxscores['OPP_TEAM_OREB_CHANCES'] = team_boxscores['TEAM_DREB'] + team_boxscores['OPP_TEAM_OREB']
team_boxscores['OPP_TEAM_OREB_CHANCES/POSSESSION'] = \
    team_boxscores['OPP_TEAM_OREB_CHANCES']/team_boxscores['OPP_TEAM_POSSESSIONS']

In [4]:
# average team oreb chances/possession
team_boxscores = feature_creation.expanding_weighted_mean(
    df=team_boxscores, group_col_names=['SEASON', 'TEAM'], col_name='TEAM_OREB_CHANCES/POSSESSION',
    new_col_name='AVG_TEAM_OREB_CHANCES/POSSESSION', weight_col_name='TEAM_POSSESSIONS'
)

In [5]:
# average oreb chances/possession that opp team allowed
team_boxscores = feature_creation.expanding_weighted_mean(
    df=team_boxscores, group_col_names=['SEASON', 'OPP_TEAM'], col_name='TEAM_OREB_CHANCES/POSSESSION',
    new_col_name='AVG_OREB_CHANCES/POSSESSION_OPP_TEAM_ALLOWED', weight_col_name='TEAM_POSSESSIONS'
)

In [6]:
# average oreb chances/possession allowed that team played against
season_stats = team_boxscores.groupby(['SEASON', 'TEAM']).apply(
    lambda x: pd.Series({
        'TEAM_OREB_CHANCES(SEASON)': x['TEAM_OREB_CHANCES'].mean(),
        'TEAM_POSSESSIONS(SEASON)': x['TEAM_POSSESSIONS'].mean(),
        'TEAM_OREB_CHANCES_ALLOWED(SEASON)': x['OPP_TEAM_OREB_CHANCES'].mean(),
        'TEAM_POSSESSIONS_ALLOWED(SEASON)': x['OPP_TEAM_POSSESSIONS'].mean(),
    })
).reset_index()

season_stats['TEAM_OREB_CHANCES/POSSESSION(SEASON)'] = \
    season_stats['TEAM_OREB_CHANCES(SEASON)']/season_stats['TEAM_POSSESSIONS(SEASON)']
season_stats['TEAM_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)'] = \
    season_stats['TEAM_OREB_CHANCES_ALLOWED(SEASON)']/season_stats['TEAM_POSSESSIONS_ALLOWED(SEASON)']

opp_season_stats = season_stats.rename(columns={
    'TEAM': 'OPP_TEAM', 'TEAM_OREB_CHANCES(SEASON)': 'OPP_TEAM_OREB_CHANCES(SEASON)',
    'TEAM_POSSESSIONS(SEASON)': 'OPP_TEAM_POSSESSIONS(SEASON)',
    'TEAM_OREB_CHANCES_ALLOWED(SEASON)': 'OPP_TEAM_OREB_CHANCES_ALLOWED(SEASON)',
    'TEAM_POSSESSIONS_ALLOWED(SEASON)': 'OPP_TEAM_POSSESSIONS_ALLOWED(SEASON)',
    'TEAM_OREB_CHANCES/POSSESSION(SEASON)': 'OPP_TEAM_OREB_CHANCES/POSSESSION(SEASON)',
    'TEAM_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)': 'OPP_TEAM_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)'
    })


team_boxscores = team_boxscores.merge(season_stats, on=['SEASON', 'TEAM'], how='left')
team_boxscores = team_boxscores.merge(opp_season_stats, on=['SEASON', 'OPP_TEAM'], how='left')

team_boxscores = feature_creation.expanding_weighted_mean(
    df=team_boxscores, group_col_names=['SEASON', 'TEAM'], col_name='OPP_TEAM_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)',
    new_col_name='AVG_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)_TEAM_P.A.', weight_col_name='TEAM_POSSESSIONS'
)

In [7]:
# average oreb chances/possession that opp team played against
team_boxscores = feature_creation.expanding_weighted_mean(
    df=team_boxscores, group_col_names=['SEASON', 'OPP_TEAM'], col_name='TEAM_OREB_CHANCES/POSSESSION(SEASON)',
    new_col_name='AVG_OREB_CHANCES/POSSESSION(SEASON)_OPP_TEAM_P.A.', weight_col_name='OPP_TEAM_POSSESSIONS'
)

In [8]:
# team oreb chances/possession
team_boxscores['TEAM_OREB_CHANCES/POSSESSION_HAT'] = \
    2*team_boxscores['AVG_TEAM_OREB_CHANCES/POSSESSION'] - \
        team_boxscores['AVG_OREB_CHANCES/POSSESSION_ALLOWED(SEASON)_TEAM_P.A.']

# opp team oreb chances/possession allowed
team_boxscores['OPP_TEAM_OREB_CHANCES/POSSESSION_ALLOWED_HAT'] = \
    2*team_boxscores['AVG_OREB_CHANCES/POSSESSION_OPP_TEAM_ALLOWED'] - \
        team_boxscores['AVG_OREB_CHANCES/POSSESSION(SEASON)_OPP_TEAM_P.A.']

In [9]:
# average player oreb/chance
boxscores = boxscores.merge(team_boxscores, on=['SEASON', 'DATE', 'TEAM', 'OPP_TEAM'], how='left')

boxscores['OREB_CHANCES'] = np.nan
boxscores.loc[boxscores['OREB'] > 0, 'OREB_CHANCES'] = (
    boxscores.loc[boxscores['OREB'] > 0, 'OREB'] / boxscores.loc[boxscores['OREB'] > 0, 'OREB_PCT']
    ).apply(lambda x: round(x))
boxscores.loc[boxscores['OREB'] == 0, 'OREB_CHANCES'] = \
    boxscores.loc[boxscores['OREB'] == 0, 'TEAM_OREB_CHANCES/POSSESSION']*boxscores.loc[boxscores['OREB'] == 0, 'POSS']
boxscores['OREB/OREB_CHANCE'] = boxscores['OREB']/boxscores['OREB_CHANCES']

boxscores = feature_creation.expanding_weighted_mean(
    df=boxscores, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='OREB/OREB_CHANCE',
    new_col_name='AVG_OREB/OREB_CHANCE', weight_col_name='OREB_CHANCES'
)

In [10]:
# average player dreb/chance
boxscores['TEAM_DREB_CHANCES/POSSESSION'] = boxscores['OPP_TEAM_OREB_CHANCES/POSSESSION']

boxscores['DREB_CHANCES'] = np.nan
boxscores.loc[boxscores['DREB'] > 0, 'DREB_CHANCES'] = (
    boxscores.loc[boxscores['DREB'] > 0, 'DREB'] / boxscores.loc[boxscores['DREB'] > 0, 'DREB_PCT']
    ).apply(lambda x: round(x))
boxscores.loc[boxscores['DREB'] == 0, 'DREB_CHANCES'] = \
    boxscores.loc[boxscores['OREB'] == 0, 'TEAM_DREB_CHANCES/POSSESSION']*boxscores.loc[boxscores['DREB'] == 0, 'POSS']
boxscores['DREB/DREB_CHANCE'] = boxscores['DREB']/boxscores['DREB_CHANCES']

boxscores = feature_creation.expanding_weighted_mean(
    df=boxscores, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DREB/DREB_CHANCE',
    new_col_name='AVG_DREB/DREB_CHANCE', weight_col_name='DREB_CHANCES'
)

In [11]:
# average oreb/oreb chance that opp team allowed
boxscores['NORMALIZED_POSITION'] = boxscores['POSITION'].apply(lambda x: x if '-' not in x else x.split('-')[0])

team_position_game_boxscores = boxscores.groupby(['SEASON', 'DATE', 'NORMALIZED_POSITION', 'TEAM', 'OPP_TEAM']).apply(
    lambda x: pd.Series({
        'TEAM_POSITION_OREB': x['OREB'].sum(),
        'TEAM_POSITION_OREB_CHANCES': x['OREB_CHANCES'].sum(),
        'TEAM_POSITION_DREB': x['DREB'].sum(),
        'TEAM_POSITION_DREB_CHANCES': x['DREB_CHANCES'].sum()
    })
).reset_index()

opp_team_position_game_boxscores = team_position_game_boxscores.drop(columns='OPP_TEAM')
opp_team_position_game_boxscores = opp_team_position_game_boxscores.rename(columns={
    'TEAM': 'OPP_TEAM', 'TEAM_POSITION_OREB': 'OPP_TEAM_POSITION_OREB',
    'TEAM_POSITION_OREB_CHANCES': 'OPP_TEAM_POSITION_OREB_CHANCES', 'TEAM_POSITION_DREB': 'OPP_TEAM_POSITION_DREB',
    'TEAM_POSITION_DREB_CHANCES': 'OPP_TEAM_POSITION_DREB_CHANCES'
    })
team_position_game_boxscores = team_position_game_boxscores.merge(
    opp_team_position_game_boxscores, on=['SEASON', 'DATE', 'NORMALIZED_POSITION', 'OPP_TEAM'], how='left'
    )

team_position_game_boxscores['TEAM_POSITION_OREB/OREB_CHANCE'] = \
    team_position_game_boxscores['TEAM_POSITION_OREB']/team_position_game_boxscores['TEAM_POSITION_OREB_CHANCES']

team_position_game_boxscores = feature_creation.expanding_weighted_mean(
    df=team_position_game_boxscores, group_col_names=['SEASON', 'NORMALIZED_POSITION', 'OPP_TEAM'],
    col_name='TEAM_POSITION_OREB/OREB_CHANCE', new_col_name='AVG_TEAM_POSITION_OREB/OREB_CHANCE_OPP_ALLOWED',
    weight_col_name='TEAM_POSITION_OREB_CHANCES'
)

In [12]:
# average dreb/dreb chance that opp team allowed
team_position_game_boxscores['TEAM_POSITION_DREB/DREB_CHANCE'] = \
    team_position_game_boxscores['TEAM_POSITION_DREB']/team_position_game_boxscores['TEAM_POSITION_DREB_CHANCES']

team_position_game_boxscores = feature_creation.expanding_weighted_mean(
    df=team_position_game_boxscores, group_col_names=['SEASON', 'NORMALIZED_POSITION', 'OPP_TEAM'],
    col_name='TEAM_POSITION_DREB/DREB_CHANCE', new_col_name='AVG_TEAM_POSITION_DREB/DREB_CHANCE_OPP_ALLOWED',
    weight_col_name='TEAM_POSITION_DREB_CHANCES'
)

In [13]:
boxscores = boxscores.merge(
    team_position_game_boxscores, on=['SEASON', 'DATE', 'NORMALIZED_POSITION', 'TEAM', 'OPP_TEAM'], how='left'
)

In [14]:
# average oreb/oreb chance allowed that player played against
team_position_season_boxscores = team_position_game_boxscores.groupby(
    ['SEASON', 'NORMALIZED_POSITION', 'TEAM']
    ).apply(
    lambda x: pd.Series({
        'TEAM_POSITION_OREB(SEASON)': x['TEAM_POSITION_OREB'].mean(),
        'TEAM_POSITION_OREB_CHANCES(SEASON)': x['TEAM_POSITION_OREB_CHANCES'].mean(),
        'TEAM_POSITION_DREB(SEASON)': x['TEAM_POSITION_DREB'].mean(),
        'TEAM_POSITION_DREB_CHANCES(SEASON)': x['TEAM_POSITION_DREB_CHANCES'].mean(),
        'TEAM_POSITION_OREB_ALLOWED(SEASON)': x['OPP_TEAM_POSITION_OREB'].mean(),
        'TEAM_POSITION_OREB_CHANCES_ALLOWED(SEASON)': x['OPP_TEAM_POSITION_OREB_CHANCES'].mean(),
        'TEAM_POSITION_DREB_ALLOWED(SEASON)': x['OPP_TEAM_POSITION_DREB'].mean(),
        'TEAM_POSITION_DREB_CHANCES_ALLOWED(SEASON)': x['OPP_TEAM_POSITION_DREB_CHANCES'].mean(),
    })
).reset_index()

opp_team_position_season_boxscores = team_position_season_boxscores.rename(columns={
    'TEAM': 'OPP_TEAM',
    'TEAM_POSITION_OREB(SEASON)': 'OPP_TEAM_POSITION_OREB(SEASON)',
    'TEAM_POSITION_OREB_CHANCES(SEASON)': 'OPP_TEAM_POSITION_OREB_CHANCES(SEASON)',
    'TEAM_POSITION_DREB(SEASON)': 'OPP_TEAM_POSITION_DREB(SEASON)',
    'TEAM_POSITION_DREB_CHANCES(SEASON)': 'OPP_TEAM_POSITION_DREB_CHANCES(SEASON)',
    'TEAM_POSITION_OREB_ALLOWED(SEASON)': 'OPP_TEAM_POSITION_OREB_ALLOWED(SEASON)',
    'TEAM_POSITION_OREB_CHANCES_ALLOWED(SEASON)': 'OPP_TEAM_POSITION_OREB_CHANCES_ALLOWED(SEASON)',
    'TEAM_POSITION_DREB_ALLOWED(SEASON)': 'OPP_TEAM_POSITION_DREB_ALLOWED(SEASON)',
    'TEAM_POSITION_DREB_CHANCES_ALLOWED(SEASON)': 'OPP_TEAM_POSITION_DREB_CHANCES_ALLOWED(SEASON)',
    })

boxscores = boxscores.merge(
    team_position_season_boxscores, on=['SEASON', 'NORMALIZED_POSITION', 'TEAM'], how='left'
)
boxscores = boxscores.merge(
    opp_team_position_season_boxscores, on=['SEASON', 'NORMALIZED_POSITION', 'OPP_TEAM'], how='left'
)

boxscores['OPP_TEAM_POSITION_OREB/OREB_CHANCE_ALLOWED(SEASON)'] = \
    boxscores['OPP_TEAM_POSITION_OREB_ALLOWED(SEASON)'] / boxscores['OPP_TEAM_POSITION_OREB_CHANCES_ALLOWED(SEASON)']

boxscores = feature_creation.expanding_weighted_mean(
    df=boxscores, group_col_names=['SEASON', 'NORMALIZED_POSITION', 'TEAM', 'PLAYERID'],
    col_name='OPP_TEAM_POSITION_OREB/OREB_CHANCE_ALLOWED(SEASON)',
    new_col_name='AVG_TEAM_POSITION_OREB/OREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A',
    weight_col_name='OREB_CHANCES'
)

In [15]:
# average dreb/dreb chance allowed that player played against
boxscores['OPP_TEAM_POSITION_DREB/DREB_CHANCE_ALLOWED(SEASON)'] = \
    boxscores['OPP_TEAM_POSITION_DREB_ALLOWED(SEASON)'] / boxscores['OPP_TEAM_POSITION_DREB_CHANCES_ALLOWED(SEASON)']

boxscores = feature_creation.expanding_weighted_mean(
    df=boxscores, group_col_names=['SEASON', 'NORMALIZED_POSITION', 'TEAM', 'PLAYERID'],
    col_name='OPP_TEAM_POSITION_DREB/DREB_CHANCE_ALLOWED(SEASON)',
    new_col_name='AVG_TEAM_POSITION_DREB/DREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A',
    weight_col_name='DREB_CHANCES'
)

In [16]:
# oreb/oreb chance defense
boxscores['OREB/OREB(CH)_DEF'] = \
    boxscores['AVG_TEAM_POSITION_OREB/OREB_CHANCE_OPP_ALLOWED'] / \
        boxscores['AVG_TEAM_POSITION_OREB/OREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A']

# dreb/dreb chance defense
boxscores['DREB/DREB(CH)_DEF'] = \
    boxscores['AVG_TEAM_POSITION_DREB/DREB_CHANCE_OPP_ALLOWED'] / \
        boxscores['AVG_TEAM_POSITION_DREB/DREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A']

In [27]:
boxscores = feature_creation.expanding_mean(
    df=boxscores, group_col_names=['SEASON', 'TEAM', 'PLAYERID'],
    col_name='POSS',
    new_col_name='AVG_POSS'
)

display(boxscores.loc[(boxscores['NAME'] == 'Cody Martin') & (boxscores['START'] == 1),
                      ['POSITION', 'SECONDSPLAYED', 'START', 'TEAM', 'DATE', 'NAME',
                       'AVG_DREB/DREB_CHANCE', 'AVG_OREB/OREB_CHANCE', 'AVG_POSS', 'AVG_TEAM_POSITION_DREB/DREB_CHANCE_OPP_ALLOWED', 'AVG_TEAM_POSITION_DREB/DREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A']
                     ].tail(20))

Unnamed: 0,POSITION,SECONDSPLAYED,START,TEAM,DATE,NAME,AVG_DREB/DREB_CHANCE,AVG_OREB/OREB_CHANCE,AVG_POSS,AVG_TEAM_POSITION_DREB/DREB_CHANCE_OPP_ALLOWED,AVG_TEAM_POSITION_DREB/DREB_CHANCE(SEASON)_ALLOWED_PLAYER_P.A
140801,Forward,2174,1,CHA,2020-02-04,Cody Martin,0.150616,0.046019,34.272727,0.165859,0.159033
144068,Forward,2054,1,CHA,2020-03-03,Cody Martin,0.154452,0.042238,37.825,0.17485,0.160011
145199,Forward,1944,1,CHA,2020-03-11,Cody Martin,0.1518,0.040369,40.068182,0.158795,0.159855
