# Imports

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import datetime as dt

from sklearn.preprocessing import MinMaxScaler

In [4]:
pd.set_option('display.max_columns', 500)

# Data Load

In [40]:
df_raw_players = pd.read_pickle('backend/data/pkl/boxscores_advanced_player_part1.pkl').copy().sort_values('GAME_ID')

In [31]:
df_2 = pd.read_pickle('backend/data/pkl/boxscores_advanced_player_part2.pkl').copy().sort_values('GAME_ID')

In [7]:
df = pd.concat([df_1, df_2], axis=0)
df = df.drop(['START_POSITION', 'COMMENT'], axis=1)
df = df.dropna()
df['MIN'] = (pd.to_datetime(df['MIN'], format="%M:%S", errors='coerce', exact=True).dt.second/60)+pd.to_datetime(df['MIN'], format="%M:%S", errors='coerce', exact=True).dt.minute

In [19]:
df_raw = pd.read_pickle('raw_data/raw_games_5yrs.pkl').copy()[['TEAM_ID', 'HOME_TEAM', 'GAME_ID', 'GAME_DATE', 'PLUS_MINUS', 'PTS']]

In [84]:
df_eda = pd.read_pickle('backend/data/pkl/boxscore_advanced_rolling_players.pkl')

In [16]:
scaled_df_eda = pd.read_pickle('backend/data/pkl/scaled_boxscore_advanced_rolling_players.pkl')

In [41]:
df_raw_players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63919 entries, 28 to 30
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GAME_ID            63919 non-null  object 
 1   TEAM_ID            63919 non-null  int64  
 2   TEAM_ABBREVIATION  63919 non-null  object 
 3   TEAM_CITY          63919 non-null  object 
 4   PLAYER_ID          63919 non-null  int64  
 5   PLAYER_NAME        63919 non-null  object 
 6   NICKNAME           63919 non-null  object 
 7   START_POSITION     63919 non-null  object 
 8   COMMENT            63919 non-null  object 
 9   MIN                52324 non-null  object 
 10  E_OFF_RATING       52324 non-null  float64
 11  OFF_RATING         63919 non-null  float64
 12  E_DEF_RATING       52324 non-null  float64
 13  DEF_RATING         63919 non-null  float64
 14  E_NET_RATING       52324 non-null  float64
 15  NET_RATING         63919 non-null  float64
 16  AST_PCT            52324

In [42]:
df_raw_players

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,E_NET_RATING,NET_RATING,AST_PCT,AST_TOV,AST_RATIO,OREB_PCT,DREB_PCT,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
28,0012200001,1610612764,WAS,Washington,1630264,Anthony Gill,Anthony,,,14:30,89.0,87.9,89.3,87.9,-0.3,0.0,0.143,1.00,14.3,0.111,0.118,0.114,14.3,0.250,0.307,0.171,0.173,107.72,109.24,91.03,33,0.000
18,0012200001,1610612744,GSW,Golden State,1630311,Pat Spencer,Pat,,DNP - Coach's Decision,,,0.0,,0.0,,0.0,,,,0.000,0.000,0.000,,,,0.000,0.000,,,,0,
19,0012200001,1610612744,GSW,Golden State,202691,Klay Thompson,Klay,,DNP - Coach's Decision,,,0.0,,0.0,,0.0,,,,0.000,0.000,0.000,,,,0.000,0.000,,,,0,
20,0012200001,1610612764,WAS,Washington,1628398,Kyle Kuzma,Kyle,F,,20:04,75.4,73.9,101.2,97.8,-25.7,-23.9,0.091,0.33,7.7,0.000,0.167,0.083,23.1,0.389,0.389,0.255,0.260,107.11,110.03,91.69,46,0.009
21,0012200001,1610612764,WAS,Washington,1629060,Rui Hachimura,Rui,F,,25:10,80.6,78.9,85.0,84.2,-4.4,-5.3,0.000,0.00,0.0,0.000,0.281,0.141,7.7,0.556,0.538,0.217,0.221,107.11,108.72,90.60,57,0.190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,1622200006,1610612763,MEM,Memphis,1630533,Ziaire Williams,Ziaire,G,,27:12,93.6,100.0,107.4,107.0,-13.8,-7.0,0.125,1.00,9.5,0.000,0.108,0.061,9.5,0.500,0.552,0.271,0.272,104.79,101.47,84.56,58,0.160
2,1622200006,1610612763,MEM,Memphis,1630214,Xavier Tillman,Xavier,C,,28:56,117.4,119.7,91.0,92.4,26.4,27.3,0.154,4.00,21.1,0.171,0.194,0.183,5.3,0.357,0.357,0.183,0.187,111.42,109.49,91.24,66,0.126
1,1622200006,1610612763,MEM,Memphis,1630583,Santi Aldama,Santi,F,,26:13,89.0,94.6,107.4,107.3,-18.4,-12.6,0.067,0.17,5.9,0.069,0.167,0.123,35.3,0.650,0.670,0.235,0.237,104.76,101.61,84.68,56,0.066
14,1622200006,1610612763,MEM,Memphis,1631246,Vince Williams Jr.,Vince,,DNP - Coach's Decision,,,0.0,,0.0,,0.0,,,,0.000,0.000,0.000,,,,0.000,0.000,,,,0,


# Data Preprocess

In [17]:
df['GAME_ID'] = df['GAME_ID'].astype('int32')

In [20]:
df = pd.merge(df, df_raw, on=['TEAM_ID','GAME_ID'], how='inner')

In [21]:
df_sorted = df.sort_values('GAME_DATE')

In [18]:
target_x = ['PLAYER_ID', 'GAME_DATE','MIN', 'OFF_RATING', 'DEF_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'USG_PCT', 'PACE', 'PIE', 'POSS', 'PTS', 'PLUS_MINUS']

In [19]:
df_rolling = df_sorted[target_x].groupby('PLAYER_ID', group_keys=False).rolling(10, on='GAME_DATE').mean()

In [24]:
df_rolling.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 132226 entries, (1713, 57076) to (1631455, 1032)
Data columns (total 20 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   GAME_DATE   132226 non-null  datetime64[ns]
 1   AST_PCT     121617 non-null  float64       
 2   AST_RATIO   121617 non-null  float64       
 3   AST_TOV     121617 non-null  float64       
 4   DEF_RATING  121617 non-null  float64       
 5   DREB_PCT    121617 non-null  float64       
 6   EFG_PCT     121617 non-null  float64       
 7   MIN         100084 non-null  float64       
 8   NET_RATING  121617 non-null  float64       
 9   OFF_RATING  121617 non-null  float64       
 10  OREB_PCT    121617 non-null  float64       
 11  PACE        121617 non-null  float64       
 12  PIE         121617 non-null  float64       
 13  PLUS_MINUS  121617 non-null  float64       
 14  POSS        121617 non-null  float64       
 15  PTS         121617 non-null  f

In [26]:
scaler = MinMaxScaler()
df_rolling_preproc = scaler.fit_transform(df_rolling.loc[:, df_rolling.columns != 'GAME_DATE'])

In [29]:
df_rolling.memory_usage()

Index         6090862
GAME_DATE     1057808
AST_PCT       1057808
AST_RATIO     1057808
AST_TOV       1057808
DEF_RATING    1057808
DREB_PCT      1057808
EFG_PCT       1057808
MIN           1057808
NET_RATING    1057808
OFF_RATING    1057808
OREB_PCT      1057808
PACE          1057808
PIE           1057808
PLUS_MINUS    1057808
POSS          1057808
PTS           1057808
REB_PCT       1057808
TM_TOV_PCT    1057808
TS_PCT        1057808
USG_PCT       1057808
dtype: int64

In [32]:
df_rolling.to_pickle('/Users/lj/code/5pacepenguin/nba_betting_analysis/nba_betting_analysis/backend/data/pkl/boxscore_advanced_rolling_players.pkl')

In [57]:
df_eda = df_rolling.dropna().reset_index()

In [53]:
df_eda.dtypes

PLAYER_ID               int64
level_1                 int64
GAME_DATE      datetime64[ns]
AST_PCT               float64
AST_RATIO             float64
AST_TOV               float64
DEF_RATING            float64
DREB_PCT              float64
EFG_PCT               float64
MIN                   float64
NET_RATING            float64
OFF_RATING            float64
OREB_PCT              float64
PACE                  float64
PIE                   float64
PLUS_MINUS            float64
POSS                  float64
PTS                   float64
REB_PCT               float64
TM_TOV_PCT            float64
TS_PCT                float64
USG_PCT               float64
PLAYER_NAME            object
dtype: object

In [62]:
# cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)
# s_cat = s.astype(cat_type)

cat_type = pd.api.types.CategoricalDtype(categories=df_eda.PLAYER_ID.unique(), ordered=False)
df_eda.PLAYER_ID = df_eda.PLAYER_ID.astype(cat_type)
df_eda.dtypes, df_eda.PLAYER_ID

(PLAYER_ID              category
 GAME_DATE        datetime64[ns]
 AST_PCT                 float64
 AST_RATIO               float64
 AST_TOV                 float64
 DEF_RATING              float64
 DREB_PCT                float64
 EFG_PCT                 float64
 MIN                     float64
 NET_RATING              float64
 OFF_RATING              float64
 OREB_PCT                float64
 PACE                    float64
 PIE                     float64
 PLUS_MINUS              float64
 POSS                    float64
 PTS                     float64
 REB_PCT                 float64
 TM_TOV_PCT              float64
 TS_PCT                  float64
 USG_PCT                 float64
 PLAYER_NAME_x            object
 PLAYER_NAME_y            object
 PLAYER_NAME            category
 dtype: object,
 0            1713
 1            1713
 2            1713
 3            1713
 4            1713
            ...   
 100079    1631323
 100080    1631323
 100081    1631323
 100082    1631323
 1

In [61]:
cat_type = pd.api.types.CategoricalDtype(categories=df_eda.PLAYER_NAME.unique(), ordered=False)
df_eda.PLAYER_NAME = df_eda.PLAYER_NAME.astype(cat_type)
df_eda.dtypes, df_eda.PLAYER_NAME

(PLAYER_ID                 int64
 GAME_DATE        datetime64[ns]
 AST_PCT                 float64
 AST_RATIO               float64
 AST_TOV                 float64
 DEF_RATING              float64
 DREB_PCT                float64
 EFG_PCT                 float64
 MIN                     float64
 NET_RATING              float64
 OFF_RATING              float64
 OREB_PCT                float64
 PACE                    float64
 PIE                     float64
 PLUS_MINUS              float64
 POSS                    float64
 PTS                     float64
 REB_PCT                 float64
 TM_TOV_PCT              float64
 TS_PCT                  float64
 USG_PCT                 float64
 PLAYER_NAME_x            object
 PLAYER_NAME_y            object
 PLAYER_NAME            category
 dtype: object,
 0              Vince Carter
 1              Vince Carter
 2              Vince Carter
 3              Vince Carter
 4              Vince Carter
                 ...        
 100079    Simone 

In [55]:
df_name = df[['PLAYER_ID', 'PLAYER_NAME']].drop_duplicates('PLAYER_ID')

In [9]:
cat_type = pd.api.types.CategoricalDtype(categories=df.PLAYER_ID.unique(), ordered=False)
df.PLAYER_ID = df.PLAYER_ID.astype(cat_type)
df.dtypes, df.PLAYER_ID

(GAME_ID                object
 TEAM_ID                 int64
 TEAM_ABBREVIATION      object
 TEAM_CITY              object
 PLAYER_ID            category
 PLAYER_NAME            object
 NICKNAME               object
 MIN                   float64
 E_OFF_RATING          float64
 OFF_RATING            float64
 E_DEF_RATING          float64
 DEF_RATING            float64
 E_NET_RATING          float64
 NET_RATING            float64
 AST_PCT               float64
 AST_TOV               float64
 AST_RATIO             float64
 OREB_PCT              float64
 DREB_PCT              float64
 REB_PCT               float64
 TM_TOV_PCT            float64
 EFG_PCT               float64
 TS_PCT                float64
 USG_PCT               float64
 E_USG_PCT             float64
 E_PACE                float64
 PACE                  float64
 PACE_PER40            float64
 POSS                    int64
 PIE                   float64
 dtype: object,
 28    1630264
 20    1628398
 21    1629060
 22     2

In [10]:
scaled_df_eda

Unnamed: 0,PLAYER_ID,GAME_DATE,PLAYER_NAME,PLUS_MINUS,AST_PCT,AST_RATIO,AST_TOV,DEF_RATING,DREB_PCT,EFG_PCT,MIN,NET_RATING,OFF_RATING,OREB_PCT,PACE,PIE,POSS,PTS,REB_PCT,TM_TOV_PCT,TS_PCT,USG_PCT
0,1713,2018-12-03,Vince Carter,-14.9,0.123266,0.169653,0.042814,0.578202,0.250375,0.402074,0.329102,0.501794,0.587002,0.050330,0.014105,0.542849,0.347285,0.421875,0.228959,0.072430,0.431366,0.376921
1,1713,2018-12-05,Vince Carter,-14.8,0.132462,0.147789,0.042814,0.546110,0.255056,0.432649,0.366260,0.513995,0.573205,0.064214,0.013059,0.548627,0.383484,0.437500,0.231373,0.072430,0.465736,0.384718
2,1713,2018-12-08,Vince Carter,-13.9,0.147144,0.161434,0.057086,0.548544,0.268539,0.466538,0.354895,0.515012,0.577199,0.083652,0.013363,0.554863,0.373303,0.437500,0.249472,0.090059,0.499260,0.418802
3,1713,2018-12-12,Vince Carter,-13.9,0.125686,0.163571,0.014271,0.554043,0.256742,0.414582,0.347681,0.500778,0.561224,0.065255,0.014036,0.548169,0.368778,0.445312,0.231071,0.070731,0.447864,0.404767
4,1713,2018-12-14,Vince Carter,-11.5,0.125686,0.163571,0.014271,0.595691,0.256742,0.396729,0.346915,0.476435,0.566125,0.047900,0.013775,0.547654,0.367647,0.474609,0.221418,0.070731,0.422377,0.418133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100079,1631323,2023-02-23,Simone Fontecchio,0.6,0.092611,0.167352,0.071357,0.566844,0.141199,0.415758,0.251965,0.588337,0.706635,0.133287,0.012827,0.522883,0.263575,0.759766,0.171342,0.267842,0.421003,0.387391
100080,1631323,2023-02-25,Simone Fontecchio,2.3,0.107293,0.190860,0.071357,0.579735,0.141199,0.429121,0.264539,0.622010,0.770627,0.152725,0.012945,0.538387,0.277149,0.761719,0.182202,0.214741,0.455690,0.367788
100081,1631323,2023-02-28,Simone Fontecchio,-0.8,0.128751,0.218313,0.080919,0.630758,0.094382,0.375668,0.305364,0.582356,0.761732,0.152725,0.012803,0.522654,0.319005,0.699219,0.139065,0.267842,0.402813,0.317665
100082,1631323,2023-03-03,Simone Fontecchio,-5.3,0.139561,0.228670,0.095191,0.612729,0.045880,0.277635,0.358884,0.601974,0.773260,0.096147,0.011583,0.523112,0.369910,0.666016,0.080543,0.139550,0.305626,0.312542


In [25]:
scaled_df_eda = pd.merge(df.loc[:, ['PLAYER_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'GAME_DATE']], scaled_df_eda, on=['PLAYER_ID', 'GAME_DATE'], how='right')
scaled_df_eda

Unnamed: 0,PLAYER_ID,TEAM_ID,TEAM_ABBREVIATION,GAME_DATE,PLAYER_NAME,PLUS_MINUS,AST_PCT,AST_RATIO,AST_TOV,DEF_RATING,DREB_PCT,EFG_PCT,MIN,NET_RATING,OFF_RATING,OREB_PCT,PACE,PIE,POSS,PTS,REB_PCT,TM_TOV_PCT,TS_PCT,USG_PCT
0,1713,1610612737,ATL,2018-12-03,Vince Carter,-14.9,0.123266,0.169653,0.042814,0.578202,0.250375,0.402074,0.329102,0.501794,0.587002,0.050330,0.014105,0.542849,0.347285,0.421875,0.228959,0.072430,0.431366,0.376921
1,1713,1610612737,ATL,2018-12-05,Vince Carter,-14.8,0.132462,0.147789,0.042814,0.546110,0.255056,0.432649,0.366260,0.513995,0.573205,0.064214,0.013059,0.548627,0.383484,0.437500,0.231373,0.072430,0.465736,0.384718
2,1713,1610612737,ATL,2018-12-08,Vince Carter,-13.9,0.147144,0.161434,0.057086,0.548544,0.268539,0.466538,0.354895,0.515012,0.577199,0.083652,0.013363,0.554863,0.373303,0.437500,0.249472,0.090059,0.499260,0.418802
3,1713,1610612737,ATL,2018-12-12,Vince Carter,-13.9,0.125686,0.163571,0.014271,0.554043,0.256742,0.414582,0.347681,0.500778,0.561224,0.065255,0.014036,0.548169,0.368778,0.445312,0.231071,0.070731,0.447864,0.404767
4,1713,1610612737,ATL,2018-12-14,Vince Carter,-11.5,0.125686,0.163571,0.014271,0.595691,0.256742,0.396729,0.346915,0.476435,0.566125,0.047900,0.013775,0.547654,0.367647,0.474609,0.221418,0.070731,0.422377,0.418133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100079,1631323,1610612762,UTA,2023-02-23,Simone Fontecchio,0.6,0.092611,0.167352,0.071357,0.566844,0.141199,0.415758,0.251965,0.588337,0.706635,0.133287,0.012827,0.522883,0.263575,0.759766,0.171342,0.267842,0.421003,0.387391
100080,1631323,1610612762,UTA,2023-02-25,Simone Fontecchio,2.3,0.107293,0.190860,0.071357,0.579735,0.141199,0.429121,0.264539,0.622010,0.770627,0.152725,0.012945,0.538387,0.277149,0.761719,0.182202,0.214741,0.455690,0.367788
100081,1631323,1610612762,UTA,2023-02-28,Simone Fontecchio,-0.8,0.128751,0.218313,0.080919,0.630758,0.094382,0.375668,0.305364,0.582356,0.761732,0.152725,0.012803,0.522654,0.319005,0.699219,0.139065,0.267842,0.402813,0.317665
100082,1631323,1610612762,UTA,2023-03-03,Simone Fontecchio,-5.3,0.139561,0.228670,0.095191,0.612729,0.045880,0.277635,0.358884,0.601974,0.773260,0.096147,0.011583,0.523112,0.369910,0.666016,0.080543,0.139550,0.305626,0.312542


In [58]:
df_eda = pd.merge(df_eda, df_name, on='PLAYER_ID')

In [64]:
df_eda.loc[:, ['PLAYER_ID', 'PLAYER_NAME']]

Unnamed: 0,PLAYER_ID,PLAYER_NAME
0,1713,Vince Carter
1,1713,Vince Carter
2,1713,Vince Carter
3,1713,Vince Carter
4,1713,Vince Carter
...,...,...
100079,1631323,Simone Fontecchio
100080,1631323,Simone Fontecchio
100081,1631323,Simone Fontecchio
100082,1631323,Simone Fontecchio


In [65]:
df_eda.drop(columns='level_1', inplace=True)

KeyError: "['level_1'] not found in axis"

In [72]:
columns = df_eda.columns
scaled_columns = columns.drop(['PLAYER_ID', 'GAME_DATE', 'PLAYER_NAME', 'PLUS_MINUS'])
unscaled_data = df_eda.loc[: , ['PLAYER_ID', 'GAME_DATE', 'PLAYER_NAME', 'PLUS_MINUS']].copy()

In [73]:
unscaled_data

Unnamed: 0,PLAYER_ID,GAME_DATE,PLAYER_NAME,PLUS_MINUS
0,1713,2018-12-03,Vince Carter,-14.9
1,1713,2018-12-05,Vince Carter,-14.8
2,1713,2018-12-08,Vince Carter,-13.9
3,1713,2018-12-12,Vince Carter,-13.9
4,1713,2018-12-14,Vince Carter,-11.5
...,...,...,...,...
100079,1631323,2023-02-23,Simone Fontecchio,0.6
100080,1631323,2023-02-25,Simone Fontecchio,2.3
100081,1631323,2023-02-28,Simone Fontecchio,-0.8
100082,1631323,2023-03-03,Simone Fontecchio,-5.3


In [27]:
scaled_df_eda

Unnamed: 0,PLAYER_ID,TEAM_ID,TEAM_ABBREVIATION,GAME_DATE,PLAYER_NAME,PLUS_MINUS,AST_PCT,AST_RATIO,AST_TOV,DEF_RATING,DREB_PCT,EFG_PCT,MIN,NET_RATING,OFF_RATING,OREB_PCT,PACE,PIE,POSS,PTS,REB_PCT,TM_TOV_PCT,TS_PCT,USG_PCT
0,1713,1610612737,ATL,2018-12-03,Vince Carter,-14.9,0.123266,0.169653,0.042814,0.578202,0.250375,0.402074,0.329102,0.501794,0.587002,0.050330,0.014105,0.542849,0.347285,0.421875,0.228959,0.072430,0.431366,0.376921
1,1713,1610612737,ATL,2018-12-05,Vince Carter,-14.8,0.132462,0.147789,0.042814,0.546110,0.255056,0.432649,0.366260,0.513995,0.573205,0.064214,0.013059,0.548627,0.383484,0.437500,0.231373,0.072430,0.465736,0.384718
2,1713,1610612737,ATL,2018-12-08,Vince Carter,-13.9,0.147144,0.161434,0.057086,0.548544,0.268539,0.466538,0.354895,0.515012,0.577199,0.083652,0.013363,0.554863,0.373303,0.437500,0.249472,0.090059,0.499260,0.418802
3,1713,1610612737,ATL,2018-12-12,Vince Carter,-13.9,0.125686,0.163571,0.014271,0.554043,0.256742,0.414582,0.347681,0.500778,0.561224,0.065255,0.014036,0.548169,0.368778,0.445312,0.231071,0.070731,0.447864,0.404767
4,1713,1610612737,ATL,2018-12-14,Vince Carter,-11.5,0.125686,0.163571,0.014271,0.595691,0.256742,0.396729,0.346915,0.476435,0.566125,0.047900,0.013775,0.547654,0.367647,0.474609,0.221418,0.070731,0.422377,0.418133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100079,1631323,1610612762,UTA,2023-02-23,Simone Fontecchio,0.6,0.092611,0.167352,0.071357,0.566844,0.141199,0.415758,0.251965,0.588337,0.706635,0.133287,0.012827,0.522883,0.263575,0.759766,0.171342,0.267842,0.421003,0.387391
100080,1631323,1610612762,UTA,2023-02-25,Simone Fontecchio,2.3,0.107293,0.190860,0.071357,0.579735,0.141199,0.429121,0.264539,0.622010,0.770627,0.152725,0.012945,0.538387,0.277149,0.761719,0.182202,0.214741,0.455690,0.367788
100081,1631323,1610612762,UTA,2023-02-28,Simone Fontecchio,-0.8,0.128751,0.218313,0.080919,0.630758,0.094382,0.375668,0.305364,0.582356,0.761732,0.152725,0.012803,0.522654,0.319005,0.699219,0.139065,0.267842,0.402813,0.317665
100082,1631323,1610612762,UTA,2023-03-03,Simone Fontecchio,-5.3,0.139561,0.228670,0.095191,0.612729,0.045880,0.277635,0.358884,0.601974,0.773260,0.096147,0.011583,0.523112,0.369910,0.666016,0.080543,0.139550,0.305626,0.312542


In [74]:
scaler = MinMaxScaler()
df_eda = scaler.fit_transform(df_eda.loc[:, scaled_columns])
df_eda = pd.DataFrame(columns=scaled_columns, data=df_eda)
df_eda = pd.concat((unscaled_data, df_eda), axis=1)
df_eda

Unnamed: 0,PLAYER_ID,GAME_DATE,PLAYER_NAME,PLUS_MINUS,AST_PCT,AST_RATIO,AST_TOV,DEF_RATING,DREB_PCT,EFG_PCT,MIN,NET_RATING,OFF_RATING,OREB_PCT,PACE,PIE,POSS,PTS,REB_PCT,TM_TOV_PCT,TS_PCT,USG_PCT
0,1713,2018-12-03,Vince Carter,-14.9,0.123266,0.169653,0.042814,0.578202,0.250375,0.402074,0.329102,0.501794,0.587002,0.050330,0.014105,0.542849,0.347285,0.421875,0.228959,0.072430,0.431366,0.376921
1,1713,2018-12-05,Vince Carter,-14.8,0.132462,0.147789,0.042814,0.546110,0.255056,0.432649,0.366260,0.513995,0.573205,0.064214,0.013059,0.548627,0.383484,0.437500,0.231373,0.072430,0.465736,0.384718
2,1713,2018-12-08,Vince Carter,-13.9,0.147144,0.161434,0.057086,0.548544,0.268539,0.466538,0.354895,0.515012,0.577199,0.083652,0.013363,0.554863,0.373303,0.437500,0.249472,0.090059,0.499260,0.418802
3,1713,2018-12-12,Vince Carter,-13.9,0.125686,0.163571,0.014271,0.554043,0.256742,0.414582,0.347681,0.500778,0.561224,0.065255,0.014036,0.548169,0.368778,0.445312,0.231071,0.070731,0.447864,0.404767
4,1713,2018-12-14,Vince Carter,-11.5,0.125686,0.163571,0.014271,0.595691,0.256742,0.396729,0.346915,0.476435,0.566125,0.047900,0.013775,0.547654,0.367647,0.474609,0.221418,0.070731,0.422377,0.418133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100079,1631323,2023-02-23,Simone Fontecchio,0.6,0.092611,0.167352,0.071357,0.566844,0.141199,0.415758,0.251965,0.588337,0.706635,0.133287,0.012827,0.522883,0.263575,0.759766,0.171342,0.267842,0.421003,0.387391
100080,1631323,2023-02-25,Simone Fontecchio,2.3,0.107293,0.190860,0.071357,0.579735,0.141199,0.429121,0.264539,0.622010,0.770627,0.152725,0.012945,0.538387,0.277149,0.761719,0.182202,0.214741,0.455690,0.367788
100081,1631323,2023-02-28,Simone Fontecchio,-0.8,0.128751,0.218313,0.080919,0.630758,0.094382,0.375668,0.305364,0.582356,0.761732,0.152725,0.012803,0.522654,0.319005,0.699219,0.139065,0.267842,0.402813,0.317665
100082,1631323,2023-03-03,Simone Fontecchio,-5.3,0.139561,0.228670,0.095191,0.612729,0.045880,0.277635,0.358884,0.601974,0.773260,0.096147,0.011583,0.523112,0.369910,0.666016,0.080543,0.139550,0.305626,0.312542


In [26]:
scaled_df_eda.to_pickle('/Users/lj/code/5pacepenguin/nba_betting_analysis/nba_betting_analysis/backend/data/pkl/scaled_boxscore_advanced_rolling_players.pkl')

In [77]:
df_vif = df_sorted[['OFF_RATING', 'DEF_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO', 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'USG_PCT', 'PACE', 'POSS']].dropna()

In [78]:
df_vif

Unnamed: 0,OFF_RATING,DEF_RATING,AST_PCT,AST_TOV,AST_RATIO,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,PACE,POSS
57022,96.5,139.7,0.353,3.0,27.3,0.017,9.1,0.357,0.357,0.242,105.75,57
57005,109.5,124.2,0.000,0.0,0.0,0.075,12.5,0.654,0.698,0.239,99.72,63
57004,103.8,128.0,0.111,0.0,16.7,0.143,0.0,0.400,0.400,0.185,99.92,26
57003,101.4,121.6,0.091,2.0,11.1,0.038,5.6,0.667,0.649,0.203,99.80,73
57001,98.1,111.8,0.143,2.0,14.3,0.105,7.1,0.889,0.848,0.207,98.75,52
...,...,...,...,...,...,...,...,...,...,...,...,...
47974,78.9,93.4,0.150,1.0,17.6,0.092,17.6,0.364,0.364,0.177,101.90,76
47975,133.3,77.3,0.000,0.0,0.0,0.000,0.0,0.750,0.750,0.087,107.87,21
47976,96.3,79.5,0.400,8.0,27.6,0.080,3.4,0.553,0.578,0.228,101.11,82
47965,46.2,138.5,0.000,0.0,0.0,0.071,33.3,0.000,0.000,0.200,102.02,13


In [79]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler

In [80]:
scaler = MinMaxScaler()
df_vif_vals = scaler.fit_transform(df_vif)

In [81]:
df_vif_vals

array([[0.27571429, 0.34925   , 0.353     , ..., 0.242     , 0.00734081,
        0.47107438],
       [0.31285714, 0.3105    , 0.        , ..., 0.239     , 0.00692223,
        0.52066116],
       [0.29657143, 0.32      , 0.111     , ..., 0.185     , 0.00693611,
        0.21487603],
       ...,
       [0.27514286, 0.19875   , 0.4       , ..., 0.228     , 0.00701872,
        0.67768595],
       [0.132     , 0.34625   , 0.        , ..., 0.2       , 0.00708189,
        0.10743802],
       [0.41771429, 0.41975   , 0.083     , ..., 0.034     , 0.00698332,
        0.21487603]])

In [82]:
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = df_vif.columns
  
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(df_vif_vals, i)
                          for i in range(len(df_vif.columns))]
  
vif_data

Unnamed: 0,feature,VIF
0,OFF_RATING,19.508758
1,DEF_RATING,12.543131
2,AST_PCT,6.776613
3,AST_TOV,2.169016
4,AST_RATIO,6.954879
5,REB_PCT,2.648547
6,TM_TOV_PCT,1.660056
7,EFG_PCT,37.861055
8,TS_PCT,45.307898
9,USG_PCT,8.752835
