In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests

In [2]:
df = pd.read_pickle('../data/pkl/raw_games_5yrs.pkl')

In [3]:
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

In [4]:
# df = df[df['GAME_DATE'] > "2022-09-01"].sort_values(by='GAME_DATE', ascending=False)

In [5]:
# games.groupby('TEAM_ID').apply(lambda x: x)

In [6]:
df.reset_index(drop=True, inplace=True)

In [7]:
map_id_name = df[['TEAM_ID', 'TEAM_NAME']].drop_duplicates().reset_index(drop=True)

In [8]:
map_id_name

Unnamed: 0,TEAM_ID,TEAM_NAME
0,1610612737,Atlanta Hawks
1,1610612738,Boston Celtics
2,1610612739,Cleveland Cavaliers
3,1610612740,New Orleans Pelicans
4,1610612741,Chicago Bulls
5,1610612742,Dallas Mavericks
6,1610612743,Denver Nuggets
7,1610612744,Golden State Warriors
8,1610612745,Houston Rockets
9,1610612746,LA Clippers


In [9]:
df.dtypes.value_counts()

int64             13
float64            9
object             5
int32              1
datetime64[ns]     1
dtype: int64

In [10]:
feat_categorical_nunique = df.select_dtypes(include='object').nunique()

In [11]:
feat_categorical_nunique

SEASON_ID              19
TEAM_ABBREVIATION      30
TEAM_NAME              30
MATCHUP              1777
WL                      2
dtype: int64

In [12]:
df.sort_values(['GAME_DATE', 'GAME_ID', 'HOME_TEAM'], ascending=[False, False, False], inplace=True)

## REMOVING SINGLE GAME ROWS

In [13]:
value_counts = df['GAME_ID'].value_counts()

In [14]:
value_counts

22200973      2
21900424      2
21900427      2
21900428      2
21900429      2
             ..
21801044      1
21900107      1
1622200006    1
1522200001    1
12100027      1
Name: GAME_ID, Length: 6999, dtype: int64

In [15]:
unique_values = value_counts[value_counts == 1].index.tolist()

In [16]:
unique_values

[21900024,
 1521900020,
 21900020,
 1321900006,
 1622100004,
 22100054,
 21801185,
 11900070,
 1621900004,
 1522100022,
 21900021,
 11900067,
 12100046,
 1522100017,
 21900556,
 1521900004,
 21900009,
 12100038,
 21801166,
 21900006,
 21900010,
 21900011,
 21900013,
 12200002,
 12200008,
 1522100006,
 1521900006,
 1522100004,
 1521900001,
 21801155,
 21801172,
 21900001,
 21800507,
 1521900021,
 22100060,
 1521900009,
 11900064,
 11900047,
 21801192,
 11900012,
 1521900058,
 11900021,
 11900020,
 12100012,
 12100010,
 11900018,
 1521900067,
 1521900068,
 41800132,
 1522100063,
 12100006,
 41800143,
 41800222,
 22100009,
 11900011,
 1521900054,
 21900694,
 11900009,
 12100001,
 11900008,
 11900007,
 1521900076,
 1521900077,
 41800156,
 1522100068,
 11900004,
 11900001,
 1521900079,
 1521900080,
 21900601,
 11900025,
 22100031,
 1521900030,
 21801207,
 12100052,
 21801194,
 1521900032,
 21801195,
 21801196,
 21801197,
 1621900001,
 21900568,
 12200025,
 11900057,
 1521900034,
 1522100034

In [17]:
df = df[~df['GAME_ID'].isin(unique_values)]

In [18]:
df = df.reset_index(drop=True)

## PREPROCESS

In [19]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [20]:
scaler = MinMaxScaler()

In [21]:
selected_columns = ['FG_PCT', 'FG3_PCT', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF']

In [22]:
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [23]:
df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,HOME_TEAM
0,22022,1610612764,WAS,Washington Wizards,22200973,2023-03-05,WAS vs. MIL,L,239,111,...,0.689655,0.466667,0.666667,0.325581,0.363636,0.10,0.25000,0.333333,-6.0,1
1,22022,1610612749,MIL,Milwaukee Bucks,22200973,2023-03-05,MIL @ WAS,W,240,117,...,0.275862,0.511111,0.470588,0.534884,0.227273,0.30,0.34375,0.230769,6.0,0
2,22022,1610612746,LAC,LA Clippers,22200970,2023-03-05,LAC vs. MEM,W,239,135,...,0.448276,0.555556,0.607843,0.418605,0.272727,0.10,0.50000,0.282051,6.0,1
3,22022,1610612763,MEM,Memphis Grizzlies,22200970,2023-03-05,MEM @ LAC,L,240,129,...,0.068966,0.200000,0.078431,0.581395,0.590909,0.15,0.21875,0.435897,-6.0,0
4,22022,1610612738,BOS,Boston Celtics,22200969,2023-03-05,BOS vs. NYK,L,289,129,...,0.586207,0.488889,0.627451,0.465116,0.363636,0.25,0.46875,0.564103,-2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13397,22017,1610612756,PHX,Phoenix Suns,21700952,2018-03-05,PHX @ MIA,L,239,103,...,0.413793,0.288889,0.352941,0.325581,0.136364,0.10,0.50000,0.358974,-22.0,0
13398,22017,1610612754,IND,Indiana Pacers,21700951,2018-03-05,IND vs. MIL,W,240,92,...,0.275862,0.311111,0.294118,0.255814,0.590909,0.25,0.40625,0.230769,3.0,1
13399,22017,1610612749,MIL,Milwaukee Bucks,21700951,2018-03-05,MIL @ IND,L,240,89,...,0.275862,0.355556,0.333333,0.186047,0.363636,0.35,0.46875,0.256410,-3.0,0
13400,22017,1610612739,CLE,Cleveland Cavaliers,21700950,2018-03-05,CLE vs. DET,W,241,112,...,0.379310,0.600000,0.607843,0.511628,0.363636,0.10,0.40625,0.307692,22.0,1


In [24]:
ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(df[['TEAM_ABBREVIATION']])
df[ohe.get_feature_names_out()] = ohe.transform(df[['TEAM_ABBREVIATION']])



In [25]:
df.drop(columns = ["TEAM_ABBREVIATION"], inplace = True)

In [26]:
df.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP',
       'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
       'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PLUS_MINUS', 'HOME_TEAM', 'TEAM_ABBREVIATION_ATL',
       'TEAM_ABBREVIATION_BKN', 'TEAM_ABBREVIATION_BOS',
       'TEAM_ABBREVIATION_CHA', 'TEAM_ABBREVIATION_CHI',
       'TEAM_ABBREVIATION_CLE', 'TEAM_ABBREVIATION_DAL',
       'TEAM_ABBREVIATION_DEN', 'TEAM_ABBREVIATION_DET',
       'TEAM_ABBREVIATION_GSW', 'TEAM_ABBREVIATION_HOU',
       'TEAM_ABBREVIATION_IND', 'TEAM_ABBREVIATION_LAC',
       'TEAM_ABBREVIATION_LAL', 'TEAM_ABBREVIATION_MEM',
       'TEAM_ABBREVIATION_MIA', 'TEAM_ABBREVIATION_MIL',
       'TEAM_ABBREVIATION_MIN', 'TEAM_ABBREVIATION_NOP',
       'TEAM_ABBREVIATION_NYK', 'TEAM_ABBREVIATION_OKC',
       'TEAM_ABBREVIATION_ORL', 'TEAM_ABBREVIATION_PHI',
       'TEAM_ABBREVIATION_PHX', 'TEAM_ABBREVIATION_POR',
       'TEAM

In [27]:
ohe.get_feature_names_out()

array(['TEAM_ABBREVIATION_ATL', 'TEAM_ABBREVIATION_BKN',
       'TEAM_ABBREVIATION_BOS', 'TEAM_ABBREVIATION_CHA',
       'TEAM_ABBREVIATION_CHI', 'TEAM_ABBREVIATION_CLE',
       'TEAM_ABBREVIATION_DAL', 'TEAM_ABBREVIATION_DEN',
       'TEAM_ABBREVIATION_DET', 'TEAM_ABBREVIATION_GSW',
       'TEAM_ABBREVIATION_HOU', 'TEAM_ABBREVIATION_IND',
       'TEAM_ABBREVIATION_LAC', 'TEAM_ABBREVIATION_LAL',
       'TEAM_ABBREVIATION_MEM', 'TEAM_ABBREVIATION_MIA',
       'TEAM_ABBREVIATION_MIL', 'TEAM_ABBREVIATION_MIN',
       'TEAM_ABBREVIATION_NOP', 'TEAM_ABBREVIATION_NYK',
       'TEAM_ABBREVIATION_OKC', 'TEAM_ABBREVIATION_ORL',
       'TEAM_ABBREVIATION_PHI', 'TEAM_ABBREVIATION_PHX',
       'TEAM_ABBREVIATION_POR', 'TEAM_ABBREVIATION_SAC',
       'TEAM_ABBREVIATION_SAS', 'TEAM_ABBREVIATION_TOR',
       'TEAM_ABBREVIATION_UTA', 'TEAM_ABBREVIATION_WAS'], dtype=object)

## ROLLING

In [28]:
df_rolling = df[selected_columns + ["SEASON_ID", "PLUS_MINUS", "TEAM_NAME", "GAME_DATE"]]

In [29]:
df_rolling

Unnamed: 0,FG_PCT,FG3_PCT,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,SEASON_ID,PLUS_MINUS,TEAM_NAME,GAME_DATE
0,0.414798,0.236601,0.887381,0.689655,0.466667,0.666667,0.325581,0.363636,0.10,0.25000,0.333333,22022,-6.0,Washington Wizards,2023-03-05
1,0.500000,0.486275,0.691995,0.275862,0.511111,0.470588,0.534884,0.227273,0.30,0.34375,0.230769,22022,6.0,Milwaukee Bucks,2023-03-05
2,0.645740,0.440523,0.922659,0.448276,0.555556,0.607843,0.418605,0.272727,0.10,0.50000,0.282051,22022,6.0,LA Clippers,2023-03-05
3,0.677130,0.590850,0.637720,0.068966,0.200000,0.078431,0.581395,0.590909,0.15,0.21875,0.435897,22022,-6.0,Memphis Grizzlies,2023-03-05
4,0.343049,0.364706,0.837178,0.586207,0.488889,0.627451,0.465116,0.363636,0.25,0.46875,0.564103,22022,-2.0,Boston Celtics,2023-03-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13397,0.542601,0.444444,0.321574,0.413793,0.288889,0.352941,0.325581,0.136364,0.10,0.50000,0.358974,22017,-22.0,Phoenix Suns,2018-03-05
13398,0.421525,0.667974,0.677069,0.275862,0.311111,0.294118,0.255814,0.590909,0.25,0.40625,0.230769,22017,3.0,Indiana Pacers,2018-03-05
13399,0.461883,0.296732,0.830393,0.275862,0.355556,0.333333,0.186047,0.363636,0.35,0.46875,0.256410,22017,-3.0,Milwaukee Bucks,2018-03-05
13400,0.556054,0.375163,0.761194,0.379310,0.600000,0.607843,0.511628,0.363636,0.10,0.40625,0.307692,22017,22.0,Cleveland Cavaliers,2018-03-05


In [30]:
df_rolling.sort_values(['GAME_DATE'], ascending=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rolling.sort_values(['GAME_DATE'], ascending=True, inplace=True)


In [31]:
def find_team_averages(team):
    return team.rolling(10).mean()

df_rolling = df_rolling.groupby(["TEAM_NAME"], group_keys=False).apply(find_team_averages)

  return team.rolling(10).mean()


In [32]:
df_rolling

Unnamed: 0,FG_PCT,FG3_PCT,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,SEASON_ID,PLUS_MINUS
13401,,,,,,,,,,,,,
13388,,,,,,,,,,,,,
13389,,,,,,,,,,,,,
13390,,,,,,,,,,,,,
13391,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,0.545740,0.390719,0.704478,0.313793,0.293333,0.300000,0.474419,0.340909,0.290,0.325000,0.361538,22022.0,-2.76
18,0.588565,0.443268,0.682904,0.255172,0.333333,0.301961,0.418605,0.277273,0.140,0.325000,0.376923,22022.0,2.10
19,0.543274,0.335817,0.700271,0.413793,0.400000,0.450980,0.472093,0.395455,0.250,0.337500,0.415385,22022.0,5.50
10,0.499776,0.330588,0.694844,0.386207,0.386667,0.423529,0.376744,0.368182,0.225,0.421875,0.361538,22022.0,-1.10


In [33]:
df_rolling = df_rolling.sort_index()

In [34]:
df_rolling = df_rolling[selected_columns]

In [35]:
new_column_names = {}
for col in df_rolling.columns:
    new_column_names[col] = col + '_rolling'

# rename the columns using the dictionary
df_rolling = df_rolling.rename(columns=new_column_names)

In [36]:
df_rolling

Unnamed: 0,FG_PCT_rolling,FG3_PCT_rolling,FT_PCT_rolling,OREB_rolling,DREB_rolling,REB_rolling,AST_rolling,STL_rolling,BLK_rolling,TOV_rolling,PF_rolling
0,0.607175,0.428235,0.776662,0.358621,0.395556,0.415686,0.427907,0.327273,0.205,0.437500,0.261538
1,0.538117,0.421699,0.635278,0.355172,0.540000,0.541176,0.460465,0.250000,0.210,0.337500,0.338462
2,0.558072,0.389804,0.783718,0.282759,0.395556,0.372549,0.446512,0.345455,0.190,0.415625,0.371795
3,0.508969,0.353987,0.671099,0.317241,0.393333,0.390196,0.423256,0.418182,0.305,0.271875,0.379487
4,0.529596,0.416993,0.714111,0.362069,0.455556,0.470588,0.469767,0.313636,0.255,0.387500,0.376923
...,...,...,...,...,...,...,...,...,...,...,...
13397,,,,,,,,,,,
13398,,,,,,,,,,,
13399,,,,,,,,,,,
13400,,,,,,,,,,,


In [37]:
combined_df = pd.concat([df, df_rolling], axis=1)
combined_df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,...,FG3_PCT_rolling,FT_PCT_rolling,OREB_rolling,DREB_rolling,REB_rolling,AST_rolling,STL_rolling,BLK_rolling,TOV_rolling,PF_rolling
0,22022,1610612764,Washington Wizards,22200973,2023-03-05,WAS vs. MIL,L,239,111,46,...,0.428235,0.776662,0.358621,0.395556,0.415686,0.427907,0.327273,0.205,0.437500,0.261538
1,22022,1610612749,Milwaukee Bucks,22200973,2023-03-05,MIL @ WAS,W,240,117,39,...,0.421699,0.635278,0.355172,0.540000,0.541176,0.460465,0.250000,0.210,0.337500,0.338462
2,22022,1610612746,LA Clippers,22200970,2023-03-05,LAC vs. MEM,W,239,135,45,...,0.389804,0.783718,0.282759,0.395556,0.372549,0.446512,0.345455,0.190,0.415625,0.371795
3,22022,1610612763,Memphis Grizzlies,22200970,2023-03-05,MEM @ LAC,L,240,129,50,...,0.353987,0.671099,0.317241,0.393333,0.390196,0.423256,0.418182,0.305,0.271875,0.379487
4,22022,1610612738,Boston Celtics,22200969,2023-03-05,BOS vs. NYK,L,289,129,43,...,0.416993,0.714111,0.362069,0.455556,0.470588,0.469767,0.313636,0.255,0.387500,0.376923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13397,22017,1610612756,Phoenix Suns,21700952,2018-03-05,PHX @ MIA,L,239,103,42,...,,,,,,,,,,
13398,22017,1610612754,Indiana Pacers,21700951,2018-03-05,IND vs. MIL,W,240,92,33,...,,,,,,,,,,
13399,22017,1610612749,Milwaukee Bucks,21700951,2018-03-05,MIL @ IND,L,240,89,34,...,,,,,,,,,,
13400,22017,1610612739,Cleveland Cavaliers,21700950,2018-03-05,CLE vs. DET,W,241,112,43,...,,,,,,,,,,


In [38]:
combined_df = combined_df.drop(columns=selected_columns)

In [39]:
combined_df = combined_df.dropna()
combined_df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,...,FG3_PCT_rolling,FT_PCT_rolling,OREB_rolling,DREB_rolling,REB_rolling,AST_rolling,STL_rolling,BLK_rolling,TOV_rolling,PF_rolling
0,22022,1610612764,Washington Wizards,22200973,2023-03-05,WAS vs. MIL,L,239,111,46,...,0.428235,0.776662,0.358621,0.395556,0.415686,0.427907,0.327273,0.205,0.437500,0.261538
1,22022,1610612749,Milwaukee Bucks,22200973,2023-03-05,MIL @ WAS,W,240,117,39,...,0.421699,0.635278,0.355172,0.540000,0.541176,0.460465,0.250000,0.210,0.337500,0.338462
2,22022,1610612746,LA Clippers,22200970,2023-03-05,LAC vs. MEM,W,239,135,45,...,0.389804,0.783718,0.282759,0.395556,0.372549,0.446512,0.345455,0.190,0.415625,0.371795
3,22022,1610612763,Memphis Grizzlies,22200970,2023-03-05,MEM @ LAC,L,240,129,50,...,0.353987,0.671099,0.317241,0.393333,0.390196,0.423256,0.418182,0.305,0.271875,0.379487
4,22022,1610612738,Boston Celtics,22200969,2023-03-05,BOS vs. NYK,L,289,129,43,...,0.416993,0.714111,0.362069,0.455556,0.470588,0.469767,0.313636,0.255,0.387500,0.376923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13148,22017,1610612740,New Orleans Pelicans,21701075,2018-03-22,NOP vs. LAL,W,241,128,52,...,0.351242,0.774084,0.279310,0.466667,0.433333,0.430233,0.309091,0.375,0.368750,0.307692
13150,22017,1610612745,Houston Rockets,21701074,2018-03-22,HOU vs. DET,W,265,100,35,...,0.389542,0.647218,0.334483,0.415556,0.419608,0.327907,0.322727,0.275,0.359375,0.358974
13155,22017,1610612763,Memphis Grizzlies,21701072,2018-03-22,MEM @ CHA,L,239,79,30,...,0.353072,0.628494,0.358621,0.340000,0.366667,0.346512,0.304545,0.235,0.421875,0.451282
13169,22017,1610612761,Toronto Raptors,21701065,2018-03-21,TOR @ CLE,L,239,129,47,...,0.399085,0.743148,0.417241,0.397778,0.450980,0.458140,0.286364,0.280,0.387500,0.353846


In [40]:
combined_df.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP',
       'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
       'PLUS_MINUS', 'HOME_TEAM', 'TEAM_ABBREVIATION_ATL',
       'TEAM_ABBREVIATION_BKN', 'TEAM_ABBREVIATION_BOS',
       'TEAM_ABBREVIATION_CHA', 'TEAM_ABBREVIATION_CHI',
       'TEAM_ABBREVIATION_CLE', 'TEAM_ABBREVIATION_DAL',
       'TEAM_ABBREVIATION_DEN', 'TEAM_ABBREVIATION_DET',
       'TEAM_ABBREVIATION_GSW', 'TEAM_ABBREVIATION_HOU',
       'TEAM_ABBREVIATION_IND', 'TEAM_ABBREVIATION_LAC',
       'TEAM_ABBREVIATION_LAL', 'TEAM_ABBREVIATION_MEM',
       'TEAM_ABBREVIATION_MIA', 'TEAM_ABBREVIATION_MIL',
       'TEAM_ABBREVIATION_MIN', 'TEAM_ABBREVIATION_NOP',
       'TEAM_ABBREVIATION_NYK', 'TEAM_ABBREVIATION_OKC',
       'TEAM_ABBREVIATION_ORL', 'TEAM_ABBREVIATION_PHI',
       'TEAM_ABBREVIATION_PHX', 'TEAM_ABBREVIATION_POR',
       'TEAM_ABBREVIATION_SAC', 'TEAM_ABBREVIATION_SAS',
       'TEAM_ABBREVIATION_TOR', 'TEAM_ABBREVIATION

## COMBINING ROWS

In [41]:
df_in_process = combined_df.drop(columns=['SEASON_ID', 'WL', 'MIN', 'MATCHUP', 'PTS', 'TEAM_ID', 'TEAM_NAME', 'FTM', 'FTA', 'FGM', 'FGA', 'FG3M', 'FG3A', 'HOME_TEAM'])

In [42]:
home_df = df_in_process.iloc[::2].reset_index(drop=True)

In [43]:
home_df

Unnamed: 0,GAME_ID,GAME_DATE,PLUS_MINUS,TEAM_ABBREVIATION_ATL,TEAM_ABBREVIATION_BKN,TEAM_ABBREVIATION_BOS,TEAM_ABBREVIATION_CHA,TEAM_ABBREVIATION_CHI,TEAM_ABBREVIATION_CLE,TEAM_ABBREVIATION_DAL,...,FG3_PCT_rolling,FT_PCT_rolling,OREB_rolling,DREB_rolling,REB_rolling,AST_rolling,STL_rolling,BLK_rolling,TOV_rolling,PF_rolling
0,22200973,2023-03-05,-6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.428235,0.776662,0.358621,0.395556,0.415686,0.427907,0.327273,0.205,0.437500,0.261538
1,22200970,2023-03-05,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.389804,0.783718,0.282759,0.395556,0.372549,0.446512,0.345455,0.190,0.415625,0.371795
2,22200969,2023-03-05,-2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.416993,0.714111,0.362069,0.455556,0.470588,0.469767,0.313636,0.255,0.387500,0.376923
3,22200968,2023-03-05,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.345229,0.716282,0.424138,0.337778,0.401961,0.432558,0.390909,0.145,0.309375,0.407692
4,22200967,2023-03-05,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.330065,0.677205,0.455172,0.373333,0.450980,0.388372,0.350000,0.220,0.406250,0.389744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6561,21701082,2018-03-23,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.428105,0.721710,0.375862,0.420000,0.447059,0.460465,0.254545,0.275,0.381250,0.361538
6562,21701079,2018-03-23,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.440131,0.724559,0.341379,0.455556,0.458824,0.451163,0.350000,0.290,0.390625,0.320513
6563,21701078,2018-03-23,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.366144,0.617503,0.358621,0.420000,0.437255,0.355814,0.295455,0.170,0.393750,0.312821
6564,21701074,2018-03-22,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.389542,0.647218,0.334483,0.415556,0.419608,0.327907,0.322727,0.275,0.359375,0.358974


In [44]:
away_df = df_in_process.iloc[1::2].reset_index(drop=True)

In [45]:
away_df

Unnamed: 0,GAME_ID,GAME_DATE,PLUS_MINUS,TEAM_ABBREVIATION_ATL,TEAM_ABBREVIATION_BKN,TEAM_ABBREVIATION_BOS,TEAM_ABBREVIATION_CHA,TEAM_ABBREVIATION_CHI,TEAM_ABBREVIATION_CLE,TEAM_ABBREVIATION_DAL,...,FG3_PCT_rolling,FT_PCT_rolling,OREB_rolling,DREB_rolling,REB_rolling,AST_rolling,STL_rolling,BLK_rolling,TOV_rolling,PF_rolling
0,22200973,2023-03-05,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.421699,0.635278,0.355172,0.540000,0.541176,0.460465,0.250000,0.210,0.337500,0.338462
1,22200970,2023-03-05,-6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.353987,0.671099,0.317241,0.393333,0.390196,0.423256,0.418182,0.305,0.271875,0.379487
2,22200969,2023-03-05,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.421307,0.632429,0.324138,0.468889,0.460784,0.323256,0.222727,0.195,0.353125,0.376923
3,22200968,2023-03-05,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.317778,0.720217,0.465517,0.464444,0.537255,0.425581,0.222727,0.405,0.506250,0.348718
4,22200967,2023-03-05,-32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.339608,0.682497,0.431034,0.391111,0.452941,0.416279,0.318182,0.200,0.431250,0.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6561,21701080,2018-03-23,25.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.421961,0.660109,0.282759,0.408889,0.384314,0.395349,0.313636,0.265,0.365625,0.338462
6562,21701078,2018-03-23,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.385882,0.766757,0.379310,0.357778,0.394118,0.306977,0.459091,0.200,0.346875,0.325641
6563,21701075,2018-03-22,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.351242,0.774084,0.279310,0.466667,0.433333,0.430233,0.309091,0.375,0.368750,0.307692
6564,21701072,2018-03-22,-61.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.353072,0.628494,0.358621,0.340000,0.366667,0.346512,0.304545,0.235,0.421875,0.451282


In [46]:
new_column_names = {}
for col in home_df.columns:
    new_column_names[col] = col + '_h'

# rename the columns using the dictionary
home_df = home_df.rename(columns=new_column_names)

In [47]:
new_column_names = {}
for col in away_df.columns:
    new_column_names[col] = col + '_a'

# rename the columns using the dictionary
away_df = away_df.rename(columns=new_column_names)

In [48]:
home_df.columns

Index(['GAME_ID_h', 'GAME_DATE_h', 'PLUS_MINUS_h', 'TEAM_ABBREVIATION_ATL_h',
       'TEAM_ABBREVIATION_BKN_h', 'TEAM_ABBREVIATION_BOS_h',
       'TEAM_ABBREVIATION_CHA_h', 'TEAM_ABBREVIATION_CHI_h',
       'TEAM_ABBREVIATION_CLE_h', 'TEAM_ABBREVIATION_DAL_h',
       'TEAM_ABBREVIATION_DEN_h', 'TEAM_ABBREVIATION_DET_h',
       'TEAM_ABBREVIATION_GSW_h', 'TEAM_ABBREVIATION_HOU_h',
       'TEAM_ABBREVIATION_IND_h', 'TEAM_ABBREVIATION_LAC_h',
       'TEAM_ABBREVIATION_LAL_h', 'TEAM_ABBREVIATION_MEM_h',
       'TEAM_ABBREVIATION_MIA_h', 'TEAM_ABBREVIATION_MIL_h',
       'TEAM_ABBREVIATION_MIN_h', 'TEAM_ABBREVIATION_NOP_h',
       'TEAM_ABBREVIATION_NYK_h', 'TEAM_ABBREVIATION_OKC_h',
       'TEAM_ABBREVIATION_ORL_h', 'TEAM_ABBREVIATION_PHI_h',
       'TEAM_ABBREVIATION_PHX_h', 'TEAM_ABBREVIATION_POR_h',
       'TEAM_ABBREVIATION_SAC_h', 'TEAM_ABBREVIATION_SAS_h',
       'TEAM_ABBREVIATION_TOR_h', 'TEAM_ABBREVIATION_UTA_h',
       'TEAM_ABBREVIATION_WAS_h', 'FG_PCT_rolling_h', 'FG3_PCT_rolli

In [49]:
away_df.columns

Index(['GAME_ID_a', 'GAME_DATE_a', 'PLUS_MINUS_a', 'TEAM_ABBREVIATION_ATL_a',
       'TEAM_ABBREVIATION_BKN_a', 'TEAM_ABBREVIATION_BOS_a',
       'TEAM_ABBREVIATION_CHA_a', 'TEAM_ABBREVIATION_CHI_a',
       'TEAM_ABBREVIATION_CLE_a', 'TEAM_ABBREVIATION_DAL_a',
       'TEAM_ABBREVIATION_DEN_a', 'TEAM_ABBREVIATION_DET_a',
       'TEAM_ABBREVIATION_GSW_a', 'TEAM_ABBREVIATION_HOU_a',
       'TEAM_ABBREVIATION_IND_a', 'TEAM_ABBREVIATION_LAC_a',
       'TEAM_ABBREVIATION_LAL_a', 'TEAM_ABBREVIATION_MEM_a',
       'TEAM_ABBREVIATION_MIA_a', 'TEAM_ABBREVIATION_MIL_a',
       'TEAM_ABBREVIATION_MIN_a', 'TEAM_ABBREVIATION_NOP_a',
       'TEAM_ABBREVIATION_NYK_a', 'TEAM_ABBREVIATION_OKC_a',
       'TEAM_ABBREVIATION_ORL_a', 'TEAM_ABBREVIATION_PHI_a',
       'TEAM_ABBREVIATION_PHX_a', 'TEAM_ABBREVIATION_POR_a',
       'TEAM_ABBREVIATION_SAC_a', 'TEAM_ABBREVIATION_SAS_a',
       'TEAM_ABBREVIATION_TOR_a', 'TEAM_ABBREVIATION_UTA_a',
       'TEAM_ABBREVIATION_WAS_a', 'FG_PCT_rolling_a', 'FG3_PCT_rolli

In [50]:
home_df = home_df.rename(columns={'GAME_ID_h': 'GAME_ID'})

In [51]:
away_df = away_df.rename(columns={'GAME_ID_a': 'GAME_ID'})

In [52]:
home_df = home_df.drop(columns=['GAME_DATE_h'])

In [53]:
away_df = away_df.drop(columns=['PLUS_MINUS_a', 'GAME_DATE_a'])

In [54]:
combined_df = pd.merge(home_df, away_df, on='GAME_ID')
combined_df

Unnamed: 0,GAME_ID,PLUS_MINUS_h,TEAM_ABBREVIATION_ATL_h,TEAM_ABBREVIATION_BKN_h,TEAM_ABBREVIATION_BOS_h,TEAM_ABBREVIATION_CHA_h,TEAM_ABBREVIATION_CHI_h,TEAM_ABBREVIATION_CLE_h,TEAM_ABBREVIATION_DAL_h,TEAM_ABBREVIATION_DEN_h,...,FG3_PCT_rolling_a,FT_PCT_rolling_a,OREB_rolling_a,DREB_rolling_a,REB_rolling_a,AST_rolling_a,STL_rolling_a,BLK_rolling_a,TOV_rolling_a,PF_rolling_a
0,22200973,-6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.421699,0.635278,0.355172,0.540000,0.541176,0.460465,0.250000,0.210,0.337500,0.338462
1,22200970,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.353987,0.671099,0.317241,0.393333,0.390196,0.423256,0.418182,0.305,0.271875,0.379487
2,22200969,-2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.421307,0.632429,0.324138,0.468889,0.460784,0.323256,0.222727,0.195,0.353125,0.376923
3,22200968,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.317778,0.720217,0.465517,0.464444,0.537255,0.425581,0.222727,0.405,0.506250,0.348718
4,22200967,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.339608,0.682497,0.431034,0.391111,0.452941,0.416279,0.318182,0.200,0.431250,0.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6553,21701092,-7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.369542,0.680326,0.337931,0.444444,0.447059,0.362791,0.318182,0.215,0.409375,0.356410
6554,21701091,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.740163,0.303448,0.464444,0.445098,0.413953,0.327273,0.370,0.368750,0.287179
6555,21701089,-22.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.388758,0.649118,0.341379,0.482222,0.482353,0.351163,0.363636,0.315,0.368750,0.287179
6556,21701085,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.344706,0.682361,0.331034,0.497778,0.490196,0.427907,0.409091,0.305,0.421875,0.348718


In [55]:
combined_df.columns

Index(['GAME_ID', 'PLUS_MINUS_h', 'TEAM_ABBREVIATION_ATL_h',
       'TEAM_ABBREVIATION_BKN_h', 'TEAM_ABBREVIATION_BOS_h',
       'TEAM_ABBREVIATION_CHA_h', 'TEAM_ABBREVIATION_CHI_h',
       'TEAM_ABBREVIATION_CLE_h', 'TEAM_ABBREVIATION_DAL_h',
       'TEAM_ABBREVIATION_DEN_h', 'TEAM_ABBREVIATION_DET_h',
       'TEAM_ABBREVIATION_GSW_h', 'TEAM_ABBREVIATION_HOU_h',
       'TEAM_ABBREVIATION_IND_h', 'TEAM_ABBREVIATION_LAC_h',
       'TEAM_ABBREVIATION_LAL_h', 'TEAM_ABBREVIATION_MEM_h',
       'TEAM_ABBREVIATION_MIA_h', 'TEAM_ABBREVIATION_MIL_h',
       'TEAM_ABBREVIATION_MIN_h', 'TEAM_ABBREVIATION_NOP_h',
       'TEAM_ABBREVIATION_NYK_h', 'TEAM_ABBREVIATION_OKC_h',
       'TEAM_ABBREVIATION_ORL_h', 'TEAM_ABBREVIATION_PHI_h',
       'TEAM_ABBREVIATION_PHX_h', 'TEAM_ABBREVIATION_POR_h',
       'TEAM_ABBREVIATION_SAC_h', 'TEAM_ABBREVIATION_SAS_h',
       'TEAM_ABBREVIATION_TOR_h', 'TEAM_ABBREVIATION_UTA_h',
       'TEAM_ABBREVIATION_WAS_h', 'FG_PCT_rolling_h', 'FG3_PCT_rolling_h',
       'FT

## X, y

In [56]:
X = combined_df.drop(columns=['PLUS_MINUS_h'])
y = combined_df.PLUS_MINUS_h

In [57]:
# X.drop(columns=['SEASON_ID', 'WL', 'MIN', 'MATCHUP', 'PTS', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'FTM', 'FTA', 'FGM', 'FGA', 'FG3M', 'FG3A'], inplace=True)

In [58]:
X.drop(columns=['GAME_ID'], inplace=True)

In [59]:
X.columns

Index(['TEAM_ABBREVIATION_ATL_h', 'TEAM_ABBREVIATION_BKN_h',
       'TEAM_ABBREVIATION_BOS_h', 'TEAM_ABBREVIATION_CHA_h',
       'TEAM_ABBREVIATION_CHI_h', 'TEAM_ABBREVIATION_CLE_h',
       'TEAM_ABBREVIATION_DAL_h', 'TEAM_ABBREVIATION_DEN_h',
       'TEAM_ABBREVIATION_DET_h', 'TEAM_ABBREVIATION_GSW_h',
       'TEAM_ABBREVIATION_HOU_h', 'TEAM_ABBREVIATION_IND_h',
       'TEAM_ABBREVIATION_LAC_h', 'TEAM_ABBREVIATION_LAL_h',
       'TEAM_ABBREVIATION_MEM_h', 'TEAM_ABBREVIATION_MIA_h',
       'TEAM_ABBREVIATION_MIL_h', 'TEAM_ABBREVIATION_MIN_h',
       'TEAM_ABBREVIATION_NOP_h', 'TEAM_ABBREVIATION_NYK_h',
       'TEAM_ABBREVIATION_OKC_h', 'TEAM_ABBREVIATION_ORL_h',
       'TEAM_ABBREVIATION_PHI_h', 'TEAM_ABBREVIATION_PHX_h',
       'TEAM_ABBREVIATION_POR_h', 'TEAM_ABBREVIATION_SAC_h',
       'TEAM_ABBREVIATION_SAS_h', 'TEAM_ABBREVIATION_TOR_h',
       'TEAM_ABBREVIATION_UTA_h', 'TEAM_ABBREVIATION_WAS_h',
       'FG_PCT_rolling_h', 'FG3_PCT_rolling_h', 'FT_PCT_rolling_h',
       'OREB_roll

In [60]:
# X.to_pickle('X_basic_df_2022-09-01.pkl')

In [61]:
# y.to_pickle('y_basic_df_2022-09-01.pkl')

In [62]:
# X.to_pickle('X_basic_df_5yrs_preprocessed.pkl')

In [63]:
# y.to_pickle('y_basic_df_5yrs_preprocessed.pkl')

In [64]:
X.to_pickle('X_basic_rolling_df_5yrs_preprocessed.pkl')

In [65]:
y.to_pickle('y_basic_rolling_df_5yrs_preprocessed.pkl')