# Feature Engineering

In [2]:
import pandas as pd
import numpy as np

In [3]:
pd.set_option("display.max_columns", None)

# Games data

This data is only used for EDA

In [10]:
all_games = pd.read_csv("data/games_clean.csv", index_col="id")

In [99]:
all_games.shape

(49834, 17)

In [11]:
all_games.head()

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,home_team.full_name,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division,visitor_team.full_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
47179,2019-01-30T00:00:00.000Z,126,4,False,2018,Final,94,2,BOS,East,Atlantic,Boston Celtics,4,CHA,East,Southeast,Charlotte Hornets
48751,2019-02-09T00:00:00.000Z,112,4,False,2018,Final,123,2,BOS,East,Atlantic,Boston Celtics,13,LAC,West,Pacific,LA Clippers
48739,2019-02-08T00:00:00.000Z,117,4,False,2018,Final,110,23,PHI,East,Atlantic,Philadelphia 76ers,8,DEN,West,Northwest,Denver Nuggets
48740,2019-02-08T00:00:00.000Z,119,4,False,2018,Final,106,30,WAS,East,Southeast,Washington Wizards,6,CLE,East,Central,Cleveland Cavaliers
48746,2019-02-08T00:00:00.000Z,102,4,False,2018,Final,96,26,SAC,West,Pacific,Sacramento Kings,16,MIA,East,Southeast,Miami Heat


In [12]:
# Feature engineering

g = all_games.copy()

g["date"] = pd.to_datetime(all_games["date"]).dt.tz_localize(None)

# creating target variable
g["winner"] = np.where(g["home_team_score"] > g["visitor_team_score"], 1, 0)

# creating avg points by team across all years (not sure how useful this will be)
home_avg_pts_map = g[["home_team.id", "home_team_score"]].groupby("home_team.id").mean().squeeze()
g["home_team_avg_score_historical"] = g["home_team.id"].map(home_avg_pts_map).round(1)

visitor_avg_pts_map = g[["visitor_team.id", "visitor_team_score"]].groupby("visitor_team.id").mean().squeeze()
g["visitor_team_avg_score_historical"] = g["visitor_team.id"].map(visitor_avg_pts_map).round(1)

In [13]:
# creating an identifier for the team in a specific season
g["home_team_id_year"] = (g["home_team.id"].astype(str) + " " + g["season"].astype(str)).values
g["visitor_team_id_year"] = (g["visitor_team.id"].astype(str) + " " + g["season"].astype(str)).values

In [14]:
# creating average pts feature
# average pts scored per game in that season
# based on if they are home or away because there is a statistically significant difference
home_avg_score_map = g[["home_team_id_year", "home_team_score"]].groupby(["home_team_id_year"]).mean().squeeze()
visitor_avg_score_map = g[["visitor_team_id_year", "visitor_team_score"]].groupby(["visitor_team_id_year"]).mean().squeeze()

g["home_team_avg_score"] = g["home_team_id_year"].map(home_avg_score_map).round(1)
g["visitor_team_avg_score"] = g["visitor_team_id_year"].map(visitor_avg_score_map).round(1) 

In [15]:
# create an average difference in pts feature
# will calculate on average how much a team wins or loses by
# this will hopefully benefit lower scoring teams that also play defensively
avg_score_diff = g[["home_team_id_year", "home_team_avg_score", "visitor_team_avg_score"]].groupby("home_team_id_year").mean()
avg_score_diff = avg_score_diff["home_team_avg_score"] - avg_score_diff["visitor_team_avg_score"]
g["home_avg_score_diff"] = g["home_team_id_year"].map(avg_score_diff)

avg_score_diff = g[["visitor_team_id_year", "visitor_team_avg_score", "home_team_avg_score"]].groupby("visitor_team_id_year").mean()
avg_score_diff = avg_score_diff["visitor_team_avg_score"] - avg_score_diff["home_team_avg_score"]
g["visitor_avg_score_diff"] = g["visitor_team_id_year"].map(avg_score_diff)

In [16]:
g

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,home_team.full_name,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division,visitor_team.full_name,winner,home_team_avg_score_historical,visitor_team_avg_score_historical,home_team_id_year,visitor_team_id_year,home_team_avg_score,visitor_team_avg_score,home_avg_score_diff,visitor_avg_score_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
47179,2019-01-30,126,4,False,2018,Final,94,2,BOS,East,Atlantic,Boston Celtics,4,CHA,East,Southeast,Charlotte Hornets,1,105.8,98.2,2 2018,4 2018,112.8,108.3,3.620000,-3.831707
48751,2019-02-09,112,4,False,2018,Final,123,2,BOS,East,Atlantic,Boston Celtics,13,LAC,West,Pacific,LA Clippers,0,105.8,100.7,2 2018,13 2018,112.8,113.1,3.620000,0.581818
48739,2019-02-08,117,4,False,2018,Final,110,23,PHI,East,Atlantic,Philadelphia 76ers,8,DEN,West,Northwest,Denver Nuggets,1,103.3,104.2,23 2018,8 2018,117.9,108.2,8.725532,-4.670213
48740,2019-02-08,119,4,False,2018,Final,106,30,WAS,East,Southeast,Washington Wizards,6,CLE,East,Central,Cleveland Cavaliers,1,103.4,98.3,30 2018,6 2018,116.4,103.8,7.429268,-8.419512
48746,2019-02-08,102,4,False,2018,Final,96,26,SAC,West,Pacific,Sacramento Kings,16,MIA,East,Southeast,Miami Heat,1,105.5,96.0,26 2018,16 2018,114.9,105.4,5.129268,-6.670732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128052,2021-03-03,111,4,False,2020,Final,114,6,CLE,East,Central,Cleveland Cavaliers,12,IND,East,Central,Indiana Pacers,0,102.3,100.1,6 2020,12 2020,107.5,117.7,-4.341667,5.432432
128036,2021-03-01,124,4,False,2020,Final,130,22,ORL,East,Southeast,Orlando Magic,7,DAL,West,Southwest,Dallas Mavericks,0,101.9,101.5,22 2020,7 2020,105.4,113.8,-5.902778,1.257500
128001,2021-02-25,111,4,False,2020,Final,97,23,PHI,East,Atlantic,Philadelphia 76ers,7,DAL,West,Southwest,Dallas Mavericks,1,103.3,101.5,23 2020,7 2020,117.3,113.8,6.072093,1.257500
127942,2021-02-17,99,4,False,2020,Final,120,4,CHA,East,Southeast,Charlotte Hornets,5,CHI,East,Central,Chicago Bulls,0,100.9,99.7,4 2020,5 2020,108.4,111.9,-2.677778,-0.113889


In [17]:
g.to_csv("data/games_with_features.csv")

### Stats data

This data is used to build the model

In [54]:
stats = pd.read_csv("data/stats_clean.csv", index_col="id")

Convert the date column to datetime format so the data can be sorted by date.  

Convert the minutes column to timedelta so that they can be added together. (The rows of data are stats put up by a specific player in a specific game. These stats from these rows will be aggregated... grouping by team and game. This will result in the stats that the TEAM put up for that game, which will be useful because we are predicting which TEAM will win the game.)

In [43]:
# Convert game date to datetime
stats["game.date"] = pd.to_datetime(stats["game.date"]).dt.tz_localize(None)

# Convert string to timedelta
stats["min"] = [pd.Timedelta(minutes=int(time[0]), seconds=int(time[1])) for time in stats["min"].str.split(":").values]

# Create target variable & a "labels" dataframe
stats["winner"] = np.where(stats["game.home_team_score"].values > stats["game.visitor_team_score"].values, 1,0)
labels = stats[["game.id", "game.date", "game.season", "winner"]].groupby("game.id").first()

This code block aggregates the individual player statistics into team statistics

In [44]:
# split into 2 dataframes so that grouping data by game id doesn't
# group players of opposing teams
home_stats = stats[stats["team.id"].eq(stats["game.home_team_id"])]
away_stats = stats[stats["team.id"].eq(stats["game.visitor_team_id"])]

# define how to aggregate statistics when grouping player stats
agg_map = {"ast": "sum", 
           "blk": "sum", 
           "dreb": "sum", 
           "fg3_pct": "mean", 
           "fg3a": "sum", 
           "fg3m": "sum", 
           "fg_pct": "mean",
          "fga": "sum",
          "fgm": "sum",
          "ft_pct": "mean",
          "fta": "sum",
          "ftm": "sum",
          "min": "sum",
          "oreb": "sum",
          "pf": "sum",
          "pts": "sum",
          "reb": "sum",
          "stl": "sum",
          "turnover": "sum",
          "game.id": "first",
          "game.date": "first",
          "game.season": "first",
          "game.home_team_id": "first",
          "game.home_team_score": "first",
          "game.visitor_team_id": "first",
          "game.visitor_team_score": "first",
          "player.id": "first",
          "team.id": "first",}

# aggregate player stats to team stats
home_games = home_stats.groupby("game.id").agg(agg_map)
away_games = away_stats.groupby("game.id").agg(agg_map)

# player stats were aggregated to team stats, so player id doesn't make sense anymore
# team id isn't needed either
home_games.drop(["player.id", "team.id"], axis=1, inplace=True)
away_games.drop(["player.id", "team.id"], axis=1, inplace=True)

To predict which team will win a game, the model will look at the average stats that a team put up in their 20 most recent games.*  (called the rolling average from here on out)   

*Their 20 most recent home games if the game being predicted is a home game   
*Their 20 most recent away games if the game being predicted is an away game

In [45]:
# sort rows by team and date to prepare for the rolling average
home_games = home_games.sort_values(["game.home_team_id", "game.date"])
away_games = away_games.sort_values(["game.visitor_team_id", "game.date"])

In [46]:
# Only use the columns that refer to stats when calculating the rolling average
stats_cols = ["ast","blk","dreb","fg3_pct","fg3a","fg3m","fg_pct","fga","fgm","ft_pct","fta","ftm","oreb",
              "pf","pts","reb","stl","turnover"]

rolling_stats_home = pd.DataFrame()
for team in home_games["game.home_team_id"].unique():                       # for each team
    games = home_games[home_games["game.home_team_id"].eq(team)]            # get only home games for that team
    games_stats = games[stats_cols]                                         # Only use the columns that refer to stats when calculating the rolling average
    rolling_stats_home = rolling_stats_home.append(games_stats.rolling(window=20).mean())  # calculate the rolling average (20 game average)
    rolling_stats_home = rolling_stats_home.shift()                         # make the rolling average not include the current game
rolling_stats_home.dropna(inplace=True)


rolling_stats_away = pd.DataFrame()
for team in away_games["game.visitor_team_id"].unique():
    games = away_games[away_games["game.visitor_team_id"].eq(team)]
    games_stats = games[stats_cols]
    rolling_stats_away = rolling_stats_away.append(games_stats.rolling(window=20).mean())
    rolling_stats_away = rolling_stats_away.shift()  # make the rolling average not include the current game
rolling_stats_away.dropna(inplace=True)

In [47]:
# Rename columns in the home and away dataframes
rolling_stats_home.columns = ["home_" + col_name for col_name in rolling_stats_home.columns]
rolling_stats_away.columns = ["away_" + col_name for col_name in rolling_stats_away.columns]
# Now that columns are renamed they can be merged into the same dataframe
rolling_stats = pd.merge(rolling_stats_home, rolling_stats_away, on="game.id")

In [48]:
# NOTE: adding columns to the start of the dataframe will mess up this code
# adding columns to the end will not
rolling_stats_diff = pd.DataFrame(index=rolling_stats.index)
home_cols = rolling_stats.columns[0:18]
away_cols = rolling_stats.columns[18:36]

In [49]:
# Create a dataframe where each column is equal to the home value minus the away value
##### This should be altered so that percentages are divided by each other instead of subtracted
for col in zip(stats_cols, home_cols, away_cols):
    rolling_stats_diff[col[0]] = rolling_stats[col[1]] - rolling_stats[col[2]]

In [50]:
# Rename the columns
rolling_stats_diff.columns = ["diff_" + col_name for col_name in rolling_stats_away.columns]

# putting it all together in 1 dataframe
rolling_stats = pd.merge(rolling_stats, rolling_stats_diff, on="game.id") # adding diff to the rolling stats
rolling_stats = pd.merge(labels, rolling_stats, on="game.id")             # adding labels

In [52]:
rolling_stats.head()

Unnamed: 0_level_0,game.date,game.season,winner,home_ast,home_blk,home_dreb,home_fg3_pct,home_fg3a,home_fg3m,home_fg_pct,home_fga,home_fgm,home_ft_pct,home_fta,home_ftm,home_oreb,home_pf,home_pts,home_reb,home_stl,home_turnover,away_ast,away_blk,away_dreb,away_fg3_pct,away_fg3a,away_fg3m,away_fg_pct,away_fga,away_fgm,away_ft_pct,away_fta,away_ftm,away_oreb,away_pf,away_pts,away_reb,away_stl,away_turnover,diff_away_ast,diff_away_blk,diff_away_dreb,diff_away_fg3_pct,diff_away_fg3a,diff_away_fg3m,diff_away_fg_pct,diff_away_fga,diff_away_fgm,diff_away_ft_pct,diff_away_fta,diff_away_ftm,diff_away_oreb,diff_away_pf,diff_away_pts,diff_away_reb,diff_away_stl,diff_away_turnover
game.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
1,2018-10-16,2018,1,23.4,5.35,36.05,0.283508,32.1,11.6,0.407498,85.3,38.1,0.392525,21.45,16.35,8.8,20.2,104.15,44.85,7.05,12.9,25.6,4.75,34.2,0.242414,29.45,10.85,0.451425,83.5,39.2,0.477155,24.25,18.2,11.1,21.15,107.45,45.3,8.0,15.55,-2.2,0.6,1.85,0.041094,2.65,0.75,-0.043927,1.8,-1.1,-0.084629,-2.8,-1.85,-2.3,-0.95,-3.3,-0.45,-0.95,-2.65
2,2018-10-16,2018,1,31.55,7.6,34.45,0.186014,27.6,11.35,0.51133,83.35,43.5,0.385137,20.15,16.35,8.6,18.85,114.7,43.05,7.65,15.25,22.55,5.6,32.8,0.235524,31.2,11.2,0.406164,90.1,41.5,0.353259,22.6,16.2,12.5,20.05,110.4,45.3,7.8,12.1,9.0,2.0,1.65,-0.04951,-3.6,0.15,0.105166,-6.75,2.0,0.031878,-2.45,0.15,-3.9,-1.2,4.3,-2.25,-0.15,3.15
3,2018-10-17,2018,0,21.35,5.15,35.85,0.211261,26.7,9.45,0.415333,87.05,38.7,0.515664,27.4,21.25,9.75,17.05,108.1,45.6,6.45,11.35,21.3,4.65,32.0,0.226351,25.3,8.6,0.398101,82.05,37.8,0.433145,23.85,18.7,9.5,21.55,102.9,41.5,8.8,14.3,0.05,0.5,3.85,-0.015089,1.4,0.85,0.017233,5.0,0.9,0.082519,3.55,2.55,0.25,-4.5,5.2,4.1,-2.35,-2.95
4,2018-10-17,2018,1,23.15,3.4,33.55,0.239457,28.45,11.5,0.431416,84.55,38.95,0.400628,20.05,14.95,9.3,17.85,104.35,42.85,7.9,13.3,21.6,4.1,35.45,0.249849,31.6,10.25,0.425655,85.15,37.45,0.506771,25.15,19.7,9.15,20.65,104.85,44.6,7.3,16.6,1.55,-0.7,-1.9,-0.010392,-3.15,1.25,0.005761,-0.6,1.5,-0.106143,-5.1,-4.75,0.15,-2.8,-0.5,-1.75,0.6,-3.3
5,2018-10-17,2018,1,22.55,4.25,33.2,0.291932,25.8,10.0,0.45345,84.7,41.25,0.432828,20.2,15.3,9.4,18.8,107.8,42.6,7.9,13.0,20.55,4.25,30.0,0.27658,25.85,9.4,0.433132,81.05,35.85,0.429151,20.3,16.55,9.6,23.85,97.65,39.6,6.75,14.5,2.0,0.0,3.2,0.015352,-0.05,0.6,0.020318,3.65,5.4,0.003677,-0.1,-1.25,-0.2,-5.05,10.15,3.0,1.15,-1.5


In [39]:
rolling_stats.to_csv("data/stats_feats.csv")