# Feature Engineering

In [2]:
import pandas as pd
import numpy as np

In [3]:
pd.set_option("display.max_columns", None)

In [10]:
all_games = pd.read_csv("data/games_clean.csv", index_col="id")

In [99]:
all_games.shape

(49834, 17)

In [11]:
all_games.head()

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,home_team.full_name,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division,visitor_team.full_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
47179,2019-01-30T00:00:00.000Z,126,4,False,2018,Final,94,2,BOS,East,Atlantic,Boston Celtics,4,CHA,East,Southeast,Charlotte Hornets
48751,2019-02-09T00:00:00.000Z,112,4,False,2018,Final,123,2,BOS,East,Atlantic,Boston Celtics,13,LAC,West,Pacific,LA Clippers
48739,2019-02-08T00:00:00.000Z,117,4,False,2018,Final,110,23,PHI,East,Atlantic,Philadelphia 76ers,8,DEN,West,Northwest,Denver Nuggets
48740,2019-02-08T00:00:00.000Z,119,4,False,2018,Final,106,30,WAS,East,Southeast,Washington Wizards,6,CLE,East,Central,Cleveland Cavaliers
48746,2019-02-08T00:00:00.000Z,102,4,False,2018,Final,96,26,SAC,West,Pacific,Sacramento Kings,16,MIA,East,Southeast,Miami Heat


In [12]:
# Feature engineering

g = all_games.copy()

g["date"] = pd.to_datetime(all_games["date"]).dt.tz_localize(None)

# creating target variable
g["winner"] = np.where(g["home_team_score"] > g["visitor_team_score"], 1, 0)

# creating avg points by team across all years (not sure how useful this will be)
home_avg_pts_map = g[["home_team.id", "home_team_score"]].groupby("home_team.id").mean().squeeze()
g["home_team_avg_score_historical"] = g["home_team.id"].map(home_avg_pts_map).round(1)

visitor_avg_pts_map = g[["visitor_team.id", "visitor_team_score"]].groupby("visitor_team.id").mean().squeeze()
g["visitor_team_avg_score_historical"] = g["visitor_team.id"].map(visitor_avg_pts_map).round(1)

In [13]:
# creating an identifier for the team in a specific season
g["home_team_id_year"] = (g["home_team.id"].astype(str) + " " + g["season"].astype(str)).values
g["visitor_team_id_year"] = (g["visitor_team.id"].astype(str) + " " + g["season"].astype(str)).values

In [14]:
# creating average pts feature
# average pts scored per game in that season
# based on if they are home or away because there is a statistically significant difference
home_avg_score_map = g[["home_team_id_year", "home_team_score"]].groupby(["home_team_id_year"]).mean().squeeze()
visitor_avg_score_map = g[["visitor_team_id_year", "visitor_team_score"]].groupby(["visitor_team_id_year"]).mean().squeeze()

g["home_team_avg_score"] = g["home_team_id_year"].map(home_avg_score_map).round(1)
g["visitor_team_avg_score"] = g["visitor_team_id_year"].map(visitor_avg_score_map).round(1) 

In [15]:
# create an average difference in pts feature
# will calculate on average how much a team wins or loses by
# this will hopefully benefit lower scoring teams that also play defensively
avg_score_diff = g[["home_team_id_year", "home_team_avg_score", "visitor_team_avg_score"]].groupby("home_team_id_year").mean()
avg_score_diff = avg_score_diff["home_team_avg_score"] - avg_score_diff["visitor_team_avg_score"]
g["home_avg_score_diff"] = g["home_team_id_year"].map(avg_score_diff)

avg_score_diff = g[["visitor_team_id_year", "visitor_team_avg_score", "home_team_avg_score"]].groupby("visitor_team_id_year").mean()
avg_score_diff = avg_score_diff["visitor_team_avg_score"] - avg_score_diff["home_team_avg_score"]
g["visitor_avg_score_diff"] = g["visitor_team_id_year"].map(avg_score_diff)

In [16]:
g

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,home_team.full_name,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division,visitor_team.full_name,winner,home_team_avg_score_historical,visitor_team_avg_score_historical,home_team_id_year,visitor_team_id_year,home_team_avg_score,visitor_team_avg_score,home_avg_score_diff,visitor_avg_score_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
47179,2019-01-30,126,4,False,2018,Final,94,2,BOS,East,Atlantic,Boston Celtics,4,CHA,East,Southeast,Charlotte Hornets,1,105.8,98.2,2 2018,4 2018,112.8,108.3,3.620000,-3.831707
48751,2019-02-09,112,4,False,2018,Final,123,2,BOS,East,Atlantic,Boston Celtics,13,LAC,West,Pacific,LA Clippers,0,105.8,100.7,2 2018,13 2018,112.8,113.1,3.620000,0.581818
48739,2019-02-08,117,4,False,2018,Final,110,23,PHI,East,Atlantic,Philadelphia 76ers,8,DEN,West,Northwest,Denver Nuggets,1,103.3,104.2,23 2018,8 2018,117.9,108.2,8.725532,-4.670213
48740,2019-02-08,119,4,False,2018,Final,106,30,WAS,East,Southeast,Washington Wizards,6,CLE,East,Central,Cleveland Cavaliers,1,103.4,98.3,30 2018,6 2018,116.4,103.8,7.429268,-8.419512
48746,2019-02-08,102,4,False,2018,Final,96,26,SAC,West,Pacific,Sacramento Kings,16,MIA,East,Southeast,Miami Heat,1,105.5,96.0,26 2018,16 2018,114.9,105.4,5.129268,-6.670732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128052,2021-03-03,111,4,False,2020,Final,114,6,CLE,East,Central,Cleveland Cavaliers,12,IND,East,Central,Indiana Pacers,0,102.3,100.1,6 2020,12 2020,107.5,117.7,-4.341667,5.432432
128036,2021-03-01,124,4,False,2020,Final,130,22,ORL,East,Southeast,Orlando Magic,7,DAL,West,Southwest,Dallas Mavericks,0,101.9,101.5,22 2020,7 2020,105.4,113.8,-5.902778,1.257500
128001,2021-02-25,111,4,False,2020,Final,97,23,PHI,East,Atlantic,Philadelphia 76ers,7,DAL,West,Southwest,Dallas Mavericks,1,103.3,101.5,23 2020,7 2020,117.3,113.8,6.072093,1.257500
127942,2021-02-17,99,4,False,2020,Final,120,4,CHA,East,Southeast,Charlotte Hornets,5,CHI,East,Central,Chicago Bulls,0,100.9,99.7,4 2020,5 2020,108.4,111.9,-2.677778,-0.113889


In [17]:
g.to_csv("data/games_with_features.csv")

In [18]:
feat_names = ["season", "winner", "home_team_avg_score", "visitor_team_avg_score"]
features = g.loc[:, feat_names]

features.to_csv("data/features.csv")

### Using stats

In [7]:
stats = pd.read_csv("data/all_stats_clean.csv", index_col="id")

In [8]:
stats

Unnamed: 0_level_0,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,fta,ftm,min,oreb,pf,pts,reb,stl,turnover,game.id,game.date,game.home_team_id,game.home_team_score,game.season,game.visitor_team_id,game.visitor_team_score,player.id,team.id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
1069008,0.0,1.0,2.0,0.200,5.0,1.0,0.333,9.0,3.0,0.000,0.0,0.0,20:08,0.0,0.0,7.0,2.0,0.0,2.0,45237,2019-01-17T00:00:00.000Z,4,114,2018,26,95,415.0,26
1069009,4.0,0.0,5.0,0.000,2.0,0.0,0.200,5.0,1.0,0.000,0.0,0.0,19:22,1.0,1.0,2.0,6.0,0.0,0.0,45237,2019-01-17T00:00:00.000Z,4,114,2018,26,95,49.0,26
1069010,4.0,1.0,5.0,0.000,0.0,0.0,0.667,6.0,4.0,0.000,0.0,0.0,27:24,6.0,2.0,8.0,11.0,3.0,2.0,45237,2019-01-17T00:00:00.000Z,4,114,2018,26,95,91.0,26
1069011,1.0,0.0,1.0,0.545,11.0,6.0,0.500,18.0,9.0,0.000,0.0,0.0,32:06,0.0,2.0,24.0,1.0,2.0,0.0,45237,2019-01-17T00:00:00.000Z,4,114,2018,26,95,210.0,26
1069012,8.0,1.0,5.0,0.000,2.0,0.0,0.400,10.0,4.0,0.667,3.0,2.0,30:30,0.0,4.0,10.0,5.0,1.0,4.0,45237,2019-01-17T00:00:00.000Z,4,114,2018,26,95,161.0,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286561,1.0,0.0,2.0,0.000,2.0,0.0,0.000,2.0,0.0,0.000,0.0,0.0,13:17,0.0,2.0,0.0,2.0,0.0,0.0,470197,2021-08-17T00:00:00.000Z,2,67,2021,26,100,17554006.0,26
7286573,0.0,1.0,2.0,20.000,5.0,1.0,16.700,6.0,1.0,100.000,2.0,2.0,11:04,0.0,0.0,5.0,2.0,0.0,0.0,470197,2021-08-17T00:00:00.000Z,2,67,2021,26,100,17895875.0,2
7286575,1.0,1.0,1.0,0.000,1.0,0.0,0.000,1.0,0.0,0.000,0.0,0.0,6:52,0.0,2.0,0.0,1.0,0.0,1.0,470197,2021-08-17T00:00:00.000Z,2,67,2021,26,100,666716.0,2
7286576,0.0,0.0,1.0,0.000,1.0,0.0,33.300,3.0,1.0,50.000,2.0,1.0,4:08,1.0,1.0,3.0,2.0,0.0,1.0,470197,2021-08-17T00:00:00.000Z,2,67,2021,26,100,17895732.0,2


In [9]:
# Convert game date to datetime
stats["game.date"] = pd.to_datetime(stats["game.date"]).dt.tz_localize(None)

# Convert string to timedelta
stats["min"] = [pd.Timedelta(minutes=int(time[0]), seconds=int(time[1])) for time in stats["min"].str.split(":").values]

# Create label
stats["winner"] = np.where(stats["game.home_team_score"].values > stats["game.visitor_team_score"].values, 1,0)
labels = stats[["game.id", "game.date", "game.season", "winner"]].groupby("game.id").first()

In [10]:
# split into 2 dataframes so that grouping data by game id doesn't
# group players of opposing teams
home_stats = stats[stats["team.id"].eq(stats["game.home_team_id"])]
away_stats = stats[stats["team.id"].eq(stats["game.visitor_team_id"])]

# define how to aggregate statistics when grouping player stats
agg_map = {"ast": "sum", 
           "blk": "sum", 
           "dreb": "sum", 
           "fg3_pct": "mean", 
           "fg3a": "sum", 
           "fg3m": "sum", 
           "fg_pct": "mean",
          "fga": "sum",
          "fgm": "sum",
          "ft_pct": "mean",
          "fta": "sum",
          "ftm": "sum",
          "min": "sum",
          "oreb": "sum",
          "pf": "sum",
          "pts": "sum",
          "reb": "sum",
          "stl": "sum",
          "turnover": "sum",
          "game.id": "first",
          "game.date": "first",
          "game.season": "first",
          "game.home_team_id": "first",
          "game.home_team_score": "first",
          "game.visitor_team_id": "first",
          "game.visitor_team_score": "first",
          "player.id": "first",
          "team.id": "first",}

# aggregate player stats to team stats
home_games = home_stats.groupby("game.id").agg(agg_map)
away_games = away_stats.groupby("game.id").agg(agg_map)

# player stats were aggregated to team stats, so player id doesn't make sense anymore
# team id isn't needed either
home_games.drop(["player.id", "team.id"], axis=1, inplace=True)
away_games.drop(["player.id", "team.id"], axis=1, inplace=True)

In [11]:
# sort rows by team and date to prepare for rolling average
home_games = home_games.sort_values(["game.home_team_id", "game.date"])
away_games = away_games.sort_values(["game.visitor_team_id", "game.date"])

In [12]:
# Only use the columns that refer to stats when calculating the rolling average
stats_cols = ["ast","blk","dreb","fg3_pct","fg3a","fg3m","fg_pct","fga","fgm","ft_pct","fta","ftm","oreb",
              "pf","pts","reb","stl","turnover"]

rolling_stats_home = pd.DataFrame()
for team in home_games["game.home_team_id"].unique():                       # for each team
    games = home_games[home_games["game.home_team_id"].eq(team)]            # get only home games for that team
    games_stats = games[stats_cols]                                         # Only use the columns that refer to stats when calculating the rolling average
    rolling_stats_home = rolling_stats_home.append(games_stats.rolling(window=20).mean())  # calculate the rolling average
    rolling_stats_home = rolling_stats_home.shift()                         # make the rolling average not include the current game
rolling_stats_home.dropna(inplace=True)


rolling_stats_away = pd.DataFrame()
for team in away_games["game.visitor_team_id"].unique():
    games = away_games[away_games["game.visitor_team_id"].eq(team)]
    games_stats = games[stats_cols]
    rolling_stats_away = rolling_stats_away.append(games_stats.rolling(window=20).mean())
    rolling_stats_away = rolling_stats_away.shift()  # make the rolling average not include the current game
rolling_stats_away.dropna(inplace=True)

In [20]:
# Rename columns in the home and away dataframes
rolling_stats_home.columns = ["home_" + col_name for col_name in rolling_stats_home.columns]
rolling_stats_away.columns = ["away_" + col_name for col_name in rolling_stats_away.columns]
# Now that columns are renamed they can be merged into the same dataframe
rolling_stats = pd.merge(rolling_stats_home, rolling_stats_away, on="game.id")

In [25]:
# NOTE: adding columns to the start of the dataframe will mess up this code
# adding columns to the end will not
rolling_stats_diff = pd.DataFrame(index=rolling_stats.index)
home_cols = rolling_stats.columns[0:18]
away_cols = rolling_stats.columns[18:36]

In [33]:
# Create a dataframe where each column is equal to the home value minus the away value
##### This should be altered so that percentages are divided by each other instead of subtracted
for col in zip(stats_cols, home_cols, away_cols):
    rolling_stats_diff[col[0]] = rolling_stats[col[1]] - rolling_stats[col[2]]

In [37]:
# Rename the columns
rolling_stats_diff.columns = ["diff_" + col_name for col_name in rolling_stats_away.columns]

# putting it all together in 1 dataframe
rolling_stats = pd.merge(rolling_stats, rolling_stats_diff, on="game.id") # adding diff to the rolling stats
rolling_stats = pd.merge(labels, rolling_stats, on="game.id")             # adding labels

In [39]:
rolling_stats.to_csv("data/all_stats_feats.csv")