# Feature Engineering

In [121]:
import pandas as pd
import numpy as np

In [122]:
pd.set_option("display.max_columns", None)

In [123]:
all_games = pd.read_csv("data/games_clean.csv", index_col="id")

In [124]:
all_games.head()

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,home_team.full_name,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division,visitor_team.full_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
47179,2019-01-30T00:00:00.000Z,126,4,False,2018,Final,94,2,BOS,East,Atlantic,Boston Celtics,4,CHA,East,Southeast,Charlotte Hornets
48751,2019-02-09T00:00:00.000Z,112,4,False,2018,Final,123,2,BOS,East,Atlantic,Boston Celtics,13,LAC,West,Pacific,LA Clippers
48739,2019-02-08T00:00:00.000Z,117,4,False,2018,Final,110,23,PHI,East,Atlantic,Philadelphia 76ers,8,DEN,West,Northwest,Denver Nuggets
48740,2019-02-08T00:00:00.000Z,119,4,False,2018,Final,106,30,WAS,East,Southeast,Washington Wizards,6,CLE,East,Central,Cleveland Cavaliers
48746,2019-02-08T00:00:00.000Z,102,4,False,2018,Final,96,26,SAC,West,Pacific,Sacramento Kings,16,MIA,East,Southeast,Miami Heat


In [128]:
# Feature engineering

g = all_games.copy()

g["date"] = pd.to_datetime(games["date"]).dt.tz_localize(None)

# creating target variable
g["winner"] = np.where(g["home_team_score"] > g["visitor_team_score"], 1, 0)

# creating avg points by team across all years (not sure how useful this will be)
home_avg_pts_map = g[["home_team.id", "home_team_score"]].groupby("home_team.id").mean().squeeze()
g["home_team_avg_score_historical"] = g["home_team.id"].map(home_avg_pts_map).round(1)

visitor_avg_pts_map = g[["visitor_team.id", "visitor_team_score"]].groupby("visitor_team.id").mean().squeeze()
g["visitor_team_avg_score_historical"] = g["visitor_team.id"].map(visitor_avg_pts_map).round(1)

In [205]:
# creating an identifier for the team in a specific season
g["home_team_id_year"] = (g["home_team.id"].astype(str) + " " + g["season"].astype(str)).values
g["visitor_team_id_year"] = (g["visitor_team.id"].astype(str) + " " + g["season"].astype(str)).values

In [206]:
# creating average pts feature
# average pts scored per game in that season
# based on if they are home or away because there is a statistically significant difference
home_avg_score_map = g[["home_team_id_year", "home_team_score"]].groupby(["home_team_id_year"]).mean().squeeze()
visitor_avg_score_map = g[["visitor_team_id_year", "visitor_team_score"]].groupby(["visitor_team_id_year"]).mean().squeeze()

g["home_team_avg_score"] = g["home_team_id_year"].map(home_avg_score_map).round(1)
g["visitor_team_avg_score"] = g["visitor_team_id_year"].map(visitor_avg_score_map).round(1) 

In [220]:
# create an average difference in pts feature
# will calculate on average how much a team wins or loses by
# this will hopefully benefit lower scoring teams that also play defensively
avg_score_diff = g[["home_team_id_year", "home_team_avg_score", "visitor_team_avg_score"]].groupby("home_team_id_year").mean()
avg_score_diff = avg_score_diff["home_team_avg_score"] - avg_score_diff["visitor_team_avg_score"]
g["home_avg_score_diff"] = g["home_team_id_year"].map(avg_score_diff)

avg_score_diff = g[["visitor_team_id_year", "visitor_team_avg_score", "home_team_avg_score"]].groupby("visitor_team_id_year").mean()
avg_score_diff = avg_score_diff["visitor_team_avg_score"] - avg_score_diff["home_team_avg_score"]
g["visitor_avg_score_diff"] = g["visitor_team_id_year"].map(avg_score_diff)

In [221]:
g

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,home_team.full_name,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division,visitor_team.full_name,winner,home_team_avg_score_historical,visitor_team_avg_score_historical,home_team_id_year,visitor_team_id_year,home_team_avg_score,visitor_team_avg_score,home_avg_score_diff,visitor_avg_score_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
47179,2019-01-30,126,4,False,2018,Final,94,2,BOS,East,Atlantic,Boston Celtics,4,CHA,East,Southeast,Charlotte Hornets,1,105.8,98.2,2 2018,4 2018,112.8,108.3,3.620000,-3.831707
48751,2019-02-09,112,4,False,2018,Final,123,2,BOS,East,Atlantic,Boston Celtics,13,LAC,West,Pacific,LA Clippers,0,105.8,100.7,2 2018,13 2018,112.8,113.1,3.620000,0.581818
48739,2019-02-08,117,4,False,2018,Final,110,23,PHI,East,Atlantic,Philadelphia 76ers,8,DEN,West,Northwest,Denver Nuggets,1,103.3,104.2,23 2018,8 2018,117.9,108.2,8.725532,-4.670213
48740,2019-02-08,119,4,False,2018,Final,106,30,WAS,East,Southeast,Washington Wizards,6,CLE,East,Central,Cleveland Cavaliers,1,103.4,98.3,30 2018,6 2018,116.4,103.8,7.429268,-8.419512
48746,2019-02-08,102,4,False,2018,Final,96,26,SAC,West,Pacific,Sacramento Kings,16,MIA,East,Southeast,Miami Heat,1,105.5,96.0,26 2018,16 2018,114.9,105.4,5.129268,-6.670732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128052,2021-03-03,111,4,False,2020,Final,114,6,CLE,East,Central,Cleveland Cavaliers,12,IND,East,Central,Indiana Pacers,0,102.3,100.1,6 2020,12 2020,107.5,117.7,-4.341667,5.432432
128036,2021-03-01,124,4,False,2020,Final,130,22,ORL,East,Southeast,Orlando Magic,7,DAL,West,Southwest,Dallas Mavericks,0,101.9,101.5,22 2020,7 2020,105.4,113.8,-5.902778,1.257500
128001,2021-02-25,111,4,False,2020,Final,97,23,PHI,East,Atlantic,Philadelphia 76ers,7,DAL,West,Southwest,Dallas Mavericks,1,103.3,101.5,23 2020,7 2020,117.3,113.8,6.072093,1.257500
127942,2021-02-17,99,4,False,2020,Final,120,4,CHA,East,Southeast,Charlotte Hornets,5,CHI,East,Central,Chicago Bulls,0,100.9,99.7,4 2020,5 2020,108.4,111.9,-2.677778,-0.113889


In [222]:
g.to_csv("data/games_with_features.csv")

In [86]:
feat_names = ["season", "winner", "home_team_avg_score", "visitor_team_avg_score"]
features = g.loc[:, feat_names]

features.to_csv("data/features.csv")

### Using stats

In [469]:
stats = pd.read_csv("data/stats2020_clean.csv", index_col="id")

In [470]:
stats["game_date"] = pd.to_datetime(stats["game_date"]).dt.tz_localize(None)

stats["min"] = [pd.Timedelta(minutes=int(time[0]), seconds=int(time[1])) for time in stats["min"].str.split(":").values]

stats["winner"] = np.where(stats["home_team_score"].values > stats["away_team_score"].values, 1,0)
label = stats[["game_id", "winner"]].groupby("game_id").first()

In [471]:
# split into 2 dataframes so that grouping data by game id doesn't
# group players of opposing teams
home_stats = stats[stats["player_team_id"].eq(stats["home_team_id"])]
away_stats = stats[stats["player_team_id"].eq(stats["away_team_id"])]

# define how to aggregate statistics when grouping player stats
agg_map = {"ast": "sum", 
           "blk": "sum", 
           "dreb": "sum", 
           "fg3_pct": "mean", 
           "fg3a": "sum", 
           "fg3m": "sum", 
           "fg_pct": "mean",
          "fga": "sum",
          "fgm": "sum",
          "ft_pct": "mean",
          "fta": "sum",
          "ftm": "sum",
          "min": "sum",
          "oreb": "sum",
          "pf": "sum",
          "pts": "sum",
          "reb": "sum",
          "stl": "sum",
          "turnover": "sum",
          "player_id": "first",
          "player_team_id": "first",
          "game_id": "first",
          "game_date": "first",
          "season": "first",
          "home_team_id": "first",
          "home_team_score": "first",
          "away_team_id": "first",
          "away_team_score": "first"}

# aggregate player stats to team stats
home_games = home_stats.groupby("game_id").agg(agg_map)
away_games = away_stats.groupby("game_id").agg(agg_map)

# player stats were aggregated to team stats, so player id doesn't make sense anymore
home_games.drop(["player_id", "player_team_id"], axis=1, inplace=True)
away_games.drop(["player_id", "player_team_id"], axis=1, inplace=True)

In [472]:
# sort rows by team and date to prepare for rolling average
home_games = home_games.sort_values(["home_team_id", "game_date"])
away_games = away_games.sort_values(["away_team_id", "game_date"])

stats_cols = ["ast","blk","dreb","fg3_pct","fg3a","fg3m","fg_pct","fga","fgm","ft_pct","fta","ftm","oreb",
              "pf","pts","reb","stl","turnover"]
rolling_stats_home = pd.DataFrame()
for team in home_games["home_team_id"].unique():
    games = home_games[home_games["home_team_id"].eq(team)]
    stats = games[stats_cols]
    rolling_stats_home = rolling_stats_home.append(stats.rolling(window=2).mean())
rolling_stats_home.dropna(inplace=True)
    
rolling_stats_away = pd.DataFrame()
for team in away_games["away_team_id"].unique():
    games = away_games[away_games["away_team_id"].eq(team)]
    stats = games[stats_cols]
    rolling_stats_away = rolling_stats_away.append(stats.rolling(window=2).mean())
rolling_stats_away.dropna(inplace=True)

rolling_stats_home.columns = ["home_" + col_name for col_name in rolling_stats_home.columns]
rolling_stats_away.columns = ["away_" + col_name for col_name in rolling_stats_away.columns]

rolling_stats = pd.merge(rolling_stats_home, rolling_stats_away, on="game_id")

rolling_stats = pd.merge(rolling_stats, label, on="game_id")

In [473]:
stats_cols = ["ast","blk","dreb","fg3_pct","fg3a","fg3m","fg_pct","fga","fgm","ft_pct","fta","ftm","oreb",
              "pf","pts","reb","stl","turnover"]
home_cols = rolling_stats.columns[:18]
away_cols = rolling_stats.columns[18:-1]

rolling_stats_diff = pd.DataFrame(index=rolling_stats.index)

##### This should be altered to that percentages are divided by each other instead of subtracted
for col in zip(stats_cols, home_cols, away_cols):
    rolling_stats_diff[col[0]] = rolling_stats[col[1]] - rolling_stats[col[2]]

rolling_stats_diff = pd.merge(rolling_stats_diff, label, on="game_id")

In [476]:
rolling_stats.to_csv("data/stats_feats2020.csv")
rolling_stats_diff.to_csv("data/stats_diff2020.csv")

In [474]:
rolling_stats_diff

Unnamed: 0_level_0,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,fta,ftm,oreb,pf,pts,reb,stl,turnover,winner
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
127581,5.5,2.0,3.5,8.837500,11.0,6.0,11.103333,-3.0,0.0,-4.365833,5.0,6.0,-2.0,-0.5,12.0,1.5,-0.5,-0.5,0
127596,-1.5,-1.0,-6.0,-6.924583,7.5,1.0,-17.396528,2.0,-5.0,-9.448889,2.0,-1.0,2.0,2.0,-10.0,-4.0,0.0,1.0,0
127611,-16.5,-7.5,-21.0,-9.833838,-21.5,-10.0,-1.211919,-37.0,-14.0,-21.804747,-18.0,-13.0,-1.0,-8.0,-51.0,-22.0,-4.5,-8.0,0
127655,1.0,-7.0,2.5,-2.522348,11.0,3.5,-5.400429,8.0,-2.5,-3.364217,6.0,3.0,6.5,-2.0,1.5,9.0,2.5,-4.0,1
127706,3.0,0.5,11.0,7.287500,-3.0,0.0,-1.423889,-3.0,-0.5,-2.741667,1.5,4.0,-1.5,-5.0,3.0,9.5,-2.5,-0.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264689,2.5,-3.0,-3.0,-23.378322,-10.0,-10.0,-3.579487,-0.5,0.0,-8.148873,-1.0,-1.5,1.0,0.0,-11.5,-2.0,0.5,-2.5,1
264801,9.5,1.5,5.0,5.186869,-20.5,-4.0,9.188889,-1.0,15.5,-14.614646,-1.5,-2.0,-3.0,3.0,25.0,2.0,0.5,5.5,1
423334,-4.0,2.5,10.0,-7.229004,-9.5,-3.0,6.276714,-5.5,4.5,-5.629365,4.0,2.5,-1.0,1.0,8.5,9.0,-3.0,1.0,1
430019,0.5,3.0,3.5,-1.927976,2.5,-1.0,-3.298810,12.0,2.5,12.477976,5.5,5.5,2.5,-2.5,9.5,6.0,-1.5,-2.0,0


In [475]:
rolling_stats

Unnamed: 0_level_0,home_ast,home_blk,home_dreb,home_fg3_pct,home_fg3a,home_fg3m,home_fg_pct,home_fga,home_fgm,home_ft_pct,home_fta,home_ftm,home_oreb,home_pf,home_pts,home_reb,home_stl,home_turnover,away_ast,away_blk,away_dreb,away_fg3_pct,away_fg3a,away_fg3m,away_fg_pct,away_fga,away_fgm,away_ft_pct,away_fta,away_ftm,away_oreb,away_pf,away_pts,away_reb,away_stl,away_turnover,winner
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
127581,26.0,6.0,35.0,31.380000,37.0,15.5,44.785000,83.5,37.5,35.415000,25.5,19.0,11.5,21.0,109.5,46.5,7.0,16.5,20.5,4.0,31.5,22.542500,26.0,9.5,33.681667,86.5,37.5,39.780833,20.5,13.0,13.5,21.5,97.5,45.0,7.5,17.0,0
127596,23.0,3.5,31.5,21.276111,31.5,10.5,36.180556,86.0,37.5,29.676111,21.0,14.0,12.5,20.5,99.5,44.0,6.5,16.0,24.5,4.5,37.5,28.200694,24.0,9.5,53.577083,84.0,42.5,39.125000,19.0,15.0,10.5,18.5,109.5,48.0,6.5,15.0,0
127611,24.5,1.5,37.5,14.911111,35.5,8.5,34.885556,91.5,38.0,40.331111,20.5,16.5,11.0,23.0,101.0,48.5,5.0,12.5,41.0,9.0,58.5,24.744949,57.0,18.5,36.097475,128.5,52.0,62.135859,38.5,29.5,12.0,31.0,152.0,70.5,9.5,20.5,0
127655,22.5,2.0,44.5,17.095833,42.0,12.0,32.342500,97.5,36.5,43.390833,22.5,18.0,13.0,19.0,103.0,57.5,6.5,10.5,21.5,9.0,42.0,19.618182,31.0,8.5,37.742929,89.5,39.0,46.755051,16.5,15.0,6.5,21.0,101.5,48.5,4.0,14.5,1
127706,26.0,5.5,42.5,28.712500,36.0,14.0,36.940278,89.0,37.0,43.320833,26.0,22.0,10.5,15.0,110.0,53.0,11.0,16.0,23.0,5.0,31.5,21.425000,39.0,14.0,38.364167,92.0,37.5,46.062500,24.5,18.0,12.0,20.0,107.0,43.5,13.5,16.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264689,37.0,5.5,37.5,27.480769,20.0,7.5,50.985664,96.5,55.0,44.748601,22.5,17.5,9.5,19.5,135.0,47.0,8.0,14.5,34.5,8.5,40.5,50.859091,30.0,17.5,54.565152,97.0,55.0,52.897475,23.5,19.0,8.5,19.5,146.5,49.0,7.5,17.0,1
264801,36.0,5.0,39.0,23.559091,26.5,9.5,47.600000,97.0,53.5,40.940909,22.5,18.0,9.5,19.5,134.5,48.5,7.0,15.0,26.5,3.5,34.0,18.372222,47.0,13.5,38.411111,98.0,38.0,55.555556,24.0,20.0,12.5,16.5,109.5,46.5,6.5,9.5,1
423334,28.0,6.0,44.0,23.087662,29.5,12.0,47.025325,92.0,49.0,34.767857,23.5,18.5,8.5,20.0,128.5,52.5,5.5,14.5,32.0,3.5,34.0,30.316667,39.0,15.0,40.748611,97.5,44.5,40.397222,19.5,16.0,9.5,19.0,120.0,43.5,8.5,13.5,1
430019,26.0,6.0,36.5,23.332738,31.5,11.0,46.640476,94.5,46.0,42.363690,25.5,19.5,10.0,19.0,122.5,46.5,6.0,11.5,25.5,3.0,33.0,25.260714,29.0,12.0,49.939286,82.5,43.5,29.885714,20.0,14.0,7.5,21.5,113.0,40.5,7.5,13.5,0
