## Feature Engineering for Model Training

Prior to training the model, we want to conduct some feature engineering so that the model inputs are more reliable.

In [12]:
import numpy as np
import pandas as pd

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

### Read data and drop/convert columns

In [13]:
# Load the cleaned per-game stats joined with salary data.
stats_salaries_path = "../data/clean/stats_salaries.csv"
df = pd.read_csv(stats_salaries_path)
df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,START_POSITION,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,TEAM_ABBREVIATION,TEAM_NAME,GAME_DATE_EST,SEASON_START,WON_GAME,SALARY,INFLATION_ADJ_SALARY,TEAM_PAYROLL,INFLATION_ADJ_TEAM_PAYROLL,LEAGUE_PAYROLL,INFLATION_ADJ_LEAGUE_PAYROLL,TEAM_IMPORTANCE,LEAGUE_IMPORTANCE,TEAM_MARKET_SIZE
0,1630162,Anthony Edwards,F,36:22,4.0,10.0,0.400,3.0,8.0,0.375,4.0,4.0,1.000,0.0,8.0,8.0,5.0,3.0,1.0,1.0,1.0,15.0,5.0,MIN,Minnesota Timberwolves,2022-03-12,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235
1,1630162,Anthony Edwards,F,34:27,9.0,19.0,0.474,4.0,11.0,0.364,3.0,3.0,1.000,0.0,3.0,3.0,5.0,1.0,0.0,0.0,3.0,25.0,-3.0,MIN,Minnesota Timberwolves,2022-03-11,2021,0,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235
2,1630162,Anthony Edwards,F,25:29,7.0,15.0,0.467,2.0,8.0,0.250,0.0,0.0,0.000,0.0,1.0,1.0,3.0,2.0,1.0,4.0,2.0,16.0,7.0,MIN,Minnesota Timberwolves,2022-03-09,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235
3,1630162,Anthony Edwards,F,32:22,7.0,13.0,0.538,1.0,5.0,0.200,2.0,2.0,1.000,2.0,2.0,4.0,4.0,4.0,0.0,4.0,5.0,17.0,7.0,MIN,Minnesota Timberwolves,2022-02-28,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235
4,1630162,Anthony Edwards,F,37:46,5.0,13.0,0.385,1.0,6.0,0.167,4.0,6.0,0.667,1.0,2.0,3.0,5.0,1.0,1.0,2.0,3.0,15.0,-21.0,MIN,Minnesota Timberwolves,2022-02-25,2021,0,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492307,203129,Tornike Shengelia,,4:59,0.0,1.0,0.000,0.0,1.0,0.000,2.0,2.0,1.000,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,-1.0,BKN,Brooklyn Nets,2013-01-08,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476
492308,203129,Tornike Shengelia,,1:36,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,BKN,Brooklyn Nets,2013-01-02,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476
492309,203129,Tornike Shengelia,,3:20,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,-4.0,BKN,Brooklyn Nets,2012-12-23,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476
492310,202345,Damion James,,0:17,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BKN,Brooklyn Nets,2013-01-15,2012,1,50258.0,59504,87644649,103768982,2015938307,2386818417,0.000573,0.000025,0.043476


It doesn't look like we need the `PLAYER_NAME` or `TEAM_ABBREVIATION` columns for training (as we already have `PLAYER_ID` and `TEAM_NAME`), so we can drop them.

In [14]:
# PLAYER_ID and TEAM_NAME already identify the player and team,
# so the human-readable duplicates are removed.
redundant_cols = ['PLAYER_NAME', 'TEAM_ABBREVIATION']
df = df.drop(redundant_cols, axis='columns')
df

Unnamed: 0,PLAYER_ID,START_POSITION,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,TEAM_NAME,GAME_DATE_EST,SEASON_START,WON_GAME,SALARY,INFLATION_ADJ_SALARY,TEAM_PAYROLL,INFLATION_ADJ_TEAM_PAYROLL,LEAGUE_PAYROLL,INFLATION_ADJ_LEAGUE_PAYROLL,TEAM_IMPORTANCE,LEAGUE_IMPORTANCE,TEAM_MARKET_SIZE
0,1630162,F,36:22,4.0,10.0,0.400,3.0,8.0,0.375,4.0,4.0,1.000,0.0,8.0,8.0,5.0,3.0,1.0,1.0,1.0,15.0,5.0,Minnesota Timberwolves,2022-03-12,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235
1,1630162,F,34:27,9.0,19.0,0.474,4.0,11.0,0.364,3.0,3.0,1.000,0.0,3.0,3.0,5.0,1.0,0.0,0.0,3.0,25.0,-3.0,Minnesota Timberwolves,2022-03-11,2021,0,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235
2,1630162,F,25:29,7.0,15.0,0.467,2.0,8.0,0.250,0.0,0.0,0.000,0.0,1.0,1.0,3.0,2.0,1.0,4.0,2.0,16.0,7.0,Minnesota Timberwolves,2022-03-09,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235
3,1630162,F,32:22,7.0,13.0,0.538,1.0,5.0,0.200,2.0,2.0,1.000,2.0,2.0,4.0,4.0,4.0,0.0,4.0,5.0,17.0,7.0,Minnesota Timberwolves,2022-02-28,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235
4,1630162,F,37:46,5.0,13.0,0.385,1.0,6.0,0.167,4.0,6.0,0.667,1.0,2.0,3.0,5.0,1.0,1.0,2.0,3.0,15.0,-21.0,Minnesota Timberwolves,2022-02-25,2021,0,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492307,203129,,4:59,0.0,1.0,0.000,0.0,1.0,0.000,2.0,2.0,1.000,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,-1.0,Brooklyn Nets,2013-01-08,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476
492308,203129,,1:36,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Brooklyn Nets,2013-01-02,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476
492309,203129,,3:20,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,-4.0,Brooklyn Nets,2012-12-23,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476
492310,202345,,0:17,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Brooklyn Nets,2013-01-15,2012,1,50258.0,59504,87644649,103768982,2015938307,2386818417,0.000573,0.000025,0.043476


We also convert minutes played to just seconds, so that it can be fed into the model in a numerical format:

In [15]:
def _clock_to_seconds(clock):
    """Convert a 'minutes:seconds' string (e.g. '36:22') into total seconds."""
    pieces = clock.split(':')
    return int(pieces[0]) * 60 + int(pieces[1])


# Replace the textual MIN column with a numeric SECS_PLAYED column.
df['SECS_PLAYED'] = df['MIN'].apply(_clock_to_seconds)
df = df.drop('MIN', axis='columns')
df

Unnamed: 0,PLAYER_ID,START_POSITION,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,TEAM_NAME,GAME_DATE_EST,SEASON_START,WON_GAME,SALARY,INFLATION_ADJ_SALARY,TEAM_PAYROLL,INFLATION_ADJ_TEAM_PAYROLL,LEAGUE_PAYROLL,INFLATION_ADJ_LEAGUE_PAYROLL,TEAM_IMPORTANCE,LEAGUE_IMPORTANCE,TEAM_MARKET_SIZE,SECS_PLAYED
0,1630162,F,4.0,10.0,0.400,3.0,8.0,0.375,4.0,4.0,1.000,0.0,8.0,8.0,5.0,3.0,1.0,1.0,1.0,15.0,5.0,Minnesota Timberwolves,2022-03-12,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235,2182
1,1630162,F,9.0,19.0,0.474,4.0,11.0,0.364,3.0,3.0,1.000,0.0,3.0,3.0,5.0,1.0,0.0,0.0,3.0,25.0,-3.0,Minnesota Timberwolves,2022-03-11,2021,0,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235,2067
2,1630162,F,7.0,15.0,0.467,2.0,8.0,0.250,0.0,0.0,0.000,0.0,1.0,1.0,3.0,2.0,1.0,4.0,2.0,16.0,7.0,Minnesota Timberwolves,2022-03-09,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235,1529
3,1630162,F,7.0,13.0,0.538,1.0,5.0,0.200,2.0,2.0,1.000,2.0,2.0,4.0,4.0,4.0,0.0,4.0,5.0,17.0,7.0,Minnesota Timberwolves,2022-02-28,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235,1942
4,1630162,F,5.0,13.0,0.385,1.0,6.0,0.167,4.0,6.0,0.667,1.0,2.0,3.0,5.0,1.0,1.0,2.0,3.0,15.0,-21.0,Minnesota Timberwolves,2022-02-25,2021,0,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235,2266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492307,203129,,0.0,1.0,0.000,0.0,1.0,0.000,2.0,2.0,1.000,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,-1.0,Brooklyn Nets,2013-01-08,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476,299
492308,203129,,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Brooklyn Nets,2013-01-02,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476,96
492309,203129,,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,-4.0,Brooklyn Nets,2012-12-23,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476,200
492310,202345,,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Brooklyn Nets,2013-01-15,2012,1,50258.0,59504,87644649,103768982,2015938307,2386818417,0.000573,0.000025,0.043476,17


### Convert `START_POSITION` into dummy columns

We convert the `START_POSITION` column into dummy columns first, to indicate what position a player started in. This will also help get rid of all `NaN` values (players who didn't start will have a 0 in all dummy columns).

In [16]:
# One indicator column per starting position found in START_POSITION; rows
# whose START_POSITION is NaN get 0 in every indicator column.
# The dtype is pinned explicitly: pandas >= 2.0 defaults to bool dummies
# (True/False), whereas earlier versions produced uint8 0/1 values. Pinning
# keeps the 0/1 encoding regardless of the installed pandas version.
positions = pd.get_dummies(df['START_POSITION'], dtype='uint8')
df = pd.concat([df, positions], axis=1).drop(columns=['START_POSITION'])
df

Unnamed: 0,PLAYER_ID,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,TEAM_NAME,GAME_DATE_EST,SEASON_START,WON_GAME,SALARY,INFLATION_ADJ_SALARY,TEAM_PAYROLL,INFLATION_ADJ_TEAM_PAYROLL,LEAGUE_PAYROLL,INFLATION_ADJ_LEAGUE_PAYROLL,TEAM_IMPORTANCE,LEAGUE_IMPORTANCE,TEAM_MARKET_SIZE,SECS_PLAYED,C,F,G
0,1630162,4.0,10.0,0.400,3.0,8.0,0.375,4.0,4.0,1.000,0.0,8.0,8.0,5.0,3.0,1.0,1.0,1.0,15.0,5.0,Minnesota Timberwolves,2022-03-12,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235,2182,0,1,0
1,1630162,9.0,19.0,0.474,4.0,11.0,0.364,3.0,3.0,1.000,0.0,3.0,3.0,5.0,1.0,0.0,0.0,3.0,25.0,-3.0,Minnesota Timberwolves,2022-03-11,2021,0,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235,2067,0,1,0
2,1630162,7.0,15.0,0.467,2.0,8.0,0.250,0.0,0.0,0.000,0.0,1.0,1.0,3.0,2.0,1.0,4.0,2.0,16.0,7.0,Minnesota Timberwolves,2022-03-09,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235,1529,0,1,0
3,1630162,7.0,13.0,0.538,1.0,5.0,0.200,2.0,2.0,1.000,2.0,2.0,4.0,4.0,4.0,0.0,4.0,5.0,17.0,7.0,Minnesota Timberwolves,2022-02-28,2021,1,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235,1942,0,1,0
4,1630162,5.0,13.0,0.385,1.0,6.0,0.167,4.0,6.0,0.667,1.0,2.0,3.0,5.0,1.0,1.0,2.0,3.0,15.0,-21.0,Minnesota Timberwolves,2022-02-25,2021,0,10245480.0,10245480,137098327,137098327,4125163242,4125163242,0.074731,0.002484,0.033235,2266,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492307,203129,0.0,1.0,0.000,0.0,1.0,0.000,2.0,2.0,1.000,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,-1.0,Brooklyn Nets,2013-01-08,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476,299,0,0,0
492308,203129,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Brooklyn Nets,2013-01-02,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476,96,0,0,0
492309,203129,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,-4.0,Brooklyn Nets,2012-12-23,2012,1,473604.0,560734,87644649,103768982,2015938307,2386818417,0.005404,0.000235,0.043476,200,0,0,0
492310,202345,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Brooklyn Nets,2013-01-15,2012,1,50258.0,59504,87644649,103768982,2015938307,2386818417,0.000573,0.000025,0.043476,17,0,0,0


### Create season average dataframe

For each player in each season, we create a dataframe of their stat averages over their previous 100 games (excluding games that particular year). This is because player salaries are decided at the beginning of the season based on their performance from previous season(s). The reasoning for averaging over 100 games is that the NBA season is 82 games long + some players will play in the playoffs (accounting for the extra 18 games). We choose the average of the previous 100 games (instead of simply averaging over the previous season) because some players may not have played in certain games the previous season due to injury/other factors.

If the player has not played in _any_ games before a particular season (i.e., he is a rookie in that season), those rows will be excluded from consideration. This is because the player is most likely a rookie, and rookie contracts are handed out based on draft order (not performance in the NBA). If a player has games before a particular season but fewer than 100 games, the averages will simply be calculated over however many games they played prior to this season.

Finally, it should be noted that these averages are only calculated for __in-game stats__ — the player's team, team's payroll, and salary data for that year should remain the same as it currently is in the DataFrame (represented by the variable `static_cols`).

#### Static Columns (should remain the same for each year)

In [17]:
# Columns that describe a (player, season) pair rather than a single game.
static_cols = [
    'TEAM_NAME',
    'SALARY',
    'INFLATION_ADJ_SALARY',
    'TEAM_PAYROLL',
    'INFLATION_ADJ_TEAM_PAYROLL',
    'LEAGUE_PAYROLL',
    'INFLATION_ADJ_LEAGUE_PAYROLL',
    'TEAM_IMPORTANCE',
    'LEAGUE_IMPORTANCE',
    'TEAM_MARKET_SIZE',
]

# Collapse the per-game rows to one row per (player, season), keeping the
# most frequent value (mode) of every static column.
season_keys = ['PLAYER_ID', 'SEASON_START']
static_col_df = df[season_keys + static_cols].groupby(season_keys).agg(pd.Series.mode)

def select_one_mode(value):
    """Reduce a mode aggregation result to a single value.

    Parameters
    ----------
    value : scalar or np.ndarray
        A cell produced by aggregating with ``pd.Series.mode``: either a
        single value, or an array when several values were returned.

    Returns
    -------
    The first entry if ``value`` is a non-empty array, NaN if it is an empty
    array, otherwise ``value`` unchanged.
    """
    if isinstance(value, np.ndarray):
        # An empty array has no first element; return NaN instead of
        # raising IndexError.
        return value[0] if value.size else np.nan
    return value

# Make every static column hold one scalar per (player, season) by passing
# each cell through select_one_mode.
static_col_df = static_col_df.assign(
    **{name: static_col_df[name].apply(select_one_mode) for name in static_cols}
)

static_col_df

Unnamed: 0_level_0,Unnamed: 1_level_0,TEAM_NAME,SALARY,INFLATION_ADJ_SALARY,TEAM_PAYROLL,INFLATION_ADJ_TEAM_PAYROLL,LEAGUE_PAYROLL,INFLATION_ADJ_LEAGUE_PAYROLL,TEAM_IMPORTANCE,LEAGUE_IMPORTANCE,TEAM_MARKET_SIZE
PLAYER_ID,SEASON_START,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15,2003,Houston Rockets,2500000.0,3697550,55401430,81939823,1672617482,2473834730,0.045125,0.001495,0.033123
15,2004,Chicago Bulls,2750000.0,3938661,57276129,82033174,1773666037,2540315861,0.048013,0.001550,0.032293
15,2005,Chicago Bulls,3000000.0,4190683,57166530,79855609,1897469981,2650565322,0.052478,0.001581,0.030128
15,2006,Phoenix Suns,1178348.0,1577882,65841120,88165438,1939373647,2596944366,0.017897,0.000608,0.033950
15,2007,Phoenix Suns,1219590.0,1590374,71323049,93006958,2063887234,2691358145,0.017100,0.000591,0.034558
...,...,...,...,...,...,...,...,...,...,...,...
1630256,2021,Houston Rockets,1517981.0,1517981,132267085,132267085,4125163242,4125163242,0.011477,0.000368,0.032063
1630264,2020,Washington Wizards,898310.0,940530,131294012,138372654,3905190611,4115736761,0.006842,0.000230,0.033620
1630264,2021,Washington Wizards,1517981.0,1517981,128019790,128019790,4125163242,4125163242,0.011857,0.000368,0.031034
1630267,2020,Denver Nuggets,3200000.0,3350400,129793210,136790939,3905190611,4115736761,0.024655,0.000819,0.033236


#### Average Columns (over prev. 100 games from past years)

Sort the rows by `GAME_DATE_EST` so that we can use a rolling window over the dataframe.

In [18]:
# Keep only the per-game columns, ordered chronologically by game date so
# that a rolling window walks through games in the order they were played.
avg_col_df = df.drop(columns=static_cols).sort_values(by='GAME_DATE_EST')
avg_col_df

Unnamed: 0,PLAYER_ID,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_DATE_EST,SEASON_START,WON_GAME,SECS_PLAYED,C,F,G
214062,243,3.0,7.0,0.429,2.0,2.0,1.000,2.0,2.0,1.0,0.0,4.0,4.0,1.0,1.0,0.0,1.0,3.0,10.0,15.0,2003-10-28,2003,1,1490,0,0,0
194299,1904,7.0,9.0,0.778,2.0,3.0,0.667,0.0,1.0,0.0,0.0,7.0,7.0,3.0,1.0,2.0,0.0,2.0,16.0,13.0,2003-10-28,2003,1,2075,0,1,0
195302,2457,2.0,3.0,0.667,1.0,1.0,1.000,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,2.0,0.0,5.0,-8.0,2003-10-28,2003,1,535,0,0,0
213156,714,2.0,8.0,0.250,0.0,4.0,0.000,8.0,8.0,1.0,0.0,4.0,4.0,1.0,0.0,0.0,1.0,1.0,12.0,-14.0,2003-10-28,2003,0,1739,0,0,1
200326,339,0.0,3.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,-14.0,2003-10-28,2003,0,1019,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4206,202691,15.0,24.0,0.625,8.0,14.0,0.571,0.0,0.0,0.0,0.0,6.0,6.0,5.0,1.0,1.0,2.0,2.0,38.0,27.0,2022-03-12,2021,1,2031,0,0,1
4154,1627814,0.0,1.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-7.0,2022-03-12,2021,1,177,0,0,0
6363,1629018,2.0,13.0,0.154,1.0,6.0,0.167,3.0,5.0,0.6,0.0,3.0,3.0,1.0,2.0,0.0,1.0,3.0,8.0,-6.0,2022-03-12,2021,1,1861,0,1,0
9277,1628972,0.0,1.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,2022-03-12,2021,1,569,0,0,0


Calculate averages over the past 100 games prior to the current `SEASON_START`

In [19]:
# Every column except the identifier / date columns is a per-game stat.
cols = avg_col_df.columns.tolist()
for key_col in ('PLAYER_ID', 'SEASON_START', 'GAME_DATE_EST'):
    cols.remove(key_col)

# Plain stats are averaged; percentage columns are excluded here because they
# are computed separately from makes and attempts.
aggregations = dict.fromkeys((col for col in cols if 'PCT' not in col), 'mean')
avg_df_grouped = avg_col_df.groupby('PLAYER_ID')

# Aggregation function to calculate percentages
def fgm_to_fg_pct_rolling(df, fgm_col):
    """Field-goal percentage over one window of games.

    `fgm_col` is a window of the 'FGM' column; the matching 'FGA' values are
    looked up in `df` by the window's index labels. Returns
    sum(FGM) / sum(FGA), or 0 when the attempts do not sum to more than zero.
    """
    attempts = df.loc[fgm_col.index, 'FGA'].sum()
    if attempts > 0:
        return fgm_col.sum() / attempts
    return 0


def fg3m_to_fg3_pct_rolling(df, fg3m_col):
    """Three-point percentage over one window of games.

    `fg3m_col` is a window of the 'FG3M' column; the matching 'FG3A' values
    are looked up in `df` by the window's index labels. Returns
    sum(FG3M) / sum(FG3A), or 0 when the attempts do not sum to more than zero.
    """
    attempts = df.loc[fg3m_col.index, 'FG3A'].sum()
    if attempts > 0:
        return fg3m_col.sum() / attempts
    return 0


def ftm_to_ft_pct_rolling(df, ftm_col):
    """Free-throw percentage over one window of games.

    `ftm_col` is a window of the 'FTM' column; the matching 'FTA' values are
    looked up in `df` by the window's index labels. Returns
    sum(FTM) / sum(FTA), or 0 when the attempts do not sum to more than zero.
    """
    attempts = df.loc[ftm_col.index, 'FTA'].sum()
    if attempts > 0:
        return ftm_col.sum() / attempts
    return 0

# Tunable parameters for the rolling features.
ROLLING_WINDOW = 100        # number of most recent games summarised per player
LAST_SALARY_SEASON = 2021   # 2022-23 data is non-existent


def _rolling_pct(made, attempted, window=ROLLING_WINDOW):
    """Rolling shooting percentage: sum(made) / sum(attempted) per window.

    Parameters
    ----------
    made, attempted : pd.Series
        Per-game makes and attempts for a single player, sharing one index.
    window : int
        Maximum number of trailing rows in each window (min_periods is 1).

    Returns
    -------
    pd.Series
        The ratio of the two rolling sums, with 0 wherever the attempts in
        the window do not sum to more than zero.
    """
    made_total = made.rolling(window, min_periods=1).sum()
    attempted_total = attempted.rolling(window, min_periods=1).sum()
    # Division by a zero total yields NaN/inf; those windows are set to 0.
    return (made_total / attempted_total).where(attempted_total > 0, 0)


player_avg_dfs = []

# `player_id` rather than `id`, so the builtin id() is not shadowed in the
# notebook namespace after the loop finishes.
for player_id, player_df in avg_df_grouped:
    # Rolling average of the plain stats over the trailing window.
    # Assumes each player's rows are already in chronological order
    # (sorted by GAME_DATE_EST before grouping) -- TODO confirm if reordered.
    player_avg_df = player_df.rolling(ROLLING_WINDOW, min_periods=1).agg(aggregations)
    player_avg_df['SEASON_START'] = player_df['SEASON_START']
    player_avg_df['GAME_DATE_EST'] = player_df['GAME_DATE_EST']
    # Percentages are computed from summed makes / summed attempts over the
    # window (not as a mean of per-game percentages), using vectorised rolling
    # sums instead of a Python callback per window.
    player_avg_df['FG_PCT'] = _rolling_pct(player_df['FGM'], player_df['FGA'])
    player_avg_df['FG3_PCT'] = _rolling_pct(player_df['FG3M'], player_df['FG3A'])
    player_avg_df['FT_PCT'] = _rolling_pct(player_df['FTM'], player_df['FTA'])
    # Last row of each season holds the averages as of that season's final game.
    player_avg_df = player_avg_df.groupby('SEASON_START').last()
    # Offset SEASON_START by 1 year (since we are using stats to predict salary
    # for the FOLLOWING YEAR).
    # NOTE(review): season S is keyed only to S + 1, so a player with no games
    # in season S gets no row for S + 1 even if earlier games exist -- confirm
    # this is the intended treatment of missed seasons.
    player_avg_df.index += 1
    player_avg_df = player_avg_df[player_avg_df.index <= LAST_SALARY_SEASON]
    # Prepend PLAYER_ID as the outer index level.
    player_avg_df = pd.concat({player_id: player_avg_df}, names=['PLAYER_ID'])
    player_avg_dfs.append(player_avg_df)

avg_col_df = pd.concat(player_avg_dfs, axis=0)
avg_col_df

Unnamed: 0_level_0,Unnamed: 1_level_0,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,WON_GAME,SECS_PLAYED,C,F,G,GAME_DATE_EST,FG_PCT,FG3_PCT,FT_PCT
PLAYER_ID,SEASON_START,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
15,2004,1.440000,3.840000,0.860000,2.440000,0.280000,0.320000,0.200000,1.280000,1.480000,0.520000,0.320000,0.100000,0.580000,0.860000,4.020000,-0.300000,0.440000,846.440000,0.0,0.000000,0.000000,2004-04-25,0.375000,0.352459,0.875000
15,2005,1.450000,3.460000,0.780000,1.940000,0.520000,0.650000,0.210000,1.030000,1.240000,0.670000,0.390000,0.030000,0.500000,0.850000,4.200000,0.300000,0.490000,703.610000,0.0,0.000000,0.110000,2005-05-04,0.419075,0.402062,0.800000
15,2006,1.270000,3.070000,0.640000,1.610000,0.450000,0.580000,0.190000,0.870000,1.060000,0.590000,0.330000,0.040000,0.450000,0.820000,3.630000,0.710000,0.550000,616.200000,0.0,0.000000,0.060000,2006-05-04,0.413681,0.397516,0.775862
15,2007,1.250000,3.020000,0.620000,1.620000,0.440000,0.570000,0.200000,0.850000,1.050000,0.550000,0.310000,0.050000,0.440000,0.800000,3.560000,0.570000,0.600000,585.960000,0.0,0.000000,0.060000,2007-04-24,0.413907,0.382716,0.771930
15,2008,1.060000,2.640000,0.570000,1.540000,0.430000,0.550000,0.170000,0.810000,0.980000,0.540000,0.260000,0.060000,0.380000,0.760000,3.120000,0.240000,0.640000,537.560000,0.0,0.000000,0.060000,2008-04-27,0.401515,0.370130,0.781818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630218,2021,0.625000,1.625000,0.062500,0.500000,0.312500,0.625000,0.437500,1.125000,1.562500,0.125000,0.062500,0.250000,0.062500,0.375000,1.625000,0.125000,0.312500,271.562500,0.0,0.000000,0.000000,2021-05-16,0.384615,0.125000,0.500000
1630241,2021,0.926829,2.146341,0.634146,1.439024,0.121951,0.121951,0.170732,0.707317,0.878049,0.536585,0.292683,0.048780,0.268293,0.536585,2.609756,-1.000000,0.585366,422.390244,0.0,0.024390,0.024390,2021-07-11,0.431818,0.440678,1.000000
1630256,2021,4.333333,8.613333,0.813333,2.706667,1.373333,2.013333,1.866667,3.466667,5.333333,2.453333,1.186667,0.493333,1.373333,3.133333,10.853333,-2.466667,0.266667,1703.480000,0.0,0.786667,0.013333,2021-05-16,0.503096,0.300493,0.682119
1630264,2021,1.028571,2.057143,0.257143,0.857143,0.428571,0.514286,0.514286,1.142857,1.657143,0.371429,0.342857,0.114286,0.342857,1.085714,2.742857,-1.000000,0.400000,484.200000,0.0,0.142857,0.000000,2021-06-02,0.500000,0.300000,0.833333


#### Join the two dataframes (static columns and averaged columns)

In [20]:
df = static_col_df.join(avg_col_df, how='inner')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,TEAM_NAME,SALARY,INFLATION_ADJ_SALARY,TEAM_PAYROLL,INFLATION_ADJ_TEAM_PAYROLL,LEAGUE_PAYROLL,INFLATION_ADJ_LEAGUE_PAYROLL,TEAM_IMPORTANCE,LEAGUE_IMPORTANCE,TEAM_MARKET_SIZE,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,WON_GAME,SECS_PLAYED,C,F,G,GAME_DATE_EST,FG_PCT,FG3_PCT,FT_PCT
PLAYER_ID,SEASON_START,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
15,2004,Chicago Bulls,2750000.0,3938661,57276129,82033174,1773666037,2540315861,0.048013,0.001550,0.032293,1.440000,3.840000,0.860000,2.440000,0.280000,0.320000,0.200000,1.280000,1.480000,0.520000,0.320000,0.100000,0.580000,0.860000,4.020000,-0.300000,0.440000,846.440000,0.0,0.000000,0.000000,2004-04-25,0.375000,0.352459,0.875000
15,2005,Chicago Bulls,3000000.0,4190683,57166530,79855609,1897469981,2650565322,0.052478,0.001581,0.030128,1.450000,3.460000,0.780000,1.940000,0.520000,0.650000,0.210000,1.030000,1.240000,0.670000,0.390000,0.030000,0.500000,0.850000,4.200000,0.300000,0.490000,703.610000,0.0,0.000000,0.110000,2005-05-04,0.419075,0.402062,0.800000
15,2006,Phoenix Suns,1178348.0,1577882,65841120,88165438,1939373647,2596944366,0.017897,0.000608,0.033950,1.270000,3.070000,0.640000,1.610000,0.450000,0.580000,0.190000,0.870000,1.060000,0.590000,0.330000,0.040000,0.450000,0.820000,3.630000,0.710000,0.550000,616.200000,0.0,0.000000,0.060000,2006-05-04,0.413681,0.397516,0.775862
15,2007,Phoenix Suns,1219590.0,1590374,71323049,93006958,2063887234,2691358145,0.017100,0.000591,0.034558,1.250000,3.020000,0.620000,1.620000,0.440000,0.570000,0.200000,0.850000,1.050000,0.550000,0.310000,0.050000,0.440000,0.800000,3.560000,0.570000,0.600000,585.960000,0.0,0.000000,0.060000,2007-04-24,0.413907,0.382716,0.771930
56,2004,Boston Celtics,5408700.0,7746558,64577356,92490288,1773666037,2540315861,0.083755,0.003049,0.036409,5.230000,11.450000,0.650000,2.070000,1.900000,2.640000,0.870000,3.070000,3.940000,5.310000,1.130000,0.200000,1.730000,2.230000,13.010000,2.590000,0.650000,2078.580000,0.0,0.000000,1.000000,2004-06-15,0.456769,0.314010,0.719697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630218,2021,Sacramento Kings,1517981.0,1517981,130457848,130457848,4125163242,4125163242,0.011636,0.000368,0.031625,0.625000,1.625000,0.062500,0.500000,0.312500,0.625000,0.437500,1.125000,1.562500,0.125000,0.062500,0.250000,0.062500,0.375000,1.625000,0.125000,0.312500,271.562500,0.0,0.000000,0.000000,2021-05-16,0.384615,0.125000,0.500000
1630241,2021,Memphis Grizzlies,1517981.0,1517981,117284457,117284457,4125163242,4125163242,0.012943,0.000368,0.028431,0.926829,2.146341,0.634146,1.439024,0.121951,0.121951,0.170732,0.707317,0.878049,0.536585,0.292683,0.048780,0.268293,0.536585,2.609756,-1.000000,0.585366,422.390244,0.0,0.024390,0.024390,2021-07-11,0.431818,0.440678,1.000000
1630256,2021,Houston Rockets,1517981.0,1517981,132267085,132267085,4125163242,4125163242,0.011477,0.000368,0.032063,4.333333,8.613333,0.813333,2.706667,1.373333,2.013333,1.866667,3.466667,5.333333,2.453333,1.186667,0.493333,1.373333,3.133333,10.853333,-2.466667,0.266667,1703.480000,0.0,0.786667,0.013333,2021-05-16,0.503096,0.300493,0.682119
1630264,2021,Washington Wizards,1517981.0,1517981,128019790,128019790,4125163242,4125163242,0.011857,0.000368,0.031034,1.028571,2.057143,0.257143,0.857143,0.428571,0.514286,0.514286,1.142857,1.657143,0.371429,0.342857,0.114286,0.342857,1.085714,2.742857,-1.000000,0.400000,484.200000,0.0,0.142857,0.000000,2021-06-02,0.500000,0.300000,0.833333


### `TEAM_NAME` to binary columns

We also convert the team name to binary columns. We expect this could have an effect since different teams have different salary caps. Since there are a lot of teams (30), instead of using one-hot encoding, we use binary encoding to reduce the number of additional columns that are added to the dataframe.

In [21]:
from category_encoders import BinaryEncoder

# Replace TEAM_NAME with its binary-encoded TEAM_NAME_* columns.
team_encoder = BinaryEncoder(cols=['TEAM_NAME'])
df = team_encoder.fit_transform(df)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,TEAM_NAME_0,TEAM_NAME_1,TEAM_NAME_2,TEAM_NAME_3,TEAM_NAME_4,SALARY,INFLATION_ADJ_SALARY,TEAM_PAYROLL,INFLATION_ADJ_TEAM_PAYROLL,LEAGUE_PAYROLL,INFLATION_ADJ_LEAGUE_PAYROLL,TEAM_IMPORTANCE,LEAGUE_IMPORTANCE,TEAM_MARKET_SIZE,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,WON_GAME,SECS_PLAYED,C,F,G,GAME_DATE_EST,FG_PCT,FG3_PCT,FT_PCT
PLAYER_ID,SEASON_START,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
15,2004,0,0,0,0,1,2750000.0,3938661,57276129,82033174,1773666037,2540315861,0.048013,0.001550,0.032293,1.440000,3.840000,0.860000,2.440000,0.280000,0.320000,0.200000,1.280000,1.480000,0.520000,0.320000,0.100000,0.580000,0.860000,4.020000,-0.300000,0.440000,846.440000,0.0,0.000000,0.000000,2004-04-25,0.375000,0.352459,0.875000
15,2005,0,0,0,0,1,3000000.0,4190683,57166530,79855609,1897469981,2650565322,0.052478,0.001581,0.030128,1.450000,3.460000,0.780000,1.940000,0.520000,0.650000,0.210000,1.030000,1.240000,0.670000,0.390000,0.030000,0.500000,0.850000,4.200000,0.300000,0.490000,703.610000,0.0,0.000000,0.110000,2005-05-04,0.419075,0.402062,0.800000
15,2006,0,0,0,1,0,1178348.0,1577882,65841120,88165438,1939373647,2596944366,0.017897,0.000608,0.033950,1.270000,3.070000,0.640000,1.610000,0.450000,0.580000,0.190000,0.870000,1.060000,0.590000,0.330000,0.040000,0.450000,0.820000,3.630000,0.710000,0.550000,616.200000,0.0,0.000000,0.060000,2006-05-04,0.413681,0.397516,0.775862
15,2007,0,0,0,1,0,1219590.0,1590374,71323049,93006958,2063887234,2691358145,0.017100,0.000591,0.034558,1.250000,3.020000,0.620000,1.620000,0.440000,0.570000,0.200000,0.850000,1.050000,0.550000,0.310000,0.050000,0.440000,0.800000,3.560000,0.570000,0.600000,585.960000,0.0,0.000000,0.060000,2007-04-24,0.413907,0.382716,0.771930
56,2004,0,0,0,1,1,5408700.0,7746558,64577356,92490288,1773666037,2540315861,0.083755,0.003049,0.036409,5.230000,11.450000,0.650000,2.070000,1.900000,2.640000,0.870000,3.070000,3.940000,5.310000,1.130000,0.200000,1.730000,2.230000,13.010000,2.590000,0.650000,2078.580000,0.0,0.000000,1.000000,2004-06-15,0.456769,0.314010,0.719697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630218,2021,0,0,1,0,1,1517981.0,1517981,130457848,130457848,4125163242,4125163242,0.011636,0.000368,0.031625,0.625000,1.625000,0.062500,0.500000,0.312500,0.625000,0.437500,1.125000,1.562500,0.125000,0.062500,0.250000,0.062500,0.375000,1.625000,0.125000,0.312500,271.562500,0.0,0.000000,0.000000,2021-05-16,0.384615,0.125000,0.500000
1630241,2021,1,0,1,0,1,1517981.0,1517981,117284457,117284457,4125163242,4125163242,0.012943,0.000368,0.028431,0.926829,2.146341,0.634146,1.439024,0.121951,0.121951,0.170732,0.707317,0.878049,0.536585,0.292683,0.048780,0.268293,0.536585,2.609756,-1.000000,0.585366,422.390244,0.0,0.024390,0.024390,2021-07-11,0.431818,0.440678,1.000000
1630256,2021,0,1,0,1,0,1517981.0,1517981,132267085,132267085,4125163242,4125163242,0.011477,0.000368,0.032063,4.333333,8.613333,0.813333,2.706667,1.373333,2.013333,1.866667,3.466667,5.333333,2.453333,1.186667,0.493333,1.373333,3.133333,10.853333,-2.466667,0.266667,1703.480000,0.0,0.786667,0.013333,2021-05-16,0.503096,0.300493,0.682119
1630264,2021,1,0,1,1,1,1517981.0,1517981,128019790,128019790,4125163242,4125163242,0.011857,0.000368,0.031034,1.028571,2.057143,0.257143,0.857143,0.428571,0.514286,0.514286,1.142857,1.657143,0.371429,0.342857,0.114286,0.342857,1.085714,2.742857,-1.000000,0.400000,484.200000,0.0,0.142857,0.000000,2021-06-02,0.500000,0.300000,0.833333


### Remove `GAME_DATE_EST`

We no longer need the game date column, since the rows have been averaged.

In [22]:
# The game date only served to order games for the rolling window.
df = df.drop('GAME_DATE_EST', axis='columns')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,TEAM_NAME_0,TEAM_NAME_1,TEAM_NAME_2,TEAM_NAME_3,TEAM_NAME_4,SALARY,INFLATION_ADJ_SALARY,TEAM_PAYROLL,INFLATION_ADJ_TEAM_PAYROLL,LEAGUE_PAYROLL,INFLATION_ADJ_LEAGUE_PAYROLL,TEAM_IMPORTANCE,LEAGUE_IMPORTANCE,TEAM_MARKET_SIZE,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,WON_GAME,SECS_PLAYED,C,F,G,FG_PCT,FG3_PCT,FT_PCT
PLAYER_ID,SEASON_START,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
15,2004,0,0,0,0,1,2750000.0,3938661,57276129,82033174,1773666037,2540315861,0.048013,0.001550,0.032293,1.440000,3.840000,0.860000,2.440000,0.280000,0.320000,0.200000,1.280000,1.480000,0.520000,0.320000,0.100000,0.580000,0.860000,4.020000,-0.300000,0.440000,846.440000,0.0,0.000000,0.000000,0.375000,0.352459,0.875000
15,2005,0,0,0,0,1,3000000.0,4190683,57166530,79855609,1897469981,2650565322,0.052478,0.001581,0.030128,1.450000,3.460000,0.780000,1.940000,0.520000,0.650000,0.210000,1.030000,1.240000,0.670000,0.390000,0.030000,0.500000,0.850000,4.200000,0.300000,0.490000,703.610000,0.0,0.000000,0.110000,0.419075,0.402062,0.800000
15,2006,0,0,0,1,0,1178348.0,1577882,65841120,88165438,1939373647,2596944366,0.017897,0.000608,0.033950,1.270000,3.070000,0.640000,1.610000,0.450000,0.580000,0.190000,0.870000,1.060000,0.590000,0.330000,0.040000,0.450000,0.820000,3.630000,0.710000,0.550000,616.200000,0.0,0.000000,0.060000,0.413681,0.397516,0.775862
15,2007,0,0,0,1,0,1219590.0,1590374,71323049,93006958,2063887234,2691358145,0.017100,0.000591,0.034558,1.250000,3.020000,0.620000,1.620000,0.440000,0.570000,0.200000,0.850000,1.050000,0.550000,0.310000,0.050000,0.440000,0.800000,3.560000,0.570000,0.600000,585.960000,0.0,0.000000,0.060000,0.413907,0.382716,0.771930
56,2004,0,0,0,1,1,5408700.0,7746558,64577356,92490288,1773666037,2540315861,0.083755,0.003049,0.036409,5.230000,11.450000,0.650000,2.070000,1.900000,2.640000,0.870000,3.070000,3.940000,5.310000,1.130000,0.200000,1.730000,2.230000,13.010000,2.590000,0.650000,2078.580000,0.0,0.000000,1.000000,0.456769,0.314010,0.719697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630218,2021,0,0,1,0,1,1517981.0,1517981,130457848,130457848,4125163242,4125163242,0.011636,0.000368,0.031625,0.625000,1.625000,0.062500,0.500000,0.312500,0.625000,0.437500,1.125000,1.562500,0.125000,0.062500,0.250000,0.062500,0.375000,1.625000,0.125000,0.312500,271.562500,0.0,0.000000,0.000000,0.384615,0.125000,0.500000
1630241,2021,1,0,1,0,1,1517981.0,1517981,117284457,117284457,4125163242,4125163242,0.012943,0.000368,0.028431,0.926829,2.146341,0.634146,1.439024,0.121951,0.121951,0.170732,0.707317,0.878049,0.536585,0.292683,0.048780,0.268293,0.536585,2.609756,-1.000000,0.585366,422.390244,0.0,0.024390,0.024390,0.431818,0.440678,1.000000
1630256,2021,0,1,0,1,0,1517981.0,1517981,132267085,132267085,4125163242,4125163242,0.011477,0.000368,0.032063,4.333333,8.613333,0.813333,2.706667,1.373333,2.013333,1.866667,3.466667,5.333333,2.453333,1.186667,0.493333,1.373333,3.133333,10.853333,-2.466667,0.266667,1703.480000,0.0,0.786667,0.013333,0.503096,0.300493,0.682119
1630264,2021,1,0,1,1,1,1517981.0,1517981,128019790,128019790,4125163242,4125163242,0.011857,0.000368,0.031034,1.028571,2.057143,0.257143,0.857143,0.428571,0.514286,0.514286,1.142857,1.657143,0.371429,0.342857,0.114286,0.342857,1.085714,2.742857,-1.000000,0.400000,484.200000,0.0,0.142857,0.000000,0.500000,0.300000,0.833333


### Save dataframe

In [23]:
# Persist the engineered features; the index is written too (to_csv default).
final_output_path = '../data/preprocessed/stats_salaries_final.csv'
df.to_csv(final_output_path)