In [246]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [247]:
# Widen pandas' display limits so wide/long frames are not truncated when inspected.
for option_name, option_value in {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 150,
}.items():
    pd.set_option(option_name, option_value)
%matplotlib inline

In [248]:
# Load the interim dataframe that all of the feature engineering below builds on.
interim_df_path = '/Users/andrewpeters/GitHub/fpl/data/interim/df.pkl'
df = pd.read_pickle(interim_df_path)

To start, the main way I'll be adding features here is by making lag and rolling features. I also want to correct for the fact that not all data in each row is knowable _before_ the game. For example, I know the fixture and the opponent before a game, but I don't know how many minutes a player played until afterwards.

In [249]:
# Columns treated as known before the game is played.
prior_knowledge = [
    'player',
    'team',
    'position',
    'gw',
    'opponent_team',
    'was_home',
    'kickoff_time',
    'season',
]

# Columns treated as only known after the game; these get lagged before being used as features.
posterior_knowledge = [
    'team_h_score', 'team_a_score',
    'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
    'bonus', 'bps',
    'influence', 'creativity', 'threat', 'ict_index',
    'transfers_balance', 'selected', 'transfers_in', 'transfers_out',
]

In [250]:
# Reorder (and restrict) the columns: target first, then the pre-game columns, then the
# post-game columns, which makes it easier to keep track of which is which.
ordered_columns = ['total_points', *prior_knowledge, *posterior_knowledge]
df = df.loc[:, ordered_columns]

In [251]:
# Lag the target and every post-game column by one row within each player (rows ordered by
# kickoff time), then attach the lagged values next to the originals under a '_prev' suffix.
sort_keys = ['player', 'kickoff_time']
lag_columns = ['total_points', *posterior_knowledge]

df = df.sort_values(by=sort_keys)
shifted_df = df.groupby('player')[lag_columns].shift(periods=1)
df = df.join(shifted_df, rsuffix='_prev')
df = df.sort_values(by=sort_keys)

In [252]:
# Names of the lagged ('_prev') counterparts of the post-game columns.
posterior_knowledge_shift = [col + '_prev' for col in posterior_knowledge]

In [253]:
# Generate rolling-mean features over the lagged ('_prev') columns and join them back onto df.
# min_periods=1 emits a mean as soon as a single non-missing prior observation falls inside the
# window, which keeps NaNs to a minimum at the cost of noisier averages early in a player's
# history. A larger min_periods is not possible for every window used here: pandas requires
# min_periods <= window, and the smallest window is 2.
rolling_source_cols = ['total_points_prev'] + posterior_knowledge_shift

for window_size in tqdm(range(2, 11)):
    rolling_df = (
        df.groupby('player')[rolling_source_cols]
        .rolling(window=window_size, min_periods=1)
        .mean()
        # groupby().rolling() prepends the group key to the index; drop that level again so the
        # result aligns with df's own index regardless of what that index is named.
        .reset_index(level=0, drop=True)
    )
    # Every rolling column clashes with its un-rolled source column in df, so rsuffix tags each
    # one with the window size (e.g. 'minutes_prev' -> 'minutes_prev_2').
    df = df.join(rolling_df, rsuffix=f'_{window_size}')

100%|██████████| 9/9 [00:13<00:00,  1.55s/it]


In [254]:
# The raw post-game columns are not known until the gameweek is over, so remove them now that
# their lagged and rolling versions exist.
df = df.drop(labels=posterior_knowledge, axis='columns')

In [255]:
# Engineer features that describe team-level performance by summing the player rows of each
# (team, season, gw) group.
team_level_group_keys = ['team', 'season', 'gw']

# Not all of the player-level features are helpful at the team level; pick what I want to keep.
team_level_keep_features = ['total_points', 'goals_scored', 'goals_conceded', 'clean_sheets', 'influence', 'creativity', 'threat']

# Keep every lagged / rolling-window version of the features listed above. Matching is by
# substring, and any() guarantees a column is listed only once even if it were to match more
# than one feature name.
# NOTE(review): the raw, same-gameweek 'total_points' column also matches here, so the team's
# summed target ends up in team_level (and in whatever is merged from it) -- confirm this is
# intended and not target leakage.
team_level_cols = [
    col for col in df.columns
    if col not in team_level_group_keys
    and any(keep_col in col for keep_col in team_level_keep_features)
]

# Select the wanted columns *before* aggregating so that the remaining columns (player names,
# kickoff times, ...) are never summed. min_count=1 preserves NAs: a group without any
# non-missing value yields NaN instead of 0.
team_level = df.groupby(team_level_group_keys)[team_level_cols].sum(min_count=1)
team_level = team_level.reset_index()

In [256]:
# Attach each row's own-team aggregates for the same season and gameweek. Columns whose names
# clash with existing df columns receive the '_team' suffix.
own_team_keys = ['team', 'season', 'gw']
df = pd.merge(df, team_level, how='inner', on=own_team_keys, suffixes=('', '_team'))

# Attach the opponent's aggregates the same way, so they can serve as features describing the
# difficulty of the opponent. Clashing columns receive the '_opponent' suffix.
opponent_keys = ['opponent_team', 'season', 'gw']
df = pd.merge(
    df,
    team_level,
    how='inner',
    left_on=opponent_keys,
    right_on=own_team_keys,
    suffixes=('', '_opponent'),
)

In [257]:
# Persist the player-level frame together with its lagged, rolling, team and opponent features.
features_output_path = '/Users/andrewpeters/GitHub/fpl/data/interim/df_with_rolling_features.pkl'
df.to_pickle(features_output_path)

In [258]:
# Persist the team-level table as well, so it can be reused later for forward-looking games.
team_level_output_path = '/Users/andrewpeters/GitHub/fpl/data/interim/team_level_rolling.pkl'
team_level.to_pickle(team_level_output_path)