In [127]:
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
# Apply seaborn's default plot styling for the whole notebook.
sns.set()

# Widen pandas' display limits so large frames are shown in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 150),
):
    pd.set_option(option_name, option_value)

In [128]:
# Load the interim frame that already carries the rolling features.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
rolling_features_path = '/Users/andrewpeters/GitHub/fpl/data/interim/df_with_rolling_features.pkl'
df = pd.read_pickle(rolling_features_path)

In [129]:
# Number of rows before any filtering.
df.shape[0]

105230

In [130]:
# Negative points are possible but rare; drop those rows for ease of use.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write to a slice of the original (avoids SettingWithCopyWarning).
df = df[df['total_points'] >= 0].copy()

# Mean of minutes_prev and minutes_prev_2.
# NOTE(review): despite the "past_three" name, only two prior values are
# averaged -- confirm whether a third term was intended.
df['past_three_avg_mins'] = (df['minutes_prev'] + df['minutes_prev_2']) / 2

# Keep rows whose recent average is above 45 minutes.
df = df[df['past_three_avg_mins'] > 45]

# Minutes played is very much after-the-fact knowledge, but usually we know
# who won't even be playing 40 minutes.
df = df[df['minutes'] > 40]


In [131]:
# Number of rows remaining after the filters above.
df.shape[0]

30509

In [132]:
def prep_final_features(df):
    """Return a copy of ``df`` with model-ready versions of the categorical inputs.

    * ``was_home`` is cast to an integer column.
    * ``position`` is one-hot encoded into integer ``position_<value>`` columns
      that are appended to the frame (the original ``position`` column is kept).

    The input frame is not modified. Calling the function again on its own
    output is safe: existing ``position_*`` dummy columns are replaced rather
    than causing ``join`` to fail on overlapping column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``was_home`` and ``position``.

    Returns
    -------
    pandas.DataFrame
        New frame with the converted / added columns.
    """
    # dtype=int keeps the dummies numeric regardless of the pandas version
    # (newer versions default to bool), matching the numeric was_home column.
    positions_df = pd.get_dummies(df['position'], prefix='position', dtype=int)

    # Drop dummy columns left over from a previous run so join() cannot clash.
    stale_cols = [col for col in positions_df.columns if col in df.columns]

    # assign() builds a new frame instead of writing into the caller's object.
    prepared = df.drop(columns=stale_cols).assign(
        was_home=df['was_home'].astype('int')  # convert bool to numeric
    )
    return prepared.join(positions_df)

In [133]:
# Run the final feature prep: numeric was_home plus one-hot position_* columns.
df = prep_final_features(df)

In [135]:
# Count the missing values in each column.
df.isna().sum()

total_points                      0
player                            0
team                              0
position                          0
gw                                0
opponent_team                     0
was_home                          0
date                              0
season                            0
minutes                           0
total_points_prev                 0
minutes_prev                      0
goals_scored_prev                 0
assists_prev                      0
clean_sheets_prev                 0
goals_conceded_prev               0
bonus_prev                        0
bps_prev                          0
influence_prev                    0
creativity_prev                   0
threat_prev                       0
ict_index_prev                    0
transfers_balance_prev            0
selected_prev                     0
transfers_in_prev                 0
transfers_out_prev                0
key_passes_prev                 662
xA_prev                     

In [115]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [116]:
# Remove the top 0.5% of rows ranked by total_points.
# (The fraction used is .005, i.e. 0.5% -- the earlier note said ".05%".)
# Side effect: df is left sorted by total_points in descending order.
top_fraction_removed = .005
n_top_rows = round(len(df) * top_fraction_removed)
df = df.sort_values('total_points', ascending=False).iloc[n_top_rows:]

In [117]:
# Columns kept as row metadata and excluded from the feature matrix.
metadata_cols = [
    'player',
    'minutes',
    'team',
    'position',
    'opponent_team',
    'gw',
    'date',
    'season',
    'team_opponent',
]

In [118]:
# Keep the metadata columns; the rows stay in the same order as df.
metadata = df.loc[:, metadata_cols]

In [119]:
# Feature matrix: everything except the metadata, the filtering helper column,
# the after-the-fact minutes column and the target itself.
X = df.drop(
    columns=[*metadata_cols, 'past_three_avg_mins', 'minutes', 'total_points']
)

In [120]:
# Target: the points scored in the row's gameweek.
y = df.loc[:, 'total_points']

In [121]:
# Persist the ordered feature names; they are reused when predicting on future
# data to make sure the same features are present.
with open('/Users/andrewpeters/GitHub/fpl/data/interim/x_features.pkl', mode='wb') as f:
    pickle.dump(list(X.columns), f)

In [122]:
# Write the feature matrix in NumPy's .npy format. X is a DataFrame, so only
# its values are stored; the column names are not part of the array.
with open('/Users/andrewpeters/GitHub/fpl/data/processed/X.npy', mode='wb') as file:
    np.save(file, arr=X)

In [123]:
# Write the target values in NumPy's .npy format.
with open('/Users/andrewpeters/GitHub/fpl/data/processed/y.npy', mode='wb') as file:
    np.save(file, arr=y)

In [124]:
# Write the metadata frame's values in NumPy's .npy format.
# NOTE(review): metadata presumably holds string columns (player, team, ...);
# if so NumPy stores an object array via pickle and np.load will need
# allow_pickle=True -- verify on the loading side.
with open('/Users/andrewpeters/GitHub/fpl/data/processed/metadata.npy', mode='wb') as file:
    np.save(file, arr=metadata)

In [125]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(14477, 261)

In [126]:
# Dimensions of the metadata frame as (rows, columns).
metadata.shape

(14477, 9)