In [167]:
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
# Notebook-wide display setup: apply seaborn's default styling, then widen
# pandas' printing limits so large frames are not truncated.
sns.set()
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 150),
):
    pd.set_option(option_name, option_value)

In [168]:
# Load the interim DataFrame pickle produced upstream (file name: df_with_rolling_features).
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs as-is
# on a machine with this directory layout.
rolling_features_path = '/Users/andrewpeters/GitHub/fpl/data/interim/df_with_rolling_features.pkl'
df = pd.read_pickle(rolling_features_path)

In [169]:
# Row filters applied before feature preparation.
#
# Negative points are possible, but rare; they are removed for ease of use.
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
df = df[df['total_points'] >= 0].copy()

# Average of the three previous-minutes columns. Plain `+` is kept on purpose: a NaN in
# any of the three propagates to the average, and NaN > 60 is False, so such rows are
# filtered out below.
df['past_three_avg_mins'] = (df['minutes_prev'] + df['minutes_prev_2'] + df['minutes_prev_3'])/3

# Keep rows with a recent average above 60 minutes AND more than 40 minutes played in
# this row. Minutes played is very much after-the-fact knowledge, but usually we know
# who won't even be playing 40 minutes.
df = df[(df['past_three_avg_mins'] > 60) & (df['minutes'] > 40)]


In [170]:
def prep_final_features(df):
    """Return a new frame with the final model encodings added.

    Two transformations are applied:

    * ``was_home`` is cast from bool to int.
    * ``position`` is one-hot encoded into integer ``position_<value>`` columns.
      The original ``position`` column is kept alongside the new ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``was_home`` and ``position``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is left unmodified.

    Raises
    ------
    KeyError
        If ``was_home`` or ``position`` is missing.
    ValueError
        If ``df`` already has a ``position_<value>`` column (``join`` refuses
        overlapping column names), e.g. when called twice on the same frame.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    prepared = df.assign(was_home=df['was_home'].astype('int'))
    # dtype=int keeps the dummies numeric on every pandas version (newer releases
    # default to bool, which turns a mixed float/bool frame into an object array
    # when it is converted to NumPy).
    positions_df = pd.get_dummies(prepared['position'], prefix='position', dtype=int)
    # join aligns on the index; positions_df shares prepared's index.
    return prepared.join(positions_df)

In [171]:
# Apply the final encodings (was_home as int, one-hot position columns).
df = df.pipe(prep_final_features)

In [172]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis='index', how='any')

In [173]:
# Remove the top 0.5% of rows ranked by total_points (fraction .005 of the frame).
outlier_fraction = .005
n_outliers = round(len(df) * outlier_fraction)
# mergesort is stable, so rows tied on total_points keep their existing relative order
# and the choice of which tied rows fall inside the cut is deterministic.
# NOTE: the frame is left sorted by total_points (descending); anything built from it
# afterwards is therefore ordered by the target.
df = df.sort_values('total_points', ascending=False, kind='mergesort').iloc[n_outliers:]

In [174]:
# Columns set aside as metadata; they are excluded from the feature matrix.
metadata_cols = [
    'player',
    'minutes',
    'team',
    'position',
    'opponent_team',
    'gw',
    'kickoff_time',
    'season',
    'team_opponent',
    'total_points_opponent',
]

In [175]:
# Pull the metadata columns out as their own frame (same rows and row order as df).
metadata = df.loc[:, metadata_cols]

In [176]:
# Feature matrix: df without the metadata columns, the minutes-based filter helpers,
# and the target column.
non_feature_cols = metadata_cols + ['past_three_avg_mins', 'minutes', 'total_points']
X = df.drop(columns=non_feature_cols)

In [177]:
# Target vector, taken from the same frame (and row order) as X.
target_col = 'total_points'
y = df[target_col]

In [178]:
# Persist the ordered feature names; they are reused when predicting on future data
# to make sure the same features are present.
feature_names = list(X.columns)
with open('/Users/andrewpeters/GitHub/fpl/data/interim/x_features.pkl', 'wb') as features_file:
    pickle.dump(feature_names, features_file)

In [179]:
# Write the feature matrix to disk in .npy format (np.save converts X to an array).
x_out_path = '/Users/andrewpeters/GitHub/fpl/data/processed/X.npy'
with open(x_out_path, 'wb') as x_out:
    np.save(x_out, X)

In [180]:
# Write the target vector to disk in .npy format (np.save converts y to an array).
y_out_path = '/Users/andrewpeters/GitHub/fpl/data/processed/y.npy'
with open(y_out_path, 'wb') as y_out:
    np.save(y_out, y)

In [181]:
# Write the metadata frame to disk in .npy format (np.save converts it to an array).
# NOTE(review): metadata presumably holds non-numeric columns (names, kickoff times),
# in which case the array is object-dtype and loading needs allow_pickle=True. Verify.
metadata_out_path = '/Users/andrewpeters/GitHub/fpl/data/processed/metadata.npy'
with open(metadata_out_path, 'wb') as metadata_out:
    np.save(metadata_out, metadata)

In [182]:
# Sanity check: (rows, columns) of the feature matrix that was saved.
X.shape

(27531, 320)

In [183]:
# Sanity check: (rows, columns) of the metadata frame; the row count should match X.
metadata.shape

(27531, 10)