In [160]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from glob import glob

In [161]:
pd.set_option('display.max_columns', 80)

# import & prep df from 538

In [162]:
spi_url = 'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv'
spi = pd.read_csv(spi_url)

In [163]:
spi.head()

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,probtie,proj_score1,proj_score2,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
0,2016,2016-07-09,7921,FA Women's Super League,Liverpool Women,Reading,51.56,50.42,0.4389,0.2767,0.2844,1.39,1.05,,,2.0,0.0,,,,,,
1,2016,2016-07-10,7921,FA Women's Super League,Arsenal Women,Notts County Ladies,46.61,54.03,0.3572,0.3608,0.2819,1.27,1.28,,,2.0,0.0,,,,,,
2,2016,2016-07-10,7921,FA Women's Super League,Chelsea FC Women,Birmingham City,59.85,54.64,0.4799,0.2487,0.2714,1.53,1.03,,,1.0,1.0,,,,,,
3,2016,2016-07-16,7921,FA Women's Super League,Liverpool Women,Notts County Ladies,53.0,52.35,0.4289,0.2699,0.3013,1.27,0.94,,,0.0,0.0,,,,,,
4,2016,2016-07-17,7921,FA Women's Super League,Chelsea FC Women,Arsenal Women,59.43,60.99,0.4124,0.3157,0.2719,1.45,1.24,,,1.0,2.0,,,,,,


In [164]:
spi = spi[spi['league'] == 'Barclays Premier League']

In [165]:
spi2 = spi

In [166]:
spi = spi.rename(columns={'team1':'team', 'team2': 'team_opp', 'spi1': 'spi', 'spi2': 'spi_opp', 'proj_score1': 'proj_score', 'proj_score2': 'proj_score_opp', 'prob1': 'prob_w', 'prob2': 'prob_l', 'importance1': 'importance', 'importance2': 'importance_opp'})

In [167]:
spi2 = spi2.rename(columns={'team2':'team', 'team1': 'team_opp', 'spi2': 'spi', 'spi1': 'spi_opp', 'proj_score2': 'proj_score', 'proj_score1': 'proj_score_opp', 'prob2': 'prob_w', 'prob1': 'prob_l', 'importance2': 'importance', 'importance1': 'importance_opp'})

In [168]:
spi = spi.append(spi2)

In [169]:
spi = spi[['date', 'team', 'team_opp', 'spi', 'spi_opp', 'proj_score', 'proj_score_opp', 'prob_w', 'prob_l', 'probtie', 'importance', 'importance_opp']]

In [170]:
spi.team.unique()

array(['Hull City', 'Everton', 'Crystal Palace', 'Middlesbrough',
       'Southampton', 'Burnley', 'Manchester City', 'AFC Bournemouth',
       'Arsenal', 'Chelsea', 'Manchester United', 'Stoke City',
       'Tottenham Hotspur', 'West Bromwich Albion', 'Watford',
       'Swansea City', 'Leicester City', 'Sunderland', 'West Ham United',
       'Liverpool', 'Brighton and Hove Albion', 'Newcastle',
       'Huddersfield Town', 'Fulham', 'Wolverhampton', 'Cardiff City',
       'Aston Villa', 'Norwich City', 'Sheffield United', 'Leeds United'],
      dtype=object)

In [171]:
spi = spi.replace({'West Ham United': 'West Ham', 
    'Manchester City': 'Man City',
    'Brighton and Hove Albion': 'Brighton',
    'Cardiff City': 'Cardiff',
    'Leicester City': 'Leicester',
    'Tottenham Hotspur': 'Tottenham',
    'AFC Bournemouth': 'Bournemouth',
    'Huddersfield Town': 'Huddersfield',
    'Wolverhampton': 'Wolves',
    'Manchester United': 'Man United'})

# import and append fantasy performance data

In [172]:
cols = ['name', 'element','kickoff_time', 'total_points',  'fixture', 'opponent_team', 'team_a_score', 'team_h_score', 'was_home', 'season']

In [173]:
# I had to change the column names in 2018-19 teams.csv to make a column called "Team", to correspond with other seasons

In [174]:
years = [('2018-19', 'ISO-8859-1'), ('2019-20', 'utf-8')]
# later realized that the 2018-19 CSV is not encoded in UTF-8, so made the above into tuples including the explicit encoding
players = pd.DataFrame()
for year, encoding in years:
    #import player-level data
    players_section = '/Users/andrewjpeters/Documents/GitHub/fpl/data/'+year+'/gws/merged_gw.csv'
    players_section = pd.read_csv(players_section, engine='python', encoding=encoding)
    players_section['season'] = year[:4]
    players_section = players_section[cols]
    #merge fixture data to identify teams
    fixtures = '/Users/andrewjpeters/Documents/GitHub/fpl/data/'+year+'/fixtures.csv'
    fixtures = pd.read_csv(fixtures, engine='python')
    fixtures = fixtures[['id', 'team_a', 'team_h']]
    players_section = players_section.merge(fixtures, how='left', left_on='fixture', right_on='id')
    #import and merge team names to align with 538 SPI data
    teams = pd.read_csv('/Users/andrewjpeters/Documents/GitHub/fpl/data/'+year+'/teams.csv')
    teams = teams[['id', 'Team']]
    players_section = players_section.merge(teams, left_on='team_a', right_on='id')
    players_section = players_section.rename(columns={'Team': 'team_a_name'})
    players_section = players_section.merge(teams, left_on='team_h', right_on='id')
    players_section = players_section.rename(columns={'Team': 'team_h_name'})
    #import and merge player positions
    positions = pd.read_csv('/Users/andrewjpeters/Documents/GitHub/fpl/data/'+year+'/players_raw.csv', engine='python')
    players_section = players_section.merge(positions[['id', 'element_type']], how='left', left_on='element', right_on='id')
    #append to full players dataframe
    players = players.append(players_section, ignore_index=True)

In [175]:
players['date'] = pd.to_datetime(players['kickoff_time'])

In [176]:
players['date'] = players['date'].dt.strftime('%Y-%m-%d')

In [177]:
def team_namer(was_home, home_team, away_team):
    if was_home:
        return home_team
    else:
        return away_team

In [178]:
players['team'] = players.apply(lambda x: team_namer(x['was_home'], x['team_h_name'], x['team_a_name']), axis=1)

In [179]:
players = players[['name', 'element','element_type','total_points', 'date', 'team', 'was_home', 'season']]

In [180]:
players = players.replace({'Man Utd': 'Man United',
                'Sheffield Utd': 'Sheffield United',
                'Norwich': 'Norwich City',
                'Spurs': 'Tottenham',
})

In [181]:
players = players.merge(spi, how='left', left_on=['date', 'team'], right_on=['date', 'team'])

In [182]:
players.isnull().sum()

name               0
element            0
element_type       0
total_points       0
date               0
team               0
was_home           0
season             0
team_opp          59
spi               59
spi_opp           59
proj_score        59
proj_score_opp    59
prob_w            59
prob_l            59
probtie           59
importance        59
importance_opp    59
dtype: int64

In [183]:
players[players.team_opp.isnull()].head()

Unnamed: 0,name,element,element_type,total_points,date,team,was_home,season,team_opp,spi,spi_opp,proj_score,proj_score_opp,prob_w,prob_l,probtie,importance,importance_opp
43660,John_Stones_207,207,2,0,2020-03-11,Man City,True,2019,,,,,,,,,,
43661,David_Silva_219,219,3,0,2020-03-11,Man City,True,2019,,,,,,,,,,
43662,Bukayo_Saka_541,541,3,0,2020-03-11,Arsenal,False,2019,,,,,,,,,,
43663,Sead_Kolasinac_3,3,2,0,2020-03-11,Arsenal,False,2019,,,,,,,,,,
43664,Bernd_Leno_14,14,1,0,2020-03-11,Arsenal,False,2019,,,,,,,,,,


In [184]:
# the 2020-03-11 Man City v. Arsenal game was postponed to 2020-06-17 because of Covid

In [185]:
#check that there are no other null games
players[players.team_opp.isnull()]['date'].unique()
players[players.team_opp.isnull()]['team'].unique()

array(['Man City', 'Arsenal'], dtype=object)

In [186]:
#mke a new df for just the null game
ars_man = players[players.team_opp.isnull()]

In [187]:
#drop the null game from the players df
players = players.dropna()
players.isnull().sum()

name              0
element           0
element_type      0
total_points      0
date              0
team              0
was_home          0
season            0
team_opp          0
spi               0
spi_opp           0
proj_score        0
proj_score_opp    0
prob_w            0
prob_l            0
probtie           0
importance        0
importance_opp    0
dtype: int64

In [188]:
ars_man['date'] = '2020-06-17'

In [189]:
ars_man = ars_man[['name', 'element','element_type','total_points', 'date', 'team', 'was_home', 'season']]

In [190]:
ars_man = ars_man.merge(spi, how='left', on=['date', 'team'])

In [191]:
ars_man.head()

Unnamed: 0,name,element,element_type,total_points,date,team,was_home,season,team_opp,spi,spi_opp,proj_score,proj_score_opp,prob_w,prob_l,probtie,importance,importance_opp
0,John_Stones_207,207,2,0,2020-06-17,Man City,True,2019,Arsenal,94.8,76.77,2.68,0.77,0.7727,0.0828,0.1445,0.0,40.2
1,David_Silva_219,219,3,0,2020-06-17,Man City,True,2019,Arsenal,94.8,76.77,2.68,0.77,0.7727,0.0828,0.1445,0.0,40.2
2,Bukayo_Saka_541,541,3,0,2020-06-17,Arsenal,False,2019,Man City,76.77,94.8,0.77,2.68,0.0828,0.7727,0.1445,40.2,0.0
3,Sead_Kolasinac_3,3,2,0,2020-06-17,Arsenal,False,2019,Man City,76.77,94.8,0.77,2.68,0.0828,0.7727,0.1445,40.2,0.0
4,Bernd_Leno_14,14,1,0,2020-06-17,Arsenal,False,2019,Man City,76.77,94.8,0.77,2.68,0.0828,0.7727,0.1445,40.2,0.0


In [192]:
ars_man.isnull().sum()

name              0
element           0
element_type      0
total_points      0
date              0
team              0
was_home          0
season            0
team_opp          0
spi               0
spi_opp           0
proj_score        0
proj_score_opp    0
prob_w            0
prob_l            0
probtie           0
importance        0
importance_opp    0
dtype: int64

In [193]:
players = players.append(ars_man)

In [194]:
players.isnull().sum()

name              0
element           0
element_type      0
total_points      0
date              0
team              0
was_home          0
season            0
team_opp          0
spi               0
spi_opp           0
proj_score        0
proj_score_opp    0
prob_w            0
prob_l            0
probtie           0
importance        0
importance_opp    0
dtype: int64

In [195]:
players['spi_d'] = players['spi'] - players['spi_opp']
players['proj_score_d'] = players['proj_score'] - players['proj_score_opp']

In [196]:
'''
Hypothesis: high number of players earning 0 points throws off data
'''

'\nHypothesis: high number of players earning 0 points throws off data\n'

In [197]:
season_points = players.groupby(['name', 'element', 'season'])['total_points'].sum().reset_index()

In [198]:
top_players = season_points[season_points['total_points'] > 40]

In [199]:
'''
decided to look at all players scoring over 40 points, rather than percentile -- this includes more players
top_players['rank'] = top_players['total_points'].rank(pct=True)
top_players = top_players[top_players['rank'] >= .5].sort_values('total_points')
'''

"\ndecided to look at all players scoring over 40 points, rather than percentile -- this includes more players\ntop_players['rank'] = top_players['total_points'].rank(pct=True)\ntop_players = top_players[top_players['rank'] >= .5].sort_values('total_points')\n"

In [200]:
players = players[players['name'].isin(top_players.name)]

In [201]:
players['name'] = players['name'].str.replace('\d+', '')

In [202]:
players['name'] = players['name'].str.rstrip('_')

In [203]:
players['name'] = players['name'].str.replace('_', ' ')

In [204]:
list(players)

['name',
 'element',
 'element_type',
 'total_points',
 'date',
 'team',
 'was_home',
 'season',
 'team_opp',
 'spi',
 'spi_opp',
 'proj_score',
 'proj_score_opp',
 'prob_w',
 'prob_l',
 'probtie',
 'importance',
 'importance_opp',
 'spi_d',
 'proj_score_d']

In [205]:
players.head()

Unnamed: 0,name,element,element_type,total_points,date,team,was_home,season,team_opp,spi,spi_opp,proj_score,proj_score_opp,prob_w,prob_l,probtie,importance,importance_opp,spi_d,proj_score_d
5,Alisson Ramses Becker,468,1,7,2018-08-12,Liverpool,True,2018,West Ham,88.86,64.54,2.75,0.55,0.8203,0.0505,0.1292,54.8,20.7,24.32,2.2
6,Andrew Robertson,247,2,11,2018-08-12,Liverpool,True,2018,West Ham,88.86,64.54,2.75,0.55,0.8203,0.0505,0.1292,54.8,20.7,24.32,2.2
9,Angelo Ogbonna,403,2,0,2018-08-12,West Ham,False,2018,Liverpool,64.54,88.86,0.55,2.75,0.0505,0.8203,0.1292,20.7,54.8,-24.32,-2.2
10,Arthur Masuaku,410,3,2,2018-08-12,West Ham,False,2018,Liverpool,64.54,88.86,0.55,2.75,0.0505,0.8203,0.1292,20.7,54.8,-24.32,-2.2
13,Declan Rice,406,2,0,2018-08-12,West Ham,False,2018,Liverpool,64.54,88.86,0.55,2.75,0.0505,0.8203,0.1292,20.7,54.8,-24.32,-2.2


In [206]:
points_to_date = players.groupby(['name', 'element', 'season','date'])['total_points'].sum().reset_index()

In [207]:
def avg_points_to_date(player, date):
    section = points_to_date[(points_to_date['date'] < date) & (points_to_date['name'] == player)]['total_points'].tail(5)
    if len(section) == 0:
        return 0
    else:
        return float(section.mean())

In [208]:
points_to_date['avg_points'] = points_to_date.apply(lambda x: avg_points_to_date(x['name'] ,x['date']), axis=1)

In [209]:
players = players.merge(points_to_date[['name', 'date', 'avg_points']], on=['name', 'date'])

In [210]:
injury_list = pd.DataFrame()
for season in ['2018-19', '2019-20']:
    dirs = list(glob('/Users/andrewjpeters/Documents/GitHub/fpl/data/'+season+'/players/*/'))
    for player in dirs:
        path = player + 'gw.csv'
        gw_history = pd.read_csv(path)
        gw_history['played_more_than_20'] = pd.cut(gw_history.minutes, bins=[0, 20, 100], labels=[0,1], include_lowest=True)
        gw_history = gw_history[['element', 'kickoff_time', 'minutes', 'played_more_than_20', 'value']]
        injury_list = injury_list.append(gw_history)
injury_list['date'] = pd.to_datetime(injury_list.kickoff_time)
injury_list['date'] = injury_list['date'].dt.strftime('%Y-%m-%d')

In [211]:
players = players.merge(injury_list[['element', 'date', 'value', 'played_more_than_20']], how='left', on=['element', 'date'])

In [212]:
players.isnull().sum()

name                   0
element                0
element_type           0
total_points           0
date                   0
team                   0
was_home               0
season                 0
team_opp               0
spi                    0
spi_opp                0
proj_score             0
proj_score_opp         0
prob_w                 0
prob_l                 0
probtie                0
importance             0
importance_opp         0
spi_d                  0
proj_score_d           0
avg_points             0
value                  0
played_more_than_20    0
dtype: int64

# Begin ML Work

In [275]:
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

In [271]:
lb_make = LabelEncoder()

In [272]:
players['name_code'] = lb_make.fit_transform(players['name'])

## compare classifiers

In [158]:
feature_cols = ['name_code', 'spi', 'spi_opp', 'proj_score', 'proj_score_opp', 'spi_d', 'proj_score_d', 'avg_points', 'prob_w', 'prob_l', 'probtie', 'importance', 'importance_opp', 'element_type', 'was_home', 'played_more_than_20']

In [159]:
features = players[feature_cols]
labels = players['total_points']

KeyError: "['avg_points', 'played_more_than_20'] not in index"

In [None]:
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
# models.append(('SGD', SGDRegressor()))
models.append(('DTR', DecisionTreeRegressor()))
models.append(('DTC', DecisionTreeClassifier()))
models.append(('Lasso', Lasso()))
models.append(('Elastic Net', ElasticNet()))
models.append(('Ridge', Ridge()))
models.append(('Random Forest', RandomForestClassifier()))
# evaluate each model in turn

In [None]:
# evaluate each model in turn
results = []
names = []
scoring = 'r2'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    scaler = MinMaxScaler(feature_range=(0, 1))
    features = scaler.fit_transform(features)
    cv_results = model_selection.cross_val_score(model, features, labels, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    predicted = cross_val_predict(model, features, labels, cv=kfold)
    fig, ax = plt.subplots()
    ax.scatter(labels, predicted, edgecolors=(0, 0, 0))
    ax.plot([labels.min(), labels.max()], [labels.min(), labels.max()], 'k--', lw=4)
    fig.suptitle(name)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted') 
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
#How well does avg_points along over the previous 5 games predict points earned? not as well as our forecast.

fig, ax = plt.subplots()
ax.scatter(players.total_points, players.avg_points, edgecolors=(0, 0, 0))
ax.plot([players.total_points.min(), players.total_points.max()], [players.total_points.min(), players.total_points.max()], 'k--', lw=4)
fig.suptitle('avg_points')
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

## settle on Decision Tree Regressor

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=.2, random_state=4)

In [None]:
clf = DecisionTreeRegressor()

In [None]:
clf.fit(X_train, y_train)

In [None]:
predictions = pd.DataFrame()
predictions['pred'] = list(clf.predict(X_test))
predictions['actual'] = list(y_test)

In [None]:
sns.scatterplot(x='pred', y='actual', data=predictions)

In [None]:
#mse of test data

mean_squared_error(predictions.actual,predictions.pred)

In [None]:
## Look at feature importance in CLF

data = pd.DataFrame(zip(list(features), clf.feature_importances_))
data = data.sort_values(1)
plt.figure(figsize=(15,4))
ax = sns.barplot(x=0, y=1, data=data)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()

In [None]:
players['clf_pred'] = list(clf.predict(features))
players['total_points'] = players.total_points.astype('float')
players['clf_pred'] = players.clf_pred.astype('float')

In [None]:
sns.set_style('darkgrid')
sns.set('talk')
sns.jointplot(data=players, x='clf_pred', y='total_points', kind='kde', xlim=(0,15), ylim=(0,15))


In [None]:
mean_squared_error(players.clf_pred,players.total_points)

In [None]:
data = players.groupby('name')[['total_points', 'clf_pred']].sum()

In [None]:
sns.set_style('darkgrid')
sns.set('talk')
sns.jointplot(data=data, x='clf_pred', y='total_points', kind='reg')


In [None]:
mean_absolute_error(data.clf_pred, data.total_points)

In [None]:
#what would a season long estimate look like if we only used avg_points as our indicator? not as good

data = players.groupby('name')[['total_points', 'avg_points']].sum()
sns.set_style('darkgrid')
sns.set('talk')
sns.jointplot(data=data, x='avg_points', y='total_points', kind='reg')
mean_absolute_error(data.avg_points, data.total_points)

# Make Predictions for 2020-2021 Season

In [238]:
season = pd.read_csv('/Users/andrewjpeters/Documents/GitHub/fpl/data/2020-21/gws/gw3.csv')

In [239]:
season.head()

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,ict_index,influence,kickoff_time,minutes,opponent_team,own_goals,penalties_missed,penalties_saved,red_cards,round,saves,selected,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards
0,Aaron Connolly,FWD,Brighton,2.7,0,0,2,0,12.1,78,19,2,0,1.9,0.0,2020-09-26T11:30:00Z,73,13,0,0,0,0,3,0,55356,3,2,8.0,2,13526,26823,13297,55,True,0
1,Aaron Cresswell,DEF,West Ham,3.0,0,1,32,1,52.5,435,28,0,0,8.8,35.8,2020-09-27T18:00:00Z,90,20,0,0,0,0,3,0,58281,0,4,0.0,7,-7282,2292,9574,49,True,0
2,Aaron Mooy,MID,Brighton,0.0,0,0,0,0,0.0,60,19,0,0,0.0,0.0,2020-09-26T11:30:00Z,0,13,0,0,0,0,3,0,1253,3,2,0.0,0,-282,9666,9948,50,True,0
3,Aaron Ramsdale,GK,Sheffield Utd,2.8,0,0,26,0,10.0,483,25,1,0,5.2,42.0,2020-09-27T11:00:00Z,90,10,0,0,0,0,3,7,377916,1,0,0.0,4,-66442,9045,75487,50,True,0
4,Aaron Wan-Bissaka,DEF,Man Utd,1.5,0,0,14,0,1.2,313,19,2,0,1.7,16.2,2020-09-26T11:30:00Z,90,3,0,0,0,0,3,0,759590,3,2,0.0,1,-123785,34697,158482,55,False,0


In [240]:
season = season[['name', 'position', 'team', 'value', 'total_points']]

In [241]:
season.sort_values('team').team.unique()

array(['Arsenal', 'Aston Villa', 'Brighton', 'Burnley', 'Chelsea',
       'Crystal Palace', 'Everton', 'Fulham', 'Leeds', 'Leicester',
       'Liverpool', 'Man City', 'Man Utd', 'Newcastle', 'Sheffield Utd',
       'Southampton', 'Spurs', 'West Brom', 'West Ham', 'Wolves'],
      dtype=object)

In [242]:
spi.head()

Unnamed: 0,date,team,team_opp,spi,spi_opp,proj_score,proj_score_opp,prob_w,prob_l,probtie,importance,importance_opp
12,2016-08-13,Hull City,Leicester,53.57,66.81,1.16,1.24,0.3459,0.3621,0.2921,38.1,22.2
13,2016-08-13,Everton,Tottenham,68.02,73.25,1.47,1.38,0.391,0.3401,0.2689,31.9,48.0
14,2016-08-13,Crystal Palace,West Bromwich Albion,55.19,58.66,1.35,1.14,0.4214,0.2939,0.2847,43.6,34.6
15,2016-08-13,Middlesbrough,Stoke City,56.32,60.35,1.3,1.01,0.438,0.2692,0.2927,33.9,32.5
16,2016-08-13,Southampton,Watford,69.49,59.33,1.91,1.05,0.5759,0.1874,0.2367,34.1,30.7


In [243]:
spi_2020 = spi[(spi['date'] > '2020-09-01') & (spi['date'] < '2021-06-01')]

In [244]:
for team in list(spi_2020.team.unique()):
    if not team in list(season.team):
        print(team)

West Bromwich Albion
Tottenham
Sheffield United
Leeds United
Man United


In [245]:
season = season.replace({'Man Utd': 'Man United',
        'Spurs': 'Tottenham',
        'Sheffield Utd': 'Sheffield United',
        'West Brom': 'West Bromwich Albion',
        'Leeds': 'Leeds United'})

In [246]:
df = pd.DataFrame()
for name in season.name.unique():
    part = season[season['name'] == name]
    part = part.merge(spi_2020, how='right', on='team')
    df = df.append(part)

In [247]:
df.head()

Unnamed: 0,name,position,team,value,total_points,date,team_opp,spi,spi_opp,proj_score,proj_score_opp,prob_w,prob_l,probtie,importance,importance_opp
0,Aaron Connolly,FWD,Brighton,55.0,2.0,2020-09-14,Chelsea,68.03,85.37,1.02,1.93,0.1944,0.5806,0.2249,28.3,61.0
1,Aaron Connolly,FWD,Brighton,55.0,2.0,2020-09-26,Man United,70.26,85.38,1.0,1.75,0.2135,0.5444,0.2421,20.9,55.4
2,Aaron Connolly,FWD,Brighton,55.0,2.0,2020-10-26,West Bromwich Albion,70.38,58.34,1.74,0.93,0.5603,0.1979,0.2418,,
3,Aaron Connolly,FWD,Brighton,55.0,2.0,2020-11-07,Burnley,70.38,67.75,1.52,1.12,0.4603,0.2771,0.2626,,
4,Aaron Connolly,FWD,Brighton,55.0,2.0,2020-11-28,Liverpool,70.38,88.81,1.2,2.24,0.1934,0.604,0.2025,,


In [248]:
# what columns do we need to add to df to match the players dataframe
for col in list(players):
    if not col in list(df):
        print(col)

element
element_type
was_home
season
spi_d
proj_score_d
avg_points
played_more_than_20


In [249]:
df['season'] = 2020

In [250]:
#for now, let's assume that everyone in df is healthy, and will play more than 20 minutes per game. 
df['played_more_than_20'] = 1

In [251]:
#element_type is position, with GK = 1, Defender = 2, and so on

In [252]:
df.position.unique()

array(['FWD', nan, 'DEF', 'MID', 'GK'], dtype=object)

In [253]:
df = df.replace({
    'GK': 1,
    'DEF': 2,
    'MID': 3,
    'FWD': 4
})

In [254]:
# We'll assume player's current values will hold, and will be their value for the rest of the season.
# In the future, it would be worth adding an ability to update these values from the fpl website directly to keep everything UTD
df = df.merge(season[['name', 'value']], how='left', on='name')

In [255]:
# "Element" is the player id from the FPL site, but we're not currently using this for anything -- no need to add now

In [256]:
#avg_points: I'm going to calculate this using the first 3 games of this season, plus the average of the last 5 from last season. First 3 from this season will take 70% wt, and last 5 from last season will take 30% wt. This is a bit arbitrary, but my thinking is that as straight average of the last 5 games would put 60% wt on the first 3 from this season (at 20% per game, split evenly) -- this gives some recency bias towards the new season, but just a bit, since last season has a much large sample size.

In [257]:
#need to add whether or not the game was home
#because 'was home' was shown to have very low importance in the CLF, I'm going to mark all games as home for the time being
df['was_home'] = True

In [258]:
#spi_d & proj_score_d is simple math, can be added using current columns
df['spi_d'] = df['spi'] - df['spi_opp']
df['proj_score_d'] = df['proj_score'] - df['proj_score_opp']

In [301]:
df[df.importance.isnull()]

Unnamed: 0,name,position,team,value_x,total_points,date,team_opp,spi,spi_opp,proj_score,proj_score_opp,prob_w,prob_l,probtie,importance,importance_opp,season,played_more_than_20,value_y,was_home,spi_d,proj_score_d,avg_points,name_code
2,Aaron Connolly,4.0,Brighton,55.0,2.0,2020-10-26,West Bromwich Albion,70.38,58.34,1.74,0.93,0.5603,0.1979,0.2418,,,2020,1,55.0,True,12.04,0.81,2.0,0.0
3,Aaron Connolly,4.0,Brighton,55.0,2.0,2020-11-07,Burnley,70.38,67.75,1.52,1.12,0.4603,0.2771,0.2626,,,2020,1,55.0,True,2.63,0.40,2.0,0.0
4,Aaron Connolly,4.0,Brighton,55.0,2.0,2020-11-28,Liverpool,70.38,88.81,1.20,2.24,0.1934,0.6040,0.2025,,,2020,1,55.0,True,-18.43,-1.04,2.0,0.0
5,Aaron Connolly,4.0,Brighton,55.0,2.0,2020-12-05,Southampton,70.38,72.31,1.44,1.27,0.4072,0.3293,0.2635,,,2020,1,55.0,True,-1.93,0.17,2.0,0.0
6,Aaron Connolly,4.0,Brighton,55.0,2.0,2020-12-19,Sheffield United,70.38,67.39,1.43,0.98,0.4706,0.2558,0.2736,,,2020,1,55.0,True,2.99,0.45,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21047,Ørjan Nyland,1.0,Aston Villa,40.0,0.0,2021-03-20,Sheffield United,72.43,67.39,1.28,1.22,0.3764,0.3466,0.2769,,,2020,1,40.0,True,5.04,0.06,0.0,
21048,Ørjan Nyland,1.0,Aston Villa,40.0,0.0,2021-04-10,Liverpool,72.43,88.81,1.03,2.46,0.1394,0.6825,0.1781,,,2020,1,40.0,True,-16.38,-1.43,0.0,
21049,Ørjan Nyland,1.0,Aston Villa,40.0,0.0,2021-05-01,Everton,72.43,76.89,1.22,1.70,0.2722,0.4820,0.2458,,,2020,1,40.0,True,-4.46,-0.48,0.0,
21050,Ørjan Nyland,1.0,Aston Villa,40.0,0.0,2021-05-12,Crystal Palace,72.43,67.68,1.32,1.29,0.3728,0.3574,0.2698,,,2020,1,40.0,True,4.75,0.03,0.0,


In [311]:
#estimate importance using median importance by team
importance = df.groupby('team')['importance', 'importance_opp'].median().reset_index()

In [312]:
df = df.merge(importance, how='left', on='team')

In [313]:
df.head()

Unnamed: 0,name,position,team,value_x,total_points,date,team_opp,spi,spi_opp,proj_score,proj_score_opp,prob_w,prob_l,probtie,importance_x,importance_opp_x,season,played_more_than_20,value_y,was_home,spi_d,proj_score_d,avg_points,name_code,importance_y,importance_opp_y
0,Aaron Connolly,4.0,Brighton,55.0,2.0,2020-09-14,Chelsea,68.03,85.37,1.02,1.93,0.1944,0.5806,0.2249,28.3,61.0,2020,1,55.0,True,-17.34,-0.91,2.0,0.0,25.9,41.9
1,Aaron Connolly,4.0,Brighton,55.0,2.0,2020-09-26,Man United,70.26,85.38,1.0,1.75,0.2135,0.5444,0.2421,20.9,55.4,2020,1,55.0,True,-15.12,-0.75,2.0,0.0,25.9,41.9
2,Aaron Connolly,4.0,Brighton,55.0,2.0,2020-10-26,West Bromwich Albion,70.38,58.34,1.74,0.93,0.5603,0.1979,0.2418,,,2020,1,55.0,True,12.04,0.81,2.0,0.0,25.9,41.9
3,Aaron Connolly,4.0,Brighton,55.0,2.0,2020-11-07,Burnley,70.38,67.75,1.52,1.12,0.4603,0.2771,0.2626,,,2020,1,55.0,True,2.63,0.4,2.0,0.0,25.9,41.9
4,Aaron Connolly,4.0,Brighton,55.0,2.0,2020-11-28,Liverpool,70.38,88.81,1.2,2.24,0.1934,0.604,0.2025,,,2020,1,55.0,True,-18.43,-1.04,2.0,0.0,25.9,41.9


In [314]:
#where 538 estimated importance (in importance_x colums), use those values
#where those values are missing, using median estimate (importance_y_columns)

df.loc[df['importance_x'].isnull(),'importance_x'] = df['importance_y']
df.loc[df['importance_opp_x'].isnull(),'importance_opp_x'] = df['importance_opp_y']

In [316]:
df = df.drop(columns=['importance_y', 'importance_opp_y'])

In [317]:
df = df.rename(columns={'importance_x': 'importance',
                       'importance_opp_x': 'importance_opp'})

In [260]:
avg_points_2020 = season.groupby('name')['total_points'].mean().reset_index()

In [261]:
last_five = players.sort_values(['name', 'date']).groupby(['name']).tail(5)
avg_points_2019 = last_five.groupby('name')['total_points'].mean().reset_index()

In [262]:
avg_points = avg_points_2020.merge(avg_points_2019, on='name', how='left', suffixes=['2020','2019'])

In [263]:
avg_points.head()

Unnamed: 0,name,total_points2020,total_points2019
0,Aaron Connolly,2,2.0
1,Aaron Cresswell,7,2.4
2,Aaron Mooy,0,1.2
3,Aaron Ramsdale,4,4.8
4,Aaron Wan-Bissaka,1,3.6


In [264]:
avg_points['weighted'] = (avg_points['total_points2020']*.7) + (avg_points['total_points2019']*.3)

In [265]:
avg_points.tail()

Unnamed: 0,name,total_points2020,total_points2019,weighted
549,Yves Bissouma,0,1.4,0.42
550,Zack Steffen,0,,
551,Zeze Steven Sessegnon,0,,
552,Çaglar Söyüncü,0,-0.2,-0.06
553,Ørjan Nyland,0,,


In [266]:
#there are going to be so NaN for players in our 2020 list than don't have data from our 2019 list
#for these players, we can just use the avg from the first 3 games of 2020 as our average
avg_points.loc[avg_points['weighted'].isnull(),'weighted'] = avg_points['total_points2020']

In [267]:
df = df.merge(avg_points[['name', 'weighted']], on='name')

In [268]:
df = df.rename(columns={'weighted': 'avg_points'})

In [279]:
name_encodings = players[['name', 'name_code']]

In [281]:
name_encodings = name_encodings.sort_values('name').drop_duplicates()

In [283]:
df = df.merge(name_encodings, how='left', on='name')

In [292]:
df[df.name_code.isnull()]['avg_points'].mean()

0.8835341365461847

In [293]:
df[~df.name_code.isnull()]['avg_points'].mean()

2.0512786885245884

In [297]:
df[df.name_code.isnull()].sort_values('avg_points', ascending=False).name.unique()

array(['Callum Robinson', 'Illan Meslier', 'Patrick Bamford',
       'Karl Darlow', 'Kyle Bartley', 'Jack Harrison', 'Luke Ayling',
       'Robin Koch', 'Stuart Dallas', 'Liam Cooper', 'Kyle Walker-Peters',
       'Emiliano Martínez', 'Matthew Cash', 'Matheus Pereira',
       'Kai Havertz', 'Tariq Lamptey', 'Andy Carroll', 'Timothy Castagne',
       'Oliver Burke', 'Moussa Djenepo', 'Mateusz Klich',
       'Hélder Wander Sousa de Azevedo e Costa', 'Darnell Furlong',
       'Sam Johnstone', 'Alireza Jahanbakhsh', 'Ollie Watkins',
       'Jack Robinson', 'Adam Lallana', 'Kevin Long', 'Ethan Ampadu',
       'Jimmy Dunne', 'Josh Brownhill', 'André-Frank Zambo Anguissa',
       'James Rodríguez', 'Sander Berge', 'Romaine Sawyers',
       'Ivan Ricardo Neves Abreu Cavaleiro', 'Allan Marques Loureiro',
       'Eberechi Eze', 'Matej Vydra', 'Aleksandar Mitrović',
       'Jake Livermore', 'Kalvin Phillips',
       'Mohamed Naser El Sayed Elneny', 'Timo Werner', 'Grady Diangana',
       'Ben Osb

In [319]:
#I'm going to drop NaNs in the 'name code' category for the time being. This is lazy and I should revisit.
#Using the entire dataset from previous seasons would help this (more players accounted for), and maybe one-hot
#encoding would help as well

df = df.dropna()

In [320]:
df.isnull().sum()

name                   0
position               0
team                   0
value_x                0
total_points           0
date                   0
team_opp               0
spi                    0
spi_opp                0
proj_score             0
proj_score_opp         0
prob_w                 0
prob_l                 0
probtie                0
importance             0
importance_opp         0
season                 0
played_more_than_20    0
value_y                0
was_home               0
spi_d                  0
proj_score_d           0
avg_points             0
name_code              0
dtype: int64

In [259]:
# next steps
# look into using one-hot encoding to deal with player names
# estimate player points of 20-21 season
# when estimating player points --> it's preferable to pick players that have a high probablility of a strong return on points, rather than a lower probability of getting an absurd amount of points
# solve kapsack problem

In [322]:
season_19 = players[(players.date > '2019-08-08')]

In [325]:
#try solving knapsack problem using predicted numbers on season 19

ModuleNotFoundError: No module named 'pulp'