In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets, ensemble
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler
import pprint

In [2]:
# Read the cleaned dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv -- confirm).
data = pd.read_csv('datacleaned.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Columns retained for feature engineering, in the order they are kept.
cols = [
    'team', 'playerid', 'name',
    'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored',
    'influence', 'minutes', 'opponent_team',
    'team_a_score', 'team_h_score', 'threat',
    'total_points', 'value', 'was_home',
    'GW', 'fdr',
]
data = data[cols]

In [4]:
# Distinct player ids, in order of first appearance in the frame.
playerlist = [player_id for player_id in data['playerid'].unique()]

In [5]:
# Per-player history lookups: playerid -> list indexed by (GW - 1) holding
# that gameweek's total_points / fdr. The first matching row is used for each
# gameweek; a player with no row for some gameweek in 1..38 raises IndexError.
pointsdict = {}
fdrdict = {}
for player in playerlist:
    player_rows = data[data['playerid'] == player]
    weekly_points = []
    weekly_fdr = []
    for gw in range(1, 39):
        first_match = player_rows[player_rows['GW'] == gw].iloc[0]
        weekly_points.append(first_match['total_points'])
        weekly_fdr.append(first_match['fdr'])
    pointsdict[player] = weekly_points
    fdrdict[player] = weekly_fdr

In [6]:
# Create zero-filled placeholder columns for the engineered features.
# Each one is inserted at position 0, so the last name listed ends up first.
# Re-running this cell raises because the columns already exist.
for placeholder in (
    'prevscore', 'prevfdr',
    'prev2score', 'prev2fdr',
    'prev3score', 'prev3fdr',
    'team_goals_scored', 'team_goals_conceded',
):
    data.insert(0, placeholder, 0)

In [7]:
def kernel(scores:list)->int:
    """Weighted sum of the most recent scores, newest weighted heaviest.

    ``scores`` is ordered oldest to newest. The newest score gets weight 3,
    the one before it weight 2 and the one before that weight 1. Shorter
    inputs use only the heaviest weights (one score -> x3, two scores ->
    x2 and x3); an empty input yields 0.

    When more than three scores are supplied, only the last three are used.
    Previously the first three were weighted and the newest ones ignored,
    which contradicted the "newest counts most" alignment used for short
    inputs.

    Args:
        scores: numeric values ordered oldest to newest.

    Returns:
        The weighted sum (numeric; same type family as the inputs).
    """
    weights=[1,2,3]
    # Keep at most len(weights) trailing scores ...
    recent=list(scores)[-len(weights):]
    # ... and line the weights up with the end so the newest score gets 3.
    aligned=weights[len(weights)-len(recent):]
    return sum(x*y for x,y in zip(recent,aligned))

In [8]:
def fixture_finder(team1, team2, data):
    """Select the rows of ``data`` in which ``team1`` plays ``team2``.

    Args:
        team1: three-letter abbreviation matched against the ``team`` column.
        team2: three-letter abbreviation matched against ``opponent_team``.
        data: DataFrame with numeric ``team`` and ``opponent_team`` columns.

    Returns:
        The matching subset of ``data`` (possibly empty).

    Raises:
        KeyError: if either abbreviation is not one of the 20 known teams.
    """
    # Position in this tuple (1-based) is the numeric team code.
    short_names = (
        'ARS', 'AVL', 'BOU', 'BHA', 'BUR', 'CHE', 'CRY', 'EVE', 'LEI', 'LIV',
        'MCI', 'MUN', 'NEW', 'NOR', 'SHU', 'SOU', 'TOT', 'WAT', 'WHU', 'WOL',
    )
    code_of = {abbr: number for number, abbr in enumerate(short_names, start=1)}
    t1code = code_of[team1]
    t2code = code_of[team2]
    is_team = data['team'] == t1code
    is_opponent = data['opponent_team'] == t2code
    return data[is_team & is_opponent]

In [9]:
# Row-aligned feature lists, filled in the same order as the rows of `data`.
prevscorelist = []
prev2scorelist = []
prev3scorelist = []
prevfdrlist = []
prev2fdrlist = []
prev3fdrlist = []
goals_scored = []
goals_conceded = []
# Declared but not filled in this cell.
avg_influence = []
avg_crea = []
avg_threat = []

# lag (in gameweeks) -> list receiving that lag's value
score_by_lag = {1: prevscorelist, 2: prev2scorelist, 3: prev3scorelist}
fdr_by_lag = {1: prevfdrlist, 2: prev2fdrlist, 3: prev3fdrlist}

for _, row in data.iterrows():
    currgw = row['GW']

    for lag in (1, 2, 3):
        if currgw in tuple(range(1, lag + 1)):
            # Not enough history yet for this lag: fall back to 0.
            score_by_lag[lag].append(0)
            fdr_by_lag[lag].append(0)
        else:
            # History lists are indexed by GW - 1, so `lag` gameweeks back
            # sits at position currgw - 1 - lag.
            history_pos = currgw - (lag + 1)
            score_by_lag[lag].append(pointsdict[row['playerid']][history_pos])
            fdr_by_lag[lag].append(fdrdict[row['playerid']][history_pos])

    # Express the scoreline from the point of view of the player's own team.
    if row['was_home']:
        own_for, own_against = row['team_h_score'], row['team_a_score']
    else:
        own_for, own_against = row['team_a_score'], row['team_h_score']
    goals_scored.append(own_for)
    goals_conceded.append(own_against)

In [10]:
# Overwrite the zero placeholders with the row-aligned feature lists.
engineered_columns = {
    'prevscore': prevscorelist,
    'prev2score': prev2scorelist,
    'prev3score': prev3scorelist,
    'prevfdr': prevfdrlist,
    'prev2fdr': prev2fdrlist,
    'prev3fdr': prev3fdrlist,
    'team_goals_scored': goals_scored,
    'team_goals_conceded': goals_conceded,
}
for column_name, column_values in engineered_columns.items():
    data[column_name] = column_values

In [11]:
# Rolling team form per row: a recency-weighted sum (see `kernel`) of the
# team's goals scored / conceded over up to three preceding gameweeks.
#
# The lists are built in the row order of `data`, so assigning them to the
# frame by position is correct however the frame is sorted. The earlier
# version built them player by player and gameweek by gameweek, which only
# lines up with the rows when the frame happens to be sorted player-major.
goalscoring_form=[]
defensive_form=[]
# Placeholders for the (currently disabled) ICT features; they only ever
# receive a 0 per gameweek-1 row.
influence_total=[]
creativity_total=[]
threat_total=[]

i_avg=[]
c_avg=[]
t_avg=[]

# (playerid, GW) -> (team_goals_scored, team_goals_conceded), keeping the
# first row seen for each pair.
team_goals_by_fixture={}
for pid, gw, scored, conceded in zip(data['playerid'], data['GW'],
                                     data['team_goals_scored'], data['team_goals_conceded']):
    team_goals_by_fixture.setdefault((pid, gw), (scored, conceded))

for pid, gw in zip(data['playerid'], data['GW']):
    if gw==1:
        # No history before the first gameweek.
        goalscoring_form.append(0)
        defensive_form.append(0)
        influence_total.append(0)
        creativity_total.append(0)
        threat_total.append(0)
        i_avg.append(0)
        c_avg.append(0)
        t_avg.append(0)
    else:
        st_gw=max(1,gw-3)
        # Oldest-to-newest results strictly before the current gameweek;
        # a missing earlier gameweek raises KeyError.
        window=[team_goals_by_fixture[(pid, x)] for x in range(st_gw, gw)]
        goalscoring_form.append(kernel([scored for scored, _ in window]))
        # Conceding is bad, so defensive form is the negated weighted sum.
        defensive_form.append(-1*kernel([conceded for _, conceded in window]))
        # NOTE(review): a disabled draft here accumulated influence /
        # creativity / threat over GW-1 and GW and averaged them with
        # `total/gw-1`. That parses as (total/gw)-1, not total/(gw-1), and it
        # also includes the current gameweek's value. Fix both before
        # re-enabling the ICT features.

In [12]:
# Attach the team-form lists as new columns (assigned by position).
for form_column, form_values in (('att_form', goalscoring_form),
                                 ('def_form', defensive_form)):
    data[form_column] = form_values
# Disabled: replacing the raw ICT columns with averaged values.
#data['influence']=i_avg
#data['creativity']=c_avg
#data['threat']=t_avg

In [13]:
# Display the column layout after the lag and form features were added.
data.columns

Index(['team_goals_conceded', 'team_goals_scored', 'prev3fdr', 'prev3score',
       'prev2fdr', 'prev2score', 'prevfdr', 'prevscore', 'team', 'playerid',
       'name', 'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored',
       'influence', 'minutes', 'opponent_team', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'value', 'was_home', 'GW', 'fdr', 'att_form',
       'def_form'],
      dtype='object')

#### Scaling and one-hot encoding

In [14]:
# Shift each column so its minimum is 0, then divide by the resulting
# maximum. A column with a single distinct value would divide by zero.
for scaled_column in ('fdr', 'att_form', 'def_form'):
    data[scaled_column] -= min(data[scaled_column])
    data[scaled_column] /= max(data[scaled_column])

In [15]:
# One indicator column per distinct opponent_team value, named opp_team_<value>,
# copied onto the main frame one column at a time.
opp_team = pd.get_dummies(data['opponent_team'], prefix='opp_team')
for dummy_name, dummy_values in opp_team.items():
    data[dummy_name] = dummy_values

In [16]:
# Display the column layout after scaling and one-hot encoding.
data.columns

Index(['team_goals_conceded', 'team_goals_scored', 'prev3fdr', 'prev3score',
       'prev2fdr', 'prev2score', 'prevfdr', 'prevscore', 'team', 'playerid',
       'name', 'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored',
       'influence', 'minutes', 'opponent_team', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'value', 'was_home', 'GW', 'fdr', 'att_form',
       'def_form', 'opp_team_0', 'opp_team_1', 'opp_team_2', 'opp_team_3',
       'opp_team_4', 'opp_team_5', 'opp_team_6', 'opp_team_7', 'opp_team_8',
       'opp_team_9', 'opp_team_10', 'opp_team_11', 'opp_team_12',
       'opp_team_13', 'opp_team_14', 'opp_team_15', 'opp_team_16',
       'opp_team_17', 'opp_team_18', 'opp_team_19', 'opp_team_20'],
      dtype='object')

In [17]:
# Split the frame into model inputs and target. The raw opponent id, the
# player name and the target itself are left out of the inputs.
# NOTE(review): same-gameweek columns such as goals_scored, clean_sheets,
# minutes and influence stay in the inputs; they may leak total_points --
# confirm this is intended.
excluded_columns = ('opponent_team', 'name', 'total_points')
feature_cols = [col for col in data.columns if col not in excluded_columns]
targets = data['total_points']
data = data[feature_cols]

In [18]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split identical on every run, so the errors reported below are repeatable
# after a kernel restart.
xtrain, xtest, ytrain, ytest=train_test_split(data, targets, test_size=0.2, random_state=42)

#### TODO
- Test on handpicked fixtures
- Nailedness (how secure a player's place in the starting line-up is)
- Team value constraints
- ICT (influence, creativity, threat) features

In [19]:
# Random forest baseline. random_state is fixed so the fitted trees (and the
# feature importances read from them) are the same on every run.
reg = ensemble.RandomForestRegressor(n_estimators=500, random_state=42)
reg.fit(xtrain, ytrain)

# Evaluate on the held-out rows.
mse = mean_squared_error(ytest, reg.predict(xtest))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 1.7028


In [20]:
# Pair every importance with its feature name; pprint shows the resulting
# set of (importance, name) tuples in ascending order.
importance_pairs = set(zip(reg.feature_importances_, feature_cols))
pprint.pprint(importance_pairs)

{(0.0, 'opp_team_0'),
 (0.0002598176103875383, 'opp_team_11'),
 (0.00027330263774107816, 'opp_team_15'),
 (0.0002938892191855222, 'opp_team_17'),
 (0.00035984127609075585, 'opp_team_6'),
 (0.00037841423182902845, 'opp_team_14'),
 (0.00040236292118419573, 'opp_team_1'),
 (0.00040277124828974105, 'opp_team_20'),
 (0.0004331950960773143, 'opp_team_8'),
 (0.0004781320474972437, 'opp_team_3'),
 (0.0004913884017794169, 'opp_team_13'),
 (0.0004947991343688795, 'opp_team_10'),
 (0.0005126182169785601, 'opp_team_2'),
 (0.0005754041868439296, 'opp_team_16'),
 (0.0005983639635088915, 'opp_team_19'),
 (0.000610768889232208, 'opp_team_7'),
 (0.0006518461127012819, 'opp_team_18'),
 (0.0007044870644391567, 'opp_team_12'),
 (0.0007280733214819583, 'opp_team_5'),
 (0.0007501470022858241, 'opp_team_4'),
 (0.0008081319796217375, 'opp_team_9'),
 (0.0009002289584592506, 'was_home'),
 (0.0019677074440767853, 'fdr'),
 (0.0022655525266922385, 'prevfdr'),
 (0.0024098153709646956, 'prev2fdr'),
 (0.0024121061621

In [21]:
# AdaBoost comparison model. random_state is fixed so the boosting run is
# repeatable.
ada = ensemble.AdaBoostRegressor(n_estimators=500, random_state=42)
ada.fit(xtrain, ytrain)

# Evaluate on the same held-out rows as the random forest.
mse = mean_squared_error(ytest, ada.predict(xtest))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 2.9211


In [22]:
# Importances first, then the feature names in the same positional order.
print(ada.feature_importances_, feature_cols, sep='\n')

[2.60515998e-02 1.20842340e-02 1.57919933e-03 4.79707839e-04
 4.22077437e-03 1.03912833e-02 0.00000000e+00 2.54556178e-02
 4.77877036e-04 4.13307757e-03 1.16449453e-01 2.64625814e-02
 1.34079372e-02 2.17427429e-01 4.54783167e-01 1.30110191e-02
 3.51920277e-03 5.73505461e-03 1.94818142e-02 6.63615492e-03
 0.00000000e+00 1.84516200e-03 1.16701850e-02 2.37316814e-03
 2.85301240e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 4.65247255e-03 8.46563882e-04 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.51370163e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 8.17580282e-03 0.00000000e+00 0.00000000e+00 4.85045854e-03
 0.00000000e+00 0.00000000e+00]
['team_goals_conceded', 'team_goals_scored', 'prev3fdr', 'prev3score', 'prev2fdr', 'prev2score', 'prevfdr', 'prevscore', 'team', 'playerid', 'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored', 'influence', 'minutes', 'team_a_score', 'team_h_score', 'threat', 'value', 'was_home', 'GW',