In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets, ensemble
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler
import pprint

In [2]:
data=pd.read_csv('datacleaned.csv')
data=data.drop(['Unnamed: 0'], axis=1)

In [3]:
cols=['team', 'playerid', 'name', 'clean_sheets', 'creativity',
       'goals_conceded', 'goals_scored', 'influence', 'minutes',
       'opponent_team', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'value', 'was_home', 'GW',
       'fdr']
data=data[cols]

In [4]:
playerlist=list(data['playerid'].unique())

In [5]:
pointsdict={}
fdrdict={}
for i in playerlist:
    ptslist=[]
    fdrlist=[]
    selection=data[data['playerid']==i]
    for gw in range(1,39):
        ptslist.append(selection[selection['GW']==gw].iloc[0]['total_points'])
        fdrlist.append(selection[selection['GW']==gw].iloc[0]['fdr'])
    pointsdict[i]=ptslist
    fdrdict[i]=fdrlist

In [6]:
data.insert(0, 'prevscore', 0)
data.insert(0, 'prevfdr', 0)
data.insert(0, 'prev2score', 0)
data.insert(0, 'prev2fdr', 0)
data.insert(0, 'prev3score', 0)
data.insert(0, 'prev3fdr', 0)
data.insert(0, 'team_goals_scored', 0)
data.insert(0, 'team_goals_conceded', 0)

In [7]:
def kernel(scores:list)->int:
    if(len(scores)==1):
        return scores[0]*3
    if(len(scores)==2):
        return scores[0]*2+scores[1]*3
    weights=[1,2,3]
    return sum([x*y for x,y in zip(scores,weights)])

In [8]:
def fixture_finder(team1, team2, data):
    teamdict={1:'ARS', 2:'AVL',3:'BOU', 4:'BHA', 5:'BUR', 6:'CHE', 7:'CRY', 8:'EVE', 9:'LEI', 10:'LIV', 11:'MCI', 12:'MUN',
          13:'NEW', 14:'NOR', 15:'SHU', 16:'SOU', 17:'TOT', 18:'WAT', 19:'WHU', 20:'WOL'}
    rev_team = {v: k for k, v in teamdict.items()}
    t1code=rev_team[team1]
    t2code=rev_team[team2]
    selection=data[(data['team']==t1code) & (data['opponent_team']==t2code)]
    return selection

In [9]:
prevscorelist = []
prev2scorelist = []
prev3scorelist = []
prevfdrlist = []
prev2fdrlist = []
prev3fdrlist = []
goals_scored = []
goals_conceded = []
avg_influence=[]
avg_crea=[]
avg_threat=[]

for index, row in data.iterrows():
    currgw=row['GW']
    
    if currgw==1:
        prevscorelist.append(0)
        prevfdrlist.append(0)
    else:
        prevscorelist.append(pointsdict[row['playerid']][currgw-2])
        prevfdrlist.append(fdrdict[row['playerid']][currgw-2])
    
    if currgw in [1,2]:
        prev2scorelist.append(0)
        prev2fdrlist.append(0)
    else:
        prev2scorelist.append(pointsdict[row['playerid']][currgw-3])
        prev2fdrlist.append(fdrdict[row['playerid']][currgw-3])
    
    if currgw in [1,2,3]:
        prev3scorelist.append(0)
        prev3fdrlist.append(0)
    else:
        prev3scorelist.append(pointsdict[row['playerid']][currgw-4])
        prev3fdrlist.append(fdrdict[row['playerid']][currgw-4])
    
    if row['was_home']:
        goals_scored.append(row['team_h_score'])
        goals_conceded.append(row['team_a_score'])
    else:
        goals_scored.append(row['team_a_score'])
        goals_conceded.append(row['team_h_score'])

In [10]:
data['prevscore']=prevscorelist
data['prev2score']=prev2scorelist
data['prev3score']=prev3scorelist
data['prevfdr']=prevfdrlist
data['prev2fdr']=prev2fdrlist
data['prev3fdr']=prev3fdrlist
data['team_goals_scored']=goals_scored
data['team_goals_conceded']=goals_conceded

In [11]:
goalscoring_form=[]
defensive_form=[]

i_avg=[]
c_avg=[]
t_avg=[]

for i in playerlist:
    selection=data[data['playerid']==i]
    influence_total=[]
    creativity_total=[]
    threat_total=[]
    for gw in range(1,39):
        if gw==1:
            goalscoring_form.append(0)
            defensive_form.append(0)
            i_avg.append(0)
            c_avg.append(0)
            t_avg.append(0)
        else:
            st_gw=max(1,gw-3)
            sco=[selection[(selection['playerid']==i)&(selection['GW']==x)].iloc[0]['team_goals_scored'] for x in range(st_gw, gw)]
            con=[selection[(selection['playerid']==i)&(selection['GW']==x)].iloc[0]['team_goals_conceded'] for x in range(st_gw, gw)]
            goalscoring_form.append(kernel(sco))
            defensive_form.append(-1*kernel(con))
            
            if(selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['minutes']>30):
                influence_total.append(selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['influence'])
                creativity_total.append(selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['creativity'])
                threat_total.append(selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['threat'])
            
            i_avg.append(sum(influence_total)/(len(influence_total)+1))
            c_avg.append(sum(creativity_total)/(len(creativity_total)+1))
            t_avg.append(sum(threat_total)/(len(threat_total)+1))
            
            '''
            influence_total.append(selection[(selection['playerid']==i)&(selection['GW']==gw-1)].iloc[0]['influence']
                                  + selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['influence'])
            creativity_total.append(selection[(selection['playerid']==i)&(selection['GW']==gw-1)].iloc[0]['creativity']
                                  + selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['creativity'])
            threat_total.append(selection[(selection['playerid']==i)&(selection['GW']==gw-1)].iloc[0]['threat']
                                  + selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['threat'])
            i_avg.append(influence_total[-1]/gw-1)
            c_avg.append(creativity_total[-1]/gw-1)
            t_avg.append(threat_total[-1]/gw-1)
            '''

In [12]:
data['att_form']=goalscoring_form
data['def_form']=defensive_form
data['influence']=i_avg
data['creativity']=c_avg
data['threat']=t_avg

#### Scaling and one-hot encoding

In [13]:
data['fdr']-=min(data['fdr'])
data['fdr']/=max(data['fdr'])

data['att_form']-=min(data['att_form'])
data['att_form']/=max(data['att_form'])
data['def_form']-=min(data['def_form'])
data['def_form']/=max(data['def_form'])
data['influence']-=min(data['influence'])
data['influence']/=max(data['influence'])
data['creativity']-=min(data['creativity'])
data['creativity']/=max(data['creativity'])
data['threat']-=min(data['threat'])
data['threat']/=max(data['threat'])

In [14]:
opp_team=pd.get_dummies(data['opponent_team'], prefix='opp_team')
for col in opp_team.columns:
    data[col]=opp_team[col]

In [29]:
ids=pd.get_dummies(data['playerid'], prefix='id')
for col in ids.columns:
    data[col]=ids[col]

In [30]:
data.columns

Index(['team_goals_conceded', 'team_goals_scored', 'prev3fdr', 'prev3score',
       'prev2fdr', 'prev2score', 'prevfdr', 'prevscore', 'team', 'playerid',
       ...
       'id_457', 'id_460', 'id_463', 'id_468', 'id_471', 'id_488', 'id_494',
       'id_502', 'id_525', 'id_618'],
      dtype='object', length=155)

In [31]:
feature_cols=[col for col in data.columns if col not in ['opponent_team', 'name', 'total_points', 'playerid',
                                                         'goals_conceded', 'goals_scored', 'clean_sheets', 'minutes']]
targets=data['total_points']
newdata=data[feature_cols]

In [32]:
xtrain, xtest, ytrain, ytest=train_test_split(newdata, targets, test_size=0.1)

#### todo
- test on handpicked fixtures
- nailedness
- team value constraints
- ict

In [39]:
reg = ensemble.RandomForestRegressor(n_estimators=500)
reg.fit(xtrain, ytrain)

mse = mean_squared_error(ytest, reg.predict(xtest))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 9.8373


In [40]:
pprint.pprint(set(zip(reg.feature_importances_, feature_cols)))

{(0.00012782645502537086, 'id_423'),
 (0.00013838363445679944, 'id_403'),
 (0.00015419951218928894, 'id_148'),
 (0.00024885100472883713, 'id_42'),
 (0.00026885238083585366, 'id_133'),
 (0.0002889582814767433, 'id_47'),
 (0.0003047601207851181, 'id_164'),
 (0.0003344574414537141, 'id_293'),
 (0.00035636603026440925, 'id_457'),
 (0.0003600413407746676, 'id_280'),
 (0.00042071879938464343, 'id_14'),
 (0.0004245795156642854, 'id_396'),
 (0.0004380040643804883, 'id_162'),
 (0.0004417626327343325, 'id_225'),
 (0.0004548815303271964, 'id_235'),
 (0.00047224911353179, 'id_262'),
 (0.0005493506700032402, 'id_160'),
 (0.0005759215019449944, 'id_366'),
 (0.0005848974895865865, 'id_295'),
 (0.0005874477244798855, 'id_212'),
 (0.0005930569487637916, 'id_67'),
 (0.0005991134033208748, 'id_131'),
 (0.0006041435306031248, 'id_291'),
 (0.0007071859567392139, 'id_83'),
 (0.0007114535206205116, 'id_266'),
 (0.0007337015731021005, 'id_122'),
 (0.0007803100690218036, 'opp_team_0'),
 (0.0008268632574535468,

In [35]:
ada = ensemble.AdaBoostRegressor(n_estimators=500)
ada.fit(xtrain, ytrain)

mse = mean_squared_error(ytest, ada.predict(xtest))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 17.7112


In [36]:
print(ada.feature_importances_)
print(feature_cols)

[9.60352057e-02 1.95063875e-01 4.36860464e-03 1.79344238e-02
 2.40174644e-02 3.50582402e-02 2.45698498e-03 4.46540464e-02
 1.96821592e-02 6.23997839e-02 4.62810144e-02 7.91010032e-02
 3.85609716e-03 5.27994671e-02 6.09288941e-02 2.84062672e-03
 1.59205401e-02 3.90365843e-04 2.79345316e-02 3.60748949e-02
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.74339914e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 2.83396935e-03 0.00000000e+00 0.00000000e+00
 2.35888835e-05 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.05642390e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.80152792e-03 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 2.96689627e-03
 0.00000000e+00 4.96207411e-04 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.000000