In [116]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets, ensemble
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler
import pprint

In [83]:
# Read the cleaned table from disk and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier export -- verify).
data = pd.read_csv('datacleaned.csv').drop(columns=['Unnamed: 0'])

In [84]:
# Columns carried forward into feature engineering; every other column in
# the loaded table is discarded by the selection below.
cols = [
    'team',
    'playerid',
    'name',
    'clean_sheets',
    'creativity',
    'goals_conceded',
    'goals_scored',
    'influence',
    'minutes',
    'opponent_team',
    'team_a_score',
    'team_h_score',
    'threat',
    'total_points',
    'value',
    'was_home',
    'GW',
    'fdr',
]
data = data[cols]

In [85]:
# Distinct player ids, in order of first appearance in `data`.
playerlist = [*data['playerid'].unique()]

In [86]:
# For every player, collect total_points and fdr for gameweeks 1..38 so that
# index g - 1 of each list holds the value for gameweek g.
pointsdict = {}
fdrdict = {}
for player_id in playerlist:
    player_rows = data[data['playerid'] == player_id]
    weekly_points = []
    weekly_fdr = []
    for gameweek in range(1, 39):
        # The first row found for this gameweek is used; `.iloc[0]` raises
        # IndexError when the player has no row for it.
        gameweek_row = player_rows[player_rows['GW'] == gameweek].iloc[0]
        weekly_points.append(gameweek_row['total_points'])
        weekly_fdr.append(gameweek_row['fdr'])
    pointsdict[player_id] = weekly_points
    fdrdict[player_id] = weekly_fdr

In [87]:
# Reserve zero-filled columns for the engineered features. Each insert goes
# to position 0, so the last name listed ends up as the first column.
# Like the original cell, this raises ValueError if run twice on the same frame.
for placeholder_name in (
    'prevscore',
    'prevfdr',
    'prev2score',
    'prev2fdr',
    'prev3score',
    'prev3fdr',
    'team_goals_scored',
    'team_goals_conceded',
):
    data.insert(0, placeholder_name, 0)

In [88]:
def kernel(scores:list)->int:
    """Return a recency-weighted sum of up to three scores.

    The weights are (1, 2, 3) and are right-aligned against ``scores``:
    one score is multiplied by 3, two scores by 2 and 3, three scores by
    1, 2 and 3. An empty list gives 0. When more than three scores are
    passed, only the first three contribute.

    Args:
        scores: Numeric values, with the most heavily weighted one last.

    Returns:
        The weighted sum (same numeric type as the inputs).
    """
    weights = (1, 2, 3)
    # Drop leading weights when fewer than three scores are supplied so the
    # last score always meets the weight 3.
    unused = max(0, len(weights) - len(scores))
    total = 0
    for score, weight in zip(scores, weights[unused:]):
        total += score * weight
    return total

In [89]:
def fixture_finder(team1, team2, data):
    """Return the rows of ``data`` where ``team1`` plays against ``team2``.

    Both teams are given as three-letter abbreviations and translated to the
    numeric codes 1..20 used in the ``team`` and ``opponent_team`` columns.

    Args:
        team1: Abbreviation matched against the ``team`` column.
        team2: Abbreviation matched against the ``opponent_team`` column.
        data: DataFrame with ``team`` and ``opponent_team`` columns.

    Returns:
        The filtered DataFrame (possibly empty).

    Raises:
        KeyError: If either abbreviation is not one of the twenty known ones.
    """
    # Position in this list (1-based) is the numeric team code.
    abbreviations = [
        'ARS', 'AVL', 'BOU', 'BHA', 'BUR', 'CHE', 'CRY', 'EVE', 'LEI', 'LIV',
        'MCI', 'MUN', 'NEW', 'NOR', 'SHU', 'SOU', 'TOT', 'WAT', 'WHU', 'WOL',
    ]
    code_for = {abbr: code for code, abbr in enumerate(abbreviations, start=1)}
    team_code = code_for[team1]
    opponent_code = code_for[team2]
    is_fixture = (data['team'] == team_code) & (data['opponent_team'] == opponent_code)
    return data[is_fixture]

In [90]:
# Per-row buffers, filled in the row order of `data`.
prevscorelist, prev2scorelist, prev3scorelist = [], [], []
prevfdrlist, prev2fdrlist, prev3fdrlist = [], [], []
goals_scored, goals_conceded = [], []
# Declared here but never filled in this cell.
avg_influence, avg_crea, avg_threat = [], [], []

# (lag in gameweeks, score buffer, fdr buffer)
lag_buffers = (
    (1, prevscorelist, prevfdrlist),
    (2, prev2scorelist, prev2fdrlist),
    (3, prev3scorelist, prev3fdrlist),
)

for _, fixture in data.iterrows():
    this_gw = fixture['GW']

    for lag, score_buffer, fdr_buffer in lag_buffers:
        if this_gw in range(1, lag + 1):
            # Fewer than `lag` earlier gameweeks exist: pad with 0.
            score_buffer.append(0)
            fdr_buffer.append(0)
        else:
            # The per-player lists are indexed by gameweek - 1.
            history_index = this_gw - 1 - lag
            score_buffer.append(pointsdict[fixture['playerid']][history_index])
            fdr_buffer.append(fdrdict[fixture['playerid']][history_index])

    # Express the home/away score from the point of view of the row's team.
    if fixture['was_home']:
        goals_for, goals_against = fixture['team_h_score'], fixture['team_a_score']
    else:
        goals_for, goals_against = fixture['team_a_score'], fixture['team_h_score']
    goals_scored.append(goals_for)
    goals_conceded.append(goals_against)

In [91]:
# Overwrite the zero-filled placeholder columns with the per-row values
# collected above (the lists are in the same row order as `data`).
engineered_columns = {
    'prevscore': prevscorelist,
    'prev2score': prev2scorelist,
    'prev3score': prev3scorelist,
    'prevfdr': prevfdrlist,
    'prev2fdr': prev2fdrlist,
    'prev3fdr': prev3fdrlist,
    'team_goals_scored': goals_scored,
    'team_goals_conceded': goals_conceded,
}
for column_name, column_values in engineered_columns.items():
    data[column_name] = column_values

In [92]:
# Rolling team "form" entering each gameweek: a recency-weighted sum (see
# `kernel`) of the team's goals scored / conceded over up to three preceding
# gameweeks. Defensive form is negated so that conceding more gives a lower
# value. Gameweek 1 has no history and gets 0.
#
# The lists are built in the row order of `data`, so assigning them as
# columns afterwards is correct whatever order the frame is sorted in.
# (Building them player-by-player only lines up with the rows when `data`
# happens to be sorted by player and then by gameweek.)
goalscoring_form = []
defensive_form = []
influence_total = []
creativity_total = []
threat_total = []

i_avg = []
c_avg = []
t_avg = []

# (playerid, GW) -> (team goals scored, team goals conceded), taken from the
# first row for each pair. Built once instead of re-filtering the frame for
# every element of every window.
first_rows = data.drop_duplicates(subset=['playerid', 'GW'])
goals_by_fixture = {
    (player_id, gw): (scored, conceded)
    for player_id, gw, scored, conceded in zip(
        first_rows['playerid'],
        first_rows['GW'],
        first_rows['team_goals_scored'],
        first_rows['team_goals_conceded'],
    )
}

for player_id, gw in zip(data['playerid'], data['GW']):
    if gw == 1:
        goalscoring_form.append(0)
        defensive_form.append(0)
        # The ICT (influence / creativity / threat) running averages are
        # currently disabled, so these lists only ever receive this
        # gameweek-1 placeholder. If they are revived, note that a mean over
        # the previous gameweeks presumably needs `total / (gw - 1)`;
        # `total / gw - 1` divides first and then subtracts 1.
        influence_total.append(0)
        creativity_total.append(0)
        threat_total.append(0)
        i_avg.append(0)
        c_avg.append(0)
        t_avg.append(0)
        continue

    # Up to three gameweeks immediately before `gw`, oldest first.
    # A missing (player, gameweek) pair raises KeyError here.
    window = range(max(1, gw - 3), gw)
    sco = [goals_by_fixture[(player_id, x)][0] for x in window]
    con = [goals_by_fixture[(player_id, x)][1] for x in window]
    goalscoring_form.append(kernel(sco))
    defensive_form.append(-1 * kernel(con))

In [93]:
# Attach the attacking / defensive form lists as new columns. The lists are
# matched to rows purely by position.
for form_column, form_values in (
    ('att_form', goalscoring_form),
    ('def_form', defensive_form),
):
    data[form_column] = form_values
# ICT running averages are currently disabled:
#data['influence']=i_avg
#data['creativity']=c_avg
#data['threat']=t_avg

In [94]:
# Display the column index to check the layout after feature engineering.
data.columns

Index(['team_goals_conceded', 'team_goals_scored', 'prev3fdr', 'prev3score',
       'prev2fdr', 'prev2score', 'prevfdr', 'prevscore', 'team', 'playerid',
       'name', 'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored',
       'influence', 'minutes', 'opponent_team', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'value', 'was_home', 'GW', 'fdr', 'att_form',
       'def_form'],
      dtype='object')

#### Scaling and one-hot encoding

In [96]:
# Min-max scale the difficulty rating and the two form columns to [0, 1].
# Series.min()/max() are used instead of the builtins: they are vectorised
# and skip NaN, whereas builtin min()/max() over a Series containing NaN
# return a result that depends on where the NaN sits.
for scaled_column in ('fdr', 'att_form', 'def_form'):
    shifted = data[scaled_column] - data[scaled_column].min()
    value_range = shifted.max()
    if value_range == 0:
        # Constant column: dividing would produce 0/0 = NaN, so keep it at 0.0.
        data[scaled_column] = shifted.astype(float)
    else:
        data[scaled_column] = shifted / value_range

In [97]:
# One indicator column per distinct opponent code, named 'opp_team_<code>',
# copied onto `data` column by column.
opp_team = pd.get_dummies(data['opponent_team'], prefix='opp_team')
for dummy_name in opp_team:
    data[dummy_name] = opp_team[dummy_name]

In [98]:
# Display the column index again to confirm the one-hot columns were added.
data.columns

Index(['team_goals_conceded', 'team_goals_scored', 'prev3fdr', 'prev3score',
       'prev2fdr', 'prev2score', 'prevfdr', 'prevscore', 'team', 'playerid',
       'name', 'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored',
       'influence', 'minutes', 'opponent_team', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'value', 'was_home', 'GW', 'fdr', 'att_form',
       'def_form', 'opp_team_0', 'opp_team_1', 'opp_team_2', 'opp_team_3',
       'opp_team_4', 'opp_team_5', 'opp_team_6', 'opp_team_7', 'opp_team_8',
       'opp_team_9', 'opp_team_10', 'opp_team_11', 'opp_team_12',
       'opp_team_13', 'opp_team_14', 'opp_team_15', 'opp_team_16',
       'opp_team_17', 'opp_team_18', 'opp_team_19', 'opp_team_20'],
      dtype='object')

In [119]:
# Separate the target from the features. 'name' is text and 'opponent_team'
# is already represented by the one-hot columns, so both are dropped along
# with the target itself.
# NOTE(review): the remaining features still include same-gameweek columns
# (goals_scored, clean_sheets, influence, minutes, team_a_score, ...).
# If the goal is to predict points before a match is played, these look
# like target leakage -- confirm the intended use.
targets = data['total_points']
feature_cols = [
    column
    for column in data.columns
    if column not in ('name', 'opponent_team', 'total_points')
]
data = data[feature_cols]

In [120]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and every metric computed from it) repeatable across kernel restarts.
xtrain, xtest, ytrain, ytest = train_test_split(
    data, targets, test_size=0.2, random_state=42
)

#### TODO
- Test on hand-picked fixtures
- Nailedness (how secure a player's place in the starting line-up is)
- Team value constraints
- ICT (influence, creativity, threat) features

In [121]:
# Fit a 500-tree random forest and report its test-set MSE.
# random_state pins the forest's internal randomness so that re-running the
# cell on the same split reproduces the same model and score. Output saved
# from earlier, unseeded runs will not match exactly.
reg = ensemble.RandomForestRegressor(n_estimators=500, random_state=42)
reg.fit(xtrain, ytrain)

mse = mean_squared_error(ytest, reg.predict(xtest))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 1.8491


In [122]:
# Pair each importance with its feature name and pretty-print the pairs.
# They are kept in a set, as before, so the printed form is unchanged.
importance_pairs = set(zip(reg.feature_importances_, feature_cols))
pprint.pprint(importance_pairs)

{(0.0, 'opp_team_0'),
 (0.0002436455207068446, 'opp_team_15'),
 (0.00027479760078611906, 'opp_team_17'),
 (0.00031977243069369806, 'opp_team_11'),
 (0.00039956805185898216, 'opp_team_13'),
 (0.000401383099670043, 'opp_team_14'),
 (0.0004031209983454447, 'opp_team_20'),
 (0.00046213390862063933, 'opp_team_8'),
 (0.00047067534221664276, 'opp_team_3'),
 (0.00047879882587596257, 'opp_team_6'),
 (0.0005036429290671417, 'opp_team_5'),
 (0.0005322384384433684, 'opp_team_2'),
 (0.0005624613818135268, 'opp_team_1'),
 (0.0005917139447704768, 'opp_team_12'),
 (0.0006001811829834796, 'opp_team_10'),
 (0.0006848515992255646, 'opp_team_7'),
 (0.000739087711698267, 'opp_team_16'),
 (0.0008588411566995634, 'opp_team_9'),
 (0.0008609684619986002, 'opp_team_19'),
 (0.0008706148004423529, 'opp_team_18'),
 (0.0009703011586234904, 'opp_team_4'),
 (0.0010235812391515184, 'was_home'),
 (0.002077341580232461, 'fdr'),
 (0.002102866553801783, 'prevfdr'),
 (0.0022823851084708044, 'prev2fdr'),
 (0.002352248422727

In [123]:
# Fit a 500-estimator AdaBoost regressor on the same split and report its
# test-set MSE. random_state pins the booster's internal randomness so the
# result is repeatable. Output saved from earlier, unseeded runs will not
# match exactly.
ada = ensemble.AdaBoostRegressor(n_estimators=500, random_state=42)
ada.fit(xtrain, ytrain)

mse = mean_squared_error(ytest, ada.predict(xtest))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 3.0017


In [124]:
# Print the AdaBoost importances followed by the feature names; the two
# sequences are in the same order, so position i of one describes position i
# of the other.
print(ada.feature_importances_, feature_cols, sep='\n')

[0.03288038 0.00671739 0.00279261 0.00226614 0.         0.01570042
 0.         0.0011684  0.         0.00587152 0.13003564 0.01620116
 0.01223506 0.2291164  0.47033282 0.0104358  0.00109233 0.00058194
 0.01682858 0.00592183 0.00093892 0.01368261 0.00256331 0.00835865
 0.00365259 0.         0.         0.         0.         0.00826383
 0.         0.         0.         0.         0.00124101 0.
 0.         0.00112064 0.         0.         0.         0.
 0.         0.         0.         0.        ]
['team_goals_conceded', 'team_goals_scored', 'prev3fdr', 'prev3score', 'prev2fdr', 'prev2score', 'prevfdr', 'prevscore', 'team', 'playerid', 'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored', 'influence', 'minutes', 'team_a_score', 'team_h_score', 'threat', 'value', 'was_home', 'GW', 'fdr', 'att_form', 'def_form', 'opp_team_0', 'opp_team_1', 'opp_team_2', 'opp_team_3', 'opp_team_4', 'opp_team_5', 'opp_team_6', 'opp_team_7', 'opp_team_8', 'opp_team_9', 'opp_team_10', 'opp_team_11', 'o