In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets, ensemble
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
import pprint
import numpy as np

In [2]:
def import_data(path='datacleaned.csv'):
    data=pd.read_csv(path)
    data=data.drop(['Unnamed: 0'], axis=1)
    cols=['team', 'playerid', 'name', 'clean_sheets', 'creativity',
       'goals_conceded', 'goals_scored', 'influence', 'minutes',
       'opponent_team', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'value', 'was_home', 'GW',
       'fdr']
    data=data[cols]
    return data

In [22]:
def kernel(scores:list)->int:
    '''
    weight past scores/form by weights
    currently weights are 3,2,1
    handles data of length 2/1 also
    '''
    if(len(scores)==1):
        return scores[0]*3
    if(len(scores)==2):
        return scores[0]*2+scores[1]*3
    weights=[1,2,3]
    return sum([x*y for x,y in zip(scores,weights)])

In [52]:
def fixture_finder(team1, team2, data):
    '''
    helped function to find players in the fixture 'TEAM1 vs TEAM2'
    team codes are fetched from dict
    '''
    teamdict={1:'ARS', 2:'AVL',3:'BOU', 4:'BHA', 5:'BUR', 6:'CHE', 7:'CRY', 8:'EVE', 9:'LEI', 10:'LIV', 11:'MCI', 12:'MUN',
          13:'NEW', 14:'NOR', 15:'SHU', 16:'SOU', 17:'TOT', 18:'WAT', 19:'WHU', 20:'WOL'}
    rev_team = {v: k for k, v in teamdict.items()}
    t1code=rev_team[team1]
    t2code=rev_team[team2]
    selectionH=data[(data['team']==t1code) & (data['opponent_team']==t2code) & (data['was_home']==1)]
    selectionA=data[(data['team']==t2code) & (data['opponent_team']==t1code) & (data['was_home']==0)]
    return pd.concat([selectionH, selectionA])

In [20]:
def prevfeatures(data):
    '''
    synthesising features about the last 3 fixtures
        points the player has got in last 3 fixtures
        fdr of the team the player faced in the last 3 fixtures
        goals the team scored in the last 3 fixtures
        goals the team conceded in the last 3 fixtures
        (last 2 can be used to see talisman status)
    '''
    playerlist=list(data['playerid'].unique())
    pointsdict={}
    fdrdict={}
    for i in playerlist:
        ptslist=[]
        fdrlist=[]
        selection=data[data['playerid']==i]
        for gw in range(1,39):
            ptslist.append(selection[selection['GW']==gw].iloc[0]['total_points'])
            fdrlist.append(selection[selection['GW']==gw].iloc[0]['fdr'])
        pointsdict[i]=ptslist
        fdrdict[i]=fdrlist
    
    prevscorelist = []
    prev2scorelist = []
    prev3scorelist = []
    prevfdrlist = []
    prev2fdrlist = []
    prev3fdrlist = []
    goals_scored = []
    goals_conceded = []

    for index, row in data.iterrows():
        currgw=row['GW']

        if currgw==1:
            prevscorelist.append(0)
            prevfdrlist.append(0)
        else:
            prevscorelist.append(pointsdict[row['playerid']][currgw-2])
            prevfdrlist.append(fdrdict[row['playerid']][currgw-2])

        if currgw in [1,2]:
            prev2scorelist.append(0)
            prev2fdrlist.append(0)
        else:
            prev2scorelist.append(pointsdict[row['playerid']][currgw-3])
            prev2fdrlist.append(fdrdict[row['playerid']][currgw-3])

        if currgw in [1,2,3]:
            prev3scorelist.append(0)
            prev3fdrlist.append(0)
        else:
            prev3scorelist.append(pointsdict[row['playerid']][currgw-4])
            prev3fdrlist.append(fdrdict[row['playerid']][currgw-4])

        if row['was_home']:
            goals_scored.append(row['team_h_score'])
            goals_conceded.append(row['team_a_score'])
        else:
            goals_scored.append(row['team_a_score'])
            goals_conceded.append(row['team_h_score'])
    data['prevscore']=prevscorelist
    data['prev2score']=prev2scorelist
    data['prev3score']=prev3scorelist
    data['prevfdr']=prevfdrlist
    data['prev2fdr']=prev2fdrlist
    data['prev3fdr']=prev3fdrlist
    data['team_goals_scored']=goals_scored
    data['team_goals_conceded']=goals_conceded
    
    return data

In [23]:
def ICT(data):
    '''
    compute average influence, creativity and threat scores of a player, till that point in the season
    ICT scores of a gameweek counted only if the player has played more than 30mins
    '''
    playerlist=list(data['playerid'].unique())
    i_avg=[]
    c_avg=[]
    t_avg=[]
    
    for i in playerlist:
        selection=data[data['playerid']==i]
        influence_total=[]
        creativity_total=[]
        threat_total=[]
        
        for gw in range(1,39):
            if gw==1:
                i_avg.append(0)
                c_avg.append(0)
                t_avg.append(0)
            else:
                if(selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['minutes']>30):
                    influence_total.append(selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['influence'])
                    creativity_total.append(selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['creativity'])
                    threat_total.append(selection[(selection['playerid']==i)&(selection['GW']==gw)].iloc[0]['threat'])
                
                i_avg.append(sum(influence_total)/(len(influence_total)+1))
                c_avg.append(sum(creativity_total)/(len(creativity_total)+1))
                t_avg.append(sum(threat_total)/(len(threat_total)+1))
    
    data['influence']=i_avg
    data['creativity']=c_avg
    data['threat']=t_avg
    return data

In [24]:
def team_form(data):
    '''
    attacking and defensive form of a team, weighted by the kernel
    '''
    playerlist=list(data['playerid'].unique())
    goalscoring_form=[]
    defensive_form=[]

    for i in playerlist:
        selection=data[data['playerid']==i]

        for gw in range(1,39):
            if gw==1:
                goalscoring_form.append(0)
                defensive_form.append(0)
            else:
                st_gw=max(1,gw-3)
                sco=[selection[(selection['playerid']==i)&(selection['GW']==x)].iloc[0]['team_goals_scored'] for x in range(st_gw, gw)]
                con=[selection[(selection['playerid']==i)&(selection['GW']==x)].iloc[0]['team_goals_conceded'] for x in range(st_gw, gw)]
                goalscoring_form.append(kernel(sco))
                defensive_form.append(-1*kernel(con))
    data['att_form']=goalscoring_form
    data['def_form']=defensive_form
    return data

In [25]:
def scaling(data):
    '''
    scaling FDR, ICT, and team form to the range 0-1
    '''
    #fixture difficulty
    data['fdr']-=min(data['fdr'])
    data['fdr']/=max(data['fdr'])
    
    #attacking and defensive form
    data['att_form']-=min(data['att_form'])
    data['att_form']/=max(data['att_form'])
    data['def_form']-=min(data['def_form'])
    data['def_form']/=max(data['def_form'])
    
    #ICT
    data['influence']-=min(data['influence'])
    data['influence']/=max(data['influence'])
    data['creativity']-=min(data['creativity'])
    data['creativity']/=max(data['creativity'])
    data['threat']-=min(data['threat'])
    data['threat']/=max(data['threat'])
    return data

In [9]:
def OHE(data):
    #opponent team
    opp_team=pd.get_dummies(data['opponent_team'], prefix='opp_team')
    for col in opp_team.columns:
        data[col]=opp_team[col]
    
    #player IDs
    ids=pd.get_dummies(data['playerid'], prefix='id')
    for col in ids.columns:
        data[col]=ids[col]
        
    return data

In [10]:
def filter_split(data, feature_cols, target_col, test_ratio=0.2):
    targets=data[target_col]
    newdata=data[feature_cols]
    xtrain, xtest, ytrain, ytest=train_test_split(newdata, targets, test_size=test_ratio)
    return xtrain, xtest, ytrain, ytest

#### todo
- nailedness
- team value constraints
- simulate a set of fixtures

In [11]:
def execute(verbose=False):
    if verbose:
        print("Importing data...")
    data=import_data()
    if verbose:
        print("Done.")
    if verbose:
        print("Making features from 3 previous fixtures...")
    data=prevfeatures(data)
    if verbose:
        print("Done.")
    if verbose:
        print("Creating ICT features...")
    data=ICT(data)
    if verbose:
        print("Done.")
    if verbose:
        print("Incorporating team form...")
    data=team_form(data)
    if verbose:
        print("Done.")
    if verbose:
        print("Scaling features...")
    data=scaling(data)
    if verbose:
        print("Done.")
    if verbose:
        print("Performing one-hot encoding...")
    data=OHE(data)
    if verbose:
        print("Done.")
    if verbose:
        print("Splitting data for testing...")
    feature_cols=[col for col in data.columns if col not in ['opponent_team', 'name', 'total_points', 'playerid',
                                                         'goals_conceded', 'goals_scored', 'clean_sheets', 'minutes',
                                                         'team_goals_scored', 'team_goals_conceded']]    
    target_col='total_points'
    xtrain, xtest, ytrain, ytest=filter_split(data, feature_cols, target_col)
    if verbose:
        print("Done.")
    return xtrain, xtest, ytrain, ytest, data

In [12]:
xtrain, xtest, ytrain, ytest, data=execute(verbose=True)

Importing data...
Done.
Making features from 3 previous fixtures...
Done.
Creating ICT features...
Done.
Incorporating team form...
Done.
Scaling features...
Done.
Performing one-hot encoding...
Done.
Splitting data for testing...
Done.


In [19]:
xtrain

Unnamed: 0,team,creativity,influence,team_a_score,team_h_score,threat,value,was_home,GW,fdr,...,id_457,id_460,id_463,id_468,id_471,id_488,id_494,id_502,id_525,id_618
1419,11,0.270687,0.383250,2.0,2.0,0.762814,58,False,14,0.4,...,0,0,0,0,0,0,0,0,0,0
3373,20,0.246743,0.298745,1.0,0.0,0.638957,83,False,32,0.4,...,0,0,0,0,0,0,0,0,0,0
759,7,0.496108,0.359695,2.0,1.0,0.115627,50,False,8,0.4,...,0,0,0,0,0,0,0,0,0,0
2106,4,0.063169,0.560177,0.0,2.0,0.214378,58,True,20,0.4,...,0,0,0,0,0,0,0,0,0,0
3256,17,0.037066,0.240710,0.0,2.0,0.000000,83,True,31,0.4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,17,0.302584,0.294176,0.0,4.0,0.095745,110,True,5,0.4,...,0,0,0,0,0,0,0,0,0,0
2905,11,0.255571,0.491605,0.0,3.0,0.311236,96,True,28,0.6,...,0,0,0,0,0,0,0,0,0,0
1442,15,0.346883,0.483033,1.0,1.0,0.876121,46,False,14,0.6,...,0,0,0,0,0,0,0,0,0,0
665,9,0.000000,0.377149,0.0,5.0,0.000000,50,True,7,0.4,...,0,0,0,0,0,0,0,0,0,0


### Random Forest

In [16]:
reg = ensemble.RandomForestRegressor(n_estimators=500)
reg.fit(xtrain, ytrain)

mse = mean_squared_error(ytest, reg.predict(xtest))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 9.8842


In [17]:
feature_cols=[col for col in data.columns if col not in ['opponent_team', 'name', 'total_points', 'playerid',
                                                         'goals_conceded', 'goals_scored', 'clean_sheets', 'minutes',
                                                         'team_goals_scored', 'team_goals_conceded']]
pprint.pprint(set(zip(reg.feature_importances_, feature_cols)))

{(0.00010637694500980661, 'opp_team_0'),
 (0.0002086370822541996, 'id_423'),
 (0.0004187406815737973, 'id_14'),
 (0.0004868575575569603, 'id_293'),
 (0.0005062085964274357, 'id_488'),
 (0.0005147253979617685, 'id_47'),
 (0.000530551397366134, 'id_403'),
 (0.0005546803509559835, 'id_148'),
 (0.0005725868868768505, 'id_133'),
 (0.0005748599437090441, 'id_448'),
 (0.0006192579627899882, 'id_395'),
 (0.0006812152270143581, 'id_225'),
 (0.0006918717959839081, 'id_12'),
 (0.0006982603560163561, 'id_171'),
 (0.0007144403362341695, 'id_67'),
 (0.0007322668157318409, 'id_457'),
 (0.0008233430697560963, 'id_162'),
 (0.0008233957311524635, 'id_212'),
 (0.000829624741768544, 'id_318'),
 (0.0008588953311542961, 'id_97'),
 (0.000881389845977284, 'id_164'),
 (0.00089851380982295, 'id_131'),
 (0.0009154881821620654, 'id_199'),
 (0.0009313879982100323, 'id_411'),
 (0.0009360248822616613, 'id_254'),
 (0.0009452563980368968, 'id_203'),
 (0.0009645684542106143, 'id_122'),
 (0.0009743275069036606, 'id_187'

In [49]:
sel=fixture_finder('SOU', 'LEI', data)
res=sel['total_points']
sel=sel[feature_cols]

In [50]:
sel

Unnamed: 0,team,creativity,influence,team_a_score,team_h_score,threat,value,was_home,GW,fdr,...,id_457,id_460,id_463,id_468,id_471,id_488,id_494,id_502,id_525,id_618
1023,16,0.04581,0.32677,9.0,0.0,0.126531,59,True,10,0.6,...,0,0,0,0,0,0,0,0,0,0
1024,16,0.055197,0.326123,9.0,0.0,0.123112,43,True,10,0.6,...,0,0,0,0,0,0,0,0,0,0
1025,16,0.054198,0.318066,9.0,0.0,0.119872,63,True,10,0.6,...,0,0,0,0,0,0,0,0,0,0
1026,16,0.0,0.0,9.0,0.0,0.0,58,True,10,0.6,...,0,0,0,0,0,0,0,0,0,0
342,9,0.0,0.0,1.0,1.0,0.0,55,False,4,0.6,...,0,0,0,0,0,0,0,0,0,0
977,9,0.290695,0.326268,9.0,0.0,0.099655,62,False,10,0.6,...,0,0,0,0,0,0,0,0,0,0
979,9,0.300835,0.353439,9.0,0.0,0.109405,54,False,10,0.6,...,0,0,0,0,0,0,0,0,0,0
980,9,0.303587,0.348353,9.0,0.0,0.109671,50,False,10,0.6,...,0,0,0,0,0,0,0,0,0,0
981,9,0.316366,0.345066,9.0,0.0,0.109917,47,False,10,0.6,...,0,0,0,0,0,0,0,0,0,0
982,9,0.306505,0.333888,9.0,0.0,0.10692,91,False,10,0.6,...,0,0,0,0,0,0,0,0,0,0


In [51]:
print(reg.predict(sel))
print(np.column_stack(res))

[ 4.548  0.664  2.974  3.02   1.732  6.71  14.092  6.694  6.796 16.058
  7.172  8.246  8.4    7.056 10.024]
[[ 1  0  2  2  1  6 19  6  6 20  7  8  9 20 11]]


### XGBoost

In [17]:
xgb=XGBRegressor(objective ='reg:linear', n_estimators = 500)
xgb.fit(xtrain, ytrain)
mse = mean_squared_error(ytest, xgb.predict(xtest))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 12.8955
