In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets, ensemble
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
import pprint
import numpy as np

In [2]:
def import_data(path='datacleaned.csv'):
    """Read the cleaned CSV and keep only the columns used for modelling.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        An independent frame containing exactly the columns listed in
        ``cols`` below, in that order.

    Raises
    ------
    KeyError
        If any of the required columns is missing from the file.
    """
    data=pd.read_csv(path)
    # 'Unnamed: 0' is the name pandas gives a header-less first column (usually
    # an index written out by ``to_csv``). Files saved without it are fine too.
    data=data.drop(columns=['Unnamed: 0'], errors='ignore')
    cols=['team', 'playerid', 'name', 'clean_sheets', 'creativity',
       'goals_conceded', 'goals_scored', 'influence', 'minutes',
       'opponent_team', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'value', 'was_home', 'GW',
       'fdr']
    # Later steps add columns to this frame in place; returning a copy keeps
    # those writes off a slice of the frame that was read from disk.
    return data[cols].copy()

In [3]:
def kernel(scores:list)->int:
    """Recency-weighted sum of up to the three most recent scores.

    ``scores`` is ordered oldest to newest. The newest entry is weighted 3,
    the one before it 2 and the one before that 1. Shorter inputs use only the
    weights for the entries that exist, and an empty input gives 0.

    Inputs longer than three entries are reduced to their last three, so the
    most recent score is always the one weighted 3.

    Parameters
    ----------
    scores : list
        Numeric scores, oldest first.

    Returns
    -------
    The weighted sum (same numeric type as the inputs).
    """
    weights=[1,2,3]
    # Align both sequences on their tails so the newest score meets weight 3.
    recent=scores[-len(weights):]
    tail_weights=weights[len(weights)-len(recent):]
    return sum(score*weight for score, weight in zip(recent, tail_weights))

In [4]:
def fixture_finder(team1, team2, data):
    """Return the rows of ``data`` for ``team1`` playing ``team2`` at home.

    Parameters
    ----------
    team1, team2 : str
        Three-letter abbreviations from the table below; ``team1`` is matched
        against the ``team`` column and ``team2`` against ``opponent_team``.
    data : pandas.DataFrame
        Frame with ``team``, ``opponent_team`` and ``was_home`` columns, where
        teams are stored as the 1-based numbers assigned below.

    Returns
    -------
    pandas.DataFrame
        The matching rows with ``was_home == 1``.

    Raises
    ------
    KeyError
        If either abbreviation is not in the table.
    """
    # Position in this tuple (counting from 1) is the numeric team code.
    abbreviations=('ARS', 'AVL', 'BOU', 'BHA', 'BUR', 'CHE', 'CRY', 'EVE', 'LEI', 'LIV',
                   'MCI', 'MUN', 'NEW', 'NOR', 'SHU', 'SOU', 'TOT', 'WAT', 'WHU', 'WOL')
    code_of={abbr: number for number, abbr in enumerate(abbreviations, start=1)}
    home_code=code_of[team1]
    away_code=code_of[team2]

    is_team=data['team']==home_code
    is_opponent=data['opponent_team']==away_code
    at_home=data['was_home']==1
    return data[is_team & is_opponent & at_home]

In [5]:
def prevfeatures(data):
    """Add lagged points/FDR features and team-perspective goal columns.

    For every row the function looks up the same player's ``total_points`` and
    ``fdr`` from 1, 2 and 3 gameweeks earlier and stores them as
    ``prevscore``/``prev2score``/``prev3score`` and
    ``prevfdr``/``prev2fdr``/``prev3fdr``. When there is no earlier gameweek
    (the lag reaches before GW 1) or the player has no row for it, the value
    is 0. If a player has several rows for one gameweek, the first is used.

    ``team_goals_scored`` / ``team_goals_conceded`` are taken from
    ``team_h_score`` / ``team_a_score`` according to the truthiness of
    ``was_home``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``playerid``, ``GW``, ``total_points``, ``fdr``, ``was_home``,
        ``team_h_score`` and ``team_a_score`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the eight new columns added in place.
    """
    # One (player, gameweek) -> value table replaces re-filtering the frame
    # for every player and gameweek. keep='first' matches the first-row choice
    # used when a player has more than one row in a gameweek.
    first_rows=data.drop_duplicates(subset=['playerid', 'GW'], keep='first')
    keys=list(zip(first_rows['playerid'], first_rows['GW']))
    points_lookup=dict(zip(keys, first_rows['total_points']))
    fdr_lookup=dict(zip(keys, first_rows['fdr']))

    players=list(data['playerid'])
    gameweeks=list(data['GW'])

    def lagged(lookup, lag):
        # 0 both when the lag reaches before GW 1 and when the earlier
        # gameweek has no row for this player.
        return [lookup.get((pid, gw-lag), 0) if gw>lag else 0
                for pid, gw in zip(players, gameweeks)]

    # Assignment order is kept because it fixes the column order downstream.
    data['prevscore']=lagged(points_lookup, 1)
    data['prev2score']=lagged(points_lookup, 2)
    data['prev3score']=lagged(points_lookup, 3)
    data['prevfdr']=lagged(fdr_lookup, 1)
    data['prev2fdr']=lagged(fdr_lookup, 2)
    data['prev3fdr']=lagged(fdr_lookup, 3)

    # Home rows: scored = home score, conceded = away score; away rows swap.
    home=data['was_home'].astype(bool)
    data['team_goals_scored']=data['team_h_score'].where(home, data['team_a_score'])
    data['team_goals_conceded']=data['team_a_score'].where(home, data['team_h_score'])

    return data

In [6]:
def ICT(data):
    """Replace raw influence/creativity/threat with per-player running averages.

    For each player, gameweeks are walked in ascending order. GW 1 is given 0.
    From GW 2 onwards, a gameweek's raw value is added to the running total
    only when the player's ``minutes`` in that gameweek exceed 30, and the
    stored value is ``total / (number_of_counted_gameweeks + 1)``.

    Results are matched back to rows by ``(playerid, GW)``, so the frame does
    not have to be sorted by player and gameweek, and players do not need a
    row for every gameweek. If a player has several rows for one gameweek,
    the first row supplies the raw values and all of them get the same result.

    NOTE(review): the running total includes the *current* gameweek's value
    and never includes GW 1, and the ``+ 1`` in the denominator shrinks the
    mean. This mirrors the existing arithmetic; confirm it is intended, since
    the other form features only look at earlier gameweeks.

    The raw columns are overwritten, so calling this twice on the same frame
    averages the averages.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``playerid``, ``GW``, ``minutes``, ``influence``, ``creativity``
        and ``threat`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the three columns replaced in place.
    """
    metrics=('influence', 'creativity', 'threat')
    first_rows=data.drop_duplicates(subset=['playerid', 'GW'], keep='first')

    # (playerid, GW) -> (influence, creativity, threat) running averages.
    # Everything is computed from the raw values before any column is replaced.
    running={}
    for pid, games in first_rows.groupby('playerid', sort=False):
        games=games.sort_values('GW')
        totals=[0, 0, 0]
        counted=0
        columns=zip(games['GW'], games['minutes'], *(games[m] for m in metrics))
        for gw, minutes, *values in columns:
            if gw<=1:
                running[(pid, gw)]=(0, 0, 0)
                continue
            if minutes>30:
                totals=[total+value for total, value in zip(totals, values)]
                counted+=1
            running[(pid, gw)]=tuple(total/(counted+1) for total in totals)

    keys=list(zip(data['playerid'], data['GW']))
    for position, metric in enumerate(metrics):
        data[metric]=[running[key][position] for key in keys]
    return data

In [7]:
def team_form(data):
    """Add attacking and defensive team-form columns from the last three gameweeks.

    For each row, the player's ``team_goals_scored`` and
    ``team_goals_conceded`` from up to three preceding gameweeks (oldest
    first) are passed to ``kernel``. ``att_form`` is the weighted goals
    scored; ``def_form`` is the negated weighted goals conceded. GW 1 rows get
    0 for both.

    Values are looked up by ``(playerid, GW)``, so the frame does not have to
    be sorted by player and gameweek. Preceding gameweeks with no row for the
    player are left out of the window rather than raising. If a player has
    several rows for one gameweek, the first is used.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``playerid``, ``GW``, ``team_goals_scored`` and
        ``team_goals_conceded`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with ``att_form`` and ``def_form`` added in
        place.
    """
    first_rows=data.drop_duplicates(subset=['playerid', 'GW'], keep='first')
    keys=list(zip(first_rows['playerid'], first_rows['GW']))
    scored=dict(zip(keys, first_rows['team_goals_scored']))
    conceded=dict(zip(keys, first_rows['team_goals_conceded']))

    goalscoring_form=[]
    defensive_form=[]
    for pid, gw in zip(data['playerid'], data['GW']):
        gw=int(gw)  # range() below needs an int even if the column is float-typed
        if gw<=1:
            goalscoring_form.append(0)
            defensive_form.append(0)
            continue
        # Up to three earlier gameweeks, oldest first, so kernel() puts its
        # largest weight on the most recent one.
        window=[(pid, earlier) for earlier in range(max(1, gw-3), gw)
                if (pid, earlier) in scored]
        goalscoring_form.append(kernel([scored[key] for key in window]))
        # Negated so that conceding more goals lowers the defensive form.
        defensive_form.append(-1*kernel([conceded[key] for key in window]))
    data['att_form']=goalscoring_form
    data['def_form']=defensive_form
    return data

In [8]:
def scaling(data):
    """Min-max scale the difficulty, form and ICT columns to the range [0, 1].

    Each of ``fdr``, ``att_form``, ``def_form``, ``influence``,
    ``creativity`` and ``threat`` is replaced by
    ``(value - min) / (max - min)``. Missing values are ignored when finding
    the minimum and maximum and stay missing in the result. A column whose
    values are all equal becomes all zeros instead of dividing by zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the six columns named above.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with those columns rescaled in place.
    """
    scaled_columns=(
        'fdr',                               # fixture difficulty
        'att_form', 'def_form',              # attacking and defensive form
        'influence', 'creativity', 'threat', # ICT
    )
    for column in scaled_columns:
        lowest=data[column].min()
        span=data[column].max()-lowest
        shifted=data[column]-lowest
        if span==0:
            # Constant column: every shifted value is already 0, so skip the
            # 0/0 division that would turn the whole column into NaN.
            data[column]=shifted.astype(float)
        else:
            data[column]=shifted/span
    return data

In [9]:
def OHE(data):
    """Append one-hot indicator columns for the opponent team and the player id.

    Indicator columns are named ``opp_team_<value>`` and ``id_<value>`` and are
    written onto ``data`` itself; the source columns are left in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``opponent_team`` and ``playerid`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the indicator columns added.
    """
    # (source column, prefix for the generated indicator columns)
    encodings=(
        ('opponent_team', 'opp_team'),
        ('playerid', 'id'),
    )
    for source, prefix in encodings:
        indicators=pd.get_dummies(data[source], prefix=prefix)
        for label, values in indicators.items():
            data[label]=values

    return data

In [10]:
def filter_split(data, test_ratio=0.2, random_state=42):
    """Select the feature columns and split into train and test sets.

    Every column of ``data`` is used as a feature except the identifiers, the
    target and the columns listed in ``excluded`` below. The target is
    ``total_points``.

    NOTE(review): ``team_a_score`` and ``team_h_score`` are not excluded, so
    they remain features here - confirm that is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Fully engineered frame containing ``total_points``.
    test_ratio : float
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None
        Seed passed to ``train_test_split`` so repeated runs produce the same
        split. Pass ``None`` for an unseeded (different every run) split.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)`` as returned by ``train_test_split``.
    """
    excluded=['opponent_team', 'name', 'total_points', 'playerid',
              'goals_conceded', 'goals_scored', 'clean_sheets', 'minutes',
              'team_goals_scored', 'team_goals_conceded']
    feature_cols=[col for col in data.columns if col not in excluded]
    targets=data['total_points']
    newdata=data[feature_cols]
    xtrain, xtest, ytrain, ytest=train_test_split(newdata, targets, test_size=test_ratio,
                                                  random_state=random_state)
    return xtrain, xtest, ytrain, ytest

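#### TODO
- Test the model's predictions on hand-picked fixtures
- Add a "nailedness" feature (how secure a player's place in the starting line-up is)
- Add team value (budget) constraints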

In [11]:
def execute(verbose=False):
    """Run the whole preparation pipeline, from the CSV to a train/test split.

    The steps run in this order: import, lagged features, ICT features, team
    form, scaling, one-hot encoding, split.

    Parameters
    ----------
    verbose : bool
        When true, print a message before each step and "Done." after it.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest, data)`` where ``data`` is the fully
        engineered frame the split was taken from.
    """
    def announce(message):
        if verbose:
            print(message)

    announce("Importing data...")
    data=import_data()
    announce("Done.")

    # Each stage takes the frame and returns it, so they can be chained in a loop.
    stages=(
        ("Making features from 3 previous fixtures...", prevfeatures),
        ("Creating ICT features...", ICT),
        ("Incorporating team form...", team_form),
        ("Scaling features...", scaling),
        ("Performing one-hot encoding...", OHE),
    )
    for banner, stage in stages:
        announce(banner)
        data=stage(data)
        announce("Done.")

    announce("Splitting data for testing...")
    xtrain, xtest, ytrain, ytest=filter_split(data)
    announce("Done.")
    return xtrain, xtest, ytrain, ytest, data

In [12]:
# Build the train/test split and keep the fully engineered frame (`data`);
# the model cells below read these notebook-level names.
xtrain, xtest, ytrain, ytest, data=execute(verbose=True)

Importing data...
Done.
Making features from 3 previous fixtures...
Done.
Creating ICT fixtures...
Done.
Incorporating team form...
Done.
Scaling features...
Done.
Performing one-hot encoding...
Done.
Splitting data for testing...
Done.


### Random Forest

In [15]:
# Seed the forest so that, for a given train/test split, refitting gives the
# same model and the same reported error.
reg = ensemble.RandomForestRegressor(n_estimators=500, random_state=42)
reg.fit(xtrain, ytrain)

mse = mean_squared_error(ytest, reg.predict(xtest))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 11.6775


In [None]:
# The list of feature names built inside filter_split() is local to that
# function, so recover the names from the training frame the model was fit on.
feature_cols=list(xtrain.columns)
# Largest importance first.
pprint.pprint(sorted(zip(reg.feature_importances_, feature_cols), reverse=True))

In [16]:
# Feature names in the order the model was trained on; the list built inside
# filter_split() is local to that function and not visible at notebook scope.
feature_cols=list(xtrain.columns)

# Rows for MUN at home against WHU: keep the actual points for comparison,
# then reduce the selection to the model's feature columns.
sel=fixture_finder('MUN', 'WHU', data)
res=sel['total_points']
sel=sel[feature_cols]

NameError: name 'data' is not defined

In [None]:
# Predicted points for the selected fixture rows, followed by the actual
# `total_points` values held in `res`.
print(reg.predict(sel))
# NOTE(review): column_stack is given the Series itself, so each value is
# treated as a separate array - presumably this prints a single row; confirm.
print(np.column_stack(res))

### XGBoost

In [None]:
# 'reg:squarederror' is the current name for squared-error regression; the
# older 'reg:linear' spelling is deprecated in XGBoost.
# The seed makes refitting on the same split reproducible.
xgb=XGBRegressor(objective='reg:squarederror', n_estimators=500, random_state=42)
xgb.fit(xtrain, ytrain)
mse = mean_squared_error(ytest, xgb.predict(xtest))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))