In [130]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import fifa_index
import re

In [2]:
epl1718 = 'https://www.football-data.co.uk/mmz4281/1718/E0.csv'
epl1819 = 'https://www.football-data.co.uk/mmz4281/1819/E0.csv'
epl1920 = 'https://www.football-data.co.uk/mmz4281/1920/E0.csv'

In [207]:
def download_data(url, date_col):
    dataf = pd.read_csv(url)
    dataf[date_col] = pd.to_datetime(dataf[date_col], dayfirst = True)
    return dataf

def keep_cols(dataf, cols):
    return dataf.loc[:,cols]

def categorical_to_codes(dataf, column):
    dataf[column] = dataf[column].astype('category').cat.codes
    return dataf

def set_season(dataf, season):
    dataf['season'] = season
    return dataf

def match_team_names(teams_list_1, teams_list_2): #both must be lists
    teams_dict = {}
    for x in teams_list_1:
        for y in teams_list_2:
            if x in y or y in x:
                teams_dict[x] = y
    return teams_dict

def get_team_ranks(hometeam, awayteam, season):
    home = teams_dict[hometeam]
    away = teams_dict[awayteam]
    if season not in fifa.teams[home] or season not in fifa.teams[away]: #Validation
        return []
    
    ranks = (list(fifa.teams[home][season].values()) + list(fifa.teams[away][season].values()))
    
    return [int(x) for x in ranks]

# Fixtures data

In [311]:
cols = ['Date','HomeTeam','AwayTeam','FTR','PSH','PSD','PSA']

In [356]:
dataf = pd.DataFrame()

for i,season in zip([18,19,20], [epl1718,epl1819,epl1920]):
    
    temp_df = (download_data(season, 'Date').pipe(keep_cols, cols)
                                            .pipe(categorical_to_codes, 'FTR') # 0 = AWAY WIN, 1 = DRAW, 2 = HOME WIN !
                                            .pipe(set_season, i))
    dataf = pd.concat([dataf, temp_df], axis = 0)

dataf.reset_index(inplace = True)
dataf.drop('index', axis = 1, inplace = True)

In [357]:
dataf.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,PSH,PSD,PSA,season
0,2017-08-11,Arsenal,Leicester,2,1.53,4.55,6.85,18
1,2017-08-12,Brighton,Man City,0,10.95,5.55,1.34,18
2,2017-08-12,Chelsea,Burnley,0,1.26,6.3,15.25,18
3,2017-08-12,Crystal Palace,Huddersfield,0,1.83,3.58,5.11,18
4,2017-08-12,Everton,Stoke,2,1.7,3.83,5.81,18


# Scraping fifaindex.com

In [229]:
fifa = fifa_index.fifaIndex() # Initializing

fifa.setVersions(18,19,20) # Setting versions of FIFA to scrape from. 18 = 2017-2018 and so on.
fifa.scrapeLeagues(13) # See fifa.getAvailableLeagues() to see all possible leagues to scrape from.
fifa.dataframe() #Show end results, it is just easier to look at results in a dataframe

# I will use fifa.teams as it is a dictionary so it's faster and easier to use

scraping:	 https://www.fifaindex.com/teams/fifa18/1/?league=13&
scraping:	 https://www.fifaindex.com/teams/fifa19/1/?league=13&
scraping:	 https://www.fifaindex.com/teams/fifa20/1/?league=13&


Unnamed: 0,Unnamed: 1,defense,midfield,attack,overall
Manchester City,18,83,87,85,84
Manchester City,19,82,88,86,85
Manchester City,20,83,86,87,85
Tottenham Hotspur,18,82,83,85,83
Tottenham Hotspur,19,83,82,86,83
Tottenham Hotspur,20,82,82,85,82
Manchester United,18,81,83,85,83
Manchester United,19,80,83,83,82
Manchester United,20,81,81,83,81
Chelsea,18,82,86,84,83


In [230]:
fifa.mergeTeams(['AFC Bournemouth','Bournemouth']) 
#Sometimes fifaindex write the same team under different names in different seasons.
# mergeTeams will accept as many lists (of length two) and will merge the second item in the list to the first one.
# in this case, all data from `Brounemouth` will be moved into `AFC Bournemouth`

fifa_teams_list = list(fifa.teams.keys())
data_teams_list = list(dataf.HomeTeam.unique())

teams_dict = match_team_names(data_teams_list, fifa_teams_list)
teams_dict['Wolves'] = 'Wolverhampton Wanderers'
teams_dict['Man City'] = 'Manchester City'
teams_dict['Man United'] = 'Manchester United' #Adding some manually

In [314]:
ranks_df = dataf.apply(lambda d: get_team_ranks(d['HomeTeam'], d['AwayTeam'], d['season']),
                       axis = 1).apply(pd.Series).rename(columns = {
    0:'h_def',1:'h_mid',2:'h_att',3:'h_ovl',4:'a_def',5:'a_mid',6:'a_att',7:'a_ovl'})

In [315]:
dataf = pd.concat([dataf, ranks_df], axis = 1)

In [316]:
dataf[dataf.isna().any(axis = 1)] #Check for any missing values

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,PSH,PSD,PSA,season,h_def,h_mid,h_att,h_ovl,a_def,a_mid,a_att,a_ovl


# Modeling

In [264]:
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [323]:
y = keras.utils.to_categorical(dataf['FTR'])
X_train, X_test, y_train, y_test = train_test_split(dataf.iloc[:,8:16]/100, y, test_size = 0.1)

In [262]:
def set_model(dropout, first_layer, second_layer):
    
    global model
    model = Sequential()
    n_features = X_train.shape[1]

    model.add(Dense(8, input_shape = (n_features,)))
    model.add(Dense(first_layer))
    model.add(Dropout(dropout))
    model.add(Dense(second_layer))
    model.add(Dense(3, activation = 'softmax'))

    opt = keras.optimizers.Adam(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics = ['accuracy'])
    
def test_model(epochs, patience):
    early_stopping = keras.callbacks.EarlyStopping(monitor='accuracy', min_delta = 0, patience = patience)
    model.fit(X_train, y_train, verbose = 0, epochs = epochs, batch_size = 1, callbacks = [early_stopping])
    return model.evaluate(X_test, y_test)

In [324]:
set_model(0.5, 8,6)
test_model(1000,15)



[1.0003230571746826, 0.5175438523292542]

In [329]:
model_predictions = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['Away','Draw','Home'])

final_df = pd.merge(dataf.iloc[:,0:7],
         model_predictions.loc[:,['Home','Draw','Away']], 
         how = 'inner', 
         left_index= True, 
         right_index = True)

final_df['FTR'] = final_df.apply(lambda d: 'A' if d['FTR'] == 0 else 'D' if d['FTR'] == 1 else 'H', axis = 1)

In [334]:
for x in ['PSH','PSD','PSA']:
    final_df[x] = 1/final_df[x]

In [355]:
final_df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,PSH,PSD,PSA,Home,Draw,Away
8,2017-08-13,Man United,West Ham,H,0.751880,0.176056,0.092166,0.718746,0.196215,0.085039
11,2017-08-19,Burnley,West Brom,A,0.378788,0.315457,0.325733,0.533322,0.244876,0.221802
41,2017-09-16,Crystal Palace,Southampton,A,0.334448,0.301205,0.384615,0.539307,0.243890,0.216803
62,2017-09-30,Huddersfield,Tottenham,A,0.104822,0.200000,0.714286,0.197168,0.205120,0.597712
76,2017-10-14,Watford,Arsenal,H,0.182149,0.228833,0.609756,0.343878,0.243894,0.412228
...,...,...,...,...,...,...,...,...,...,...
1087,2020-07-05,Liverpool,Aston Villa,H,0.740741,0.181488,0.111857,0.781820,0.167115,0.051065
1103,2020-07-11,Sheffield United,Chelsea,H,0.162866,0.253165,0.613497,0.292937,0.235297,0.471767
1109,2020-07-13,Man United,Southampton,D,0.781250,0.166945,0.085106,0.698229,0.204261,0.097510
1118,2020-07-16,Southampton,Brighton,D,0.454545,0.296736,0.278552,0.534249,0.244670,0.221081
