In [1]:
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [3]:
#Reading all the dataset

players = pd.read_csv('EU_leagues_15-21/players.csv', encoding = 'latin1')
teams = pd.read_csv('EU_leagues_15-21/teams.csv', encoding = 'latin1')
leagues = pd.read_csv('EU_leagues_15-21/leagues.csv', encoding = 'latin1')
teamstats = pd.read_csv('EU_leagues_15-21/teamstats.csv', encoding = 'latin1')
games = pd.read_csv('EU_leagues_15-21/games.csv', encoding = 'latin1')
shots = pd.read_csv('EU_leagues_15-21/shots.csv', encoding = 'latin1')
appearances = pd.read_csv('EU_leagues_15-21/appearances.csv', encoding = 'latin1')


In [3]:
#games dataframe has the number of goals per team but not the result of the game
#This cell adds the result (W,D,L): (Win, Draw, Lose) from the home team point of view
#It also adds the goal differential, also from the home team point of view

games['homeRes'] = games.apply(lambda x: 'W' if x['homeGoals']>x['awayGoals'] else ('L' if x['homeGoals']<x['awayGoals'] else 'D'), axis = 1)
games['GoalDiff'] = games.apply(lambda x : x['homeGoals']-x['awayGoals'] , axis = 1)
games.head()

Unnamed: 0,gameID,leagueID,season,date,homeTeamID,awayTeamID,homeGoals,awayGoals,homeProbability,drawProbability,awayProbability,homeGoalsHalfTime,awayGoalsHalfTime,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,PSCH,PSCD,PSCA,homeRes,GoalDiff
0,81,1,2015,2015-08-08 15:45:00,89,82,1,0,0.2843,0.3999,0.3158,1,0,1.65,4.0,6.0,1.65,4.0,5.5,1.65,3.6,5.1,1.65,4.09,5.9,1.62,3.6,6.0,1.67,4.0,5.75,1.64,4.07,6.04,W,1
1,82,1,2015,2015-08-08 18:00:00,73,71,0,1,0.3574,0.35,0.2926,0,0,2.0,3.6,4.0,2.0,3.3,3.7,2.1,3.3,3.3,1.95,3.65,4.27,1.91,3.5,4.0,2.0,3.5,4.2,1.82,3.88,4.7,L,-1
2,83,1,2015,2015-08-08 18:00:00,72,90,2,2,0.2988,0.4337,0.2675,0,1,1.7,3.9,5.5,1.7,3.5,5.0,1.7,3.6,4.7,1.7,3.95,5.62,1.73,3.5,5.0,1.73,3.9,5.4,1.75,3.76,5.44,D,0
3,84,1,2015,2015-08-08 18:00:00,75,77,4,2,0.6422,0.2057,0.1521,3,0,1.95,3.5,4.33,2.0,3.3,3.75,2.0,3.3,3.6,1.99,3.48,4.34,2.0,3.1,2.7,2.0,3.4,4.33,1.79,3.74,5.1,W,2
4,85,1,2015,2015-08-08 18:00:00,79,78,1,3,0.1461,0.2159,0.638,0,1,2.55,3.3,3.0,2.6,3.2,2.7,2.4,3.2,2.85,2.52,3.35,3.08,2.6,3.1,2.88,2.6,3.25,3.0,2.46,3.39,3.14,L,-2


# MODEL 1
### Linear Regression using only team ID's and goal differential as target

In [4]:
X1 = games.loc[:,['homeTeamID' ,'awayTeamID']]
y1 = games.GoalDiff

X1_train, X1_test, y1_train, y1_test = train_test_split(X1 , y1,test_size = 0.3, stratify = games.homeRes, random_state = 0)

#Data preprocessing => one hot encoder on team IDs
OHE = OneHotEncoder(drop = 'first')
OHE.fit(X1)

X1_train = OHE.transform(X1_train)
X1_test = OHE.transform(X1_test)

LR = LinearRegression()
LR.fit(X1_train, y1_train)


LinearRegression()

In [5]:
LR = LinearRegression()
LR.fit(X1_train, y1_train)

LinearRegression()

<b> Pour tester l'accuracy, on retransforme la différence de but prédite en catégorie (Win, Draw, Loose)</b><br>
Pour cela on définit une fonction <b>transform_goalsDiff_to_results</b>

In [6]:
def transform_goalsDiff_to_results(array):
    res = np.asarray([])
    for val in array:
        if val>0:
            res = np.append(res,'W')
        elif val == 0:
            res = np.append(res,'D')
        else : 
            res = np.append(res,'L')
    return res

In [7]:
#Making predinctions on X_train and X_test

y1_test_pred = LR.predict(X1_test)
y1_train_pred = LR.predict(X1_train)


#Transforming the prediction and the train_test_splitted data back to categories (W,D,L)

y1_test_pred_res = transform_goalsDiff_to_results(y1_test_pred)
y1_train_pred_res = transform_goalsDiff_to_results(y1_train_pred)
y1_test_res = transform_goalsDiff_to_results(y1_test)
y1_train_res = transform_goalsDiff_to_results(y1_train)

print("accuracy on training set : ", accuracy_score(y1_train_res, y1_train_pred_res))
print("accuracy on test set : ", accuracy_score(y1_test_res, y1_test_pred_res))

accuracy on training set :  0.5381928796755295
accuracy on test set :  0.5155099894847529


In [8]:
cvp = cross_val_predict(LinearRegression(), OHE.transform(X1), y1)
y1_pred_res = transform_goalsDiff_to_results(cvp)
print("accuracy : ", accuracy_score(y1_pred_res, games.homeRes))

accuracy :  0.5124605678233438


# MODEL 2
<b> using the past n games opposing the two teams of a given match, with the same team as home team AND with the opposite team as home team</br>

In [9]:
test = pd.DataFrame(games.value_counts('homeTeamID'))
most_frequent_teamID = list(test.loc[test[0] >= 100,:].index.values)
y_freq = games.loc[(games.homeTeamID.isin(most_frequent_teamID)) & (games.awayTeamID.isin(most_frequent_teamID)),['gameID','GoalDiff']]
y_freq.shape
good_gameID = y_freq.gameID.values

In [10]:
select_good_games = games.loc[(games.season >=2016) & (games.gameID.isin(good_gameID)),:].copy()
select_good_games.shape

(4240, 36)

In [22]:
players.loc[players.playerID == 160,:]

Unnamed: 0,playerID,name
1538,160,Nicklas Bendtner


In [28]:
appearances.position.unique()

array(['GK', 'DR', 'DC', 'DL', 'DMC', 'AMR', 'AMC', 'AML', 'FW', 'Sub',
       'MR', 'MC', 'ML', 'FWR', 'FWL', 'DMR', 'DML'], dtype=object)

In [120]:
# here we fetch the past 3 games
nb_hist = 3


#####WARNING this takes quite some time to compute:

#Gets the last 3 games for EACH game with the same team as host
df_home = (select_good_games.apply(lambda x : games.loc\
            [(games.date < x['date']) &\
             (games.homeTeamID == x['homeTeamID']) &\
             (games.awayTeamID == x['awayTeamID'])\
             ,'GoalDiff'].values[:nb_hist] \
            , axis = 1)
)
#Gets the last 3 games for EACH game with the OPPOSITE team as host
df_away = (select_good_games.apply(lambda x : games.loc\
            [(games.date < x['date']) &\
             (games.homeTeamID == x['awayTeamID']) &\
             (games.awayTeamID == x['homeTeamID'])\
             ,'GoalDiff'].values[:nb_hist] \
            , axis = 1)
)

In [121]:
#there is a lot of missing games,
#We impute these missing games with the value 0 (Draw)
def impute_for_missing_games(n):
    global missing_games_count
    while len(n)<nb_hist:
        n = np.append(n, 0)
        missing_games_count +=1
    return n

missing_games_count = 0
df_home = df_home.apply(impute_for_missing_games)
print(missing_games_count)
missing_games_count = 0
df_away = df_away.apply(impute_for_missing_games)
print(missing_games_count)
print(df_home.shape, df_away.shape)

1200
679
(4240,) (4240,)


In [122]:
#here we concatenate the two dataframes.
#NB: there is certainly a smarter way to do this, *feel free to change it*

test = pd.concat([df_home, df_away], axis = 1)
test = test.apply( lambda x: np.concatenate((x[0], x[1]), axis = None).tolist(), axis = 1)
X2 = pd.DataFrame(test.tolist())
X2.head()

Unnamed: 0,0,1,2,3,4,5
0,0,-1,0,0,1,0
1,0,3,0,0,0,0
2,0,2,0,1,-1,0
3,-1,-1,0,-1,-1,0
4,-1,0,0,2,0,0


In [123]:
select_good_games.reset_index(drop = True, inplace = True)

In [124]:
#let's add team ID's
X2team = select_good_games.loc[:,['homeTeamID' ,'awayTeamID']]
X2 = pd.concat([X2team, X2], axis=1)
print(X2.shape)
X2.tail()



(4240, 8)


Unnamed: 0,homeTeamID,awayTeamID,0,1,2,3,4,5
4235,164,167,-1,3,0,-1,0,0
4236,171,163,0,3,0,0,-1,2
4237,167,160,2,1,0,0,-1,-1
4238,178,170,0,0,-1,3,2,-2
4239,168,166,-2,1,1,1,-1,4


In [125]:
# We want to one-hot encode the team ID's and keep the other columns as they are, to do so we pass the argument 'remainder = "passthrough"'

preprocessor = ColumnTransformer(
    transformers = [('teams_one_hot', OneHotEncoder(drop = 'first'), [0,1])],
    remainder='passthrough'
)

# We fit preprocessor on X2, however we do not transform X2 as train_test_split do not accept sparse matrices
preprocessor.fit(X2)

ColumnTransformer(remainder='passthrough',
                  transformers=[('teams_one_hot', OneHotEncoder(drop='first'),
                                 [0, 1])])

In [126]:
y2 = select_good_games.GoalDiff
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.3, random_state = 0)

# Now we can transform X_train and X_test
X2_train = preprocessor.transform(X2_train)
X2_test = preprocessor.transform(X2_test)

#Then train the model
LR2 = LinearRegression()
LR2.fit(X2_train, y2_train)

LinearRegression()

In [127]:
y2_test_pred = LR2.predict(X2_test)
y2_train_pred = LR2.predict(X2_train)

#Same as before, we change the predicted score into the game result
y2_test_pred_res = transform_goalsDiff_to_results(y2_test_pred)
y2_train_pred_res = transform_goalsDiff_to_results(y2_train_pred)
y2_test_res = transform_goalsDiff_to_results(y2_test)
y2_train_res = transform_goalsDiff_to_results(y2_train)

print("accuracy on training set : ", accuracy_score(y2_train_res, y2_train_pred_res))
print("accuracy on test set : ", accuracy_score(y2_test_res, y2_test_pred_res))

accuracy on training set :  0.5363881401617251
accuracy on test set :  0.5369496855345912


In [129]:
cvp2 = cross_val_predict(LinearRegression(), preprocessor.transform(X2), y2)
y2_pred_res = transform_goalsDiff_to_results(cvp2)
print("accuracy : ", accuracy_score(y2_pred_res, select_good_games.homeRes))

accuracy :  0.5110849056603773


# MODEL 3
<b> Using the player xValues (expected behaviors as calculated by bookmakers, i.e. xGoals: the probability for the player to score a goal). <br> These values  are averaged over the whole team

In [17]:
# In the appearance dataframe, the players of a game are not affiliated with a team
# Luckily, the players of a game are organised always in the same manner: 
##          first the home team, beginning with the goal keeper, then the away team, beginning also with the goal keeper
# This function adds the team (home or away team) to which each player belongs to

def search_team_ID(pos):
    global a
    if pos == 'GK':
        a += 1
    if a%2 == 1:
        res = 'h'
    else:
        res = 'a'
    return res

appearances = pd.read_csv('appearances.csv', encoding = 'latin1')
a=0
appearances['HomeOrAway']=appearances.position.apply(search_team_ID)

In [18]:
#Looking into the 'games' dataset, we now can assign a team ID to each player:

appearances = appearances.merge(games.loc[:,['gameID','homeTeamID', 'awayTeamID']], on = "gameID", how = 'inner')
appearances['teamID'] = appearances.apply(lambda x : x['homeTeamID'] if x['HomeOrAway'] == 'h' else x['awayTeamID'] , axis = 1)
appearances.drop(['awayTeamID','homeTeamID'], axis = 1, inplace = True)


In [19]:
# Now we can select the features of interest (xValues) in the dataset, 
# excluding the substitutes in order to be consistent when averaging the xValues over the whole team 
appearances_keep = appearances.loc[(appearances.position != 'Sub'), ['gameID', 
                                      'xGoals',
                                      'xGoalsChain', 
                                      'xGoalsBuildup', 
                                      'xAssists',
                                      'HomeOrAway',
                                      'teamID']]

# We do the averaging using a groupby over the game ID and the HomeOrAway column
test_app = appearances_keep.loc\
[:, ['gameID','xGoals','xGoalsChain','xGoalsBuildup','xAssists','HomeOrAway']]\
.groupby(['gameID','HomeOrAway']).mean()
test_app.reset_index(inplace = True)
test_app.head()

Unnamed: 0,gameID,HomeOrAway,xGoals,xGoalsChain,xGoalsBuildup,xAssists
0,81,a,0.061429,0.14394,0.061746,0.050608
1,81,h,0.057049,0.107573,0.071012,0.025907
2,82,a,0.032679,0.107775,0.066983,0.050972
3,82,h,0.079646,0.196319,0.106445,0.03818
4,83,a,0.040389,0.166683,0.09369,0.036977


In [20]:
# Now we will want to have one row per game
away_stats = test_app.loc[test_app.HomeOrAway == 'a',['xGoals', 'xGoalsChain', 'xGoalsBuildup', 'xAssists']]
away_stats.rename(columns = {'xGoals' : 'xGoals_a', 
                           'xGoalsChain' : 'xGoalsChain_a', 
                           'xGoalsBuildup' : 'xGoalsBuildup_a',
                           'xAssists' : 'xAssists_a'}, inplace = True)
away_stats.reset_index(drop = True, inplace = True)

home_stats = test_app.loc[test_app.HomeOrAway == 'h', ['gameID','xGoals', 'xGoalsChain', 'xGoalsBuildup','xAssists']]
home_stats.rename(columns = {'xGoals' : 'xGoals_h', 
                           'xGoalsChain' : 'xGoalsChain_h', 
                           'xGoalsBuildup' : 'xGoalsBuildup_h',
                           'xAssists' : 'xAssists_h'}, inplace = True)
home_stats.reset_index(drop = True, inplace = True)

X3 = pd.concat([home_stats, away_stats], axis=1)
X3.dropna(inplace = True)
X3.sort_values('gameID', inplace = True)
X3.reset_index(drop = True, inplace = True)

#Now let's add the team IDs
X3['away_team'] = games.awayTeamID
X3['home_team'] = games.homeTeamID
X3.drop('gameID', axis = 1, inplace = True)
X3.head()

Unnamed: 0,xGoals_h,xGoalsChain_h,xGoalsBuildup_h,xAssists_h,xGoals_a,xGoalsChain_a,xGoalsBuildup_a,xAssists_a,away_team,home_team
0,0.057049,0.107573,0.071012,0.025907,0.061429,0.14394,0.061746,0.050608,82,89
1,0.079646,0.196319,0.106445,0.03818,0.032679,0.107775,0.066983,0.050972,71,73
2,0.041633,0.078184,0.044866,0.04817,0.040389,0.166683,0.09369,0.036977,90,72
3,0.233457,0.608535,0.340174,0.14598,0.058666,0.533865,0.416345,0.117171,77,75
4,0.097121,0.172641,0.061029,0.030727,0.180785,0.932923,0.734567,0.186426,78,79


In [21]:

preprocessor = ColumnTransformer(
    transformers = [('teams_one_hot', OneHotEncoder(drop = 'first'), [-2,-1])],
    remainder='passthrough'
)

# We fit preprocessor on X2, however we do not transform X2 as train_test_split do not accept sparse matrices
preprocessor.fit(X3)


ColumnTransformer(remainder='passthrough',
                  transformers=[('teams_one_hot', OneHotEncoder(drop='first'),
                                 [-2, -1])])

In [22]:
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y2, test_size = 0.3, random_state = 0)

X3_train = preprocessor.transform(X3_train)
X3_test = preprocessor.transform(X3_test)

LR3 = LinearRegression()
LR3.fit(X3_train, y3_train)

LinearRegression()

In [23]:
y3_test_pred = LR3.predict(X3_test).squeeze()
y3_train_pred = LR3.predict(X3_train).squeeze()

y3_test_pred_res = transform_goalsDiff_to_results(y3_test_pred)
y3_train_pred_res = transform_goalsDiff_to_results(y3_train_pred)
y3_test_res = transform_goalsDiff_to_results(y3_test.values)
y3_train_res = transform_goalsDiff_to_results(y3_train.values)

print("accuracy on training set : ", accuracy_score(y3_train_res, y3_train_pred_res))
print("accuracy on test set : ", accuracy_score(y3_test_res, y3_test_pred_res))

accuracy on training set :  0.5447273546642631
accuracy on test set :  0.5281282860147214


In [24]:
cvp3 = cross_val_predict(LinearRegression(), preprocessor.transform(X3), y2)
y3_pred_res = transform_goalsDiff_to_results(cvp3)
print("accuracy : ", accuracy_score(y3_pred_res, games.homeRes))

accuracy :  0.4857255520504732


# MODEL 4
<b> using all the features from the previous model

In [25]:
X4 = pd.concat([X2.drop(['homeTeamID','awayTeamID'], axis = 1), X3], axis=1)
X4.head()

Unnamed: 0,0,1,2,3,4,5,xGoals_h,xGoalsChain_h,xGoalsBuildup_h,xAssists_h,xGoals_a,xGoalsChain_a,xGoalsBuildup_a,xAssists_a,away_team,home_team
0,3,0,0,0,0,0,0.057049,0.107573,0.071012,0.025907,0.061429,0.14394,0.061746,0.050608,82,89
1,0,0,0,0,0,0,0.079646,0.196319,0.106445,0.03818,0.032679,0.107775,0.066983,0.050972,71,73
2,0,0,0,0,0,0,0.041633,0.078184,0.044866,0.04817,0.040389,0.166683,0.09369,0.036977,90,72
3,0,0,0,0,0,0,0.233457,0.608535,0.340174,0.14598,0.058666,0.533865,0.416345,0.117171,77,75
4,0,0,0,0,0,0,0.097121,0.172641,0.061029,0.030727,0.180785,0.932923,0.734567,0.186426,78,79


In [26]:
preprocessor = ColumnTransformer(
    transformers = [('teams_one_hot', OneHotEncoder(drop = 'first'), [-2,-1])],
    remainder='passthrough'
)

preprocessor.fit(X4)

X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y2, test_size = 0.3, random_state = 0)

X4_train = preprocessor.transform(X4_train)
X4_test = preprocessor.transform(X4_test)

LR4 = LinearRegression()
LR4.fit(X4_train, y4_train)

LinearRegression()

In [27]:
y4_test_pred = LR4.predict(X4_test).squeeze()
y4_train_pred = LR4.predict(X4_train).squeeze()

y4_test_pred_res = transform_goalsDiff_to_results(y4_test_pred)
y4_train_pred_res = transform_goalsDiff_to_results(y4_train_pred)
y4_test_res = transform_goalsDiff_to_results(y4_test.values)
y4_train_res = transform_goalsDiff_to_results(y4_train.values)

print("accuracy on training set : ", accuracy_score(y4_train_res, y4_train_pred_res))
print("accuracy on test set : ", accuracy_score(y4_test_res, y4_test_pred_res))

accuracy on training set :  0.5466426318161334
accuracy on test set :  0.5289169295478444


In [28]:
cvp4 = cross_val_predict(LinearRegression(), preprocessor.transform(X4), y2)
y4_pred_res = transform_goalsDiff_to_results(cvp4)
print("accuracy : ", accuracy_score(y4_pred_res, games.homeRes))

accuracy :  0.48541009463722395
