# Portfolio Project: Predicting EPL Football Match Winners Using Machine Learning

### Introduction

In [1]:
import pandas as pd

In [2]:
# Load the raw per-team match statistics (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="epl.stats.csv")

In [3]:
# Lower-case the header first and only then sort on the (now lower-case) "date"
# column. Sorting on "Date" before renaming made this cell fail with a KeyError
# on any second execution, because the in-place rename had already removed the
# capitalised column. Building a new frame (no inplace=True) keeps the cell
# safe to re-run.
data = (
    data
    .rename(columns=str.lower)
    .sort_values(by="date", ascending=False)  # most recent matches first
    .reset_index(drop=True)                   # fresh 0..n-1 index, old one discarded
)
data

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,dist,fk,pk,pkatt,npxg,npxg/sh,g-xg,np:g-xg,season,team
0,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Home,W,5,0,Aston Villa,...,16.8,0.0,0.0,0.0,2.5,0.17,2.5,2.5,2023-2024,Crystal Palace
1,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,W,3,0,Sheffield Utd,...,14.4,1.0,0.0,0.0,3.1,0.18,-0.1,-0.1,2023-2024,Tottenham Hotspur
2,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Home,W,3,1,West Ham,...,,,,,,,,,2023-2024,Manchester City
3,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Home,L,0,2,Manchester Utd,...,,,,,,,,,2023-2024,Brighton and Hove Albion
4,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,W,2,0,Brighton,...,,,,,,,,,2023-2024,Manchester United
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1515,2022-08-06,12:30,Premier League,Matchweek 1,Sat,Away,D,2,2,Fulham,...,12.4,0.0,0.0,0.0,1.2,0.11,0.8,0.8,2022-2023,Liverpool
1516,2022-08-06,15:00,Premier League,Matchweek 1,Sat,Away,L,1,4,Tottenham,...,17.1,0.0,0.0,0.0,0.5,0.05,0.5,0.5,2022-2023,Southampton
1517,2022-08-06,15:00,Premier League,Matchweek 1,Sat,Away,L,0,2,Newcastle Utd,...,16.0,0.0,0.0,0.0,0.3,0.06,-0.3,-0.3,2022-2023,Nottingham Forest
1518,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Home,L,0,2,Arsenal,...,14.2,0.0,0.0,0.0,1.2,0.12,-1.2,-1.2,2022-2023,Crystal Palace


### Cleaning Data for Machine Learning

In [4]:
# data["gdiff"] = data["gf"] - data["ga"]  # Disabled experiment: a goal-difference column that would replace "gf" and "ga"

In [5]:
# Narrow the raw frame down to the columns this notebook works with.
matches = data.loc[
    :,
    [
        "date", "time", "day", "venue",
        "result", "gf", "ga", "opponent",
        "xg", "xga", "poss",
        "formation", "opp formation",
        "sh", "sot", "dist", "fk", "pk", "pkatt",
        "season", "team",
    ],
]

In [6]:
# Check dtypes and non-null counts of the selected columns to spot missing values.
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1520 entries, 0 to 1519
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1520 non-null   object 
 1   time           1520 non-null   object 
 2   day            1520 non-null   object 
 3   venue          1520 non-null   object 
 4   result         1520 non-null   object 
 5   gf             1520 non-null   int64  
 6   ga             1520 non-null   int64  
 7   opponent       1520 non-null   object 
 8   xg             1520 non-null   float64
 9   xga            1520 non-null   float64
 10  poss           1520 non-null   int64  
 11  formation      1520 non-null   object 
 12  opp formation  1520 non-null   object 
 13  sh             1461 non-null   float64
 14  sot            1461 non-null   float64
 15  dist           1461 non-null   float64
 16  fk             1461 non-null   float64
 17  pk             1461 non-null   float64
 18  pkatt   

In [7]:
# Work on an independent copy and discard every match that has no shot data
# ("sh" / "sot" missing).
matches = (
    matches
    .copy()
    .dropna(axis="index", subset=["sh", "sot"])
)

In [8]:
# Row/column count after removing the matches without shot data.
matches.shape

(1461, 21)

In [9]:
# Turn the textual match date into real datetimes.
matches = matches.assign(date=pd.to_datetime(matches["date"]))

In [10]:
# Represent each season by its starting year, e.g. "2023-2024" -> 2023.
matches = matches.assign(
    season=matches["season"].str.split("-").str[0].astype("int64")
)

In [11]:
# Kick-off hour: the part of the start time before the colon.
# The values stay strings (e.g. "16"); they are not converted to integers here.
matches = matches.assign(hour=matches["time"].str.split(":").str[0])

### Creating Predictors for Machine Learning

In [12]:
# Target variable: 1 when the result is a win ("W"), 0 for every other result.
matches = matches.assign(target=matches["result"].eq("W").astype("int64"))

In [13]:
# Columns carried over into the modelling frame. Despite its name the list is
# not limited to model inputs: it also contains the label ("target") and the
# per-match statistics that the rolling-average section reads later on.
predictors = [
    "date", "venue", "opponent", "hour", "day", "season",
    "target",
    "gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt",
]

In [14]:
# Build the modelling frame from the chosen columns only.
datasource = matches.loc[:, predictors]

In [15]:
# One-hot encode the categorical columns; drop_first removes the first level of
# each encoded column so the dummies are not redundant.
datasource = pd.get_dummies(data=datasource, drop_first=True)

In [16]:
# Attach the team name after the encoding step so it is kept as a plain column
# (rows are matched on the shared index).
datasource = datasource.assign(team=matches["team"])

In [17]:
# Review the encoded frame: numeric columns, the generated dummy columns and "team".
datasource.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1461 entries, 0 to 1519
Data columns (total 49 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      1461 non-null   datetime64[ns]
 1   season                    1461 non-null   int64         
 2   target                    1461 non-null   int64         
 3   gf                        1461 non-null   int64         
 4   ga                        1461 non-null   int64         
 5   sh                        1461 non-null   float64       
 6   sot                       1461 non-null   float64       
 7   dist                      1461 non-null   float64       
 8   fk                        1461 non-null   float64       
 9   pk                        1461 non-null   float64       
 10  pkatt                     1461 non-null   float64       
 11  venue_Home                1461 non-null   bool          
 12  opponent_Aston Villa     

### Training an Initial ML Model

In [18]:
# Training features: every 2022-season row, without the date, the season
# marker, the label and the team name.
# NOTE(review): "gf" and "ga" of the very same match remain in the feature set.
# They appear to determine "result" (and therefore "target"), so the model can
# read the outcome directly — confirm, and consider dropping the in-match
# statistics from both the training and the test features.
X_train = (
    datasource
    .loc[datasource["season"] == 2022]
    .drop(columns=["date", "season", "target", "team"])
)

In [19]:
# Number of training rows and feature columns.
X_train.shape

(733, 45)

In [20]:
# Test features: the 2023-season rows with the same columns removed as for the
# training features.
X_test = (
    datasource
    .loc[datasource["season"] == 2023]
    .drop(columns=["date", "season", "target", "team"])
)

In [21]:
# Number of test rows and feature columns (column count should match X_train).
X_test.shape

(728, 45)

In [22]:
# Training labels: the win/no-win flag of the 2022-season rows.
y_train = datasource.loc[datasource["season"] == 2022, "target"]

In [23]:
# Number of training labels (should equal the number of X_train rows).
y_train.shape

(733,)

In [24]:
# Test labels: the win/no-win flag of the 2023-season rows.
y_test = datasource.loc[datasource["season"] == 2023, "target"]

In [25]:
# Number of test labels (should equal the number of X_test rows).
y_test.shape

(728,)

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Random forest with default hyper-parameters, trained on all CPU cores.
# A fixed random_state makes the fitted forest — and therefore the reported
# accuracy — reproducible across kernel restarts; without it every run builds
# a different forest.
model = RandomForestClassifier(n_jobs=-1, random_state=42)

In [28]:
# Train the classifier on the 2022-season features and labels.
model.fit(X=X_train, y=y_train)

In [29]:
# Predicted win/no-win flag for every row of the test features.
predicts = model.predict(X=X_test)

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
# Share of test rows whose predicted flag equals the true flag.
score = accuracy_score(y_true=y_test, y_pred=predicts)

In [32]:
# NOTE(review): treat this accuracy with caution — the features still contain
# "gf" and "ga" of the match being predicted, which presumably reveal the
# result; verify before reporting the number.
score

0.9793956043956044

### Improving the Model with Rolling Averages

In [33]:
# Per-match statistics that will be replaced by rolling averages of earlier matches.
features = [
    "gf", "ga",
    "sh", "sot", "dist",
    "fk", "pk", "pkatt",
]

In [34]:
# Name of the rolling-average column for each statistic: "<stat>" -> "rolling_<stat>".
new_features = [
    f"rolling_{statistic}"
    for statistic in features
]

In [35]:
# Show the generated rolling column names.
new_features

['rolling_gf',
 'rolling_ga',
 'rolling_sh',
 'rolling_sot',
 'rolling_dist',
 'rolling_fk',
 'rolling_pk',
 'rolling_pkatt']

In [36]:
def rolling_averages(group, features, new_features, window=3):
    """Add rolling means of the preceding rows to one group of match rows.

    The rows are ordered by their ``date`` column and, for every row, the mean
    of the ``window`` rows before it is computed. ``closed="left"`` keeps the
    row's own values out of its window, so each value is built from earlier
    rows only.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process. Must contain a ``date`` column and every column
        listed in ``features``.
    features : list of str
        Source columns to average.
    new_features : list of str
        Output column names, paired position by position with ``features``.
    window : int, default 3
        Number of preceding rows that make up each average.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` in its original row order with the
        ``new_features`` columns added (or overwritten). Rows with fewer than
        ``window`` preceding rows hold NaN in those columns. The frame passed
        in is left untouched.

    Raises
    ------
    ValueError
        If ``features`` and ``new_features`` differ in length.
    """
    if len(features) != len(new_features):
        raise ValueError("features and new_features must have the same length")

    chronological = group.sort_values("date")
    rolling_means = (
        chronological[features]
        .rolling(window=window, closed="left")
        .mean()
    )

    # assign() builds a new frame instead of writing into the caller's object,
    # and every Series is aligned on the index, so the values land on the right
    # rows even though `group` is not in chronological order.
    return group.assign(
        **{
            target_name: rolling_means[source_name]
            for source_name, target_name in zip(features, new_features)
        }
    )

In [40]:
# Compute the rolling columns separately for every team. include_groups=False
# keeps the grouping column ("team") out of the frames handed to the function;
# the result is indexed by (team, original row label).
rolling_data = (
    datasource
    .groupby("team")
    .apply(rolling_averages, features, new_features, include_groups=False)
)

In [41]:
# Inspect the per-team result; the index has two levels (team, original row label).
rolling_data

Unnamed: 0_level_0,Unnamed: 1_level_0,date,season,target,gf,ga,sh,sot,dist,fk,pk,...,day_Tue,day_Wed,rolling_gf,rolling_ga,rolling_sh,rolling_sot,rolling_dist,rolling_fk,rolling_pk,rolling_pkatt
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,68,2024-04-28,2023,1,3,2,9.0,3.0,14.3,1.0,0.0,...,False,False,2.333333,0.666667,23.000000,7.666667,17.100000,0.333333,0.000000,0.000000
Arsenal,98,2024-04-23,2023,1,5,0,27.0,10.0,16.3,0.0,0.0,...,True,False,1.666667,0.666667,20.333333,6.333333,15.800000,0.333333,0.333333,0.333333
Arsenal,109,2024-04-20,2023,1,2,0,24.0,9.0,17.8,0.0,0.0,...,False,False,1.666667,0.666667,16.666667,4.666667,15.800000,0.333333,0.333333,0.333333
Arsenal,118,2024-04-14,2023,0,0,2,18.0,4.0,17.2,1.0,0.0,...,False,False,1.666667,0.000000,12.666667,4.000000,14.800000,0.000000,0.333333,0.333333
Arsenal,146,2024-04-06,2023,1,3,0,19.0,6.0,12.4,0.0,1.0,...,False,False,1.333333,0.333333,12.000000,3.666667,16.133333,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,1430,2022-08-31,2022,0,0,0,17.0,3.0,17.7,1.0,0.0,...,False,True,0.333333,0.666667,12.333333,2.666667,20.433333,1.000000,0.000000,0.000000
Wolverhampton Wanderers,1445,2022-08-28,2022,0,1,1,10.0,4.0,20.7,1.0,0.0,...,False,False,0.333333,1.000000,14.000000,3.333333,19.833333,0.666667,0.000000,0.000000
Wolverhampton Wanderers,1478,2022-08-20,2022,0,0,1,20.0,3.0,21.9,1.0,0.0,...,False,False,,,,,,,,
Wolverhampton Wanderers,1497,2022-08-13,2022,0,0,0,7.0,1.0,18.7,1.0,0.0,...,False,False,,,,,,,,


In [42]:
# Remove the "team" level from the row index so that only the original row
# labels remain.
rolling_data = rolling_data.droplevel(level="team", axis=0)

In [43]:
# Inspect the frame again now that the index is back to the original row labels.
rolling_data

Unnamed: 0,date,season,target,gf,ga,sh,sot,dist,fk,pk,...,day_Tue,day_Wed,rolling_gf,rolling_ga,rolling_sh,rolling_sot,rolling_dist,rolling_fk,rolling_pk,rolling_pkatt
68,2024-04-28,2023,1,3,2,9.0,3.0,14.3,1.0,0.0,...,False,False,2.333333,0.666667,23.000000,7.666667,17.100000,0.333333,0.000000,0.000000
98,2024-04-23,2023,1,5,0,27.0,10.0,16.3,0.0,0.0,...,True,False,1.666667,0.666667,20.333333,6.333333,15.800000,0.333333,0.333333,0.333333
109,2024-04-20,2023,1,2,0,24.0,9.0,17.8,0.0,0.0,...,False,False,1.666667,0.666667,16.666667,4.666667,15.800000,0.333333,0.333333,0.333333
118,2024-04-14,2023,0,0,2,18.0,4.0,17.2,1.0,0.0,...,False,False,1.666667,0.000000,12.666667,4.000000,14.800000,0.000000,0.333333,0.333333
146,2024-04-06,2023,1,3,0,19.0,6.0,12.4,0.0,1.0,...,False,False,1.333333,0.333333,12.000000,3.666667,16.133333,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430,2022-08-31,2022,0,0,0,17.0,3.0,17.7,1.0,0.0,...,False,True,0.333333,0.666667,12.333333,2.666667,20.433333,1.000000,0.000000,0.000000
1445,2022-08-28,2022,0,1,1,10.0,4.0,20.7,1.0,0.0,...,False,False,0.333333,1.000000,14.000000,3.333333,19.833333,0.666667,0.000000,0.000000
1478,2022-08-20,2022,0,0,1,20.0,3.0,21.9,1.0,0.0,...,False,False,,,,,,,,
1497,2022-08-13,2022,0,0,0,7.0,1.0,18.7,1.0,0.0,...,False,False,,,,,,,,


In [48]:
# Rows without a complete window of earlier matches have NaN in the rolling
# columns (the first three matches of every team for the window of 3); remove them.
rolling_data = rolling_data.dropna(subset=new_features)

In [49]:
# Row/column count after removing the rows without rolling values.
rolling_data.shape

(1392, 56)

In [51]:
# NOTE(review): the re-indexed frame is only displayed, not assigned, so
# rolling_data itself still carries the original, non-contiguous row labels
# after this cell. Assign the result back if a fresh 0..n-1 index is wanted.
rolling_data.reset_index()

Unnamed: 0,index,date,season,target,gf,ga,sh,sot,dist,fk,...,day_Tue,day_Wed,rolling_gf,rolling_ga,rolling_sh,rolling_sot,rolling_dist,rolling_fk,rolling_pk,rolling_pkatt
0,68,2024-04-28,2023,1,3,2,9.0,3.0,14.3,1.0,...,False,False,2.333333,0.666667,23.000000,7.666667,17.100000,0.333333,0.000000,0.000000
1,98,2024-04-23,2023,1,5,0,27.0,10.0,16.3,0.0,...,True,False,1.666667,0.666667,20.333333,6.333333,15.800000,0.333333,0.333333,0.333333
2,109,2024-04-20,2023,1,2,0,24.0,9.0,17.8,0.0,...,False,False,1.666667,0.666667,16.666667,4.666667,15.800000,0.333333,0.333333,0.333333
3,118,2024-04-14,2023,0,0,2,18.0,4.0,17.2,1.0,...,False,False,1.666667,0.000000,12.666667,4.000000,14.800000,0.000000,0.333333,0.333333
4,146,2024-04-06,2023,1,3,0,19.0,6.0,12.4,0.0,...,False,False,1.333333,0.333333,12.000000,3.666667,16.133333,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,1373,2022-10-01,2022,0,0,2,15.0,4.0,21.6,0.0,...,False,False,0.333333,1.000000,10.000000,2.000000,16.733333,1.000000,0.000000,0.000000
1388,1395,2022-09-17,2022,0,0,3,6.0,1.0,19.6,2.0,...,False,False,0.666667,0.333333,11.333333,3.000000,17.100000,0.666667,0.000000,0.000000
1389,1413,2022-09-03,2022,1,1,0,7.0,2.0,12.9,0.0,...,False,False,0.333333,0.666667,15.666667,3.333333,20.100000,1.000000,0.000000,0.000000
1390,1430,2022-08-31,2022,0,0,0,17.0,3.0,17.7,1.0,...,False,True,0.333333,0.666667,12.333333,2.666667,20.433333,1.000000,0.000000,0.000000
