# Portfolio Project: Predicting EPL Football Match Winners Using Machine Learning

### Introduction

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("epl.stats.csv")

In [3]:
data.sort_values(by="Date", ascending=False, inplace=True)
data.reset_index(inplace=True, drop=True)
data.columns = [col.lower() for col in data.columns]
data

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,dist,fk,pk,pkatt,npxg,npxg/sh,g-xg,np:g-xg,season,team
0,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Home,W,5,0,Aston Villa,...,16.8,0.0,0.0,0.0,2.5,0.17,2.5,2.5,2023-2024,Crystal Palace
1,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,W,3,0,Sheffield Utd,...,14.4,1.0,0.0,0.0,3.1,0.18,-0.1,-0.1,2023-2024,Tottenham Hotspur
2,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Home,W,3,1,West Ham,...,,,,,,,,,2023-2024,Manchester City
3,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Home,L,0,2,Manchester Utd,...,,,,,,,,,2023-2024,Brighton and Hove Albion
4,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,W,2,0,Brighton,...,,,,,,,,,2023-2024,Manchester United
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1515,2022-08-06,12:30,Premier League,Matchweek 1,Sat,Away,D,2,2,Fulham,...,12.4,0.0,0.0,0.0,1.2,0.11,0.8,0.8,2022-2023,Liverpool
1516,2022-08-06,15:00,Premier League,Matchweek 1,Sat,Away,L,1,4,Tottenham,...,17.1,0.0,0.0,0.0,0.5,0.05,0.5,0.5,2022-2023,Southampton
1517,2022-08-06,15:00,Premier League,Matchweek 1,Sat,Away,L,0,2,Newcastle Utd,...,16.0,0.0,0.0,0.0,0.3,0.06,-0.3,-0.3,2022-2023,Nottingham Forest
1518,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Home,L,0,2,Arsenal,...,14.2,0.0,0.0,0.0,1.2,0.12,-1.2,-1.2,2022-2023,Crystal Palace


### Cleaning Data for Machine Learning

In [4]:
data["gdiff"] = data["gf"] - data["ga"] # Added a new column - Goal Different instead of "gf" and "ga"

In [5]:
matches = data[["date", "time", "day", "venue", "result", "gdiff", "opponent", "xg", "xga", "poss", "formation", "opp formation", "sh", "sot", "season", "team"]]

In [6]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1520 entries, 0 to 1519
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1520 non-null   object 
 1   time           1520 non-null   object 
 2   day            1520 non-null   object 
 3   venue          1520 non-null   object 
 4   result         1520 non-null   object 
 5   gdiff          1520 non-null   int64  
 6   opponent       1520 non-null   object 
 7   xg             1520 non-null   float64
 8   xga            1520 non-null   float64
 9   poss           1520 non-null   int64  
 10  formation      1520 non-null   object 
 11  opp formation  1520 non-null   object 
 12  sh             1461 non-null   float64
 13  sot            1461 non-null   float64
 14  season         1520 non-null   object 
 15  team           1520 non-null   object 
dtypes: float64(4), int64(2), object(10)
memory usage: 190.1+ KB


In [7]:
matches = matches.copy().dropna(subset=["sh", "sot"])

In [8]:
matches.shape

(1461, 16)

In [9]:
matches["date"] = pd.to_datetime(matches["date"])

In [10]:
matches = pd.get_dummies(matches, 
                         prefix=["time", "day", "venue", "result", "opponent", "formation", "opp_formation", "season", "team"], 
                         columns=["time", "day", "venue", "result", "opponent", "formation", "opp formation", "season", "team"], 
                         drop_first=True)

In [11]:
matches.shape

(1461, 117)