# 2. Predicting Soccer Match Winners using ML

In [2]:
import pandas as pd

In [3]:
# Load the match-level dataset from 'matches.csv' (relative to the working
# directory) and preview its first five rows.
matches = pd.read_csv(filepath_or_buffer='matches.csv')
matches.head(n=5)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,Match Report,,13.0,1.0,17.8,0.0,0,0,2020,Liverpool
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,Match Report,,25.0,9.0,16.8,1.0,0,1,2020,Liverpool
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,Match Report,,9.0,4.0,17.2,1.0,0,0,2020,Liverpool
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,Match Report,,17.0,4.0,14.7,0.0,0,0,2020,Liverpool
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,Match Report,,16.0,5.0,15.8,0.0,0,0,2020,Liverpool


In [4]:
# (rows, columns) of the loaded frame — a quick size check before looking for gaps.
matches.shape

(3456, 27)

### 2.1 Investigating missing data

In [5]:
# Number of rows per team; uneven counts flag teams with fewer recorded matches than the others.
matches['team'].value_counts()

team
Liverpool                   173
Manchester United           173
Everton                     173
Crystal Palace              173
Wolverhampton Wanderers     173
Chelsea                     173
Newcastle United            173
Brighton and Hove Albion    173
West Ham United             173
Tottenham Hotspur           173
Aston Villa                 173
Arsenal                     173
Manchester City             172
Southampton                 152
Leicester City              152
Burnley                     135
Leeds United                114
Sheffield United             97
Fulham                       97
Brentford                    96
Bournemouth                  96
Watford                      76
Norwich City                 76
Nottingham Forest            59
West Bromwich Albion         38
Luton Town                   20
Name: count, dtype: int64

In [6]:
# Number of rows per matchweek; a round with a lower count than the others has fewer recorded matches.
matches['round'].value_counts()

round
Matchweek 1     100
Matchweek 11    100
Matchweek 21    100
Matchweek 2     100
Matchweek 19    100
Matchweek 16    100
Matchweek 15    100
Matchweek 14    100
Matchweek 13    100
Matchweek 12    100
Matchweek 20    100
Matchweek 10    100
Matchweek 5     100
Matchweek 9     100
Matchweek 4     100
Matchweek 3     100
Matchweek 6     100
Matchweek 7     100
Matchweek 8     100
Matchweek 17     98
Matchweek 18     98
Matchweek 31     80
Matchweek 32     80
Matchweek 37     80
Matchweek 36     80
Matchweek 35     80
Matchweek 28     80
Matchweek 34     80
Matchweek 33     80
Matchweek 25     80
Matchweek 30     80
Matchweek 29     80
Matchweek 27     80
Matchweek 26     80
Matchweek 24     80
Matchweek 23     80
Matchweek 22     80
Matchweek 38     80
Name: count, dtype: int64

### 2.2 Cleaning data for Machine Learning

- Converting the `date` column from `object` to `datetime`
- Binary-encoding `venue`: a very important predictor (home advantage vs. away disadvantage)
- Numerically encoding opponents
- Converting each kick-off time to its hour as an int, since some teams may play better at certain times of day
- Numerically encoding the day of the week, since teams might play better on certain days

In [8]:
# Overwrite the string-typed 'date' column with its parsed datetime values,
# then list every column's dtype to confirm the conversion took effect.
matches['date'] = pd.to_datetime(arg=matches['date'])
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                       int64
pkatt                    int64
season                   int64
team                    object
dtype: object

Binary encoding 'venue' — 0: Away | 1: Home (category codes follow the sorted order of the labels)

In [12]:
# Integer code per venue label. Categories are inferred in sorted order, so with
# the labels 'Away' and 'Home' this yields 0 and 1 respectively.
matches['venue_code'] = pd.Categorical(matches['venue']).codes

Encoding all Opponents

In [13]:
# Integer code per distinct opponent name, numbered by the sorted order of the names.
matches['opp_code'] = pd.Categorical(matches['opponent']).codes

Encoding Hours

In [14]:
# Keep only the part of the 'time' string before the first colon
# (e.g. '16:30' -> '16') and store it as an integer hour.
matches['hour'] = (
    matches['time']
    .str.replace(pat=':.+', repl="", regex=True)
    .astype(int)
)

Encoding Day of Week

In [16]:
# Day of the week as an integer (Monday=0 ... Sunday=6), taken from the
# 'date' column; requires 'date' to already be datetime-typed.
matches['day_code'] = matches['date'].dt.weekday

**Encoding target variable 'result' — 0: Loss or Draw | 1: Win**

In [18]:
# Binary label: 1 where 'result' is exactly 'W', 0 for every other value.
matches['target'] = matches['result'].eq('W').astype(int)