# Predicting the 2020/21 EPL season!!!
Credit to [Tuan Doan Nguyen](https://towardsdatascience.com/o-jogo-bonito-predicting-the-premier-league-with-a-random-model-1b02fa3a7e5a) for inspiration :)   
Blue is the colour, football is the game...  
KTBFFH

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import poisson
from glob import glob

## Data prep - create kaggle-like data set for all historical EPL match results (1993-)
source csv -> http://www.football-data.co.uk/englandm.php  
check against -> https://www.premierleague.com/results?co=1&se=2&cl=-1

In [2]:
# data ingestion: read every season file, merge them, clean up and persist the result
files = sorted(glob('./E0*.csv'))  # sorted so the merge order does not depend on the filesystem
if not files:
    # fail early with a clear message instead of a KeyError on 'Date' further down
    raise FileNotFoundError("no season files matching './E0*.csv' were found")

# Read each season once and concatenate in a single call; concatenating inside
# the loop re-copies the accumulated frame on every iteration.
# usecols is positional, so the selected columns can differ between files and
# the merged frame holds the union of them.
season_frames = [
    pd.read_csv(file, usecols=range(1, 12), encoding="latin")
    for file in files
]
matches = pd.concat(season_frames).drop_duplicates()

# dates in the source files are day-first; parse them explicitly here
matches['Date'] = pd.to_datetime(matches['Date'], dayfirst=True)
matches = matches.sort_values('Date').reset_index(drop=True)
# drop columns that carry no data at all
# NOTE(review): fully blank CSV lines, if any, survive as an all-NaN row;
# consider matches.dropna(how='all') if that row is unwanted.
matches = matches.dropna(axis=1, how='all')
matches.to_csv('./matches.csv')
matches.info()
matches.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10425 entries, 0 to 10424
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        10424 non-null  datetime64[ns]
 1   HomeTeam    10424 non-null  object        
 2   AwayTeam    10424 non-null  object        
 3   FTHG        10424 non-null  float64       
 4   FTAG        10424 non-null  float64       
 5   FTR         10424 non-null  object        
 6   HTHG        9500 non-null   float64       
 7   HTAG        9500 non-null   float64       
 8   HTR         9500 non-null   object        
 9   Referee     7600 non-null   object        
 10  HS          6460 non-null   float64       
 11  Attendance  759 non-null    float64       
 12  Time        380 non-null    object        
dtypes: datetime64[ns](1), float64(6), object(6)
memory usage: 1.0+ MB


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,Attendance,Time
0,1993-08-14,Southampton,Everton,0.0,2.0,A,,,,,,,
1,1993-08-14,Arsenal,Coventry,0.0,3.0,A,,,,,,,
2,1993-08-14,Aston Villa,QPR,4.0,1.0,H,,,,,,,
3,1993-08-14,Chelsea,Blackburn,1.0,2.0,A,,,,,,,
4,1993-08-14,Liverpool,Sheffield Weds,2.0,0.0,H,,,,,,,


In [3]:
# data processing: boolean masks selecting matches played after a cut-off date
# (a 'YYYY-MM' string is compared against the datetime 'Date' column)
is_2020 = matches['Date'] > '2019-07'   # matches from July 2019 onwards
last_10 = matches['Date'] > '2010-07'   # matches from July 2010 onwards
#last_5 = (matches['Date'] > '2013-07') & (matches['Date'] < '2018-07')
last_5 = matches['Date'] > '2015-07'    # matches from July 2015 onwards

# aggregate by home/away team pairing and return the average of every numeric
# column (goals, shots, ...). numeric_only=True is required: the frame also
# holds text and date columns, which newer pandas refuses to average.
matches[last_10].groupby(['HomeTeam', 'AwayTeam']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,FTHG,FTAG,HTHG,HTAG,HS,Attendance
HomeTeam,AwayTeam,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Arsenal,Aston Villa,2.714286,1.142857,0.857143,0.571429,18.833333,
Arsenal,Birmingham,2.000000,1.000000,1.000000,1.000000,18.000000,
Arsenal,Blackburn,3.500000,0.500000,1.500000,0.500000,19.500000,
Arsenal,Blackpool,6.000000,0.000000,3.000000,0.000000,26.000000,
Arsenal,Bolton,3.500000,0.500000,0.500000,0.500000,23.000000,
...,...,...,...,...,...,...,...
Wolves,Tottenham,1.500000,2.500000,0.500000,1.250000,13.000000,
Wolves,Watford,1.000000,1.000000,0.500000,1.000000,10.000000,
Wolves,West Brom,2.000000,3.000000,1.500000,0.500000,12.000000,
Wolves,West Ham,2.000000,0.333333,0.666667,0.000000,14.500000,


In [4]:
# number of home matches per team within the last_10 window
matches.loc[last_10, 'HomeTeam'].value_counts()

Man United          190
Arsenal             190
Liverpool           190
Chelsea             190
Man City            190
Everton             190
Tottenham           190
West Ham            171
Newcastle           171
Stoke               152
Southampton         152
West Brom           152
Crystal Palace      133
Sunderland          133
Aston Villa         133
Swansea             133
Leicester           114
Bournemouth          95
Burnley              95
Norwich              95
Watford              95
Fulham               95
Wolves               76
Wigan                57
QPR                  57
Brighton             57
Hull                 57
Bolton               38
Huddersfield         38
Cardiff              38
Blackburn            38
Birmingham           19
Sheffield United     19
Reading              19
Middlesbrough        19
Blackpool            19
Name: HomeTeam, dtype: int64

In [5]:
# head to head between every team pairing within the last_5 window
# (group once and reuse the grouping for both the averages and the example)
recent_pairs = matches[last_5].groupby(['HomeTeam', 'AwayTeam'])

# per-pairing averages of the numeric columns; numeric_only=True keeps text and
# date columns out of the mean, which newer pandas would otherwise reject
h2h = recent_pairs.mean(numeric_only=True)

# other ways to inspect the table:
#h2h.filter(like='Chelsea', axis=0)
#h2h.loc[('Arsenal', 'Chelsea')]
#h2h.loc[['Chelsea']]

# example: the raw matches behind one pairing
recent_pairs.get_group(('Arsenal', 'Chelsea'))

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,Attendance,Time
8753,2016-01-24,Arsenal,Chelsea,0.0,1.0,A,0.0,1.0,A,M Clattenburg,9.0,,
8959,2016-09-24,Arsenal,Chelsea,3.0,0.0,H,3.0,0.0,H,M Oliver,14.0,,
9502,2018-01-03,Arsenal,Chelsea,2.0,2.0,D,0.0,0.0,D,A Taylor,14.0,,
9890,2019-01-19,Arsenal,Chelsea,2.0,0.0,H,2.0,0.0,H,A Taylor,13.0,,
10240,2019-12-29,Arsenal,Chelsea,1.0,2.0,A,1.0,0.0,H,C Pawson,,,14:00


## Modeling - get score function using nothing but the Poisson distribution

In [6]:
# modeling - example fixture used to try out the score function
home, away = 'Brighton', 'Chelsea'

def get_score(home, away):
    """Simulate a single scoreline for a fixture from head-to-head averages.

    The mean home and away goals for the (home, away) pairing are read from the
    module-level ``h2h`` table and each side's goals are drawn independently
    from a Poisson distribution with that mean.

    Note: a later cell in this notebook defines another function with the same
    name; whichever definition ran last is the one in effect.

    Parameters
    ----------
    home : label of the home team in ``h2h``'s index.
    away : label of the away team in ``h2h``'s index.

    Returns
    -------
    tuple
        ``(home_score, away_score)`` as drawn from the two distributions.

    Raises
    ------
    KeyError
        If ``h2h`` has no row for the pairing.
    """
    # head to head averages for this pairing; columns are looked up by label so
    # the result does not depend on the column order of h2h
    pair_means = h2h.loc[(home, away)]
    home_mean = pair_means['FTHG']
    away_mean = pair_means['FTAG']

    # simulate score by random sampling from parametrized Poisson distribution
    home_score = poisson.rvs(home_mean, size=1)[0]
    away_score = poisson.rvs(away_mean, size=1)[0]

    return (home_score, away_score)

# draw one simulated scoreline for the example fixture; no random seed is set
# in the cells shown here, so the displayed value changes between runs
get_score(home, away)

(0, 4)

In [7]:
# run the single-draw simulation `trials` times, count how often each scoreline
# comes up, and list the scorelines from most to least frequent
trials = 100
sims = {}
for _ in range(trials):
    outcome = get_score(home, away)
    if outcome in sims:
        sims[outcome] += 1
    else:
        sims[outcome] = 1

# each entry is (count, scoreline, share of trials); descending sort puts the
# most common scoreline first
hist = sorted(
    ((count, outcome, count / trials) for outcome, count in sims.items()),
    reverse=True,
)
hist

[(12, (0, 3), 0.12),
 (12, (0, 1), 0.12),
 (11, (0, 2), 0.11),
 (10, (1, 1), 0.1),
 (9, (1, 2), 0.09),
 (8, (1, 3), 0.08),
 (5, (0, 5), 0.05),
 (5, (0, 0), 0.05),
 (4, (2, 1), 0.04),
 (3, (2, 2), 0.03),
 (3, (2, 0), 0.03),
 (3, (1, 5), 0.03),
 (3, (1, 4), 0.03),
 (3, (0, 4), 0.03),
 (2, (2, 3), 0.02),
 (1, (3, 4), 0.01),
 (1, (3, 3), 0.01),
 (1, (3, 2), 0.01),
 (1, (1, 6), 0.01),
 (1, (1, 0), 0.01),
 (1, (0, 11), 0.01),
 (1, (0, 6), 0.01)]

In [8]:
# modeling - settings for the version of get_score that returns the most
# probable result between two sides
home, away = 'Brighton', 'Chelsea'
trials = 10_000  # number of simulated matches drawn per fixture

def get_score(home, away):
    """Return the most frequent simulated scoreline for a fixture.

    The mean home and away goals for the (home, away) pairing are read from the
    module-level ``h2h`` table. ``trials`` (module-level) independent scores are
    drawn per side from Poisson distributions with those means, paired up into
    'H-A' strings, and the most common string is reported.

    Parameters
    ----------
    home : label of the home team in ``h2h``'s index.
    away : label of the away team in ``h2h``'s index.

    Returns
    -------
    tuple
        ``(scoreline, probability)`` where ``scoreline`` is a string such as
        ``'0-2'`` and ``probability`` is the percentage of trials that produced
        it, rounded to one decimal. ``('N/A', 'N/A')`` when ``h2h`` has no row
        for the pairing.
    """
    try:
        # head to head averages for this pairing
        pair_means = h2h.loc[(home, away)]
    except KeyError:
        # no head to head record for these teams; only the lookup is guarded so
        # that unrelated errors are not mistaken for a missing record
        return 'N/A', 'N/A'

    # look the goal columns up by label rather than by position so the result
    # does not depend on the column order of h2h
    home_mean = pair_means['FTHG']
    away_mean = pair_means['FTAG']

    # simulate scores by random sampling from parametrized Poisson distributions
    home_scores = poisson.rvs(home_mean, size=trials).astype(str)
    away_scores = poisson.rvs(away_mean, size=trials).astype(str)

    scores = pd.DataFrame(data={'home': home_scores, 'away': away_scores})
    scores['result'] = scores['home'] + '-' + scores['away']

    # value_counts sorts by frequency, so the first entry is the most common
    predictions = scores['result'].value_counts()
    probability = (predictions / trials * 100).round(1)

    # .iloc makes the positional access explicit on a string-labelled Series
    return predictions.index[0], probability.iloc[0]
    
# most frequent simulated scoreline for the example fixture and the percentage
# of trials that produced it
get_score(home, away)

('0-2', 13.4)

## Data analysis - predict scores using most probable result
Simulate 20/21 season opening week - games starting 2020-09-12

In [9]:
# data analysis - simulate 20/21 season opening week (week 1 games starting 2020-09-12)
# the two lists are paired by position: home_teams[i] hosts away_teams[i]
home_teams = ['Fulham', 'Crystal Palace', 'Liverpool', 'West Ham', 'West Brom', 'Tottenham', 'Sheffield United', 'Brighton']
away_teams = ['Arsenal', 'Southampton', 'Leeds', 'Newcastle', 'Leicester', 'Everton', 'Wolves', 'Chelsea']

week1 = pd.DataFrame(data={'Home': home_teams, 'Away': away_teams})

# Simulate every fixture exactly once and split the (scoreline, probability)
# pair into the two columns. Calling get_score separately for each column would
# run two independent simulations, so the reported probability could belong to
# a different scoreline than the reported prediction.
fixture_results = [
    get_score(home_team, away_team)
    for home_team, away_team in zip(week1['Home'], week1['Away'])
]
week1['Predictions'] = [scoreline for scoreline, _ in fixture_results]
week1['Probability%'] = [probability for _, probability in fixture_results]
week1

Unnamed: 0,Home,Away,Predictions,Probability%
0,Fulham,Arsenal,1-5,7.0
1,Crystal Palace,Southampton,0-0,16.9
2,Liverpool,Leeds,,
3,West Ham,Newcastle,2-1,9.3
4,West Brom,Leicester,0-2,9.5
5,Tottenham,Everton,2-0,11.8
6,Sheffield United,Wolves,1-0,37.2
7,Brighton,Chelsea,0-2,13.3


Come on Chelsea!!!