## Predicting Premier League standings using the Poisson distribution

The Poisson distribution is used to model the number of goals each team scores in a match. 
This model is then applied to the remaining league fixtures to predict the expected points each team will earn, which are added to the current points to build the final league standings.

In [1]:
# Import libraries

import pandas as pd
import numpy as np
from scipy.stats import poisson 

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Fetch dataset: Premier League results since season 2005-06.
# Each season lives in a folder named after its two-digit start and end
# years, e.g. "0506" for 2005-06 up to "1819" for 2018-19.
dfList = []
for i in range(5, 19):
    df = pd.read_csv("http://www.football-data.co.uk/mmz4281/{0:02d}{1:02d}/E0.csv".format(i,i+1))
    df = df[['HomeTeam','AwayTeam','FTHG','FTAG']]
    df = df.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'})
    # Drop rows in which all four fields are missing (e.g. blank trailing
    # lines in a CSV); they carry no match information.
    df = df.dropna(how='all')
    dfList.append(df)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the per-season frames. ignore_index gives every match a unique
# row label instead of repeating 0..379 for each season.
eplData = pd.concat(dfList, ignore_index=True)

# Fetch current standings (as of 29-01-2019)
eplPointsTable = pd.read_excel('PointsTable.xlsx')[['Team', 'Points']]
# Label rows 1..N so the index reads as the league position.
eplPointsTable.index = np.arange(1, len(eplPointsTable) + 1)

# Fetch upcoming match fixtures
eplMatchesLeft = pd.read_excel('MatchesLeft.xlsx')

In [3]:
# Summarise the three inputs: historical results, the current table and the
# fixtures still to be played. Each bare expression is rendered as its own
# output.
'PAST MATCH DATA: {} rows'.format(eplData.shape[0])
eplData.head(5)
'CURRENT POINTS TABLE'
eplPointsTable
'UPCOMING MATCH FIXTURES: {} rows'.format(eplMatchesLeft.shape[0])
eplMatchesLeft.head(5)

'PAST MATCH DATA: 5171 rows'

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals
0,Aston Villa,Bolton,2.0,2.0
1,Everton,Man United,0.0,2.0
2,Fulham,Birmingham,0.0,0.0
3,Man City,West Brom,0.0,0.0
4,Middlesbrough,Liverpool,0.0,0.0


'CURRENT POINTS TABLE'

Unnamed: 0,Team,Points
1,Liverpool,60
2,Man City,56
3,Tottenham,51
4,Chelsea,47
5,Arsenal,44
6,Man United,44
7,Watford,33
8,Wolves,32
9,Leicester,31
10,West Ham,31


'UPCOMING MATCH FIXTURES: 150 rows'

Unnamed: 0,HomeTeam,AwayTeam
0,Arsenal,Cardiff
1,Bournemouth,Chelsea
2,Fulham,Brighton
3,Huddersfield,Everton
4,Wolves,West Ham


In [4]:
# Calculate Team and League stats

# Per-team mean goals scored and conceded, split by venue.
# At home a team scores HomeGoals and concedes AwayGoals.
eplHomeTeam = (
    eplData
    .groupby('HomeTeam', as_index=False)[['HomeGoals', 'AwayGoals']]
    .mean()
    .rename(columns={'HomeTeam': 'Team', 'HomeGoals': 'HomeScored', 'AwayGoals': 'HomeConceded'})
)

# Away from home the roles flip: a team scores AwayGoals and concedes HomeGoals.
eplAwayTeam = (
    eplData
    .groupby('AwayTeam', as_index=False)[['AwayGoals', 'HomeGoals']]
    .mean()
    .rename(columns={'AwayTeam': 'Team', 'AwayGoals': 'AwayScored', 'HomeGoals': 'AwayConceded'})
)

# League-wide baselines.
# NOTE(review): these are means of the per-team means, so every team counts
# equally regardless of how many matches it has played - confirm this is intended.
leagueHomeScored = eplHomeTeam['HomeScored'].mean()
leagueHomeConceded = eplHomeTeam['HomeConceded'].mean()
leagueAwayScored = eplAwayTeam['AwayScored'].mean()
leagueAwayConceded = eplAwayTeam['AwayConceded'].mean()

# Inner join: only teams that have both home and away records are kept.
eplTeamStrength = eplHomeTeam.merge(eplAwayTeam, on='Team')

# The baselines are used as divisors below, so none of them may be zero.
assert all(
    baseline != 0
    for baseline in (leagueHomeScored, leagueHomeConceded, leagueAwayScored, leagueAwayConceded)
)

# Express each team average relative to the league baseline, then rename the
# columns to the strength they represent and index the table by team.
eplTeamStrength = (
    eplTeamStrength
    .assign(
        HomeScored=eplTeamStrength['HomeScored'] / leagueHomeScored,
        HomeConceded=eplTeamStrength['HomeConceded'] / leagueHomeConceded,
        AwayScored=eplTeamStrength['AwayScored'] / leagueAwayScored,
        AwayConceded=eplTeamStrength['AwayConceded'] / leagueAwayConceded,
    )
    .rename(columns={
        'HomeScored': 'HomeAttack',
        'HomeConceded': 'HomeDefense',
        'AwayScored': 'AwayAttack',
        'AwayConceded': 'AwayDefense',
    })
    .set_index('Team')
)

# Baseline goals per match for the home side and for the away side: each is
# the midpoint of the "scored" view and the opposing "conceded" view.
overallHomeScored = (leagueHomeScored + leagueAwayConceded) / 2
overallAwayScored = (leagueHomeConceded + leagueAwayScored) / 2

In [5]:
# Show the normalised strength table and the two league-wide goal baselines.
'TEAM STRENGTH: {} rows'.format(eplTeamStrength.shape[0])
eplTeamStrength.head(5)
'Overall Home scored = {}'.format(overallHomeScored)
'Overall Away scored = {}'.format(overallAwayScored)

'TEAM STRENGTH: 39 rows'

Unnamed: 0_level_0,HomeAttack,HomeDefense,AwayAttack,AwayDefense
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Arsenal,1.555182,0.634374,1.622722,0.775125
Aston Villa,0.881719,0.959995,1.12442,1.010961
Birmingham,0.833799,0.810705,0.805519,1.088563
Blackburn,1.018632,0.896823,1.054228,1.096394
Blackpool,1.150068,1.53826,1.299225,1.284348


'Overall Home scored = 1.5265323119870773'

'Overall Away scored = 1.1393523647998394'

In [6]:
# Predict outcome of match and assign points to the teams

def predictMatchScore(home, away, maxGoals=10):
    """Return the expected league points (home, away) for a single fixture.

    Goals for each side are modelled as independent Poisson variables whose
    means combine the attacking side's strength, the defending side's
    strength and the league baseline for that venue (read from the
    module-level ``eplTeamStrength``, ``overallHomeScored`` and
    ``overallAwayScored``).

    Parameters
    ----------
    home, away : str
        Team names as they appear in the index of ``eplTeamStrength``.
    maxGoals : int, optional
        Largest goal count considered for each side (default 10). Scorelines
        above this are ignored, so the three outcome probabilities sum to
        slightly less than 1.

    Returns
    -------
    tuple
        ``(scoreH, scoreA)`` where each value is
        ``3 * P(win) + 1 * P(draw)`` for that side. ``(0, 0)`` is returned,
        with a warning, if either team has no entry in ``eplTeamStrength``.

    Raises
    ------
    ValueError
        If ``maxGoals`` is negative.
    """
    import warnings  # local import keeps this cell self-contained

    if maxGoals < 0:
        raise ValueError('maxGoals must be non-negative, got {}'.format(maxGoals))

    if home not in eplTeamStrength.index or away not in eplTeamStrength.index:
        # Keep the original best-effort result, but make the gap visible
        # instead of silently awarding no points for the fixture.
        warnings.warn(
            'No strength data for {!r} vs {!r}; awarding 0 points to both'.format(home, away),
            stacklevel=2,
        )
        return (0, 0)

    # Expected goals for each side.
    lambdH = eplTeamStrength.at[home,'HomeAttack'] * eplTeamStrength.at[away,'AwayDefense'] * overallHomeScored
    lambdA = eplTeamStrength.at[away,'AwayAttack'] * eplTeamStrength.at[home,'HomeDefense'] * overallAwayScored

    # Evaluate each marginal pmf once, then build the joint scoreline table:
    # joint[x, y] = P(home scores x) * P(away scores y).
    goals = np.arange(maxGoals + 1)
    joint = np.outer(poisson.pmf(goals, lambdH), poisson.pmf(goals, lambdA))

    probT = float(np.trace(joint))             # x == y: tie
    probH = float(np.tril(joint, k=-1).sum())  # x > y: home win (below the diagonal)
    probA = float(np.triu(joint, k=1).sum())   # x < y: away win (above the diagonal)

    # 3 points for a win, 1 for a draw.
    scoreH = 3 * probH + probT
    scoreA = 3 * probA + probT
    return (scoreH, scoreA)

In [7]:
#  Simulate the matches to predict final standings
# NOTE: this cell adds to eplPointsTable in place, so re-running it without
# first re-running the data-loading cell counts the remaining fixtures twice.

# Expected points are fractional. Cast the points column to float once,
# up front, instead of writing floats into an integer column row by row
# (an incompatible-dtype assignment that recent pandas versions reject).
eplPointsTable['Points'] = eplPointsTable['Points'].astype(float)

# Accumulate expected points per team, then add them to the table in one step.
expectedPoints = dict.fromkeys(eplPointsTable['Team'], 0.0)
for home, away in zip(eplMatchesLeft['HomeTeam'], eplMatchesLeft['AwayTeam']):
    assert home in expectedPoints and away in expectedPoints, \
        'Fixture {} vs {} names a team missing from the points table'.format(home, away)
    sH, sA = predictMatchScore(home, away)
    expectedPoints[home] += sH
    expectedPoints[away] += sA

eplPointsTable['Points'] += eplPointsTable['Team'].map(expectedPoints)

In [8]:
# Rank teams by predicted points (highest first) and relabel the rows 1..N
# so the index reads as the final league position.
'PREDICTED FINAL STANDINGS'
eplPointsTable = eplPointsTable.sort_values(by='Points', ascending=False)
eplPointsTable.index = np.arange(len(eplPointsTable)) + 1
eplPointsTable.round(decimals=2)

'PREDICTED FINAL STANDINGS'

Unnamed: 0,Team,Points
1,Liverpool,89.94
2,Man City,84.66
3,Chelsea,77.82
4,Tottenham,77.39
5,Man United,75.48
6,Arsenal,73.97
7,Everton,52.71
8,Leicester,52.1
9,West Ham,49.39
10,Watford,48.08
