In [1]:
from scipy.stats import poisson 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
def get_premier_league_data(start_year):
    """Download one season's ``E0.csv`` results file from football-data.co.uk.

    Args:
        start_year: Year in which the season starts, as an int
            (e.g. ``2021`` for the 2021/22 season).

    Returns:
        Whatever ``pd.read_csv`` parses from the season's CSV URL.
    """
    # The site's season code is the last two digits of both years,
    # e.g. 2021 -> "2122".
    first_year, second_year = str(start_year), str(start_year + 1)
    url = f"https://www.football-data.co.uk/mmz4281/{first_year[-2:]}{second_year[-2:]}/E0.csv"
    return pd.read_csv(url)

In [3]:
# Fetch the seasons starting in 2021 and 2020 (requested in that order).
dataa, datab = (get_premier_league_data(start_year) for start_year in (2021, 2020))

# Stack the earlier season's rows first and renumber the index from 0.
data = pd.concat([datab, dataa], ignore_index=True)


In [4]:
# Display the combined two-season frame with every downloaded column.
data

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,12/09/2020,12:30,Fulham,Arsenal,0,3,A,0,1,...,1.84,0.75,2.01,1.89,2.02,1.91,2.13,1.92,2.02,1.87
1,E0,12/09/2020,15:00,Crystal Palace,Southampton,1,0,H,1,0,...,1.70,0.25,1.78,2.13,1.79,2.17,1.85,2.18,1.79,2.12
2,E0,12/09/2020,17:30,Liverpool,Leeds,4,3,H,3,2,...,2.62,-1.50,1.85,2.05,1.85,2.08,1.90,2.16,1.84,2.04
3,E0,12/09/2020,20:00,West Ham,Newcastle,0,2,A,0,0,...,1.92,-0.50,2.03,1.87,2.04,1.88,2.09,1.91,2.02,1.86
4,E0,13/09/2020,14:00,West Brom,Leicester,0,3,A,0,0,...,1.73,0.25,1.92,1.98,1.93,1.99,1.95,2.01,1.91,1.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,E0,22/05/2022,16:00,Crystal Palace,Man United,1,0,H,1,0,...,2.04,0.25,1.68,2.15,1.74,2.23,1.88,2.25,1.74,2.16
756,E0,22/05/2022,16:00,Leicester,Southampton,4,1,H,0,0,...,2.63,-0.75,1.83,2.07,1.88,2.03,1.94,2.26,1.87,2.01
757,E0,22/05/2022,16:00,Liverpool,Wolves,3,1,H,1,1,...,3.28,-2.50,2.02,1.77,2.06,1.83,2.19,1.99,2.07,1.80
758,E0,22/05/2022,16:00,Man City,Aston Villa,3,2,H,0,1,...,3.36,-2.25,2.06,1.84,2.05,1.86,2.09,2.03,2.01,1.87


In [5]:
# Source columns to keep, in output order.
columns = ["HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR"]
# Readable names for the goal and result columns.
column_renames = {"FTHG": "HomeGoals", "FTAG": "AwayGoals", "FTR": "Result"}

# Rename first, then select by the *new* names. DataFrame.rename ignores
# labels that are not present, so re-running this cell on the already-trimmed
# frame is a harmless no-op rather than a KeyError on the old column names.
data = data.rename(columns=column_renames)[
    [column_renames.get(name, name) for name in columns]
]

In [6]:
# One row per match from the home side's perspective: who scored, against
# whom, how many goals, flagged with home=1.
home_goals = (
    data[["HomeTeam", "AwayTeam", "HomeGoals"]]
    .rename(columns={"HomeTeam": "team", "AwayTeam": "opponent", "HomeGoals": "goals"})
    .assign(home=1)
)

# The same matches from the away side's perspective, flagged with home=0.
away_goals = (
    data[["AwayTeam", "HomeTeam", "AwayGoals"]]
    .rename(columns={"AwayTeam": "team", "HomeTeam": "opponent", "AwayGoals": "goals"})
    .assign(home=0)
)

In [8]:
# Display the trimmed frame: teams, goals per side, and the result code.
data

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Result
0,Fulham,Arsenal,0,3,A
1,Crystal Palace,Southampton,1,0,H
2,Liverpool,Leeds,4,3,H
3,West Ham,Newcastle,0,2,A
4,West Brom,Leicester,0,3,A
...,...,...,...,...,...
755,Crystal Palace,Man United,1,0,H
756,Leicester,Southampton,4,1,H
757,Liverpool,Wolves,3,1,H
758,Man City,Aston Villa,3,2,H


In [9]:
# Share of matches per result code, computed once. normalize=True divides
# each count by the total number of (non-null) results.
result_share = data['Result'].value_counts(normalize=True)

# .get(..., 0.0) keeps the cell working if an outcome never occurs in the data.
homeWin = round(result_share.get('H', 0.0), 2)
awayWin = round(result_share.get('A', 0.0), 2)
noWin = round(result_share.get('D', 0.0), 2)
print(homeWin)
print(awayWin)
print(noWin)

0.4
0.37
0.22


In [10]:
# Count how often each result code occurs in every team's home matches.
results_by_home_team = data.groupby(['HomeTeam'])['Result']
grouped = results_by_home_team.value_counts()

In [15]:
# Display the per-home-team result counts.
grouped

HomeTeam     Result
Arsenal      H         21
             A         11
             D          6
Aston Villa  A         16
             H         13
                       ..
West Ham     A         10
             D          9
Wolves       A         17
             H         14
             D          7
Name: Result, Length: 69, dtype: int64

In [11]:
# Rounded mean goals for each side across matches with Result == 'H'.
home_win_matches = data.loc[data['Result'] == 'H']
homewinhome = round(home_win_matches['HomeGoals'].mean())
homewinaway = round(home_win_matches['AwayGoals'].mean())

In [12]:
# Rounded mean goals for each side across matches with Result == 'D'.
drawn_matches = data.loc[data['Result'] == 'D']
drawhome = round(drawn_matches['HomeGoals'].mean())
drawaway = round(drawn_matches['AwayGoals'].mean())

In [13]:
# Rounded mean goals for each side across matches with Result == 'A'.
away_win_matches = data.loc[data['Result'] == 'A']
awaywinhome = round(away_win_matches['HomeGoals'].mean())
awaywinaway = round(away_win_matches['AwayGoals'].mean())


In [14]:
# Fixtures to predict, as [home, away] pairs.
team = [
    ['Brighton', 'Liverpool'],
    ['Brentford', 'Burnley'],
    ['Man United', 'Tottenham'],
    ['Chelsea', 'Newcastle'],
    ['Everton', 'Wolves'],
    ['Leeds', 'Norwich'],
    ['Southampton', 'Watford'],
    ['West Ham', 'Aston Villa'],
    ['Arsenal', 'Leicester'],
    ['Crystal Palace', 'Man City'],
]

# Rounded mean (home, away) scoreline for each result code.
typical_score = {
    'H': (homewinhome, homewinaway),
    'A': (awaywinhome, awaywinaway),
    'D': (drawhome, drawaway),
}

for home, away in team:
    # Predict from the home team's most frequent home result only; the away
    # side does not influence the prediction. A home team that is absent from
    # `grouped` raises KeyError here.
    likely_result = grouped[home].idxmax()
    # Anything other than 'H' or 'A' falls back to the draw scoreline.
    homescore, awayscore = typical_score.get(likely_result, typical_score['D'])
    print(f'{home} {homescore} {away} {awayscore}')

Brighton 1 Liverpool 1
Brentford 1 Burnley 2
Man United 2 Tottenham 1
Chelsea 2 Newcastle 1
Everton 1 Wolves 2
Leeds 1 Norwich 2
Southampton 1 Watford 2
West Ham 2 Aston Villa 1
Arsenal 2 Leicester 1
Crystal Palace 1 Man City 1
