In [1]:
# Dependencies
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import poisson


In [2]:
# Register for an API token at the-odds-api.com and export it as the
# ODDS_API_KEY environment variable so the key never lives in the notebook.
# Odds are updated constantly, so a re-run is not guaranteed to reproduce the
# outputs shown below; look through the results before relying on them.
TOKEN = os.environ.get("ODDS_API_KEY", "")

In [3]:
# Request URL: soccer_epl odds, EU region, head-to-head (h2h) market
url = f"https://api.the-odds-api.com/v4/sports/soccer_epl/odds/?apiKey={TOKEN}&regions=eu&markets=h2h"

In [4]:
# Fetch the odds feed and parse the JSON payload into a DataFrame
df = pd.read_json(path_or_buf=url)

In [5]:
# Keep only the columns needed to price each fixture
fixtures = df.loc[:, ["home_team", "away_team", "bookmakers"]]

In [6]:
# One row per (fixture, bookmaker) instead of one list of bookmakers per fixture
data = fixtures.explode(column="bookmakers")

In [7]:
# Pull the outcome list out of each bookmaker's first listed market
data["odds"] = data["bookmakers"].map(
    lambda bookmaker: bookmaker["markets"][0]["outcomes"]
)

In [8]:
# Collapse each bookmaker record down to its 'key' value.
# Must run after the odds were extracted above: the record is overwritten here.
data["bookmakers"] = data["bookmakers"].map(
    lambda bookmaker: bookmaker["key"]
)


In [9]:
# Spread each row's outcome list over three columns, then drop the raw list
outcome_cols = pd.DataFrame(
    data["odds"].to_list(),
    columns=["bet_1", "bet_2", "bet_3"],
)
data = pd.concat(
    [data.reset_index(drop=True), outcome_cols],
    axis=1,
).drop(columns="odds")

In [10]:
# Preview the flattened odds table (one row per fixture and bookmaker)
data

Unnamed: 0,home_team,away_team,bookmakers,bet_1,bet_2,bet_3
0,Wolverhampton Wanderers,Leeds United,marathonbet,"{'name': 'Leeds United', 'price': 3.46}","{'name': 'Wolverhampton Wanderers', 'price': 2...","{'name': 'Draw', 'price': 3.58}"
1,Wolverhampton Wanderers,Leeds United,pinnacle,"{'name': 'Leeds United', 'price': 3.54}","{'name': 'Wolverhampton Wanderers', 'price': 2.2}","{'name': 'Draw', 'price': 3.51}"
2,Wolverhampton Wanderers,Leeds United,betclic,"{'name': 'Leeds United', 'price': 3.4}","{'name': 'Wolverhampton Wanderers', 'price': 2...","{'name': 'Draw', 'price': 3.48}"
3,Wolverhampton Wanderers,Leeds United,mybookieag,"{'name': 'Leeds United', 'price': 3.35}","{'name': 'Wolverhampton Wanderers', 'price': 2...","{'name': 'Draw', 'price': 3.4}"
4,Wolverhampton Wanderers,Leeds United,onexbet,"{'name': 'Leeds United', 'price': 3.48}","{'name': 'Wolverhampton Wanderers', 'price': 2...","{'name': 'Draw', 'price': 3.6}"
...,...,...,...,...,...,...
133,Crystal Palace,Arsenal,mybookieag,"{'name': 'Arsenal', 'price': 1.85}","{'name': 'Crystal Palace', 'price': 3.9}","{'name': 'Draw', 'price': 3.3}"
134,Crystal Palace,Arsenal,unibet,"{'name': 'Arsenal', 'price': 1.9300000000000002}","{'name': 'Crystal Palace', 'price': 4.0}","{'name': 'Draw', 'price': 3.55}"
135,Crystal Palace,Arsenal,sport888,"{'name': 'Arsenal', 'price': 1.9300000000000002}","{'name': 'Crystal Palace', 'price': 4.0}","{'name': 'Draw', 'price': 3.55}"
136,Crystal Palace,Arsenal,betfair,"{'name': 'Arsenal', 'price': 1.87}","{'name': 'Crystal Palace', 'price': 4.0}","{'name': 'Draw', 'price': 3.4}"


In [11]:
def convert_home_unfair(row):
    """Return the quoted price of the outcome named after the row's home team.

    The bet columns are scanned in the order bet_1, bet_2, bet_3 and the first
    match wins. Returns None when no outcome carries the home team's name.
    """
    matching_prices = (
        row[bet]["price"]
        for bet in ("bet_1", "bet_2", "bet_3")
        if row[bet]["name"] == row["home_team"]
    )
    return next(matching_prices, None)
def convert_away_unfair(row):
    """Return the quoted price of the outcome named after the row's away team.

    The bet columns are scanned in the order bet_1, bet_2, bet_3 and the first
    match wins. Returns None when no outcome carries the away team's name.
    """
    matching_prices = (
        row[bet]["price"]
        for bet in ("bet_1", "bet_2", "bet_3")
        if row[bet]["name"] == row["away_team"]
    )
    return next(matching_prices, None)
def convert_draw_unfair(row):
    """Return the quoted price of the outcome named "Draw".

    bet_3 is checked first, then bet_1, then bet_2. Returns None when none of
    the three outcomes is named "Draw".
    """
    matching_prices = (
        row[bet]["price"]
        for bet in ("bet_3", "bet_1", "bet_2")
        if row[bet]["name"] == "Draw"
    )
    return next(matching_prices, None)

In [12]:
# Extract the quoted (margin-inclusive) price for each of the three outcomes
for column, extractor in (
    ("home_unfair", convert_home_unfair),
    ("away_unfair", convert_away_unfair),
    ("draw_unfair", convert_draw_unfair),
):
    data[column] = data.apply(extractor, axis=1)

In [13]:
# The raw outcome records are no longer needed once the prices are extracted
data = data.drop(columns=["bet_1", "bet_2", "bet_3"])

In [14]:
# Overround: sum of the implied probabilities (1 / price) of the three outcomes
implied = {
    side: 1 / data[f"{side}_unfair"]
    for side in ("home", "away", "draw")
}
data["overround"] = implied["home"] + implied["away"] + implied["draw"]

In [15]:
# Fair probability: implied probability rescaled by the overround so the
# three outcomes of a row sum to 1
for outcome in ("home", "away", "draw"):
    data[f"{outcome}_fair_prob"] = (1 / data[f"{outcome}_unfair"]) / data["overround"]
data

Unnamed: 0,home_team,away_team,bookmakers,home_unfair,away_unfair,draw_unfair,overround,home_fair_prob,away_fair_prob,draw_fair_prob
0,Wolverhampton Wanderers,Leeds United,marathonbet,2.19,3.46,3.58,1.024968,0.445498,0.281977,0.272525
1,Wolverhampton Wanderers,Leeds United,pinnacle,2.20,3.54,3.51,1.021932,0.444790,0.276423,0.278786
2,Wolverhampton Wanderers,Leeds United,betclic,2.18,3.40,3.48,1.040190,0.440992,0.282754,0.276254
3,Wolverhampton Wanderers,Leeds United,mybookieag,2.14,3.35,3.40,1.059915,0.440875,0.281633,0.277492
4,Wolverhampton Wanderers,Leeds United,onexbet,2.21,3.48,3.60,1.017623,0.444653,0.282380,0.272967
...,...,...,...,...,...,...,...,...,...,...
133,Crystal Palace,Arsenal,mybookieag,3.90,1.85,3.30,1.099981,0.233104,0.491409,0.275487
134,Crystal Palace,Arsenal,unibet,4.00,1.93,3.55,1.049825,0.238135,0.493544,0.268321
135,Crystal Palace,Arsenal,sport888,4.00,1.93,3.55,1.049825,0.238135,0.493544,0.268321
136,Crystal Palace,Arsenal,betfair,4.00,1.87,3.40,1.078877,0.231722,0.495663,0.272615


In [16]:
# Spread of the fair probabilities across bookmakers, per fixture
fair_prob_cols = ["home_fair_prob", "away_fair_prob", "draw_fair_prob"]
data.groupby(["home_team", "away_team"])[fair_prob_cols].std()

Unnamed: 0_level_0,Unnamed: 1_level_0,home_fair_prob,away_fair_prob,draw_fair_prob
home_team,away_team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aston Villa,Arsenal,0.003361,0.002636,0.004514
Brighton and Hove Albion,Norwich City,0.007017,0.007201,0.004278
Burnley,Manchester City,0.005838,0.009333,0.006066
Chelsea,Brentford,0.010298,0.004913,0.006571
Crystal Palace,Arsenal,0.007632,0.006242,0.003919
Leeds United,Southampton,0.009043,0.005558,0.011838
Leicester City,Brentford,0.003438,0.002818,0.003538
Liverpool,Watford,0.011942,0.004861,0.007969
Manchester City,Liverpool,,,
Manchester United,Leicester City,0.005997,0.002821,0.004328


In [17]:
def get_premier_league_data(start_year):
    """Download one season's E0 results file from football-data.co.uk.

    start_year is the calendar year in which the season starts; the season
    code in the URL is the last two digits of that year followed by the last
    two digits of the next one (2021 -> "2122"). Returns the parsed DataFrame.
    """
    season_code = f"{str(start_year)[-2:]}{str(start_year + 1)[-2:]}"
    csv_url = f"https://www.football-data.co.uk/mmz4281/{season_code}/E0.csv"
    return pd.read_csv(csv_url)

In [18]:
# Get data from the 2020/21 and 2021/22 seasons and stack them oldest-first
data_2021, data_2020 = (
    get_premier_league_data(year) for year in (2021, 2020)
)
data_total = pd.concat([data_2020, data_2021])
data_total.tail()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
279,E0,13/03/2022,16:30,Arsenal,Leicester,2,0,H,1,0,...,2.3,-1.25,1.97,1.96,1.97,1.95,1.98,2.0,1.93,1.95
280,E0,14/03/2022,20:00,Crystal Palace,Man City,0,0,D,0,0,...,2.21,1.5,1.9,2.03,1.92,2.01,2.03,2.05,1.91,1.97
281,E0,16/03/2022,19:30,Brighton,Tottenham,0,2,A,0,1,...,1.76,0.25,2.05,1.75,2.11,1.81,2.13,1.87,2.06,1.82
282,E0,16/03/2022,20:15,Arsenal,Liverpool,0,2,A,0,0,...,2.06,0.5,1.81,2.09,1.85,2.09,1.9,2.12,1.83,2.06
283,E0,17/03/2022,19:45,Everton,Newcastle,1,0,H,0,0,...,1.69,-0.25,2.03,1.87,2.06,1.87,2.09,1.88,2.05,1.84


In [19]:
# Keep the team, goals and result columns, under more readable names
columns = ["HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR"]
readable_names = {"FTHG": "HomeGoals", "FTAG": "AwayGoals", "FTR": "Result"}
data_total = data_total[columns].rename(columns=readable_names)

In [20]:

# Home side's view of every match: goals scored at home, flagged home=1
home_goals = (
    data_total[["HomeTeam", "AwayTeam", "HomeGoals"]]
    .rename(columns={"HomeTeam": "team", "AwayTeam": "opponent", "HomeGoals": "goals"})
    .assign(home=1)
)

# Away side's view of the same matches: goals scored away, flagged home=0
away_goals = (
    data_total[["AwayTeam", "HomeTeam", "AwayGoals"]]
    .rename(columns={"AwayTeam": "team", "HomeTeam": "opponent", "AwayGoals": "goals"})
    .assign(home=0)
)

In [21]:
# Stack the home and away views into one long training table,
# then count how many rows each team contributes
frames = [home_goals, away_goals]
training_data = pd.concat(frames)
training_data["team"].value_counts()

Man United          67
Leeds               67
Wolves              67
West Ham            67
Newcastle           67
Southampton         67
Brighton            67
Man City            67
Crystal Palace      67
Liverpool           67
Chelsea             66
Tottenham           66
Aston Villa         66
Arsenal             65
Everton             65
Burnley             65
Leicester           64
Sheffield United    38
West Brom           38
Fulham              38
Brentford           29
Norwich             29
Watford             29
Name: team, dtype: int64

In [22]:
# Poisson regression (log-linear GLM): goals explained by home advantage,
# the scoring team and the opponent
poisson_spec = smf.glm(
    formula="goals ~ home + team + opponent",
    data=training_data,
    family=sm.families.Poisson(),
)
poisson_model = poisson_spec.fit()

In [23]:
# Show the fitted model's summary table (coefficients, standard errors, fit statistics)
poisson_model.summary()

0,1,2,3
Dep. Variable:,goals,No. Observations:,1328.0
Model:,GLM,Df Residuals:,1282.0
Model Family:,Poisson,Df Model:,45.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-1902.6
Date:,"Fri, 18 Mar 2022",Deviance:,1490.6
Time:,00:29:19,Pearson chi2:,1330.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.1400,0.160,0.873,0.383,-0.174,0.454
team[T.Aston Villa],-0.0332,0.144,-0.230,0.818,-0.315,0.249
team[T.Brentford],-0.2896,0.204,-1.418,0.156,-0.690,0.111
team[T.Brighton],-0.4212,0.159,-2.642,0.008,-0.734,-0.109
team[T.Burnley],-0.5572,0.169,-3.301,0.001,-0.888,-0.226
team[T.Chelsea],0.1444,0.138,1.048,0.294,-0.126,0.414
team[T.Crystal Palace],-0.1995,0.151,-1.322,0.186,-0.495,0.096
team[T.Everton],-0.2409,0.153,-1.573,0.116,-0.541,0.059
team[T.Fulham],-0.7302,0.218,-3.354,0.001,-1.157,-0.303


In [24]:
def create_X(home_team, away_team):
    """Build the one-row feature frames for both sides of a match.

    Returns (X_home, X_away). Each frame has the columns team, opponent and
    home (1 for the home side, 0 for the away side) and a single row with
    index label 1.
    """
    def _feature_row(team, opponent, is_home):
        return pd.DataFrame(
            {"team": team, "opponent": opponent, "home": is_home},
            index=[1],
        )

    return (
        _feature_row(home_team, away_team, 1),
        _feature_row(away_team, home_team, 0),
    )

In [25]:
def predict_avg_goals(X_home, X_away, model):
    """Return (home, away) predictions from ``model.predict``.

    ``model`` is any object exposing ``predict(X)``; the home features are
    predicted first, then the away features.
    """
    return model.predict(X_home), model.predict(X_away)

In [26]:

def predict_score_pmf(X_home, X_away, model, max_goals):
    """Return the joint scoreline probability grid for a match.

    Each side's goal count from 0 to ``max_goals`` is given a Poisson
    probability using that side's predicted mean, and the two vectors are
    combined with an outer product. Entry [i, j] is the probability of the
    home side scoring i and the away side scoring j.
    """
    home_goals_avg, away_goals_avg = predict_avg_goals(X_home, X_away, model)
    goal_counts = range(max_goals + 1)

    home_goals_pmf = np.array([poisson.pmf(k, home_goals_avg) for k in goal_counts])
    away_goals_pmf = np.array([poisson.pmf(k, away_goals_avg) for k in goal_counts])

    return np.outer(home_goals_pmf, away_goals_pmf)

In [27]:
def predict_score(X_home, X_away, model, max_goals=16):
    """Return the most likely scoreline as ``(home_goals, away_goals)``.

    Parameters
    ----------
    X_home, X_away : feature frames for the home and away side, passed
        straight through to ``predict_score_pmf``.
    model : fitted model, passed straight through to ``predict_score_pmf``.
    max_goals : largest goal count per side included in the score grid.
        Defaults to 16, the value that was previously hard-coded.

    Returns
    -------
    tuple of two ints: the row (home goals) and column (away goals) of the
    highest-probability cell of the score grid.
    """
    score_pmf = predict_score_pmf(X_home, X_away, model, max_goals)

    # Take the argmax on the raw probabilities: rounding first can turn two
    # close cells into an artificial tie and report the wrong one.
    # The grid shape is read from the array instead of repeating max_goals + 1.
    home_goals_mode, away_goals_mode = np.unravel_index(
        np.argmax(score_pmf), score_pmf.shape
    )
    return int(home_goals_mode), int(away_goals_mode)

In [28]:
# Map the odds feed's full club names to the short names used in the results data
short_names = {
    'Norwich City': 'Norwich',
    'Newcastle United': 'Newcastle',
    'Brighton and Hove Albion': 'Brighton',
    'Leicester City': 'Leicester',
    'Manchester City': 'Man City',
    'Leeds United': 'Leeds',
    'West Ham United': 'West Ham',
    'Tottenham Hotspur': 'Tottenham',
    'Manchester United': 'Man United',
    'Wolverhampton Wanderers': 'Wolves',
}
data = data.replace(short_names)

In [29]:
# Same per-fixture spread as before, now keyed by the short team names
fair_prob_cols = ["home_fair_prob", "away_fair_prob", "draw_fair_prob"]
data.groupby(["home_team", "away_team"])[fair_prob_cols].std()

Unnamed: 0_level_0,Unnamed: 1_level_0,home_fair_prob,away_fair_prob,draw_fair_prob
home_team,away_team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aston Villa,Arsenal,0.003361,0.002636,0.004514
Brighton,Norwich,0.007017,0.007201,0.004278
Burnley,Man City,0.005838,0.009333,0.006066
Chelsea,Brentford,0.010298,0.004913,0.006571
Crystal Palace,Arsenal,0.007632,0.006242,0.003919
Leeds,Southampton,0.009043,0.005558,0.011838
Leicester,Brentford,0.003438,0.002818,0.003538
Liverpool,Watford,0.011942,0.004861,0.007969
Man City,Liverpool,,,
Man United,Leicester,0.005997,0.002821,0.004328


In [30]:
# Average fair probability of each outcome across bookmakers, per fixture
fair_prob_cols = ["home_fair_prob", "away_fair_prob", "draw_fair_prob"]
data.groupby(["home_team", "away_team"])[fair_prob_cols].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,home_fair_prob,away_fair_prob,draw_fair_prob
home_team,away_team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aston Villa,Arsenal,0.321435,0.40508,0.273485
Brighton,Norwich,0.609937,0.148867,0.241196
Burnley,Man City,0.072656,0.783986,0.143358
Chelsea,Brentford,0.716119,0.097391,0.18649
Crystal Palace,Arsenal,0.238551,0.490429,0.27102
Leeds,Southampton,0.392535,0.338119,0.269346
Leicester,Brentford,0.441155,0.278486,0.280359
Liverpool,Watford,0.8482,0.047049,0.104751
Man City,Liverpool,0.473087,0.268984,0.25793
Man United,Leicester,0.616041,0.171224,0.212735


In [31]:
# Fixtures to price, keyed by (home_team, away_team) using the short names.
# Each value ends up as:
#   [predicted_home_goals, predicted_away_goals, market_prob, share_pct]
# market_prob is the bookmaker-average fair probability of the predicted
# outcome; share_pct is that probability as a rounded percentage of the total
# over all listed fixtures.
fixtures={
    ('Wolves', 'Leeds'):0,
    ('Aston Villa', 'Arsenal'):0,
    ('Leicester', 'Brentford'):0,
    ('Tottenham','West Ham'):0
}

# Step 1: most likely scoreline for every fixture from the Poisson model
for matchup in fixtures:
    X_home, X_away = create_X(matchup[0], matchup[1])
    fixtures[matchup] = list(predict_score(X_home, X_away, poisson_model))

# Step 2: look up the market's fair probability for the predicted outcome
sum_prob = 0
for name, group in data.groupby(['home_team','away_team']):
    if name not in fixtures:
        continue
    home_pred, away_pred = fixtures[name][0], fixtures[name][1]
    if home_pred > away_pred:
        prob_col = "home_fair_prob"
    elif home_pred == away_pred:
        prob_col = "draw_fair_prob"
    else:
        # Only an away win remains. The previous condition had its closing
        # bracket in the wrong place and compared inside the index expression.
        prob_col = "away_fair_prob"
    # Select the column as a Series so .mean() is a scalar; no positional [0]
    outcome_prob = group[prob_col].mean()
    fixtures[name].append(outcome_prob)
    sum_prob += outcome_prob

# Step 3: each fixture's share of the total probability, as a whole percentage
for entry in fixtures.values():
    if len(entry) < 3:
        # No odds rows matched this fixture (e.g. a team-name mismatch):
        # keep the entry's shape but mark probability and share as unknown.
        entry.extend([None, None])
    else:
        entry.append(round(entry[2] / sum_prob * 100))
print(fixtures)

{('Wolves', 'Leeds'): [1, 1, 0.27916310130303784, 23], ('Aston Villa', 'Arsenal'): [1, 1, 0.2734852077106782, 22], ('Leicester', 'Brentford'): [2, 1, 0.441154574881816, 36], ('Tottenham', 'West Ham'): [1, 1, 0.2423537332717092, 20]}
