This notebook reads the bookmaker predictions and joins them with the Poisson model predictions.

* Input: "predictions_bookmaker.csv" and the Poisson prediction files ("predictions_train_*.csv") manually copied from the "output/predictions_past" folder into "data/predictions_past"
* Output: "predictions_joined.csv"

In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
# Load the bookmaker predictions; they are stored in one semicolon-separated CSV file.

df_bookmaker = pd.read_csv(
    './data/predictions_bookmaker.csv',
    sep=';',
    parse_dates=['date'],  # parse 'date' as datetime so it can serve as a join key later
    low_memory=False,
)
df_bookmaker

Unnamed: 0,date,team_home,team_away,goals_home,goals_away,probability_home,probability_draw,probability_away,prediction,observation,is_true
0,2012-08-18,Celta,Malaga,0,1,0.444,0.308,0.312,home,away,0
1,2012-08-18,Mallorca,Espanol,2,1,0.500,0.303,0.263,home,home,1
2,2012-08-18,Sevilla,Getafe,2,1,0.617,0.267,0.182,home,home,1
3,2012-08-19,Ath Bilbao,Betis,3,5,0.546,0.286,0.238,home,away,0
4,2012-08-19,Barcelona,Sociedad,5,1,0.909,0.111,0.038,home,home,1
...,...,...,...,...,...,...,...,...,...,...,...
3795,2022-05-22,Granada,Espanol,0,0,0.694,0.231,0.133,home,draw,0
3796,2022-05-22,Osasuna,Mallorca,0,2,0.303,0.294,0.455,away,away,1
3797,2022-05-22,Barcelona,Villarreal,0,2,0.476,0.250,0.333,home,away,0
3798,2022-05-22,Sevilla,Ath Bilbao,1,0,0.400,0.303,0.348,home,home,1


In [3]:
# Reading Poisson model predictions (folder with multiple files manually copied into "data" folder).

# Sort the matched paths so the files are always concatenated in the same order.
files_prediction_poisson = sorted(glob.glob('./data/predictions_past/predictions_train_*csv'))

# Read every file first and concatenate once: calling pd.concat inside the loop
# re-copies all previously read rows on each iteration, and concatenating onto an
# empty seed DataFrame raises a deprecation warning on recent pandas versions.
frames_prediction_poisson = [
    pd.read_csv(file, sep=';', parse_dates=['date'], low_memory=False)
    for file in files_prediction_poisson
]

# pd.concat raises on an empty list, so fall back to an empty frame when no file matched.
# Each file keeps its own row index (no ignore_index), as before.
df_prediction_poisson = (
    pd.concat(frames_prediction_poisson) if frames_prediction_poisson else pd.DataFrame()
)

df_prediction_poisson

Unnamed: 0,date,team_home,team_away,goals_home,goals_away,probability_home,probability_draw,probability_away,prediction,observation,is_true
0,2013-08-17,Sociedad,Getafe,2,0,0.674,0.186,0.132,home,home,1.0
1,2013-08-17,Valencia,Malaga,1,0,0.594,0.211,0.190,home,home,1.0
2,2013-08-17,Valladolid,Ath Bilbao,1,2,0.505,0.230,0.261,home,away,0.0
3,2013-08-18,Barcelona,Levante,7,0,0.826,0.082,0.036,home,home,1.0
4,2013-08-18,Osasuna,Granada,1,2,0.321,0.364,0.315,draw,away,0.0
...,...,...,...,...,...,...,...,...,...,...,...
375,2022-05-22,Granada,Espanol,0,0,,,,,draw,
376,2022-05-22,Osasuna,Mallorca,0,2,,,,,away,
377,2022-05-22,Barcelona,Villarreal,0,2,0.544,0.224,0.224,home,away,0.0
378,2022-05-22,Sevilla,Ath Bilbao,1,0,0.498,0.334,0.168,home,home,1.0


In [4]:
# Combine both prediction sources into one frame, one row per match.

# The goals and the observed result are already part of the Poisson frame,
# so they are removed from the bookmaker side before joining.
bookmaker_predictions = df_bookmaker.drop(columns=['goals_home', 'goals_away', 'observation'])

# Left join: every Poisson row is kept; matches without a bookmaker row get NaN
# in the '*_bookmaker' columns.
df = df_prediction_poisson.merge(
    bookmaker_predictions,
    on=['date', 'team_home', 'team_away'],
    how='left',
    suffixes=('_poisson', '_bookmaker'),
)

In [5]:
# Deleting matches with no clear prediction.

# Rows with any missing value are removed: a missing Poisson prediction or a match
# without a bookmaker counterpart in the left join shows up as NaN.
df = df.dropna()

# Keep only matches where neither source reports an 'unknown' prediction.
# The original cell filtered 'prediction_bookmaker' twice, so the Poisson column
# was never checked.
has_clear_prediction = (
    (df['prediction_bookmaker'] != 'unknown')
    & (df['prediction_poisson'] != 'unknown')
)
df = df.loc[has_clear_prediction].copy()
df

Unnamed: 0,date,team_home,team_away,goals_home,goals_away,probability_home_poisson,probability_draw_poisson,probability_away_poisson,prediction_poisson,observation,is_true_poisson,probability_home_bookmaker,probability_draw_bookmaker,probability_away_bookmaker,prediction_bookmaker,is_true_bookmaker
0,2013-08-17,Sociedad,Getafe,2,0,0.674,0.186,0.132,home,home,1.0,0.578,0.278,0.211,home,1
1,2013-08-17,Valencia,Malaga,1,0,0.594,0.211,0.190,home,home,1.0,0.654,0.250,0.167,home,1
2,2013-08-17,Valladolid,Ath Bilbao,1,2,0.505,0.230,0.261,home,away,0.0,0.400,0.303,0.357,home,0
3,2013-08-18,Barcelona,Levante,7,0,0.826,0.082,0.036,home,home,1.0,0.926,0.100,0.038,home,1
4,2013-08-18,Osasuna,Granada,1,2,0.321,0.364,0.315,draw,away,0.0,0.500,0.303,0.267,home,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3413,2022-05-22,Elche,Getafe,3,1,0.328,0.382,0.290,draw,home,0.0,0.323,0.348,0.382,away,0
3414,2022-05-22,Alaves,Cadiz,0,1,0.383,0.299,0.318,home,away,0.0,0.238,0.270,0.546,away,1
3417,2022-05-22,Barcelona,Villarreal,0,2,0.544,0.224,0.224,home,away,0.0,0.476,0.250,0.333,home,0
3418,2022-05-22,Sevilla,Ath Bilbao,1,0,0.498,0.334,0.168,home,home,1.0,0.400,0.303,0.348,home,1


In [6]:
# Write the joined predictions to disk as a semicolon-separated CSV, without the row index.

df.to_csv(
    './data/predictions_joined.csv',
    sep=';',
    encoding='UTF-8',
    index=False,
)