Data enrichment of the joined predictions (bookmaker and Poisson) with additional information.

* Input: "predictions_joined.csv"
* Output: "predictions_joined_enriched.csv"

In [1]:
import pandas as pd
from datetime import datetime
import geopy.distance

# Reading data

In [2]:
# Load the joined bookmaker/Poisson predictions. The file is semicolon-separated
# and the `date` column is parsed into datetimes on read.
df_read = pd.read_csv(
    './data/predictions_joined.csv',
    sep=';',
    parse_dates=['date'],
    low_memory=False,
)
df_read

Unnamed: 0,date,team_home,team_away,goals_home,goals_away,probability_home_poisson,probability_draw_poisson,probability_away_poisson,prediction_poisson,observation,is_true_poisson,probability_home_bookmaker,probability_draw_bookmaker,probability_away_bookmaker,prediction_bookmaker,is_true_bookmaker
0,2013-08-17,Sociedad,Getafe,2,0,0.674,0.186,0.132,home,home,1.0,0.578,0.278,0.211,home,1
1,2013-08-17,Valencia,Malaga,1,0,0.594,0.211,0.190,home,home,1.0,0.654,0.250,0.167,home,1
2,2013-08-17,Valladolid,Ath Bilbao,1,2,0.505,0.230,0.261,home,away,0.0,0.400,0.303,0.357,home,0
3,2013-08-18,Barcelona,Levante,7,0,0.826,0.082,0.036,home,home,1.0,0.926,0.100,0.038,home,1
4,2013-08-18,Osasuna,Granada,1,2,0.321,0.364,0.315,draw,away,0.0,0.500,0.303,0.267,home,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2424,2022-05-22,Elche,Getafe,3,1,0.328,0.382,0.290,draw,home,0.0,0.323,0.348,0.382,away,0
2425,2022-05-22,Alaves,Cadiz,0,1,0.383,0.299,0.318,home,away,0.0,0.238,0.270,0.546,away,1
2426,2022-05-22,Barcelona,Villarreal,0,2,0.544,0.224,0.224,home,away,0.0,0.476,0.250,0.333,home,0
2427,2022-05-22,Sevilla,Ath Bilbao,1,0,0.498,0.334,0.168,home,home,1.0,0.400,0.303,0.348,home,1


# Season start and end dates

In [3]:
# One entry per season, most recent first: (name, start date, end date).
# Dates are ISO formatted strings (YYYY-MM-DD).
_season_bounds = [
    ('2021-2022', '2021-08-13', '2022-05-22'),
    ('2020-2021', '2020-09-12', '2021-05-23'),
    ('2019-2020', '2019-08-16', '2020-07-19'),
    ('2018-2019', '2018-08-17', '2019-05-19'),
    ('2017-2018', '2017-08-18', '2018-05-20'),
    ('2016-2017', '2016-08-19', '2017-05-21'),
    ('2015-2016', '2015-08-21', '2016-05-15'),
    ('2014-2015', '2014-08-23', '2015-05-23'),
    ('2013-2014', '2013-08-17', '2014-05-18'),
    ('2012-2013', '2012-08-18', '2013-06-01'),
]

# Expand the compact tuples into the list-of-dicts shape used by the cells below.
seasons_complete = [
    {'name': season_name, 'start_date': first_day, 'end_date': last_day}
    for season_name, first_day, last_day in _season_bounds
]

# How far in the season are we

In [4]:
# Work on a copy so the frame read from disk stays untouched.
df = df_read.copy()

# Build a (date -> season_progress) lookup table. Progress is the elapsed
# fraction of the season on the match date: 0.0 on the season's start date,
# 1.0 on its end date, rounded to two decimals.
season_frames = []

for season in seasons_complete:

    start_date_season = pd.Timestamp(season['start_date'])
    end_date_season = pd.Timestamp(season['end_date'])
    length_season = end_date_season - start_date_season

    # Match dates that fall inside this season (both bounds inclusive).
    dates_season = df.loc[df['date'].between(start_date_season, end_date_season), 'date']

    # Vectorized over the whole season instead of one Python iteration per match.
    season_frames.append(pd.DataFrame({
        'date': dates_season,
        'season_progress': ((dates_season - start_date_season) / length_season).round(2),
    }))

# Several matches share a date; keep a single row per date so that the table
# is a valid many-to-one lookup when it is merged back onto the matches.
df_seasons_progress = pd.concat(season_frames, ignore_index=True)
df_seasons_progress = df_seasons_progress.drop_duplicates(subset='date')
df_seasons_progress

Unnamed: 0,date,season_progress
0,2021-08-13,0.00
1,2021-08-14,0.00
3,2021-08-15,0.01
5,2021-08-16,0.01
7,2021-08-20,0.02
...,...,...
2415,2014-05-10,0.97
2416,2014-05-11,0.97
2422,2014-05-16,0.99
2423,2014-05-17,1.00


In [5]:
# Attach the season progress to every match via its date.
# The merge starts from the untouched `df_read` rather than from `df`, so
# re-running this cell does not pile up `season_progress_x` / `_y` columns.
# `validate` makes pandas raise if a date ever appears twice in the lookup
# table, instead of silently duplicating match rows.
df = df_read.merge(df_seasons_progress, on='date', how='left', validate='many_to_one')
df

Unnamed: 0,date,team_home,team_away,goals_home,goals_away,probability_home_poisson,probability_draw_poisson,probability_away_poisson,prediction_poisson,observation,is_true_poisson,probability_home_bookmaker,probability_draw_bookmaker,probability_away_bookmaker,prediction_bookmaker,is_true_bookmaker,season_progress
0,2013-08-17,Sociedad,Getafe,2,0,0.674,0.186,0.132,home,home,1.0,0.578,0.278,0.211,home,1,0.0
1,2013-08-17,Valencia,Malaga,1,0,0.594,0.211,0.190,home,home,1.0,0.654,0.250,0.167,home,1,0.0
2,2013-08-17,Valladolid,Ath Bilbao,1,2,0.505,0.230,0.261,home,away,0.0,0.400,0.303,0.357,home,0,0.0
3,2013-08-18,Barcelona,Levante,7,0,0.826,0.082,0.036,home,home,1.0,0.926,0.100,0.038,home,1,0.0
4,2013-08-18,Osasuna,Granada,1,2,0.321,0.364,0.315,draw,away,0.0,0.500,0.303,0.267,home,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2424,2022-05-22,Elche,Getafe,3,1,0.328,0.382,0.290,draw,home,0.0,0.323,0.348,0.382,away,0,1.0
2425,2022-05-22,Alaves,Cadiz,0,1,0.383,0.299,0.318,home,away,0.0,0.238,0.270,0.546,away,1,1.0
2426,2022-05-22,Barcelona,Villarreal,0,2,0.544,0.224,0.224,home,away,0.0,0.476,0.250,0.333,home,0,1.0
2427,2022-05-22,Sevilla,Ath Bilbao,1,0,0.498,0.334,0.168,home,home,1.0,0.400,0.303,0.348,home,1,1.0


# Distance between stadiums
It is a measure of how far an "away team" is from home.

In [6]:
# Load the stadium table (columns: team, stadium, latitude, longitude) from a
# semicolon-separated, UTF-8 encoded file.
df_stadiums = pd.read_csv(
    './data/stadiums_coordinates.csv',
    sep=';',
    encoding='UTF-8',
)
df_stadiums.head()

Unnamed: 0,team,stadium,latitude,longitude
0,Alaves,Estadio de Mendizorroza,42.837154,-2.688511
1,Albacete,Estadio Municipal Carlos Belmonte,38.981551,-1.852065
2,Almeria,Estadio Municipal de los Juegos Mediterraneos,36.840251,-2.436027
3,Ath Bilbao,San Mames,43.264062,-2.948984
4,Ath Madrid,Wanda Metropolitano,40.436308,-3.599478


In [7]:
# Map every team to its stadium coordinates once, as (latitude, longitude).
# If a team is listed more than once, its first row wins.
stadium_coordinates = {
    stadium.team: (stadium.latitude, stadium.longitude)
    for stadium in df_stadiums.drop_duplicates(subset='team').itertuples()
}

# Fail early with a readable message if a team has no stadium entry.
teams_in_matches = set(df['team_home']) | set(df['team_away'])
teams_without_stadium = sorted(teams_in_matches - stadium_coordinates.keys(), key=str)
if teams_without_stadium:
    raise KeyError(f'No stadium coordinates found for: {teams_without_stadium}')

distances = []

for match in df.itertuples():

    team_home = match.team_home
    team_away = match.team_away

    # Geodesic distance between the two stadiums, in whole kilometres.
    distance = geopy.distance.geodesic(
        stadium_coordinates[team_home],
        stadium_coordinates[team_away],
    ).km
    distances.append([match.date, team_home, team_away, round(distance)])


df_distances = pd.DataFrame(data=distances, columns=['date', 'team_home', 'team_away', 'distance'])
df_distances

Unnamed: 0,date,team_home,team_away,distance
0,2013-08-17,Sociedad,Getafe,361
1,2013-08-17,Valencia,Malaga,469
2,2013-08-17,Valladolid,Ath Bilbao,234
3,2013-08-18,Barcelona,Levante,297
4,2013-08-18,Osasuna,Granada,649
...,...,...,...,...
2424,2022-05-22,Elche,Getafe,349
2425,2022-05-22,Alaves,Cadiz,767
2426,2022-05-22,Barcelona,Villarreal,247
2427,2022-05-22,Sevilla,Ath Bilbao,701


In [8]:
# Attach the stadium distance to every match.
# - Any `distance` column left over from a previous run of this cell is dropped
#   first, so re-running does not create `distance_x` / `distance_y`.
# - `df_distances` holds one row per match; de-duplicating its keys guarantees
#   that a repeated (date, home, away) key cannot multiply match rows.
fixture_keys = ['date', 'team_home', 'team_away']
df = (
    df
    .drop(columns='distance', errors='ignore')
    .merge(df_distances.drop_duplicates(subset=fixture_keys), on=fixture_keys, how='left')
)
df

Unnamed: 0,date,team_home,team_away,goals_home,goals_away,probability_home_poisson,probability_draw_poisson,probability_away_poisson,prediction_poisson,observation,is_true_poisson,probability_home_bookmaker,probability_draw_bookmaker,probability_away_bookmaker,prediction_bookmaker,is_true_bookmaker,season_progress,distance
0,2013-08-17,Sociedad,Getafe,2,0,0.674,0.186,0.132,home,home,1.0,0.578,0.278,0.211,home,1,0.0,361
1,2013-08-17,Valencia,Malaga,1,0,0.594,0.211,0.190,home,home,1.0,0.654,0.250,0.167,home,1,0.0,469
2,2013-08-17,Valladolid,Ath Bilbao,1,2,0.505,0.230,0.261,home,away,0.0,0.400,0.303,0.357,home,0,0.0,234
3,2013-08-18,Barcelona,Levante,7,0,0.826,0.082,0.036,home,home,1.0,0.926,0.100,0.038,home,1,0.0,297
4,2013-08-18,Osasuna,Granada,1,2,0.321,0.364,0.315,draw,away,0.0,0.500,0.303,0.267,home,0,0.0,649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2424,2022-05-22,Elche,Getafe,3,1,0.328,0.382,0.290,draw,home,0.0,0.323,0.348,0.382,away,0,1.0,349
2425,2022-05-22,Alaves,Cadiz,0,1,0.383,0.299,0.318,home,away,0.0,0.238,0.270,0.546,away,1,1.0,767
2426,2022-05-22,Barcelona,Villarreal,0,2,0.544,0.224,0.224,home,away,0.0,0.476,0.250,0.333,home,0,1.0,247
2427,2022-05-22,Sevilla,Ath Bilbao,1,0,0.498,0.334,0.168,home,home,1.0,0.400,0.303,0.348,home,1,1.0,701


# Saving as CSV

In [9]:
# Write the enriched matches as a semicolon-separated UTF-8 file, without the
# DataFrame index.
df.to_csv(
    './data/predictions_joined_enriched.csv',
    sep=';',
    encoding='UTF-8',
    index=False,
)