# Get the predictions
Website: https://projects.fivethirtyeight.com/soccer-predictions/  
Data source (git): https://github.com/fivethirtyeight/data/tree/master/soccer-spi/

Gets the predictions for a selection of leagues, covering the matches from today through the next 21 days.

Stores them in `./data/This_months_predictions.csv`.

In [1]:
from datetime import date, timedelta
from pathlib import Path

import pandas as pd

# show every column when a DataFrame is displayed in this notebook
# (None means "no limit"; 0 is the terminal-oriented "auto-detect the width" setting)
pd.set_option('display.max_columns', None)

In [2]:
# download the complete FiveThirtyEight club-match table;
# the first column is parsed as a datetime
SPI_MATCHES_URL = 'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv'
df = pd.read_csv(SPI_MATCHES_URL, parse_dates=[0])

# sanity check: dimensions first, then a preview of the first rows
print(df.shape)
df.head()

(20923, 22)


Unnamed: 0,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,probtie,proj_score1,proj_score2,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
0,2016-08-12,1843,French Ligue 1,Bastia,Paris Saint-Germain,51.16,85.68,0.0463,0.838,0.1157,0.91,2.36,32.4,67.7,0.0,1.0,0.97,0.63,0.43,0.45,0.0,1.05
1,2016-08-12,1843,French Ligue 1,AS Monaco,Guingamp,68.85,56.48,0.5714,0.1669,0.2617,1.82,0.86,53.7,22.9,2.0,2.0,2.45,0.77,1.75,0.42,2.1,2.1
2,2016-08-13,2411,Barclays Premier League,Hull City,Leicester City,53.57,66.81,0.3459,0.3621,0.2921,1.16,1.24,38.1,22.2,2.0,1.0,0.85,2.77,0.17,1.25,2.1,1.05
3,2016-08-13,2411,Barclays Premier League,Crystal Palace,West Bromwich Albion,55.19,58.66,0.4214,0.2939,0.2847,1.35,1.14,43.6,34.6,0.0,1.0,1.11,0.68,0.84,1.6,0.0,1.05
4,2016-08-13,2411,Barclays Premier League,Everton,Tottenham Hotspur,68.02,73.25,0.391,0.3401,0.2689,1.47,1.38,31.9,48.0,1.0,1.0,0.73,1.11,0.88,1.81,1.05,1.05


In [3]:
# ids of the leagues to keep (hand-picked selection)
leagues = pd.DataFrame({'league_id': [1849, 1843, 1845, 1854, 1869, 2411]})

# Look up the league name(s) that belong to each id in the data.
# The (league_id, league) pairs are de-duplicated *before* the join, so the
# merge works on a handful of rows instead of producing one row per match
# and collapsing them afterwards. The resulting table is the same.
league_names = df[['league_id', 'league']].drop_duplicates()
leagues = leagues.merge(league_names, on='league_id', how='left')

# keep only the matches played in the selected leagues
df = df[df.league_id.isin(leagues.league_id)]
print(df.shape)

(6090, 22)


In [4]:
# keep only the matches scheduled between today and 21 days from today
# (both ends of the window are included)
today = date.today()
delta = timedelta(days=21)
last_day = today + delta
print(today, 'to', last_day)

match_day = df.date.dt.date
df = df[match_day.between(today, last_day)]
print(df.shape)

2018-12-09 to 2018-12-30
(186, 22)


In [5]:
def strip_accents(text):
    """Return a string Series with accented characters reduced to plain ASCII.

    NFKD normalisation splits an accented character into its base letter plus
    combining marks; encoding to ASCII with errors='ignore' then drops every
    non-ASCII character (the marks, but also any character that has no ASCII
    decomposition), and decoding turns the bytes back into str.
    https://stackoverflow.com/questions/37926248/how-to-remove-accents-from-values-in-columns
    """
    return (text.str.normalize('NFKD')
                .str.encode(encoding='ascii', errors='ignore')
                .str.decode('utf-8'))


# Lowercase the text fields and strip accents from the team names.
# `assign` returns a new frame (existing columns keep their position), which
# avoids assigning column-by-column onto a frame that came out of a filter --
# something pandas can flag with a SettingWithCopyWarning.
df = df.assign(
    league=df.league.str.lower(),
    team1=strip_accents(df.team1.str.lower()),
    team2=strip_accents(df.team2.str.lower()),
)

print(df.shape)

(186, 22)


In [6]:
# Keep only the columns that are needed and give them descriptive names.
# Selection and rename are chained into one expression that yields a new
# frame, instead of renaming a freshly sliced frame with inplace=True.
df = (
    df[['date', 'league', 'team1', 'team2', 'prob1', 'prob2', 'probtie']]
    .rename(columns={'team1': 'home_team',
                     'team2': 'away_team',
                     'prob1': 'prob_home_win',
                     'prob2': 'prob_away_win',
                     'probtie': 'prob_tie'})
)

print(df.shape)

(186, 7)


In [7]:
# order the rows by league name, then by match date within each league
sort_keys = ['league', 'date']
df = df.sort_values(by=sort_keys)

# dimensions and a preview of the sorted result
print(df.shape)
df.head()

(186, 7)


Unnamed: 0,date,league,home_team,away_team,prob_home_win,prob_away_win,prob_tie
16498,2018-12-09,barclays premier league,newcastle,wolverhampton,0.4229,0.28,0.2971
16532,2018-12-10,barclays premier league,everton,watford,0.4477,0.274,0.2783
16623,2018-12-15,barclays premier league,manchester city,everton,0.8786,0.0249,0.0965
16646,2018-12-15,barclays premier league,huddersfield town,newcastle,0.4009,0.2788,0.3203
16647,2018-12-15,barclays premier league,watford,cardiff city,0.5195,0.2146,0.2659


In [8]:
# save to CSV; create the target folder first so the cell also works on a
# fresh checkout where ./data does not exist yet (to_csv does not create it)
output_path = Path('./data/This_months_predictions.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)