# Crawler

In [1]:
from lxml import html
from lxml.cssselect import CSSSelector
import requests
import re
import pandas as pd

## Predictions from five thirty eight

In [110]:
# Go to the URL
#url = 'https://projects.fivethirtyeight.com/soccer-predictions/eredivisie/'
#url = 'https://projects.fivethirtyeight.com/soccer-predictions/la-liga/'
url = 'https://projects.fivethirtyeight.com/soccer-predictions/la-liga-2/'
#url = 'https://projects.fivethirtyeight.com/soccer-predictions/bundesliga/'
#url = 'https://projects.fivethirtyeight.com/soccer-predictions/premier-league/'
#url = 'https://projects.fivethirtyeight.com/soccer-predictions/ligue-1/'
page = requests.get(url)

# Get the source code
tree = html.fromstring(page.content)

In [139]:
matches = tree.cssselect('.games-container.upcoming .match-container:not(.hidden)') # Skip the hidden ones
matches = tree.cssselect('.games-container.upcoming .match-container')[:12] # Dont skip the hidden ones, take first 12
print("Number of matches found: ", len(matches))

Number of matches found:  12


In [166]:
cols = ['date', 'home_team', 'away_team', 'home_win', 'tie', 'away_win']
df538 = pd.DataFrame(columns=cols)

for idx in range(len(matches)):
    match = matches[idx]
    
    df538.at[idx, 'date'] = match.cssselect(".date div")[0].text
    df538.at[idx, 'home_team'] = match.cssselect(".match-top .name")[0].text
    df538.at[idx, 'home_win'] = match.cssselect(".match-top .prob")[0].text
    df538.at[idx, 'tie'] = match.cssselect(".tie-prob div")[0].text
    df538.at[idx, 'away_team'] = match.cssselect(".match-bottom .name")[0].text
    df538.at[idx, 'away_win'] = match.cssselect(".match-bottom .prob")[0].text
    
df538

Unnamed: 0,date,home_team,away_team,home_win,tie,away_win
0,12/19,Barcelona B,Albacete,49%,27%,24%
1,12/19,Valladolid,Zaragoza,52%,25%,24%
2,12/20,Lorca,Osasuna,27%,30%,44%
3,12/20,Granada,Sporting Gijón,48%,27%,25%
4,12/20,Córdoba,Reus,38%,36%,26%
5,12/21,Numancia,Sevilla Atlético,62%,25%,13%
6,12/21,Tenerife,Cádiz,36%,32%,32%
7,12/22,Lugo,Rayo Vallecano,37%,30%,33%
8,12/22,Alcorcón,Almería,45%,32%,23%
9,12/23,Gimnástic,Huesca,25%,28%,47%


### Data transformation

In [167]:
# Turn percentages into floats
df538['home_win'] = pd.to_numeric(df538['home_win'].str.replace("%",""))/100
df538['tie'] = pd.to_numeric(df538['tie'].str.replace("%",""))/100
df538['away_win'] = pd.to_numeric(df538['away_win'].str.replace("%",""))/100
df538

# Drop the date column
del df538["date"]

# Change team names to lowercase
df538["home_team"] = df538["home_team"].str.lower()
df538["away_team"] = df538["away_team"].str.lower()

# Remove all accents
df538["home_team"] = df538["home_team"].str.normalize('NFKD').str.encode(encoding='ascii',errors='ignore').str.decode('utf-8')
df538["away_team"] = df538["away_team"].str.normalize('NFKD').str.encode(encoding='ascii',errors='ignore').str.decode('utf-8')

# SHow
df538

Unnamed: 0,home_team,away_team,home_win,tie,away_win
0,barcelona b,albacete,0.49,0.27,0.24
1,valladolid,zaragoza,0.52,0.25,0.24
2,lorca,osasuna,0.27,0.3,0.44
3,granada,sporting gijon,0.48,0.27,0.25
4,cordoba,reus,0.38,0.36,0.26
5,numancia,sevilla atletico,0.62,0.25,0.13
6,tenerife,cadiz,0.36,0.32,0.32
7,lugo,rayo vallecano,0.37,0.3,0.33
8,alcorcon,almeria,0.45,0.32,0.23
9,gimnastic,huesca,0.25,0.28,0.47


## Odds from unibet

In [7]:
import re
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

In [8]:
# Start our headless (no GUI) browser
browser = webdriver.PhantomJS()

In [116]:
# Go to the URL
#url = 'https://www.unibet.eu/betting#filter/football/netherlands/eredivisie'
#url = 'https://www.unibet.eu/betting#filter/football/spain/laliga'
url = 'https://www.unibet.eu/betting#filter/football/spain/laliga2'
#url = 'https://www.unibet.eu/betting#filter/football/germany/bundesliga'
#url = 'https://www.unibet.eu/betting#filter/football/england/premier_league'
#url = 'https://www.unibet.eu/betting#filter/football/france/ligue_1'
browser.get(url)

# Wait
sleep(3.75)

# Get the source code
soup = BeautifulSoup(browser.page_source, 'html5lib')

In [117]:
# Find the dropdown bars
dropdowns = browser.find_elements_by_class_name('KambiBC-collapsible-header')

# Click on them
for dropdown in dropdowns:
    dropdown.click()
    sleep(1.75)

# Now get the source code
soup = BeautifulSoup(browser.page_source, 'html5lib')

In [118]:
matches = soup.find(id="KambiBC-container").find_all("li", class_="KambiBC-event-item")
print("Number of matches found: ", len(matches))

Number of matches found:  11


In [119]:
# Create empty list
jobs = []

for match in matches:
    # Create empty dictionary
    job = {}
    
    # Find info on the page
    #job["date"] = match.find(class_="KambiBC-event-item__start-time--date").text # <- Doesnt work..?
    teams = match.find_all(class_="KambiBC-event-participants__name")
    job["home_team"] = teams[0].text
    job["away_team"] = teams[1].text
    odds = match.find_all(class_="KambiBC-mod-outcome__odds")
    job["odd_home_win"] = odds[0].text
    job["odd_tie"] = odds[1].text
    job["odd_away_win"] = odds[2].text
    
    # Add to the list
    jobs.append(job)

In [120]:
# Make a data frame
dfOdds = pd.DataFrame(jobs)
dfOdds = dfOdds[:12]
dfOdds

Unnamed: 0,away_team,home_team,odd_away_win,odd_home_win,odd_tie
0,Albacete Balompié,FC Barcelona B,4.2,1.9,3.3
1,Real Zaragoza,Valladolid,4.6,1.77,3.45
2,Osasuna,La Hoya Lorca CF,2.55,3.0,3.0
3,Reus Deportiu,Córdoba CF,3.95,2.1,3.0
4,Sporting de Gijón,Granada CF,3.8,2.0,3.3
5,Sevilla Atlético,Numancia,4.9,1.73,3.45
6,Cadiz,Tenerife,3.3,2.25,3.15
7,Rayo Vallecano,Lugo,2.75,2.7,3.05
8,UD Almeria,Alcorcón,3.8,2.1,3.05
9,Huesca,Gimnástic de Tarragona,2.25,3.25,3.15


### Data transformation

In [168]:
# Change team names to lowercase
dfOdds["home_team"] = dfOdds["home_team"].str.lower()
dfOdds["away_team"] = dfOdds["away_team"].str.lower()

# Remove all accents
dfOdds["home_team"] = dfOdds["home_team"].str.normalize('NFKD').str.encode(encoding='ascii',errors='ignore').str.decode('utf-8')
dfOdds["away_team"] = dfOdds["away_team"].str.normalize('NFKD').str.encode(encoding='ascii',errors='ignore').str.decode('utf-8')

Unnamed: 0,away_team,home_team,odd_away_win,odd_home_win,odd_tie,home_code,away_code
0,albacete balompie,barcelona b,4.2,1.9,3.3,Bar,Alb
1,zaragoza,valladolid,4.6,1.77,3.45,Val,Zar
2,osasuna,lorca,2.55,3.0,3.0,Lor,Osa
3,reus deportiu,cordoba cf,3.95,2.1,3.0,Cór,Reu
4,sporting de gijon,granada cf,3.8,2.0,3.3,Gra,Spo
5,sevilla atletico,numancia,4.9,1.73,3.45,Num,Sev
6,cadiz,tenerife,3.3,2.25,3.15,Ten,Cad
7,rayo vallecano,lugo,2.75,2.7,3.05,Lug,Ray
8,almeria,alcorcon,3.8,2.1,3.05,Alc,Alm
9,huesca,gimnastic de tarragona,2.25,3.25,3.15,Gim,Hue


In [135]:
# Change the team names so that they match the ones in the 538 data frame
# Eredivisie
changes_nl = {'PSV Eindhoven': 'PSV', 
           'Roda JC Kerkrade': 'Roda JC',
           'VVV Venlo': 'VVV-Venlo',
           'SC Heerenveen': 'Heerenveen',
           'FC Twente': 'Twente',
             'Heracles Almelo': 'Heracles',
             'FC Utrecht': 'Utrecht',
             'Excelsior Rotterdam': 'Excelsior',
             'FC Groningen': 'Groningen'}
# Bundesliga
changes_de = {'Borussia Mönchengladbach': 'Gladbach', 
           'VfL Wolfsburg': 'Wolfsburg',
           '1. FC Köln': '1. FC Cologne',
           'Mainz 05': 'Mainz',
           'Borussia Dortmund': 'Dortmund',
           'Bayern München': 'Bayern Munich',
           'Augsburg': 'FC Augsburg',
           'Eintracht Frankfurt': 'Eintracht',
           'Bayer Leverkusen': 'Leverkusen',
           'Hertha Berlin': 'Hertha BSC'}
# La Liga
changes_es = {'Athletic Club Bilbao': 'Athletic Bilbao',
              'FC Barcelona': 'Barcelona',
              'Deportivo La Coruña': 'Deportivo',
              'Deportiva Las Palmas': 'Las Palmas',
              'Girona FC': 'Girona',
             'Real Zaragoza': 'Zaragoza',
             'La Hoya Lorca CF': 'Lorca',
             'UD Almeria': 'Almeria',
             'Real Oviedo': 'Oviedo'}
# Premier League
changes_en = {'Brighton \& Hove Albion': 'Brighton',
             'Manchester City': 'Man. City',
             'Newcastle United': 'Newcastle',
             'West Bromwich': 'West Brom',
             'Huddersfield Town': 'Huddersfield',
             'Manchester United': 'Man. United'}
# Ligue 1
changes_fr = {'Dijon': 'Dijon FCO',
             'Saint-Étienne': 'St Étienne',
             'Paris SG': 'PSG'}

changes = {**changes_nl, **changes_de, **changes_es, **changes_en, **changes_fr}
for old,new in changes.items():
    # Replace!
    dfOdds["home_team"] = dfOdds["home_team"].str.replace(old, new)
    dfOdds["away_team"] = dfOdds["away_team"].str.replace(old, new)
    
# Drop the dates
#del dfOdds["date"]

# Convert to numbers
dfOdds["odd_home_win"] = pd.to_numeric(dfOdds["odd_home_win"])
dfOdds["odd_away_win"] = pd.to_numeric(dfOdds["odd_away_win"])
dfOdds["odd_tie"] = pd.to_numeric(dfOdds["odd_tie"])

# Show
dfOdds

Unnamed: 0,away_team,home_team,odd_away_win,odd_home_win,odd_tie,home_code,away_code
0,Albacete Balompié,Barcelona B,4.2,1.9,3.3,Bar,Alb
1,Zaragoza,Valladolid,4.6,1.77,3.45,Val,Rea
2,Osasuna,Lorca,2.55,3.0,3.0,La,Osa
3,Reus Deportiu,Córdoba CF,3.95,2.1,3.0,Cór,Reu
4,Sporting de Gijón,Granada CF,3.8,2.0,3.3,Gra,Spo
5,Sevilla Atlético,Numancia,4.9,1.73,3.45,Num,Sev
6,Cadiz,Tenerife,3.3,2.25,3.15,Ten,Cad
7,Rayo Vallecano,Lugo,2.75,2.7,3.05,Lug,Ray
8,Almeria,Alcorcón,3.8,2.1,3.05,Alc,UD
9,Huesca,Gimnástic de Tarragona,2.25,3.25,3.15,Gim,Hue


# Merge

In [122]:
dfMerge = df538.merge(dfOdds, how="inner", on=["home_team", "away_team"])
dfMerge

Unnamed: 0,home_team,away_team,home_win,tie,away_win,odd_away_win,odd_home_win,odd_tie
0,Numancia,Sevilla Atlético,0.62,0.25,0.13,4.9,1.73,3.45
1,Lugo,Rayo Vallecano,0.37,0.3,0.33,2.75,2.7,3.05


In [136]:
# Merge by 3-letter codes
df538["home_code"] = df538["home_team"].str[:3]
df538["away_code"] = df538["away_team"].str[:3]
dfOdds["home_code"] = dfOdds["home_team"].str[:3]
dfOdds["away_code"] = dfOdds["away_team"].str[:3]

dfMerge = df538.merge(dfOdds, how="inner", on=["home_code", "away_code"])
dfMerge

Unnamed: 0,home_team_x,away_team_x,home_win,tie,away_win,home_code,away_code,away_team_y,home_team_y,odd_away_win,odd_home_win,odd_tie
0,Barcelona B,Albacete,0.49,0.27,0.24,Bar,Alb,Albacete Balompié,Barcelona B,4.2,1.9,3.3
1,Valladolid,Zaragoza,0.52,0.25,0.24,Val,Zar,Zaragoza,Valladolid,4.6,1.77,3.45
2,Lorca,Osasuna,0.27,0.3,0.44,Lor,Osa,Osasuna,Lorca,2.55,3.0,3.0
3,Granada,Sporting Gijón,0.48,0.27,0.25,Gra,Spo,Sporting de Gijón,Granada CF,3.8,2.0,3.3
4,Córdoba,Reus,0.38,0.36,0.26,Cór,Reu,Reus Deportiu,Córdoba CF,3.95,2.1,3.0
5,Numancia,Sevilla Atlético,0.62,0.25,0.13,Num,Sev,Sevilla Atlético,Numancia,4.9,1.73,3.45
6,Lugo,Rayo Vallecano,0.37,0.3,0.33,Lug,Ray,Rayo Vallecano,Lugo,2.75,2.7,3.05
7,Alcorcón,Almería,0.45,0.32,0.23,Alc,Alm,Almeria,Alcorcón,3.8,2.1,3.05
8,Gimnástic,Huesca,0.25,0.28,0.47,Gim,Hue,Huesca,Gimnástic de Tarragona,2.25,3.25,3.15
9,Oviedo,Cultural Leonesa,0.6,0.23,0.17,Ovi,Cul,Cultural Leonesa,Oviedo,4.9,1.75,3.4


In [132]:
df538

Unnamed: 0,home_team,away_team,home_win,tie,away_win,home_code,away_code
0,Barcelona B,Albacete,0.49,0.27,0.24,Bar,Alb
1,Valladolid,Zaragoza,0.52,0.25,0.24,Val,Zar
2,Lorca,Osasuna,0.27,0.3,0.44,Lor,Osa
3,Granada,Sporting Gijón,0.48,0.27,0.25,Gra,Spo
4,Córdoba,Reus,0.38,0.36,0.26,Cór,Reu
5,Numancia,Sevilla Atlético,0.62,0.25,0.13,Num,Sev
6,Tenerife,Cádiz,0.36,0.32,0.32,Ten,Cád
7,Lugo,Rayo Vallecano,0.37,0.3,0.33,Lug,Ray
8,Alcorcón,Almería,0.45,0.32,0.23,Alc,Alm
9,Gimnástic,Huesca,0.25,0.28,0.47,Gim,Hue


In [134]:
dfOdds[["home_team", "away_team"]]

Unnamed: 0,home_team,away_team
0,Barcelona B,Albacete Balompié
1,Valladolid,Real Zaragoza
2,La Hoya Lorca CF,Osasuna
3,Córdoba CF,Reus Deportiu
4,Granada CF,Sporting de Gijón
5,Numancia,Sevilla Atlético
6,Tenerife,Cadiz
7,Lugo,Rayo Vallecano
8,Alcorcón,UD Almeria
9,Gimnástic de Tarragona,Huesca


# Analyse

In [107]:
# Calculate the expected values for win, tie and loss
dfMerge["expect_home"] = dfMerge["home_win"] * dfMerge["odd_home_win"]
dfMerge["expect_away"] = dfMerge["away_win"] * dfMerge["odd_away_win"]
dfMerge["expect_tie"] = dfMerge["tie"] * dfMerge["odd_tie"]

# Show
dfMerge[["home_team", "away_team", "expect_home", "expect_tie", "expect_away"]]

Unnamed: 0,home_team,away_team,expect_home,expect_tie,expect_away
0,PSG,Caen,0.963,0.92,0.58
1,Bordeaux,Montpellier,0.8648,1.005,1.0005
2,Monaco,Rennes,0.8832,1.034,1.12
3,Marseille,Troyes,0.884,1.134,1.045
4,Lille,Nice,1.053,0.928,0.891
5,Toulouse,Lyon,1.161,0.999,0.828
6,Metz,Strasbourg,1.1895,0.864,0.8228
7,Amiens,Nantes,0.868,0.976,1.0175
8,Guingamp,St Étienne,1.08,0.868,0.817
9,Angers,Dijon FCO,1.034,0.864,0.91
