# Crawler

In [80]:
from lxml import html
from lxml.cssselect import CSSSelector
import requests
import re
import pandas as pd

## Predictions from FiveThirtyEight

In [321]:
# Go to the URL (uncomment the league you want; only one `url` is active)
#url = 'https://projects.fivethirtyeight.com/soccer-predictions/eredivisie/'
url = 'https://projects.fivethirtyeight.com/soccer-predictions/la-liga/'
#url = 'https://projects.fivethirtyeight.com/soccer-predictions/bundesliga/'

# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)

# Fail loudly on a 4xx/5xx answer instead of parsing an error page, which
# would otherwise just show up later as "zero matches found".
page.raise_for_status()

# Get the source code
tree = html.fromstring(page.content)

In [322]:
# Match containers inside the "upcoming" games container, skipping any that
# carry the "hidden" class.
upcoming_selector = '.games-container.upcoming .match-container:not(.hidden)'
matches = tree.cssselect(upcoming_selector)
print("Number of matches found: ", len(matches))

Number of matches found:  9


In [323]:
# Column order of the resulting frame.
cols = ['date', 'home_team', 'away_team', 'home_win', 'tie', 'away_win']

# CSS selector, relative to a single match container, for every column.
selectors = {
    'date': ".date div",
    'home_team': ".match-top .name",
    'home_win': ".match-top .prob",
    'tie': ".tie-prob div",
    'away_team': ".match-bottom .name",
    'away_win': ".match-bottom .prob",
}

# Collect one plain dict per match and build the frame in a single step.
# Growing a DataFrame cell by cell with `.at` re-allocates on every new row
# and leaves an object-typed index; a list of records avoids both.
# The text of the first element matching each selector is used; a match
# container lacking one of them raises IndexError.
rows = [
    {col: match.cssselect(selectors[col])[0].text for col in cols}
    for match in matches
]
df538 = pd.DataFrame(rows, columns=cols)

df538

Unnamed: 0,date,home_team,away_team,home_win,tie,away_win
0,12/15,Sevilla,Levante,70%,19%,11%
1,12/16,Athletic Bilbao,Real Sociedad,45%,24%,30%
2,12/16,Eibar,Valencia,38%,27%,35%
3,12/16,Atlético Madrid,Alavés,70%,22%,7%
4,12/17,Girona,Getafe,44%,28%,28%
5,12/17,Celta Vigo,Villarreal,51%,24%,26%
6,12/17,Las Palmas,Espanyol,41%,29%,31%
7,12/17,Barcelona,Deportivo,90%,8%,2%
8,12/18,Málaga,Real Betis,51%,26%,24%


In [161]:
#df538.to_csv('./data/eredivisie.csv')

### Data transformation

In [324]:
# Turn the percentage strings into fractions: strip the "%" sign, parse the
# remainder as a number and divide by 100.
for prob_col in ('home_win', 'tie', 'away_win'):
    without_sign = df538[prob_col].str.replace("%", "")
    df538[prob_col] = pd.to_numeric(without_sign) / 100

# Drop the date column
del df538["date"]

# Show
df538

Unnamed: 0,home_team,away_team,home_win,tie,away_win
0,Sevilla,Levante,0.7,0.19,0.11
1,Athletic Bilbao,Real Sociedad,0.45,0.24,0.3
2,Eibar,Valencia,0.38,0.27,0.35
3,Atlético Madrid,Alavés,0.7,0.22,0.07
4,Girona,Getafe,0.44,0.28,0.28
5,Celta Vigo,Villarreal,0.51,0.24,0.26
6,Las Palmas,Espanyol,0.41,0.29,0.31
7,Barcelona,Deportivo,0.9,0.08,0.02
8,Málaga,Real Betis,0.51,0.26,0.24


## Odds from Unibet

In [286]:
import re
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

In [287]:
# Start our headless (no GUI) browser
# NOTE(review): this needs a PhantomJS executable that Selenium can locate.
# Newer Selenium releases appear to have dropped the PhantomJS driver —
# confirm against the installed Selenium version before relying on this.
browser = webdriver.PhantomJS()

In [325]:
# League page to load (only the uncommented `url` is used)
#url = 'https://www.unibet.eu/betting#filter/football/netherlands/eredivisie'
url = 'https://www.unibet.eu/betting#filter/football/spain/laliga'
#url = 'https://www.unibet.eu/betting#filter/football/germany/bundesliga'
browser.get(url)

# Pause for six seconds before reading the page source
sleep(6)

# Parse whatever the browser currently holds as page source
rendered_html = browser.page_source
soup = BeautifulSoup(rendered_html, 'html5lib')

In [327]:
# Find the dropdown bars
# (every element carrying the 'KambiBC-collapsible-header' class)
dropdowns = browser.find_elements_by_class_name('KambiBC-collapsible-header')

# Click on them
# NOTE(review): presumably each click expands a collapsed section so that its
# matches end up in the page source — confirm; a header that is already open
# may react differently to the click.
for dropdown in dropdowns:
    dropdown.click()
    # Pause two seconds after every click before moving on to the next one
    sleep(2)

# Now get the source code
# (parsed again after the clicks; this rebinds `soup`)
soup = BeautifulSoup(browser.page_source, 'html5lib')

In [328]:
# Every <li> event item found inside the element with id "KambiBC-container"
container = soup.find(id="KambiBC-container")
matches = container.find_all("li", class_="KambiBC-event-item")
print("Number of matches found: ", len(matches))

Number of matches found:  19


In [329]:
# One record (dict) per scraped match
jobs = []

for match in matches:
    # The start date is not collected: the lookup below did not work.
    #job["date"] = match.find(class_="KambiBC-event-item__start-time--date").text

    # First participant name is taken as the home side, second as the away side
    names = match.find_all(class_="KambiBC-event-participants__name")
    # First three odds are taken as home win, tie and away win, in that order
    prices = match.find_all(class_="KambiBC-mod-outcome__odds")

    jobs.append({
        "home_team": names[0].text,
        "away_team": names[1].text,
        "odd_home_win": prices[0].text,
        "odd_tie": prices[1].text,
        "odd_away_win": prices[2].text,
    })

In [334]:
# Make a data frame
dfOdds = pd.DataFrame(jobs)

# Keep only the first nine rows. Take an explicit copy so that the column
# assignments done later operate on an independent frame rather than on a
# slice of the full one (which makes pandas emit SettingWithCopyWarning).
dfOdds = dfOdds.iloc[:9].copy()
dfOdds

Unnamed: 0,away_team,home_team,odd_away_win,odd_home_win,odd_tie
0,Levante,Sevilla,7.0,1.5,3.9
1,Real Sociedad,Athletic Club Bilbao,3.4,2.2,3.45
2,Valencia,Eibar,2.5,2.9,3.4
3,Alavés,Atlético Madrid,14.0,1.22,6.4
4,Getafe,Girona FC,3.45,2.2,3.4
5,Villarreal,Celta Vigo,3.0,2.4,3.45
6,Espanyol,Deportiva Las Palmas,2.9,2.55,3.3
7,Deportivo La Coruña,FC Barcelona,23.0,1.08,14.0
8,Real Betis,Málaga,3.6,2.1,3.5


In [231]:
#dfOdds.to_csv('./data/premium-league-odds.csv')

### Data transformation

In [338]:
# Change the team names so that they match the ones in the 538 data frame
# Eredivisie
changes_nl = {
    'PSV Eindhoven': 'PSV',
    'Roda JC Kerkrade': 'Roda JC',
    'VVV Venlo': 'VVV-Venlo',
    'SC Heerenveen': 'Heerenveen',
    'FC Twente': 'Twente',
}
# Bundesliga
changes_de = {
    'Borussia Mönchengladbach': 'Gladbach',
    'VfL Wolfsburg': 'Wolfsburg',
    '1. FC Köln': '1. FC Cologne',
    'Mainz 05': 'Mainz',
    'Borussia Dortmund': 'Dortmund',
    'Bayern München': 'Bayern Munich',
    'Augsburg': 'FC Augsburg',
    'Eintracht Frankfurt': 'Eintracht',
    'Bayer Leverkusen': 'Leverkusen',
    'Hertha Berlin': 'Hertha BSC',
}
# La Liga
changes_es = {
    'Athletic Club Bilbao': 'Athletic Bilbao',
    'FC Barcelona': 'Barcelona',
    'Deportivo La Coruña': 'Deportivo',
    'Deportiva Las Palmas': 'Las Palmas',
    'Girona FC': 'Girona',
}

# Mapping for the league that is currently being scraped
changes = changes_es

# Replace whole names only. `Series.replace` with a dict swaps a value when
# the entire cell equals a key, whereas `.str.replace` rewrites substrings
# (and may treat the key as a regular expression): 'Augsburg' would turn an
# already correct 'FC Augsburg' into 'FC FC Augsburg', and the '.' in
# '1. FC Köln' would match any character.
for team_col in ("home_team", "away_team"):
    dfOdds[team_col] = dfOdds[team_col].replace(changes)

# Drop the dates
#del dfOdds["date"]

# Convert the odds from text to numbers
for odds_col in ("odd_home_win", "odd_away_win", "odd_tie"):
    dfOdds[odds_col] = pd.to_numeric(dfOdds[odds_col])

# Show
dfOdds

Unnamed: 0,away_team,home_team,odd_away_win,odd_home_win,odd_tie
0,Levante,Sevilla,7.0,1.5,3.9
1,Real Sociedad,Athletic Bilbao,3.4,2.2,3.45
2,Valencia,Eibar,2.5,2.9,3.4
3,Alavés,Atlético Madrid,14.0,1.22,6.4
4,Getafe,Girona,3.45,2.2,3.4
5,Villarreal,Celta Vigo,3.0,2.4,3.45
6,Espanyol,Las Palmas,2.9,2.55,3.3
7,Deportivo,Barcelona,23.0,1.08,14.0
8,Real Betis,Málaga,3.6,2.1,3.5


# Merge

In [339]:
# Inner join on the (home, away) pairing: only fixtures whose team names are
# identical in both frames are kept.
join_keys = ["home_team", "away_team"]
dfMerge = df538.merge(right=dfOdds, on=join_keys, how="inner")
dfMerge

Unnamed: 0,home_team,away_team,home_win,tie,away_win,odd_away_win,odd_home_win,odd_tie
0,Sevilla,Levante,0.7,0.19,0.11,7.0,1.5,3.9
1,Athletic Bilbao,Real Sociedad,0.45,0.24,0.3,3.4,2.2,3.45
2,Eibar,Valencia,0.38,0.27,0.35,2.5,2.9,3.4
3,Atlético Madrid,Alavés,0.7,0.22,0.07,14.0,1.22,6.4
4,Girona,Getafe,0.44,0.28,0.28,3.45,2.2,3.4
5,Celta Vigo,Villarreal,0.51,0.24,0.26,3.0,2.4,3.45
6,Las Palmas,Espanyol,0.41,0.29,0.31,2.9,2.55,3.3
7,Barcelona,Deportivo,0.9,0.08,0.02,23.0,1.08,14.0
8,Málaga,Real Betis,0.51,0.26,0.24,3.6,2.1,3.5


# Analyse

In [340]:
# Expected value of each outcome = predicted probability * offered odds.
# Each triple is (new column, probability column, odds column).
for target_col, prob_col, odds_col in (
    ("expect_home", "home_win", "odd_home_win"),
    ("expect_away", "away_win", "odd_away_win"),
    ("expect_tie", "tie", "odd_tie"),
):
    dfMerge[target_col] = dfMerge[prob_col] * dfMerge[odds_col]

# Show
shown_cols = ["home_team", "away_team", "expect_home", "expect_tie", "expect_away"]
dfMerge[shown_cols]

Unnamed: 0,home_team,away_team,expect_home,expect_tie,expect_away
0,Sevilla,Levante,1.05,0.741,0.77
1,Athletic Bilbao,Real Sociedad,0.99,0.828,1.02
2,Eibar,Valencia,1.102,0.918,0.875
3,Atlético Madrid,Alavés,0.854,1.408,0.98
4,Girona,Getafe,0.968,0.952,0.966
5,Celta Vigo,Villarreal,1.224,0.828,0.78
6,Las Palmas,Espanyol,1.0455,0.957,0.899
7,Barcelona,Deportivo,0.972,1.12,0.46
8,Málaga,Real Betis,1.071,0.91,0.864
