# Crawler

## Predictions from FiveThirtyEight

In [1]:
from lxml import html
from lxml.cssselect import CSSSelector
import requests
import re
import pandas as pd

In [30]:
# Fetch the FiveThirtyEight soccer-prediction page for one league and parse it.
# To scrape another league, change the key used to pick `url` below.
LEAGUE_URLS_538 = {
    'eredivisie': 'https://projects.fivethirtyeight.com/soccer-predictions/eredivisie/',
    'la-liga': 'https://projects.fivethirtyeight.com/soccer-predictions/la-liga/',
    'bundesliga': 'https://projects.fivethirtyeight.com/soccer-predictions/bundesliga/',
    'premier-league': 'https://projects.fivethirtyeight.com/soccer-predictions/premier-league/',
    'ligue-1': 'https://projects.fivethirtyeight.com/soccer-predictions/ligue-1/',
    'serie-a': 'https://projects.fivethirtyeight.com/soccer-predictions/serie-a/',
}
url = LEAGUE_URLS_538['serie-a']

# The timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() fails here on a 4xx/5xx response instead of letting an
# error page be parsed as if it were the predictions page.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the response body into an lxml element tree
tree = html.fromstring(page.content)

In [33]:
# Select the upcoming matches. Hidden match containers are deliberately kept;
# only the first MAX_MATCHES containers are used.
# (To skip the hidden ones instead, use the selector
#  '.games-container.upcoming .match-container:not(.hidden)'.)
MAX_MATCHES = 10
matches = tree.cssselect('.games-container.upcoming .match-container')[:MAX_MATCHES]
print("Number of matches found: ", len(matches))

Number of matches found:  10


In [34]:
cols = ['date', 'home_team', 'away_team', 'home_win', 'tie', 'away_win']


def _first_text(node, selector):
    """Return the text of the first element under *node* matching *selector*.

    Raises IndexError when nothing matches the selector.
    """
    return node.cssselect(selector)[0].text


# Collect one record per match and build the frame in a single step, rather
# than enlarging an empty frame cell by cell.
rows = [
    {
        'date': _first_text(match, ".date div"),
        'home_team': _first_text(match, ".match-top .name"),
        'home_win': _first_text(match, ".match-top .prob"),
        'tie': _first_text(match, ".tie-prob div"),
        'away_team': _first_text(match, ".match-bottom .name"),
        'away_win': _first_text(match, ".match-bottom .prob"),
    }
    for match in matches
]

# Passing `columns` fixes the column order and keeps the columns present even
# when no matches were found.
df538 = pd.DataFrame(rows, columns=cols)

df538

Unnamed: 0,date,home_team,away_team,home_win,tie,away_win
0,12/29,Crotone,Napoli,10%,19%,70%
1,12/30,Fiorentina,AC Milan,49%,27%,23%
2,12/30,Roma,Sassuolo,75%,18%,7%
3,12/30,Sampdoria,SPAL,59%,23%,18%
4,12/30,Atalanta,Cagliari,66%,21%,12%
5,12/30,Torino,Genoa,49%,28%,23%
6,12/30,Bologna,Udinese,34%,29%,37%
7,12/30,Benevento,Chievo,33%,29%,39%
8,12/30,Inter Milan,Lazio,48%,24%,28%
9,12/30,Verona,Juventus,8%,16%,76%


### Data transformation

In [35]:
# Percent strings such as "49%" become fractions such as 0.49
for prob_col in ('home_win', 'tie', 'away_win'):
    without_sign = df538[prob_col].str.replace("%","")
    df538[prob_col] = pd.to_numeric(without_sign)/100

# The date column is kept for now; to drop it, run: del df538["date"]

# Clean up the team names, one side at a time
for side in ("home", "away"):
    team_col = side + "_team"

    # Lowercase the name
    lowered = df538[team_col].str.lower()

    # Strip accents: decompose the characters, then discard every non-ASCII byte
    decomposed = lowered.str.normalize('NFKD')
    ascii_bytes = decomposed.str.encode(encoding='ascii',errors='ignore')
    df538[team_col] = ascii_bytes.str.decode('utf-8')

    # The first three characters of the cleaned name serve as the team code
    df538[side + "_code"] = df538[team_col].str[:3]

# Show
df538

Unnamed: 0,date,home_team,away_team,home_win,tie,away_win,home_code,away_code
0,12/29,crotone,napoli,0.1,0.19,0.7,cro,nap
1,12/30,fiorentina,ac milan,0.49,0.27,0.23,fio,ac
2,12/30,roma,sassuolo,0.75,0.18,0.07,rom,sas
3,12/30,sampdoria,spal,0.59,0.23,0.18,sam,spa
4,12/30,atalanta,cagliari,0.66,0.21,0.12,ata,cag
5,12/30,torino,genoa,0.49,0.28,0.23,tor,gen
6,12/30,bologna,udinese,0.34,0.29,0.37,bol,udi
7,12/30,benevento,chievo,0.33,0.29,0.39,ben,chi
8,12/30,inter milan,lazio,0.48,0.24,0.28,int,laz
9,12/30,verona,juventus,0.08,0.16,0.76,ver,juv


## Odds from Unibet

In [8]:
import re
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

In [9]:
# Start our headless (no GUI) browser; `browser` is the driver used for the
# page load and the clicks below.
# NOTE(review): PhantomJS support appears to be deprecated/removed in newer
# Selenium releases — confirm the installed Selenium version still provides it.
browser = webdriver.PhantomJS()

In [36]:
# Unibet betting pages per league; change the key below to scrape another one
LEAGUE_URLS_UNIBET = {
    'eredivisie': 'https://www.unibet.eu/betting#filter/football/netherlands/eredivisie',
    'laliga': 'https://www.unibet.eu/betting#filter/football/spain/laliga',
    'bundesliga': 'https://www.unibet.eu/betting#filter/football/germany/bundesliga',
    'premier_league': 'https://www.unibet.eu/betting#filter/football/england/premier_league',
    'ligue_1': 'https://www.unibet.eu/betting#filter/football/france/ligue_1',
    'serie_a': 'https://www.unibet.eu/betting#filter/football/italy/serie_a',
}

# Seconds to pause after requesting the page, before reading its source
PAGE_LOAD_DELAY = 3.75

# Open the league page in the browser
url = LEAGUE_URLS_UNIBET['serie_a']
browser.get(url)

# Pause for a fixed time before reading the page
sleep(PAGE_LOAD_DELAY)

# Parse whatever the browser currently holds as the page source
soup = BeautifulSoup(browser.page_source, 'html5lib')

In [37]:
# Find the dropdown bars: every element carrying the collapsible-header class
dropdowns = browser.find_elements_by_class_name('KambiBC-collapsible-header')

# Click on each of them in turn, pausing 1.75 s after every click
# (presumably so the page can finish updating before the next click — confirm)
for dropdown in dropdowns:
    dropdown.click()
    sleep(1.75)

# Now get the source code again, after the clicks; this replaces any earlier
# value of `soup`
soup = BeautifulSoup(browser.page_source, 'html5lib')

In [38]:
# Locate the Kambi container, then collect every event <li> inside it
container = soup.find(id="KambiBC-container")
matches = container.find_all("li", class_="KambiBC-event-item")
print("Number of matches found: ", len(matches))

Number of matches found:  20


In [39]:
def _parse_match(match):
    """Return the team names and the three odds texts of one event item.

    The first two participant names are taken as home and away team; the
    first three odds elements are taken as home win, tie and away win.
    Raises IndexError when the item holds fewer elements than that.
    """
    teams = match.find_all(class_="KambiBC-event-participants__name")
    odds = match.find_all(class_="KambiBC-mod-outcome__odds")

    # NOTE: the start date is not collected; reading it via
    # class_="KambiBC-event-item__start-time--date" did not work.
    return {
        "home_team": teams[0].text,
        "away_team": teams[1].text,
        "odd_home_win": odds[0].text,
        "odd_tie": odds[1].text,
        "odd_away_win": odds[2].text,
    }


# One dictionary per match found on the page
jobs = [_parse_match(match) for match in matches]

In [40]:
# Make a data frame from the scraped rows
dfOdds = pd.DataFrame(jobs)

# Only keep the first 10 rows for now. The explicit copy makes dfOdds an
# independent frame, so the column assignments made on it later do not write
# into a slice of the full frame (which triggers SettingWithCopyWarning).
dfOdds = dfOdds.head(10).copy()

# Show
dfOdds

Unnamed: 0,away_team,home_team,odd_away_win,odd_home_win,odd_tie
0,Napoli,Crotone,1.28,9.5,5.6
1,AC Milan,Fiorentina,3.3,2.2,3.35
2,Cagliari,Atalanta,6.75,1.47,4.2
3,Chievo Verona,Benevento,2.25,3.2,3.35
4,Udinese,Bologna,3.15,2.3,3.25
5,Sassuolo,Roma,9.0,1.32,5.25
6,Spal,Sampdoria,4.7,1.7,3.9
7,Genoa,Torino,4.4,1.82,3.5
8,Lazio,Inter,3.35,2.1,3.55
9,Juventus,Hellas Verona,1.29,10.0,5.2


### Data transformation

In [45]:
# Team-name substitutions, per league, that turn Unibet's names into the ones
# used in the 538 data frame (keys and values are lowercase, accent-free)
changes_nl = {}
changes_de = {}
changes_es = {}
changes_en = {}
changes_fr = {}
changes_it = {'hellas verona': 'verona'}
changes = {**changes_nl, **changes_de, **changes_es, **changes_en, **changes_fr, **changes_it}

# Clean up the team names, one side at a time
for side in ("home", "away"):
    team_col = side + "_team"

    # Lowercase, then strip accents (decompose and discard non-ASCII bytes)
    names = dfOdds[team_col].str.lower()
    names = names.str.normalize('NFKD').str.encode(encoding='ascii', errors='ignore').str.decode('utf-8')

    # Apply the substitutions as plain text. regex=False keeps a name that
    # contains characters such as "." or "(" from being read as a regular
    # expression, and gives the same result whatever the pandas default is.
    # (The `regex` keyword needs pandas >= 0.23.)
    for old, new in changes.items():
        names = names.str.replace(old, new, regex=False)

    dfOdds[team_col] = names

    # The first three characters of the cleaned name serve as the team code
    dfOdds[side + "_code"] = names.str[:3]

# Convert the odds from text to numbers
for odds_col in ("odd_home_win", "odd_away_win", "odd_tie"):
    dfOdds[odds_col] = pd.to_numeric(dfOdds[odds_col])

# Show
dfOdds

Unnamed: 0,away_team,home_team,odd_away_win,odd_home_win,odd_tie,home_code,away_code
0,napoli,crotone,1.28,9.5,5.6,cro,nap
1,ac milan,fiorentina,3.3,2.2,3.35,fio,ac
2,cagliari,atalanta,6.75,1.47,4.2,ata,cag
3,chievo verona,benevento,2.25,3.2,3.35,ben,chi
4,udinese,bologna,3.15,2.3,3.25,bol,udi
5,sassuolo,roma,9.0,1.32,5.25,rom,sas
6,spal,sampdoria,4.7,1.7,3.9,sam,spa
7,genoa,torino,4.4,1.82,3.5,tor,gen
8,lazio,inter,3.35,2.1,3.55,int,laz
9,juventus,verona,1.29,10.0,5.2,ver,juv


# Merge

In [44]:
# Inner-join the predictions with the odds on the pair of 3-letter team codes;
# rows whose code pair appears in only one of the two frames are dropped.
merge_keys = ["home_code", "away_code"]
dfMerge = pd.merge(df538, dfOdds, how="inner", on=merge_keys)
dfMerge

Unnamed: 0,date,home_team_x,away_team_x,home_win,tie,away_win,home_code,away_code,away_team_y,home_team_y,odd_away_win,odd_home_win,odd_tie
0,12/29,crotone,napoli,0.1,0.19,0.7,cro,nap,napoli,crotone,1.28,9.5,5.6
1,12/30,fiorentina,ac milan,0.49,0.27,0.23,fio,ac,ac milan,fiorentina,3.3,2.2,3.35
2,12/30,roma,sassuolo,0.75,0.18,0.07,rom,sas,sassuolo,roma,9.0,1.32,5.25
3,12/30,sampdoria,spal,0.59,0.23,0.18,sam,spa,spal,sampdoria,4.7,1.7,3.9
4,12/30,atalanta,cagliari,0.66,0.21,0.12,ata,cag,cagliari,atalanta,6.75,1.47,4.2
5,12/30,torino,genoa,0.49,0.28,0.23,tor,gen,genoa,torino,4.4,1.82,3.5
6,12/30,bologna,udinese,0.34,0.29,0.37,bol,udi,udinese,bologna,3.15,2.3,3.25
7,12/30,benevento,chievo,0.33,0.29,0.39,ben,chi,chievo verona,benevento,2.25,3.2,3.35
8,12/30,inter milan,lazio,0.48,0.24,0.28,int,laz,lazio,inter,3.35,2.1,3.55
9,12/30,verona,juventus,0.08,0.16,0.76,ver,juv,juventus,verona,1.29,10.0,5.2


# Analyse

In [46]:
# For each outcome, multiply the predicted probability by the matching odds.
# Each probability column "<name>" pairs with the odds column "odd_<name>".
# (Presumably the odds are decimal odds, making the product the expected
# payout per unit staked — confirm.)
for outcome, prob_col in (("home", "home_win"), ("away", "away_win"), ("tie", "tie")):
    dfMerge["expect_" + outcome] = dfMerge[prob_col] * dfMerge["odd_" + prob_col]

# Show
dfMerge[["home_team_x", "away_team_x", "expect_home", "expect_tie", "expect_away"]]

Unnamed: 0,home_team_x,away_team_x,expect_home,expect_tie,expect_away
0,crotone,napoli,0.95,1.064,0.896
1,fiorentina,ac milan,1.078,0.9045,0.759
2,roma,sassuolo,0.99,0.945,0.63
3,sampdoria,spal,1.003,0.897,0.846
4,atalanta,cagliari,0.9702,0.882,0.81
5,torino,genoa,0.8918,0.98,1.012
6,bologna,udinese,0.782,0.9425,1.1655
7,benevento,chievo,1.056,0.9715,0.8775
8,inter milan,lazio,1.008,0.852,0.938
9,verona,juventus,0.8,0.832,0.9804
