# Scraper for minor leagues
Temporary extra bets to get the bonus money

## Predictions from FiveThirtyEight

In [2]:
from lxml import html
from lxml.cssselect import CSSSelector
import requests
import re
import pandas as pd

In [23]:
# League page to scrape (swap in the commented URL for the Championship)
#url = 'https://projects.fivethirtyeight.com/soccer-predictions/championship/'
url = 'https://projects.fivethirtyeight.com/soccer-predictions/la-liga-2/'

# Fetch the page; the timeout keeps the cell from hanging forever on a stalled connection
page = requests.get(url, timeout=30)

# Fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()

# Parse the raw HTML into an element tree
tree = html.fromstring(page.content)

In [24]:
# Select the upcoming match containers. Two strategies exist; keep exactly one active
# (previously both ran and the first result was immediately overwritten).
#   Option A - skip the containers marked .hidden:
#matches = tree.cssselect('.games-container.upcoming .match-container:not(.hidden)')
#   Option B - don't skip the hidden ones, just take the first 12:
matches = tree.cssselect('.games-container.upcoming .match-container')[:12]
print("Number of matches found: ", len(matches))

Number of matches found:  12


In [25]:
# Output columns, in display order
cols = ['date', 'home_team', 'away_team', 'home_win', 'tie', 'away_win']

# CSS selector (relative to one match container) that holds each column's text
selectors = {
    'date': ".date div",
    'home_team': ".match-top .name",
    'away_team': ".match-bottom .name",
    'home_win': ".match-top .prob",
    'tie': ".tie-prob div",
    'away_win': ".match-bottom .prob",
}

# Collect one record per match and build the frame in a single step, rather than
# growing an empty DataFrame one cell at a time
rows = [
    {col: match.cssselect(selectors[col])[0].text for col in cols}
    for match in matches
]
df538 = pd.DataFrame(rows, columns=cols)

df538

Unnamed: 0,date,home_team,away_team,home_win,tie,away_win
0,1/6,Sporting Gijón,Córdoba,58%,23%,19%
1,1/6,Osasuna,Valladolid,53%,24%,23%
2,1/6,Rayo Vallecano,Gimnástic,61%,24%,16%
3,1/6,Cádiz,Granada,44%,31%,25%
4,1/6,Reus,Alcorcón,32%,39%,29%
5,1/6,Zaragoza,Barcelona B,50%,26%,24%
6,1/7,Sevilla Atlético,Lorca,40%,32%,28%
7,1/7,Albacete,Tenerife,34%,30%,35%
8,1/7,Huesca,Oviedo,53%,27%,20%
9,1/7,Cultural Leonesa,Numancia,31%,28%,42%


### Data transformation

In [33]:
# Convert the percentage strings (e.g. "58%") into fractions (0.58)
for prob_col in ('home_win', 'tie', 'away_win'):
    without_sign = df538[prob_col].str.replace("%","")
    df538[prob_col] = pd.to_numeric(without_sign)/100

# The date column is kept for now; uncomment to drop it
#del df538["date"]

for team_col, code_col in (("home_team", "home_code"), ("away_team", "away_code")):
    # Lowercase the team names
    lowered = df538[team_col].str.lower()

    # Strip accents: decompose the characters, drop everything non-ASCII, decode back to text
    decomposed = lowered.str.normalize('NFKD')
    ascii_bytes = decomposed.str.encode(encoding='ascii',errors='ignore')
    df538[team_col] = ascii_bytes.str.decode('utf-8')

    # The first three letters of the cleaned name serve as the team code
    df538[code_col] = df538[team_col].str[:3]

# Show
df538

Unnamed: 0,date,home_team,away_team,home_win,tie,away_win,home_code,away_code
0,1/6,sporting gijon,cordoba,0.58,0.23,0.19,spo,cor
1,1/6,osasuna,valladolid,0.53,0.24,0.23,osa,val
2,1/6,rayo vallecano,gimnastic,0.61,0.24,0.16,ray,gim
3,1/6,cadiz,granada,0.44,0.31,0.25,cad,gra
4,1/6,reus,alcorcon,0.32,0.39,0.29,reu,alc
5,1/6,zaragoza,barcelona b,0.5,0.26,0.24,zar,bar
6,1/7,sevilla atletico,lorca,0.4,0.32,0.28,sev,lor
7,1/7,albacete,tenerife,0.34,0.3,0.35,alb,ten
8,1/7,huesca,oviedo,0.53,0.27,0.2,hue,ovi
9,1/7,cultural leonesa,numancia,0.31,0.28,0.42,cul,num


## Odds from Unibet

In [9]:
import re
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

In [10]:
# Start our headless (no GUI) browser.
# The PhantomJS driver is deprecated in Selenium 3 and absent from Selenium 4, so use it
# only when this Selenium version still provides it; otherwise use headless Chrome.
if hasattr(webdriver, "PhantomJS"):
    browser = webdriver.PhantomJS()
else:
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    browser = webdriver.Chrome(options=chrome_options)

In [26]:
# League page to scrape (swap in the commented URL for the Championship)
#url = 'https://www.unibet.eu/betting#filter/football/england/the_championship'
url = 'https://www.unibet.eu/betting#filter/football/spain/laliga2'

# Open the page, then pause (presumably so it can finish rendering) before reading it
page_load_pause = 2.75
browser.get(url)
sleep(page_load_pause)

# Parse the page source as it stands after the pause
rendered_html = browser.page_source
soup = BeautifulSoup(rendered_html, 'html5lib')

In [27]:
# Find the dropdown bars.
# find_elements(By.CLASS_NAME, ...) is used because the find_elements_by_* helpers
# were removed in Selenium 4.3; this form works on older and newer versions alike.
dropdowns = browser.find_elements(By.CLASS_NAME, 'KambiBC-collapsible-header')

# Click on each of them, pausing briefly between clicks
for dropdown in dropdowns:
    dropdown.click()
    sleep(0.75)

# Re-read the source code now that the dropdowns have been clicked
soup = BeautifulSoup(browser.page_source, 'html5lib')

In [28]:
# Look up the element with id "KambiBC-container", then collect every event <li> inside it
container = soup.find(id="KambiBC-container")
matches = container.find_all("li", class_="KambiBC-event-item")
print("Number of matches found: ", len(matches))

Number of matches found:  11


In [29]:
def _parse_event(event):
    """Return the team names and odds texts of one event item as a dict.

    The first two participant-name elements are stored as home and away team;
    the first three odds elements are stored as home win, tie and away win.
    """
    # The start date (class "KambiBC-event-item__start-time--date") is not read:
    # extracting it did not seem to work.
    #date = event.find(class_="KambiBC-event-item__start-time--date").text
    names = event.find_all(class_="KambiBC-event-participants__name")
    home_name = names[0].text
    away_name = names[1].text
    prices = event.find_all(class_="KambiBC-mod-outcome__odds")
    return {
        "home_team": home_name,
        "away_team": away_name,
        "odd_home_win": prices[0].text,
        "odd_tie": prices[1].text,
        "odd_away_win": prices[2].text,
    }


# One dictionary per event found on the page
jobs = [_parse_event(event) for event in matches]

In [30]:
# Make a data frame and only keep the top 12 rows for now.
# .copy() makes the result an independent frame rather than a slice of the full one,
# so assigning columns to dfOdds afterwards does not raise SettingWithCopyWarning.
dfOdds = pd.DataFrame(jobs).head(12).copy()

# Show
dfOdds

Unnamed: 0,away_team,home_team,odd_away_win,odd_home_win,odd_tie
0,Valladolid,Osasuna,3.2,2.4,3.0
1,Córdoba CF,Sporting de Gijón,5.0,1.75,3.35
2,Granada CF,Cadiz,3.6,2.2,3.0
3,Gimnástic de Tarragona,Rayo Vallecano,6.75,1.5,3.9
4,Alcorcón,Reus Deportiu,3.0,2.8,2.7
5,FC Barcelona B,Real Zaragoza,3.9,1.95,3.35
6,La Hoya Lorca CF,Sevilla Atlético,3.6,2.2,3.0
7,Tenerife,Albacete Balompié,3.25,2.45,2.9
8,Numancia,Cultural Leonesa,2.85,2.6,3.1
9,Real Oviedo,Huesca,4.3,1.9,3.2


### Data transformation

In [39]:
# Per-country renames that turn the bookmaker's team names into the names used in
# the 538 data frame (applied as substring replacements on the cleaned names)
changes_nl = {}
changes_de = {}
changes_es = {'deportiva las palmas': 'las palmas',
             'fc barcelona': 'barcelona',
             'real zaragoza': 'zaragoza',
             'la hoya lorca cf': 'lorca',
             'real oviedo': 'oviedo',
             'ud almeria': 'almeria'}
changes_en = {'queens park rangers': 'qpr'}
changes_fr = {}
changes_it = {'hellas verona': 'verona'}

# Combine them into a single lookup; later dictionaries win on duplicate keys
changes = {}
for per_country in (changes_nl, changes_de, changes_es, changes_en, changes_fr, changes_it):
    changes.update(per_country)

for team_col, code_col in (("home_team", "home_code"), ("away_team", "away_code")):
    # Lowercase, then strip accents (decompose, drop non-ASCII, decode back to text)
    names = dfOdds[team_col].str.lower()
    names = names.str.normalize('NFKD').str.encode(encoding='ascii',errors='ignore').str.decode('utf-8')

    # Apply every rename in turn
    for old,new in changes.items():
        names = names.str.replace(old, new)
    dfOdds[team_col] = names

    # The first three letters of the cleaned name serve as the team code
    dfOdds[code_col] = dfOdds[team_col].str[:3]

# Convert the odds from text to numbers
for odds_col in ("odd_home_win", "odd_away_win", "odd_tie"):
    dfOdds[odds_col] = pd.to_numeric(dfOdds[odds_col])

# Show
dfOdds

Unnamed: 0,away_team,home_team,odd_away_win,odd_home_win,odd_tie,home_code,away_code
0,valladolid,osasuna,3.2,2.4,3.0,osa,val
1,cordoba cf,sporting de gijon,5.0,1.75,3.35,spo,cor
2,granada cf,cadiz,3.6,2.2,3.0,cad,gra
3,gimnastic de tarragona,rayo vallecano,6.75,1.5,3.9,ray,gim
4,alcorcon,reus deportiu,3.0,2.8,2.7,reu,alc
5,barcelona b,zaragoza,3.9,1.95,3.35,zar,bar
6,lorca,sevilla atletico,3.6,2.2,3.0,sev,lor
7,tenerife,albacete balompie,3.25,2.45,2.9,alb,ten
8,numancia,cultural leonesa,2.85,2.6,3.1,cul,num
9,oviedo,huesca,4.3,1.9,3.2,hue,ovi


# Merge

In [40]:
# Inner-join predictions and odds on the pair of 3-letter team codes;
# matches present in only one of the two frames are dropped
join_keys = ["home_code", "away_code"]
dfMerge = pd.merge(df538, dfOdds, how="inner", on=join_keys)
dfMerge

Unnamed: 0,date,home_team_x,away_team_x,home_win,tie,away_win,home_code,away_code,away_team_y,home_team_y,odd_away_win,odd_home_win,odd_tie
0,1/6,sporting gijon,cordoba,0.58,0.23,0.19,spo,cor,cordoba cf,sporting de gijon,5.0,1.75,3.35
1,1/6,osasuna,valladolid,0.53,0.24,0.23,osa,val,valladolid,osasuna,3.2,2.4,3.0
2,1/6,rayo vallecano,gimnastic,0.61,0.24,0.16,ray,gim,gimnastic de tarragona,rayo vallecano,6.75,1.5,3.9
3,1/6,cadiz,granada,0.44,0.31,0.25,cad,gra,granada cf,cadiz,3.6,2.2,3.0
4,1/6,reus,alcorcon,0.32,0.39,0.29,reu,alc,alcorcon,reus deportiu,3.0,2.8,2.7
5,1/6,zaragoza,barcelona b,0.5,0.26,0.24,zar,bar,barcelona b,zaragoza,3.9,1.95,3.35
6,1/7,sevilla atletico,lorca,0.4,0.32,0.28,sev,lor,lorca,sevilla atletico,3.6,2.2,3.0
7,1/7,albacete,tenerife,0.34,0.3,0.35,alb,ten,tenerife,albacete balompie,3.25,2.45,2.9
8,1/7,huesca,oviedo,0.53,0.27,0.2,hue,ovi,oviedo,huesca,4.3,1.9,3.2
9,1/7,cultural leonesa,numancia,0.31,0.28,0.42,cul,num,numancia,cultural leonesa,2.85,2.6,3.1


# Analyse

In [41]:
# Expected value per outcome = predicted probability * bookmaker odds
# (assuming the odds are decimal odds, a value above 1 would be a favourable bet)
for expect_col, prob_col, odds_col in (
    ("expect_home", "home_win", "odd_home_win"),
    ("expect_away", "away_win", "odd_away_win"),
    ("expect_tie", "tie", "odd_tie"),
):
    dfMerge[expect_col] = dfMerge[prob_col] * dfMerge[odds_col]

# Show
shown_cols = ["home_team_x", "away_team_x", "expect_home", "expect_tie", "expect_away"]
dfMerge[shown_cols]

Unnamed: 0,home_team_x,away_team_x,expect_home,expect_tie,expect_away
0,sporting gijon,cordoba,1.015,0.7705,0.95
1,osasuna,valladolid,1.272,0.72,0.736
2,rayo vallecano,gimnastic,0.915,0.936,1.08
3,cadiz,granada,0.968,0.93,0.9
4,reus,alcorcon,0.896,1.053,0.87
5,zaragoza,barcelona b,0.975,0.871,0.936
6,sevilla atletico,lorca,0.88,0.96,1.008
7,albacete,tenerife,0.833,0.87,1.1375
8,huesca,oviedo,1.007,0.864,0.86
9,cultural leonesa,numancia,0.806,0.868,1.197
