# Scrape the odds
Website: https://www.unibet.eu/betting#filter/football/

The odds are scraped from Unibet using Selenium and Firefox.

Data is stored in `./data/This_months_odds.csv`.

In [None]:
import re
from pathlib import Path
from time import sleep

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## Initialize the browser

In [None]:
# start the browser: creates a Selenium-driven Firefox session
browser = webdriver.Firefox()

# have a generic wait object for the driver
# (explicit waits made through it give up after 10 seconds)
wait = WebDriverWait(browser, 10)

In [None]:
# wait for page to be done loading
def pageReady(interval=1, max_wait=60, driver=None):
    """Block until the page source stops changing and return the time waited.

    The page source is sampled every `interval` seconds. The page counts as
    ready as soon as two consecutive samples are identical.

    Parameters
    ----------
    interval : int or float
        Seconds to sleep between two samples.
    max_wait : int, float or None
        Give up once this many seconds have been slept, so that a page which
        keeps changing cannot block forever. `None` waits without a limit.
    driver : object, optional
        Anything exposing a `page_source` attribute. Defaults to the global
        `browser`.

    Returns
    -------
    int or float
        The accumulated sleep time in seconds (a multiple of `interval`).
    """
    if driver is None:
        driver = browser

    waited = 0
    # compare the raw source strings: cheaper than parsing every snapshot,
    # and the latest snapshot is reused as the baseline of the next round
    previous = driver.page_source
    while True:
        sleep(interval)
        waited += interval
        current = driver.page_source

        # two identical consecutive snapshots: the page has settled
        if current == previous:
            break

        # the page is still changing but the time budget is used up
        if max_wait is not None and waited >= max_wait:
            print('Page still changing after', waited, 'seconds, continuing anyway')
            break

        previous = current
    return waited

In [None]:
# go to Unibet
url_0 = 'https://www.unibet.eu/betting#filter/football/'
browser.get(url_0)

# wait
print('Page ready in', pageReady())

# click on the cookie dialog
# the banner may show up late or not at all, so wait until the button is
# clickable and carry on without it instead of aborting the whole scrape
try:
    wait.until(
        EC.element_to_be_clickable((By.ID, "CybotCookiebotDialogBodyButtonAccept"))
    ).click()
except TimeoutException:
    print('No cookie dialog found, continuing')

# wait again
print('Page ready in', pageReady())

## Crawl the leagues and store the page source code

In [None]:
# league pages to visit: the base football url followed by country/league
urls = [
    url_0 + league_path
    for league_path in (
        'netherlands/eredivisie',
        'spain/la_liga',
        'germany/bundesliga',
        'england/premier_league',
        'france/ligue_1',
        'italy/serie_a',
    )
]
urls

In [None]:
# parsed match list of every league, keyed by the league name
soups = {}

# visit the leagues one by one
for league_url in urls:

    # navigate to the league page
    print('Going to:', league_url)
    browser.get(league_url)

    # let the page settle before touching it
    print('Page ready in', pageReady())

    # expand every section that is still collapsed
    collapsed = browser.find_elements(
        By.CSS_SELECTOR,
        '.KambiBC-collapsible-container:not(.KambiBC-expanded)')
    for section in collapsed:
        section.click()

    # let the page settle again after the clicks
    print('Page ready in', pageReady())

    # parse the current page source
    page = BeautifulSoup(browser.page_source, 'html5lib')

    # the league name is the last path segment of the url;
    # keep only the list of matches for it
    league_name = league_url.rsplit('/', 1)[-1]
    soups[league_name] = page.select_one('.KambiBC-event-groups-list')

We got the source code. From here on we won't need the browser anymore.

In [None]:
# quit the browser: everything needed is already stored in `soups`
browser.quit()

## Scrape the soups we got

In [None]:
# fcn: extract match info from a <li.KambiBC-event-item>
def extractMatchInfo(match):
    """Return the team names and the three odds found in one match element.

    Parameters
    ----------
    match : element with a `select(css_selector)` method
        The match element; the items returned by `select` must expose `.text`.

    Returns
    -------
    dict
        Keys `home_team`, `away_team`, `odd_home_win`, `odd_tie` and
        `odd_away_win`, all holding the raw text of the matching element.
        A value is `None` when the element holds fewer names or odds than
        expected, so a single incomplete match does not abort the scrape.
    """
    teams = [node.text for node in match.select('.KambiBC-event-participants__name')]
    odds = [node.text for node in match.select('span.KambiBC-mod-outcome__odds')]

    def pick(values, position):
        # missing entries become None instead of raising IndexError
        return values[position] if position < len(values) else None

    # the first name is stored as the home team, the second as the away team;
    # the odds are read in the order home win, tie, away win
    return {'home_team': pick(teams, 0),
            'away_team': pick(teams, 1),
            'odd_home_win': pick(odds, 0),
            'odd_tie': pick(odds, 1),
            'odd_away_win': pick(odds, 2)}

In [None]:
# collect one data frame per league and combine them once at the end
# (cheaper than growing the main frame inside the loop)
frames = []

# loop over all soups
for league, soup in soups.items():

    # no match list was found on the page of this league
    if soup is None:
        print(league, ': no match list found, skipping')
        continue

    # find all green buttons (with odds)
    greenButtons = soup.select('button.KambiBC-mod-outcome')

    # find the <li> parent which represents one match
    parents = (o.find_parent('li', class_='KambiBC-event-item') for o in greenButtons)

    # several buttons share one parent: keep every parent once, in page order
    # (a set would shuffle the rows between runs), and drop the buttons that
    # are not part of a match item
    matches = list(dict.fromkeys(p for p in parents if p is not None))

    # extract info of all matches and store in a data frame
    df_0 = pd.DataFrame([extractMatchInfo(m) for m in matches])

    # add the league name to the data frame
    df_0['league'] = league
    print(league, ':', df_0.shape)

    # remember for the final concat
    frames.append(df_0)

# main data frame with all info; a fresh index avoids repeating the
# row labels 0..n for every league
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# show
print(df.shape)
df

## Some text transformation

In [None]:
# team names: lowercase first, then strip all accents
# https://stackoverflow.com/questions/37926248/how-to-remove-accents-from-values-in-columns
for team_col in ("home_team", "away_team"):
    lowered = df[team_col].str.lower()
    decomposed = lowered.str.normalize('NFKD')
    # characters that cannot be encoded as ascii are dropped
    as_bytes = decomposed.str.encode(encoding='ascii', errors='ignore')
    df[team_col] = as_bytes.str.decode('utf-8')

# odds: scraped as text, convert to numbers
for odd_col in ("odd_home_win", "odd_away_win", "odd_tie"):
    df[odd_col] = pd.to_numeric(df[odd_col])

# show
print(df.shape)
df

In [None]:
# save as csv
# create the target folder first so the write does not fail when
# ./data does not exist yet
out_path = Path('./data/This_months_odds.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)