In [244]:
# Python 3.6.0 |Anaconda 4.3.1 (64-bit)|

from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError

import pandas as pd

from bs4 import BeautifulSoup as bs
import urllib.request

from datetime import datetime, timedelta
from pandas import HDFStore

import os
import re

import h5py
from pandas import HDFStore

import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)

Loader

In [245]:
def from_url_to_bs4(url, max_retries = None, retry_delay = 1.0):

    """
    Download the page at `url` and return it parsed by BeautifulSoup ("lxml" parser).

    Failed requests are reported on stdout and retried.

    Parameters:
        url          address of the page to download
        max_retries  number of retries allowed after the first failed attempt;
                     None (the default) keeps retrying until the request succeeds
        retry_delay  seconds to wait between two attempts

    Raises:
        HTTPError / URLError  the last failure, once `max_retries` is exhausted
    """

    import time  # local import: `time` is not part of the notebook's import cell

    failures = 0

    while True:
        try:
            # The context manager closes the connection as soon as the body is read.
            with urlopen(Request(url)) as response:
                html_read = response.read()
            break
        except HTTPError as e:
            # HTTPError is a subclass of URLError, so it has to be handled first;
            # otherwise this branch can never run.
            print('HTTPError. Error code: ', e.code)
            print('HTTPError. Reason: ', e.reason)
            failure = e
        except URLError as e:
            print('URLError. Failed to reach a server.')
            print('Reason: ', e.reason)
            failure = e

        failures += 1
        if max_retries is not None and failures > max_retries:
            raise failure

        # Pause so that a failing server is not hit in a tight loop.
        time.sleep(retry_delay)

    return bs(html_read, "lxml")

Helper

In [246]:
def get_current_date():

    """
    Return the current local date and time as a "YYYY-MM-DD HH:MM" string,
    i.e. in the same layout as the kick-off strings built by get_kick_off().
    """

    return datetime.now().strftime("%Y-%m-%d %H:%M")



def remove_duplicate_prices(df):

    """
    Drop repeated rows from `df` in place and return the same dataframe.

    Two rows are duplicates when they share fixture, kickoff, market and
    selection; the first such row is kept.

    NOTE(review): 'price' and 'date' are not part of the key, so a later row
    with a different price for the same selection is dropped as well -
    confirm this is intended.
    """

    key_columns = ['fixture', 'kickoff', 'market', 'selection']
    df.drop_duplicates(subset = key_columns, inplace = True)

    return df

Extractor

In [247]:
def get_markets_from_oddschecker(url):

    """
    Returns a list with the markets linked from a fixture's betting-markets page
    on www.oddschecker.com

    URL example: 'https://www.oddschecker.com/football/italy/serie-a/fiorentina-v-inter/betting-markets'

    A market is whatever follows the fixture path in the href of an anchor whose
    first attribute is that href, e.g. 'winner' for
    <a href="/football/italy/serie-a/fiorentina-v-inter/winner">.

    Raises:
        ValueError  if `url` is not an oddschecker betting-markets URL
    """

    invalid_url_message = ("URL is invalid. \nURL example: "
                           "'https://www.oddschecker.com/football/italy/serie-a/fiorentina-v-inter/betting-markets'")

    if "/betting-markets" not in url:
        raise ValueError(invalid_url_message)

    # Validate the URL completely before spending a network round trip on it.
    result = re.search(r'https://www\.oddschecker\.com/(.*)betting-markets', url)
    if result is None:
        raise ValueError(invalid_url_message)

    link_core = result.group(1)

    # The fixture path is literal text, so it is escaped before going into a regex.
    # [^"]* stops at the quote closing the href, so further attributes or nested
    # markup can never leak into the market name.
    link_pattern = re.compile('<a href="/' + re.escape(link_core) + '([^"]*)"')

    bs4 = from_url_to_bs4(url)

    list_of_markets = []
    for link in bs4.find_all(href = True):
        market_name = link_pattern.search(str(link))
        if market_name is not None:
            list_of_markets.append(market_name.group(1))

    return list_of_markets



def get_best_prices_for_market(url):

    """
    Given URL, returns dictionary with selections and prices for the corresponding
    fixture and market

    The result has two parallel lists: {"selection": [...], "price": [...]}.
    Selections come from the 'data-name' attribute of every element with class
    "add-to-bet-basket"; the price is the text between the last ', ' and the
    last ')' of its 'data-ng-click' attribute, kept as a string.

    Raises:
        KeyError  if an element lacks one of the two attributes, or if no price
                  can be extracted from 'data-ng-click'. KeyError is used for
                  both cases because get_df_for_fixture() skips a market on
                  KeyError, so a malformed button does not abort the whole run.
    """

    # Raw string: "\)" is not a valid escape sequence in a normal string literal.
    price_pattern = re.compile(r"(.*), (.*), (.*)\)")

    bs4 = from_url_to_bs4(url)
    best_odds = bs4.find_all(class_ = "add-to-bet-basket")

    selections = []
    prices = []

    for record in best_odds:
        selection = record['data-name']
        price_string = record['data-ng-click']

        price_match = price_pattern.search(price_string)
        if price_match is None:
            raise KeyError("no price found in data-ng-click: " + repr(price_string))

        selections.append(selection)
        prices.append(price_match.group(3))

    return {"selection": selections, "price": prices}



def get_kick_off(url):

    """
    Given URL, returns fixture's kick-off date and time as a string

    The value is taken from the first 'startDate":"YYYY-MM-DDTHH:MM' occurrence
    in the page source and returned as "YYYY-MM-DD HH:MM".

    Raises:
        ValueError  if the page contains no such startDate entry
    """

    bs4 = from_url_to_bs4(url)
    source = str(bs4)

    # Raw string so that \d reaches the regex engine and is not treated as an
    # (invalid) string escape.
    kick_off_match = re.search(r'startDate":"(\d\d\d\d-\d\d-\d\dT\d\d:\d\d)', source)
    if kick_off_match is None:
        raise ValueError("No startDate found on page: " + url)

    return kick_off_match.group(1).replace('T', " ")



def parse_football_url(url):

    """
    Given URL, returns a dictionary with the market attributes encoded in it

    The URL is matched against six slash-separated groups after "https://";
    the fifth group becomes "fixture" and the sixth becomes "market".
    Because the groups are greedy, these are the last two path segments.
    """

    football_url_pattern = "https://(.*)/(.*)/(.*)/(.*)/(.*)/(.*)"
    matched = re.compile(football_url_pattern).search(url)

    # Only the two trailing groups are of interest.
    *_, fixture, market = matched.groups()

    return {"fixture": fixture, "market": market}



def get_df_for_market(url):

    """
    Given URL, returns dataframe with market attributes and prices

    The frame has one row per selection and the columns
    date, fixture, kickoff, market, selection, price (in that order), with a
    0..n-1 index. date, fixture, kickoff and market are identical on every row.

    When the page offers no prices, an empty frame with the same columns is
    returned straight away, without fetching the kick-off time.
    """

    columns = ['date', 'fixture', 'kickoff', 'market', 'selection', 'price']

    prices_dict = get_best_prices_for_market(url)

    if len(prices_dict['price']) == 0:
        return pd.DataFrame(columns = columns)

    market_attrs = parse_football_url(url)
    kick_off = get_kick_off(url)
    date = get_current_date()

    # Scalars are broadcast by pandas to the length of the selection/price
    # lists, so the attribute row does not have to be concatenated n times.
    dataframe = pd.DataFrame(
        {
            'date': date,
            'fixture': market_attrs['fixture'],
            'kickoff': kick_off,
            'market': market_attrs['market'],
            'selection': prices_dict['selection'],
            'price': prices_dict['price'],
        },
        columns = columns,
    )

    return dataframe



def get_fixtures_from_oddschecker(country, league, days = None):

    """
    Given country and league, returns a list of fixtures filtered by days ahead.

    Parameters:
        country  country path segment of the league URL, e.g. 'italy'
        league   league path segment of the league URL, e.g. 'serie-a'
        days     keep fixtures kicking off no later than this many days from
                 now; None (the default) means 2

    Only an upper bound is applied: fixtures whose kick-off already lies in the
    past are kept as well.
    """

    if days is None:
        days = 2

    url_league = 'https://www.oddschecker.com/football/' + country + '/' + league + '/'
    url_core = url_league.replace('https://www.oddschecker.com', "")

    # The league path is literal text, so it is escaped before use in a regex.
    fixture_pattern = re.compile(re.escape(url_core) + "(.*)" + "/winner")

    bs4 = from_url_to_bs4(url_league)
    all_fixtures_links = bs4.find_all(class_ = "button btn-1-small", href = True)

    all_fixtures = []
    # The last button is deliberately left out; presumably it is not a fixture
    # link - TODO confirm against the page layout.
    for fixture_link in all_fixtures_links[:-1]:
        found = fixture_pattern.search(str(fixture_link))
        # Buttons that do not point at a fixture's winner market are skipped
        # and no longer crash the run.
        if found is not None:
            all_fixtures.append(found.group(1))

    latest_kick_off = datetime.now() + timedelta(days)

    filtered_fixtures = []
    for fixture in all_fixtures:
        # url_league already ends with '/', so no extra separator is added.
        fixture_url = url_league + fixture

        kick_off_str = get_kick_off(fixture_url)
        kick_off_date = datetime.strptime(kick_off_str, "%Y-%m-%d %H:%M")

        if kick_off_date <= latest_kick_off:
            filtered_fixtures.append(fixture)

    return filtered_fixtures



def get_df_for_fixture(fixtures, country, league):

    """
    Given a list of fixtures, returns one dataframe with the prices of every
    market of every fixture

    Parameters:
        fixtures  fixture path segments, e.g. ['fiorentina-v-inter']
        country   country path segment of the league URL
        league    league path segment of the league URL

    Each fixture name is printed as progress output. A market whose page raises
    KeyError while being parsed is reported and skipped. The result has a fresh
    0..n-1 index; with nothing scraped an empty pd.DataFrame() is returned.
    """

    url_football = 'https://www.oddschecker.com/football/'
    frames = []

    for fixture in fixtures:

        print ('\t\t\t' + fixture)

        url_fixture = url_football + country + '/' + league + '/' + fixture + '/'
        url_markets = url_fixture + 'betting-markets'

        # Every market goes through the same guarded path, including the first
        # market of the first fixture, and a fixture without markets is fine.
        for market in get_markets_from_oddschecker(url_markets):
            try:
                frames.append(get_df_for_market(url_fixture + market))
            except KeyError:
                print ("Error occured in " + market)

    if not frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the growing frame per market.
    return pd.concat(frames, ignore_index = True)


Database handler

In [248]:
def create_football_h5():

    """
    Creates the hdf5 file 'db.h5' with a single empty 'football' group

    The file is opened in 'w' mode, so an existing 'db.h5' is replaced.
    Returns the h5py file handle, which is already closed when returned.
    """

    # The context manager closes the file even if creating the group fails.
    with h5py.File('db.h5', 'w') as h5:
        h5.create_group('football')

    return h5



def print_h5_structure(file):

    """
    Prints the structure of given hdf5 file as a tab-indented tree

    Four levels are walked - sport, country, league, table - each one tab
    deeper than its parent, and a blank separator is printed after every sport.
    """

    for sport in file.keys():
        print (sport)
        sport_node = file[sport]

        for country in sport_node:
            print ('\t' + country)
            country_node = sport_node[country]

            for league in country_node:
                print ('\t\t' + league)

                for table in country_node[league]:
                    print ('\t\t\t' + table)

        print ('\n')

        
        
def add_league_to_football_h5(leagues):

    """
    Adds leagues to the 'football' group of 'db.h5'; returns the h5 handle

    Parameters:
        leagues  dict mapping a country to one league name or to a list of
                 league names; a group '<country>/<league>' is ensured for
                 each pair

    Groups that already exist are left untouched, so the call can be repeated.
    The returned h5py file handle is already closed.

    NOTE(review): names are used exactly as given, while update_league_in_db()
    strips 'other/' / 'world/' and replaces '-' with '_' before building its
    key - confirm which naming the groups should follow.
    """

    # The context manager closes the file even if a group cannot be created.
    with h5py.File('db.h5', 'r+') as h5:

        football = h5['football']

        for country, country_leagues in leagues.items():
            # A bare string would otherwise be iterated character by character.
            if isinstance(country_leagues, str):
                country_leagues = [country_leagues]

            for league in country_leagues:
                # require_group opens an existing group and only creates a
                # missing one; create_group raises if the group exists.
                football.require_group(country + '/' + league)

    return h5



def update_league_in_db(df, country, league):

    """
    Update league in database given dataframe, country and league

    The rows of `df` are appended to the table stored in 'db.h5' under
    'football/<country>/<league>', duplicates are removed with
    remove_duplicate_prices() and the table is written back with a fresh index.

    The key is normalised first: 'english' becomes 'england', the 'other/' and
    'world/' prefixes are removed from the country, and '-' in the league is
    replaced by '_'.

    If the league has no readable table yet, `df` becomes its initial content
    and is no longer silently discarded. Nothing is written when there are
    no rows at all.
    """

    columns = ['date', 'fixture', 'kickoff', 'market', 'selection', 'price']

    if country == 'english':
        country = 'england'
    country = country.replace('other/', '')
    country = country.replace('world/', '')
    league = league.replace('-', '_')

    key = 'football/' + country + '/' + league

    # The context manager closes the store whatever happens inside.
    with HDFStore('db.h5') as store:
        try:
            data_h5 = store[key]
        except (KeyError, TypeError):
            # KeyError: no node under this key. TypeError: presumably a bare
            # group without pandas data (pandas cannot build a storer for it) -
            # verify against the pandas version in use.
            data_h5 = None

        frames = [frame for frame in (data_h5, df) if frame is not None and not frame.empty]
        if not frames:
            return

        data_updated = pd.concat(frames, axis = 0)
        data_updated = data_updated.loc[:, columns]
        data_updated = remove_duplicate_prices(data_updated)
        data_updated.reset_index(drop = True, inplace = True)

        store[key] = data_updated
    
    
    
def update_db(leagues):

    """
    Update database given dictionary with leagues

    `leagues` maps each country to its list of leagues. For every pair the
    country and league are printed, the upcoming fixtures are scraped and
    their prices are merged into the database.
    """

    for country, country_leagues in leagues.items():
        for league in country_leagues:

            print(country)
            print('\t' + league)

            upcoming = get_fixtures_from_oddschecker(country, league)
            scraped = get_df_for_fixture(upcoming, country, league)

            update_league_in_db(scraped, country, league)

In [249]:
# Leagues to scrape: {country path segment: [league path segments]}.
# Both parts are spliced into 'https://www.oddschecker.com/football/<country>/<league>/'
# by get_fixtures_from_oddschecker(), so they must be spelled as in the site's URLs.
# update_league_in_db() removes the 'other/' and 'world/' prefixes, maps 'english'
# to 'england' and replaces '-' with '_' in the league when building the HDF5 key.
leagues = {
        'italy': ['serie-a', 'serie-b'],
        'english': ['premier-league', 'championship'],
        'france': ['ligue-1', 'ligue-2'],
        'spain': ['la-liga-primera', 'la-liga-segunda'],
        'germany': ['bundesliga', 'bundesliga-2'],
        'other/belgium': ['jupiler-pro-league'],
        'other/russia' : ['premier-league', '1-division'],
        'other/netherlands': ['eredivisie'],
        'other/portugal': ['primeira-liga'],
        'other/switzerland': ['super-league'],
        'other/norway' : ['tippeligaen'],
        'other/finland' : ['veikkausliiga'],
        'other/turkey': ['super-lig'],
        'world/usa': ['mls'],
        'world/china': ['super-league']
    }

In [250]:
# Scrape every league configured in `leagues` and merge the prices into 'db.h5'.
# This downloads pages from www.oddschecker.com, so it needs network access.
update_db(leagues)

In [251]:
# scrape df manually

# country = 'other/sweden'
# league = 'allsvenskan'
# fixtures = get_fixtures_from_oddschecker(country, league)
# df = get_df_for_fixture(fixtures, country, league)

In [252]:
# h5 = add_league_to_football_h5(test)

In [253]:
# store = HDFStore('db.h5')
# store['football/sweden/allsvenskan'] = df

In [236]:
# store.close()