In [1]:
import contextlib
import os
import sqlite3
import time

import bs4
import helium
import selenium

# `import selenium` by itself does not bind the `webdriver` subpackage; the
# attribute access below presumably only resolved because helium imports it
# as a side effect. Import it explicitly so this cell does not depend on that.
import selenium.webdriver

# Firefox options passed to every helium.start_firefox(options=options) call
# in this notebook: fixed 1024x768 window, no visible browser window.
options = selenium.webdriver.FirefoxOptions()
options.add_argument("--width=1024")
options.add_argument("--height=768")
options.add_argument("-headless")

In [2]:
def create_db_html_per_year(year):
    """Create the SQLite file for one year and its ``html`` table if missing.

    The database lives at ``data/html/html_{year}.db``. The call is
    idempotent: the parent directory is created when absent and the table is
    created with ``IF NOT EXISTS``, so re-running the cell (for example after
    an interrupted scrape) neither fails nor touches existing rows.

    Args:
        year: Value interpolated into the database file name.
    """
    db_path = f'data/html/html_{year}.db'
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    query = '''CREATE TABLE IF NOT EXISTS html (
        game_id TEXT NOT NULL PRIMARY KEY,
        summary_html TEXT,
        stats_html TEXT,
        odds_html TEXT)
        '''
    # `with connection:` only commits or rolls back; `closing` is what
    # actually releases the file handle when the block exits.
    with contextlib.closing(sqlite3.connect(db_path)) as con:
        with con:
            con.execute(query)

In [3]:
def populate_db_html_per_year(year):
    """Collect the game ids for ``year`` and store each game's HTML.

    Args:
        year: Forwarded to ``get_games_ids_per_year`` to list the games and
            to ``populate_db_html_per_games_ids`` to select the database.
    """
    populate_db_html_per_games_ids(get_games_ids_per_year(year), year)

In [4]:
def get_games_ids_per_year(year):
    """Return the game ids listed on the results page of one NBA season.

    Opens the ``nba-{year}-{year+1}`` results page in a Firefox session
    started with the notebook-level ``options``, expands the full list via
    ``get_html_from_page`` and extracts the ids from the resulting HTML.

    Args:
        year: First year of the season; must support ``year + 1``.

    Returns:
        The list produced by ``get_games_ids_from_html``.
    """
    results_url = f'flashscore.com/basketball/usa/nba-{year}-{year+1}/results/'
    with helium.start_firefox(options=options) as browser:
        helium.go_to(results_url)
        time.sleep(2)  # fixed pause before interacting with the page
        page_html = get_html_from_page(browser)
    # Parsing happens after the browser session has been left.
    return get_games_ids_from_html(page_html)


def get_html_from_page(driver):
    """Expand the page by clicking 'Show more matches' until it is gone.

    Each round waits two seconds, scrolls to the bottom of the document and
    clicks the 'Show more matches' element. The loop stops at the first
    ``LookupError`` (presumably raised by ``helium.click`` once the element
    can no longer be found).

    Args:
        driver: Browser driver exposing ``execute_script`` and ``page_source``.

    Returns:
        ``driver.page_source`` as it is after the last round.
    """
    while True:
        try:
            time.sleep(2)
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            helium.click('Show more matches')
        except LookupError:
            break
    return driver.page_source


def get_games_ids_from_html(html):
    """Extract game ids from the HTML of a results page.

    Every ``div`` matched by class ``event__match`` contributes the part of
    its ``id`` attribute that follows the last underscore.

    Args:
        html: Page source to parse with ``html.parser``.

    Returns:
        List of id suffixes, in the order ``find_all`` yields the divs.
    """
    parsed = bs4.BeautifulSoup(html, 'html.parser')
    match_divs = parsed.find_all('div', {'class': 'event__match'})
    return [div['id'].split('_')[-1] for div in match_divs]

In [5]:
def populate_db_html_per_games_ids(games_ids, year):
    """Scrape each game's HTML and insert it into that year's database.

    The function is resumable: ids that already have a row in the ``html``
    table, and repeated ids within ``games_ids``, are skipped, so re-running
    after an interruption continues where the previous run stopped instead of
    failing on the ``game_id`` primary key. No browser is started when
    nothing is left to fetch.

    Args:
        games_ids: Iterable of game ids to scrape.
        year: Selects the database file ``data/html/html_{year}.db``; its
            ``html`` table must already exist.
    """
    query = '''INSERT INTO html VALUES (
        :game_id,
        :summary_html,
        :stats_html,
        :odds_html)
        '''
    # One connection for the whole run; `closing` guarantees it is released,
    # which `with connection:` alone does not do.
    with contextlib.closing(sqlite3.connect(f'data/html/html_{year}.db')) as con:
        already_stored = {row[0] for row in con.execute('SELECT game_id FROM html')}
        # dict.fromkeys drops duplicate ids while keeping the original order.
        pending = [
            game_id for game_id in dict.fromkeys(games_ids)
            if game_id not in already_stored
        ]
        if not pending:
            return
        with helium.start_firefox(options=options) as driver:
            for position, game_id in enumerate(pending, start=1):
                print(f'{game_id} {position}/{len(pending)}')
                row = get_row_db_html(game_id, driver)
                # Commit per game so an interruption loses at most the game
                # currently being scraped.
                with con:
                    con.execute(query, row)

In [6]:
def _load_detail_container(driver, url):
    """Open ``url``, wait two seconds and return its ``container__detail`` div (or None)."""
    helium.go_to(url)
    time.sleep(2)
    page = bs4.BeautifulSoup(driver.page_source, 'html.parser')
    return page.find('div', {'class': 'container__detail'})


def get_row_db_html(game_id, driver):
    """Fetch the summary, statistics and odds HTML of one game.

    The summary page is always loaded. The statistics and odds tabs are
    loaded only when the summary container holds a link to them; both checks
    run against the summary container, so whether the odds tab is fetched
    does not depend on what the statistics page contains.

    Args:
        game_id: Id interpolated into ``flashscore.com/match/{game_id}/``.
        driver: Browser driver whose ``page_source`` reflects the page that
            helium last navigated to.

    Returns:
        Dict with keys ``game_id``, ``summary_html``, ``stats_html`` and
        ``odds_html``. A tab that is not linked, or whose container is
        missing after navigation, is left as ``None`` (never the string
        ``'None'``).

    Raises:
        RuntimeError: If the summary page has no ``container__detail`` div.
    """
    row = {
        'game_id': game_id,
        'summary_html': None,
        'stats_html': None,
        'odds_html': None,
    }
    url = f'flashscore.com/match/{game_id}/'

    summary = _load_detail_container(driver, url)
    if summary is None:
        # Fail loudly instead of an opaque AttributeError on None further down.
        raise RuntimeError(f'no container__detail found for game {game_id!r}')
    row['summary_html'] = str(summary)

    optional_tabs = (
        ('stats_html', '#match-summary/match-statistics'),
        ('odds_html', '#odds-comparison'),
    )
    for key, fragment in optional_tabs:
        if summary.find_all('a', {'href': fragment}):
            container = _load_detail_container(driver, f'{url}{fragment}')
            if container is not None:
                row[key] = str(container)

    return row

In [7]:
# Create data/html/html_2013.db, then scrape every game listed on the
# nba-2013-2014 results page into it. Long-running: each game is opened in
# the browser with fixed two-second waits per page load.
create_db_html_per_year(2013)
populate_db_html_per_year(2013)

KeyboardInterrupt: 