In [2]:
import bs4
import helium
import os
import random
import selenium
import sqlite3
import time

In [3]:
# Firefox launch options shared by every helper that starts a browser:
# a fixed 1024x768 window, run headless.
options = selenium.webdriver.FirefoxOptions()
for firefox_arg in ("--width=1024", "--height=768", "-headless"):
    options.add_argument(firefox_arg)

In [110]:
def populate_games_html_per_year(year, show_progress=False):
    """Scrape and store the HTML of every game listed for ``year``.

    The game ids come from ``get_games_ids_per_year(year)``; each one is
    handed to ``populate_games_html_per_game_id``, which writes one row to
    ``data/games_html_{year}.db``.

    Args:
        year: Value forwarded to the id lookup and used in the database
            file name.
        show_progress: When true, print ``"<year> <game_id> <n>/<total>"``
            before each game is processed.
    """
    games_ids = get_games_ids_per_year(year)
    total = len(games_ids)
    # TODO: open the browser once here instead of once per game.
    for count, game_id in enumerate(games_ids, start=1):
        if show_progress:
            print(f'{year} {game_id} {count}/{total}')
        populate_games_html_per_game_id(game_id, year)


def populate_games_html_per_game_id(game_id, year):
    """Scrape one game and insert it as a row of the ``games_html`` table.

    The row is produced by ``get_row_to_games_html(game_id)`` and written to
    ``data/games_html_{year}.db`` using named SQL parameters.

    Args:
        game_id: Identifier of the game to scrape; also stored in the row.
        year: Selects which yearly database file receives the row.

    Raises:
        sqlite3.Error: If the insert fails (for example the table is missing
            or the ``game_id`` primary key is already present).
    """
    query = '''INSERT INTO games_html VALUES (
            :game_id,
            :game_summary,
            :game_statistics,
            :odds_comparison)
            '''
    con = sqlite3.connect(f'data/games_html_{year}.db')
    try:
        row = get_row_to_games_html(game_id)
        # `with con` only commits (or rolls back on error); it does not
        # close the connection, so the close happens in `finally`.
        with con:
            con.execute(query, row)
    finally:
        # Called once per game, so an unclosed connection here would pile up
        # over a whole season.
        con.close()
    

def get_row_to_games_html(game_id):
    """Download the HTML pages of one game and return them as a row dict.

    A Firefox session is started on ``flashscore.com/match/{game_id}/`` with
    the module-level ``options``. The page source after the initial wait is
    stored as ``game_summary``. For each optional tab, the tab is visited
    only if the summary page contains an ``<a>`` whose ``href`` equals the
    tab's fragment; otherwise that column stays ``None``.

    Args:
        game_id: Identifier placed in the match URL and in the row.

    Returns:
        dict with keys ``game_id``, ``game_summary``, ``game_statistics`` and
        ``odds_comparison``.
    """
    row = dict.fromkeys(
        ('game_id', 'game_summary', 'game_statistics', 'odds_comparison')
    )
    # (column, href fragment) pairs, visited in this order.
    optional_tabs = (
        ('odds_comparison', '#odds-comparison'),
        ('game_statistics', '#match-summary/match-statistics'),
    )
    url = f'flashscore.com/match/{game_id}/'
    with helium.start_firefox(url=url, options=options) as driver:
        # Fixed pause before reading the page source.
        time.sleep(2)
        row['game_id'] = game_id
        row['game_summary'] = driver.page_source
        summary_soup = bs4.BeautifulSoup(driver.page_source, 'html.parser')
        for column, href in optional_tabs:
            if not summary_soup.find_all('a', {'href': href}):
                continue
            helium.go_to(f'{url}{href}')
            # Pause for either 1 or 1.2 seconds, chosen at random.
            time.sleep(random.choice([1, 1.2]))
            row[column] = driver.page_source
    return row

In [111]:
def get_games_ids_per_year(year):
    """Return the game ids found on the NBA results page for one season.

    Opens ``flashscore.com/basketball/usa/nba-{year}-{year+1}/results/`` in
    Firefox (module-level ``options``), expands the page with
    ``scroll_down_page``, and extracts the ids with
    ``get_games_ids_from_soup``.

    Args:
        year: First calendar year of the season; ``year + 1`` is used as the
            second year in the URL.

    Returns:
        list of id strings, in the reverse of the order in which they appear
        in the page.
    """
    results_url = f'flashscore.com/basketball/usa/nba-{year}-{year+1}/results/'
    with helium.start_firefox(url=results_url, options=options) as browser:
        # Fixed pause before interacting with the page.
        time.sleep(2)
        scroll_down_page(browser)
        page = bs4.BeautifulSoup(browser.page_source, 'html.parser')
        page_order_ids = get_games_ids_from_soup(page)
    return list(reversed(page_order_ids))


def scroll_down_page(driver):
    """Keep expanding the results page until it cannot be expanded further.

    Each round waits two seconds, scrolls to the bottom of the document and
    clicks the element labelled 'Show more matches'. The loop stops at the
    first ``LookupError`` raised by any of those steps.

    Args:
        driver: Browser driver exposing ``execute_script``.
    """
    while True:
        try:
            time.sleep(2)
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            helium.click('Show more matches')
        except LookupError:
            # Presumably raised by helium once the button is no longer on
            # the page, i.e. everything has been loaded.
            return


def get_games_ids_from_soup(soup):
    """Extract game ids from a parsed results page.

    Every ``<div>`` matched by class ``event__match`` contributes the part
    of its ``id`` attribute that follows the last underscore.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        list of id strings in document order.
    """
    match_divs = soup.find_all('div', attrs={'class': 'event__match'})
    return [div['id'].rsplit('_', 1)[-1] for div in match_divs]

In [112]:
def create_games_html_per_year(year, remove_if_exists=False):
    """Create ``data/games_html_{year}.db`` with an empty ``games_html`` table.

    The table has a ``game_id`` text primary key plus three nullable text
    columns: ``game_summary``, ``game_statistics`` and ``odds_comparison``.

    Args:
        year: Used in the database file name.
        remove_if_exists: When true, an existing database file is deleted
            first so the table starts empty. When false, an existing file
            (and any rows already in it) is kept.
    """
    path = f'data/games_html_{year}.db'
    if remove_if_exists and os.path.exists(path):
        os.remove(path)
    # sqlite3 creates the file but not its parent directory.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # IF NOT EXISTS makes the call safe to repeat on a kept database instead
    # of failing with "table games_html already exists".
    query = '''CREATE TABLE IF NOT EXISTS games_html (
            game_id TEXT NOT NULL PRIMARY KEY,
            game_summary TEXT,
            game_statistics TEXT,
            odds_comparison TEXT)
            '''
    con = sqlite3.connect(path)
    try:
        # `with con` commits or rolls back but does not close the
        # connection, so the close happens in `finally`.
        with con:
            con.execute(query)
    finally:
        con.close()

In [70]:
# Start from a fresh data/games_html_2021.db (any existing file is deleted).
create_games_html_per_year(2021, remove_if_exists=True)

In [71]:
# Scrape every game listed for 2021, printing one progress line per game.
populate_games_html_per_year(2021, show_progress=True)

2021 fH6uCFJr 1/313
2021 2RsuJUh8 2/313
2021 xtdpBZYl 3/313
2021 MaMBO7Ug 4/313
2021 nLnRp3qd 5/313
2021 EZrjBOLs 6/313
2021 UPPJMoa6 7/313
2021 h40weI3I 8/313
2021 nHYbkaYA 9/313
2021 UuO3lJmH 10/313
2021 8WM7mw3N 11/313
2021 UqIEUCM2 12/313
2021 GdMBncIT 13/313
2021 E7CRd9xi 14/313
2021 jq0lAgle 15/313
2021 ClFOL5EC 16/313
2021 I58wrHQp 17/313
2021 GMLFNRq0 18/313
2021 KAnM603o 19/313
2021 YDjQ5KIi 20/313
2021 WfasfxJO 21/313
2021 tItqIl8E 22/313
2021 SzjU4vYc 23/313
2021 6RlY3bm4 24/313
2021 O2qw3I2A 25/313
2021 KEzO7kFk 26/313
2021 6ck317ET 27/313
2021 KjlMkvUo 28/313
2021 fH6uQVEK 29/313
2021 2grs2xIG 30/313
2021 p6ah9D31 31/313
2021 dGso1dXM 32/313
2021 d62KdtsJ 33/313
2021 8jok0GnT 34/313
2021 SUKpfxfr 35/313
2021 SAGVeTic 36/313
2021 zyt99nMd 37/313
2021 Odbd8XI7 38/313
2021 AiomH8NK 39/313
2021 Ol4XWGe8 40/313
2021 EyzS69Ue 41/313
2021 EXNHezep 42/313
2021 8Yya92jf 43/313
2021 YawQILeP 44/313
2021 hOMLffAj 45/313
2021 l0k6KqZP 46/313
2021 2u3yWzBE 47/313
2021 Gd8GGyX9 48/313
2

In [72]:
# Start from a fresh data/games_html_1993.db (any existing file is deleted).
create_games_html_per_year(1993, remove_if_exists=True)

In [73]:
# Scrape every game listed for 1993, printing one progress line per game.
populate_games_html_per_year(1993, show_progress=True)

1993 vDxeS9an 1/1184
1993 SxZjTkpt 2/1184
1993 SbWrLhhJ 3/1184
1993 l8XvMYxD 4/1184
1993 KYSzNEN6 5/1184
1993 bJvSOzgf 6/1184
1993 trUWNf80 7/1184
1993 pGzOPGwl 8/1184
1993 jgyKQdOs 9/1184
1993 lpkhVxGQ 10/1184
1993 8YXfWI0K 11/1184
1993 UsZjXboE 12/1184
1993 Gvo98F8D 13/1184
1993 0j9jHqio 14/1184
1993 GIXJM5TN 15/1184
1993 bsNOLPrU 16/1184
1993 2FTFNoEH 17/1184
1993 8hUBORbB 18/1184
1993 U1V7P7q5 19/1184
1993 CQz3QmUb 20/1184
1993 YZyaRTEh 21/1184
1993 rm4lxoTA 22/1184
1993 lQ4pwRD4 23/1184
1993 zN0tv7bb 24/1184
1993 hbByumqh 25/1184
1993 E7CXuTTo 26/1184
1993 I5d1ZrDT 27/1184
1993 vFcczPcN 28/1184
1993 Mw3hy5rH 29/1184
1993 Sb8xXA8S 30/1184
1993 Yy9YXjgM 31/1184
1993 U7f4SUGq 32/1184
1993 KY4UYWwG 33/1184
1993 xAFPZCO9 34/1184
1993 bJGLzg93 35/1184
1993 UqCHyZgc 36/1184
1993 jgDDxFvi 37/1184
1993 nLnXTMzi 38/1184
1993 zoqTU2Lp 39/1184
1993 xWj0nj8F 40/1184
1993 buhdmWh9 41/1184
1993 GG2ilCw3 42/1184
1993 2DbmkhOd 43/1184
1993 lAA5Q8od 44/1184
1993 AJB1RlWk 45/1184
1993 MylnwnV1 46/11

In [74]:
# Start from a fresh data/games_html_1994.db (any existing file is deleted).
create_games_html_per_year(1994, remove_if_exists=True)

In [75]:
# Scrape every game listed for 1994, printing one progress line per game.
populate_games_html_per_year(1994, show_progress=True)

1994 fL0qprha 1/1180
1994 QoduoOxg 2/1180
1994 necyn4Nn 3/1180
1994 4W7Wnp8t 4/1180
1994 SAG4iQaP 5/1180
1994 E7SMStNO 6/1180
1994 zHTITM7I 7/1180
1994 8fcMU2hC 8/1180
1994 U3bIVrw6 9/1180
1994 zsaEWONa 10/1180
1994 MweAX48g 11/1180
1994 vF55Ypgm 12/1180
1994 UwFRdLy5 13/1180
1994 nFPMc1Mb 14/1180
1994 8UDVeuiB 15/1180
1994 YBLIbs7h 16/1180
1994 vkMEaNin 17/1180
1994 0bNA03xt 18/1180
1994 0Oj4uK6U 19/1180
1994 MXi0t0iO 20/1180
1994 SAhdstxI 21/1180
1994 GzkhrMMB 22/1180
1994 YBamq275 23/1180
1994 GfDZfa6H 24/1180
1994 EVhAlcqo 25/1180
1994 xA1rgwyU 26/1180
1994 foCwfJLN 27/1180
1994 dEY89bLA 28/1180
1994 2iZ4Av64 29/1180
1994 O0z1BKjb 30/1180
1994 6PudC0yh 31/1180
1994 SYthDtMo 32/1180
1994 xMkuCis9 33/1180
1994 OIoyDXR2 34/1180
1994 MD7hllk2 35/1180
1994 WrkqBBdF 36/1180
1994 Em4pjAKk 37/1180
1994 zc5tij5q 38/1180
1994 faiC8IzH 39/1180
1994 rN8lkUZe 40/1180
1994 0KfK2m4e 41/1180
1994 nHjG3Tlk 42/1180
1994 hfiC49Zr 43/1180
1994 jo5p9kRR 44/1180
1994 AZmmAVBL 45/1180
1994 vBFgHnJl 46/11

In [9]:
# Start from a fresh data/games_html_2020.db (any existing file is deleted).
create_games_html_per_year(2020, remove_if_exists=True)

In [10]:
# Scrape every game listed for 2020, printing one progress line per game.
populate_games_html_per_year(2020, show_progress=True)

2020 dznuKnPh 1/1221
2020 IqmyLS9n 2/1221
2020 jcoqJ6va 3/1221
2020 2XlmIQg5 4/1221
2020 pdvhHp9B 5/1221
2020 je6Pbxgp 6/1221
2020 OnwdG4OH 7/1221
2020 0lb6m9k7 8/1221
2020 WjqCMuoi 9/1221
2020 Uo5Tcd8j 10/1221
2020 pfmGLa0c 11/1221
2020 vHnKKJF3 12/1221
2020 I7oOJwV9 13/1221
2020 bJ2LIcpG 14/1221
2020 xA1PHHaM 15/1221
2020 WW0TGyFS 16/1221
2020 jHs0BFxj 17/1221
2020 8hheCeNq 18/1221
2020 U7t4AZid 19/1221
2020 INZoZIUd 20/1221
2020 vucAnT4D 21/1221
2020 vwOfXda9 22/1221
2020 dEYkYxp3 23/1221
2020 xOtbWGEF 24/1221
2020 Wpt2VzUL 25/1221
2020 Ayu6UfqS 26/1221
2020 jVMYPEiq 27/1221
2020 ULLxPY6k 28/1221
2020 OrLtOhMe 29/1221
2020 vi3TodE2 30/1221
2020 xnPpNCy2 31/1221
2020 Q71ypzqF 32/1221
2020 Is2XpGT8 33/1221
2020 nNkprEDR 34/1221
2020 4b0uqfbL 35/1221
2020 OrsCwYLr 36/1221
2020 dduKyCje 37/1221
2020 I5tGxhyk 38/1221
2020 EXZOzW51 39/1221
2020 QsIpBCsE 40/1221
2020 6RMAbZc8 41/1221
2020 0QFJdDSK 42/1221
2020 I1QEcgDE 43/1221
2020 hOYSZkL7 44/1221
2020 vHENeXsR 45/1221
2020 W86kiijr 46/12

In [108]:
# Start from a fresh data/games_html_2019.db (any existing file is deleted).
create_games_html_per_year(2019, remove_if_exists=True)

In [109]:
# Scrape every game listed for 2019, printing one progress line per game.
populate_games_html_per_year(2019, show_progress=True)

2019 j7vaAjcp 1/1243
2019 6mG4LvOD 2/1243
2019 2FkOCnKD 3/1243
2019 lKFtcBAM 4/1243
2019 UVWwdVPS 5/1243
2019 0jg7GUKl 6/1243
2019 v7XFDW0k 7/1243
2019 WvJ5ikYq 8/1243
2019 YsnJCjGe 9/1243
2019 Q1I9j9mj 10/1243
2019 4K7yMjcR 11/1243
2019 v5MDkT2d 12/1243
2019 EDJCJIhQ 13/1243
2019 GYVLd9eB 14/1243
2019 CYpNBAV1 15/1243
2019 Ucw39ACj 16/1243
2019 tA0dECuc 17/1243
2019 K0a0DWf3 18/1243
2019 lz44CjA9 19/1243
2019 EwP5zf82 20/1243
2019 Maf3HA5r 21/1243
2019 SQ28BAPF 22/1243
2019 bcLQeTAH 23/1243
2019 StT9ZDN8 24/1243
2019 tKo2xuAd 25/1243
2019 AkhDAUvM 26/1243
2019 IVLMWB7R 27/1243
2019 0bunGp2C 28/1243
2019 M3SDYXxF 29/1243
2019 0fRHXihL 30/1243
2019 jH3XMWsL 31/1243
2019 8zBG9lfS 32/1243
2019 QFcr582q 33/1243
2019 IyVp1Onc 34/1243
2019 UXqjF4HI 35/1243
2019 46dn4SHk 36/1243
2019 0Yke26n2 37/1243
2019 Kv8lSVFr 38/1243
2019 Msii3nXe 39/1243
2019 EyvwRkP2 40/1243
2019 IRmCFlze 41/1243
2019 6BbiCzo0 42/1243
2019 2Ppv2chJ 43/1243
2019 E17hRkVl 44/1243
2019 SvrsQ9v9 45/1243
2019 p2ceBf06 46/12

In [113]:
# Create data/games_html_2018.db; remove_if_exists defaults to False, so an
# existing file is not deleted first.
create_games_html_per_year(2018)

In [114]:
# Scrape every game listed for 2018, printing one progress line per game.
populate_games_html_per_year(2018, show_progress=True)

2018 ShiEl0pH 1/1380
2018 by0qn1PB 2/1380
2018 tdeImKaN 3/1380
2018 OKpNnvFT 4/1380
2018 QTWlrbNp 5/1380
2018 EqQW6s0o 6/1380
2018 hzPz61Fi 7/1380
2018 YqXr4up4 8/1380
2018 zaEu5LUc 9/1380
2018 S6Wn3aaA 10/1380
2018 6cVj2JEG 11/1380
2018 OYPe1wUM 12/1380
2018 K4BReHip 13/1380
2018 2POa0cqT 14/1380
2018 dMInoLvI 15/1380
2018 Y7FVfy7j 16/1380
2018 zDgEJKqi 17/1380
2018 MTiMHbE3 18/1380
2018 E3hIIvac 19/1380
2018 0KjQGIT9 20/1380
2018 KMndNMlQ 21/1380
2018 rqtVFxqG 22/1380
2018 fL8bbADr 23/1380
2018 lr82cUSl 24/1380
2018 SY66dlre 25/1380
2018 O00FfSC7 26/1380
2018 pA1Be8c1 27/1380
2018 x4lcJBk8 28/1380
2018 lvh1IV4E 29/1380
2018 W2s6HkKK 30/1380
2018 AgtAG9ZQ 31/1380
2018 2oYyCTsr 32/1380
2018 txXuBmcl 33/1380
2018 O0MpA7Cf 34/1380
2018 x4Ql9RR0 35/1380
2018 WQPh8os7 36/1380
2018 AHOd75dD 37/1380
2018 dKqXVlZD 38/1380
2018 zBryV8lK 39/1380
2018 UXsuUS3Q 40/1380
2018 6gZ6PnCs 41/1380
2018 S8NFNQtf 42/1380
2018 lIOBO6Rm 43/1380
2018 drSJMpd0 44/1380
2018 zZQNL4B6 45/1380
2018 IcGSKOQC 46/13