In [1]:
import bs4
import helium
import selenium
import sqlite3
import time

# Shared Firefox options passed to the `helium.start_firefox` calls in this
# notebook: a 1024x768 window, run headless.
# NOTE(review): the import cell only does `import selenium`; `selenium.webdriver`
# resolving here presumably depends on another import having already loaded
# that submodule — confirm, or import it explicitly.
options = selenium.webdriver.FirefoxOptions()
options.add_argument("--width=1024")
options.add_argument("--height=768")
options.add_argument("-headless")

In [2]:
def create_db_sources(year):
    """Create the SQLite database that stores raw page sources for one season.

    The database file is ``data/sources/sources_{year}.db``. It receives a
    single ``sources`` table keyed by ``id`` with three nullable TEXT columns
    (``summary_source``, ``stats_source``, ``odds_source``).

    Calling this again on an existing database is a no-op: the table is only
    created when it is missing, so re-running the cell does not fail.

    Args:
        year: Value interpolated into the database file name.

    Raises:
        sqlite3.OperationalError: If the database file cannot be opened
            (for example when the ``data/sources/`` directory is missing).
    """
    query = '''CREATE TABLE IF NOT EXISTS sources (
        id TEXT NOT NULL PRIMARY KEY,
        summary_source TEXT,
        stats_source TEXT,
        odds_source TEXT)
        '''
    con = sqlite3.connect(f'data/sources/sources_{year}.db')
    try:
        # `with con` only commits / rolls back the transaction; it does not
        # close the connection, hence the explicit close() below.
        with con:
            con.execute(query)
    finally:
        con.close()

In [3]:
def populate_db_sources(year):
    """Collect the match ids for ``year`` and store each match's sources.

    Thin orchestration step: the ids come from ``get_ids_db_sources`` and are
    handed, together with ``year``, to ``populate_db_sources_per_ids``.

    Args:
        year: Passed unchanged to both helpers.
    """
    populate_db_sources_per_ids(get_ids_db_sources(year), year)

In [4]:
def get_ids_db_sources(year):
    """Return the match ids listed on the results page for one season.

    Opens the ``nba-{year}-{year+1}`` results page in a Firefox session
    configured with the module-level ``options``, lets ``get_source`` expand
    the page, and parses the ids out of the resulting HTML once the browser
    session has been left.

    Args:
        year: First year of the season slug; must support ``year + 1``.

    Returns:
        The list produced by ``get_ids`` for the fully expanded page.
    """
    results_url = f'flashscore.com/basketball/usa/nba-{year}-{year+1}/results/'
    with helium.start_firefox(options=options) as browser:
        helium.go_to(results_url)
        # Fixed pause before interacting with the page.
        time.sleep(2)
        page_html = get_source(browser)
    return get_ids(page_html)


def get_source(driver):
    """Expand the results list completely and return the page HTML.

    Repeatedly scrolls to the bottom of the page and clicks the
    'Show more matches' link, pausing two seconds after each click. The loop
    stops at the first ``LookupError`` raised inside it (presumably helium
    reporting that the link is no longer present — confirm against helium).

    Args:
        driver: Browser driver exposing ``execute_script`` and ``page_source``.

    Returns:
        ``driver.page_source`` as read after the loop has ended.
    """
    while True:
        try:
            driver.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            helium.click('Show more matches')
            time.sleep(2)
        except LookupError:
            break
    return driver.page_source


def get_ids(source):
    """Extract match ids from the HTML of a results page.

    Every ``div`` carrying the ``event__match`` class contributes one id: the
    part of its ``id`` attribute that follows the last underscore.

    Args:
        source: HTML text to parse.

    Returns:
        List of id strings, in document order.

    Raises:
        KeyError: If a matching ``div`` has no ``id`` attribute.
    """
    parsed = bs4.BeautifulSoup(source, 'html.parser')
    match_divs = parsed.find_all('div', {'class': 'event__match'})
    return [div['id'].split('_')[-1] for div in match_divs]

In [5]:
def populate_db_sources_per_ids(ids, year):
    """Scrape each match in ``ids`` and insert its page sources into SQLite.

    One Firefox session (configured with the module-level ``options``) and
    one SQLite connection are shared by the whole run. Each match is inserted
    and committed in its own transaction, so rows already written survive a
    later failure. A progress line ``"<id> <n>/<total>"`` is printed per match.

    Args:
        ids: Sequence of match ids (``len()`` is taken for the progress line).
        year: Selects the database file ``data/sources/sources_{year}.db``.

    Raises:
        sqlite3.OperationalError: If the database cannot be opened or the
            ``sources`` table does not exist.
        sqlite3.IntegrityError: If an id is already present in the table.
    """
    query = '''INSERT INTO sources VALUES (
        :id,
        :summary_source,
        :stats_source,
        :odds_source)
        '''
    total = len(ids)
    with helium.start_firefox(options=options) as driver:
        con = sqlite3.connect(f'data/sources/sources_{year}.db')
        try:
            for position, id_ in enumerate(ids, start=1):
                print(f'{id_} {position}/{total}')
                values = get_values_db_sources(id_, driver)
                values['id'] = id_
                # `with con` commits this row (or rolls it back on error);
                # it does not close the connection.
                with con:
                    con.execute(query, values)
        finally:
            # Previously a new connection was opened per match and none of
            # them was ever closed explicitly.
            con.close()

In [14]:
def _load_detail_container(driver, page_url, delay):
    """Open ``page_url``, wait ``delay`` seconds, and return the page's
    ``div.container__detail`` element, or ``None`` when it is absent."""
    helium.go_to(page_url)
    time.sleep(delay)
    page = bs4.BeautifulSoup(driver.page_source, 'html.parser')
    return page.find('div', {'class': 'container__detail'})


def get_values_db_sources(id_, driver):
    """Collect the summary, statistics and odds markup for one match.

    The summary page is always visited. The statistics and odds pages are
    visited only when the most recently captured detail container holds a
    link whose ``href`` is that page's fragment. For every visited page the
    ``div.container__detail`` element is stored as a string.

    A page whose detail container cannot be found leaves its value as
    ``None`` instead of storing the text ``'None'``; if the summary container
    is missing, no further pages are visited (previously this case raised
    ``AttributeError``).

    Args:
        id_: Match id interpolated into ``flashscore.com/match/{id_}/``.
        driver: Browser driver whose ``page_source`` is read after each
            navigation.

    Returns:
        Dict with keys ``summary_source``, ``stats_source`` and
        ``odds_source``; each value is an HTML string or ``None``.
    """
    values = {
        'summary_source': None,
        'stats_source': None,
        'odds_source': None,
    }
    url = f'flashscore.com/match/{id_}/'

    detail = _load_detail_container(driver, f'{url}#match-summary', 3)
    if detail is None:
        # Nothing to store and nowhere to look for the tab links.
        return values
    values['summary_source'] = str(detail)

    optional_pages = (
        ('stats_source', '#match-summary/match-statistics'),
        ('odds_source', '#odds-comparison'),
    )
    for key, href in optional_pages:
        # Only follow a tab that the last captured container links to.
        if not detail.find_all('a', {'href': href}):
            continue
        tab_detail = _load_detail_container(driver, f'{url}{href}', 2)
        if tab_detail is None:
            continue
        values[key] = str(tab_detail)
        # As before, the next tab check runs against the newest container.
        detail = tab_detail

    return values

In [23]:
# Build and fill the sources database for the value 1995.
# populate_db_sources drives a browser through every collected match id and
# prints one progress line per match, so this cell is long-running.
create_db_sources(1995)
populate_db_sources(1995)

IXOqEXE1 1/1257
OdAtFDae 2/1257
p6BxGgpk 3/1257
2sMXGZUr 4/1257
j5g4Y10G 5/1257
drf0ZsoA 6/1257
IiedzOV3 7/1257
8MyENpnb 8/1257
EaxAOQWi 9/1257
z9w6P6Ho 10/1257
zHXU0Nvo 11/1257
Y7o353WT 12/1257
vona6qHN 13/1257
Qsre7P1H 14/1257
rJhj85nB 15/1257
4Sgn9oX4 16/1257
zgkrARHb 17/1257
d2jvB72h 18/1257
Ew1ZBmmn 19/1257
vX6Qip2U 20/1257
GhrWCTXu 21/1257
0t8MhQnO 22/1257
dlB5d8ma 23/1257
AiF9eS25 24/1257
4EEDfnIB 25/1257
r5DHg6XH 26/1257
EPL0clYh 27/1257
zZMdbUIn 28/1257
QayR3jeP 29/1257
S8NhaA3t 30/1257
Ofcru7IO 31/1257
Aigvtm3I 32/1257
xveWs9Y5 33/1257
GUgzsTmC 34/1257
K20RrkJa 35/1257
tC1NqV3g 36/1257
nk2JpBlm 37/1257
zFbFoiZt 38/1257
fHpo3J4E 39/1257
GQos4ak8 40/1257
6e4jsxre 41/1257
lKgetdc1 42/1257
SAhauGC7 43/1257
dMEGmv5R 44/1257
UarKlKkL 45/1257
hh0nrISl 46/1257
bV0rqbDr 47/1257
tv2PSxT1 48/1257
rqt8iM52 49/1257
dnpCjtL8 50/1257
YDbHUbbk 51/1257
2m3LTIDe 52/1257
rqKxxs6F 53/1257
fNaDVvqq 54/1257
x6Jty1LL 55/1257
ltNpzLzS 56/1257
fNwrGaqd 57/1257
8QRPvqMd 58/1257
nHQTw3y3 59/1257
4hLYxN