In [2]:
import bs4
import helium
import os
import random
import selenium
import sqlite3
import time

In [3]:
# Firefox launch options passed to the helium.start_firefox() calls below:
# a fixed window size and headless mode.
options = selenium.webdriver.FirefoxOptions()
for _firefox_flag in ('--width=1024', '--height=768', '-headless'):
    options.add_argument(_firefox_flag)

In [4]:
def populate_games_html_per_year(year, show_progress=False):
    """Scrape every game of one season and store each game's pages.

    Args:
        year: Season identifier; forwarded unchanged to
            ``get_games_ids_per_year`` and ``populate_games_html_per_game_id``.
        show_progress: When true, print ``<year> <game_id> <n>/<total>``
            before each game is processed.
    """
    season_game_ids = get_games_ids_per_year(year)
    total = len(season_game_ids)
    for position, game_id in enumerate(season_game_ids, start=1):
        if show_progress:
            print(f'{year} {game_id} {position}/{total}')
        populate_games_html_per_game_id(game_id, year)


def populate_games_html_per_game_id(game_id, year):
    """Scrape one game and insert its page sources into ``data/games_html_<year>.db``.

    The pages are downloaded before the database is opened, so no connection
    is held while the browser is working. The connection is always closed
    afterwards: ``with connection`` only commits or rolls back the
    transaction, it does not close the connection.

    Args:
        game_id: Game identifier, passed to ``get_row_to_games_html``.
        year: Selects the database file ``data/games_html_<year>.db``.

    Raises:
        sqlite3.Error: If the insert fails, e.g. the ``games_html`` table is
            missing or a constraint on it is violated.
    """
    # Columns are named explicitly so the insert does not depend on the
    # order in which the table's columns were declared.
    query = '''INSERT INTO games_html (
            game_id,
            game_summary,
            player_statistics,
            game_statistics,
            lineups,
            odds_comparison)
            VALUES (
            :game_id,
            :game_summary,
            :player_statistics,
            :game_statistics,
            :lineups,
            :odds_comparison)
            '''
    row = get_row_to_games_html(game_id)
    con = sqlite3.connect(f'data/games_html_{year}.db')
    try:
        with con:  # commit on success, roll back on error
            con.execute(query, row)
    finally:
        con.close()
    

def get_row_to_games_html(game_id):
    """Download the pages of one match and return them as a row dictionary.

    A Firefox session is opened on the match page; its source is stored under
    ``game_summary``. For each tab whose link is present on that first page,
    the tab is visited and its page source stored under the matching key.

    Args:
        game_id: Identifier placed in the ``flashscore.com/match/<id>/`` URL.

    Returns:
        dict: Keys ``game_id``, ``game_summary``, ``player_statistics``,
        ``game_statistics``, ``lineups`` and ``odds_comparison``. Tabs whose
        link was not found keep the value ``None``.
    """
    # (row key, URL fragment) pairs, listed in the order they are visited.
    tabs = (
        ('odds_comparison', '#odds-comparison'),
        ('lineups', '#match-summary/lineups'),
        ('game_statistics', '#match-summary/match-statistics'),
        ('player_statistics', '#match-summary/player-statistics'),
    )
    row = dict.fromkeys((
        'game_id',
        'game_summary',
        'player_statistics',
        'game_statistics',
        'lineups',
        'odds_comparison',
    ))
    url = f'flashscore.com/match/{game_id}/'
    with helium.start_firefox(url=url, options=options) as driver:
        # Random 1-2 second pause before reading the page.
        time.sleep(random.randint(1, 2))
        row['game_id'] = game_id
        row['game_summary'] = driver.page_source
        soup = bs4.BeautifulSoup(driver.page_source, 'html.parser')
        for key, href in tabs:
            # Only visit tabs that the first page actually links to.
            if not soup.find_all('a', {'href': href}):
                continue
            helium.go_to(f'{url}{href}')
            time.sleep(random.randint(1, 2))
            row[key] = driver.page_source
    return row

In [5]:
def get_games_ids_per_year(year):
    """Collect the game ids listed on the results page of one NBA season.

    Opens ``flashscore.com/basketball/usa/nba-<year>-<year+1>/results/``,
    expands the page with ``scroll_down_page`` and extracts the ids with
    ``get_games_ids_from_soup``.

    Args:
        year: First year of the season as an integer (``year + 1`` is
            computed for the URL).

    Returns:
        list: Game ids in the reverse of the order they appear on the page
        (presumably oldest game first - confirm against the site).
    """
    results_url = f'flashscore.com/basketball/usa/nba-{year}-{year+1}/results/'
    with helium.start_firefox(url=results_url, options=options) as driver:
        time.sleep(2)
        scroll_down_page(driver)
        page = bs4.BeautifulSoup(driver.page_source, 'html.parser')
        ids_in_page_order = get_games_ids_from_soup(page)
        ids_reversed = list(reversed(ids_in_page_order))
    return ids_reversed


def scroll_down_page(driver):
    """Keep expanding the results page until no "Show more matches" is found.

    Each round waits two seconds, scrolls to the bottom of the document and
    clicks "Show more matches". The loop ends when a ``LookupError`` is
    raised inside the round, which is taken to mean there is nothing left
    to load.

    Args:
        driver: Browser driver whose ``execute_script`` is used to scroll.
    """
    while True:
        try:
            time.sleep(2)
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            helium.click('Show more matches')
        except LookupError:
            break


def get_games_ids_from_soup(soup):
    """Extract game ids from the ``div.event__match`` elements of a parsed page.

    Args:
        soup: Parsed page offering ``find_all``; every matched element must
            expose an ``id`` attribute through item access.

    Returns:
        list: For each matched element, the part of its ``id`` after the
        last underscore, in document order.
    """
    match_divs = soup.find_all('div', attrs={'class': 'event__match'})
    return [div['id'].split('_')[-1] for div in match_divs]

In [63]:
# Quick check: display the game ids scraped for the season passed as 2021.
get_games_ids_per_year(2021)

['fH6uCFJr',
 '2RsuJUh8',
 'xtdpBZYl',
 'MaMBO7Ug',
 'nLnRp3qd',
 'EZrjBOLs',
 'UPPJMoa6',
 'h40weI3I',
 'nHYbkaYA',
 'UuO3lJmH',
 '8WM7mw3N',
 'UqIEUCM2',
 'GdMBncIT',
 'E7CRd9xi',
 'jq0lAgle',
 'ClFOL5EC',
 'I58wrHQp',
 'GMLFNRq0',
 'KAnM603o',
 'YDjQ5KIi',
 'WfasfxJO',
 'tItqIl8E',
 'SzjU4vYc',
 '6RlY3bm4',
 'O2qw3I2A',
 'KEzO7kFk',
 '6ck317ET',
 'KjlMkvUo',
 'fH6uQVEK',
 '2grs2xIG',
 'p6ah9D31',
 'dGso1dXM',
 'd62KdtsJ',
 '8jok0GnT',
 'SUKpfxfr',
 'SAGVeTic',
 'zyt99nMd',
 'Odbd8XI7',
 'AiomH8NK',
 'Ol4XWGe8',
 'EyzS69Ue',
 'EXNHezep',
 '8Yya92jf',
 'YawQILeP',
 'hOMLffAj',
 'l0k6KqZP',
 '2u3yWzBE',
 'Gd8GGyX9',
 'KUE7Ic2c',
 'tYOlgd9l',
 'tvG3Jwmi',
 '2g4CHHH3',
 'ERvD86y3',
 '8zHITWy9',
 'AJ7KFenG',
 'SvvW5Tq2',
 'djUH7Qj9',
 'd22uVfQK',
 'A56qUEuR',
 'E51pcmWl',
 'nqVmbTGr',
 'nJaheR01',
 'xALMSjiF',
 '0OEwf7M9',
 'KQUcopfc',
 '0M3dfoG7',
 'Q50XZJVr',
 '8UkxZwok',
 'n3L1lAZh',
 'nqcKyXkm',
 't2SsZXUs',
 'MXgtYc0e',
 'row7tVfb',
 '8Awwkdrf',
 'WUqKJhVf',
 'dUjBvf7K',
 'AagmM09q',

In [6]:
def create_games_html_per_year(year, remove_if_exists=False):
    """Create ``data/games_html_<year>.db`` with an empty ``games_html`` table.

    Args:
        year: Used to build the database file name.
        remove_if_exists: When true, an existing database file is deleted
            first, so the table starts empty. When false, an existing file
            and its rows are kept, and the table is only created if it is
            missing.

    The ``data`` directory is created when it does not exist, and the
    connection is closed before returning (``with connection`` alone only
    commits or rolls back).
    """
    path = f'data/games_html_{year}.db'
    if remove_if_exists and os.path.exists(path):
        os.remove(path)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # IF NOT EXISTS keeps the call idempotent when the file is being reused.
    query = '''CREATE TABLE IF NOT EXISTS games_html (
            game_id TEXT NOT NULL PRIMARY KEY,
            game_summary TEXT,
            player_statistics TEXT,
            game_statistics TEXT,
            lineups TEXT,
            odds_comparison TEXT)
            '''
    con = sqlite3.connect(path)
    try:
        with con:  # commit on success, roll back on error
            con.execute(query)
    finally:
        con.close()

In [70]:
# Start from an empty 2021 database; any existing file is deleted first.
create_games_html_per_year(2021, remove_if_exists=True)

In [71]:
# Scrape every 2021 game into the database, printing one progress line per game.
populate_games_html_per_year(2021, show_progress=True)

2021 fH6uCFJr 1/313
2021 2RsuJUh8 2/313
2021 xtdpBZYl 3/313
2021 MaMBO7Ug 4/313
2021 nLnRp3qd 5/313
2021 EZrjBOLs 6/313
2021 UPPJMoa6 7/313
2021 h40weI3I 8/313
2021 nHYbkaYA 9/313
2021 UuO3lJmH 10/313
2021 8WM7mw3N 11/313
2021 UqIEUCM2 12/313
2021 GdMBncIT 13/313
2021 E7CRd9xi 14/313
2021 jq0lAgle 15/313
2021 ClFOL5EC 16/313
2021 I58wrHQp 17/313
2021 GMLFNRq0 18/313
2021 KAnM603o 19/313
2021 YDjQ5KIi 20/313
2021 WfasfxJO 21/313
2021 tItqIl8E 22/313
2021 SzjU4vYc 23/313
2021 6RlY3bm4 24/313
2021 O2qw3I2A 25/313
2021 KEzO7kFk 26/313
2021 6ck317ET 27/313
2021 KjlMkvUo 28/313
2021 fH6uQVEK 29/313
2021 2grs2xIG 30/313
2021 p6ah9D31 31/313
2021 dGso1dXM 32/313
2021 d62KdtsJ 33/313
2021 8jok0GnT 34/313
2021 SUKpfxfr 35/313
2021 SAGVeTic 36/313
2021 zyt99nMd 37/313
2021 Odbd8XI7 38/313
2021 AiomH8NK 39/313
2021 Ol4XWGe8 40/313
2021 EyzS69Ue 41/313
2021 EXNHezep 42/313
2021 8Yya92jf 43/313
2021 YawQILeP 44/313
2021 hOMLffAj 45/313
2021 l0k6KqZP 46/313
2021 2u3yWzBE 47/313
2021 Gd8GGyX9 48/313
2

In [72]:
# Start from an empty 1993 database; any existing file is deleted first.
create_games_html_per_year(1993, remove_if_exists=True)

In [73]:
# Scrape every 1993 game into the database, printing one progress line per game.
populate_games_html_per_year(1993, show_progress=True)

1993 vDxeS9an 1/1184
1993 SxZjTkpt 2/1184
1993 SbWrLhhJ 3/1184
1993 l8XvMYxD 4/1184
1993 KYSzNEN6 5/1184
1993 bJvSOzgf 6/1184
1993 trUWNf80 7/1184
1993 pGzOPGwl 8/1184
1993 jgyKQdOs 9/1184
1993 lpkhVxGQ 10/1184
1993 8YXfWI0K 11/1184
1993 UsZjXboE 12/1184
1993 Gvo98F8D 13/1184
1993 0j9jHqio 14/1184
1993 GIXJM5TN 15/1184
1993 bsNOLPrU 16/1184
1993 2FTFNoEH 17/1184
1993 8hUBORbB 18/1184
1993 U1V7P7q5 19/1184
1993 CQz3QmUb 20/1184
1993 YZyaRTEh 21/1184
1993 rm4lxoTA 22/1184
1993 lQ4pwRD4 23/1184
1993 zN0tv7bb 24/1184
1993 hbByumqh 25/1184
1993 E7CXuTTo 26/1184
1993 I5d1ZrDT 27/1184
1993 vFcczPcN 28/1184
1993 Mw3hy5rH 29/1184
1993 Sb8xXA8S 30/1184
1993 Yy9YXjgM 31/1184
1993 U7f4SUGq 32/1184
1993 KY4UYWwG 33/1184
1993 xAFPZCO9 34/1184
1993 bJGLzg93 35/1184
1993 UqCHyZgc 36/1184
1993 jgDDxFvi 37/1184
1993 nLnXTMzi 38/1184
1993 zoqTU2Lp 39/1184
1993 xWj0nj8F 40/1184
1993 buhdmWh9 41/1184
1993 GG2ilCw3 42/1184
1993 2DbmkhOd 43/1184
1993 lAA5Q8od 44/1184
1993 AJB1RlWk 45/1184
1993 MylnwnV1 46/11

In [74]:
# Start from an empty 1994 database; any existing file is deleted first.
create_games_html_per_year(1994, remove_if_exists=True)

In [75]:
# Scrape every 1994 game into the database, printing one progress line per game.
populate_games_html_per_year(1994, show_progress=True)

1994 fL0qprha 1/1180
1994 QoduoOxg 2/1180
1994 necyn4Nn 3/1180
1994 4W7Wnp8t 4/1180
1994 SAG4iQaP 5/1180
1994 E7SMStNO 6/1180
1994 zHTITM7I 7/1180
1994 8fcMU2hC 8/1180
1994 U3bIVrw6 9/1180
1994 zsaEWONa 10/1180
1994 MweAX48g 11/1180
1994 vF55Ypgm 12/1180
1994 UwFRdLy5 13/1180
1994 nFPMc1Mb 14/1180
1994 8UDVeuiB 15/1180
1994 YBLIbs7h 16/1180
1994 vkMEaNin 17/1180
1994 0bNA03xt 18/1180
1994 0Oj4uK6U 19/1180
1994 MXi0t0iO 20/1180
1994 SAhdstxI 21/1180
1994 GzkhrMMB 22/1180
1994 YBamq275 23/1180
1994 GfDZfa6H 24/1180
1994 EVhAlcqo 25/1180
1994 xA1rgwyU 26/1180
1994 foCwfJLN 27/1180
1994 dEY89bLA 28/1180
1994 2iZ4Av64 29/1180
1994 O0z1BKjb 30/1180
1994 6PudC0yh 31/1180
1994 SYthDtMo 32/1180
1994 xMkuCis9 33/1180
1994 OIoyDXR2 34/1180
1994 MD7hllk2 35/1180
1994 WrkqBBdF 36/1180
1994 Em4pjAKk 37/1180
1994 zc5tij5q 38/1180
1994 faiC8IzH 39/1180
1994 rN8lkUZe 40/1180
1994 0KfK2m4e 41/1180
1994 nHjG3Tlk 42/1180
1994 hfiC49Zr 43/1180
1994 jo5p9kRR 44/1180
1994 AZmmAVBL 45/1180
1994 vBFgHnJl 46/11

In [7]:
# Start from an empty 2020 database; any existing file is deleted first.
create_games_html_per_year(2020, remove_if_exists=True)

In [None]:
# Scrape every 2020 game into the database, printing one progress line per game.
# NOTE(review): the output displayed under this cell lists 1994 games, so it is
# left over from an earlier run - re-run this cell or clear its output.
populate_games_html_per_year(2020, show_progress=True)

1994 fL0qprha 1/1180
1994 QoduoOxg 2/1180
1994 necyn4Nn 3/1180
1994 4W7Wnp8t 4/1180
1994 SAG4iQaP 5/1180
1994 E7SMStNO 6/1180
1994 zHTITM7I 7/1180
1994 8fcMU2hC 8/1180
1994 U3bIVrw6 9/1180
1994 zsaEWONa 10/1180
1994 MweAX48g 11/1180
1994 vF55Ypgm 12/1180
1994 UwFRdLy5 13/1180
1994 nFPMc1Mb 14/1180
1994 8UDVeuiB 15/1180
1994 YBLIbs7h 16/1180
1994 vkMEaNin 17/1180
1994 0bNA03xt 18/1180
1994 0Oj4uK6U 19/1180
1994 MXi0t0iO 20/1180
1994 SAhdstxI 21/1180
1994 GzkhrMMB 22/1180
1994 YBamq275 23/1180
1994 GfDZfa6H 24/1180
1994 EVhAlcqo 25/1180
1994 xA1rgwyU 26/1180
1994 foCwfJLN 27/1180
1994 dEY89bLA 28/1180
1994 2iZ4Av64 29/1180
1994 O0z1BKjb 30/1180
1994 6PudC0yh 31/1180
1994 SYthDtMo 32/1180
1994 xMkuCis9 33/1180
1994 OIoyDXR2 34/1180
1994 MD7hllk2 35/1180
1994 WrkqBBdF 36/1180
1994 Em4pjAKk 37/1180
1994 zc5tij5q 38/1180
1994 faiC8IzH 39/1180
1994 rN8lkUZe 40/1180
1994 0KfK2m4e 41/1180
1994 nHjG3Tlk 42/1180
1994 hfiC49Zr 43/1180
1994 jo5p9kRR 44/1180
1994 AZmmAVBL 45/1180
1994 vBFgHnJl 46/11

In [16]:
# Build `dados`: game_id -> summary data parsed from the stored summary page.
# NOTE(review): this reads 'data/games-source-2021.db' / table `games_source`,
# not the `games_html` databases created elsewhere in this notebook - confirm
# that file still exists.
dados = {}

# Use the `sqlite3` module imported at the top of the notebook (no `db` alias is
# defined there) and close the connection explicitly: `with con` would only
# commit or roll back.
con = sqlite3.connect('data/games-source-2021.db')
try:
    query = 'SELECT game_id, game_summary, game_statistics FROM games_source'
    cur = con.cursor()
    cur.execute(query)
    # The loop variables stay defined after the loop; the exploratory cells
    # further down read `game_summary` without defining it themselves.
    for game_id, game_summary, game_statistics in cur.fetchall():
        dados[game_id] = get_summary_data(game_summary)
finally:
    con.close()

In [10]:
def get_summary_data(game_summary_source):
    """Extract start time, participant names and score from a summary page.

    Args:
        game_summary_source: HTML source of a game summary page.

    Returns:
        list: ``[start_time_text, [participant_name, ...], score_text]``.

    Raises:
        AttributeError: If the start-time or score element is not present
            in the page (``find`` returns ``None`` in that case).
    """
    # `bs4` is what the notebook's import cell provides; the bare name
    # `BeautifulSoup` is not imported there.
    soup = bs4.BeautifulSoup(game_summary_source, 'html.parser')
    start_time = soup.find('div', {'class': 'duelParticipant__startTime'}).text
    participants = [
        tag.text
        for tag in soup.find_all('div', {'class': 'participant__participantName'})
    ]
    score = soup.find('div', {'class': 'detailScore__wrapper'}).text
    return [start_time, participants, score]

In [17]:
# Print the parsed summary of every game, one list per line.
for value in dados.values():
    print(value)

['03.10.2021 16:30', ['Los Angeles Lakers', 'Brooklyn Nets'], '97-123']
['04.10.2021 20:00', ['Toronto Raptors', 'Philadelphia 76ers'], '123-107']
['04.10.2021 20:30', ['Miami Heat', 'Atlanta Hawks'], '125-99']
['04.10.2021 20:30', ['Boston Celtics', 'Orlando Magic'], '98-97']
['04.10.2021 21:00', ['Oklahoma City Thunder', 'Charlotte Hornets'], '97-113']
['04.10.2021 21:00', ['Minnesota Timberwolves', 'New Orleans Pelicans'], '117-114']
['04.10.2021 21:30', ['San Antonio Spurs', 'Utah Jazz'], '111-85']
['04.10.2021 23:00', ['Sacramento Kings', 'Phoenix Suns'], '117-106']
['04.10.2021 23:00', ['Portland Trail Blazers', 'Golden State Warriors'], '107-121']
['04.10.2021 23:30', ['Los Angeles Clippers', 'Denver Nuggets'], '103-102']
['05.10.2021 20:30', ['New York Knicks', 'Indiana Pacers'], '125-104']
['05.10.2021 21:00', ['Memphis Grizzlies', 'Milwaukee Bucks'], '87-77']
['05.10.2021 21:00', ['Houston Rockets', 'Washington Wizards'], '125-119']
['05.10.2021 21:00', ['Chicago Bulls', 'Cle

In [245]:
# Exploratory parse of one game summary page; prints each extracted field.
# NOTE(review): `game_summary` is not defined in this cell - presumably it is the
# value left over from the loop that fills `dados`; confirm on a fresh kernel.
soup = bs4.BeautifulSoup(game_summary, 'html.parser')
fase = soup.find('a', {'href': '/basketball/usa/nba/'}).text
print('fase: '.rjust(15), fase)

data = soup.find('div', {'class': 'duelParticipant__startTime'}).text
print('data: '.rjust(15), data)

times = [t.text for t in soup.find_all('div', {'class': 'participant__participantName'})]
print('times: '.rjust(15), times)

pontos = soup.find('div', {'class': 'detailScore__wrapper'}).text
print('pontos: '.rjust(15), pontos)

# Optional elements: print None when the element is absent instead of hiding
# every possible error behind a bare `except:`.
status_tag = soup.find('div', {'class': 'detailScore__status'})
status = status_tag.text if status_tag is not None else None
print('status: '.rjust(15), status)

obs_tag = soup.find('div', {'class': 'infoBox__info'})
obs = obs_tag.text if obs_tag is not None else None
print('obs.: '.rjust(15), obs)

stats_home = [s.text for s in soup.find_all('div', {'class': 'statHomeValue'})]
print('stats casa: '.rjust(15), stats_home)

stats_away = [s.text for s in soup.find_all('div', {'class': 'statAwayValue'})]
print('stats fora: '.rjust(15), stats_away)

# None when the logo image, or its title attribute, is missing.
logo_tag = soup.find('img', {'class': 'prematchLogo'})
casa_de_aposta = logo_tag.get('title') if logo_tag is not None else None
print('odds casa: '.rjust(15), casa_de_aposta)

# Prefer the `title` of the cellWrapper divs; fall back to the oddsValue spans
# when there are none or the first title is empty or missing.
odds = [o.get('title', '') for o in soup.find_all('div', attrs={'class': 'cellWrapper'})]
if not odds or odds[0] == '':
    odds = [o.text for o in soup.find_all('span', {'class': 'oddsValue'})]
print('odds: '.rjust(15), odds)

print()

         fase:  NBA
         data:  30.10.2021 18:00
        times:  ['Washington Wizards', 'Boston Celtics']
       pontos:  115-112
       status:  After Overtime
         obs.:  2 extra times played.
   stats casa:  []
   stats fora:  []
    odds casa:  bet365
         odds:  ['1.71[u]1.76', '2.20[d]2.10']



home_team

away_team

In [232]:
def create_db_games_per_year(year):
    """Create a fresh ``data/games-<year>.db`` containing an empty ``games`` table.

    An existing file at that path is always deleted first. The ``data``
    directory is created when missing, and the connection is closed before
    returning (``with connection`` alone only commits or rolls back).

    Args:
        year: Used to build the database file name.
    """
    path = f'data/games-{year}.db'

    if os.path.exists(path):
        os.remove(path)
    os.makedirs(os.path.dirname(path), exist_ok=True)

    query = '''CREATE TABLE games (
            game_id TEXT NOT NULL PRIMARY KEY,
            game_summary TEXT,
            player_statistics TEXT,
            game_statistics TEXT,
            lineups TEXT,
            odds_comparison TEXT)
            '''
    # `sqlite3` is the module imported at the top of the notebook; no `db`
    # alias is defined there.
    con = sqlite3.connect(path)
    try:
        with con:  # commit on success, roll back on error
            con.execute(query)
    finally:
        con.close()

In [260]:
# Exploratory cell: print the match-info line and the per-team score cells.
# NOTE(review): `game_summary` is not defined in this cell - presumably left over
# from the loop that fills `dados`; confirm on a fresh kernel.
# `bs4` is what the notebook's import cell provides; the bare name
# `BeautifulSoup` is not imported there.
soup = bs4.BeautifulSoup(game_summary, 'html.parser')

print(soup.find('div', class_="mi__data").text)

# The first two smh__home / smh__away elements are skipped; what they hold is
# not visible here - presumably header or team-name cells, confirm.
print([t.get_text() for t in soup.find_all('div', class_="smh__home")[2:]])
print([t.get_text() for t in soup.find_all('div', class_="smh__away")[2:]])

Attendance: 15 813, Venue: Capital One Arena (Washington)
['115', '25', '28', '23', '27', '12']
['112', '20', '27', '26', '30', '9']
