In [1]:
import sqlite3 as db
from contextlib import closing
from pathlib import Path
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from helium import click, go_to, start_firefox

In [2]:
def scroll_page(driver, sleep_time=2, button='Show more matches'):
    """Wait, jump to the bottom of the page, then click ``button``.

    Args:
        driver: Browser driver whose ``execute_script`` runs the scroll.
        sleep_time: Seconds to pause before scrolling.
        button: Label passed to helium's ``click``; whatever ``click`` raises
            when the label cannot be found propagates to the caller.
    """
    scroll_to_bottom = 'window.scrollTo(0, document.body.scrollHeight);'

    sleep(sleep_time)
    driver.execute_script(scroll_to_bottom)
    click(button)


def get_source(year=2018, path='data/source.txt', retries=1):
    """Save the fully expanded NBA results page of one season to disk.

    Opens the flashscore results page for the season ``year``-``year + 1`` in
    a headless Firefox, calls ``scroll_page`` repeatedly until it keeps
    raising ``LookupError``, and writes the resulting page source to ``path``.

    Args:
        year: First calendar year of the season (2018 -> ``nba-2018-2019``).
        path: File that receives the page source. Missing parent directories
            are created.
        retries: Extra consecutive attempts made after ``scroll_page`` fails
            before the page is treated as fully expanded. The default of 1
            reproduces the original "try once more, then stop" behaviour.
    """
    link = f'flashscore.com/basketball/usa/nba-{year}-{year+1}/results/'

    # Create the output directory before the (slow) scrape so the result is
    # not lost to a FileNotFoundError at the very end.
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    with start_firefox(link, headless=True) as driver:
        consecutive_failures = 0
        while consecutive_failures <= retries:
            try:
                scroll_page(driver)
            except LookupError:
                consecutive_failures += 1
            else:
                # A successful click resets the counter: only failures in a
                # row end the loop.
                consecutive_failures = 0
        source = driver.page_source

    # NOTE(review): the file is written with the platform default encoding,
    # matching how it is read back elsewhere in this notebook; consider
    # switching both sides to encoding='utf-8' together.
    with open(target, 'w') as file:
        file.write(source)

# Scrape and cache the results page for the season starting in 2021.
get_source(year=2021)

In [3]:
# Read the cached results page back from disk and parse it, so the following
# cells work from the saved file instead of a live browser session.
with open('data/source.txt', 'r') as file:
    raw_page = file.read()
source = BeautifulSoup(raw_page, 'html.parser')

In [4]:
# Collect the <div> elements whose class attribute is exactly this string.
# `find_all` is the current Beautiful Soup name; `findAll` is a legacy alias.
games = source.find_all('div', attrs={'class': 'event__match event__match--static event__match--twoLine'})

# Keep only the part of each element id after the last underscore.
# (presumably the flashscore match id — it is used as such in the match URLs)
ids = [game['id'].split('_')[-1] for game in games]

In [5]:
# Create the cache table on first run; IF NOT EXISTS makes the cell re-runnable.
# sqlite3's own context manager only commits/rolls back, so `closing` is added
# to actually close the connection when the block ends.
query = '''CREATE TABLE IF NOT EXISTS games (
    id TEXT NOT NULL PRIMARY KEY,
    match_statistics TEXT,
    player_statistics TEXT,
    lineups TEXT
    )
    '''
with closing(db.connect('data/nba_games_source.db')) as conn, conn:
    conn.execute(query)

In [6]:
# Register every scraped match id with empty payload columns.
# INSERT OR IGNORE keeps the cell re-runnable: ids that are already stored
# (and anything already saved for them) are left untouched instead of raising
# IntegrityError on the primary key.
query = '''INSERT OR IGNORE INTO games (id, match_statistics, player_statistics, lineups)
    VALUES (:match_id, :match_statistics, :player_statistics, :lineups)
'''
new_rows = [
    {
        'match_id': match_id,
        'match_statistics': None,
        'player_statistics': None,
        'lineups': None,
    }
    for match_id in ids
]

# `closing` closes the connection; the inner `conn` context commits on success
# and rolls back if any insert fails.
with closing(db.connect('data/nba_games_source.db')) as conn, conn:
    conn.executemany(query, new_rows)

In [7]:
# Page sources of the matches that still have no stored statistics.
lista = []

# Read the candidate rows first and close the connection right away, so the
# database is not held open while the browser is working.
with closing(db.connect('data/nba_games_source.db')) as conn:
    cur = conn.cursor()
    cur.execute('SELECT id, match_statistics FROM games')
    rows = cur.fetchmany(5)  # only the first 5 rows are inspected

pending_ids = [
    match_id
    for match_id, match_summary_source in rows
    if match_summary_source is None
]

# Only launch Firefox when there is something to download.
if pending_ids:
    with start_firefox(headless=True) as driver:
        for match_id in pending_ids:
            link = f'flashscore.com/match/{match_id}/#match-summary/match-statistics'
            go_to(link)
            # Fixed pause before reading the page — presumably to let the
            # statistics render; TODO confirm 0.5 s is always enough.
            sleep(.5)
            lista.append(driver.page_source)
            

In [8]:
def _text_or_none(node):
    """Return ``node.text``, or None when the lookup found no element."""
    return node.text if node is not None else None


# Print the fields extracted from every downloaded match page.
# A missing element is reported as None instead of being hidden behind a bare
# `except:` (which would also swallow KeyboardInterrupt and real bugs).
for page_source in lista:
    page = BeautifulSoup(page_source, 'html.parser')

    # Competition link text
    fase = _text_or_none(page.find('a', {'href': '/basketball/usa/nba/'}))
    print('fase: '.rjust(15), fase)

    # Start date/time
    data = _text_or_none(page.find('div', {'class': 'duelParticipant__startTime'}))
    print('data: '.rjust(15), data)

    # Participant (team) names
    times = [t.text for t in page.find_all('div', {'class': 'participant__participantName'})]
    print('times: '.rjust(15), times)

    # Score
    pontos = _text_or_none(page.find('div', {'class': 'detailScore__wrapper'}))
    print('pontos: '.rjust(15), pontos)

    status = _text_or_none(page.find('div', {'class': 'detailScore__status'}))
    print('status: '.rjust(15), status)

    obs = _text_or_none(page.find('div', {'class': 'infoBox__info'}))
    print('obs.: '.rjust(15), obs)

    stats_home = [s.text for s in page.find_all('div', {'class': 'statHomeValue'})]
    print('stats casa: '.rjust(15), stats_home)

    stats_away = [s.text for s in page.find_all('div', {'class': 'statAwayValue'})]
    print('stats fora: '.rjust(15), stats_away)

    # Bookmaker name is taken from the logo's title attribute.
    logo = page.find('img', {'class': 'prematchLogo'})
    casa_de_aposta = logo.get('title') if logo is not None else None
    print('odds casa: '.rjust(15), casa_de_aposta)

    # Prefer the title attribute of the odds cells; when there are no cells or
    # the first title is empty/missing, fall back to the visible odds values.
    odds = [cell.get('title', '') for cell in page.find_all('div', attrs={'class': 'cellWrapper'})]
    if not odds or odds[0] == '':
        odds = [o.text for o in page.find_all('span', {'class': 'oddsValue'})]
    print('odds: '.rjust(15), odds)

    print()

         fase:  NBA
         data:  18.11.2021 00:00
        times:  ['Phoenix Suns', 'Dallas Mavericks']
       pontos:  105-98
       status:  Finished
         obs.:  None
   stats casa:  ['93', '41', '44.1%', '54', '28', '51.9%', '39', '13', '33.3%', '14', '10', '71.4%', '8', '41', '49', '27', '7', '8', '6', '9', '1']
   stats fora:  ['92', '38', '41.3%', '60', '22', '36.7%', '32', '16', '50%', '6', '6', '100%', '9', '41', '50', '28', '2', '10', '5', '14', '0']
    odds casa:  bet365
         odds:  ['1.28[d]1.27', '3.75[u]3.80']

         fase:  NBA
         data:  18.11.2021 00:00
        times:  ['Portland Trail Blazers', 'Chicago Bulls']
       pontos:  112-107
       status:  Finished
         obs.:  None
   stats casa:  ['84', '37', '44%', '53', '26', '49.1%', '31', '11', '35.5%', '28', '27', '96.4%', '12', '32', '44', '26', '5', '17', '10', '19', '0']
   stats fora:  ['83', '39', '47%', '54', '26', '48.1%', '29', '13', '44.8%', '20', '16', '80%', '10', '26', '36', '23', '9',

In [None]:
# Store a placeholder HTML payload for one match.
# The table has no `match_sumary` column (it defines match_statistics,
# player_statistics and lineups), so the statement targets match_statistics.
# Values are bound as parameters instead of double-quoted SQL, which SQLite
# parses as identifiers first.
with closing(db.connect('data/nba_games_source.db')) as conn, conn:
    conn.execute(
        'UPDATE games SET match_statistics = ? WHERE id = ?',
        ('Meu HTML', 'fJ12apqa'),
    )

In [None]:
# Preview the first 5 stored rows. `closing` makes sure the connection is
# closed afterwards (sqlite3's own context manager would leave it open).
with closing(db.connect('data/nba_games_source.db')) as conn:
    cur = conn.cursor()
    cur.execute('SELECT * FROM games')
    for row in cur.fetchmany(5):
        print(row)

In [133]:
# Download the statistics page of one hard-coded match into `source`.
match_id = 'WfD5dX3J'
link = f'flashscore.com/match/{match_id}/#match-summary/match-statistics'

with start_firefox(headless=True) as driver:
    go_to(link)
    sleep(.5)  # fixed pause before reading the page source
    source = driver.page_source
            

In [None]:
def _find_text(soup, tag, attrs):
    """Return the text of the first ``tag`` matching ``attrs``, or None if absent."""
    node = soup.find(tag, attrs)
    return None if node is None else node.text


# Parse the single downloaded match page and print each extracted field.
# Elements that are not on the page are printed as None rather than raising
# AttributeError on `.text`.
html = BeautifulSoup(source, 'html.parser')

# Competition link text
fase = _find_text(html, 'a', {'href': '/basketball/usa/nba/'})
print(fase)

# Start date/time
data = _find_text(html, 'div', {'class': 'duelParticipant__startTime'})
print(data)

# Participant (team) names
times = [t.text for t in html.find_all('div', {'class': 'participant__participantName'})]
print(times)

# Score
pontos = _find_text(html, 'div', {'class': 'detailScore__wrapper'})
print(pontos)

# The original passed the set literal {'detailScore__fullTime'} as attrs; it
# is now an explicit class filter.
# NOTE(review): presumably this element only exists for some games (e.g.
# overtime) — confirm; it is treated as optional here.
pontos_first_4 = _find_text(html, 'div', {'class': 'detailScore__fullTime'})
print(pontos_first_4)

status = _find_text(html, 'div', {'class': 'detailScore__status'})
print(status)

obs = _find_text(html, 'div', {'class': 'infoBox__info'})
print(obs)

stats_home = [s.text for s in html.find_all('div', {'class': 'statHomeValue'})]
print(stats_home)

stats_away = [s.text for s in html.find_all('div', {'class': 'statAwayValue'})]
print(stats_away)

# Bookmaker name is taken from the logo's title attribute.
logo = html.find('img', {'class': 'prematchLogo'})
casa_de_aposta = logo.get('title') if logo is not None else None
print(casa_de_aposta)

# Prefer the title attribute of the odds cells; when there are no cells or the
# first title is empty/missing, fall back to the visible odds values.
odds = [cell.get('title', '') for cell in html.find_all('div', attrs={'class': 'cellWrapper'})]
if not odds or odds[0] == '':
    odds = [o.text for o in html.find_all('span', {'class': 'oddsValue'})]
print(odds)