In [1]:
import pandas as pd
import sqlite3 as db
import os
import random

from bs4 import BeautifulSoup
from helium import *
from time import sleep

from selenium.webdriver import FirefoxOptions
# Firefox options shared by every browser session started in this notebook.
options = FirefoxOptions()
options.add_argument("--width=1024")   # window width argument passed to Firefox
options.add_argument("--height=768")   # window height argument passed to Firefox
options.add_argument("-headless")      # run Firefox without a visible window

In [2]:
def populate_db_source_per_year(year):
    """Scrape and store the raw pages of every game found for a season.

    Looks up the season's game ids with ``get_games_ids_per_year`` and, for
    each of them in the order returned, delegates to
    ``populate_db_source_per_game_id``.

    Args:
        year: Starting year of the season; forwarded unchanged to both helpers.
    """
    for season_game_id in get_games_ids_per_year(year):
        populate_db_source_per_game_id(year, season_game_id)


def populate_db_source_per_game_id(year, game_id):
    """Scrape one game and insert its raw page sources into the season database.

    Args:
        year: Season year; selects the file ``data/games-source-{year}.db``.
        game_id: Game identifier passed to ``get_values``.

    Raises:
        sqlite3.Error: If the insert fails (for example the table is missing
            or ``game_id`` is already stored, since it is the primary key).
    """
    # Columns are listed explicitly so the insert does not depend on the
    # physical column order of the table.
    query = '''INSERT INTO games_source (
            game_id,
            game_summary,
            player_statistics,
            game_statistics,
            lineups,
            odds_comparison)
        VALUES (
            :game_id,
            :game_summary,
            :player_statistics,
            :game_statistics,
            :lineups,
            :odds_comparison)
            '''
    con = db.connect(f'data/games-source-{year}.db')
    try:
        values = get_values(game_id)
        # `with con` commits on success and rolls back on error, but it does
        # not close the connection; the `finally` below does that.
        with con:
            con.execute(query, values)
    finally:
        con.close()
    

def get_values(game_id):
    """Collect the raw HTML of each available tab of a game's match page.

    Opens the match page in Firefox, checks which tab links are present in
    the initial page source, visits each present tab and stores the page
    source rendered after navigating to it.

    Args:
        game_id: Game identifier used to build the match URL.

    Returns:
        dict: ``game_id`` plus one entry per tab column (``game_summary``,
        ``player_statistics``, ``game_statistics``, ``lineups``,
        ``odds_comparison``). A tab whose link is not found stays ``None``.
    """
    # Database column -> anchor of the tab whose page source fills that column.
    tab_anchor_by_column = {
        'game_summary': '#match-summary',
        'player_statistics': '#match-summary/player-statistics',
        'game_statistics': '#match-summary/match-statistics',
        'lineups': '#match-summary/lineups',
        'odds_comparison': '#odds-comparison',
    }

    values = {'game_id': game_id}
    values.update(dict.fromkeys(tab_anchor_by_column))

    url = f'flashscore.com/match/{game_id}/'
    with start_firefox(url=url, options=options) as driver:
        # Randomised pause before reading the page source.
        sleep(random.randint(2, 4))
        source = BeautifulSoup(driver.page_source, 'html.parser')
        for column, anchor in tab_anchor_by_column.items():
            if not source.find_all('a', {'href': anchor}):
                continue  # tab link absent on this page: leave the column as None
            go_to(f'{url}{anchor}')
            sleep(random.randint(2, 4))
            values[column] = driver.page_source
    return values

In [3]:
def create_db_source_per_year(year):
    """Create a fresh, empty raw-source database for a season.

    Any existing ``data/games-source-{year}.db`` is deleted first, so all
    previously stored rows are lost. The ``data`` directory is created when
    it does not exist yet.

    Args:
        year: Season year used in the database file name.
    """
    path = f'data/games-source-{year}.db'

    # sqlite3 cannot create missing parent directories on its own.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    if os.path.exists(path):
        os.remove(path)

    query = '''CREATE TABLE games_source (
            game_id TEXT NOT NULL PRIMARY KEY,
            game_summary TEXT,
            player_statistics TEXT,
            game_statistics TEXT,
            lineups TEXT,
            odds_comparison TEXT)
            '''
    con = db.connect(path)
    try:
        # `with con` only manages the transaction; close explicitly afterwards.
        with con:
            con.execute(query)
    finally:
        con.close()

In [4]:
def get_games_ids_per_year(year):
    """Return the ids of all games listed on a season's results page.

    Opens the results page for the ``{year}-{year+1}`` season, expands it
    with ``scroll_down_page`` and extracts the ids with
    ``get_games_ids_from_soup``.

    Args:
        year: Starting year of the season (an int; ``year + 1`` is computed).

    Returns:
        list: Game ids in the reverse of their order on the page.
        NOTE(review): presumably the page lists the newest games first, so
        this is chronological order - confirm.
    """
    url = f'flashscore.com/basketball/usa/nba-{year}-{year+1}/results/'
    with start_firefox(url=url, options=options) as driver:
        scroll_down_page(driver)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Return a real list rather than a one-shot `reversed` iterator, so the
    # result can be measured with len() or iterated more than once.
    return get_games_ids_from_soup(soup)[::-1]


def scroll_down_page(driver):
    """Keep expanding the page until 'Show more matches' can no longer be clicked.

    Each round waits two seconds, scrolls to the bottom of the document and
    clicks 'Show more matches'. The loop ends when a ``LookupError`` is raised
    inside the round.

    Args:
        driver: Browser driver providing ``execute_script``.
    """
    while True:
        try:
            sleep(2)
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            click('Show more matches')
        except LookupError:
            return


def get_games_ids_from_soup(soup):
    """Extract game ids from the match rows of a parsed results page.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        list[str]: For every ``div`` with class ``event__match``, the part of
        its ``id`` attribute after the last underscore, in document order.
    """
    match_rows = soup.find_all('div', attrs={'class': 'event__match'})
    return [row['id'].split('_')[-1] for row in match_rows]

In [5]:
# Start from an empty 2021 source database.
# WARNING: an existing data/games-source-2021.db is deleted by this call.
create_db_source_per_year(2021)

In [6]:
# Single-game variant, kept for trying the scraper on one match:
# populate_db_source_per_game_id(2021, 'K8mNdAob')

# Scrape every game of the 2021 season; a browser session is started for each game.
populate_db_source_per_year(2021)

In [16]:
# Parse the stored summary page of every game into `dados` (game_id -> summary data).
# NOTE: get_summary_data is defined in a later cell of this notebook; run that cell first.
dados = {}

query = 'SELECT game_id, game_summary, game_statistics FROM games_source'
con = db.connect('data/games-source-2021.db')
try:
    cur = con.cursor()
    cur.execute(query)
    # The loop variables stay at notebook level on purpose: later cells read
    # `game_summary` (the last row fetched) for exploratory parsing.
    for game_id, game_summary, game_statistics in cur:
        if game_summary is None:
            # The summary column is NULL when that tab was not found while scraping.
            continue
        dados[game_id] = get_summary_data(game_summary)
finally:
    # sqlite3's `with` block would only commit, not close, so close explicitly.
    con.close()


In [10]:
def get_summary_data(game_summary_source):
    """Extract start time, team names and score from a game summary page.

    Args:
        game_summary_source: HTML source of the game summary page.

    Returns:
        list: ``[start_time, team_names, score]`` where ``start_time`` and
        ``score`` are the texts of the first ``duelParticipant__startTime``
        and ``detailScore__wrapper`` divs (``None`` when the div is missing)
        and ``team_names`` is the list of texts of all
        ``participant__participantName`` divs.
    """
    soup = BeautifulSoup(game_summary_source, 'html.parser')

    def text_of_first_div(class_name):
        # soup.find returns None when nothing matches; report that as None
        # instead of failing on `.text` and aborting a whole batch.
        node = soup.find('div', {'class': class_name})
        return node.text if node is not None else None

    return [
        text_of_first_div('duelParticipant__startTime'),
        # find_all is the current spelling of the deprecated findAll and is
        # what the rest of the notebook uses.
        [t.text for t in soup.find_all('div', {'class': 'participant__participantName'})],
        text_of_first_div('detailScore__wrapper'),
    ]

In [17]:
# Print the parsed summary of every game, one per line:
# [start time text, [participant names], score text].
for value in dados.values():
    print(value)

['03.10.2021 16:30', ['Los Angeles Lakers', 'Brooklyn Nets'], '97-123']
['04.10.2021 20:00', ['Toronto Raptors', 'Philadelphia 76ers'], '123-107']
['04.10.2021 20:30', ['Miami Heat', 'Atlanta Hawks'], '125-99']
['04.10.2021 20:30', ['Boston Celtics', 'Orlando Magic'], '98-97']
['04.10.2021 21:00', ['Oklahoma City Thunder', 'Charlotte Hornets'], '97-113']
['04.10.2021 21:00', ['Minnesota Timberwolves', 'New Orleans Pelicans'], '117-114']
['04.10.2021 21:30', ['San Antonio Spurs', 'Utah Jazz'], '111-85']
['04.10.2021 23:00', ['Sacramento Kings', 'Phoenix Suns'], '117-106']
['04.10.2021 23:00', ['Portland Trail Blazers', 'Golden State Warriors'], '107-121']
['04.10.2021 23:30', ['Los Angeles Clippers', 'Denver Nuggets'], '103-102']
['05.10.2021 20:30', ['New York Knicks', 'Indiana Pacers'], '125-104']
['05.10.2021 21:00', ['Memphis Grizzlies', 'Milwaukee Bucks'], '87-77']
['05.10.2021 21:00', ['Houston Rockets', 'Washington Wizards'], '125-119']
['05.10.2021 21:00', ['Chicago Bulls', 'Cle

In [245]:
# Exploratory parse of a single game summary page.
# NOTE: relies on `game_summary` left in the notebook namespace by the cell
# that loads rows from the database (it holds the last row fetched).
soup = BeautifulSoup(game_summary, 'html.parser')

# fase: text of the competition link (recorded output: 'NBA').
fase = soup.find('a', {'href': '/basketball/usa/nba/'}).text
print('fase: '.rjust(15), fase)

# data: start date/time text.
data = soup.find('div', {'class': 'duelParticipant__startTime'}).text
print('data: '.rjust(15), data)

# times: participant (team) names.
times = soup.find_all('div', {'class': 'participant__participantName'})
times = [t.text for t in times]
print('times: '.rjust(15), times)

# pontos: score text.
pontos = soup.find('div', {'class': 'detailScore__wrapper'}).text
print('pontos: '.rjust(15), pontos)

# Optional element: soup.find returns None when it is absent, so `.text`
# raises AttributeError. Catch only that instead of every exception.
try:
    status = soup.find('div', {'class': 'detailScore__status'}).text
    print('status: '.rjust(15), status)
except AttributeError:
    print('status: '.rjust(15), None)

try:
    obs = soup.find('div', {'class': 'infoBox__info'}).text
    print('obs.: '.rjust(15), obs)
except AttributeError:
    print('obs.: '.rjust(15), None)

# stats casa / stats fora: home and away statistic values.
stats_home = soup.find_all('div', {'class': 'statHomeValue'})
stats_home = [s.text for s in stats_home]
print('stats casa: '.rjust(15), stats_home)

stats_away = soup.find_all('div', {'class': 'statAwayValue'})
stats_away = [s.text for s in stats_away]
print('stats fora: '.rjust(15), stats_away)

# casa_de_aposta: title of the bookmaker logo. A missing <img> gives
# TypeError (None is not subscriptable); a missing title gives KeyError.
try:
    casa_de_aposta = soup.find('img', {'class': 'prematchLogo'})['title']
    print('odds casa: '.rjust(15), casa_de_aposta)
except (TypeError, KeyError):
    print('odds casa: '.rjust(15), None)

# Odds come from the `title` of the cellWrapper divs; when there are none, or
# the first title is empty, fall back to the text of the oddsValue spans.
odds = soup.find_all('div', attrs={'class': 'cellWrapper'})
odds = [o.get('title', '') for o in odds]  # a missing title counts as empty
if odds == [] or odds[0] == '':
    odds = soup.find_all('span', {'class': 'oddsValue'})
    odds = [o.text for o in odds]
print('odds: '.rjust(15), odds)

print()

         fase:  NBA
         data:  30.10.2021 18:00
        times:  ['Washington Wizards', 'Boston Celtics']
       pontos:  115-112
       status:  After Overtime
         obs.:  2 extra times played.
   stats casa:  []
   stats fora:  []
    odds casa:  bet365
         odds:  ['1.71[u]1.76', '2.20[d]2.10']



home_team

away_team

In [232]:
def create_db_games_per_year(year):
    """Create a fresh, empty ``games`` database for a season.

    Any existing ``data/games-{year}.db`` is deleted first, so all previously
    stored rows are lost. The ``data`` directory is created when it does not
    exist yet.

    Args:
        year: Season year used in the database file name.
    """
    path = f'data/games-{year}.db'

    # sqlite3 cannot create missing parent directories on its own.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    if os.path.exists(path):
        os.remove(path)

    # NOTE(review): these columns mirror the raw `games_source` table -
    # confirm this is the intended schema for parsed games.
    query = '''CREATE TABLE games (
            game_id TEXT NOT NULL PRIMARY KEY,
            game_summary TEXT,
            player_statistics TEXT,
            game_statistics TEXT,
            lineups TEXT,
            odds_comparison TEXT)
            '''
    con = db.connect(path)
    try:
        # `with con` only manages the transaction; close explicitly afterwards.
        with con:
            con.execute(query)
    finally:
        con.close()

In [260]:
# Re-parse the summary page held in `game_summary`, which an earlier cell
# leaves in the notebook namespace.
soup = BeautifulSoup(game_summary, 'html.parser')

# Text of the first div with class "mi__data"
# (in the recorded output: attendance and venue).
print(soup.find('div', class_="mi__data").text)

# Texts of the "smh__home" / "smh__away" divs, skipping the first two of each.
# NOTE(review): the recorded output suggests the remaining values are the final
# score followed by per-period points - confirm what the two skipped divs hold.
print([t.get_text() for t in soup.find_all('div', class_="smh__home")[2:]])
print([t.get_text() for t in soup.find_all('div', class_="smh__away")[2:]])

Attendance: 15 813, Venue: Capital One Arena (Washington)
['115', '25', '28', '23', '27', '12']
['112', '20', '27', '26', '30', '9']
