In [79]:
import pandas as pd
import sqlite3 as db

from bs4 import BeautifulSoup
from helium import click, go_to, start_firefox
from helium import *

from time import sleep

In [2]:
from selenium.webdriver import FirefoxOptions
options = FirefoxOptions()
options.add_argument("--width=1024")
options.add_argument("--height=768")
options.add_argument("-headless")

In [3]:
with db.connect('data/nba-games-source.db') as con:
    query = 'CREATE TABLE nba_games_source (game_id TEXT NOT NULL PRIMARY KEY)'
    cur = con.cursor()
    cur.execute(query)

In [46]:
def get_sources(years=[]):
    sources_dict = {}
    for year in years:
        link = f'flashscore.com/basketball/usa/nba-{year}-{year+1}/results/'
        with start_firefox(options=options) as driver:            
            go_to(link)
            accept_cookies()
            scroll_down_page(driver)
            sources_dict[year] = driver.page_source
    return sources_dict


def accept_cookies():
    try: click(Button('I Accept'))
    except LookupError: return


def scroll_down_page(driver):
    errors = 0
    while errors < 2:
        sleep(2)
        try: show_more_games(driver)
        except LookupError: errors += 1
        else: errors = 0


def show_more_games(driver):
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    click('Show more matches')

In [47]:
def get_games_id(sources):
    years = sources.keys()
    games_id_dict = {}
    for year in years:
        source = BeautifulSoup(sources[year], 'html.parser')
        games_id = source.find_all('div', attrs={'class': 'event__match'})
        games_id = [match_id['id'].split('_')[-1] for match_id in games_id]
        games_id_dict[year] = pd.DataFrame(reversed(games_id), columns=['game_id'])
    return games_id_dict

In [49]:
def record_games_id(games_id):
    years = games_id.keys()
    for year in years:
        with db.connect('data/nba-games-source.db') as con:
            games_id[year].to_sql('nba_games_source', con=con, index=False, if_exists='append')

In [66]:
#years = list(range(1993, 2021))
years = [2009, 2010, 2011, 2012, 2013]
sources = get_sources(years)
games_id = get_games_id(sources)
record_games_id(games_id)

In [71]:
with db.connect('data/nba-games-source.db') as con:
    cur = con.cursor()
    cur.execute('SELECT game_id FROM nba_games_source')
    print(cur.fetchmany(5))
    cur.execute('SELECT game_id FROM nba_games_source')
    print(cur.fetchall()[-5:])

[('dKWb4wc3',), ('x4DY02ZR',), ('p8wAZTdd',), ('82MDq5Ig',), ('rmXxWTN0',)]
[('ETkrYCgg',), ('hKlnXW8a',), ('nsljWjO5',), ('pS6gVAwC',), ('67Upolsm',)]


In [None]:
def create_db(year):
    with db.connect(f'data/nba-matches-{year}-source.db') as conn:
        query = '''CREATE TABLE IF NOT EXISTS matches_source (
            match_id TEXT NOT NULL PRIMARY KEY,
            match_summary TEXT,
            player_statistics TEXT,
            match_statistics_0 TEXT,
            match_statistics_1 TEXT,
            match_statistics_2 TEXT,
            match_statistics_3 TEXT,
            match_statistics_4 TEXT,
            match_statistics_5 TEXT,
            lineups TEXT,
            point_by_point_0 TEXT,
            point_by_point_1 TEXT,
            point_by_point_2 TEXT,
            point_by_point_3 TEXT,
            point_by_point_4 TEXT,
            odds_comparison TEXT)
            '''
        cur = conn.cursor()
        cur.execute(query)


def insert_matches_id(year, matches_id):
    with db.connect(f'data/nba-matches-{year}-source.db') as conn:
        query = '''INSERT INTO matches_source VALUES (
            :match_id,
            :match_summary,
            :player_statistics,
            :match_statistics_0,
            :match_statistics_1,
            :match_statistics_2,
            :match_statistics_3,
            :match_statistics_4,
            :match_statistics_5,
            :lineups,
            :point_by_point_0,
            :point_by_point_1,
            :point_by_point_2,
            :point_by_point_3,
            :point_by_point_4,
            :odds_comparison)
            '''
        values = {'match_id': None,
            'match_summary': None,
            'player_statistics': None,
            'match_statistics_0': None,
            'match_statistics_1': None,
            'match_statistics_2': None,
            'match_statistics_3': None,
            'match_statistics_4': None,
            'match_statistics_5': None,
            'lineups': None,
            'point_by_point_0': None,
            'point_by_point_1': None,
            'point_by_point_2': None,
            'point_by_point_3': None,
            'point_by_point_4': None,
            'odds_comparison': None}     

        cur = conn.cursor()
        for match_id in reversed(matches_id):
            values['match_id'] = match_id
            cur.execute(query, values)

- #match-summary
  - #match-summary/match-summary
  - #match-summary/player-statistics
  - #match-summary/match-statistics
    - #match-summary/match-statistics/0
    - #match-summary/match-statistics/1
    - #match-summary/match-statistics/2
    - #match-summary/match-statistics/3
    - #match-summary/match-statistics/4
    - #match-summary/match-statistics/5
  - #match-summary/lineups
  - #match-summary/point-by-point
    - #match-summary/point-by-point/0
    - #match-summary/point-by-point/1
    - #match-summary/point-by-point/2
    - #match-summary/point-by-point/3
    - #match-summary/point-by-point/4
- #odds-comparison


In [118]:
with start_firefox(options=options) as driver:
    link = f'flashscore.com/match/dKWb4wc3/#match-summary'
    go_to(link)
    sleep(1)
    source = BeautifulSoup(driver.page_source, 'html.parser')

In [114]:
tabs_top = source.find('div', attrs={'class': 'tabs__detail'}).find_all('a', attrs={'class': 'tabs__tab'})
tabs_top = [tab['href'] for tab in tabs_top]
print(tabs_top)

['#match-summary', '#odds-comparison', '#h2h', '#draw']


In [115]:
tabs_bottom = source.find('div', attrs={'class': 'tabs__detail--nav'}).find_all('a', attrs={'class': 'tabs__tab'})
tabs_bottom = [tab['href'] for tab in tabs_bottom]
print(tabs_bottom)

['#match-summary/match-summary', '#match-summary/player-statistics', '#match-summary/match-statistics', '#match-summary/lineups', '#match-summary/point-by-point']


In [117]:
if source.find_all('a', attrs={'href': '#match-summary/match-summary'}):
    print('#match-summary/match-summary')

if source.find_all('a', attrs={'href': '#match-summary/player-statistics'}):
    print('#match-summary/player-statistics')    

#match-summary/match-summary
#match-summary/player-statistics


In [113]:
with start_firefox(options=options) as driver:
    link = f'flashscore.com/match/67Upolsm/#match-summary/match-statistics'
    go_to(link)
    sleep(1)
    source = BeautifulSoup(driver.page_source, 'html.parser')

In [None]:
with start_firefox(options=options) as driver:
    link = f'flashscore.com/match/67Upolsm/#match-summary/match-statistics'
    go_to(link)
    sleep(1)
    source = BeautifulSoup(driver.page_source, 'html.parser')

['#match-summary/match-summary',
 '#match-summary/player-statistics',
 '#match-summary/match-statistics',
 '#match-summary/lineups',
 '#match-summary/point-by-point']

In [8]:
for html in lista:
    html = BeautifulSoup(html, 'html.parser')

    fase = html.find('a', {'href': '/basketball/usa/nba/'}).text
    print('fase: '.rjust(15), fase)

    data = html.find('div', {'class': 'duelParticipant__startTime'}).text
    print('data: '.rjust(15), data)

    times = html.findAll('div', {'class': 'participant__participantName'})
    times = [t.text for t in times]
    print('times: '.rjust(15), times)

    pontos = html.find('div', {'class': 'detailScore__wrapper'}).text
    print('pontos: '.rjust(15), pontos)

    try:
        status = html.find('div', {'class': 'detailScore__status'}).text
        print('status: '.rjust(15), status)
    except:
        print('status: '.rjust(15), None)

    try:        
        obs = html.find('div', {'class': 'infoBox__info'}).text
        print('obs.: '.rjust(15), obs)
    except:
        print('obs.: '.rjust(15), None)

    stats_home = html.findAll('div', {'class': 'statHomeValue'})
    stats_home = [s.text for s in stats_home]
    print('stats casa: '.rjust(15), stats_home)

    stats_away = html.findAll('div', {'class': 'statAwayValue'})
    stats_away = [s.text for s in stats_away]
    print('stats fora: '.rjust(15), stats_away)

    try:
        casa_de_aposta = html.find('img', {'class': 'prematchLogo'})['title']
        print('odds casa: '.rjust(15), casa_de_aposta)
    except:
        print('odds casa: '.rjust(15), None)

    odds = html.find_all('div', attrs={'class': 'cellWrapper'})
    odds = [o['title'] for o in odds]
    if odds == [] or odds[0] == '':
        odds = html.find_all('span', {'class': 'oddsValue'})
        odds = [o.text for o in odds]
    print('odds: '.rjust(15), odds)

    print()

         fase:  NBA
         data:  18.11.2021 00:00
        times:  ['Phoenix Suns', 'Dallas Mavericks']
       pontos:  105-98
       status:  Finished
         obs.:  None
   stats casa:  ['93', '41', '44.1%', '54', '28', '51.9%', '39', '13', '33.3%', '14', '10', '71.4%', '8', '41', '49', '27', '7', '8', '6', '9', '1']
   stats fora:  ['92', '38', '41.3%', '60', '22', '36.7%', '32', '16', '50%', '6', '6', '100%', '9', '41', '50', '28', '2', '10', '5', '14', '0']
    odds casa:  bet365
         odds:  ['1.28[d]1.27', '3.75[u]3.80']

         fase:  NBA
         data:  18.11.2021 00:00
        times:  ['Portland Trail Blazers', 'Chicago Bulls']
       pontos:  112-107
       status:  Finished
         obs.:  None
   stats casa:  ['84', '37', '44%', '53', '26', '49.1%', '31', '11', '35.5%', '28', '27', '96.4%', '12', '32', '44', '26', '5', '17', '10', '19', '0']
   stats fora:  ['83', '39', '47%', '54', '26', '48.1%', '29', '13', '44.8%', '20', '16', '80%', '10', '26', '36', '23', '9',

In [48]:
with start_firefox(headless=True) as driver:
    match_id = 'K8mNdAob'
    link = f'flashscore.com/match/{match_id}/'
    go_to(link)
    sleep(.5)
    source = driver.page_source
    
    links = []
    for i in html.find('div', {'class': 'tabs__detail--nav'}).find_all('a', {'class': 'tabs__tab'}):
        print(i['href'])

    go_to(link + '#match-summary/match-statistics')
    for i in html.find('a', {'class': 'tabs__detail--sub'}).find_all('a', {'class': 'subTabs__tab'}):
        print(i['href'])

    go_to(link + '#match-summary/point-by-point')
    for i in html.find('div', {'class': 'tabs__detail--sub'}).find_all('a', {'class': 'subTabs__tab'}):
        print(i['href'])


#match-summary/match-summary
#match-summary/player-statistics
#match-summary/match-statistics
#match-summary/lineups
#match-summary/point-by-point


AttributeError: 'NoneType' object has no attribute 'find_all'

In [42]:
links = []
for i in html.find('div', {'class': 'tabs__detail--nav'}).find_all('a', {'class': 'tabs__tab'}):
    links.append(i['href'])

print(links)

['#match-summary/match-summary', '#match-summary/player-statistics', '#match-summary/match-statistics', '#match-summary/lineups', '#match-summary/point-by-point']


In [41]:
html = BeautifulSoup(source, 'html.parser')

fase = html.find('a', {'href': '/basketball/usa/nba/'}).text
print('fase: '.rjust(15), fase)

data = html.find('div', {'class': 'duelParticipant__startTime'}).text
print('data: '.rjust(15), data)

times = html.findAll('div', {'class': 'participant__participantName'})
times = [t.text for t in times]
print('times: '.rjust(15), times)

pontos = html.find('div', {'class': 'detailScore__wrapper'}).text
print('pontos: '.rjust(15), pontos)

try:
    status = html.find('div', {'class': 'detailScore__status'}).text
    print('status: '.rjust(15), status)
except:
    print('status: '.rjust(15), None)

try:        
    obs = html.find('div', {'class': 'infoBox__info'}).text
    print('obs.: '.rjust(15), obs)
except:
    print('obs.: '.rjust(15), None)

stats_home = html.findAll('div', {'class': 'statHomeValue'})
stats_home = [s.text for s in stats_home]
print('stats casa: '.rjust(15), stats_home)

stats_away = html.findAll('div', {'class': 'statAwayValue'})
stats_away = [s.text for s in stats_away]
print('stats fora: '.rjust(15), stats_away)

try:
    casa_de_aposta = html.find('img', {'class': 'prematchLogo'})['title']
    print('odds casa: '.rjust(15), casa_de_aposta)
except:
    print('odds casa: '.rjust(15), None)

odds = html.find_all('div', attrs={'class': 'cellWrapper'})
odds = [o['title'] for o in odds]
if odds == [] or odds[0] == '':
    odds = html.find_all('span', {'class': 'oddsValue'})
    odds = [o.text for o in odds]
print('odds: '.rjust(15), odds)

         fase:  NBA
         data:  30.10.2021 18:00
        times:  ['Washington Wizards', 'Boston Celtics']
       pontos:  115-112
       status:  After Overtime
         obs.:  2 extra times played.
   stats casa:  []
   stats fora:  []
    odds casa:  None
         odds:  []
