In [15]:
import os
import sqlite3
import warnings

import bs4

In [312]:
# Load one row of raw HTML so the parser functions can be tried out interactively.
# sqlite3's connection context manager only commits / rolls back and never
# closes the connection, so the connection is closed explicitly here.
con = sqlite3.connect('data/games_html_2021.db')
try:
    # No WHERE / ORDER BY: this is simply the first row SQLite hands back.
    query = 'SELECT game_summary, game_statistics, odds_comparison FROM games_html'
    cur = con.cursor()
    cur.execute(query)
    game_summary, game_statistics, odds_comparison = cur.fetchone()
finally:
    con.close()

In [314]:
def get_data_from_game_summary(game_summary_html):
    """Pull the headline facts of one game out of its summary-page HTML.

    Args:
        game_summary_html: markup of the game summary page, parsed with
            BeautifulSoup's 'html.parser'.

    Returns:
        dict with the keys 'tournament', 'start_time', 'home_team',
        'away_team', 'status', 'info', 'home_score', 'away_score',
        'home_score_per_quarter', 'away_score_per_quarter' and 'venue'
        (inserted in that order). 'status', 'info' and 'venue' are None when
        their element is absent; the two per-quarter entries are lists of
        strings, every other value is the element's text.

    Raises:
        AttributeError: the tournament link, start time or a team element
            was not found.
        IndexError: fewer than three 'smh__home' / 'smh__away' cells exist.
    """
    soup = bs4.BeautifulSoup(game_summary_html, 'html.parser')

    def required_text(tag, attrs):
        # Mandatory element: when it is missing the lookup yields None and
        # the attribute access fails, exactly like an unguarded `.text`.
        return soup.find(tag, attrs).text

    def optional_text(css_class):
        # Optional element: report None instead of failing.
        node = soup.find('div', {'class': css_class})
        return node.text if node is not None else None

    def cell_texts(css_class):
        return [cell.text for cell in soup.find_all('div', {'class': css_class})]

    tournament = required_text('a', {'href': '/basketball/usa/nba/'})
    start_time = required_text('div', {'class': 'duelParticipant__startTime'})
    home_team = required_text('div', {'class': 'duelParticipant__home'})
    away_team = required_text('div', {'class': 'duelParticipant__away'})

    status = optional_text('detailScore__status')
    info = optional_text('infoBox__info')

    home_cells = cell_texts('smh__home')
    away_cells = cell_texts('smh__away')
    # Cell 2 is stored as the final score, everything after it as the
    # per-quarter scores (may end with an empty string, as in the sample
    # output printed in this notebook).
    home_total = home_cells[2]
    away_total = away_cells[2]

    venue = optional_text('mi__data')

    return {
        'tournament': tournament,
        'start_time': start_time,
        'home_team': home_team,
        'away_team': away_team,
        'status': status,
        'info': info,
        'home_score': home_total,
        'away_score': away_total,
        'home_score_per_quarter': home_cells[3:],
        'away_score_per_quarter': away_cells[3:],
        'venue': venue,
    }

# Try the summary parser on the sample row loaded earlier in the notebook.
print(get_data_from_game_summary(game_summary))

{'tournament': 'NBA - Pre-season', 'start_time': '03.10.2021 16:30', 'home_team': 'Los Angeles Lakers', 'away_team': 'Brooklyn Nets', 'status': 'Finished', 'info': None, 'home_score': '97', 'away_score': '123', 'home_score_per_quarter': ['19', '30', '32', '16', ''], 'away_score_per_quarter': ['25', '32', '27', '39', ''], 'venue': 'Attendance: 16 000,\xa0Venue: Staples Center (Los Angeles)'}


In [304]:
def get_data_from_game_statistics(game_statistics_html):
    """Map every statistic category on the page to its home/away values.

    Args:
        game_statistics_html: markup of the game statistics page, parsed with
            BeautifulSoup's 'html.parser'.

    Returns:
        dict of category name -> [home value, away value], all strings taken
        from the 'statCategoryName', 'statHomeValue' and 'statAwayValue'
        divs. The three sequences are paired positionally and pairing stops
        at the shortest one.
    """
    soup = bs4.BeautifulSoup(game_statistics_html, 'html.parser')

    def texts(css_class):
        return [div.text for div in soup.find_all('div', {'class': css_class})]

    names = texts('statCategoryName')
    home_values = texts('statHomeValue')
    away_values = texts('statAwayValue')

    return {
        name: [home, away]
        for name, home, away in zip(names, home_values, away_values)
    }

# Try the statistics parser on the sample row loaded earlier in the notebook.
print(get_data_from_game_statistics(game_statistics))

{'Field Goals Attempted': ['88', '87'], 'Field Goals Made': ['32', '42'], 'Field Goals %': ['36.4%', '48.3%'], '2-Point Field G. Attempted': ['53', '63'], '2-Point Field Goals Made': ['21', '32'], '2-Point Field Goals %': ['39.6%', '50.8%'], '3-Point Field G. Attempted': ['35', '24'], '3-Point Field Goals Made': ['11', '10'], '3-Point Field Goals %': ['31.4%', '41.7%'], 'Free Throws Attempted': ['32', '40'], 'Free Throws Made': ['22', '29'], 'Free Throws %': ['68.8%', '72.5%'], 'Offensive Rebounds': ['13', '14'], 'Defensive Rebounds': ['28', '40'], 'Total Rebounds': ['41', '54'], 'Assists': ['17', '20'], 'Blocks': ['6', '7'], 'Turnovers': ['16', '16'], 'Steals': ['9', '9'], 'Personal Fouls': ['30', '30'], 'Technical Fouls': ['1', '0']}


In [310]:
def get_data_from_odds_comparison(odds_comparison_html):
    """Collect the home/away odds offered by each bookmaker on the odds page.

    Args:
        odds_comparison_html: markup of the odds comparison page, parsed with
            BeautifulSoup's 'html.parser'.

    Returns:
        dict of lower-cased bookmaker name (the 'title' of the row's
        'prematchLogo' image) ->
        [home odds text, away odds text, home odds title, away odds title],
        one entry per 'ui-table__row' div. The title strings look like
        '1.57 » 1.62' or '' in the sample output printed in this notebook.

    Raises:
        TypeError: a row has no 'prematchLogo' image.
        IndexError: a row has fewer than two 'oddsCell__odd' links.
        KeyError: a logo or odds link carries no 'title' attribute.
    """
    soup = bs4.BeautifulSoup(odds_comparison_html, 'html.parser')
    odds_comparison_dict = {}

    for row in soup.find_all('div', {'class': 'ui-table__row'}):
        bookmaker = row.find('img', {'class': 'prematchLogo'})['title'].lower()

        # Search the row once and reuse the result instead of repeating the
        # same traversal for every value that is read from it.
        odds_cells = row.find_all('a', {'class': 'oddsCell__odd'})
        home_cell, away_cell = odds_cells[0], odds_cells[1]

        odds_comparison_dict[bookmaker] = [
            home_cell.text,
            away_cell.text,
            home_cell['title'],
            away_cell['title'],
        ]

    return odds_comparison_dict

# Try the odds parser on the sample row loaded earlier in the notebook.
print(get_data_from_odds_comparison(odds_comparison))

{'bet365': ['1.62', '2.40', '1.57 » 1.62', '2.55 » 2.40'], 'unibet': ['1.65', '2.30', '1.55 » 1.65', '2.48 » 2.30'], '1xbet': ['1.65', '2.26', '1.64 » 1.65', '2.33 » 2.26'], 'betfair': ['1.61', '2.40', '1.62 » 1.61', '2.35 » 2.40'], 'bwin': ['1.67', '2.25', '1.57 » 1.67', '2.45 » 2.25'], 'william hill': ['1.57', '2.30', '', '']}


In [407]:
def create_db_games_per_year(year, remove_if_exists=False):
    """Create data/games_<year>.db containing an empty `games` table.

    Every column is TEXT; `game_id` is the NOT NULL primary key.

    Args:
        year: value interpolated into the database file name.
        remove_if_exists: when true, an existing database file is deleted
            first so the table is rebuilt from scratch.

    Raises:
        sqlite3.OperationalError: the `games` table already exists (the file
            was there and `remove_if_exists` was false), or the database
            file cannot be opened.
    """
    path = f'data/games_{year}.db'

    if remove_if_exists and os.path.exists(path):
        os.remove(path)

    query = '''CREATE TABLE games (
            game_id TEXT NOT NULL PRIMARY KEY,
            tournament TEXT,
            start_time TEXT,
            status TEXT,
            info TEXT,
            venue TEXT,
            home_team TEXT,
            away_team TEXT,
            home_score TEXT,
            away_score TEXT,
            home_scorepq TEXT,
            away_scorepq TEXT,
            home_fga TEXT,
            away_fga TEXT,
            home_fgm TEXT,
            away_fgm TEXT,
            home_3pa TEXT,
            away_3pa TEXT,
            home_3pm TEXT,
            away_3pm TEXT,
            home_fta TEXT,
            away_fta TEXT,
            home_ftm TEXT,
            away_ftm TEXT,
            home_reb TEXT,
            away_reb TEXT,
            home_oreb TEXT,
            away_oreb TEXT,
            home_ast TEXT,
            away_ast TEXT,
            home_blk TEXT,
            away_blk TEXT,
            home_to TEXT,
            away_to TEXT,
            home_stl TEXT,
            away_stl TEXT,
            home_pf TEXT,
            away_pf TEXT,
            bookmaker TEXT,
            home_odds TEXT,
            away_odds TEXT,
            home_oddsv TEXT,
            away_oddsv TEXT)
            '''

    # `with con:` only commits or rolls back; it does not close the
    # connection. Close it explicitly so the file handle is released (an
    # open handle can block a later os.remove of the same file).
    con = sqlite3.connect(path)
    try:
        with con:
            con.execute(query)
    finally:
        con.close()

In [408]:
def populate_db_games_per_game_id(game_id, year):
    """Insert the parsed row of one game into data/games_<year>.db.

    The row comes from `get_row_to_db_games(game_id, year)`. Population is
    best-effort: if the row cannot be built, the game is skipped and a
    warning naming the game and the error is issued instead of raising.
    Only `Exception` subclasses are treated this way, so KeyboardInterrupt
    and SystemExit still stop a long-running load.

    Args:
        game_id: identifier of the game to insert.
        year: value interpolated into the database file name.

    Raises:
        sqlite3.Error: the INSERT itself fails, e.g. sqlite3.IntegrityError
            when `game_id` is already present in the table.
    """
    # Positional VALUES list with named placeholders: the order here has to
    # match the column order of the `games` table.
    query = '''INSERT INTO games VALUES (
            :game_id,
            :tournament,
            :start_time,
            :status,
            :info,
            :venue,
            :home_team,
            :away_team,
            :home_score,
            :away_score,
            :home_scorepq,
            :away_scorepq,
            :home_fga,
            :away_fga,
            :home_fgm,
            :away_fgm,
            :home_3pa,
            :away_3pa,
            :home_3pm,
            :away_3pm,
            :home_fta,
            :away_fta,
            :home_ftm,
            :away_ftm,
            :home_reb,
            :away_reb,
            :home_oreb,
            :away_oreb,
            :home_ast,
            :away_ast,
            :home_blk,
            :away_blk,
            :home_to,
            :away_to,
            :home_stl,
            :away_stl,
            :home_pf,
            :away_pf,
            :bookmaker,
            :home_odds,
            :away_odds,
            :home_oddsv,
            :away_oddsv)
            '''

    # Build the row first so a game that cannot be parsed never touches the
    # target database.
    try:
        row = get_row_to_db_games(game_id, year)
    except Exception as exc:
        warnings.warn(f'game {game_id} ({year}) skipped: {exc!r}', stacklevel=2)
        return

    # `with con:` commits or rolls back but does not close; close explicitly.
    con = sqlite3.connect(f'data/games_{year}.db')
    try:
        with con:
            con.execute(query, row)
    finally:
        con.close()

In [423]:
def get_row_to_db_games(game_id, year):
    """Assemble the flat `games` row for one game from its stored HTML.

    Reads the game's three HTML blobs from data/games_html_<year>.db and
    runs them through `get_data_from_game_summary`,
    `get_data_from_game_statistics` and `get_data_from_odds_comparison`.

    Args:
        game_id: identifier of the game; bound as a query parameter, so any
            characters are safe.
        year: value interpolated into the database file name.

    Returns:
        dict keyed by the named placeholders of the INSERT statement in
        `populate_db_games_per_game_id`. Statistics and odds entries are
        None when the corresponding HTML is NULL, when a statistics category
        is not on the page, or when the odds page lists no bookmaker. Only
        the first bookmaker returned by the odds parser is kept.

    Raises:
        TypeError: no row with this `game_id` exists (the query yields None,
            which cannot be unpacked).
        Exception: whatever the summary parser raises for a malformed page.
    """
    query = ('SELECT game_summary, game_statistics, odds_comparison '
             'FROM games_html WHERE game_id = ?')

    # `with sqlite3.connect(...)` would not close the connection, so manage
    # its lifetime explicitly.
    con = sqlite3.connect(f'data/games_html_{year}.db')
    try:
        record = con.execute(query, (game_id,)).fetchone()
    finally:
        con.close()
    game_summary, game_statistics, odds_comparison = record

    # --- summary ---------------------------------------------------------
    summary = get_data_from_game_summary(game_summary)
    row = {'game_id': game_id}
    for key in ('tournament', 'start_time', 'status', 'info', 'venue',
                'home_team', 'away_team', 'home_score', 'away_score'):
        row[key] = summary[key]
    # Per-quarter lists are flattened to comma-separated text.
    row['home_scorepq'] = ','.join(summary['home_score_per_quarter'])
    row['away_scorepq'] = ','.join(summary['away_score_per_quarter'])

    # --- statistics ------------------------------------------------------
    # (column suffix, category label used on the statistics page)
    stat_columns = (
        ('fga', 'Field Goals Attempted'),
        ('fgm', 'Field Goals Made'),
        ('3pa', '3-Point Field G. Attempted'),
        ('3pm', '3-Point Field Goals Made'),
        ('fta', 'Free Throws Attempted'),
        ('ftm', 'Free Throws Made'),
        ('reb', 'Total Rebounds'),
        ('oreb', 'Offensive Rebounds'),
        ('ast', 'Assists'),
        ('blk', 'Blocks'),
        ('to', 'Turnovers'),
        ('stl', 'Steals'),
        ('pf', 'Personal Fouls'),
    )
    if game_statistics is not None:
        statistics = get_data_from_game_statistics(game_statistics)
    else:
        statistics = {}
    for suffix, category in stat_columns:
        # A category missing from the page becomes a pair of NULLs rather
        # than discarding the whole game.
        home_value, away_value = statistics.get(category, (None, None))
        row[f'home_{suffix}'] = home_value
        row[f'away_{suffix}'] = away_value

    # --- odds ------------------------------------------------------------
    odds_columns = ('bookmaker', 'home_odds', 'away_odds', 'home_oddsv', 'away_oddsv')
    if odds_comparison is not None:
        odds = get_data_from_odds_comparison(odds_comparison)
    else:
        odds = {}
    if odds:
        bookmaker, quote = next(iter(odds.items()))
        odds_values = [bookmaker, quote[0], quote[1], quote[2], quote[3]]
    else:
        odds_values = [None] * len(odds_columns)
    row.update(zip(odds_columns, odds_values))

    return row

# Inspect the assembled row for one game stored in data/games_html_1993.db.
get_row_to_db_games('2w4yiBRt', 1993)

{'game_id': '2w4yiBRt',
 'tournament': 'NBA',
 'start_time': '21.04.1994 20:00',
 'status': 'Finished',
 'info': None,
 'venue': None,
 'home_team': 'Dallas Mavericks',
 'away_team': 'Houston Rockets',
 'home_score': '107',
 'away_score': '95',
 'home_scorepq': '27,20,26,34,',
 'away_scorepq': '31,23,20,21,',
 'home_fga': None,
 'away_fga': None,
 'home_fgm': None,
 'away_fgm': None,
 'home_3pa': None,
 'away_3pa': None,
 'home_3pm': None,
 'away_3pm': None,
 'home_fta': None,
 'away_fta': None,
 'home_ftm': None,
 'away_ftm': None,
 'home_reb': None,
 'away_reb': None,
 'home_oreb': None,
 'away_oreb': None,
 'home_ast': None,
 'away_ast': None,
 'home_blk': None,
 'away_blk': None,
 'home_to': None,
 'away_to': None,
 'home_stl': None,
 'away_stl': None,
 'home_pf': None,
 'away_pf': None,
 'bookmaker': None,
 'home_odds': None,
 'away_odds': None,
 'home_oddsv': None,
 'away_oddsv': None}

In [388]:
def populate_db_games_per_year(year, show_progress=False):
    """Insert every game stored in data/games_html_<year>.db into the games DB.

    Reads all game ids from the `games_html` table and hands each one to
    `populate_db_games_per_game_id`.

    Args:
        year: value interpolated into the database file name and passed on
            for every game.
        show_progress: when true, print '<year> <game_id> <n>/<total>' before
            each game is processed.
    """
    # `with sqlite3.connect(...)` would leave the connection open, so close
    # it explicitly once the ids have been read.
    con = sqlite3.connect(f'data/games_html_{year}.db')
    try:
        game_ids = [record[0] for record in con.execute('SELECT game_id FROM games_html')]
    finally:
        con.close()

    total = len(game_ids)
    for count, game_id in enumerate(game_ids, start=1):
        if show_progress:
            print(f'{year} {game_id} {count}/{total}')
        populate_db_games_per_game_id(game_id, year)

In [413]:
# Start from a fresh data/games_2021.db; an existing file is deleted first.
create_db_games_per_year(2021, remove_if_exists=True)

In [414]:
# Load every game from data/games_html_2021.db, printing one progress line per game.
populate_db_games_per_year(2021, show_progress=True)

2021 00YiQxBJ 1/313
2021 0KfPAiSB 2/313
2021 0M3dfoG7 3/313
2021 0OEwf7M9 4/313
2021 0UamFC2I 5/313
2021 0Y9I1Yat 6/313
2021 0YZcPjs5 7/313
2021 0dCxKe5P 8/313
2021 0fBYyUQA 9/313
2021 0ruMTBHB 10/313
2021 21p2cshe 11/313
2021 23RrKzH0 12/313
2021 29M0LTk0 13/313
2021 29jEfuMR 14/313
2021 2BLmsgSP 15/313
2021 2P2JWD06 16/313
2021 2RDiPjRI 17/313
2021 2RsuJUh8 18/313
2021 2g4CHHH3 19/313
2021 2grs2xIG 20/313
2021 2u3yWzBE 21/313
2021 2w2F6oV8 22/313
2021 2wrhoDFa 23/313
2021 44qR1dqs 24/313
2021 48qdTvfE 25/313
2021 4SgHT5Yj 26/313
2021 4WqTp48k 27/313
2021 4dkZg8VG 28/313
2021 4hgkHyTs 29/313
2021 6BlqmZpm 30/313
2021 6BzfQWRb 31/313
2021 6D1mWbfk 32/313
2021 6NAFZwp2 33/313
2021 6RlY3bm4 34/313
2021 6ceLBXC5 35/313
2021 6ck317ET 36/313
2021 6e4hi6VL 37/313
2021 6iEtz8eN 38/313
2021 6yt0ZzkQ 39/313
2021 88rURknO 40/313
2021 8AZdnHT7 41/313
2021 8AeS1dTf 42/313
2021 8Awwkdrf 43/313
2021 8I9Anl45 44/313
2021 8SlbbNM2 45/313
2021 8UJ4vTVT 46/313
2021 8UkxZwok 47/313
2021 8WM7mw3N 48/313
2

In [424]:
# Start from a fresh data/games_1993.db; an existing file is deleted first.
create_db_games_per_year(1993, remove_if_exists=True)

In [425]:
# Load every game from data/games_html_1993.db, printing one progress line per game.
populate_db_games_per_year(1993, show_progress=True)

1993 0086b3rr 1/1184
1993 00S1oJt5 2/1184
1993 00nyqOre 3/1184
1993 02Ri5IgK 4/1184
1993 06sjnss5 5/1184
1993 08KlIbY1 6/1184
1993 0Aa5vgqR 7/1184
1993 0CpoF3Jh 8/1184
1993 0E7z0Ieb 9/1184
1993 0EDNqZ2G 10/1184
1993 0EihUg3T 11/1184
1993 0I3Z0XxF 12/1184
1993 0I9BFaDd 13/1184
1993 0IRukFM5 14/1184
1993 0KxlfgOq 15/1184
1993 0QXzPg57 16/1184
1993 0W8NizMR 17/1184
1993 0dhEQcNS 18/1184
1993 0fTFiN0f 19/1184
1993 0h229amU 20/1184
1993 0hKrS9yE 21/1184
1993 0j9jHqio 22/1184
1993 0jDCFXs0 23/1184
1993 0jaNhVOB 24/1184
1993 0lMzKnZo 25/1184
1993 0ljhbQDI 26/1184
1993 0pSD1MDJ 27/1184
1993 0pYIAHPM 28/1184
1993 0pYL8i0C 29/1184
1993 0tEgzvGD 30/1184
1993 0tx2pOwL 31/1184
1993 0vFRtvHu 32/1184
1993 0vZMTgF2 33/1184
1993 0xOJQvvl 34/1184
1993 0zRNmhDP 35/1184
1993 0zaZ8YeQ 36/1184
1993 0zqBCeu4 37/1184
1993 21K5gkVl 38/1184
1993 250yVcdO 39/1184
1993 25vtMHVq 40/1184
1993 27aZMwRa 41/1184
1993 29jfl42p 42/1184
1993 29zPkJF0 43/1184
1993 2BFX32Ep 44/1184
1993 2BVAAbD9 45/1184
1993 2DbmkhOd 46/11

In [426]:
# Rebuild data/games_1994.db from scratch, then fill it from data/games_html_1994.db.
create_db_games_per_year(1994, remove_if_exists=True)
populate_db_games_per_year(1994, show_progress=True)

1994 04M9u1Nk 1/1180
1994 06DN4JSC 2/1180
1994 08CLdacl 3/1180
1994 0AmBPvnk 4/1180
1994 0Em8PuE8 5/1180
1994 0Gt2c7p6 6/1180
1994 0I36641g 7/1180
1994 0KGkIS3r 8/1180
1994 0KfK2m4e 9/1180
1994 0MBXYZ5J 10/1180
1994 0MmPOkrC 11/1180
1994 0Oj4uK6U 12/1180
1994 0QD8lHNH 13/1180
1994 0UBZnUCP 14/1180
1994 0Umd9xsf 15/1180
1994 0UmgB264 16/1180
1994 0WGVkdHi 17/1180
1994 0YawtGGu 18/1180
1994 0YyYcBFS 19/1180
1994 0bNA03xt 20/1180
1994 0fXnr0re 21/1180
1994 0fmNurJ3 22/1180
1994 0hWu2bTj 23/1180
1994 0p48kCQ1 24/1180
1994 0pQw5iQl 25/1180
1994 0rsWEXQ0 26/1180
1994 0t6uwNJs 27/1180
1994 0t8lRr4A 28/1180
1994 0xCIKdbF 29/1180
1994 0xWiI105 30/1180
1994 0zTcK2MM 31/1180
1994 0zX1b2mj 32/1180
1994 0zkTZsM9 33/1180
1994 2184m7kG 34/1180
1994 21nZXRkT 35/1180
1994 233FbA48 36/1180
1994 23a5ZFFD 37/1180
1994 23aX7YWa 38/1180
1994 23uAn0Hj 39/1180
1994 27HX3jOc 40/1180
1994 27kqPW6b 41/1180
1994 29hd7rEU 42/1180
1994 2DbLGZ6A 43/1180
1994 2FNSwV6a 44/1180
1994 2Hv6MXog 45/1180
1994 2LUHwWLi 46/11

In [418]:
# Rebuild data/games_2019.db from scratch, then fill it from data/games_html_2019.db.
create_db_games_per_year(2019, remove_if_exists=True)
populate_db_games_per_year(2019, show_progress=True)

2019 04AY9MQH 1/1243
2019 04EWyVlg 2/1243
2019 04W3P1Q4 3/1243
2019 06NWqT7l 4/1243
2019 08SNmVnd 5/1243
2019 08pMUspf 6/1243
2019 0C4KdCj1 7/1243
2019 0Ct67EnT 8/1243
2019 0K25bSMs 9/1243
2019 0OIanc2L 10/1243
2019 0OhHFQdt 11/1243
2019 0QqgBB5f 12/1243
2019 0WS5N3Wj 13/1243
2019 0Yke26n2 14/1243
2019 0biDILrH 15/1243
2019 0bunGp2C 16/1243
2019 0bwDiZRm 17/1243
2019 0f3PWD8B 18/1243
2019 0fRHXihL 19/1243
2019 0fRbPnem 20/1243
2019 0hrWViDe 21/1243
2019 0jg7GUKl 22/1243
2019 0jsEYrcr 23/1243
2019 0lOYPXQN 24/1243
2019 0rFo486e 25/1243
2019 0rwP7xa2 26/1243
2019 0tUeyYWp 27/1243
2019 0tnHiwMA 28/1243
2019 0vPR6Oq5 29/1243
2019 0xbY0dHE 30/1243
2019 0zJHv26k 31/1243
2019 21fKT68F 32/1243
2019 239nFmFk 33/1243
2019 23DU5Pnb 34/1243
2019 254cQjUs 35/1243
2019 27ZAAATR 36/1243
2019 27eZLywd 37/1243
2019 29fPuyjB 38/1243
2019 29rr00Fq 39/1243
2019 2BNWmx9M 40/1243
2019 2BNbMfQ8 41/1243
2019 2Bik5PRK 42/1243
2019 2BoDBhUt 43/1243
2019 2D66CEPd 44/1243
2019 2D82d76n 45/1243
2019 2DKKhrIO 46/12

In [427]:
# Rebuild data/games_2020.db from scratch, then fill it from data/games_html_2020.db.
create_db_games_per_year(2020, remove_if_exists=True)
populate_db_games_per_year(2020, show_progress=True)

2020 02NHldmG 1/1221
2020 02V2kA9q 2/1221
2020 08AbiHMt 3/1221
2020 0Au5XMV7 4/1221
2020 0GnBUpdp 5/1221
2020 0I1SnIy9 6/1221
2020 0IgJNMqg 7/1221
2020 0KSYWtMD 8/1221
2020 0OKIO75Q 9/1221
2020 0QBJAGVN 10/1221
2020 0QFJdDSK 11/1221
2020 0SYzUmPF 12/1221
2020 0Sf6qTO2 13/1221
2020 0SliE43s 14/1221
2020 0UJJnbg2 15/1221
2020 0W8Ol9rh 16/1221
2020 0WODU3P1 17/1221
2020 0WhZeIVM 18/1221
2020 0bF48oMb 19/1221
2020 0bTboznU 20/1221
2020 0dUQGTla 21/1221
2020 0hd3HeiA 22/1221
2020 0hfwd8VM 23/1221
2020 0lb6m9k7 24/1221
2020 0lhEX6xG 25/1221
2020 0rBIP28A 26/1221
2020 0rBLNxq0 27/1221
2020 0vFiwtDj 28/1221
2020 0veKrvLP 29/1221
2020 0vwVGXrM 30/1221
2020 0zTMGwka 31/1221
2020 0zknQjvQ 32/1221
2020 214rVJ99 33/1221
2020 21Ql7BaC 34/1221
2020 21l3d93p 35/1221
2020 23LLDMl3 36/1221
2020 25nmoWa7 37/1221
2020 27NRCoef 38/1221
2020 29ECokmK 39/1221
2020 29xRzmih 40/1221
2020 2BDwUqVh 41/1221
2020 2BLMcgJ0 42/1221
2020 2BNSmiA3 43/1221
2020 2BaGL1Tc 44/1221
2020 2DUrfeJI 45/1221
2020 2DvXZBcq 46/12