In [1]:
import bs4
import os
import sqlite3

In [99]:
def create_db_games_per_year(year):
    """Create ``data/games/games_{year}.db`` and its ``games`` table.

    The table has one row per game, keyed by ``game_id``, with columns for the
    game summary, the home/away team statistics and the bookmaker odds.

    The call is idempotent: the target directory is created when missing and
    an already existing ``games`` table is left untouched, so re-running the
    cell does not raise.

    Args:
        year: Value interpolated into the database file name.
    """
    db_path = f'data/games/games_{year}.db'
    # sqlite3 creates a missing database file, but not a missing directory.
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    query = '''CREATE TABLE IF NOT EXISTS games (
        game_id TEXT NOT NULL PRIMARY KEY,
        tournament TEXT,
        start_time TEXT,
        status TEXT,
        info TEXT,
        venue TEXT,
        home_team TEXT,
        away_team TEXT,
        home_score INTEGER,
        away_score INTEGER,
        home_scorepq TEXT,
        away_scorepq TEXT,
        home_FGA INTEGER,
        away_FGA INTEGER,
        home_FGM INTEGER,
        away_FGM INTEGER,
        home_2PA INTEGER,
        away_2PA INTEGER,
        home_2PM INTEGER,
        away_2PM INTEGER,
        home_3PA INTEGER,
        away_3PA INTEGER,
        home_3PM INTEGER,
        away_3PM INTEGER,
        home_FTA INTEGER,
        away_FTA INTEGER,
        home_FTM INTEGER,
        away_FTM INTEGER,
        home_OREB INTEGER,
        away_OREB INTEGER,
        home_DREB INTEGER,
        away_DREB INTEGER,
        home_REB INTEGER,
        away_REB INTEGER,
        home_AST INTEGER,
        away_AST INTEGER,
        home_BLK INTEGER,
        away_BLK INTEGER,
        home_TO INTEGER,
        away_TO INTEGER,
        home_STL INTEGER,
        away_STL INTEGER,
        home_PF INTEGER,
        away_PF INTEGER,
        home_TF INTEGER,
        away_TF INTEGER,
        bookmaker TEXT,
        home_odds REAL,
        away_odds REAL,
        home_oddsv TEXT,
        away_oddsv TEXT)
        '''

    con = sqlite3.connect(db_path)
    try:
        # `with con` only commits (or rolls back on error); it does not close
        # the connection, hence the explicit close() below.
        with con:
            con.execute(query)
    finally:
        con.close()

In [149]:
def get_data_from_summary_html(summary_html):
    """Pull the headline fields of a game out of its summary page markup.

    Args:
        summary_html: Markup of the summary page, or ``None`` when it is not
            available.

    Returns:
        dict with the keys ``tournament``, ``start_time``, ``status``,
        ``info``, ``venue``, ``home_team``, ``away_team``, ``home_score``,
        ``away_score``, ``home_scorepq`` and ``away_scorepq``. A value is the
        text of the matching element, or ``None`` when the element is not
        found. Every value is ``None`` when ``summary_html`` is ``None``.
    """
    summary_data = dict.fromkeys([
        'tournament',
        'start_time',
        'status',
        'info',
        'venue',
        'home_team',
        'away_team',
        'home_score',
        'away_score',
        'home_scorepq',
        'away_scorepq',
    ])

    if summary_html is None:
        return summary_data

    soup = bs4.BeautifulSoup(summary_html, 'html.parser')

    # (result key, tag name, attribute filter) for every field that is the
    # text of the first matching element.
    single_element_fields = (
        ('tournament', 'a', {'href': '/basketball/usa/nba/'}),
        ('start_time', 'div', {'class': 'duelParticipant__startTime'}),
        ('status', 'div', {'class': 'detailScore__status'}),
        ('info', 'div', {'class': 'infoBox__info'}),
        ('venue', 'div', {'class': 'mi__data'}),
        ('home_team', 'div', {'class': 'duelParticipant__home'}),
        ('away_team', 'div', {'class': 'duelParticipant__away'}),
    )
    for field, tag_name, attrs in single_element_fields:
        element = soup.find(tag_name, attrs)
        if element is not None:
            summary_data[field] = element.text

    # Score cells: the third cell is stored as the score, the cells after it
    # are joined with spaces into the "scorepq" string. Rows with three cells
    # or fewer are ignored.
    for side in ('home', 'away'):
        cell_texts = [
            cell.text for cell in soup.find_all('div', {'class': f'smh__{side}'})
        ]
        if len(cell_texts) > 3:
            summary_data[f'{side}_score'] = cell_texts[2]
            summary_data[f'{side}_scorepq'] = ' '.join(cell_texts[3:]).strip()

    return summary_data

In [156]:
def get_data_from_stats_html(stats_html):
    """Read the home/away team statistics from a game's statistics page.

    Args:
        stats_html: Markup of the statistics page, or ``None`` when it is not
            available.

    Returns:
        dict with a ``home_<ABBR>`` and an ``away_<ABBR>`` key for each
        abbreviation in the label table below. Values are the text of the
        matching value elements; statistics that do not appear on the page
        stay ``None``. Every value is ``None`` when ``stats_html`` is ``None``.
    """
    # Label shown on the page -> abbreviation used in the result keys.
    label_to_abbreviation = {
        'Field Goals Attempted': 'FGA',
        'Field Goals Made': 'FGM',
        '2-Point Field G. Attempted': '2PA',
        '2-Point Field Goals Made': '2PM',
        '3-Point Field G. Attempted': '3PA',
        '3-Point Field Goals Made': '3PM',
        'Free Throws Attempted': 'FTA',
        'Free Throws Made': 'FTM',
        'Offensive Rebounds': 'OREB',
        'Defensive Rebounds': 'DREB',
        'Total Rebounds': 'REB',
        'Assists': 'AST',
        'Blocks': 'BLK',
        'Turnovers': 'TO',
        'Steals': 'STL',
        'Personal Fouls': 'PF',
        'Technical Fouls': 'TF',
    }

    stats_data = {}
    for abbreviation in label_to_abbreviation.values():
        stats_data[f'home_{abbreviation}'] = None
        stats_data[f'away_{abbreviation}'] = None

    if stats_html is None:
        return stats_data

    soup = bs4.BeautifulSoup(stats_html, 'html.parser')
    labels = soup.find_all('div', {'class': 'statCategoryName'})
    home_cells = soup.find_all('div', {'class': 'statHomeValue'})
    away_cells = soup.find_all('div', {'class': 'statAwayValue'})

    # The three lists are paired by position; zip stops at the shortest one.
    for label, home_cell, away_cell in zip(labels, home_cells, away_cells):
        abbreviation = label_to_abbreviation.get(label.text)
        if abbreviation is None:
            # Statistic that is not part of the table schema.
            continue
        stats_data[f'home_{abbreviation}'] = home_cell.text
        stats_data[f'away_{abbreviation}'] = away_cell.text

    return stats_data

In [159]:
def get_data_from_odds_html(odds_html):
    """Read the bookmaker and the home/away odds from a game's odds page.

    Only the first ``ui-table__row`` element is inspected, and it is used only
    when it holds exactly two ``oddsCell__odd`` links (home first, away
    second).

    Args:
        odds_html: Markup of the odds page, or ``None`` when it is not
            available.

    Returns:
        dict with the keys ``bookmaker``, ``home_odds``, ``away_odds``,
        ``home_oddsv`` and ``away_oddsv``. ``*_odds`` is the text of an odds
        link and ``*_oddsv`` its ``title`` attribute; ``bookmaker`` is the
        ``title`` of the row's ``prematchLogo`` image. Anything that cannot be
        found is ``None`` -- a missing logo or ``title`` attribute no longer
        raises.
    """
    odds_data = dict.fromkeys([
        'bookmaker',
        'home_odds',
        'away_odds',
        'home_oddsv',
        'away_oddsv',
    ])

    if odds_html is None:
        return odds_data

    soup = bs4.BeautifulSoup(odds_html, 'html.parser')
    row = soup.find('div', {'class': 'ui-table__row'})
    if row is None:
        return odds_data

    # Look the odds links up once instead of re-scanning the row per field.
    odds_cells = row.find_all('a', {'class': 'oddsCell__odd'})
    if len(odds_cells) != 2:
        return odds_data

    # The logo is optional: keep the odds even when the bookmaker is unknown,
    # like the other parsers do for elements they cannot find.
    logo = row.find('img', {'class': 'prematchLogo'})
    if logo is not None:
        odds_data['bookmaker'] = logo.get('title')

    home_cell, away_cell = odds_cells
    odds_data['home_odds'] = home_cell.text
    odds_data['home_oddsv'] = home_cell.get('title')
    odds_data['away_odds'] = away_cell.text
    odds_data['away_oddsv'] = away_cell.get('title')

    return odds_data

In [163]:
def populate_db_games_per_game_id(game_id, year):
    """Insert the parsed data of one game into ``data/games/games_{year}.db``.

    The row is obtained from ``get_row_db_games(game_id, year)`` and bound to
    the named placeholders of the INSERT statement, so it must provide every
    key listed in the statement. The placeholders are matched to the table
    columns by position, i.e. they follow the column order of ``games``.

    Args:
        game_id: Identifier of the game to insert.
        year: Value interpolated into the database file name.

    Raises:
        sqlite3.Error: Propagated unchanged from sqlite3, for instance when
            the ``games`` table is missing or ``game_id`` is already stored.
    """
    # Build the row before touching the games database so that a failure
    # while reading/parsing the HTML happens with no connection open.
    row = get_row_db_games(game_id, year)

    query = '''INSERT INTO games VALUES (
        :game_id,
        :tournament,
        :start_time,
        :status,
        :info,
        :venue,
        :home_team,
        :away_team,
        :home_score,
        :away_score,
        :home_scorepq,
        :away_scorepq,
        :home_FGA,
        :away_FGA,
        :home_FGM,
        :away_FGM,
        :home_2PA,
        :away_2PA,
        :home_2PM,
        :away_2PM,
        :home_3PA,
        :away_3PA,
        :home_3PM,
        :away_3PM,
        :home_FTA,
        :away_FTA,
        :home_FTM,
        :away_FTM,
        :home_OREB,
        :away_OREB,
        :home_DREB,
        :away_DREB,
        :home_REB,
        :away_REB,
        :home_AST,
        :away_AST,
        :home_BLK,
        :away_BLK,
        :home_TO,
        :away_TO,
        :home_STL,
        :away_STL,
        :home_PF,
        :away_PF,
        :home_TF,
        :away_TF,
        :bookmaker,
        :home_odds,
        :away_odds,
        :home_oddsv,
        :away_oddsv)
        '''

    con = sqlite3.connect(f'data/games/games_{year}.db')
    try:
        # `with con` commits on success and rolls back on error, but leaves
        # the connection open -- close it explicitly.
        with con:
            con.execute(query, row)
    finally:
        con.close()

In [164]:
def get_row_db_games(game_id, year):
    """Build the ``games`` row of one game from its stored HTML pages.

    Reads ``summary_html``, ``stats_html`` and ``odds_html`` for ``game_id``
    from the ``html`` table of ``data/html/html_{year}.db`` and merges the
    dicts produced by the three ``get_data_from_*_html`` parsers.

    Args:
        game_id: Identifier of the game in the ``html`` table.
        year: Value interpolated into the database file name.

    Returns:
        dict with ``game_id`` plus every key returned by the three parsers.

    Raises:
        TypeError: No row exists for ``game_id``. The exception type is the
            one the previous implementation produced when unpacking the empty
            result; it now carries an explanatory message.
        sqlite3.Error: Propagated unchanged from sqlite3.
    """
    db_path = f'data/html/html_{year}.db'
    # Bind the id as a parameter. The former f-string with a double-quoted
    # value was open to SQL injection, and SQLite resolves "..." as an
    # identifier first, so an id equal to a column name matched every row.
    query = 'SELECT summary_html, stats_html, odds_html FROM html WHERE game_id = ?'

    con = sqlite3.connect(db_path)
    try:
        # str() mirrors the previous string interpolation for non-str ids.
        fetched = con.execute(query, (str(game_id),)).fetchone()
    finally:
        # The connection context manager would not close it for us.
        con.close()

    if fetched is None:
        raise TypeError(f'no html row found for game_id {game_id!r} in {db_path}')
    summary_html, stats_html, odds_html = fetched

    return {
        'game_id': game_id,
        **get_data_from_summary_html(summary_html),
        **get_data_from_stats_html(stats_html),
        **get_data_from_odds_html(odds_html),
    }