In [None]:
import bs4
import sqlite3

In [None]:
def create_db_games(year):
    """Create the ``games`` table in ``data/games/games_{year}.db``.

    The table holds one row per game: summary fields, home/away box-score
    statistics and odds columns, keyed by the game ``id``.

    Args:
        year: Season identifier interpolated into the database file name.

    Raises:
        sqlite3.OperationalError: If the database file cannot be opened or the
            ``games`` table already exists in it.
    """
    query = '''CREATE TABLE games (
        id TEXT NOT NULL PRIMARY KEY,
        tournament TEXT,
        season INTEGER,
        date TEXT,
        status TEXT,
        info TEXT,
        venue TEXT,
        attendance INTEGER,
        home_team TEXT,
        away_team TEXT,
        home_score INTEGER,
        away_score INTEGER,
        home_scorepq TEXT,
        away_scorepq TEXT,
        home_FGA INTEGER,
        away_FGA INTEGER,
        home_FGM INTEGER,
        away_FGM INTEGER,
        home_2PA INTEGER,
        away_2PA INTEGER,
        home_2PM INTEGER,
        away_2PM INTEGER,
        home_3PA INTEGER,
        away_3PA INTEGER,
        home_3PM INTEGER,
        away_3PM INTEGER,
        home_FTA INTEGER,
        away_FTA INTEGER,
        home_FTM INTEGER,
        away_FTM INTEGER,
        home_REB INTEGER,
        away_REB INTEGER,
        home_OREB INTEGER,
        away_OREB INTEGER,
        home_DREB INTEGER,
        away_DREB INTEGER,
        home_AST INTEGER,
        away_AST INTEGER,
        home_BLK INTEGER,
        away_BLK INTEGER,
        home_STL INTEGER,
        away_STL INTEGER,
        home_TO INTEGER,
        away_TO INTEGER,
        home_PF INTEGER,
        away_PF INTEGER,
        home_TF INTEGER,
        away_TF INTEGER,
        bookmaker TEXT,
        home_odds REAL,
        away_odds REAL,
        home_oddsv TEXT,
        away_oddsv TEXT)
        '''
    con = sqlite3.connect(f'data/games/games_{year}.db')
    try:
        # ``with con`` only commits (or rolls back on error); it does not
        # close the connection, hence the explicit close() below.
        with con:
            con.execute(query)
    finally:
        con.close()

In [None]:
def populate_db_games(year):
    """Insert a games row for every id found in the sources database of ``year``.

    Args:
        year: Season identifier; forwarded both to the id lookup and to the
            per-id insertion.
    """
    populate_db_games_per_ids(get_ids_db_games(year), year)

In [None]:
def get_ids_db_games(year):
    """Return every game id stored in ``data/sources/sources_{year}.db``.

    Args:
        year: Season identifier interpolated into the database file name.

    Returns:
        list: The ``id`` value of each row of the ``sources`` table, in the
        order SQLite returns them (no ``ORDER BY`` is applied).

    Raises:
        sqlite3.OperationalError: If the database cannot be opened or has no
            ``sources`` table.
    """
    con = sqlite3.connect(f'data/sources/sources_{year}.db')
    try:
        rows = con.execute('SELECT id FROM sources').fetchall()
    finally:
        # The sqlite3 context manager never closes the connection, so the
        # read-only lookup closes it explicitly instead.
        con.close()
    # Each fetched row is a 1-tuple; unwrap it to the bare id.
    return [row[0] for row in rows]

In [None]:
def populate_db_games_per_ids(ids, year):
    """Parse the stored sources of each game id and insert one row per game.

    Progress is printed as ``<id> <position>/<total>``. Each game is written
    and committed on its own connection, so rows inserted before a failure are
    kept.

    Args:
        ids: Sequence of game ids to process.
        year: Season identifier; selects ``data/games/games_{year}.db`` and is
            forwarded to ``get_values_db_games``.

    Raises:
        sqlite3.IntegrityError: If a row with the same ``id`` already exists.
    """
    # The columns are named explicitly so that every value lands in the column
    # of the same name. A positional ``INSERT ... VALUES`` depends on the
    # table's declaration order, which differs from the order the values used
    # to be listed in (REB/OREB/DREB and STL/TO were stored shifted).
    columns = (
        'id',
        'tournament',
        'season',
        'date',
        'status',
        'info',
        'venue',
        'attendance',
        'home_team',
        'away_team',
        'home_score',
        'away_score',
        'home_scorepq',
        'away_scorepq',
        'home_FGA',
        'away_FGA',
        'home_FGM',
        'away_FGM',
        'home_2PA',
        'away_2PA',
        'home_2PM',
        'away_2PM',
        'home_3PA',
        'away_3PA',
        'home_3PM',
        'away_3PM',
        'home_FTA',
        'away_FTA',
        'home_FTM',
        'away_FTM',
        'home_OREB',
        'away_OREB',
        'home_DREB',
        'away_DREB',
        'home_REB',
        'away_REB',
        'home_AST',
        'away_AST',
        'home_BLK',
        'away_BLK',
        'home_TO',
        'away_TO',
        'home_STL',
        'away_STL',
        'home_PF',
        'away_PF',
        'home_TF',
        'away_TF',
        'bookmaker',
        'home_odds',
        'away_odds',
        'home_oddsv',
        'away_oddsv',
    )
    column_list = ', '.join(columns)
    placeholders = ', '.join(f':{column}' for column in columns)
    query = f'INSERT INTO games ({column_list}) VALUES ({placeholders})'

    db_path = f'data/games/games_{year}.db'
    total = len(ids)
    for i, id_ in enumerate(ids):
        print(f'{id_} {i+1}/{total}')
        values = get_values_db_games(id_, year)
        values['id'] = id_
        con = sqlite3.connect(db_path)
        try:
            # ``with con`` commits the insert (or rolls it back on error) but
            # leaves the connection open, so it is closed explicitly.
            with con:
                con.execute(query, values)
        finally:
            con.close()

In [None]:
def get_values_db_games(id_, year):
    """Load the stored sources of one game and parse them into row values.

    Args:
        id_: Game id to look up in the ``sources`` table.
        year: Season identifier; selects ``data/sources/sources_{year}.db``
            and is stored under the ``season`` key of the result.

    Returns:
        dict: ``season`` plus every key produced by the summary, stats and
        odds parsers.

    Raises:
        TypeError: If no row with ``id_`` exists (``fetchone()`` then returns
            ``None``, which cannot be unpacked).
    """
    # The id is bound as a parameter. When it was interpolated between double
    # quotes, SQLite could resolve a value such as "id" as a column name
    # (matching every row), and an id containing a quote broke the statement.
    query = '''SELECT summary_source, stats_source, odds_source
        FROM sources
        WHERE id = ?
        '''
    con = sqlite3.connect(f'data/sources/sources_{year}.db')
    try:
        row = con.execute(query, (id_,)).fetchone()
    finally:
        # The sqlite3 context manager does not close connections.
        con.close()
    summary_source, stats_source, odds_source = row

    summary_values = get_values_summary_source(summary_source)
    stats_values = get_values_stats_source(stats_source)
    odds_values = get_values_odds_source(odds_source)

    values = {'season': year, **summary_values, **stats_values, **odds_values}
    return values

In [None]:
def get_values_summary_source(summary_source):
    """Extract the summary fields of a game from its summary markup.

    Every field starts as ``None`` and is overwritten only when the matching
    element is found, so the result always carries the same keys.

    Args:
        summary_source: Markup parsed with ``html.parser``, or ``None``.

    Returns:
        dict: ``tournament``, ``date``, ``status``, ``info``, ``venue``,
        ``attendance`` and, for each side, ``*_team``, ``*_score`` and
        ``*_scorepq`` (the remaining score cells joined with spaces). All
        extracted values are strings.
    """
    summary_data = dict.fromkeys((
        'tournament',
        'date',
        'status',
        'info',
        'venue',
        'attendance',
        'home_team',
        'away_team',
        'home_score',
        'away_score',
        'home_scorepq',
        'away_scorepq',
    ))
    if summary_source is None:
        return summary_data

    soup = bs4.BeautifulSoup(summary_source, 'html.parser')

    # Fields copied verbatim from the text of the first matching element.
    text_fields = (
        ('tournament', 'a', {'href': '/basketball/usa/nba/'}),
        ('date', 'div', {'class': 'duelParticipant__startTime'}),
        ('status', 'div', {'class': 'detailScore__status'}),
        ('info', 'div', {'class': 'infoBox__info'}),
    )
    for field, tag_name, attrs in text_fields:
        node = soup.find(tag_name, attrs)
        if node is not None:
            summary_data[field] = node.text

    # Venue and attendance are read from the first "mi__data" element as
    # "Label: value", where the value runs up to the next comma.
    match_info = soup.find('div', {'class': 'mi__data'})
    if match_info is not None:
        match_info_text = match_info.text

        def value_after(label):
            return match_info_text.split(label)[1].split(',')[0]

        if 'Venue: ' in match_info_text:
            summary_data['venue'] = value_after('Venue: ')
        if 'Attendance: ' in match_info_text:
            # Spaces inside the number are dropped.
            summary_data['attendance'] = value_after('Attendance: ').replace(' ', '')

    # Per side: the first matching cell is skipped, then come the team name,
    # the final score and the remaining score cells. A side is filled only
    # when at least one of those remaining cells is present.
    for side in ('home', 'away'):
        cells = [cell.text for cell in soup.find_all('div', {'class': f'smh__{side}'})[1:]]
        if len(cells) <= 2:
            continue
        team, score, *periods = cells
        summary_data[f'{side}_team'] = team
        summary_data[f'{side}_score'] = score
        summary_data[f'{side}_scorepq'] = ' '.join(periods).strip()

    return summary_data

In [None]:
def get_values_stats_source(stats_source):
    """Extract home/away box-score statistics from a game's stats markup.

    Args:
        stats_source: Markup parsed with ``html.parser``, or ``None``.

    Returns:
        dict: ``home_<suffix>`` and ``away_<suffix>`` for each known
        statistic. A value is the cell text (a string) when the statistic is
        present in the markup and ``None`` otherwise.
    """
    # Category label as shown in the markup -> column suffix. The insertion
    # order also fixes the key order of the returned dictionary.
    suffix_by_label = {
        'Field Goals Attempted': 'FGA',
        'Field Goals Made': 'FGM',
        '2-Point Field G. Attempted': '2PA',
        '2-Point Field Goals Made': '2PM',
        '3-Point Field G. Attempted': '3PA',
        '3-Point Field Goals Made': '3PM',
        'Free Throws Attempted': 'FTA',
        'Free Throws Made': 'FTM',
        'Offensive Rebounds': 'OREB',
        'Defensive Rebounds': 'DREB',
        'Total Rebounds': 'REB',
        'Assists': 'AST',
        'Blocks': 'BLK',
        'Turnovers': 'TO',
        'Steals': 'STL',
        'Personal Fouls': 'PF',
        'Technical Fouls': 'TF',
    }
    stats_data = {
        f'{side}_{suffix}': None
        for suffix in suffix_by_label.values()
        for side in ('home', 'away')
    }
    if stats_source is None:
        return stats_data

    soup = bs4.BeautifulSoup(stats_source, 'html.parser')
    labels = soup.find_all('div', {'class': 'statCategoryName'})
    home_cells = soup.find_all('div', {'class': 'statHomeValue'})
    away_cells = soup.find_all('div', {'class': 'statAwayValue'})

    # The three result lists are paired by position; categories without a
    # known label are skipped.
    for label, home_cell, away_cell in zip(labels, home_cells, away_cells):
        suffix = suffix_by_label.get(label.text)
        if suffix is None:
            continue
        stats_data[f'home_{suffix}'] = home_cell.text
        stats_data[f'away_{suffix}'] = away_cell.text

    return stats_data

In [None]:
def get_values_odds_source(odds_source):
    """Extract bookmaker and home/away odds from a game's odds markup.

    Only the first ``ui-table__row`` element is read, and only when it holds
    exactly two ``oddsCell__odd`` links (home first, away second).

    Args:
        odds_source: Markup parsed with ``html.parser``, or ``None``.

    Returns:
        dict: ``bookmaker`` (``title`` of the row's ``prematchLogo`` image),
        ``home_odds`` / ``away_odds`` (link text) and ``home_oddsv`` /
        ``away_oddsv`` (the links' non-empty ``title`` attribute). Anything
        that is not found stays ``None``.
    """
    odds_data = dict.fromkeys((
        'bookmaker',
        'home_odds',
        'away_odds',
        'home_oddsv',
        'away_oddsv',
    ))
    if odds_source is None:
        return odds_data

    soup = bs4.BeautifulSoup(odds_source, 'html.parser')
    row = soup.find('div', {'class': 'ui-table__row'})
    if row is None:
        return odds_data

    # Search the row once instead of repeating the lookup for every field.
    cells = row.find_all('a', {'class': 'oddsCell__odd'})
    if len(cells) != 2:
        return odds_data

    # A missing logo or ``title`` attribute leaves the field as None rather
    # than raising: ``tag['title']`` raises KeyError when the attribute is
    # absent, whereas ``tag.get('title')`` returns None.
    logo = row.find('img', {'class': 'prematchLogo'})
    if logo is not None:
        odds_data['bookmaker'] = logo.get('title')

    for side, cell in zip(('home', 'away'), cells):
        odds_data[f'{side}_odds'] = cell.text
        title = cell.get('title')
        if title:
            odds_data[f'{side}_oddsv'] = title

    return odds_data

In [None]:
# Build the 2011 games database: create the empty games table, then fill it
# from the sources stored for that season.
create_db_games(2011)
populate_db_games(2011)

In [None]:
import gzip
import shutil

def compress_db_sources(year):
    """Gzip ``test.db`` into ``test.db.gz`` in the current working directory.

    Args:
        year: Currently unused; both file names are hard-coded.
    """
    with open('test.db', 'rb') as plain, gzip.open('test.db.gz', 'wb') as packed:
        shutil.copyfileobj(plain, packed)

# Scratch invocation: compress_db_sources() ignores its argument and always
# compresses test.db into test.db.gz.
compress_db_sources(1)

In [None]:
import tarfile

# Pack test.db into a bzip2-compressed tar archive. test.db is added by path:
# it is a plain file, not a tar archive, so opening it with tarfile in 'r:'
# mode fails, and TarFile.add() expects a file name rather than a TarFile.
with tarfile.open('test.db.tar.bz2', 'w:bz2') as comp_db:
    comp_db.add('test.db')