In [2]:
import os
import re
import time
import glob

import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

from constants import (
    DATA_DIRECTORY,
    SEASON_DATES,
    TIME_SLEEP,
    URL,
    BOXSCORES_FORMAT,
    DATAFRAME_COlUMNS,
    URL_ROTO_FORMAT,
)


In [5]:
# Scrape the basketball game data since the 2014-15 season
def _fetch_soup(url):
    """Download *url* and parse it with lxml, closing the HTTP response afterwards."""
    with urlopen(url=url) as response:
        return BeautifulSoup(response, "lxml")


def _minutes_to_decimal(value):
    """Convert an 'MM:SS' string to decimal minutes; any other value becomes 0.0."""
    if isinstance(value, str) and ":" in value:
        parts = value.split(":")
        return round(int(parts[0]) + int(parts[1]) / 60, 2)
    return 0.0


def get_boxes(season, date_list):
    """
    Scrape per-player box score rows for every game on the given dates and
    write one CSV per team per game.

    For each date, the daily scoreboard page (URL + URL_ROTO_FORMAT) is
    fetched, every game summary on it is followed to its "Box Score" page,
    and for both the winning and the losing team a CSV is written to
    ``DATA_DIRECTORY/Boxscores/<season>/<date>-<TEAM>.csv``.

    Each row is laid out as:
    ``[player name, date, team, home flag, winner, winner score, loser,
    loser score, <advanced-table cells after the first>]`` and is labelled
    with ``DATAFRAME_COlUMNS``.

    Args:
        season (str): The season to scrape (e.g., '2014-15'). Used for the
            progress message and as the output sub-directory name.
        date_list (list): A list of dates in the format 'YYYYMMDD'.

    Returns:
        None. Results are written to disk.
    """
    print(f"Scraping boxscores from the {season} regular season")

    # Make sure the destination exists so the first to_csv() does not fail.
    output_dir = os.path.join(DATA_DIRECTORY, "Boxscores", season)
    os.makedirs(output_dir, exist_ok=True)

    # Rows whose first cell is one of these carry no stats and are skipped.
    injury_keywords = ("Did Not Play", "Not With Team")

    for date in tqdm(date_list):
        url = URL + URL_ROTO_FORMAT.format(
            month=date[4:6], day=date[6:8], year=date[0:4]
        )
        soup = _fetch_soup(url)
        games = soup.find_all("div", class_="game_summary expanded nohover")
        for game in games:
            # Team abbreviations are sliced out of the team links' href
            # (characters 7..9); the second link of the second table is
            # treated as the host team.
            host = game.find_all("table")[1].find_all("a")[1]["href"][7:10]
            winner = game.find("tr", class_="winner").find_all("td")
            loser = game.find("tr", class_="loser").find_all("td")
            summary = {
                "winner": [
                    winner[0].find("a")["href"][7:10],
                    int(winner[1].get_text()),
                ],
                "loser": [
                    loser[0].find("a")["href"][7:10],
                    int(loser[1].get_text()),
                ],
            }
            url_game = URL + game.find("a", string="Box Score")["href"]
            soup_game = _fetch_soup(url_game)

            # Iterate through both the winner and loser of the game
            for team in ("winner", "loser"):
                team_code = summary[team][0]
                home = 1 if team_code == host else 0

                # The stat tables are identified by the upper-cased team code.
                table_team = team_code.upper()
                basic_id = "box-{team}-game-basic".format(team=table_team)
                advanced_id = "box-{team}-game-advanced".format(team=table_team)

                game_data = [
                    date,
                    team_code,
                    home,
                    summary["winner"][0],
                    summary["winner"][1],
                    summary["loser"][0],
                    summary["loser"][1],
                ]
                # class_=None drops the header/separator rows inside <tbody>.
                basic_rows = (
                    soup_game.find("table", id=basic_id)
                    .find("tbody")
                    .find_all("tr", class_=None)
                )
                advanced_rows = (
                    soup_game.find("table", id=advanced_id)
                    .find("tbody")
                    .find_all("tr", class_=None)
                )

                # NOTE(review): only the advanced table's cells (minus its
                # first one) are appended to each row, as in the original
                # code; confirm this matches DATAFRAME_COlUMNS.
                player_data = []
                for i, basic_row in enumerate(basic_rows):
                    if basic_row.find("td").get_text() in injury_keywords:
                        continue
                    player_name = basic_row.find("a").get_text()
                    advanced_cells = [
                        td.get_text()
                        for td in advanced_rows[i].find_all("td")[1:]
                    ]
                    player_data.append(
                        [player_name] + game_data + advanced_cells
                    )

                df = pd.DataFrame(player_data, columns=DATAFRAME_COlUMNS)
                df = df.fillna(0)
                df["MP"] = [_minutes_to_decimal(t) for t in df["MP"]]
                df.to_csv(
                    os.path.join(output_dir, date + "-" + team_code + ".csv"),
                    index=False,
                )

            # Throttle requests: pause after each game's box score page.
            time.sleep(TIME_SLEEP)
    return None
