In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
from typing import List
from io import StringIO
import datetime as dt

In [93]:
def generate_url(date: dt.datetime, site: str) -> str:
    '''
    Generates the box score url for a game played on a given date at a given site.

    Args:
        date (dt.datetime): Date of the game.
        site (str): Three capital letter location of game.

    Returns:
        str: Box score url on basketball-reference, of the form
            ".../boxscores/<YYYYMMDD>0<site>.html".
    '''
    # The "0" between the date and the site code is a literal part of the path.
    return f'https://www.basketball-reference.com/boxscores/{date:%Y%m%d}0{site}.html'

def clean_game_table(game_table: pd.DataFrame) -> pd.DataFrame:
    '''
    Puts game table in clean format for PostgreSQL.

    Rows whose "Starters" value is "Reserves" or "Team Totals" and rows whose
    "MP" value is "Did Not Play" are removed, then every remaining NaN is
    replaced with 0. The input dataframe is left unmodified.

    Args:
        game_table (pd.DataFrame): Uncleaned game dataframe from parsing basketball-reference.
            Must contain the "Starters" and "MP" columns.

    Returns:
        pd.DataFrame: Cleaned table (a new dataframe; the original index labels are kept).

    Raises:
        KeyError: If the "Starters" or "MP" column is missing.
    '''

    # Rows that are section separators / aggregates rather than players
    non_player_rows = game_table['Starters'].isin(['Reserves', 'Team Totals'])

    # Rows for players that did not play
    did_not_play_rows = game_table['MP'] == 'Did Not Play'

    # Keep only the real player rows
    player_rows = game_table[~(non_player_rows | did_not_play_rows)]

    # Fill NaN with 0; fillna returns a new frame, so the filtered selection
    # is never modified in place.
    return player_rows.fillna(0)
    

In [None]:
def scrape_NBA_game_data(date: dt.datetime, site: str) -> List[pd.DataFrame]:
    '''
    Gets player data for an NBA game on a given date, played at a given site.

    Args:
        date (dt.datetime): Date of the game.
        site (str): Three capital letter location of game.

    Returns:
        List[pd.DataFrame]: Cleaned list of player dataframes, one per table whose
            id matches "box-.*-game-basic" (presumably one per team).

    Raises:
        urllib.error.URLError: If the box score page cannot be fetched.
        ValueError: Raised by pandas.read_html when no table is found.
    '''

    # URL to scrape, built from the requested date and site
    url = generate_url(date, site)

    # collect HTML data, create beautiful Soup object; the context manager
    # closes the HTTP response once the page has been parsed
    with urlopen(url) as response:
        soup = BeautifulSoup(response, features='html.parser')
    tables = soup.find_all('table', {'id': re.compile('box-.*-game-basic')})

    # Convert the tables into a string and wrap it with StringIO
    html_io = StringIO("\n".join(str(table) for table in tables))

    # Use read_html with the StringIO object; header=1 takes the second header
    # row as the column names
    raw_tables = pd.read_html(html_io, header=1)

    # remove 'Reserves' and 'Team Totals' rows, fill in NaN
    return [clean_game_table(raw_table) for raw_table in raw_tables]

In [None]:
def get_all_nba_games(year: int) -> List[str]:
    '''
    Retrieve URLs for NBA games for a specific season.

    Args:
        year (int): Season year used in the basketball-reference schedule url.
            NOTE(review): previously documented as the year the season began;
            confirm which year the site uses to label a season.

    Returns:
        List[str]: Box score urls linked from the season schedule page, in page
            order and without duplicates. Empty if the page links no box scores.

    Raises:
        urllib.error.URLError: If the schedule page cannot be fetched.
    '''

    base_url = 'https://www.basketball-reference.com'

    # URL to scrape
    url = f'{base_url}/leagues/NBA_{year}_games.html'

    # The context manager closes the HTTP response once the page has been parsed
    with urlopen(url) as response:
        soup = BeautifulSoup(response, features='html.parser')

    # Box score paths have the shape "/boxscores/<YYYYMMDD>0<SITE>.html"
    # (assumes the page uses site-relative links -- TODO confirm)
    box_score_path = re.compile(r'^/boxscores/\d{8}0[A-Z]{3}\.html$')
    links = soup.find_all('a', href=box_score_path)

    # NOTE(review): an earlier draft of this function defined the month names
    # October..June without using them, presumably to visit one schedule page
    # per month. Only the page fetched above is scanned here -- confirm whether
    # it covers the whole season and extend if it does not.

    # dict.fromkeys drops repeated links while preserving page order
    return list(dict.fromkeys(base_url + link['href'] for link in links))

In [91]:
# Smoke test: scrape the box score for site 'MIL' on 2021-11-09.
# The previous call targeted scrape_NBA_team_data, which is not defined in this notebook.
test = scrape_NBA_game_data(dt.datetime(2021, 11, 9), 'MIL')

In [None]:
# Reference urls on basketball-reference:
# https://www.basketball-reference.com/leagues/NBA_2023_totals.html
# https://www.basketball-reference.com/leagues/NBA_%(year)s_games.html