In [1]:
import os
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Root of the site; box score hrefs on the schedule page are site-relative.
_BASE_URL = 'https://www.basketball-reference.com'

# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 30

# data-stat keys of the <td> cells read from the basic box score table,
# in the order they appear in the CSV (after the Player column).
_BASIC_STAT_KEYS = [
    'mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct',
    'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus',
]

# data-stat keys of the <td> cells read from the advanced box score table,
# in CSV order (the player name is used only for matching and is not repeated).
_ADVANCED_STAT_KEYS = [
    'mp', 'ts_pct', 'efg_pct', 'fg3a_per_fga_pct', 'fta_per_fga_pct',
    'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct',
    'tov_pct', 'usg_pct', 'off_rtg', 'def_rtg', 'bpm',
]

# Column names of the per-game CSV: 2 id columns + Player + 20 basic + 16 advanced.
_CSV_COLUMNS = [
    'Game_number', 'Team', 'Player',
    'MP_basic', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-',
    'MP_adv', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
    'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM',
]


def _fetch_soup(url, request_delay):
    """Download ``url`` and return it parsed, or None (after printing why) on failure."""
    # Throttle every request, including the first one of a call, because this
    # function is typically invoked back-to-back for many teams and seasons.
    if request_delay and request_delay > 0:
        time.sleep(request_delay)
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Request for {url} failed: {exc}")
        return None
    if response.status_code != 200:
        # e.g. 404 for an unknown team/season or 429 when rate-limited.
        print(f"Request for {url} returned HTTP status {response.status_code}")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def _is_header_row(row):
    """Return True for a <tr> carrying the 'thead' class (a header row repeated inside <tbody>)."""
    # NOTE(review): relies on the site marking in-body header rows (column
    # headers, the "Reserves" divider) with class="thead" - confirm against
    # the current markup.
    return 'thead' in (row.get('class') or [])


def _cell_text(row, data_stat, tag='td'):
    """Return the stripped text of the ``tag`` cell with the given data-stat, or '' if absent."""
    cell = row.find(tag, {'data-stat': data_stat})
    return cell.text.strip() if cell is not None else ''


def _parse_schedule(table):
    """Return a list of dicts (number, date, opponent, link) for the games in the schedule table."""
    games = []
    for row in table.tbody.find_all('tr'):
        if _is_header_row(row):
            continue
        game_number = _cell_text(row, 'g', tag='th')
        # Belt and braces: a repeated header row has the literal text "G" here.
        if game_number == 'G':
            continue

        link_cell = row.find('td', {'data-stat': 'box_score_text'})
        anchor = link_cell.a if link_cell is not None else None
        href = anchor.get('href', '') if anchor is not None else ''

        games.append({
            'number': game_number,
            'date': _cell_text(row, 'date_game'),
            'opponent': _cell_text(row, 'opp_name'),
            'link': _BASE_URL + href if href else '',
        })
    return games


def _find_team_table(soup, team, kind):
    """Find the team's full-game stats table; ``kind`` is 'basic' or 'advanced'."""
    return soup.find(
        'table',
        class_='sortable stats_table',
        id=lambda x: x and f'{team}-game-{kind}' in x,
    )


def _extract_player_rows(stats_table, stat_keys):
    """Return one ``[player, stat, ...]`` list per body row of a box score table."""
    body = stats_table.tbody
    if body is None:
        return []
    player_rows = []
    for row in body.find_all('tr'):
        if _is_header_row(row):
            continue
        player = _cell_text(row, 'player', tag='th')
        # Cells that are missing for a row are recorded as empty strings.
        player_rows.append([player] + [_cell_text(row, key) for key in stat_keys])
    return player_rows


def scrape_nba_data_team_boxscores(team, season, request_delay=3.5):
    """
    Scrapes NBA box score data for a specific team and season from Basketball-Reference.com.

    Parameters:
        team (str): The Basketball-Reference abbreviation of the team (e.g., 'GSW').
            It must match the case used in the site's table ids (upper case).
        season (str or int): The season as used in the site's URLs, i.e. the year in
            which the season ends (e.g., 2022 for the 2021-22 season).
        request_delay (float): Seconds to sleep before every HTTP request. The default
            keeps the scraper below the request rate the site's bot policy allows
            (about 20 requests per minute at the time of writing); pass 0 to disable.

    Returns:
        None

    Description:
        The function downloads the team's schedule page
        ("/teams/{team}/{season}_games.html"), reads the game number, date, opponent
        and box score link of every game row, and then downloads each box score.

        From each box score it reads the team's full-game basic and advanced tables,
        joins them on the player name (the first advanced row with a matching name
        wins; players missing from the advanced table are dropped) and writes
        the result to
        "{season}/{team}/{game_number}_{mm_dd_YYYY}_{team}_vs_{opponent}.csv".

        The CSV file contains the following columns:
        - Game_number: The game number of the season.
        - Team: The abbreviation of the team.
        - Player: The name of the player.
        - MP_basic: Minutes played (basic stats).
        - FG, FGA, FG%: Field goals made, attempted, and percentage.
        - 3P, 3PA, 3P%: 3-pointers made, attempted, and percentage.
        - FT, FTA, FT%: Free throws made, attempted, and percentage.
        - ORB, DRB, TRB: Offensive, defensive, and total rebounds.
        - AST, STL, BLK, TOV, PF, PTS: Assists, steals, blocks, turnovers,
          personal fouls, points.
        - +/-: Plus-minus.
        - MP_adv: Minutes played (advanced stats).
        - TS%, eFG%: True shooting and effective field goal percentage.
        - 3PAr, FTr: 3-point and free throw attempt rate.
        - ORB%, DRB%, TRB%, AST%, STL%, BLK%, TOV%, USG%: Rate stats.
        - ORtg, DRtg: Offensive and defensive rating.
        - BPM: Box plus-minus.

        All values are written as the text shown on the page; a cell that is missing
        for a row is written as an empty string.

        Problems never raise out of the scraping loop: failed requests, games without
        a box score link (e.g. not yet played), unparseable dates and missing tables
        are reported with a printed message and the affected game is skipped.

    Example:
        scrape_nba_data_team_boxscores('GSW', '2022')
        This will scrape the box scores of the Golden State Warriors for the
        2021-22 season into the directory "2022/GSW".
    """
    url = f'{_BASE_URL}/teams/{team}/{season}_games.html'
    soup = _fetch_soup(url, request_delay)
    if soup is None:
        return None

    # Regular-season schedule table.
    table = soup.find('table', class_='sortable stats_table', id='games')
    if table is None or table.tbody is None:
        print(f"No table found for team {team} in season {season}")
        return None

    directory = f"{season}/{team}"

    for game in _parse_schedule(table):
        link = game['link']
        if not link:
            # Rows without a link cannot be scraped; requesting '' would raise.
            print(f"No box score link for game {game['number']} of {team} in season {season}; skipping")
            continue

        # Validate the date before spending a request on the box score.
        try:
            date_obj = datetime.strptime(game['date'], '%a, %b %d, %Y')
        except ValueError:
            print(f"Unrecognized date {game['date']!r} for {link}; skipping")
            continue

        box_score_soup = _fetch_soup(link, request_delay)
        if box_score_soup is None:
            continue

        basic_table = _find_team_table(box_score_soup, team, 'basic')
        if basic_table is None:
            print(f"No basic stats table found for {link}")
            continue
        # Table ids look like "<prefix>-<TEAM>-game-basic"; the second piece is the team.
        team_abbr = basic_table['id'].split('-')[1].upper()
        if team_abbr != team:
            print(f"Skipping box score for {link} as it belongs to a different team ({team_abbr})")
            continue

        advanced_table = _find_team_table(box_score_soup, team, 'advanced')
        if advanced_table is None:
            print(f"No advanced stats table found for {link}")
            continue
        adv_team_abbr = advanced_table['id'].split('-')[1].upper()
        if adv_team_abbr != team:
            print(f"Skipping advanced stats table for {link} as it belongs to a different team ({adv_team_abbr})")
            continue

        basic_rows = _extract_player_rows(basic_table, _BASIC_STAT_KEYS)
        advanced_rows = _extract_player_rows(advanced_table, _ADVANCED_STAT_KEYS)

        # Index advanced rows by player; setdefault keeps the first row per name.
        advanced_by_player = {}
        for advanced_row in advanced_rows:
            advanced_by_player.setdefault(advanced_row[0], advanced_row)

        combined_stats_data = [
            [game['number'], team_abbr] + basic_row + advanced_by_player[basic_row[0]][1:]
            for basic_row in basic_rows
            if basic_row[0] in advanced_by_player
        ]
        df_stats = pd.DataFrame(combined_stats_data, columns=_CSV_COLUMNS)

        # Create the directory structure if it doesn't exist
        os.makedirs(directory, exist_ok=True)

        # Underscore-separated date so it is safe inside a file name.
        mm_dd_YYYY = date_obj.strftime('%m_%d_%Y')
        filename = f"{directory}/{game['number']}_{mm_dd_YYYY}_{team_abbr}_vs_{game['opponent']}.csv"

        df_stats.to_csv(filename, index=False)

        print(f"DataFrame saved to {filename}")

In [1]:
# Three-letter team codes passed to the scraper (30 entries).
nba_teams = [
    'ATL', 'BOS', 'BRK', 'CHI', 'CHO', 'CLE',
    'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
    'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
    'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHO',
    'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]

# Season values 2018 through 2024 inclusive (the range end is exclusive).
seasons = range(2018, 2025)

# Scrape season by season, visiting every team within each season.
for year in seasons:
    for team in nba_teams:
        scrape_nba_data_team_boxscores(team, year)