<a href="https://colab.research.google.com/github/arielba2002/Deep-Picker-Project/blob/main/model/DataScraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# NBA Statistics Processing
--------------
This notebook fetches NBA team and player data from a custom API, then
organizes and prepares the data for further analysis or modeling (e.g.,
train/test split). Finally, it saves the raw player data and extracted
metadata to JSON files.

Reference: NBA Stats API Github
https://github.com/swar/nba_api

# Import Modules

Import required packages

In [None]:
# =========================
# 1. Imports & Constants
# =========================
!pip install nba_api

import json
import time
import requests
from collections import defaultdict
from nba_api.stats.static import teams as nba_teams
from nba_api.stats.endpoints import teamyearbyyearstats
from requests.exceptions import ReadTimeout, RequestException



# Team Abbreviation Mapping

Create a dictionary to standardize team abbreviations across seasons.

In [None]:
# Mapping from outdated or alternate team abbreviations to the abbreviation
# used for the current franchise. fetch_team_season_stats() applies it before
# looking a team up with nba_api, so historical keys resolve to a team.
TEAM_ABBR_FIX = {
    "NJN": "BKN",  # New Jersey Nets -> Brooklyn Nets
    "BRK": "BKN",  # Alternate Brooklyn Nets abbreviation -> BKN
    "CHH": "CHA",  # Charlotte Hornets (original) -> Charlotte Hornets (new)
    "CHO": "CHA",  # Alternate Charlotte Hornets abbreviation -> CHA
    "VAN": "MEM",  # Vancouver Grizzlies -> Memphis Grizzlies
    "SEA": "OKC",  # Seattle SuperSonics -> Oklahoma City Thunder
    "NOH": "NOP",  # New Orleans Hornets -> New Orleans Pelicans
    "NOK": "NOP",  # Temporary NO/OKC Hornets -> Pelicans
    "WSB": "WAS",  # Washington Bullets -> Washington Wizards
    "PHO": "PHX"  # Alternate Phoenix Suns abbreviation -> PHX
}

# Helper Functions

## Fetch Teams by Season

 Retrieve all teams for each NBA season through the REST API

In [None]:
def fetch_teams_by_season(api_base_url, seasons, timeout=30):
    """Return the distinct team abbreviations found in each season's player totals.

    Args:
        api_base_url: Base URL of the REST API, without a trailing slash.
        seasons: Iterable of season identifiers; each is interpolated into
            the ``/PlayerDataTotals/season/{season}`` path.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A dict mapping every requested season to a sorted list of the
        ``team`` values present in the response. A season whose request
        fails, or whose payload cannot be parsed, maps to an empty list.
    """
    teams_by_season = {}
    for season in seasons:
        season_url = f"{api_base_url}/PlayerDataTotals/season/{season}"
        try:
            # requests has no default timeout; without one a stalled
            # connection would block the whole collection run.
            response = requests.get(season_url, timeout=timeout)
            response.raise_for_status()
            team_set = {player["team"] for player in response.json()}
            teams_by_season[season] = sorted(team_set)
        except (
            requests.exceptions.RequestException,
            ValueError,  # body is not valid JSON
            KeyError,  # a record has no "team" field
            TypeError,  # payload is not a list of dicts / unsortable teams
        ) as e:
            print(f"Error fetching teams for season {season}: {e}")
            teams_by_season[season] = []
    return teams_by_season

## Fetch Team Season Stats

Collect comprehensive team statistics with built-in retry logic

In [None]:
def fetch_team_season_stats(team_abbr, season, max_retries=20, initial_delay=5, max_delay=120):
    """Fetch one team's regular-season per-game statistics for a season.

    The abbreviation is upper-cased and normalised through TEAM_ABBR_FIX,
    resolved to a team id via nba_api's static team list, and the team's
    year-by-year stats are then searched for the requested season.

    Args:
        team_abbr: Team abbreviation (any case; historical codes allowed).
        season: Starting year of the season (e.g. 1999 for "1999-00").
        max_retries: Maximum number of request attempts.
        initial_delay: Seconds to wait after the first failed attempt; the
            wait doubles after every further failure.
        max_delay: Upper bound, in seconds, for a single wait. Without a cap
            the doubling reaches days of sleep within the default 20 attempts.

    Returns:
        A list of 17 values in the order PTS, AST, REB, BLK, STL, WIN_PCT,
        CONF_RANK, FGM, FGA, FG3M, FG3A, FTM, FTA, OREB, DREB, PF, TOV
        (0 for any missing column), or an empty list when the team is
        unknown, the season is not in the response, or all attempts failed.
    """
    team_abbr = team_abbr.upper()
    team_abbr = TEAM_ABBR_FIX.get(team_abbr, team_abbr)

    team_info = nba_teams.find_team_by_abbreviation(team_abbr)
    if not team_info:
        print(f"Could not find team info for {team_abbr}")
        return []

    team_id = team_info["id"]
    start_year = int(season)
    # e.g. 1999 -> "1999-00": start year plus last two digits of the next year.
    formatted_season = f"{start_year}-{str(start_year + 1)[-2:]}"

    # Columns returned to the caller, in output order.
    label_fields = (
        "PTS", "AST", "REB", "BLK", "STL", "WIN_PCT", "CONF_RANK",
        "FGM", "FGA", "FG3M", "FG3A", "FTM", "FTA",
        "OREB", "DREB", "PF", "TOV",
    )

    for attempt in range(1, max_retries + 1):
        try:
            stats = teamyearbyyearstats.TeamYearByYearStats(
                team_id=team_id,
                per_mode_simple="PerGame",
                league_id="00",
                season_type_all_star="Regular Season",
                timeout=50
            ).get_dict()
        except RequestException as e:
            # ReadTimeout is a RequestException, so one handler covers both.
            if attempt == max_retries:
                # No point sleeping when there is no further attempt.
                break
            delay = min(initial_delay * (2 ** (attempt - 1)), max_delay)
            print(
                f"{type(e).__name__} on {team_abbr}_{season}, "
                f"retry {attempt}/{max_retries} (waiting {delay}s): {e}"
            )
            time.sleep(delay)
            continue

        result_set = stats["resultSets"][0]
        headers = result_set["headers"]
        for row in result_set["rowSet"]:
            # Column 3 is compared against the "YYYY-YY" season string
            # (presumably the YEAR column -- confirm against the endpoint).
            if row[3] == formatted_season:
                stats_dict = dict(zip(headers, row))
                return [stats_dict.get(field, 0) for field in label_fields]
        # The request succeeded but the team has no row for this season.
        return []

    print(f"Failed to fetch {team_abbr}_{season} after {max_retries} retries.")
    return []

## Fetch Player Stats

Gather player roster data for each team-season combination

In [None]:
def fetch_players_in_teamseason(api_base_url, teams_by_season, timeout=30):
    """Collect player records and team labels for every team-season pair.

    Args:
        api_base_url: Base URL of the REST API, without a trailing slash.
        teams_by_season: Mapping of season start year (int) to an iterable
            of team abbreviations.
        timeout: Seconds to wait on each player-data HTTP request.

    Returns:
        A dict keyed by ``"{team}_{season}"`` (e.g. ``"BOS_1999"`` for
        1999-00) whose values are ``{"labels": [...], "players": [...]}``.
        Either list is empty when the corresponding fetch failed.
    """
    team_season_data = {}
    failed_teams = []

    for season, teams in teams_by_season.items():
        for team in teams:
            team_season_key = f"{team}_{season}"  # BOS_1999 (1999-2000)
            print(f"\nFetching data for {team_season_key}")

            # The REST API labels a season by its ending year (API 1999 is
            # 1998-99), so the roster for key year Y is API season Y + 1.
            player_season = season + 1
            query_url = f"{api_base_url}/PlayerDataTotals/query?team={team}&season={player_season}&pageSize=20"

            try:
                # requests has no default timeout; set one so a stalled
                # connection cannot block the whole run.
                response = requests.get(query_url, timeout=timeout)
                response.raise_for_status()
                players_data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                print(f"Error fetching players for {team} in {player_season}: {e}")
                players_data = []

            # Pause between calls to avoid hammering the APIs.
            time.sleep(1.5)

            # Team stats for the same season as the key.
            try:
                team_labels = fetch_team_season_stats(team, season)
            except Exception as e:
                print(f"Error fetching team stats for {team_season_key}: {e}")
                team_labels = []

            # fetch_team_season_stats reports most failures by returning an
            # empty list rather than raising, so treat empty labels as failed.
            if not team_labels:
                failed_teams.append(team_season_key)

            team_season_data[team_season_key] = {
                "labels": team_labels,
                "players": players_data
            }

            time.sleep(1.5)

    if failed_teams:
        print("\n❌ Failed team stats for the following keys:")
        for key in failed_teams:
            print(f" - {key}")

    return team_season_data

## Create Metadata

 Generate structured metadata that indexes all teams, players, and their
relationships across seasons

In [None]:
def create_metadata(team_season_data):
    """Summarise a team-season dataset into lookup-friendly metadata.

    Args:
        team_season_data: Dict keyed by ``"{team}_{season}"`` whose values
            may hold a ``"labels"`` list and a ``"players"`` list of dicts.

    Returns:
        A dict with the sorted team names, sorted player names, sorted teams
        per season, player names per team-season key (in roster order), the
        sorted union of all player-record keys, and the labels per key.
        Players without a ``"playerName"`` are recorded as ``"Unknown"``.
    """
    team_names = set()
    player_names = set()
    season_to_teams = defaultdict(set)
    roster_by_key = {}
    stat_fields = set()
    labels_by_key = {}

    for key, entry in team_season_data.items():
        team, season = key.split("_")
        key_labels = entry.get("labels", [])
        roster = entry.get("players", [])

        team_names.add(team)
        season_to_teams[season].add(team)

        # Names stay in roster order for the per-key listing.
        roster_names = [record.get("playerName", "Unknown") for record in roster]
        player_names.update(roster_names)
        for record in roster:
            stat_fields.update(record.keys())

        roster_by_key[key] = roster_names
        labels_by_key[key] = key_labels

    sorted_teams_per_season = {}
    for yr, teams in season_to_teams.items():
        sorted_teams_per_season[yr] = sorted(teams)

    return {
        "all_teams": sorted(team_names),
        "all_players": sorted(player_names),
        "teams_per_season": sorted_teams_per_season,
        "players_per_team_season": roster_by_key,
        "player_stats_attributes": sorted(stat_fields),
        "team_labels": labels_by_key,
    }

# Collect NBA Stats by Season

Define the API endpoint and season range (1996–2024; `range(1996, 2025)` excludes its end value), then call the data collection functions and save the raw team and player data, plus the extracted metadata, to JSON files.

In [None]:
# Root of the REST API and the season start years to collect
# (range end is exclusive, so this covers 1996 through 2024).
API_BASE_URL = "http://rest.nbaapi.com/api"
SEASON_RANGE = range(1996, 2025)

# Discover which teams appear in each season.
teams_by_season_data = fetch_teams_by_season(API_BASE_URL, SEASON_RANGE)

# Report how many teams were found per season.
print("\nTeam counts per season:")
for season_year in teams_by_season_data:
    print(f"{season_year}: {len(teams_by_season_data[season_year])}")

# Pull player records and team labels, then derive the metadata index.
players_in_teamseason_data = fetch_players_in_teamseason(API_BASE_URL, teams_by_season_data)
metadata_dict = create_metadata(players_in_teamseason_data)

# Write both artefacts as indented JSON.
for output_name, payload in (
    ("team_player_data.json", players_in_teamseason_data),
    ("metadata.json", metadata_dict),
):
    with open(output_name, "w") as f:
        json.dump(payload, f, indent=2)

print("\n✅ Finished processing and saved data.")


Team counts per season:
1996: 30
1997: 30
1998: 30
1999: 30
2000: 30
2001: 30
2002: 30
2003: 30
2004: 30
2005: 31
2006: 31
2007: 31
2008: 31
2009: 31
2010: 31
2011: 31
2012: 31
2013: 31
2014: 31
2015: 31
2016: 31
2017: 31
2018: 31
2019: 31
2020: 31
2021: 31
2022: 31
2023: 31
2024: 31

Fetching data for ATL_1996

Fetching data for BOS_1996

Fetching data for CHH_1996

Fetching data for CHI_1996

Fetching data for CLE_1996

Fetching data for DAL_1996

Fetching data for DEN_1996

Fetching data for DET_1996

Fetching data for GSW_1996

Fetching data for HOU_1996

Fetching data for IND_1996

Fetching data for LAC_1996

Fetching data for LAL_1996

Fetching data for MIA_1996

Fetching data for MIL_1996

Fetching data for MIN_1996

Fetching data for NJN_1996

Fetching data for NYK_1996

Fetching data for ORL_1996

Fetching data for PHI_1996

Fetching data for PHO_1996

Fetching data for POR_1996

Fetching data for SAC_1996

Fetching data for SAS_1996

Fetching data for SEA_1996

Fetching data

# Generate Previous Year Data

Creates historical performance tracking by linking current team rosters to players' previous season statistics

In [None]:
# Link every team-season roster to each player's record from the preceding
# season. The result keeps the team's labels and replaces the player list
# with the matching previous-season records.
data = players_in_teamseason_data
previous_year_data = {}

# Index all records once by (season year, playerId) so each lookup is O(1)
# instead of rescanning the whole dataset per player. TOT entries (a
# player's combined line, per the key naming used in this dataset) are kept
# in a separate index because they take precedence over single-team records.
tot_records_by_year = {}
team_records_by_year = {}
for team_season_key, team_season in data.items():
    key_parts = team_season_key.split('_')
    team_code, year = key_parts[0], key_parts[1]
    year_index = (
        tot_records_by_year if team_code == 'TOT' else team_records_by_year
    ).setdefault(year, {})
    for record in team_season['players']:
        # setdefault keeps the first record seen for a player in that year,
        # so a player is never matched (and appended) more than once.
        year_index.setdefault(record['playerId'], record)

for key, value in data.items():
    # Key year is a string such as "1999"; the previous season is "1998".
    previous_year = str(int(key.split('_')[1]) - 1)
    tot_records = tot_records_by_year.get(previous_year, {})
    team_records = team_records_by_year.get(previous_year, {})

    previous_players = []
    for player in value['players']:
        player_id = player['playerId']

        # Prefer the TOT record; fall back to a single-team record only
        # when the player has no TOT entry for the previous year.
        match = tot_records.get(player_id)
        if match is None:
            match = team_records.get(player_id)

        if match is None:
            print(f"player {player['playerName']} {player['season'] - 1} not found")
            continue
        previous_players.append(match)

    previous_year_data[key] = {
        'labels': value['labels'],
        'players': previous_players,
    }



player Alan Henderson1996not found
player Anthony Miller1996not found
player Christian Laettner1996not found
player Darrin Hancock1996not found
player Derrick Alston1996not found
player Dikembe Mutombo*1996not found
player Donnie Boyce1996not found
player Eldridge Recasner1996not found
player Henry James1996not found
player Ivano Newbill1996not found
player Jon Barry1996not found
player Ken Norman1996not found
player Mookie Blaylock1996not found
player Priest Lauderdale1996not found
player Steve Smith1996not found
player Tyrone Corbin1996not found
player Willie Burton1996not found
player Alton Lister1996not found
player Antoine Walker1996not found
player Brett Szabo1996not found
player Dana Barros1996not found
player David Wesley1996not found
player Dee Brown1996not found
player Dino Radja*1996not found
player Eric Williams1996not found
player Frank Brickowski1996not found
player Greg Minor1996not found
player Marty Conlon1996not found
player Michael Hawkins1996not found
player Nate Dr

## Clean Data

Replace null values with 0

In [None]:
# Replace every null (None) stat in the linked player records with 0.
# Distinct loop names are used on purpose: reusing `key`/`value` for the inner
# loop would shadow the outer team-season variables.
# NOTE: the player dicts are the same objects stored in `data`, so the source
# dataset is updated in place as well.
for team_season in previous_year_data.values():
    for player in team_season['players']:
        # Assigning to an existing key does not resize the dict, so it is
        # safe to do while iterating over its items.
        for stat_name, stat_value in player.items():
            if stat_value is None:
                player[stat_name] = 0

## Select Top 8 Players

Filters each team's roster to retain only the top 8 players by minutes played, focusing
analysis on the most significant contributors

In [None]:
# Keep, for each team-season, only the eight player records with the highest
# minutes per game (minutesPg); the labels are carried over unchanged.

top_8_players_previous_year = {}
for team_season_key, team_entry in previous_year_data.items():
    trimmed_entry = {'labels': team_entry['labels'], 'players': []}
    top_8_players_previous_year[team_season_key] = trimmed_entry

    # Rank the roster from most to fewest minutes and store the ranking back.
    ranked_players = sorted(
        team_entry['players'],
        key=lambda record: record['minutesPg'],
        reverse=True,
    )
    team_entry['players'] = ranked_players

    # The first eight of the ranking are the top contributors.
    trimmed_entry['players'] = ranked_players[:8]


previous_year_data = top_8_players_previous_year

## Save Final Data

 Exports the final processed dataset to a JSON file

In [None]:
# Persist the trimmed per-team dataset as compact JSON.
with open('previous_year_data_modified.json', 'w') as out_file:
    json.dump(previous_year_data, out_file)

# Validate the Results

In [None]:
# Reload the exported dataset so the checks run against what was written.
with open('previous_year_data_modified.json', 'r') as file:
    data = json.load(file)

# Per-team tallies, in the order the teams are visited.
players_per_team = []
teams_per_year = {}
labels_per_team = []
team_year_keys = []

for team_year, team_data in data.items():
    key_parts = team_year.split('_')
    team, year = key_parts[0], key_parts[1]

    # 1996 teams and TOT entries are excluded from the checks.
    if team == "TOT" or year == "1996":
        continue

    team_year_keys.append(team_year)

    # Roster size: warn when fewer than 8 players are present.
    if "players" not in team_data:
        players_per_team.append(0)
        print(f"Team {team_year} has no players.")
    else:
        roster_size = len(team_data["players"])
        players_per_team.append(roster_size)
        if roster_size < 8:
            print(f"Team {team_year} has less than 8 players. they have {roster_size} players.")

    # Label count: warn when fewer than 17 labels are present.
    if "labels" not in team_data:
        labels_per_team.append(0)
        print(f"Team {team_year} has no labels.")
    else:
        label_count = len(team_data["labels"])
        labels_per_team.append(label_count)
        if label_count < 17:
            print(f"Team {team_year} has less then 17 labels.")

    # Tally how many teams were checked for this year.
    teams_per_year[year] = teams_per_year.get(year, 0) + 1

# Team counts as a list, ordered by year.
sorted_years = sorted(teams_per_year)
teams_per_year_list = [teams_per_year[yr] for yr in sorted_years]

# Summary report.
print("Number of players in each team (excluding 1996 teams and TOT teams):")
print(players_per_team)

print("\nNumber of teams in each year (excluding 1996 and TOT teams):")
print(teams_per_year_list)

print("\nNumber of labels for each team (excluding 1996 teams and TOT teams):")
print(labels_per_team)


Team WSB_1997 has less than 8 players. they have 0 players.
Team WAS_1998 has less than 8 players. they have 4 players.
Team VAN_2001 has less than 8 players. they have 0 players.
Team CHH_2002 has less than 8 players. they have 0 players.
Team CHH_2002 has less then 17 labels.
Team MEM_2002 has less than 8 players. they have 3 players.
Team NOH_2003 has less than 8 players. they have 6 players.
Team CHA_2005 has less than 8 players. they have 4 players.
Team NOH_2005 has less than 8 players. they have 0 players.
Team NOK_2007 has less than 8 players. they have 0 players.
Team NOH_2008 has less than 8 players. they have 5 players.
Team SEA_2008 has less than 8 players. they have 0 players.
Team OKC_2009 has less than 8 players. they have 5 players.
Team NJN_2012 has less than 8 players. they have 0 players.
Team NOH_2013 has less than 8 players. they have 0 players.
Team CHA_2014 has less than 8 players. they have 0 players.
Number of players in each team (excluding 1996 teams and TOT 