<a href="https://colab.research.google.com/github/arielba2002/Deep-Picker-Project/blob/15-all-necessary-datasets-are-downloaded-cleaned-and-stored-in-a-central-repository/DeepPicker_DataScraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Notebook Goal:
--------------
This notebook fetches NBA team and player data from a custom API, then
organizes and prepares the data for further analysis or modeling (e.g.,
train/test split). Finally, it saves the raw player data and extracted
metadata to JSON files.

Reference: NBA Stats API Docs
https://documenter.getpostman.com/view/24232555/2s93shzpR3#358b5496-a459-4e05-9f9d-e924633454fb

In [35]:
# =========================
# 1. Imports
# =========================

import requests
import json
from collections import defaultdict

In [36]:
# =========================
# 2. Data Fetching
# =========================

def fetch_teams_by_season(api_base_url, seasons, timeout=30):
    """
    Fetch unique team names for each season from the given API endpoint,
    excluding non-team entries like "TOT".

    Parameters
    ----------
    api_base_url : str
        The base URL of the API.
    seasons : iterable
        A collection of season years (int).
    timeout : float or tuple, optional
        Timeout (in seconds) passed to ``requests.get`` so that a stalled
        connection cannot hang the whole run. Defaults to 30.

    Returns
    -------
    dict
        Dictionary mapping each season to a sorted list of unique team names.
        Seasons whose request or JSON decoding failed map to an empty list.
        Example: {1996: ["LAL", "BOS", ...], 1997: [...], ...}
    """
    teams_by_season = {}

    # Define non-team entries to exclude
    non_team_entries = {"TOT"}

    for season in seasons:
        season_url = f"{api_base_url}/PlayerDataTotals/season/{season}"
        try:
            response = requests.get(season_url, timeout=timeout)
            response.raise_for_status()
            records = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions that do
            # not wrap decode errors in RequestException.
            print(f"Error fetching teams for season {season}: {e}")
            teams_by_season[season] = []
            continue

        # Skip records without a usable "team" value instead of raising
        # KeyError and aborting the remaining seasons.
        team_set = {
            player["team"]
            for player in records
            if isinstance(player, dict) and player.get("team")
        }
        teams_by_season[season] = sorted(team_set - non_team_entries)

    return teams_by_season

def fetch_players_in_teamseason(api_base_url, teams_by_season, page_size=20, timeout=30):
    """
    Fetch all players belonging to each (team, season) combination.

    Parameters
    ----------
    api_base_url : str
        The base URL of the API.
    teams_by_season : dict
        A dictionary mapping seasons to lists of teams.
    page_size : int, optional
        Value sent as the ``pageSize`` query parameter. Defaults to 20.
    timeout : float or tuple, optional
        Timeout (in seconds) passed to ``requests.get`` so that a stalled
        connection cannot hang the whole run. Defaults to 30.

    Returns
    -------
    dict
        Mapping of "TEAM_SEASON" string (e.g., "LAL_2020") to a list of player stats.
        Combinations whose request or JSON decoding failed map to an empty list.
        Example: {"LAL_2020": [{...player1...}, {...player2...}], ...}
    """
    players_in_teamseason = {}
    query_url = f"{api_base_url}/PlayerDataTotals/query"

    for season, teams in teams_by_season.items():
        for team in teams:
            team_season_key = f"{team}_{season}"
            # Let requests build and URL-encode the query string.
            # NOTE(review): only one page is requested; if a team has more
            # than `page_size` player rows in a season the result may be
            # truncated - confirm the API's paging behaviour.
            query_params = {"team": team, "season": season, "pageSize": page_size}
            try:
                response = requests.get(query_url, params=query_params, timeout=timeout)
                response.raise_for_status()
                players_in_teamseason[team_season_key] = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests versions that
                # do not wrap decode errors in RequestException.
                print(f"Error fetching players for {team} in {season}: {e}")
                players_in_teamseason[team_season_key] = []

    return players_in_teamseason



**Important Note**

By default, most of these player stats are season totals rather than per-game values. For example, there is a total for points scored over the entire season, not a Points per Game stat. We can easily convert these totals to per-game values by dividing the specific stats by the "Games Played" stat.



In [37]:
# =========================
# 3. Metadata Creation
# =========================

def create_metadata(players_in_teamseason):
    """
    Generate metadata from the fetched player data. The metadata includes:
      - A list of all teams encountered.
      - A list of all unique player names.
      - A mapping from season -> list of teams in that season.
      - A mapping from (team, season) -> list of player names.
      - A list of all encountered player-stat attributes.

    Parameters
    ----------
    players_in_teamseason : dict
        Mapping of "TEAM_SEASON" -> list of player dictionaries.

    Returns
    -------
    dict
        A dictionary containing multiple metadata components:
        {
            "all_teams": [...],
            "all_players": [...],
            "teams_per_season": { "1996": ["LAL", ...], "1997": [...], ... },
            "players_per_team_season": { "LAL_2020": ["LeBron James", "AD", ...], ... },
            "player_stats_attributes": [...]
        }
        Season keys in "teams_per_season" are strings, because they are
        parsed back out of the "TEAM_SEASON" keys.

    Raises
    ------
    ValueError
        If a key contains no "_" separator at all.
    """
    all_teams = set()
    all_players = set()
    teams_per_season = defaultdict(set)
    players_per_team_season = {}
    player_stats_attributes = set()

    for team_season_key, players_list in players_in_teamseason.items():
        # team_season_key format: "TEAM_SEASON". Split on the LAST underscore
        # only, so a team identifier that itself contains "_" stays intact.
        team, season = team_season_key.rsplit("_", 1)

        # Add team to overall set and season-specific set
        all_teams.add(team)
        teams_per_season[season].add(team)

        # Collect player names for the current team/season
        player_names_in_this_team_season = []

        for player_dict in players_list:
            # Records without a name are grouped under the "Unknown" label.
            player_name = player_dict.get("playerName", "Unknown")
            all_players.add(player_name)
            player_names_in_this_team_season.append(player_name)

            # Collect all attribute keys
            player_stats_attributes.update(player_dict.keys())

        players_per_team_season[team_season_key] = player_names_in_this_team_season

    # Convert sets to sorted lists for JSON serialization
    metadata = {
        "all_teams": sorted(all_teams),
        "all_players": sorted(all_players),
        "teams_per_season": {yr: sorted(teams) for yr, teams in teams_per_season.items()},
        "players_per_team_season": players_per_team_season,
        "player_stats_attributes": sorted(player_stats_attributes),
    }

    return metadata

In [38]:
# =========================
# 4. Main Execution
# =========================

# --- Run configuration ---
API_BASE_URL = "http://b8c40s8.143.198.70.30.sslip.io/api"
SEASON_RANGE = range(1996, 2025)  # 1996 through 2024; the stop value is exclusive

# --- Step 1: collect the team list for every season ---
teams_by_season_data = fetch_teams_by_season(API_BASE_URL, SEASON_RANGE)

# --- Step 2: spot-check the team lists for a couple of seasons ---
for sample_season in (2000, 2020):
    if sample_season in teams_by_season_data:
        print(f"Teams in {sample_season}:")
        print(teams_by_season_data[sample_season])

# --- Step 3: report how many teams were found in each season ---
print("\nTeam counts per season:")
for season, teams in teams_by_season_data.items():
    team_count = len(teams)
    print(f"{season}: {team_count}")

# --- Step 4: download player rows for every (team, season) pair ---
players_in_teamseason_data = fetch_players_in_teamseason(
    API_BASE_URL,
    teams_by_season_data,
)

# --- Step 5: spot-check one (team, season) entry, e.g. LAL_2020 ---
sample_key = "LAL_2020"
if sample_key in players_in_teamseason_data:
    print(f"\nSample player data for {sample_key}:")
    sample_players_json = json.dumps(players_in_teamseason_data[sample_key], indent=4)
    print(sample_players_json)

# --- Step 6: derive the metadata summary from the fetched player data ---
metadata_dict = create_metadata(players_in_teamseason_data)

Teams in 2000:
['ATL', 'BOS', 'CHH', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MIA', 'MIL', 'MIN', 'NJN', 'NYK', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR', 'UTA', 'VAN', 'WAS']
Teams in 2020:
['ATL', 'BOS', 'BRK', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

Team counts per season:
1996: 29
1997: 29
1998: 29
1999: 29
2000: 29
2001: 29
2002: 29
2003: 29
2004: 29
2005: 30
2006: 30
2007: 30
2008: 30
2009: 30
2010: 30
2011: 30
2012: 30
2013: 30
2014: 30
2015: 30
2016: 30
2017: 30
2018: 30
2019: 30
2020: 30
2021: 30
2022: 30
2023: 30
2024: 30

Sample player data for LAL_2020:
[
    {
        "id": 2311,
        "playerName": "Alex Caruso",
        "position": "PG",
        "age": 25,
        "games": 64,
        "gamesStarted": 2,
        "minutesPg": 1175.0,
        "fieldGoals": 120,
        "fieldAttempt

In [39]:
# =========================
# 5. Save Data Locally
# =========================

from google.colab import files

# Save Data file
# Each download is triggered only AFTER its `with` block has exited, i.e.
# once the file has been flushed and closed. Starting the download while the
# file is still open can hand the browser a truncated or empty file.
with open('raw_data.json', 'w') as f:
    json.dump(players_in_teamseason_data, f)

files.download('raw_data.json')

# Save Metadata file (pretty-printed for readability)
with open('metadata.json', 'w') as f:
    json.dump(metadata_dict, f, indent=4)

files.download('metadata.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>