# Crawling All Games in the NBA Play-by-Play Era
This notebook crawls through all NBA games in the play-by-play era (1997-Present). Each file path will take on the following structure:

`src/raw/play-by-play/<game_type>/<season>/<game_id>.json`

In [None]:
# Get urls for all games in a given season.
import pandas as pd

# Game index CSV; the lookup below reads its 'season', 'game_type' and
# 'game_url' columns.
fp = '../about/game_ids.csv'
df = pd.read_csv(fp)

def get_season_urls(df, season, game_type):
    """Return the game URLs for one season and game type.

    Args:
        df: DataFrame with 'season', 'game_type' and 'game_url' columns.
        season: Value matched against the 'season' column.
        game_type: Value matched against the 'game_type' column.

    Returns:
        A list of the matching 'game_url' values, in DataFrame row order.
        Empty when nothing matches or ``df`` has no rows.
    """
    # An empty frame may not even carry the expected columns; there is
    # nothing to match either way.
    if df.empty:
        return []

    # One vectorized boolean mask replaces building a Series per row with
    # iterrows() and appending matches by hand.
    mask = (df['season'] == season) & (df['game_type'] == game_type)
    return df.loc[mask, 'game_url'].tolist()


In [96]:
# Get urls for all games in a given season.
import pandas as pd

# Game index CSV; the lookup below reads its 'season', 'game_type' and
# 'game_url' columns.
fp = '../about/game_ids.csv'
df = pd.read_csv(fp)

def get_season_urls(df, season, game_type):
    """Return the game URLs for one season and game type.

    Args:
        df: DataFrame with 'season', 'game_type' and 'game_url' columns.
        season: Value matched against the 'season' column.
        game_type: Value matched against the 'game_type' column.

    Returns:
        A list of the matching 'game_url' values, in DataFrame row order.
        Empty when nothing matches or ``df`` has no rows.
    """
    # An empty frame may not even carry the expected columns; there is
    # nothing to match either way.
    if df.empty:
        return []

    # One vectorized boolean mask replaces building a Series per row with
    # iterrows() and appending matches by hand.
    mask = (df['season'] == season) & (df['game_type'] == game_type)
    return df.loc[mask, 'game_url'].tolist()


# First and last seasons to process (both inclusive).
START = 1997
END = 2025

# For every season: look up its playoff URLs, report the count, and write
# the URLs to ../about/playoff_urls/<season>.txt, one per line.
for season in range(START, END + 1):
    urls = get_season_urls(df, season, 'playoff')
    print(f'{season}: {len(urls)} games')
    with open(f'../about/playoff_urls/{season}.txt', 'w') as f:
        f.writelines(url + '\n' for url in urls)

1997: 72 games
1998: 71 games
1999: 66 games
2000: 75 games
2001: 71 games
2002: 71 games
2003: 88 games
2004: 82 games
2005: 84 games
2006: 89 games
2007: 79 games
2008: 86 games
2009: 85 games
2010: 82 games
2011: 81 games
2012: 84 games
2013: 85 games
2014: 89 games
2015: 81 games
2016: 86 games
2017: 79 games
2018: 82 games
2019: 82 games
2020: 83 games
2021: 85 games
2022: 87 games
2023: 84 games
2024: 82 games
2025: 0 games


In [118]:
from bs4 import BeautifulSoup
import requests
import json
import sys
import time
import random


def export_game(url, fp, season, retries=3, timeout=30):
    """Download one game's play-by-play actions and save them as JSON.

    Fetches ``url``, parses the JSON held in the page element whose id is
    ``__NEXT_DATA__``, and writes the list found under
    ``props -> pageProps -> playByPlay -> actions`` to ``fp``.

    Args:
        url: Game page URL to fetch.
        fp: Output path; ``.json`` is appended when it is missing. Missing
            parent directories are created.
        season: Not used by this function; kept so existing callers keep
            working.
        retries: Maximum number of fetch attempts.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The exported actions, or ``None`` when the page data could not be
        retrieved within ``retries`` attempts or held no actions.

    Raises:
        SystemExit: With status 1, after printing the error, on any other
            failure (for example an unexpected JSON layout or a write error).
    """
    import os  # local import: the notebook's import cell does not load os

    # NOTE(review): the Access-Control-* entries are CORS response headers and
    # look unnecessary on a request; they are kept so the request sent is
    # unchanged.
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

    try:
        script = None
        for attempt in range(retries):
            # Random 1-3 second pause before every request (presumably to
            # throttle the crawl).
            time.sleep(random.uniform(1, 3))

            try:
                # Without a timeout a stalled connection would block forever.
                r = requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException as e:
                # Network errors count as a failed attempt and are retried,
                # the same as a page without the data element.
                print(f'Request failed ({e})\tURL: {url}')
            else:
                soup = BeautifulSoup(r.text, 'html.parser')
                script = soup.find(id="__NEXT_DATA__")

            if script is not None:
                break
            if attempt < retries - 1:
                print(f'Retrying... ({attempt + 1}/{retries})\tURL: {url}')

        if script is None:
            print(f'Failed to retrieve data after {retries} attempts for URL: {url}')
            return None

        json_obj = json.loads(script.text)
        json_obj = json_obj['props']['pageProps']['playByPlay']['actions']

        if not json_obj:
            print(f'No data found for URL: {url}')
            return None

        fp = fp if fp.endswith('.json') else fp + '.json'

        # Create the target directory on demand so a missing folder does not
        # abort the run. dirname is '' for a bare filename, which makedirs
        # rejects, hence the guard.
        out_dir = os.path.dirname(fp)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(fp, 'w') as f:
            json.dump(json_obj, f, indent=4)

    except Exception as e:
        # Any unexpected failure stops the whole run; include the URL so the
        # offending game can be identified.
        print(f'Error exporting {url}: {e}')
        sys.exit(1)
    return json_obj




In [123]:
# Runs export_game on every URL listed in a season's text file in the playoff_urls folder
# and writes each game to ../raw/play-by-play/playoff/<year>/<game>.json.
# Seasons to export; range() excludes its end value, so this covers 2021
# through 2025.
years = range(2021, 2026)

def export_playoff_json(year):
    """Export every playoff game listed for ``year`` via ``export_game``.

    Reads one URL per line from ``../about/playoff_urls/<year>.txt``. For each
    URL it prints the output path
    ``../raw/play-by-play/playoff/<year>/<last URL segment>.json`` and calls
    ``export_game`` with the URL, that path and ``year``.

    Args:
        year: Season used to select the URL list and the output directory.
    """
    fp = f'../about/playoff_urls/{year}.txt'
    with open(fp, 'r') as f:
        # Strip line endings and drop blank lines (such as a trailing empty
        # line) so an empty URL is never passed on.
        urls = [line.strip() for line in f if line.strip()]

    # The URL list is fully read above, so the file is already closed while
    # the games are exported.
    for url in urls:
        out = f'../raw/play-by-play/playoff/{year}/{url.split("/")[-1]}.json'
        print(out)
        export_game(url, out, year)

# Export each season in turn and report when a season's games are done.
for year in years:
    export_playoff_json(year)
    print(f'Finished exporting {year} playoff games.')

../raw/play-by-play/playoff/2021/bos-vs-bkn-0042000111.json
../raw/play-by-play/playoff/2021/dal-vs-lac-0042000171.json
../raw/play-by-play/playoff/2021/mia-vs-mil-0042000121.json
../raw/play-by-play/playoff/2021/por-vs-den-0042000161.json
../raw/play-by-play/playoff/2021/mem-vs-uta-0042000141.json
../raw/play-by-play/playoff/2021/atl-vs-nyk-0042000131.json
../raw/play-by-play/playoff/2021/was-vs-phi-0042000101.json
../raw/play-by-play/playoff/2021/lal-vs-phx-0042000151.json
../raw/play-by-play/playoff/2021/por-vs-den-0042000162.json
Retrying... (1/3)	URL: https://www.nba.com/game/por-vs-den-0042000162
../raw/play-by-play/playoff/2021/mia-vs-mil-0042000122.json
../raw/play-by-play/playoff/2021/dal-vs-lac-0042000172.json
../raw/play-by-play/playoff/2021/bos-vs-bkn-0042000112.json
../raw/play-by-play/playoff/2021/lal-vs-phx-0042000152.json
../raw/play-by-play/playoff/2021/was-vs-phi-0042000102.json
../raw/play-by-play/playoff/2021/atl-vs-nyk-0042000132.json
../raw/play-by-play/playoff/20

In [2]:
import pandas as pd

# Load the regular-season game index.
fp = '../about/regseason_game_ids.csv'
df = pd.read_csv(fp)

# NOTE(review): `team` is not used in the visible part of this cell.
team = 'CLE'
# Keep only the 2025 regular-season rows, then display (rows, columns).
df = df.loc[(df['season'] == 2025) & (df['game_type'] == 'regular')]
df.shape

(1214, 8)