# Crawling All Games in the NBA Play-by-Play Era
This notebook will crawl through all NBA games in the play-by-play era (1997-Present). Each file path will take on the following structure:

`src/raw/play-by-play/<game_type>/<season>/<game_id>.json`

In [None]:
# Get urls for all games in a given season.
import pandas as pd

# Read the game table from a CSV addressed relative to the notebook's working
# directory. The preview printed below this cell shows the columns date,
# game_id, game_url, game_type, season, game_number, away_team and home_team,
# plus an unnamed leading index column.
fp = '../about/regseason_game_ids.csv'
df = pd.read_csv(fp)

def get_season_urls(df, season):
    """Return the rows of ``df`` whose ``season`` column equals ``season``.

    All columns are kept (not only the URL column), and the original row
    labels are preserved. ``df`` itself is not modified.
    """
    in_season = df['season'] == season
    return df.loc[in_season]



# Pick out the rows for one season and preview the first five of them.
season = 2024
df2024 = get_season_urls(df, season)

df2024.head()


Unnamed: 0,date,game_id,game_url,game_type,season,game_number,away_team,home_team
29164,2023-10-24,22300061,https://www.nba.com/game/lal-vs-den-0022300061,regular,2024,61,LAL,DEN
29165,2023-10-24,22300062,https://www.nba.com/game/phx-vs-gsw-0022300062,regular,2024,62,PHX,GSW
29166,2023-10-25,22300066,https://www.nba.com/game/hou-vs-orl-0022300066,regular,2024,66,HOU,ORL
29167,2023-10-25,22300065,https://www.nba.com/game/bos-vs-nyk-0022300065,regular,2024,65,BOS,NYK
29168,2023-10-25,22300064,https://www.nba.com/game/was-vs-ind-0022300064,regular,2024,64,WAS,IND


(82, 8)

In [None]:
from bs4 import BeautifulSoup
import requests
import json
import sys

def export_game(url, out, timeout=30):
    """Download one game page and save its play-by-play actions as JSON.

    The page at ``url`` is fetched, the JSON text held in the element with
    ``id="__NEXT_DATA__"`` is decoded, and the value found under
    ``props -> pageProps -> playByPlay -> actions`` is written to ``out``
    (indented by four spaces) and returned.

    Parameters
    ----------
    url : str
        Address of the game page to fetch.
    out : str or path-like
        Destination file for the JSON dump; it is opened in ``'w'`` mode, so
        an existing file is overwritten.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection cannot
        block a crawl indefinitely. Defaults to 30.

    Returns
    -------
    The decoded ``actions`` value (a list of per-action dicts in the
    notebook's sample run).

    Raises
    ------
    SystemExit
        With exit code 1 on any failure (network error, non-success HTTP
        status, missing ``__NEXT_DATA__`` element, unexpected JSON layout,
        or a failure writing ``out``). A one-line description naming the URL
        is printed first.
    """
    # NOTE(review): the Access-Control-* entries are CORS *response* headers
    # and are not meaningful on a request; they are kept unchanged so the
    # outgoing request stays exactly as before. Only User-Agent is a normal
    # request header.
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }

    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        # Fail on 4xx/5xx here instead of trying to parse an error page.
        r.raise_for_status()

        soup = BeautifulSoup(r.text, 'html.parser')
        script = soup.find(id="__NEXT_DATA__")
        if script is None:
            # Without this check the failure shows up as an opaque
            # "'NoneType' object has no attribute 'text'".
            raise ValueError('page has no element with id="__NEXT_DATA__"')

        json_obj = json.loads(script.text)
        json_obj = json_obj['props']['pageProps']['playByPlay']['actions']

        with open(out, 'w') as f:
            json.dump(json_obj, f, indent=4)

    except Exception as e:
        # repr() keeps the exception type visible; str() of a KeyError, for
        # instance, is just the missing key.
        print(f'export_game failed for {url}: {e!r}')
        # NOTE(review): SystemExit is kept for backward compatibility; when
        # crawling many games, raising a regular exception that the caller
        # can catch and skip may be preferable.
        sys.exit(1)
    return json_obj

# Smoke-test export_game on the first row of df. The JSON is written next to
# the current working directory as ./<game_id>.json (presumably the
# src/data/scripts folder when run from this notebook's location — TODO confirm).
# NOTE(review): df is reassigned (filtered to season 2024) by a later cell, so
# which game df.iloc[0] selects depends on cell execution order — confirm on a
# fresh kernel.
test = df.iloc[0]
url = test['game_url']
out = f'./{test["game_id"]}.json'
export_game(url, out)

[{'actionNumber': 2,
  'clock': 'PT12M00.00S',
  'period': 1,
  'teamId': 0,
  'teamTricode': '',
  'personId': 0,
  'playerName': '',
  'playerNameI': '',
  'xLegacy': 0,
  'yLegacy': 0,
  'shotDistance': 0,
  'shotResult': '',
  'isFieldGoal': 0,
  'scoreHome': '0',
  'scoreAway': '0',
  'pointsTotal': 0,
  'location': '',
  'description': 'Start of 1st Period (7:36 PM EST)',
  'actionType': 'period',
  'subType': 'start',
  'videoAvailable': 0,
  'shotValue': 0,
  'actionId': 1},
 {'actionNumber': 4,
  'clock': 'PT12M00.00S',
  'period': 1,
  'teamId': 1610612743,
  'teamTricode': 'DEN',
  'personId': 203999,
  'playerName': 'Jokić',
  'playerNameI': 'N. Jokić',
  'xLegacy': 0,
  'yLegacy': 0,
  'shotDistance': 0,
  'shotResult': '',
  'isFieldGoal': 0,
  'scoreHome': '',
  'scoreAway': '',
  'pointsTotal': 0,
  'location': 'h',
  'description': 'Jump Ball Jokic vs. Davis: Tip to James',
  'actionType': 'Jump Ball',
  'subType': '',
  'videoAvailable': 1,
  'shotValue': 0,
  'action

In [12]:
# Display the (rows, columns) dimensions of whatever df currently holds.
df.shape

(1216, 8)

In [13]:
# Display the distance between two hard-coded bounds.
# NOTE(review): the meaning of 34968 and 36188 is not documented in this
# notebook; presumably they are row positions in the game table — confirm.
start = 34968
end = 36188
end - start

1220

In [34]:
import pandas as pd

# Reload the game table from disk, then keep only the rows that are both
# season 2024 and game_type 'regular'. This rebinds df for every later cell.
fp = '../about/regseason_game_ids.csv'
df = pd.read_csv(fp)

# team is not referenced by any of the cells visible here.
team = 'CLE'
df = df.loc[df['season'].eq(2024) & df['game_type'].eq('regular')]
df.shape

(1216, 8)

In [35]:
# Order the games by game number so that neighbouring rows can be compared.
df = df.sort_values('game_number')

# Plain array of the ordered game numbers.
game_numbers = df['game_number'].values

# Collect every adjacent (previous, next) pair whose difference is not exactly
# one; each such pair brackets a break in the numbering.
missing = [
    (before, after)
    for before, after in zip(game_numbers[:-1], game_numbers[1:])
    if after - before != 1
]

missing

[(257, 265), (718, 726)]