In [50]:
import json
import pandas as pd
import requests
import time

from bs4 import BeautifulSoup
from tqdm import tqdm
from typing import *

In [2]:
# load data
# Relative locations of the processed CSV inputs this notebook reads.
team_data = "../data/processed/team_data.csv"
player_data = "../data/processed/player_data.csv"

def load_data(team_loc, player_loc):
    """Read the team and player CSV files into DataFrames.

    Parameters
    ----------
    team_loc
        Location of the team-level CSV; handed straight to ``pd.read_csv``.
    player_loc
        Location of the player-level CSV; handed straight to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(team_frame, player_frame)`` -- the team table first, then the player table.
    """
    team_frame = pd.read_csv(team_loc)
    player_frame = pd.read_csv(player_loc)
    return team_frame, player_frame

# Read both processed tables once; team_df is the frame filtered further down.
team_df, player_df = load_data(
    team_loc=team_data,
    player_loc=player_data,
)

In [15]:
# Three-letter team abbreviation -> team nickname (the nickname is the form used in the
# "game" labels that retrieve_urls parses).
TEAM_LOOKUP = {
    "ATL": "Hawks",
    "BKN": "Nets",
    "BOS": "Celtics",
    "CHA": "Hornets",
    "CHI": "Bulls",
    "CLE": "Cavaliers",
    "DAL": "Mavericks",
    "DEN": "Nuggets",
    "DET": "Pistons",
    "GSW": "Warriors",
    "HOU": "Rockets",
    "IND": "Pacers",
    "LAC": "Clippers",
    "LAL": "Lakers",
    "MEM": "Grizzlies",
    "MIA": "Heat",
    "MIL": "Bucks",
    "MIN": "Timberwolves",
    "NOP": "Pelicans",
    "NYK": "Knicks",
    "OKC": "Thunder",
    "ORL": "Magic",
    "PHI": "76ers",
    "PHX": "Suns",
    "POR": "Trail Blazers",
    "SAC": "Kings",
    "SAS": "Spurs",
    "TOR": "Raptors",
    "UTA": "Jazz",
    "WAS": "Wizards",
}

# Reverse mapping: team nickname -> three-letter abbreviation.
INVERTED_TEAM_LOOKUP = dict(zip(TEAM_LOOKUP.values(), TEAM_LOOKUP.keys()))

### Filter to avoid game duplication: keep only the rows whose `game` label contains "vs"

In [16]:
# Boolean mask of the rows whose "game" label contains "vs"; only those rows are kept.
is_vs_listing = team_df["game"].str.contains("vs")
filtered_team_df = team_df.loc[is_vs_listing]
filtered_team_df.head(10)

Unnamed: 0.1,Unnamed: 0,teamName,opponent,game,gameId,gameDate,pointsScored,pointsAllowed,outcome,callsReceived,errorsInFavor,posessionsInFavor
0,0,76ers,Nuggets,76ers vs Nuggets,22200741,"January 28, 2023",126,119,WON,1,1,0
2,2,Pistons,Rockets,Pistons vs Rockets,22200743,"January 28, 2023",114,117,LOSS,4,0,0
4,4,Spurs,Suns,Spurs vs Suns,22200748,"January 28, 2023",118,128,LOSS,1,0,0
6,6,Celtics,Lakers,Celtics vs Lakers,22200749,"January 28, 2023",125,121,WON,3,1,0
8,8,Heat,Magic,Heat vs Magic,22200738,"January 27, 2023",110,105,WON,1,1,0
10,10,Celtics,Knicks,Celtics vs Knicks,22200730,"January 26, 2023",117,120,LOSS,4,2,0
12,12,Suns,Mavericks,Suns vs Mavericks,22200734,"January 26, 2023",95,99,LOSS,3,1,0
14,14,76ers,Nets,76ers vs Nets,22200721,"January 25, 2023",137,133,WON,0,1,0
16,16,Thunder,Hawks,Thunder vs Hawks,22200725,"January 25, 2023",132,137,LOSS,2,0,0
18,18,Warriors,Grizzlies,Warriors vs Grizzlies,22200726,"January 25, 2023",122,120,WON,4,1,0


In [45]:
def retrieve_urls(filtered_team_df):
    """Build the nba.com game-page URL for every row of ``filtered_team_df``.

    Each ``game`` label is split on ``" vs "``; the first two pieces are stripped, mapped
    to abbreviations through ``INVERTED_TEAM_LOOKUP`` and lower-cased. The URL has the form
    ``https://www.nba.com/game/{team1}-vs-{team2}-00{game_id}``.

    Parameters
    ----------
    filtered_team_df
        Table exposing ``gameId`` and ``game`` columns.

    Returns
    -------
    dict
        ``{game_id: url}``. The number of entries is printed before returning.

    Raises
    ------
    KeyError
        If a team name is not a key of ``INVERTED_TEAM_LOOKUP``.
    IndexError
        If a ``game`` label does not contain ``" vs "``.
    """

    def _abbreviation(team_name):
        # Nickname -> lower-case three-letter code used in the URL slug.
        return INVERTED_TEAM_LOOKUP[team_name.strip()].lower()

    # Materialised as a list so tqdm knows the total up front.
    id_matchup_pairs = list(zip(filtered_team_df.gameId, filtered_team_df.game))

    game_urls = {}
    for game_id, matchup in tqdm(id_matchup_pairs):
        names = matchup.split(" vs ")
        team1 = _abbreviation(names[0])
        team2 = _abbreviation(names[1])
        game_urls[game_id] = f"https://www.nba.com/game/{team1}-vs-{team2}-00{game_id}"

    print(f"Retreived {len(game_urls)} box score URLs")

    return game_urls

# One box-score URL per game in the filtered table, keyed by gameId.
game_urls = retrieve_urls(filtered_team_df)

100%|█████████████████████████████████████| 277/277 [00:00<00:00, 606379.02it/s]

Retreived 277 box score URLs





In [51]:
def retrieve_officials_for_game(game_id, game_url, timeout=30):
    """Download one game page and return its officials, each tagged with the game id.

    The page is expected to contain a ``<script type="application/json">`` element whose
    JSON payload lists the officials under ``props -> pageProps -> game -> officials``.

    Parameters
    ----------
    game_id
        Identifier stored under the ``"gameId"`` key of every returned record.
    game_url : str
        URL of the game page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection raises instead of
        blocking the whole scrape. Defaults to 30 seconds.

    Returns
    -------
    list of dict
        The officials records from the page JSON, each with ``"gameId"`` added.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or an HTTP error status.
    ValueError
        If the page has no ``application/json`` script element or its content is not
        valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    KeyError
        If the JSON lacks the expected ``props/pageProps/game/officials`` path.
    """
    resp = requests.get(game_url, timeout=timeout)
    # Fail here with the HTTP status rather than later with an opaque parsing error.
    resp.raise_for_status()

    # Name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed (and to avoid bs4's guessed-parser warning).
    soup = BeautifulSoup(resp.text, "html.parser")
    json_script = soup.find('script', type='application/json')
    if json_script is None:
        raise ValueError(f"No application/json script element found at {game_url}")

    data = json.loads(json_script.text)
    officials = data["props"]["pageProps"]["game"]["officials"]
    for official in officials:
        official.update({"gameId": game_id})

    return officials

def retrieve_officials(game_urls, max_attempts=2, retry_delay=5):
    """Collect the officials of every game in ``game_urls`` into one DataFrame.

    Each game is fetched with ``retrieve_officials_for_game``. A failing game is retried
    after ``retry_delay`` seconds until ``max_attempts`` is reached; a game that still
    fails is reported and skipped so the rest of the scrape can finish.

    Parameters
    ----------
    game_urls : dict
        Mapping of game id to game-page URL.
    max_attempts : int, optional
        Total tries per game, including the first. Defaults to 2 (one retry).
    retry_delay : float, optional
        Seconds to sleep before each retry. Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        One row per official per successfully processed game.

    Raises
    ------
    ValueError
        If ``max_attempts`` is less than 1.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    official_data = []
    for game_id, game_url in tqdm(game_urls.items()):

        for attempt in range(1, max_attempts + 1):
            try:
                official_data.extend(
                    retrieve_officials_for_game(game_id=game_id, game_url=game_url)
                )
                break
            except Exception as ex:
                # Best-effort scrape: report the cause instead of discarding it, then
                # either retry or move on to the next game.
                cause = f"{type(ex).__name__}: {ex}"
                if attempt < max_attempts:
                    print(
                        f"Failed to process game on attempt {attempt} of {max_attempts} "
                        f"({cause}). Retrying: {game_id} with URL: {game_url}"
                    )
                    time.sleep(retry_delay)
                else:
                    print(f"Unable to process game: {game_id} with URL: {game_url} ({cause})")

    return pd.DataFrame(official_data)

# Scrape every game page (one HTTP request per game, so this cell is slow).
official_data = retrieve_officials(game_urls=game_urls)
        

 36%|██████████████▉                          | 101/277 [01:22<03:12,  1.09s/it]

Failed to process game on first try. Retrying: 22200487 with URL: https://www.nba.com/game/hou-vs-dal-0022200487


 45%|██████████████████▋                      | 126/277 [02:37<05:44,  2.28s/it]

Failed to process game on first try. Retrying: 22200414 with URL: https://www.nba.com/game/cha-vs-det-0022200414


 48%|███████████████████▋                     | 133/277 [03:01<03:13,  1.34s/it]

Failed to process game on first try. Retrying: 22200395 with URL: https://www.nba.com/game/nop-vs-phx-0022200395


 50%|████████████████████▍                    | 138/277 [03:15<06:27,  2.79s/it]

Failed to process game on first try. Retrying: 22200388 with URL: https://www.nba.com/game/ind-vs-bkn-0022200388


 56%|███████████████████████                  | 156/277 [04:36<02:12,  1.09s/it]

Failed to process game on first try. Retrying: 22200356 with URL: https://www.nba.com/game/hou-vs-phi-0022200356


 64%|██████████████████████████▎              | 178/277 [05:46<02:43,  1.66s/it]

Failed to process game on first try. Retrying: 22200281 with URL: https://www.nba.com/game/okc-vs-chi-0022200281


 65%|██████████████████████████▋              | 180/277 [06:03<07:03,  4.36s/it]

Failed to process game on first try. Retrying: 22200260 with URL: https://www.nba.com/game/cha-vs-phi-0022200260


 66%|██████████████████████████▉              | 182/277 [06:19<08:52,  5.60s/it]

Failed to process game on first try. Retrying: 22200268 with URL: https://www.nba.com/game/okc-vs-den-0022200268


 68%|███████████████████████████▋             | 187/277 [06:38<03:49,  2.55s/it]

Failed to process game on first try. Retrying: 22200242 with URL: https://www.nba.com/game/sac-vs-det-0022200242


 82%|█████████████████████████████████▋       | 228/277 [08:19<02:14,  2.75s/it]

Failed to process game on first try. Retrying: 22200122 with URL: https://www.nba.com/game/phi-vs-nyk-0022200122


 90%|█████████████████████████████████████    | 250/277 [09:19<01:08,  2.54s/it]

Failed to process game on first try. Retrying: 22200077 with URL: https://www.nba.com/game/sas-vs-chi-0022200077


100%|█████████████████████████████████████████| 277/277 [10:17<00:00,  2.23s/it]


In [52]:
# Preview the first rows of the scraped officials table.
official_data.head(n=15)

Unnamed: 0,personId,name,nameI,firstName,familyName,jerseyNum,assignment,gameId
0,101283,Brian Forte,B. Forte,Brian,Forte,45,,22200741
1,101284,John Goble,J. Goble,John,Goble,10,,22200741
2,203592,Dedric Taylor,D. Taylor,Dedric,Taylor,21,,22200741
3,2004,Courtney Kirkland,C. Kirkland,Courtney,Kirkland,61,,22200743
4,201245,Marat Kogut,M. Kogut,Marat,Kogut,32,,22200743
5,1627524,Nate Green,N. Green,Nate,Green,65,,22200743
6,200834,Mark Lindsay,M. Lindsay,Mark,Lindsay,29,,22200748
7,202035,Brett Nansel,B. Nansel,Brett,Nansel,44,,22200748
8,1627963,Phenizee Ransom,P. Ransom,Phenizee,Ransom,70,,22200748
9,2715,Eric Lewis,E. Lewis,Eric,Lewis,42,,22200749


In [53]:
# Persist the scraped officials. index=False keeps the DataFrame's positional index out
# of the file, so reading it back with a plain pd.read_csv does not pick up an extra
# "Unnamed: 0" column.
official_data.to_csv("../data/processed/officiating.csv", index=False)