In [67]:
import html
import json
import os
from typing import *

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [54]:
# Index page for the 2022-23 NBA officiating Last Two Minute (L2M) reports.
# It is fetched and scanned for every anchor whose href contains "l2m".
url = "https://official.nba.com/2022-23-nba-officiating-last-two-minute-reports/"

def get_l2m_links(url: str, timeout: float = 30.0) -> List[str]:
    """Collect every hyperlink on ``url`` whose href contains ``"l2m"``.

    Args:
        url: Page to scrape for report links.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The matching href values as strings, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of silently reporting "0 links".
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    # href=True skips anchors that carry no href attribute at all.
    hrefs = [str(anchor["href"]) for anchor in soup.find_all("a", href=True)]
    links = [href for href in hrefs if "l2m" in href]
    print(f"Discovered {len(links)} links")
    return links

# Scrape the index page; `links` holds one href per discovered L2M report.
links = get_l2m_links(url=url)

Discovered 277 links


In [58]:
def process_l2m_link(game_id: str, timeout: float = 30.0):
    """Download and decode the L2M JSON report for a single game.

    Args:
        game_id: Identifier used to build the report URL
            ``https://official.nba.com/l2m/json/<game_id>.json``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    json_link = f"https://official.nba.com/l2m/json/{game_id}.json"
    resp = requests.get(json_link, timeout=timeout)
    # Surface a missing/broken report as an HTTP error rather than as a
    # confusing JSON decode failure on an HTML error page.
    resp.raise_for_status()
    return json.loads(resp.text)

def retrieve_l2m_data(links: List[str]):
    """Fetch the report behind each link and index the results by game id.

    The game id is the text after the last ``=`` in the link, cut off at the
    first ``%``. A progress bar is shown while the reports are downloaded.

    Args:
        links: Report links as returned by the link-discovery step.

    Returns:
        Dict mapping each game id to the value returned by
        ``process_l2m_link`` for that id.
    """
    reports = {}
    for href in tqdm(links):
        # Everything after the final "=" ...
        _, _, tail = href.rpartition("=")
        # ... up to (not including) the first "%".
        game_id, _, _ = tail.partition("%")
        reports[game_id] = process_l2m_link(game_id=game_id)
    return reports

# Download every report (one HTTP request per link); keyed by game id.
l2m_data = retrieve_l2m_data(links=links)

100%|█████████████████████████████████████████| 277/277 [02:01<00:00,  2.29it/s]


In [72]:
def process_team_data(datum):
    """Turn one game report into two summary rows (home team, away team).

    Reads ``datum["game"][0]`` for team names, game id, date and scores, and
    the ``"Calls"``, ``"Errors in Favor"`` and ``"Possessions in Favor"``
    entries of ``datum["stats"]`` for the per-side counts.

    Args:
        datum: Decoded report for a single game.

    Returns:
        ``[home_row, away_row]`` where each row is a dict with the keys
        teamName, opponent, game, gameId, gameDate, pointsScored,
        pointsAllowed, outcome, callsReceived, errorsInFavor and
        posessionsInFavor.

    Raises:
        IndexError: If one of the three expected stats entries is absent.
        KeyError: If an expected field is missing from the report.
    """
    game = datum["game"][0]
    stats = datum["stats"]

    def stat_named(label):
        # First stats entry carrying this name; IndexError when there is none.
        return [entry for entry in stats if entry["stats_name"] == label][0]

    calls = stat_named("Calls")
    errors = stat_named("Errors in Favor")
    possessions = stat_named("Possessions in Favor")

    away, home = game["Away_team"], game["Home_team"]

    def build_row(team, opponent, matchup, scored_key, allowed_key, side):
        # `side` is "home" or "away" and selects the column of each stat.
        return {
            "teamName": team,
            "opponent": opponent,
            "game": matchup,
            "gameId": game["GameId"],
            "gameDate": game["GameDateOut"],
            "pointsScored": game[scored_key],
            "pointsAllowed": game[allowed_key],
            "outcome": "WON" if game[scored_key] > game[allowed_key] else "LOSS",
            "callsReceived": calls[side],
            "errorsInFavor": errors[side],
            "posessionsInFavor": possessions[side],
        }

    home_row = build_row(
        home, away, f"{home} vs {away}",
        "HomeTeamScore", "VisitorTeamScore", "home",
    )
    away_row = build_row(
        away, home, f"{away} @ {home}",
        "VisitorTeamScore", "HomeTeamScore", "away",
    )
    return [home_row, away_row]


def process_for_team_stats(data):
    """Assemble the per-team rows of every game into one DataFrame.

    Args:
        data: Mapping of game id to decoded report. Only the values are used.

    Returns:
        ``pandas.DataFrame`` built from the rows that ``process_team_data``
        returns for each report, in iteration order of ``data``.
    """
    rows = []
    for report in tqdm(data.values()):
        rows += process_team_data(datum=report)
    return pd.DataFrame(rows)

# Build the team-level table from the downloaded reports and preview the
# first ten rows.
team_df = process_for_team_stats(data=l2m_data)
team_df.head(10)

100%|██████████████████████████████████████| 277/277 [00:00<00:00, 35120.53it/s]


Unnamed: 0,teamName,opponent,game,gameId,gameDate,pointsScored,pointsAllowed,outcome,callsReceived,errorsInFavor,posessionsInFavor
0,76ers,Nuggets,76ers vs Nuggets,22200741,"January 28, 2023",126,119,WON,1,1,0
1,Nuggets,76ers,Nuggets @ 76ers,22200741,"January 28, 2023",119,126,LOSS,0,0,0
2,Pistons,Rockets,Pistons vs Rockets,22200743,"January 28, 2023",114,117,LOSS,4,0,0
3,Rockets,Pistons,Rockets @ Pistons,22200743,"January 28, 2023",117,114,WON,0,0,0
4,Spurs,Suns,Spurs vs Suns,22200748,"January 28, 2023",118,128,LOSS,1,0,0
5,Suns,Spurs,Suns @ Spurs,22200748,"January 28, 2023",128,118,WON,0,1,0
6,Celtics,Lakers,Celtics vs Lakers,22200749,"January 28, 2023",125,121,WON,3,1,0
7,Lakers,Celtics,Lakers @ Celtics,22200749,"January 28, 2023",121,125,LOSS,0,2,0
8,Heat,Magic,Heat vs Magic,22200738,"January 27, 2023",110,105,WON,1,1,0
9,Magic,Heat,Magic @ Heat,22200738,"January 27, 2023",105,110,LOSS,0,1,0


In [92]:
# Expands the rating codes found in a report row's "CallRatingName" field
# into the labels stored in the "decision" column.
ABBREVIATIONS = {
    "CC": "Correct Call",
    "IC": "Incorrect Call",
    "CNC": "Correct Non-Call",
    "INC": "Incorrect Non-Call"
}


def _unescape_text(value):
    """Decode HTML entities (e.g. ``&apos;``) in ``value`` when it is a string.

    Non-string values (such as ``None``) are returned unchanged.
    """
    return html.unescape(value) if isinstance(value, str) else value


def process_player_data(datum):
    """Turn one game report into one row per reviewed play.

    Args:
        datum: Decoded report for a single game. ``datum["game"][0]`` supplies
            the team ids and names; ``datum["l2m"]`` supplies the plays.

    Returns:
        List of dicts with the keys id, committingPlayer, disadvantagedPlayer,
        callType, decision, time, teamInPosession, difficulty and comment.

    Raises:
        KeyError: If an expected field is missing, or a play's ``posTeamId``
            matches neither team id of the game.
    """
    game_data = datum["game"][0]
    # Team id -> team name, used to label which side had the ball.
    team_map = {
        game_data["HomeTeamId"]: game_data["Home_team"],
        game_data["AwayTeamId"]: game_data["Away_team"]
    }

    rows = datum["l2m"]
    processed_rows = [
        {
            # "VideolLink" (sic) is the key this code reads from the report;
            # check the JSON before "correcting" the spelling.
            "id": f"{row['posID']}-{row['VideolLink']}",
            "committingPlayer": row["CP"],
            "disadvantagedPlayer": row["DP"],
            "callType": row["CallType"],
            # Unrecognised rating codes fall back to the play's call type.
            "decision": ABBREVIATIONS.get(row["CallRatingName"], row["CallType"]),
            "time": row["PCTime"],
            "teamInPosession": team_map[row["posTeamId"]],
            "difficulty": row["Difficulty"],
            # Comments arrive with HTML entities (e.g. "Harden&apos;s");
            # store them as plain text.
            "comment": _unescape_text(row["Comment"]),
        }
        for row in rows
    ]

    return processed_rows

def process_for_player_stats(data):
    """Assemble the per-play rows of every game into one DataFrame.

    Args:
        data: Mapping of game id to decoded report. Only the values are used.

    Returns:
        ``pandas.DataFrame`` built from the rows that ``process_player_data``
        returns for each report, in iteration order of ``data``.
    """
    rows = []
    for report in tqdm(data.values()):
        rows += process_player_data(datum=report)
    return pd.DataFrame(rows)

# Build the play-level table from the downloaded reports and preview the
# first ten rows.
player_df = process_for_player_stats(data=l2m_data)
player_df.head(10)
    

100%|██████████████████████████████████████| 277/277 [00:00<00:00, 21214.68it/s]


Unnamed: 0,id,committingPlayer,disadvantagedPlayer,callType,decision,time,teamInPosession,difficulty,comment
0,1535-2110,P.J. Tucker,Nikola Jokic,Foul: Personal,Correct Non-Call,01:59,Nuggets,Observable,Tucker (PHI) and Jokic (DEN) engage and diseng...
1,1535-2111,P.J. Tucker,Nikola Jokic,Foul: Personal,Correct Non-Call,01:54,Nuggets,Observable,Tucker (PHI) establishes a legal guarding posi...
2,1535-2112,P.J. Tucker,Nikola Jokic,Foul: Shooting,Correct Non-Call,01:52.6,Nuggets,Observable,Tucker (PHI) brings his left arm forward sligh...
3,1536-2113,Aaron Gordon,Joel Embiid,Foul: Shooting,Correct Non-Call,01:35.8,76ers,Observable,Gordon (DEN) jumps vertically and incidental b...
4,1537-2114,James Harden,Kentavious Caldwell-Pope,Foul: Shooting,Correct Non-Call,01:28.1,Nuggets,Observable,Harden (PHI) legally contests Caldwell-Pope&ap...
5,1538-2115,Kentavious Caldwell-Pope,James Harden,Foul: Personal,Correct Non-Call,01:10,76ers,Observable,Caldwell-Pope (DEN) makes clean contact with t...
6,1538-2116,Kentavious Caldwell-Pope,James Harden,Foul: Personal,Correct Non-Call,01:08,76ers,Observable,Caldwell-Pope (DEN) again makes clean contact ...
7,1538-2117,Kentavious Caldwell-Pope,James Harden,Foul: Shooting,Correct Non-Call,01:06.2,76ers,Observable,Caldwell-Pope (DEN) avoids making contact with...
8,1539-2118,P.J. Tucker,Nikola Jokic,Foul: Shooting,Correct Non-Call,00:52.5,Nuggets,Observable,Tucker (PHI) makes clean contact with the ball...
9,1540-2119,Michael Porter Jr.,James Harden,Foul: Personal,Correct Non-Call,00:50.1,76ers,Observable,Porter Jr. (DEN) brushes Harden&apos;s (PHI) r...


In [95]:
def output_all(raw, team_df, player_df, location="../data/"):
    """Write the processed tables and the raw report data under ``location``.

    Files written:
      * ``<location>/processed/team_data.csv``
      * ``<location>/processed/player_data.csv``
      * ``<location>/raw/raw.json`` (``raw`` dumped as JSON, indent 4)

    Args:
        raw: JSON-serialisable object holding the downloaded reports.
        team_df: Team-level DataFrame.
        player_df: Play-level DataFrame.
        location: Base output directory; a trailing slash is optional.
    """
    # Join path components instead of concatenating strings, so a location
    # without a trailing slash does not end up as e.g. "../dataprocessed/...".
    processed_dir = os.path.join(location, "processed")
    raw_dir = os.path.join(location, "raw")

    # Create the target folders so a fresh checkout does not fail on write.
    for directory in (processed_dir, raw_dir):
        os.makedirs(directory, exist_ok=True)

    # The DataFrame index is written as the first CSV column (pandas default),
    # matching the files this function has always produced.
    team_df.to_csv(os.path.join(processed_dir, "team_data.csv"))
    player_df.to_csv(os.path.join(processed_dir, "player_data.csv"))

    with open(os.path.join(raw_dir, "raw.json"), "w") as f:
        json.dump(raw, f, indent=4)
        
# Persist the raw reports and both processed tables under the default
# location ("../data/").
output_all(raw=l2m_data, team_df=team_df, player_df=player_df)