In [3]:
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import *
import json
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import time

In [4]:
# Lift pandas' display truncation: show every column and every row of a frame.
# set_option takes (option, value) pairs, so both limits go in one call.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
)

## Set up functions to gather data

In [11]:
def get_df_from_json(team_stats_json):
    """
    Get DataFrame format of json data

    Only the first entry of the payload's 'resultSets' list is used: its
    'headers' become the column names and its 'rowSet' the rows.

    Parameters:
    - team_stats_json: json of data, either as a JSON string/bytes or as the
      already-decoded dictionary

    Returns:
    - team_stats_df: json data as a Pandas DataFrame

    Raises:
    - KeyError / IndexError: if the payload has no 'resultSets' entry or the
      entry lacks 'rowSet' / 'headers'
    """
    # Decode the payload a single time (the headers and rows come from the same
    # document); an already-decoded dictionary is used as-is
    if isinstance(team_stats_json, (str, bytes, bytearray)):
        payload = json.loads(team_stats_json)
    else:
        payload = team_stats_json

    result_set = payload['resultSets'][0]

    # Load the JSON data into a DataFrame
    team_stats_df = pd.DataFrame(
        result_set['rowSet'],
        columns = result_set['headers']
    )

    return team_stats_df


def get_shot_clock_splits(shot_clock_lst, season = '2024-25'):
    """
    Download team stats from the NBA API, one frame per shot clock range

    For every range two LeagueDashTeamStats requests are made: one with the
    default measure type and one with the 'Advanced' measure type (used to get
    possessions). The two results are inner-joined per team.

    Parameters:
    - shot_clock_lst: List of shot clock ranges
    - season: NBA season (default '2024-25')

    Returns:
    - Dictionary keyed by shot clock range; each value is a DataFrame with the
      columns TEAM_ID, TEAM_NAME, PTS and POSS
    """
    splits_by_range = {}

    for sc_range in shot_clock_lst:

        # Pause before each pair of requests (presumably to stay under the
        # API's rate limit)
        time.sleep(1)

        # First request: default measure type for this season and range
        base_endpoint = leaguedashteamstats.LeagueDashTeamStats(
            season = season,
            shot_clock_range_nullable = sc_range
        )
        base_df = get_df_from_json(base_endpoint.get_json())

        # Second request: Advanced measure type, needed for possessions
        advanced_endpoint = leaguedashteamstats.LeagueDashTeamStats(
            season = season,
            shot_clock_range_nullable = sc_range,
            measure_type_detailed_defense = 'Advanced'
        )
        advanced_df = get_df_from_json(advanced_endpoint.get_json())

        # Join per team (on both TEAM_ID and TEAM_NAME) and keep only the
        # columns used downstream
        combined_df = base_df.merge(advanced_df, on=['TEAM_ID', 'TEAM_NAME'], how='inner')
        splits_by_range[sc_range] = combined_df[['TEAM_ID', 'TEAM_NAME', 'PTS', 'POSS']]

    return splits_by_range


def aggregate_shot_clock_data(shot_clock_dfs, shot_clock_lst):
    """
    Pool the selected shot clock ranges and compute points per possession (PPP) per team

    Points and possessions are summed over the selected ranges before dividing,
    so the resulting PPP is weighted by the possessions of each range.

    Parameters:
    - shot_clock_dfs: Dictionary of DataFrames, where keys represent shot clock ranges
    - shot_clock_lst: List of shot clock ranges to aggregate

    Returns:
    - A DataFrame with one row per team and the columns TEAM_ID, TEAM_NAME,
      TOTAL_POSS, TOTAL_PTS, PPP and SC_REMAINING. SC_REMAINING is the text
      before the first space of the first range in shot_clock_lst.
    """
    # Label the result after the first selected range, e.g. '22-18 Very Early' -> '22-18'
    first_range_label = shot_clock_lst[0].split(' ')[0]

    # Stack the frames of every selected range on top of each other
    stacked_df = pd.concat([shot_clock_dfs[sc_range] for sc_range in shot_clock_lst])

    # Sum possessions and points per team across the stacked ranges
    team_totals = stacked_df.groupby(['TEAM_ID', 'TEAM_NAME'], as_index=False).agg(
        TOTAL_POSS=('POSS', 'sum'),
        TOTAL_PTS=('PTS', 'sum')
    )

    # Derive PPP from the pooled totals and attach the range label
    return team_totals.assign(
        PPP=team_totals['TOTAL_PTS'] / team_totals['TOTAL_POSS'],
        SC_REMAINING=first_range_label
    )

## Loop through each season and gather its data

In [12]:
all_df_lst = []
cols = ['TEAM_ID', 'TEAM_NAME', 'SEASON', 'SC_REMAINING', 'PPP', 'SCALED_PPP', 'TOTAL_PTS', 'TOTAL_POSS']

# Shot clock ranges, ordered from most to least time remaining. The list is the
# same for every season, so it is defined once and never modified.
shot_clock_lst = ['24-22', '22-18 Very Early', '18-15 Early', '15-7 Average', '7-4 Late', '4-0 Very Late']

# Loop through each year from 2013-14 to 2024-25
for i in range(13, 25):

    # Establish the season to scrape
    season = f'20{i}-{i+1}'

    # Load the shot clock data from that year
    shot_clock_dfs = get_shot_clock_splits(shot_clock_lst, season)

    # For each range, aggregate that range together with every later range in
    # the list, so the first key covers all ranges and the last key only itself
    final_shot_clock_dfs = {
        sc: aggregate_shot_clock_data(shot_clock_dfs, shot_clock_lst[start:])
        for start, sc in enumerate(shot_clock_lst)
    }

    # Pull the overall offensive rating for each team without the shotclock filter
    time.sleep(1)
    off_rating_stats = leaguedashteamstats.LeagueDashTeamStats(
        season = season,
        measure_type_detailed_defense = 'Advanced'
    ).get_json()
    orating_df = get_df_from_json(off_rating_stats)[['TEAM_ID', 'TEAM_NAME', 'OFF_RATING']]
    # OFF_RATING divided by 100 puts it on the same per-possession scale as PPP
    orating_df = orating_df.assign(orating_PPP=orating_df['OFF_RATING'] / 100)

    # Pull the overall aggregated data based on the shot clock: the entry for
    # the first range is the aggregation across all ranges
    full_sc = final_shot_clock_dfs[shot_clock_lst[0]]

    # Merge the dataframes and calculate the factor needed for each team to reach their overall offensive rating from the shot clock aggregation
    scaling_df = full_sc.merge(orating_df, on=['TEAM_ID', 'TEAM_NAME'], how='inner')
    scaling_df['factor'] = scaling_df['orating_PPP'] / scaling_df['PPP']
    factor_avg = scaling_df['factor'].mean()
    print(f'Factor average for {season} from shot clock PPP to overall PPP is {round(factor_avg, 3)}')

    # Loop through each shot clock timeframe and scale based on the factor average
    season_dfs = []
    for timeframe, agg_df in final_shot_clock_dfs.items():
        # assign() returns a new frame, so the aggregated frames stay untouched
        scaled_df = agg_df.assign(SCALED_PPP=agg_df['PPP'] * factor_avg)

        # Create "Overall" row: the unweighted mean of every numeric column
        # across teams, with the identifying fields overwritten
        new_row = scaled_df.mean(numeric_only=True).to_dict()
        new_row.update({'TEAM_NAME': 'Overall', 'SC_REMAINING': timeframe.split(' ')[0], 'TEAM_ID': 0})
        scaled_df.loc[len(scaled_df)] = new_row

        season_dfs.append(scaled_df)

    # Concatenate dataframes for season and add season column
    final_season_df = pd.concat(season_dfs)
    final_season_df['SEASON'] = season
    all_df_lst.append(final_season_df)

    print(f"Processed data for season: {season}\n")

# Concatenate the dataframes of all seasons and put the columns in their final order
final_df = pd.concat(all_df_lst).reset_index(drop=True)
final_df = final_df[cols]

Factor average for 2013-14 from shot clock PPP to overall PPP is 1.554
Processed data for season: 2013-14

Factor average for 2014-15 from shot clock PPP to overall PPP is 1.562
Processed data for season: 2014-15

Factor average for 2015-16 from shot clock PPP to overall PPP is 1.55
Processed data for season: 2015-16

Factor average for 2016-17 from shot clock PPP to overall PPP is 1.553
Processed data for season: 2016-17

Factor average for 2017-18 from shot clock PPP to overall PPP is 1.554
Processed data for season: 2017-18

Factor average for 2018-19 from shot clock PPP to overall PPP is 1.706
Processed data for season: 2018-19

Factor average for 2019-20 from shot clock PPP to overall PPP is 1.719
Processed data for season: 2019-20

Factor average for 2020-21 from shot clock PPP to overall PPP is 1.705
Processed data for season: 2020-21

Factor average for 2021-22 from shot clock PPP to overall PPP is 1.724
Processed data for season: 2021-22

Factor average for 2022-23 from shot c

In [13]:
# Preview only the first rows: display.max_rows is set to None above, so a bare
# `final_df` would render every row of the combined frame
final_df.head(10)

Unnamed: 0,TEAM_ID,TEAM_NAME,SEASON,SC_REMAINING,PPP,SCALED_PPP,TOTAL_PTS,TOTAL_POSS
0,1610612737,Atlanta Hawks,2013-14,24-22,0.677363,1.052352,8169.0,12060.0
1,1610612738,Boston Celtics,2013-14,24-22,0.647117,1.005361,7889.0,12191.0
2,1610612739,Cleveland Cavaliers,2013-14,24-22,0.652213,1.013278,8047.0,12338.0
3,1610612740,New Orleans Pelicans,2013-14,24-22,0.688105,1.06904,8174.0,11879.0
4,1610612741,Chicago Bulls,2013-14,24-22,0.653971,1.01601,7658.0,11710.0
5,1610612742,Dallas Mavericks,2013-14,24-22,0.704518,1.094539,8593.0,12197.0
6,1610612743,Denver Nuggets,2013-14,24-22,0.674312,1.047612,8555.0,12687.0
7,1610612744,Golden State Warriors,2013-14,24-22,0.696117,1.081487,8407.0,12077.0
8,1610612745,Houston Rockets,2013-14,24-22,0.72782,1.130741,8827.0,12128.0
9,1610612746,Los Angeles Clippers,2013-14,24-22,0.73174,1.136831,8846.0,12089.0


In [None]:
# Create the output folder first; to_csv does not create missing directories
os.makedirs('data', exist_ok=True)

# Save the combined frame (the row index is written as well, as the first column)
final_df.to_csv('data/shot_clock_ppp.csv')