In [1]:
# imports
import pandas as pd
from utils.utils import *
from datetime import datetime

In [4]:
# functions
def get_turnover_stats(year, by_player=True, min_possessions=300, min_turnovers=25):
    """
    Generate turnover statistics for a specific year at either player or team level.

    Reads four per-season CSV files under ``data/`` (turnover play-by-play,
    team adjustments, shot-clock PPP and usage stats), assigns a point value
    to every turnover row, then aggregates those values and scales them by
    TOUCHES and TIME_OF_POSS.

    Parameters:
    year (int): Season year (use starting year, e.g., 2024 for 2024-25 season)
    by_player (bool): If True, returns player-level stats; if False, returns team-level stats
    min_possessions (int): Minimum possessions for player filtering.
        NOTE(review): accepted for interface compatibility, but no filter is
        applied anywhere in this function.
    min_turnovers (int): Minimum turnovers for player filtering.
        NOTE(review): accepted for interface compatibility, but no filter is
        applied anywhere in this function.

    Returns:
    pandas.DataFrame: One row per (team, player_id, player) when ``by_player``
        is True (unsorted); otherwise one row per team, sorted by ``TOI_POSS``
        in descending order. Besides the grouping columns the frame holds
        ``TOI`` (sum of ``value``), ``TOs`` (count of non-null ``period``),
        the merged usage columns, and the ratios ``TOI_POSS``,
        ``TOI_10_TOUCHES``, ``TO_POSS`` and ``TO_10_TOUCHES``.
    """
    # Create season string, e.g. 2024 -> "2024_25"
    season = f"{year}_{str(year+1)[-2:]}"

    # Load data
    tov_df = pd.read_csv(f'data/tov_pbp/tov_data_{season}.csv')
    rating_df = pd.read_csv(f'data/team_adj/team_adj_{season}.csv')
    ppp_df = pd.read_csv(f'data/shot_clock_ppp/shot_clock_ppp_{season}.csv')
    scale_df = pd.read_csv(f'data/scale_data/usage_stats_{season}.csv')

    # Preprocessing: cap the shot clock at 24. Because NaN never compares
    # "less than" 24, min(24, x) also maps a missing shot clock to 24.
    tov_df['shot_clock'] = tov_df['shot_clock'].apply(lambda x: min(24, x))

    # Map team names to abbreviations
    nba_team_abbreviations = {
        "Atlanta Hawks": "ATL", "Boston Celtics": "BOS", "Brooklyn Nets": "BKN",
        "Charlotte Hornets": "CHA", "Chicago Bulls": "CHI", "Cleveland Cavaliers": "CLE",
        "Dallas Mavericks": "DAL", "Denver Nuggets": "DEN", "Detroit Pistons": "DET",
        "Golden State Warriors": "GSW", "Houston Rockets": "HOU", "Indiana Pacers": "IND",
        "LA Clippers": "LAC", "Los Angeles Lakers": "LAL", "Memphis Grizzlies": "MEM",
        "Miami Heat": "MIA", "Milwaukee Bucks": "MIL", "Minnesota Timberwolves": "MIN",
        "New Orleans Pelicans": "NOP", "New York Knicks": "NYK", "Oklahoma City Thunder": "OKC",
        "Orlando Magic": "ORL", "Philadelphia 76ers": "PHI", "Phoenix Suns": "PHX",
        "Portland Trail Blazers": "POR", "Sacramento Kings": "SAC", "San Antonio Spurs": "SAS",
        "Toronto Raptors": "TOR", "Utah Jazz": "UTA", "Washington Wizards": "WAS"
    }

    # Names missing from the mapping are left untouched.
    ppp_df['TEAM_NAME'] = ppp_df['TEAM_NAME'].apply(lambda x: nba_team_abbreviations.get(x, x))

    # Drop turnovers that lack either team BEFORE the row-wise computations
    # below. These rows are discarded regardless, so scoring them first only
    # added per-row lookups whose results were thrown away.
    tov_df.dropna(subset=['team', 'opp_team'], inplace=True)

    # Calculate lost points (per-row lookup into the shot-clock PPP table)
    tov_df['lost_points'] = tov_df.apply(lambda row: _lost_points(ppp_df, row), axis=1)

    # Calculate expected next points and dead ball adjustment.
    # get_relative_rating / get_relative_adj are not defined in this notebook;
    # presumably they come from `from utils.utils import *` -- verify.
    tov_df['expected_next_points'] = tov_df.apply(
        lambda row: get_relative_rating(rating_df, row['team'], row['opp_team'], year), axis=1
    )
    tov_df['dead_ball_adj'] = tov_df.apply(
        lambda row: get_relative_adj(rating_df, row['team'], row['opp_team'], year), axis=1
    )

    # Calculate turnover value
    tov_df['value'] = tov_df.apply(_get_value, axis=1)

    # Filter scale data for the specific season ("2024_25" -> "2024-25")
    scale_df = scale_df[scale_df['SEASON'] == season.replace('_', '-')]
    # NOTE(review): GP is carried along but never used in the ratios below.
    scale_df = scale_df[['PLAYER_ID', 'GP', 'TOUCHES', 'TIME_OF_POSS']]

    # Common aggregation function
    def process_data(data, groupby_cols, scale_data=None):
        """Aggregate turnover value/count by ``groupby_cols`` and attach usage ratios.

        When ``scale_data`` is given it is merged per player; when it is None
        the enclosing ``scale_df`` is rolled up to team level instead.
        """
        # Aggregate the data and give the columns their final names
        result = (
            data[groupby_cols + ['value', 'period']]
            .groupby(by=groupby_cols)
            .agg({'value': 'sum', 'period': 'count'})
            .reset_index()
            .rename(columns={'period': 'TOs', 'value': 'TOI'})
        )

        if scale_data is not None:
            # Player level: attach each player's usage numbers
            result = pd.merge(result, scale_data, left_on='player_id', right_on='PLAYER_ID', how='left')
        else:
            # Team level: map player IDs to their teams first.
            # NOTE(review): a player_id that appears with several teams keeps
            # only the last team seen, so all of that player's usage is
            # attributed to that one team.
            player_team_map = (
                data[['player_id', 'team']]
                .drop_duplicates()
                .set_index('player_id')['team']
                .to_dict()
            )

            # Build a new frame instead of writing a column into scale_df,
            # which is a filtered slice (avoids SettingWithCopyWarning and
            # leaves the enclosing frame unmodified).
            team_usage = scale_df.assign(team=scale_df['PLAYER_ID'].map(player_team_map))

            # Aggregate by team; players without a known team are excluded
            team_stats = team_usage.dropna(subset=['team']).groupby('team').agg({
                'TOUCHES': 'sum',
                'TIME_OF_POSS': 'sum'
            }).reset_index()

            # Merge team stats with result
            result = pd.merge(result, team_stats, on='team', how='left')

        # Calculate scaled metrics with final names directly.
        # NOTE(review): the *_10_TOUCHES columns are multiplied by 100, not 10;
        # kept as-is because downstream consumers may depend on the values.
        result['TOI_POSS'] = result['TOI'] / result['TIME_OF_POSS']
        result['TOI_10_TOUCHES'] = result['TOI'] / result['TOUCHES'] * 100
        result['TO_POSS'] = result['TOs'] / result['TIME_OF_POSS']
        result['TO_10_TOUCHES'] = result['TOs'] / result['TOUCHES'] * 100

        return result

    if by_player:
        # Player-level aggregation
        return process_data(tov_df, ['team', 'player_id', 'player'], scale_df)

    # Team-level aggregation
    by_data = process_data(tov_df, ['team'])
    return by_data.sort_values('TOI_POSS', ascending=False)

def _lost_points(df, row):
    """Helper function to calculate lost points based on shot clock"""
    if row['shot_clock'] >= 22:
        val = df[(df['SC_REMAINING'] == '24-22') & (df['TEAM_NAME'] == row['team'])]['SCALED_PPP']
    elif row['shot_clock'] >= 18:
        val = df[(df['SC_REMAINING'] == '22-18') & (df['TEAM_NAME'] == row['team'])]['SCALED_PPP']
    elif row['shot_clock'] >= 15:
        val = df[(df['SC_REMAINING'] == '18-15') & (df['TEAM_NAME'] == row['team'])]['SCALED_PPP']
    elif row['shot_clock'] >= 7:
        val = df[(df['SC_REMAINING'] == '15-7') & (df['TEAM_NAME'] == row['team'])]['SCALED_PPP']
    elif row['shot_clock'] >= 4:
        val = df[(df['SC_REMAINING'] == '7-4') & (df['TEAM_NAME'] == row['team'])]['SCALED_PPP']
    else:
        val = df[(df['SC_REMAINING'] == '4-0') & (df['TEAM_NAME'] == row['team'])]['SCALED_PPP']
    
    return val.iloc[0] if not val.empty else float('nan')

def _get_value(row):
    """Calculate the value of a turnover"""
    if row['dead_ball']:
        return row['lost_points'] - row['dead_ball_adj']
    return row['next_pos_points'] - row['expected_next_points'] + row['lost_points']

In [6]:
# Export player- and team-level turnover stats for every season from 2013 up
# to (but not including) the current calendar year.
for year in tqdm(range(2013, datetime.now().year)):
    # Build the season suffix the same way get_turnover_stats builds its input
    # file names (two-digit, zero-padded end year), so output files always
    # line up with the inputs -- `year % 2000 + 1` would drop the leading zero
    # for end years below 10.
    season = f"{year}_{str(year + 1)[-2:]}"

    # Thresholds are passed as 0 so that no player is excluded from the export.
    player_stats = get_turnover_stats(year, by_player=True, min_possessions=0, min_turnovers=0)
    player_stats.to_csv(f'data/player_stats/player_stats_{season}.csv')

    team_stats = get_turnover_stats(year, by_player=False, min_possessions=0, min_turnovers=0)
    team_stats.to_csv(f'data/team_stats/team_stats_{season}.csv')


100%|██████████| 12/12 [08:09<00:00, 40.80s/it]
