In [1]:
import requests
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import math
from datetime import datetime, timedelta
from time import strftime
import os
from requests.adapters import HTTPAdapter, Retry
import json
import pandas as pd
import glob

from pbp_utils import get_matchup_games, retrieve_schedule, get_livedata_from_game, scrape_month_playbyplay
from team_utils import get_team_info, save_team_info_to_file


In [2]:
today_datetime= datetime.now()
yesterday_datetime = today_datetime - timedelta(days=1, hours=6) # UTC offset
yesterday = yesterday_datetime.strftime('%Y-%m-%d')
yesterday

'2024-11-11'

In [3]:
with open('data/team_info.json', 'r') as f:
    team_info = json.load(f)

In [4]:
# doesn't include international games
start_2024 = '2024-10-08'
# so_far = (retrieve_schedule(start_2024, yesterday))
# so_far

In [5]:
first_week = retrieve_schedule(start_2024, '2024-10-14')
psg = get_livedata_from_game(first_week)

psg_df = pd.DataFrame(psg)

In [6]:
def clean_pbp_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Renames columns by removing the 'details_' prefix and performs other cleaning operations.

    Parameters:
        df (pd.DataFrame): The original DataFrame with columns to clean.

    Returns:
        pd.DataFrame: A DataFrame with cleaned columns.
    """
    # Rename columns by removing the 'details_' prefix
    df = df.rename(columns=lambda x: x.replace('details_', '') if x.startswith('details_') else x)

    # List of columns to drop if they exist
    columns_to_drop = ['maxRegulationPeriods', 'timeInPeriod', 'period_type', 'highlightClip',
                       'highlightClipFr', 'discreteClip', 'discreteClipFr', 'highlightClipSharingUrl','highlightClipSharingUrlFr']
    
    # Drop columns that exist in the DataFrame
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    # Rename columns
    column_rename_map = {
        'period_number': 'period', 'timeRemaining': 'time',
        'homeTeamDefendingSide': 'homeSide', 'typeCode': 'code',
        'typeDescKey': 'desc', 'eventOwnerTeamId': 'team', 'losingPlayerId': 'loser',
        'winningPlayerId': 'winner', 'xCoord': 'x', 'yCoord': 'y',
        'zoneCode': 'zone', 'shootingPlayerId': 'shooter', 'goalieInNetId': 'goalie',
        'hittingPlayerId': 'hitter', 'hitteePlayerId': 'hittee',
        'drawnByPlayerId': 'drawnBy', 'servedByPlayerId': 'servedBy',
        'committedByPlayerId': 'committedBy', 'blockingPlayerId': 'blocker',
        'playerId': 'player', 'scoringPlayerId': 'scorer', 'assist1PlayerId': 'assist1',
        'assist2PlayerId': 'assist2'
    }
    df = df.rename(columns={k: v for k, v in column_rename_map.items() if k in df.columns})

    # Convert player_id related columns to integers
    player_id_columns = ['player','loser', 'winner', 'hitter', 'hittee', 'shooter', 'goalie', 
                         'blocker', 'playerId', 'scorer', 'assist1',
                         'assist2', 'committedBy', 'drawnBy', 'servedBy']

    for col in player_id_columns:
        if col in df.columns:
            df[col] = df[col].fillna(0).astype(int)

    return df

In [7]:
psg_df = clean_pbp_data(psg_df)

In [8]:
psg_df.head(10)

Unnamed: 0,gid,eventId,sortOrder,period,time,situationCode,homeSide,code,desc,team,loser,winner,x,y,zone,hitter,hittee,shotType,shooter,goalie,awaySOG,homeSOG,reason,blocker,player,secondaryReason,scorer,scoringPlayerTotal,assist1,assist1PlayerTotal,awayScore,homeScore,assist2,assist2PlayerTotal,code.1,descKey,duration,committedBy,drawnBy,servedBy
0,2024020003,9,8,1,20:00,1551,left,520,period-start,,0,0,,,,0,0,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
1,2024020003,8,11,1,20:00,1551,left,502,faceoff,19.0,8482665,8480023,0.0,0.0,N,0,0,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
2,2024020003,75,12,1,19:52,1551,left,503,hit,55.0,0,0,93.0,28.0,D,8477955,8476892,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
3,2024020003,78,13,1,19:50,1551,left,503,hit,19.0,0,0,67.0,-40.0,N,8482077,8474586,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
4,2024020003,61,22,1,19:22,1551,left,506,shot-on-goal,19.0,0,0,-61.0,27.0,O,0,0,wrist,8477402,8475831,1.0,0.0,,0,0,,0,,0,,,,0,,,,,0,0,0
5,2024020003,64,25,1,19:04,1551,left,507,missed-shot,19.0,0,0,-61.0,-7.0,O,0,0,wrist,8479385,8475831,,,hit-crossbar,0,0,,0,,0,,,,0,,,,,0,0,0
6,2024020003,70,30,1,18:49,1551,left,506,shot-on-goal,55.0,0,0,79.0,-30.0,O,0,0,wrist,8477416,8476412,1.0,1.0,,0,0,,0,,0,,,,0,,,,,0,0,0
7,2024020003,76,35,1,18:29,1551,left,507,missed-shot,55.0,0,0,83.0,9.0,O,0,0,backhand,8480009,8476412,,,wide-right,0,0,,0,,0,,,,0,,,,,0,0,0
8,2024020003,96,36,1,18:26,1551,left,503,hit,19.0,0,0,91.0,-26.0,N,8475170,8478840,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
9,2024020003,77,37,1,18:13,1551,left,508,blocked-shot,55.0,0,0,73.0,-9.0,D,0,0,,8478840,0,,,blocked,8470600,0,,0,,0,,,,0,,,,,0,0,0


In [18]:
def calculate_player_stats(pbp_df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates counting stats for players based on play-by-play data,
    excluding entries with player_id = 0.

    Parameters:
        pbp_df (pd.DataFrame): Play-by-play DataFrame.

    Returns:
        pd.DataFrame: DataFrame with counting stats per player_id, including unique games played.
    """
    # Initialize a dictionary to hold stats
    stats = {}

    # Define the statistics to track and their corresponding DataFrame columns
    stat_mappings = {
        'goals': ['scorer'],
        'assists': ['assist1', 'assist2'],
        'shots': ['shooter'],
        'hits': ['hitter']
        # Add more stats and their corresponding columns as needed
    }

    # Calculate games_played as unique gids per player_id
    # Melt the DataFrame to have one player_id per row for relevant columns
    player_game_mapping = pd.melt(
        pbp_df,
        id_vars=['gid'],
        value_vars=[
            'player', 'loser', 'winner', 'hitter', 'hittee',
            'shooter', 'goalie', 'blocker', 'scorer', 'assist1',
            'assist2', 'committedBy', 'drawnBy', 'servedBy'
        ],
        var_name='event_type',
        value_name='player_id'
    )

    # Drop rows with NaN player_ids and exclude player_id = 0
    player_game_mapping = player_game_mapping.dropna(subset=['player_id'])
    player_game_mapping = player_game_mapping[player_game_mapping['player_id'] != 0]
    player_game_mapping['player_id'] = player_game_mapping['player_id'].astype(int)

    # Group by player_id and count unique gids
    games_played = player_game_mapping.groupby('player_id')['gid'].nunique()

    # Iterate over each statistic
    for stat, columns in stat_mappings.items():
        for col in columns:
            # Drop NaN values, convert to int, and exclude player_id = 0
            player_ids = pbp_df[col].dropna().astype(int)
            valid_player_ids = player_ids[player_ids != 0]

            # Count occurrences of each valid player_id
            counts = valid_player_ids.value_counts()

            # Accumulate counts in the stats dictionary
            for player_id, count in counts.items():
                if player_id not in stats:
                    stats[player_id] = {s: 0 for s in stat_mappings.keys()}
                stats[player_id][stat] += count

    # Convert the stats dictionary to a DataFrame
    stats_df = pd.DataFrame.from_dict(stats, orient='index').reset_index()
    stats_df = stats_df.rename(columns={'index': 'player_id'})

    # Add games_played to the stats DataFrame
    stats_df['games_played'] = stats_df['player_id'].map(games_played).fillna(0).astype(int)

    return stats_df

In [19]:
calculate_player_stats(psg_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_game_mapping['player_id'] = player_game_mapping['player_id'].astype(int)


Unnamed: 0,player_id,goals,assists,shots,hits,games_played
0,8482699,5,0,25,2,4
1,8481540,4,0,19,3,4
2,8476460,4,2,12,0,3
3,8477933,3,2,17,4,4
4,8476882,3,2,11,0,3
...,...,...,...,...,...,...
603,8478970,0,0,0,2,1
604,8483447,0,0,0,1,1
605,8474037,0,0,0,1,2
606,8476878,0,0,0,1,1
