In [2]:
import requests
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import math
from datetime import datetime, timedelta
from time import strftime
import os
from requests.adapters import HTTPAdapter, Retry
import json
import os
import pandas as pd
import glob

from pbp_utils import get_matchup_games, retrieve_schedule, get_livedata_from_game, scrape_month_playbyplay

In [3]:
today_datetime= datetime.now()
yesterday_datetime = today_datetime - timedelta(days=1, hours=6) # UTC offset
yesterday = yesterday_datetime.strftime('%Y-%m-%d')
yesterday

'2024-10-18'

In [4]:
# Creating a dictionary to store the NHL team information
def get_team_info():
    nhl_teams = {}
    # https://api.nhle.com/stats/rest/en/team
    response = requests.get("https://api.nhle.com/stats/rest/en/team", params={"Content-Type": "application/json"})
    data = response.json()

    for team in data["data"]:
        team_id = team['id']
        team_info = {
            "fullName": team["fullName"],
            "triCode": team["triCode"]
        }
        nhl_teams[team_id] = team_info
    return nhl_teams

In [5]:
team_info = get_team_info()

In [6]:
def save_team_info_to_file(team_info, file_path):
    """
    Saves the team_info dictionary to a JSON file.

    Parameters:
        team_info (dict): The dictionary containing team information.
        file_path (str): The path to the file where the data will be saved.
    """
    # Ensure the directory exists
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Write the team_info dictionary to a JSON file with indentation for readability
    with open(file_path, 'w') as f:
        json.dump(team_info, f, indent=4)

# Specify the desired file path
file_path = 'data/team_info.json'

# Call the function to save team_info
save_team_info_to_file(team_info, file_path)

print(f"team_info has been saved to {file_path}")

team_info has been saved to data/team_info.json


In [7]:
# doesn't include international games
start_2024 = '2024-10-08'
so_far = (retrieve_schedule(start_2024, yesterday))

In [8]:
so_far

{'game_ids': [2024020003,
  2024020004,
  2024020005,
  2024020006,
  2024020007,
  2024020008,
  2024020009,
  2024020010,
  2024020011,
  2024020012,
  2024020013,
  2024020015,
  2024020016,
  2024020014,
  2024020017,
  2024020018,
  2024020019,
  2024020020,
  2024020021,
  2024020022,
  2024020023,
  2024020024,
  2024020025,
  2024020026,
  2024020027,
  2024020028,
  2024020029,
  2024020030,
  2024020031,
  2024020032,
  2024020033,
  2024020034,
  2024020035,
  2024020036,
  2024020037,
  2024020038,
  2024020039,
  2024020040,
  2024020041,
  2024020042,
  2024020043,
  2024020044,
  2024020045,
  2024020046,
  2024020047,
  2024020048,
  2024020049,
  2024020050,
  2024020051,
  2024020052,
  2024020053,
  2024020054,
  2024020055,
  2024020056,
  2024020057,
  2024020058,
  2024020059,
  2024020060,
  2024020061,
  2024020062,
  2024020064,
  2024020065,
  2024020066,
  2024020067,
  2024020063,
  2024020068,
  2024020069,
  2024020070,
  2024020071,
  2024020072,
  202402

In [9]:
# psg_df = get_livedata_from_game(
#     {'game_ids':[2024020019],
#      'game_dates':['2024-10-10']}
# )
psg = get_livedata_from_game(so_far)

psg_df = pd.DataFrame(psg)

In [10]:
def clean_pbp_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Renames columns by removing the 'details_' prefix and performs other cleaning operations.

    Parameters:
        df (pd.DataFrame): The original DataFrame with columns to clean.

    Returns:
        pd.DataFrame: A DataFrame with cleaned columns.
    """
    # Rename columns by removing the 'details_' prefix
    df = df.rename(columns=lambda x: x.replace('details_', '') if x.startswith('details_') else x)

    # List of columns to drop if they exist
    columns_to_drop = ['maxRegulationPeriods', 'timeInPeriod', 'period_type', 'highlightClip',
                       'highlightClipFr', 'discreteClip', 'discreteClipFr', 'highlightClipSharingUrl','highlightClipSharingUrlFr']
    
    # Drop columns that exist in the DataFrame
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    # Rename columns
    column_rename_map = {
        'period_number': 'period', 'timeRemaining': 'time',
        'homeTeamDefendingSide': 'homeSide', 'typeCode': 'code',
        'typeDescKey': 'desc', 'eventOwnerTeamId': 'team', 'losingPlayerId': 'loser',
        'winningPlayerId': 'winner', 'xCoord': 'x', 'yCoord': 'y',
        'zoneCode': 'zone', 'shootingPlayerId': 'shooter', 'goalieInNetId': 'goalie',
        'hittingPlayerId': 'hitter', 'hitteePlayerId': 'hittee',
        'drawnByPlayerId': 'drawnBy', 'servedByPlayerId': 'servedBy',
        'committedByPlayerId': 'committedBy', 'blockingPlayerId': 'blocker',
        'playerId': 'player', 'scoringPlayerId': 'scorer', 'assist1PlayerId': 'assist1',
        'assist2PlayerId': 'assist2'
    }
    df = df.rename(columns={k: v for k, v in column_rename_map.items() if k in df.columns})

    # Convert player_id related columns to integers
    player_id_columns = ['player','loser', 'winner', 'hitter', 'hittee', 'shooter', 'goalie', 
                         'blocker', 'playerId', 'scorer', 'assist1',
                         'assist2', 'committedBy', 'drawnBy', 'servedBy']

    for col in player_id_columns:
        if col in df.columns:
            df[col] = df[col].fillna(0).astype(int)

    return df

In [11]:
def save_pbp_month(month: int, year: int, directory: str = 'data/') -> None:
    """
    Scrapes play-by-play data for a given month and year, cleans it, and saves it to a CSV file
    with the filename format 'nhl_pbp_{month}_{year}.csv'.
    
    This function integrates the scraping and saving processes by calling the `scrape_month_playbyplay`
    function internally.
    
    Parameters:
        month (int): The month number (1-12).
        year (int): The four-digit year (e.g., 2023).
        directory (str): The directory where the file will be saved. Defaults to 'data/'.
    """
    import os
    
    # Scrape the play-by-play data for the specified month and year
    df = scrape_month_playbyplay(year, month)
    
    # Clean the scraped data
    df_cleaned = clean_pbp_data(df)
    
    # Format the filename with leading zeros for month and last two digits of year
    filename = f'nhl_pbp_{month:02d}_{str(year)[-2:]}.csv'
    file_path = os.path.join(directory, filename)
    
    # Ensure the target directory exists
    os.makedirs(directory, exist_ok=True)
    
    # Save the cleaned DataFrame to a CSV file
    df_cleaned.to_csv(file_path, index=False)
    
    print(f"Play-by-play data for {month}/{year} saved to {file_path}")

In [13]:
# Assuming you have the `scrape_month_playbyplay` and `clean_pbp_data` functions defined,
# you can save the play-by-play data for October 2023 as follows:

save_pbp_month(month=10, year=2021)
save_pbp_month(month=11, year=2021)
save_pbp_month(month=12, year=2021)
save_pbp_month(month=1, year=2022)
save_pbp_month(month=2, year=2022)
save_pbp_month(month=3, year=2022)
save_pbp_month(month=4, year=2022)
save_pbp_month(month=5, year=2022)

Scraping play-by-play data from 2021-10-01 to 2021-10-31
Play-by-play data for 10/2021 saved to data/nhl_pbp_10_21.csv
Scraping play-by-play data from 2021-11-01 to 2021-11-30
Play-by-play data for 11/2021 saved to data/nhl_pbp_11_21.csv
Scraping play-by-play data from 2021-12-01 to 2021-12-31
Play-by-play data for 12/2021 saved to data/nhl_pbp_12_21.csv
Scraping play-by-play data from 2022-01-01 to 2022-01-31
Play-by-play data for 1/2022 saved to data/nhl_pbp_01_22.csv
Scraping play-by-play data from 2022-02-01 to 2022-02-28
Play-by-play data for 2/2022 saved to data/nhl_pbp_02_22.csv
Scraping play-by-play data from 2022-03-01 to 2022-03-31
Play-by-play data for 3/2022 saved to data/nhl_pbp_03_22.csv
Scraping play-by-play data from 2022-04-01 to 2022-04-30
Play-by-play data for 4/2022 saved to data/nhl_pbp_04_22.csv
Scraping play-by-play data from 2022-05-01 to 2022-05-31
Play-by-play data for 5/2022 saved to data/nhl_pbp_05_22.csv


In [17]:
def load_pbp_files(directory='data', prefix='nhl_pbp_'):
    """
    Loads all play-by-play (PBP) CSV files in the specified directory that start with the given prefix.

    Parameters:
        directory (str): The path to the directory containing PBP files. Defaults to 'data'.
        prefix (str): The prefix that PBP files start with. Defaults to 'nhl_pbp_'.

    Returns:
        pd.DataFrame: A concatenated DataFrame containing data from all matching PBP files.
    """
    # Construct the file pattern
    pattern = os.path.join(directory, f'{prefix}*.csv')
    
    # Retrieve all file paths matching the pattern
    file_paths = glob.glob(pattern)
    
    # List to hold individual DataFrames
    df_list = []
    
    # Iterate over each file and read its contents
    for file in file_paths:
        try:
            df = pd.read_csv(file)
            df_list.append(df)
            print(f"Successfully loaded: {file}")
        except Exception as e:
            print(f"Error loading {file}: {e}")
    
    # Concatenate all DataFrames if any are loaded
    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
        print(f"Total files loaded and concatenated: {len(df_list)}")
        return combined_df
    else:
        print("No files found matching the pattern.")
        return pd.DataFrame()

In [18]:
df = load_pbp_files()
df.head(50)

Successfully loaded: data/nhl_pbp_03_22.csv
Successfully loaded: data/nhl_pbp_11_22.csv
Successfully loaded: data/nhl_pbp_11_23.csv
Successfully loaded: data/nhl_pbp_03_23.csv
Successfully loaded: data/nhl_pbp_11_21.csv
Successfully loaded: data/nhl_pbp_01_24.csv
Successfully loaded: data/nhl_pbp_03_24.csv
Successfully loaded: data/nhl_pbp_01_22.csv
Successfully loaded: data/nhl_pbp_01_23.csv
Successfully loaded: data/nhl_pbp_05_23.csv
Successfully loaded: data/nhl_pbp_05_22.csv
Successfully loaded: data/nhl_pbp_05_24.csv
Successfully loaded: data/nhl_pbp_10_21.csv
Successfully loaded: data/nhl_pbp_10_23.csv
Successfully loaded: data/nhl_pbp_02_23.csv
Successfully loaded: data/nhl_pbp_02_22.csv
Successfully loaded: data/nhl_pbp_10_22.csv
Successfully loaded: data/nhl_pbp_12_23.csv
Successfully loaded: data/nhl_pbp_12_22.csv
Successfully loaded: data/nhl_pbp_12_21.csv
Successfully loaded: data/nhl_pbp_02_24.csv
Successfully loaded: data/nhl_pbp_04_22.csv
Successfully loaded: data/nhl_pb

Unnamed: 0,gid,eventId,sortOrder,period,time,situationCode,homeSide,code,desc,team,loser,winner,x,y,zone,reason,hitter,hittee,player,shotType,shooter,goalie,scorer,scoringPlayerTotal,assist1,assist1PlayerTotal,assist2,assist2PlayerTotal,awayScore,homeScore,blocker,awaySOG,homeSOG,secondaryReason,code.1,descKey,duration,committedBy,drawnBy,servedBy
0,2021020852,51,8,1,20:00,1551.0,left,520,period-start,,0,0,,,,,0,0,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0
1,2021020852,52,9,1,20:00,1551.0,left,502,faceoff,9.0,8478010,8478400,0.0,0.0,N,,0,0,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0
2,2021020852,8,10,1,19:45,1551.0,left,516,stoppage,,0,0,,,,offside,0,0,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0
3,2021020852,53,11,1,19:45,1551.0,left,502,faceoff,9.0,8478010,8478400,20.0,22.0,N,,0,0,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0
4,2021020852,9,12,1,19:31,1551.0,left,503,hit,9.0,0,0,-89.0,33.0,O,,8477482,8476292,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0
5,2021020852,10,13,1,19:24,1551.0,left,503,hit,9.0,0,0,-85.0,-36.0,O,,8480801,8478416,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0
6,2021020852,11,15,1,19:17,1551.0,left,503,hit,9.0,0,0,-88.0,34.0,O,,8477482,8476292,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0
7,2021020852,12,18,1,19:04,1551.0,left,525,takeaway,9.0,0,0,-71.0,35.0,O,,0,0,8477015,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0
8,2021020852,54,19,1,18:56,1551.0,left,507,missed-shot,9.0,0,0,-43.0,22.0,O,wide-of-net,0,0,0,wrist,8479458,8476883,0,,0,,0,,,,0,,,,,,,0,0,0
9,2021020852,55,20,1,18:49,1551.0,left,505,goal,9.0,0,0,-72.0,-7.0,O,,0,0,0,tip-in,0,8476883,8474589,6.0,8478469,23.0,8477015,22.0,1.0,0.0,0,,,,,,,0,0,0


In [31]:
# Filter for shot-on-goal and goal events, along with their preceding events
all_events = df.copy()

# Add a column to indicate if the **current** event is a shot or goal
all_events['is_shot_or_goal'] = all_events['desc'].isin(['shot-on-goal', 'goal'])

# Add a column to indicate if the **next** event (since data is descending) is a shot or goal
all_events['next_is_shot_or_goal'] = all_events['is_shot_or_goal'].shift(1)

# Filter for shot-on-goal and goal events, and events immediately preceding them
shot_goal_events = all_events[
    all_events['is_shot_or_goal'] | all_events['next_is_shot_or_goal']
].copy()

# Add a column to easily identify which rows are shots/goals and which are preceding events
shot_goal_events['event_type'] = np.where(
    shot_goal_events['is_shot_or_goal'],
    'shot_or_goal',
    'preceding_event'
)

# Sort the filtered data to maintain the original descending order
shot_goal_events.sort_index(inplace=True)

In [32]:
shot_goal_events.head(15)

Unnamed: 0,gid,eventId,sortOrder,period,time,situationCode,homeSide,code,desc,team,loser,winner,x,y,zone,reason,hitter,hittee,player,shotType,shooter,goalie,scorer,scoringPlayerTotal,assist1,assist1PlayerTotal,assist2,assist2PlayerTotal,awayScore,homeScore,blocker,awaySOG,homeSOG,secondaryReason,code.1,descKey,duration,committedBy,drawnBy,servedBy,is_shot_or_goal,next_is_shot_or_goal,event_type
9,2021020852,55,20,1,18:49,1551.0,left,505,goal,9.0,0,0,-72.0,-7.0,O,,0,0,0,tip-in,0,8476883,8474589,6.0,8478469,23.0,8477015,22.0,1.0,0.0,0,,,,,,,0,0,0,True,False,shot_or_goal
10,2021020852,56,23,1,18:49,1551.0,left,502,faceoff,14.0,8482116,8474564,0.0,0.0,N,,0,0,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0,False,True,preceding_event
22,2021020852,62,45,1,16:17,1551.0,left,505,goal,9.0,0,0,-82.0,23.0,O,,0,0,0,wrist,0,8476883,8477482,9.0,8478400,1.0,8480801,21.0,2.0,0.0,0,,,,,,,0,0,0,True,False,shot_or_goal
23,2021020852,63,48,1,16:17,1551.0,left,502,faceoff,9.0,8474564,8477426,0.0,0.0,N,,0,0,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0,False,True,preceding_event
25,2021020852,64,51,1,15:40,1551.0,left,506,shot-on-goal,9.0,0,0,-69.0,29.0,O,,0,0,0,snap,8477015,8476883,0,,0,,0,,,,0,1.0,0.0,,,,,0,0,0,True,False,shot_or_goal
26,2021020852,20,52,1,15:40,1551.0,left,516,stoppage,,0,0,,,,goalie-stopped-after-sog,0,0,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0,False,True,preceding_event
28,2021020852,66,56,1,15:13,1551.0,left,506,shot-on-goal,14.0,0,0,55.0,19.0,O,,0,0,0,slap,8475167,8476899,0,,0,,0,,,,0,1.0,1.0,,,,,0,0,0,True,False,shot_or_goal
29,2021020852,67,62,1,14:33,1551.0,left,506,shot-on-goal,9.0,0,0,-78.0,21.0,O,,0,0,0,wrist,8475766,8476883,0,,0,,0,,,,0,2.0,1.0,,,,,0,0,0,True,True,shot_or_goal
30,2021020852,21,63,1,14:31,1551.0,left,503,hit,14.0,0,0,-57.0,-39.0,D,,8470621,8479580,0,,0,0,0,,0,,0,,,,0,,,,,,,0,0,0,False,True,preceding_event
33,2021020852,68,68,1,13:53,1551.0,left,506,shot-on-goal,9.0,0,0,-57.0,-19.0,O,,0,0,0,wrist,8480801,8476883,0,,0,,0,,,,0,3.0,1.0,,,,,0,0,0,True,False,shot_or_goal


In [34]:
def get_player_game_log_url(player_id, season, game_type):
    """
    Generate the URL for a player's game log from the NHL API.

    Args:
        player_id (int): The unique identifier for the player.
        season (int): The season year in the format YYYYYYYY (e.g., 20232024 for the 2023-2024 season).
        game_type (int: The type of games to retrieve (e.g., '2' for regular season, '3' for playoffs).

    Returns:
        str: The complete URL for the player's game log.

    Example:
        >>> get_player_game_log_url(8478236, 20232024, 'R')
        'https://api-web.nhle.com/v1/player/8478236/game-log/20242023/R'
    """
    return f"https://api-web.nhle.com/v1/player/{player_id}/game-log/{season}/{game_type}"

In [37]:
get_player_game_log_url(player_id=8478519, season=20232024, game_type='2')

'https://api-web.nhle.com/v1/player/8478519/game-log/20232024/2'

In [141]:
import math

# Define goal location coordinates to calculate distance for shot
GOAL_LOCATION = [89, 0]

# Modify shot location coordinates
# If x coord is negative, reflect y coord so side which shot came from is consistent
df['y'] = np.where(df['x'] < 0, df['y'] * -1, df['y'])
df['x'] = np.where(df['x'] < 0, np.absolute(df['x']), df['x'])

def calculate_shot_distance(x, y):
    dist = math.dist([x, y], GOAL_LOCATION)
    return round(dist, 2)

# Calculate shot distance for shots on goal
df['shotDist'] = df.apply(lambda row: calculate_shot_distance(row['x'], row['y']) 
                          if row['desc'] == 'shot-on-goal' else None, axis=1)

# Display the first few rows to verify the new column
df[['desc', 'x', 'y', 'shotDist']].head(10)


Unnamed: 0,desc,x,y,shotDist
0,period-start,,,
1,faceoff,0.0,0.0,
2,hit,93.0,28.0,
3,hit,67.0,-40.0,
4,shot-on-goal,61.0,-27.0,38.9
5,missed-shot,61.0,7.0,
6,shot-on-goal,79.0,-30.0,31.62
7,missed-shot,83.0,9.0,
8,hit,91.0,-26.0,
9,blocked-shot,73.0,-9.0,
