In [1]:
import requests
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import math
from datetime import datetime, timedelta
from time import strftime
import os
from requests.adapters import HTTPAdapter, Retry
import json
import os
import pandas as pd
import glob

from pbp_utils import get_matchup_games, retrieve_schedule, get_livedata_from_game, scrape_month_playbyplay

In [20]:
# Naive timestamp taken from the kernel's local clock (datetime.now() carries no tzinfo).
today_datetime = datetime.now()

# Step back one day plus six more hours. The original author labelled the six
# hours a "UTC offset"; NOTE(review): now() is local time, so confirm the
# intended timezone handling before relying on this near midnight.
yesterday_datetime = today_datetime - timedelta(days=1, hours=6)

# ISO-style date string (YYYY-MM-DD) used as the schedule end date further down.
yesterday = f"{yesterday_datetime:%Y-%m-%d}"
yesterday

'2024-10-17'

In [17]:
# Creating a dictionary to store the NHL team information
def get_team_info(timeout: float = 10.0) -> dict:
    """
    Fetches NHL team metadata and indexes it by team id.

    Sends a GET request to https://api.nhle.com/stats/rest/en/team and reads
    the list of team records stored under the response's "data" key.

    Parameters:
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 10.0 so a stalled connection cannot hang the notebook.

    Returns:
        dict: Maps each team's ``id`` to a dict with its ``fullName`` and ``triCode``.

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        KeyError: If the payload lacks "data" or a record lacks an expected field.
    """
    response = requests.get(
        "https://api.nhle.com/stats/rest/en/team",
        # Content negotiation belongs in the headers; the previous code sent
        # "Content-Type" as a URL query parameter, which the server never reads
        # as a header.
        headers={"Accept": "application/json"},
        timeout=timeout,
    )
    # Fail loudly on an error status instead of trying to parse an error body.
    response.raise_for_status()
    payload = response.json()

    return {
        team["id"]: {"fullName": team["fullName"], "triCode": team["triCode"]}
        for team in payload["data"]
    }

In [18]:
# Build the team lookup once; get_team_info() performs a live HTTP request to the NHL stats API.
team_info = get_team_info()

In [19]:
def save_team_info_to_file(team_info, file_path):
    """
    Saves the team_info dictionary to a JSON file.

    Any missing parent directories of ``file_path`` are created first. A bare
    filename (no directory component) is written to the current working directory.

    Note that JSON object keys are always strings, so integer team ids used as
    dictionary keys are written as strings by ``json.dump``.

    Parameters:
        team_info (dict): The dictionary containing team information.
        file_path (str): The path to the file where the data will be saved.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
        TypeError: If team_info contains values that are not JSON serialisable.
    """
    # os.path.dirname returns '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write the team_info dictionary to a JSON file with indentation for readability
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(team_info, f, indent=4)

# Destination for the JSON snapshot; the path is relative to the notebook's working directory
file_path = 'data/team_info.json'

# Persist the team_info dictionary built in the earlier cell
save_team_info_to_file(team_info, file_path)

# Confirmation message so the saved location shows up in the cell output
print(f"team_info has been saved to {file_path}")

team_info has been saved to data/team_info.json


In [21]:
# Start date for the schedule pull (original note: doesn't include international games)
start_2024 = '2024-10-08'
# Schedule from start_2024 through `yesterday` (computed in an earlier cell).
# The saved output below shows a dict with 'game_ids' and 'game_dates' keys.
so_far = (retrieve_schedule(start_2024, yesterday))

In [22]:
# Display the raw schedule dict; this prints every game id, so the saved output is long
so_far

{'game_ids': [2024020003,
  2024020004,
  2024020005,
  2024020006,
  2024020007,
  2024020008,
  2024020009,
  2024020010,
  2024020011,
  2024020012,
  2024020013,
  2024020015,
  2024020016,
  2024020014,
  2024020017,
  2024020018,
  2024020019,
  2024020020,
  2024020021,
  2024020022,
  2024020023,
  2024020024,
  2024020025,
  2024020026,
  2024020027,
  2024020028,
  2024020029,
  2024020030,
  2024020031,
  2024020032,
  2024020033,
  2024020034,
  2024020035,
  2024020036,
  2024020037,
  2024020038,
  2024020039,
  2024020040,
  2024020041,
  2024020042,
  2024020043,
  2024020044,
  2024020045,
  2024020046,
  2024020047,
  2024020048,
  2024020049,
  2024020050,
  2024020051,
  2024020052,
  2024020053,
  2024020054,
  2024020055,
  2024020056,
  2024020057,
  2024020058,
  2024020059,
  2024020060,
  2024020061,
  2024020062,
  2024020064,
  2024020065,
  2024020066,
  2024020067,
  2024020063,
  2024020068,
  2024020069,
  2024020070,
  2024020071],
 'game_dates': ['2024

In [23]:
# Single-game variant, kept commented out: pass a dict holding one game id and
# its date instead of the full schedule.
# psg_df = get_livedata_from_game(
#     {'game_ids':[2024020019],
#      'game_dates':['2024-10-10']}
# )
# Pull live data for every game listed in so_far.
# NOTE(review): the return type of get_livedata_from_game is not visible here —
# presumably a list of records; confirm against pbp_utils.
psg = get_livedata_from_game(so_far)

# Wrap the result in a DataFrame for inspection
psg_df = pd.DataFrame(psg)

In [15]:
def clean_pbp_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans a raw play-by-play DataFrame and returns the result as a new frame.

    Steps, in order:
        1. Strip a leading 'details_' from column names (only the prefix; any
           later occurrence of 'details_' inside a name is kept).
        2. Drop clip/period bookkeeping columns when they are present.
        3. Shorten the remaining API column names (e.g. 'xCoord' -> 'x').
        4. Cast player-id columns to int, using 0 where the id is missing.

    Parameters:
        df (pd.DataFrame): The original DataFrame with columns to clean.
            Column names are expected to be strings.

    Returns:
        pd.DataFrame: A DataFrame with cleaned columns. The input is not modified.
    """
    prefix = 'details_'

    # Slice the prefix off rather than using str.replace, which would also
    # delete any later 'details_' inside the column name.
    df = df.rename(columns=lambda name: name[len(prefix):] if name.startswith(prefix) else name)

    # Columns that carry no analytical value; errors='ignore' skips absent ones
    columns_to_drop = ['maxRegulationPeriods', 'timeInPeriod', 'period_type', 'highlightClip',
                       'highlightClipFr', 'discreteClip', 'discreteClipFr', 'highlightClipSharingUrl',
                       'highlightClipSharingUrlFr']
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Short names for the remaining columns; rename() ignores keys that are absent
    column_rename_map = {
        'period_number': 'period', 'timeRemaining': 'time',
        'homeTeamDefendingSide': 'homeSide', 'typeCode': 'code',
        'typeDescKey': 'desc', 'eventOwnerTeamId': 'team', 'losingPlayerId': 'loser',
        'winningPlayerId': 'winner', 'xCoord': 'x', 'yCoord': 'y',
        'zoneCode': 'zone', 'shootingPlayerId': 'shooter', 'goalieInNetId': 'goalie',
        'hittingPlayerId': 'hitter', 'hitteePlayerId': 'hittee',
        'drawnByPlayerId': 'drawnBy', 'servedByPlayerId': 'servedBy',
        'committedByPlayerId': 'committedBy', 'blockingPlayerId': 'blocker',
        'playerId': 'player', 'scoringPlayerId': 'scorer', 'assist1PlayerId': 'assist1',
        'assist2PlayerId': 'assist2'
    }
    df = df.rename(columns=column_rename_map)

    # Player-id columns after renaming ('playerId' is already 'player' by now).
    player_id_columns = ['player', 'loser', 'winner', 'hitter', 'hittee', 'shooter', 'goalie',
                         'blocker', 'scorer', 'assist1',
                         'assist2', 'committedBy', 'drawnBy', 'servedBy']

    # Missing ids become 0 so the column can hold plain integers
    for col in player_id_columns:
        if col in df.columns:
            df[col] = df[col].fillna(0).astype(int)

    return df

In [16]:
def save_pbp_month(month: int, year: int, directory: str = 'data/') -> None:
    """
    Scrapes play-by-play data for a given month and year, cleans it, and saves it to a CSV file.

    The file is named 'nhl_pbp_{MM}_{YY}.csv', where MM is the zero-padded month
    and YY is the last two digits of the year (e.g. 'nhl_pbp_02_24.csv' for
    February 2024).

    The scrape is delegated to `scrape_month_playbyplay(year, month)` and the
    cleaning to `clean_pbp_data`.

    Parameters:
        month (int): The month number (1-12).
        year (int): The four-digit year (e.g., 2023).
        directory (str): The directory where the file will be saved. Defaults to 'data/'.
            An empty string saves into the current working directory.

    Raises:
        ValueError: If month is outside 1-12.
    """
    # Reject an impossible month up front, before the scrape is started
    if not 1 <= month <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month!r}")

    # Scrape the play-by-play data for the specified month and year
    df = scrape_month_playbyplay(year, month)

    # Clean the scraped data
    df_cleaned = clean_pbp_data(df)

    # Format the filename with leading zeros for month and last two digits of year
    filename = f'nhl_pbp_{month:02d}_{str(year)[-2:]}.csv'
    file_path = os.path.join(directory, filename)

    # Ensure the target directory exists; os.makedirs('') would raise, so skip it for ''
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Save the cleaned DataFrame to a CSV file
    df_cleaned.to_csv(file_path, index=False)

    print(f"Play-by-play data for {month}/{year} saved to {file_path}")

In [20]:
# With `scrape_month_playbyplay` imported and `clean_pbp_data` defined above,
# uncomment the calls below to scrape and save each month from October 2023 through May 2024:

# save_pbp_month(month=10, year=2023)
# save_pbp_month(month=11, year=2023)
# save_pbp_month(month=12, year=2023)
# save_pbp_month(month=1, year=2024)
# save_pbp_month(month=2, year=2024)
# save_pbp_month(month=3, year=2024)
# save_pbp_month(month=4, year=2024)
# save_pbp_month(month=5, year=2024)

Scraping play-by-play data from 2024-02-01 to 2024-02-29
Play-by-play data for 2/2024 saved to data/nhl_pbp_02_24.csv
Scraping play-by-play data from 2024-03-01 to 2024-03-31
Play-by-play data for 3/2024 saved to data/nhl_pbp_03_24.csv
Scraping play-by-play data from 2024-04-01 to 2024-04-30
Play-by-play data for 4/2024 saved to data/nhl_pbp_04_24.csv
Scraping play-by-play data from 2024-05-01 to 2024-05-31
Play-by-play data for 5/2024 saved to data/nhl_pbp_05_24.csv


In [None]:


def load_pbp_files(directory='data', prefix='nhl_pbp_'):
    """
    Loads all play-by-play (PBP) CSV files in the specified directory that start with the given prefix.

    Files are read in sorted path order so the row order of the combined frame
    is the same on every run (glob itself returns matches in arbitrary order).
    The sort is lexicographic on the path, not chronological.

    Loading is best-effort: a file that cannot be read is reported and skipped
    rather than aborting the whole load.

    Parameters:
        directory (str): The path to the directory containing PBP files. Defaults to 'data'.
        prefix (str): The prefix that PBP files start with. Defaults to 'nhl_pbp_'.

    Returns:
        pd.DataFrame: A concatenated DataFrame containing data from all matching PBP files,
        or an empty DataFrame when nothing matched or nothing could be loaded.
    """
    # Construct the file pattern
    pattern = os.path.join(directory, f'{prefix}*.csv')

    # Retrieve all file paths matching the pattern, in a deterministic order
    file_paths = sorted(glob.glob(pattern))

    if not file_paths:
        print("No files found matching the pattern.")
        return pd.DataFrame()

    # List to hold individual DataFrames
    df_list = []

    # Iterate over each file and read its contents
    for file in file_paths:
        try:
            df_list.append(pd.read_csv(file))
        except Exception as e:  # deliberately broad: report the file and keep going
            print(f"Error loading {file}: {e}")
        else:
            print(f"Successfully loaded: {file}")

    # Files matched but every read failed: say so instead of claiming none were found
    if not df_list:
        print("Files matched the pattern, but none could be loaded.")
        return pd.DataFrame()

    combined_df = pd.concat(df_list, ignore_index=True)
    print(f"Total files loaded and concatenated: {len(df_list)}")
    return combined_df

In [141]:
import math

# Define goal location coordinates to calculate distance for shot.
# NOTE(review): [89, 0] is presumably the net position in the NHL API's rink
# coordinates (x along the length of the ice, y across it) — confirm the units.
GOAL_LOCATION = [89, 0]

# Modify shot location coordinates
# NOTE(review): `df` is not assigned in any visible cell — presumably the
# cleaned play-by-play frame (e.g. from load_pbp_files()); confirm before a
# Restart & Run All.
# If x coord is negative, reflect y coord so side which shot came from is consistent.
# y must be flipped first: the next line removes the sign of x that this test relies on.
df['y'] = np.where(df['x'] < 0, df['y'] * -1, df['y'])
# Fold negative x values onto the positive side so every event is measured against the same goal
df['x'] = np.where(df['x'] < 0, np.absolute(df['x']), df['x'])

def calculate_shot_distance(x, y):
    """Return the Euclidean distance from (x, y) to GOAL_LOCATION, rounded to two decimals."""
    shot_point = (x, y)
    goal_point = tuple(GOAL_LOCATION)
    return round(math.dist(shot_point, goal_point), 2)

# Calculate shot distance for shots on goal.
# Rows whose 'desc' is anything other than 'shot-on-goal' get None, which shows
# up as a blank in the preview below (missed and blocked shots are not measured).
df['shotDist'] = df.apply(lambda row: calculate_shot_distance(row['x'], row['y']) 
                          if row['desc'] == 'shot-on-goal' else None, axis=1)

# Display the first few rows to verify the new column
df[['desc', 'x', 'y', 'shotDist']].head(10)


Unnamed: 0,desc,x,y,shotDist
0,period-start,,,
1,faceoff,0.0,0.0,
2,hit,93.0,28.0,
3,hit,67.0,-40.0,
4,shot-on-goal,61.0,-27.0,38.9
5,missed-shot,61.0,7.0,
6,shot-on-goal,79.0,-30.0,31.62
7,missed-shot,83.0,9.0,
8,hit,91.0,-26.0,
9,blocked-shot,73.0,-9.0,
