In [43]:
import requests
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import math
from datetime import datetime, timedelta
from time import strftime
import os

In [44]:
API_URL = 'https://api-web.nhle.com/v1'
all_plays = []

In [45]:
def get_matchup_games(start_date, end_date):
    r = requests.get(url=API_URL + '/schedule/' + str(start_date))
    data = r.json()

    end_date_dt = datetime.strptime(end_date, '%Y-%m-%d')
    matchup_games = {'next_start_date': '', 'game_ids': {'id': [], 'date': []}}

    matchup_games['next_start_date'] = data['nextStartDate']

    for day in data['gameWeek']:
        for game in day['games']:
            # game_date_timestamp = game['startTimeUTC']  # Read the game's start time
            # game_date = datetime.strptime(game_date_timestamp, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
            game_date = day['date']
            # Strip the time and retain only the date this causes problems for the sweden games

            if datetime.strptime(game_date, '%Y-%m-%d').date() <= end_date_dt.date():
                matchup_games['game_ids']['id'].append(game['id'])
                matchup_games['game_ids']['date'].append(game_date)

    return matchup_games

In [46]:
def retrieve_schedule(start_date_str, end_date_str):
    all_game_ids = {'game_ids': [], 'game_dates': []}
    temp_week = get_matchup_games(start_date_str, end_date_str)

    all_game_ids['game_ids'].extend(temp_week['game_ids']['id'])
    all_game_ids['game_dates'].extend(temp_week['game_ids']['date'])

    end_date_dt = datetime.strptime(end_date_str, '%Y-%m-%d')

    while True:
        temp_next_start = datetime.strptime(temp_week['next_start_date'], '%Y-%m-%d')

        if temp_next_start <= end_date_dt:
            temp_start = temp_week['next_start_date']
            temp_week = get_matchup_games(temp_start, end_date_str)

            game_ids = temp_week['game_ids']['id']
            game_dates = temp_week['game_ids']['date']

            for game_id, game_date in zip(game_ids, game_dates):
                game_date_dt = datetime.strptime(game_date, '%Y-%m-%d').date()
                if game_date_dt <= end_date_dt.date():
                    all_game_ids['game_ids'].append(game_id)
                    all_game_ids['game_dates'].append(game_date)
                else:
                    # Instead of breaking here, move to the next week
                    break
        else:
            break

    return all_game_ids

In [47]:
today_datetime= datetime.now()
yesterday_datetime = today_datetime - timedelta(days=1, hours=6) # UTC offset
yesterday = yesterday_datetime.strftime('%Y-%m-%d')
yesterday

'2024-10-11'

In [48]:
# Creating a dictionary to store the NHL team information
def get_team_info():
    nhl_teams = {}
    # https://api.nhle.com/stats/rest/en/team
    response = requests.get("https://api.nhle.com/stats/rest/en/team", params={"Content-Type": "application/json"})
    data = response.json()

    for team in data["data"]:
        team_id = team['id']
        team_info = {
            "fullName": team["fullName"],
            "triCode": team["triCode"]
        }
        nhl_teams[team_id] = team_info
    return nhl_teams

In [49]:
team_info = get_team_info()

In [50]:
team_info

{11: {'fullName': 'Atlanta Thrashers', 'triCode': 'ATL'},
 34: {'fullName': 'Hartford Whalers', 'triCode': 'HFD'},
 32: {'fullName': 'Quebec Nordiques', 'triCode': 'QUE'},
 33: {'fullName': 'Winnipeg Jets (1979)', 'triCode': 'WIN'},
 35: {'fullName': 'Colorado Rockies', 'triCode': 'CLR'},
 36: {'fullName': 'Ottawa Senators (1917)', 'triCode': 'SEN'},
 37: {'fullName': 'Hamilton Tigers', 'triCode': 'HAM'},
 38: {'fullName': 'Pittsburgh Pirates', 'triCode': 'PIR'},
 39: {'fullName': 'Philadelphia Quakers', 'triCode': 'QUA'},
 40: {'fullName': 'Detroit Cougars', 'triCode': 'DCG'},
 41: {'fullName': 'Montreal Wanderers', 'triCode': 'MWN'},
 42: {'fullName': 'Quebec Bulldogs', 'triCode': 'QBD'},
 43: {'fullName': 'Montreal Maroons', 'triCode': 'MMR'},
 44: {'fullName': 'New York Americans', 'triCode': 'NYA'},
 45: {'fullName': 'St. Louis Eagles', 'triCode': 'SLE'},
 46: {'fullName': 'Oakland Seals', 'triCode': 'OAK'},
 47: {'fullName': 'Atlanta Flames', 'triCode': 'AFM'},
 48: {'fullName': 

In [51]:
# doesn't include international games
start_2024 = '2024-10-08'
so_far = (retrieve_schedule(start_2024, yesterday))

In [52]:
so_far

{'game_ids': [2024020003,
  2024020004,
  2024020005,
  2024020006,
  2024020007,
  2024020008,
  2024020009,
  2024020010,
  2024020011,
  2024020012,
  2024020013,
  2024020015,
  2024020016,
  2024020014,
  2024020017,
  2024020018,
  2024020019,
  2024020020,
  2024020021,
  2024020022,
  2024020023],
 'game_dates': ['2024-10-08',
  '2024-10-08',
  '2024-10-08',
  '2024-10-09',
  '2024-10-09',
  '2024-10-09',
  '2024-10-09',
  '2024-10-09',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-11',
  '2024-10-11',
  '2024-10-11',
  '2024-10-11']}

In [53]:
def load_latest_psg_df(directory_path):
    # List all files in the specified directory
    files = os.listdir(directory_path)

    # Filter for files that match the 'psg' pattern with a date suffix
    psg_files = [f for f in files if f.startswith('psg_') and f.endswith('.csv')]

    # Extract the date from the filenames and convert to datetime objects
    dates = []
    for file in psg_files:
        # Extracting date part from the filename (assuming format 'psg_mmddyy.csv')
        date_str = file.split('_')[1].split('.')[0]
        # Parsing the date string to a datetime object
        date = datetime.strptime(date_str, '%m%d%y')
        dates.append((file, date))

    # Find the file with the latest date
    latest_file = max(dates, key=lambda x: x[1])[0]

    # Read the CSV file into a DataFrame
    psg_df = pd.read_csv(os.path.join(directory_path, latest_file))
    return psg_df

In [66]:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def get_livedata_from_game(game_list):
    """
    Fetches live play-by-play data for a list of games with retry mechanism.

    Parameters:
        game_list (dict): A dictionary containing game IDs and dates.

    Returns:
        list: A list of play-by-play records.
    """
    all_plays = []
    
    # Set up a session with retry strategy
    session = requests.Session()
    retry = Retry(
        total=5,  # Total number of retries
        backoff_factor=1,  # Exponential backoff factor (e.g., 1, 2, 4, 8, 16 seconds)
        status_forcelist=[500, 502, 503, 504, 522, 524],  # HTTP status codes to retry
        method_whitelist=["GET"]  # Methods to retry
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    
    for game in game_list['game_ids']:
        try:
            response = session.get(
                f"{API_URL}/gamecenter/{game}/play-by-play",
                headers={"Content-Type": "application/json"},
                timeout=10  # Timeout after 10 seconds
            )
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            data = response.json()
            temp_game_plays = data.get('plays', [])

            for play in temp_game_plays:
                play_record = {
                    'gid': str(game),
                    'eventId': play.get('eventId'),
                    'sortOrder': play.get('sortOrder'),
                    'period_number': play.get('periodDescriptor', {}).get('number'),
                    'period_type': play.get('periodDescriptor', {}).get('periodType'),
                    'maxRegulationPeriods': play.get('periodDescriptor', {}).get('maxRegulationPeriods'),
                    'timeInPeriod': play.get('timeInPeriod'),
                    'timeRemaining': play.get('timeRemaining'),
                    'situationCode': play.get('situationCode'),
                    'homeTeamDefendingSide': play.get('homeTeamDefendingSide'),
                    'typeCode': play.get('typeCode'),
                    'typeDescKey': play.get('typeDescKey')
                }

                details = play.get('details', {})
                for key, value in details.items():
                    play_record[f'details_{key}'] = value

                all_plays.append(play_record)
        
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data for game {game}: {e}")
            # Optionally, log the error or store it for later analysis

    session.close()
    return all_plays

In [77]:
psg_df = get_livedata_from_game(
    {'game_ids':[2024020019],
     'game_dates':['2024-10-10']}
)
psg_df = get_livedata_from_game(so_far)

psg_df

  retry = Retry(


[{'gid': '2024020003',
  'eventId': 9,
  'sortOrder': 8,
  'period_number': 1,
  'period_type': 'REG',
  'maxRegulationPeriods': 3,
  'timeInPeriod': '00:00',
  'timeRemaining': '20:00',
  'situationCode': '1551',
  'homeTeamDefendingSide': 'left',
  'typeCode': 520,
  'typeDescKey': 'period-start'},
 {'gid': '2024020003',
  'eventId': 8,
  'sortOrder': 11,
  'period_number': 1,
  'period_type': 'REG',
  'maxRegulationPeriods': 3,
  'timeInPeriod': '00:00',
  'timeRemaining': '20:00',
  'situationCode': '1551',
  'homeTeamDefendingSide': 'left',
  'typeCode': 502,
  'typeDescKey': 'faceoff',
  'details_eventOwnerTeamId': 19,
  'details_losingPlayerId': 8482665,
  'details_winningPlayerId': 8480023,
  'details_xCoord': 0,
  'details_yCoord': 0,
  'details_zoneCode': 'N'},
 {'gid': '2024020003',
  'eventId': 75,
  'sortOrder': 12,
  'period_number': 1,
  'period_type': 'REG',
  'maxRegulationPeriods': 3,
  'timeInPeriod': '00:08',
  'timeRemaining': '19:52',
  'situationCode': '1551',
  

In [69]:
def scrape_month_playbyplay(year: int, month: int) -> pd.DataFrame:
    """
    Scrapes all play-by-play data for the specified month.

    Parameters:
        year (int): The year of the month to scrape.
        month (int): The month to scrape (1-12).

    Returns:
        pd.DataFrame: A DataFrame containing all play-by-play data for the month.
    """
    # Define the start and end dates for the month
    start_date = datetime(year, month, 1)
    # Handle month wrap-around for December
    if month == 12:
        end_date = datetime(year + 1, 1, 1) - timedelta(days=1)
    else:
        end_date = datetime(year, month + 1, 1) - timedelta(days=1)
    
    start_date_str = start_date.strftime('%Y-%m-%d')
    end_date_str = end_date.strftime('%Y-%m-%d')
    
    print(f"Scraping play-by-play data from {start_date_str} to {end_date_str}")
    
    # Retrieve the schedule for the specified date range
    schedule = retrieve_schedule(start_date_str, end_date_str)
    
    # Fetch play-by-play data for all games in the schedule
    playbyplay_data = get_livedata_from_game(schedule)
    
    # Convert the play-by-play data to a DataFrame
    df_pbp = pd.DataFrame(playbyplay_data)
    
    return df_pbp

In [73]:
def clean_pbp_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Renames columns by removing the 'details_' prefix and performs other cleaning operations.

    Parameters:
        df (pd.DataFrame): The original DataFrame with columns to clean.

    Returns:
        pd.DataFrame: A DataFrame with cleaned columns.
    """
    # Rename columns by removing the 'details_' prefix
    df = df.rename(columns=lambda x: x.replace('details_', '') if x.startswith('details_') else x)

    # List of columns to drop if they exist
    columns_to_drop = ['maxRegulationPeriods', 'timeInPeriod', 'period_type', 'highlightClip',
                       'highlightClipFr', 'discreteClip', 'discreteClipFr']
    
    # Drop columns that exist in the DataFrame
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    # Rename columns
    column_rename_map = {
        'period_number': 'period', 'timeRemaining': 'time',
        'homeTeamDefendingSide': 'homeSide', 'typeCode': 'code',
        'typeDescKey': 'desc', 'eventOwnerTeamId': 'team', 'losingPlayerId': 'loser',
        'winningPlayerId': 'winner', 'xCoord': 'x', 'yCoord': 'y',
        'zoneCode': 'zone', 'shootingPlayerId': 'shooter', 'goalieInNetId': 'goalie'
    }
    df = df.rename(columns={k: v for k, v in column_rename_map.items() if k in df.columns})

    # Convert player_id related columns to integers
    player_id_columns = ['loser', 'winner', 'hittingPlayerId', 'hitteePlayerId', 'shooter', 'goalie',
                         'blockingPlayerId', 'playerId', 'scoringPlayerId', 'assist1PlayerId',
                         'assist2PlayerId', 'committedByPlayerId', 'drawnByPlayerId', 'servedByPlayerId']

    for col in player_id_columns:
        if col in df.columns:
            df[col] = df[col].fillna(0).astype(int)

    return df

In [70]:
df_october_2023 = scrape_month_playbyplay(2023, 10)


Scraping play-by-play data from 2023-10-01 to 2023-10-31


  retry = Retry(


In [71]:
df_october_2023

Unnamed: 0,gid,eventId,sortOrder,period_number,period_type,maxRegulationPeriods,timeInPeriod,timeRemaining,situationCode,homeTeamDefendingSide,typeCode,typeDescKey,details_scoringPlayerId,details_scoringPlayerTotal,details_assist1PlayerId,details_assist1PlayerTotal,details_assist2PlayerId,details_assist2PlayerTotal,details_eventOwnerTeamId,details_goalieInNetId,details_awayScore,details_homeScore,details_typeCode,details_descKey,details_duration,details_committedByPlayerId,details_losingPlayerId,details_winningPlayerId,details_xCoord,details_yCoord,details_zoneCode,details_playerId,details_shotType,details_shootingPlayerId,details_awaySOG,details_homeSOG,details_blockingPlayerId,details_reason,details_hittingPlayerId,details_hitteePlayerId,details_secondaryReason,details_drawnByPlayerId,details_servedByPlayerId,details_highlightClip,details_highlightClipFr
0,2023010065,9060,9060,1,REG,3,08:18,11:42,1551,,505,goal,8482113.0,1.0,8482713.0,1.0,8479367.0,1.0,13.0,8476341.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,
1,2023010065,9063,9063,1,REG,3,10:35,09:25,1551,,509,penalty,,,,,,,13.0,,,,MIN,slashing,2.0,8482113.0,,,,,,,,,,,,,,,,,,,
2,2023010065,9064,9064,1,REG,3,14:22,05:38,1551,,505,goal,8484314.0,1.0,8480208.0,1.0,8482162.0,1.0,9.0,8481519.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,
3,2023010065,9067,9067,2,REG,3,00:11,19:49,1551,,505,goal,8482162.0,1.0,8480208.0,2.0,8484314.0,1.0,9.0,8481519.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,
4,2023010065,9070,9070,2,REG,3,02:56,17:04,1551,,509,penalty,,,,,,,13.0,,,,MIN,interference,2.0,8480228.0,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55901,2023020140,398,797,3,REG,3,17:45,02:15,1551,right,503,hit,,,,,,,23.0,,,,,,,,,,-96.0,-15.0,O,,,,,,,,8476858.0,8478468.0,,,,,
55902,2023020140,1108,799,3,REG,3,18:00,02:00,1551,right,506,shot-on-goal,,,,,,,18.0,8477967.0,,,,,,,,,49.0,29.0,O,,wrist,8480748.0,27.0,22.0,,,,,,,,,
55903,2023020140,399,801,3,REG,3,18:27,01:33,1551,right,503,hit,,,,,,,18.0,,,,,,,,,,-51.0,-39.0,D,,,,,,,,8480748.0,8478057.0,,,,,
55904,2023020140,400,820,3,REG,3,20:00,00:00,1551,right,521,period-end,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [75]:
df_october_2023 = clean_pbp_data(df_october_2023)
df_october_2023

Unnamed: 0,gid,eventId,sortOrder,period,time,situationCode,homeSide,code,desc,scoringPlayerId,scoringPlayerTotal,assist1PlayerId,assist1PlayerTotal,assist2PlayerId,assist2PlayerTotal,team,goalie,awayScore,homeScore,code.1,descKey,duration,committedByPlayerId,loser,winner,x,y,zone,playerId,shotType,shooter,awaySOG,homeSOG,blockingPlayerId,reason,hittingPlayerId,hitteePlayerId,secondaryReason,drawnByPlayerId,servedByPlayerId
0,2023010065,9060,9060,1,11:42,1551,,505,goal,8482113,1.0,8482713,1.0,8479367,1.0,13.0,8476341,1.0,0.0,,,,0,0,0,,,,0,,0,,,0,,0,0,,0,0
1,2023010065,9063,9063,1,09:25,1551,,509,penalty,0,,0,,0,,13.0,0,,,MIN,slashing,2.0,8482113,0,0,,,,0,,0,,,0,,0,0,,0,0
2,2023010065,9064,9064,1,05:38,1551,,505,goal,8484314,1.0,8480208,1.0,8482162,1.0,9.0,8481519,1.0,1.0,,,,0,0,0,,,,0,,0,,,0,,0,0,,0,0
3,2023010065,9067,9067,2,19:49,1551,,505,goal,8482162,1.0,8480208,2.0,8484314,1.0,9.0,8481519,1.0,2.0,,,,0,0,0,,,,0,,0,,,0,,0,0,,0,0
4,2023010065,9070,9070,2,17:04,1551,,509,penalty,0,,0,,0,,13.0,0,,,MIN,interference,2.0,8480228,0,0,,,,0,,0,,,0,,0,0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55901,2023020140,398,797,3,02:15,1551,right,503,hit,0,,0,,0,,23.0,0,,,,,,0,0,0,-96.0,-15.0,O,0,,0,,,0,,8476858,8478468,,0,0
55902,2023020140,1108,799,3,02:00,1551,right,506,shot-on-goal,0,,0,,0,,18.0,8477967,,,,,,0,0,0,49.0,29.0,O,0,wrist,8480748,27.0,22.0,0,,0,0,,0,0
55903,2023020140,399,801,3,01:33,1551,right,503,hit,0,,0,,0,,18.0,0,,,,,,0,0,0,-51.0,-39.0,D,0,,0,,,0,,8480748,8478057,,0,0
55904,2023020140,400,820,3,00:00,1551,right,521,period-end,0,,0,,0,,,0,,,,,,0,0,0,,,,0,,0,,,0,,0,0,,0,0


In [141]:
import math

# Define goal location coordinates to calculate distance for shot
GOAL_LOCATION = [89, 0]

# Modify shot location coordinates
# If x coord is negative, reflect y coord so side which shot came from is consistent
df['y'] = np.where(df['x'] < 0, df['y'] * -1, df['y'])
df['x'] = np.where(df['x'] < 0, np.absolute(df['x']), df['x'])

def calculate_shot_distance(x, y):
    dist = math.dist([x, y], GOAL_LOCATION)
    return round(dist, 2)

# Calculate shot distance for shots on goal
df['shotDist'] = df.apply(lambda row: calculate_shot_distance(row['x'], row['y']) 
                          if row['desc'] == 'shot-on-goal' else None, axis=1)

# Display the first few rows to verify the new column
df[['desc', 'x', 'y', 'shotDist']].head(10)


Unnamed: 0,desc,x,y,shotDist
0,period-start,,,
1,faceoff,0.0,0.0,
2,hit,93.0,28.0,
3,hit,67.0,-40.0,
4,shot-on-goal,61.0,-27.0,38.9
5,missed-shot,61.0,7.0,
6,shot-on-goal,79.0,-30.0,31.62
7,missed-shot,83.0,9.0,
8,hit,91.0,-26.0,
9,blocked-shot,73.0,-9.0,
