In [124]:
import requests
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import math
from datetime import datetime, timedelta
from time import strftime
import os

In [125]:
# Base URL of the NHL web API; endpoint paths are appended to it by the fetch helpers.
API_URL = 'https://api-web.nhle.com/v1'
# NOTE(review): this module-level list is not read by any cell visible here;
# get_livedata_from_game builds and returns its own local `all_plays`.
all_plays = []

In [126]:
def get_matchup_games(start_date, end_date):
    """Fetch one schedule week and keep the games dated on or before ``end_date``.

    Parameters:
        start_date: Date appended (via ``str()``) to the ``/schedule/`` endpoint,
            e.g. ``'2024-10-08'``.
        end_date (str): Inclusive cut-off date in ``'YYYY-MM-DD'`` format.

    Returns:
        dict: ``{'next_start_date': <the response's 'nextStartDate'>,
        'game_ids': {'id': [...], 'date': [...]}}``. The ``id`` and ``date``
        lists are parallel: one entry per retained game.

    Raises:
        ValueError: If ``end_date`` or a schedule day's date is not ``'YYYY-MM-DD'``.
        requests.HTTPError: If the API responds with an error status.
        KeyError: If the response lacks ``'nextStartDate'`` or ``'gameWeek'``.
    """
    # Validate the cut-off before touching the network so a malformed date fails fast.
    end_day = datetime.strptime(end_date, '%Y-%m-%d').date()

    # requests has no default timeout; without one a stalled connection hangs the notebook.
    r = requests.get(url=API_URL + '/schedule/' + str(start_date), timeout=30)
    # Surface HTTP errors here instead of as a confusing JSON/KeyError further down.
    r.raise_for_status()
    data = r.json()

    matchup_games = {
        'next_start_date': data['nextStartDate'],
        'game_ids': {'id': [], 'date': []},
    }

    for day in data['gameWeek']:
        games = day['games']
        if not games:
            continue

        # Use the schedule day's own date rather than a date derived from each
        # game's startTimeUTC: per the original author's note, the UTC-derived
        # date caused problems for the Sweden games.
        game_date = day['date']

        # The date is shared by every game of the day, so parse and compare it once.
        if datetime.strptime(game_date, '%Y-%m-%d').date() > end_day:
            continue

        for game in games:
            matchup_games['game_ids']['id'].append(game['id'])
            matchup_games['game_ids']['date'].append(game_date)

    return matchup_games

In [127]:
def retrieve_schedule(start_date_str, end_date_str):
    """Collect game ids and dates from ``start_date_str`` through ``end_date_str``.

    The schedule is read one week at a time through ``get_matchup_games``. As long
    as the week's ``next_start_date`` is on or before the end date, the following
    week is fetched and its games are appended.

    Parameters:
        start_date_str (str): First schedule date to request (``'YYYY-MM-DD'``).
        end_date_str (str): Inclusive end date (``'YYYY-MM-DD'``).

    Returns:
        dict: ``{'game_ids': [...], 'game_dates': [...]}`` with parallel lists.
    """
    week = get_matchup_games(start_date_str, end_date_str)
    collected = {
        'game_ids': list(week['game_ids']['id']),
        'game_dates': list(week['game_ids']['date']),
    }

    cutoff = datetime.strptime(end_date_str, '%Y-%m-%d')

    # Keep paging while the next week still starts on or before the cut-off.
    while datetime.strptime(week['next_start_date'], '%Y-%m-%d') <= cutoff:
        week = get_matchup_games(week['next_start_date'], end_date_str)

        for gid, gdate in zip(week['game_ids']['id'], week['game_ids']['date']):
            # Stop reading this week at the first game dated after the cut-off.
            # (get_matchup_games applies the same cut-off, so this is a second guard.)
            if datetime.strptime(gdate, '%Y-%m-%d').date() > cutoff.date():
                break
            collected['game_ids'].append(gid)
            collected['game_dates'].append(gdate)

    return collected

In [128]:
# Build "yesterday" as a 'YYYY-MM-DD' string; it is used below as the inclusive
# end date of the schedule pull.
# datetime.now() returns naive local time, and 1 day plus 6 hours is subtracted from it.
# NOTE(review): the author labels the extra 6 hours a UTC offset — presumably to keep
# late-finishing games out of the window until they are complete; confirm the intent
# and whether it holds for the timezone this notebook runs in.
today_datetime= datetime.now()
yesterday_datetime = today_datetime - timedelta(days=1, hours=6) # UTC offset
yesterday = yesterday_datetime.strftime('%Y-%m-%d')
yesterday

'2024-10-10'

In [129]:
# Creating a dictionary to store the NHL team information
def get_team_info():
    """Fetch the NHL team list and index it by team id.

    Returns:
        dict: ``{team_id: {'fullName': ..., 'triCode': ...}}`` with one entry per
        team in the response's ``'data'`` list.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        KeyError: If the response or a team entry lacks an expected key.
    """
    # https://api.nhle.com/stats/rest/en/team
    # The original passed the content type through `params=`, which appends it to
    # the URL as a query string instead of sending a header. A GET has no body, so
    # the JSON preference is expressed with an Accept header.
    response = requests.get(
        "https://api.nhle.com/stats/rest/en/team",
        headers={"Accept": "application/json"},
        timeout=30,  # requests never times out by default
    )
    response.raise_for_status()
    data = response.json()

    return {
        team['id']: {
            "fullName": team["fullName"],
            "triCode": team["triCode"],
        }
        for team in data["data"]
    }

In [130]:
# Fetch the team-id lookup once and keep it for the rest of the notebook.
team_info = get_team_info()

In [131]:
# Inspect the lookup: team id -> {'fullName', 'triCode'}.
team_info

{11: {'fullName': 'Atlanta Thrashers', 'triCode': 'ATL'},
 34: {'fullName': 'Hartford Whalers', 'triCode': 'HFD'},
 32: {'fullName': 'Quebec Nordiques', 'triCode': 'QUE'},
 33: {'fullName': 'Winnipeg Jets (1979)', 'triCode': 'WIN'},
 35: {'fullName': 'Colorado Rockies', 'triCode': 'CLR'},
 36: {'fullName': 'Ottawa Senators (1917)', 'triCode': 'SEN'},
 37: {'fullName': 'Hamilton Tigers', 'triCode': 'HAM'},
 38: {'fullName': 'Pittsburgh Pirates', 'triCode': 'PIR'},
 39: {'fullName': 'Philadelphia Quakers', 'triCode': 'QUA'},
 40: {'fullName': 'Detroit Cougars', 'triCode': 'DCG'},
 41: {'fullName': 'Montreal Wanderers', 'triCode': 'MWN'},
 42: {'fullName': 'Quebec Bulldogs', 'triCode': 'QBD'},
 43: {'fullName': 'Montreal Maroons', 'triCode': 'MMR'},
 44: {'fullName': 'New York Americans', 'triCode': 'NYA'},
 45: {'fullName': 'St. Louis Eagles', 'triCode': 'SLE'},
 46: {'fullName': 'Oakland Seals', 'triCode': 'OAK'},
 47: {'fullName': 'Atlanta Flames', 'triCode': 'AFM'},
 48: {'fullName': 

In [132]:
# The pull starts on 2024-10-08, so games played before that date
# (per the author's note: the international games) are not included.
start_2024 = '2024-10-08'
so_far = retrieve_schedule(start_2024, yesterday)

In [133]:
# Inspect the collected schedule: parallel lists of game ids and game dates.
so_far

{'game_ids': [2024020003,
  2024020004,
  2024020005,
  2024020006,
  2024020007,
  2024020008,
  2024020009,
  2024020010,
  2024020011,
  2024020012,
  2024020013,
  2024020015,
  2024020016,
  2024020014,
  2024020017,
  2024020018,
  2024020019],
 'game_dates': ['2024-10-08',
  '2024-10-08',
  '2024-10-08',
  '2024-10-09',
  '2024-10-09',
  '2024-10-09',
  '2024-10-09',
  '2024-10-09',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10',
  '2024-10-10']}

In [134]:
def load_latest_psg_df(directory_path):
    """Load the most recently dated ``psg_<mmddyy>.csv`` file from a directory.

    Files are considered when their name starts with ``psg_`` and ends with
    ``.csv``. The date is the text between the first underscore and the first
    following dot, parsed as ``%m%d%y``. Candidates whose date part does not parse
    (for example ``psg_notes.csv``) are skipped instead of aborting the load.

    Parameters:
        directory_path: Directory to search.

    Returns:
        pd.DataFrame: Contents of the matching file with the latest date.

    Raises:
        ValueError: If the directory holds no file with a parsable date.
    """
    dated_files = []
    for file in os.listdir(directory_path):
        if not (file.startswith('psg_') and file.endswith('.csv')):
            continue

        # Date part of the filename (expected format 'psg_mmddyy.csv').
        date_str = file.split('_')[1].split('.')[0]
        try:
            file_date = datetime.strptime(date_str, '%m%d%y')
        except ValueError:
            # Not a dated export; ignore it so it cannot break the loader.
            continue
        dated_files.append((file, file_date))

    if not dated_files:
        # Explicit message instead of the bare "max() arg is an empty sequence".
        raise ValueError(f"No 'psg_<mmddyy>.csv' file found in {directory_path!r}")

    # Compare parsed dates, not filenames: mmddyy strings do not sort chronologically.
    latest_file = max(dated_files, key=lambda entry: entry[1])[0]

    return pd.read_csv(os.path.join(directory_path, latest_file))

In [135]:
# https://api-web.nhle.com/v1/gamecenter/2022030415/play-by-play
def get_livedata_from_game(game_list):
    """Download play-by-play data for each game and flatten it into records.

    Parameters:
        game_list (dict): Must contain ``'game_ids'``, an iterable of game ids
            (for example the dict returned by ``retrieve_schedule``).

    Returns:
        list[dict]: One dict per play, in the order fetched. Each record holds the
        game id as a string (``'gid'``), selected top-level play fields, the three
        ``periodDescriptor`` fields, and every entry of the play's ``details``
        dict under a ``details_<key>`` name. Records of different play types can
        therefore have different key sets.
    """
    # Top-level play fields copied verbatim after the period fields, in output order.
    copied_fields = ('timeInPeriod', 'timeRemaining', 'situationCode',
                     'homeTeamDefendingSide', 'typeCode', 'typeDescKey')

    all_plays = []
    for game in game_list['game_ids']:
        response = requests.get(
            f"{API_URL}/gamecenter/{game}/play-by-play",
            headers={"Content-Type": "application/json"},
            timeout=30,  # requests never times out by default; avoid hanging mid-loop
        )
        # NOTE(review): the HTTP status is not checked. A JSON response without a
        # 'plays' key contributes no rows for that game — confirm this best-effort
        # behaviour is intended.
        game_plays = response.json().get('plays', [])

        for play in game_plays:
            # Look the descriptor up once and reuse it for its three fields.
            period = play.get('periodDescriptor', {})
            play_record = {
                'gid': str(game),
                'eventId': play.get('eventId'),
                'sortOrder': play.get('sortOrder'),
                'period_number': period.get('number'),
                'period_type': period.get('periodType'),
                'maxRegulationPeriods': period.get('maxRegulationPeriods'),
            }
            play_record.update((field, play.get(field)) for field in copied_fields)

            # Flatten the nested details dict into prefixed top-level keys.
            play_record.update(
                (f'details_{key}', value)
                for key, value in play.get('details', {}).items()
            )

            all_plays.append(play_record)

    return all_plays

In [136]:
# Fetch play-by-play records for every game collected in `so_far`.
# For a quick single-game check, pass a hand-built dict instead, e.g.
#     get_livedata_from_game({'game_ids': [2024020019], 'game_dates': ['2024-10-10']})
# NOTE: despite its name, `psg_df` holds a list of per-play dicts at this point,
# not a DataFrame.
psg_df = get_livedata_from_game(so_far)


In [137]:
# One row per play; keys absent from a play's record become missing values in that row.
df = pd.DataFrame(psg_df)

In [138]:
def remove_details_prefix(df: pd.DataFrame) -> pd.DataFrame:
    """
    Renames columns by removing a leading 'details_' prefix.

    Only the prefix is removed: a later occurrence of 'details_' inside a column
    name is kept. Non-string column labels are left untouched. No de-duplication
    is performed, so a stripped name that matches an existing column results in
    two columns with the same label.

    Parameters:
        df (pd.DataFrame): The original DataFrame with columns to rename.

    Returns:
        pd.DataFrame: A new DataFrame with renamed columns; the input is not modified.
    """
    prefix = 'details_'

    def _strip_prefix(label):
        # Slice instead of str.replace, which would also delete later occurrences.
        if isinstance(label, str) and label.startswith(prefix):
            return label[len(prefix):]
        return label

    return df.rename(columns=_strip_prefix)

In [139]:
# Tidy the play-by-play frame: strip the 'details_' prefix, drop unused columns,
# shorten column names, and make the player-id columns integer typed.
#
# The drop uses errors='ignore' and nothing is mutated in place, so the cell can
# be re-run safely and still works when the fetched plays carried no clip fields
# (those columns only exist if some play's details contained them).
#
# NOTE(review): the displayed output shows a second `code` column (`code.1`) next
# to `descKey`/`duration`. This suggests a details key named `typeCode` collides
# with the top-level `typeCode` once the prefix is stripped, so the rename below
# yields two columns called 'code'. Verify, and disambiguate if downstream code
# selects df['code'].
unused_columns = ['maxRegulationPeriods', 'timeInPeriod', 'period_type', 'highlightClip',
                  'highlightClipFr', 'discreteClip', 'discreteClipFr']

short_names = {'period_number': 'period', 'timeRemaining': 'time',
               'homeTeamDefendingSide': 'homeSide', 'typeCode': 'code',
               'typeDescKey': 'desc', 'eventOwnerTeamId': 'team', 'losingPlayerId': 'loser',
               'winningPlayerId': 'winner', 'xCoord': 'x', 'yCoord': 'y',
               'zoneCode': 'zone', 'shootingPlayerId': 'shooter', 'goalieInNetId': 'goalie',
               }

df = (
    remove_details_prefix(df)
    .drop(columns=unused_columns, errors='ignore')
    .rename(columns=short_names)
)

# Convert player_id related columns to integers.
# Missing ids are filled with 0 first, so 0 acts as the "no player" marker.
player_id_columns = ['loser', 'winner', 'hittingPlayerId', 'hitteePlayerId', 'shooter', 'goalie',
                     'blockingPlayerId', 'playerId', 'scoringPlayerId', 'assist1PlayerId',
                     'assist2PlayerId', 'committedByPlayerId', 'drawnByPlayerId', 'servedByPlayerId']

for col in player_id_columns:
    if col in df.columns:
        df[col] = df[col].fillna(0).astype(int)

df


Unnamed: 0,gid,eventId,sortOrder,period,time,situationCode,homeSide,code,desc,team,loser,winner,x,y,zone,hittingPlayerId,hitteePlayerId,shotType,shooter,goalie,awaySOG,homeSOG,reason,blockingPlayerId,playerId,secondaryReason,scoringPlayerId,scoringPlayerTotal,assist1PlayerId,assist1PlayerTotal,awayScore,homeScore,assist2PlayerId,assist2PlayerTotal,code.1,descKey,duration,committedByPlayerId,drawnByPlayerId,servedByPlayerId
0,2024020003,9,8,1,20:00,1551,left,520,period-start,,0,0,,,,0,0,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
1,2024020003,8,11,1,20:00,1551,left,502,faceoff,19.0,8482665,8480023,0.0,0.0,N,0,0,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
2,2024020003,75,12,1,19:52,1551,left,503,hit,55.0,0,0,93.0,28.0,D,8477955,8476892,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
3,2024020003,78,13,1,19:50,1551,left,503,hit,19.0,0,0,67.0,-40.0,N,8482077,8474586,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
4,2024020003,61,22,1,19:22,1551,left,506,shot-on-goal,19.0,0,0,-61.0,27.0,O,0,0,wrist,8477402,8475831,1.0,0.0,,0,0,,0,,0,,,,0,,,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5511,2024020019,524,831,4,05:00,1331,right,520,period-start,,0,0,,,,0,0,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
5512,2024020019,523,833,4,05:00,1331,right,502,faceoff,19.0,8484801,8480023,0.0,0.0,N,0,0,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0
5513,2024020019,1083,836,4,04:15,1331,right,505,goal,19.0,0,0,64.0,20.0,O,0,0,wrist,0,8478406,,,,0,0,,8475170,1.0,8476892,1.0,5.0,4.0,8480023,3.0,,,,0,0,0
5514,2024020019,569,837,4,04:15,1331,right,521,period-end,,0,0,,,,0,0,,0,0,,,,0,0,,0,,0,,,,0,,,,,0,0,0


In [123]:
# List the distinct event types present in the play-by-play data.
df['desc'].unique()

array(['period-start', 'faceoff', 'hit', 'shot-on-goal', 'missed-shot',
       'blocked-shot', 'stoppage', 'giveaway', 'takeaway', 'period-end',
       'goal', 'penalty', 'delayed-penalty', 'game-end'], dtype=object)

In [141]:
# (`math` is already imported in the notebook's import cell, so it is not
# re-imported here.)

# Define goal location coordinates to calculate distance for shot
GOAL_LOCATION = [89, 0]

# Modify shot location coordinates:
# fold every event with a negative x onto the positive-x side. The y coordinate is
# reflected for those rows too, so the side the shot came from stays consistent.
# Rows with a missing x are left unchanged.
# NOTE(review): this treats any negative x as "aimed at the negative-x net". An
# event recorded on the far side of centre from the net being attacked would be
# folded to look closer to the goal — confirm whether `homeSide`/`zone` should be
# used instead.
on_negative_side = df['x'] < 0  # computed once, before either column is rewritten
df['y'] = np.where(on_negative_side, df['y'] * -1, df['y'])
df['x'] = np.where(on_negative_side, np.absolute(df['x']), df['x'])

def calculate_shot_distance(x, y):
    """Return the straight-line distance from (x, y) to GOAL_LOCATION, rounded to 2 decimals."""
    goal_x, goal_y = GOAL_LOCATION
    return round(math.hypot(x - goal_x, y - goal_y), 2)

# shotDist is filled only for rows whose desc is 'shot-on-goal';
# every other event type gets None.
def _distance_if_shot_on_goal(row):
    if row['desc'] != 'shot-on-goal':
        return None
    return calculate_shot_distance(row['x'], row['y'])

df['shotDist'] = df.apply(_distance_if_shot_on_goal, axis=1)

# Preview the first rows to check the new column against the coordinates.
df[['desc', 'x', 'y', 'shotDist']].head(10)


Unnamed: 0,desc,x,y,shotDist
0,period-start,,,
1,faceoff,0.0,0.0,
2,hit,93.0,28.0,
3,hit,67.0,-40.0,
4,shot-on-goal,61.0,-27.0,38.9
5,missed-shot,61.0,7.0,
6,shot-on-goal,79.0,-30.0,31.62
7,missed-shot,83.0,9.0,
8,hit,91.0,-26.0,
9,blocked-shot,73.0,-9.0,
