In [1]:
# Last updated May 18 2023 by Carolyn Dempsey

# The purpose of this code is to ingest NHL game data and transform it into a table that can be used for analysis.

# The hypothesis the data is intended for is that NHL referees are not biased against any team.

# Not included in this code is the way to analyze the data to understand the bias. 

In [2]:
import pandas as pd
import json
import requests
import numpy as np

In [3]:
#fetch game data json based on game id input
def fetch_game_data(gameId):
    prefix = "https://statsapi.web.nhl.com/api/v1/game/"
    suffix = "/feed/live"
    url = "{}{}{}".format(prefix, gameId, suffix)
    r= requests.get(url)
    data = r.json()
    plays_info = data["liveData"]["plays"]
    player_info = data["gameData"]["players"]
    officials_info = data["liveData"]["boxscore"]["officials"]
    
    return plays_info, player_info, officials_info
    


In [4]:
# create a player table that's used to match up info in the penalty table
def player_info_dataframe(player_info):
    #get player info into a dataframe
    players_info_df = pd.DataFrame(player_info)

    #it loads backwards, so flip the index
    players_info_df = players_info_df.swapaxes("index", "columns")

    #it loads the player id as an index, so this is resetting to help with joining later
    players_info_df.reset_index(inplace = True)

    #renaming column names for clarity
    players_info_df.rename(columns={"id": "playerId"}, inplace=True)
    players_info_df.rename(columns={"index": "playerIndex"}, inplace=True)

    #read the json
    current_team_df = pd.json_normalize(players_info_df.currentTeam)
    primary_position_df = pd.json_normalize(players_info_df.primaryPosition)

    #create a new dataframe combining player info, current team, and position
    player_table_df = pd.concat([players_info_df, current_team_df, primary_position_df], axis=1)

    #remove unncessary columns
    player_table_df = player_table_df.drop(columns=["playerIndex", "link", "firstName", "lastName", "primaryNumber", "birthDate", "currentAge", "birthCity", "birthStateProvince", "birthCountry", "nationality", "height", "weight", "active", "alternateCaptain", "captain", "rookie", "shootsCatches", "rosterStatus", "currentTeam", "primaryPosition", "id", "name", "code"])
    
    return player_table_df

In [5]:
# get official info that's added to the table
# TO DO: Does the order of the officials matter in the data?
# TO DO: I've assumed there are two refs and two linesman but I think I saw data where there was a third linesman

def officials_info_df(officials_info):
    # get officials into a dataframe 
    officials_info_df = pd.DataFrame(officials_info)

    # translate official's details from json and get it into a dataframe
    officials_details_df = pd.json_normalize(officials_info_df.official)

    #renaming column names for clarity
    officials_details_df.rename(columns={"id": "officialId"}, inplace=True)

    #combine the officials dataframes
    officials_table_df = pd.concat([officials_info_df, officials_details_df], axis=1)
    
    return officials_table_df

In [6]:
# get all plays formatted in a way that can be understood and cut down to penalties only. Filtering to penalties is done later.

def all_plays_info_df(plays_info_raw):
    # Puts allplays data into a data frame 
    all_plays_df = pd.DataFrame(plays_info_raw["allPlays"])

    # translates columns that are in json to dataframes
    all_plays_frames = [pd.json_normalize(all_plays_df.result), pd.json_normalize(all_plays_df.about), all_plays_df.players]

    # combines json columns so they are readable now
    all_plays_table = pd.concat(all_plays_frames, axis=1)
    
    all_plays_table = all_plays_table.drop(columns=["eventCode", "event", "gameWinningGoal", "emptyNet", "strength.code", "strength.name", "eventIdx", "eventId"])
    
    return all_plays_table

In [7]:
# take all the data we have and make a table

def game_penalty_data(gameId):
    plays_info_raw, player_info_raw, officials_info_raw = fetch_game_data(gameId)

    player_table_df = player_info_dataframe(player_info_raw)

    officials_table_df = officials_info_df(officials_info_raw)

    all_plays_table = all_plays_info_df(plays_info_raw)

    penalty_plays = all_plays_table.loc[all_plays_table['eventTypeId'] == "PENALTY"]
    penalty_plays.insert(0, "linesman2_id", officials_table_df.officialId[3])
    penalty_plays.insert(0, "linesman2", officials_table_df.fullName[3])
    penalty_plays.insert(0, "linesman1_id", officials_table_df.officialId[2])
    penalty_plays.insert(0, "linesman1", officials_table_df.fullName[2])
    penalty_plays.insert(0, "referee2_id", officials_table_df.officialId[1])
    penalty_plays.insert(0, "referee2", officials_table_df.fullName[1])
    penalty_plays.insert(0, "referee1_id", officials_table_df.officialId[0])
    penalty_plays.insert(0, "referee1", officials_table_df.fullName[0])
    penalty_plays.insert(0, "gameId", gameId)

    penalty_plays.reset_index(inplace = True)

    penalty_players_df = pd.DataFrame(pd.json_normalize(penalty_plays.players))

    penalty_players_penalty_on = pd.DataFrame(pd.json_normalize(penalty_players_df[0]))
    penalty_players_drew_by = pd.DataFrame(pd.json_normalize(penalty_players_df[1]))

    penalty_players_drew_by.rename(columns={"player.fullName": "fullName"}, inplace=True)
    penalty_players_drew_by.rename(columns={"player.id": "playerId"}, inplace=True)

    penalty_players_penalty_on.rename(columns={"player.fullName": "fullName"}, inplace=True)
    penalty_players_penalty_on.rename(columns={"player.id": "playerId"}, inplace=True)

    penalties_full_table = pd.DataFrame(penalty_plays)

    penalties_full_table["PenaltyOn_Player"] = np.nan
    penalties_full_table["PenaltyOn_PlayerId"] = np.nan
    penalties_full_table["DrewBy_Player"] = np.nan
    penalties_full_table["DrewBy_PlayerId"] = np.nan
    penalties_full_table["PenaltyOn_Team"] = np.nan
    penalties_full_table["DrewBy_Team"] = np.nan

    i = 0
    while i < len(penalty_players_penalty_on):
        penalties_full_table.loc[i, "PenaltyOn_Player"] = penalty_players_penalty_on.loc[i, "fullName"]
        penalties_full_table.loc[i, "PenaltyOn_Player"] = penalty_players_penalty_on.loc[i, "playerId"]
        penalties_full_table.loc[i, "DrewBy_Player"] = penalty_players_drew_by.loc[i, "fullName"]
        penalties_full_table.loc[i, "DrewBy_PlayerId"] = penalty_players_drew_by.loc[i, "playerId"]


        player_drew_by_team = player_table_df.loc[player_table_df["fullName"] == penalties_full_table.DrewBy_Player[i]]
        drew_by_team = player_drew_by_team.triCode
        drew_by_team = drew_by_team.to_string(index=False)
        penalties_full_table.loc[i, "DrewBy_Team"] = drew_by_team

        player_penalty_on_team = player_table_df.loc[player_table_df["fullName"] == penalties_full_table.PenaltyOn_Player[i]]
        penalty_on_team = player_penalty_on_team.triCode
        penalty_on_team = penalty_on_team.to_string(index=False)
        penalties_full_table.loc[i, "PenaltyOn_Team"] = penalty_on_team

        i += 1

    penalties_full_table = penalties_full_table.drop(columns=["players"])
    
    return penalties_full_table



In [13]:
# generate season penalty data
first_game_of_season = 2022020001
last_game_of_season = 2022020002
gameIds = []
season_penalty_data = pd.DataFrame()


for i in range(first_game_of_season, last_game_of_season+1):
    current_game_penalties = game_penalty_data(i)
    season_penalty_data = pd.concat([season_penalty_data, current_game_penalties], axis=0)



In [14]:
season_penalty_data = season_penalty_data.drop(columns=["goals.home", "goals.away", "dateTime", "index", "eventTypeId", "periodType"])

season_penalty_data

Unnamed: 0,gameId,referee1,referee1_id,referee2,referee2_id,linesman1,linesman1_id,linesman2,linesman2_id,description,...,period,ordinalNum,periodTime,periodTimeRemaining,PenaltyOn_Player,PenaltyOn_PlayerId,DrewBy_Player,DrewBy_PlayerId,PenaltyOn_Team,DrewBy_Team
0,2022020001,Francis Charron,4600,Justin St. Pierre,2459,Derek Nansen,2451,Ryan Galloway,2452,Ryan Johansen Slashing against Noah Gregor,...,1,1st,06:30,13:30,8475793,,Noah Gregor,8479393.0,"Series([], )",SJS
1,2022020001,Francis Charron,4600,Justin St. Pierre,2459,Derek Nansen,2451,Ryan Galloway,2452,Roman Josi High-sticking against Steven Lorentz,...,1,1st,09:11,10:49,8474600,,Steven Lorentz,8478904.0,"Series([], )",SJS
2,2022020001,Francis Charron,4600,Justin St. Pierre,2459,Derek Nansen,2451,Ryan Galloway,2452,Jonah Gadjovich Fighting against Mark Borowiecki,...,2,2nd,02:07,17:53,8479981,,Mark Borowiecki,8474697.0,"Series([], )",NSH
3,2022020001,Francis Charron,4600,Justin St. Pierre,2459,Derek Nansen,2451,Ryan Galloway,2452,Mark Borowiecki Fighting against Jonah Gadjovich,...,2,2nd,02:07,17:53,8474697,,Jonah Gadjovich,8479981.0,"Series([], )",SJS
4,2022020001,Francis Charron,4600,Justin St. Pierre,2459,Derek Nansen,2451,Ryan Galloway,2452,Kiefer Sherwood Delaying Game,...,2,2nd,02:33,17:27,8480748,,,,"Series([], )","Series([], )"
5,2022020001,Francis Charron,4600,Justin St. Pierre,2459,Derek Nansen,2451,Ryan Galloway,2452,Matt Benning Delaying Game - Puck over glass,...,2,2nd,11:00,09:00,8476988,,,,"Series([], )","Series([], )"
6,2022020001,Francis Charron,4600,Justin St. Pierre,2459,Derek Nansen,2451,Ryan Galloway,2452,Luke Kunin Interference against Tanner Jeannot,...,3,3rd,02:53,17:07,8479316,,Tanner Jeannot,8479661.0,"Series([], )",NSH
7,2022020001,Francis Charron,4600,Justin St. Pierre,2459,Derek Nansen,2451,Ryan Galloway,2452,Mikael Granlund Cross-checking against Mario F...,...,3,3rd,02:57,17:03,8475798,,Mario Ferraro,8479983.0,"Series([], )",SJS
8,2022020001,Francis Charron,4600,Justin St. Pierre,2459,Derek Nansen,2451,Ryan Galloway,2452,Jonah Gadjovich Interference - Goalkeeper agai...,...,3,3rd,05:18,14:42,8479981,,Juuse Saros,8477424.0,"Series([], )",NSH
9,2022020001,Francis Charron,4600,Justin St. Pierre,2459,Derek Nansen,2451,Ryan Galloway,2452,Matt Nieto High-sticking against Mattias Ekholm,...,3,3rd,09:27,10:33,8476442,,Mattias Ekholm,8475218.0,"Series([], )",NSH


In [10]:
# season_penalty_data.to_csv('season_data.csv')