In [1]:
import json
import os
import traceback

import numpy as np
import pandas as pd
from tqdm import tqdm
# Notebook-wide pandas display settings: never truncate cell text, and allow
# up to 500 rows / 500 columns and a 1000-character line width when a frame
# is rendered.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
import os

# Two-digit game-type code found at characters 5-6 of a game id
# (see get_json_path, which reads str(game_id)[4:6]).
# For playoff games, within the game-specific number that follows the type
# code, the 2nd digit gives the playoff round, the 3rd digit the matchup and
# the 4th digit the game (out of 7).
game_type_map = dict(regular_season="02", playoffs="03")

# Seasons (identified by their starting year) handled by this notebook.
year_list = list(range(2016, 2021))


class Directory:
    """Locations of the data directory and of the pickle files kept in it."""

    # Root of the local data store, with a trailing slash because the other
    # paths are built by plain string concatenation. Modify the path accordingly.
    DATA_DIR = "../data/"
    ADV_VIZ_PKL_FILE = f"{DATA_DIR}major_dict_1px.p"
    ALL_SEASON_DATA_PKL_FILE = f"{DATA_DIR}all_season.pkl"
    # Bare file name (no directory): callers prepend DATA_DIR and a season folder.
    TIDY_DATA_PKL_FILENAME = "tidy_data.pkl"


class APIList:
    """URL templates for the NHL stats API endpoints."""

    # Schedule endpoint; the season identifier is appended after "season=".
    GET_ALL_MATCHES_FOR_A_GIVEN_SEASON = (
        "https://statsapi.web.nhl.com/api/v1/schedule?season="
    )
    # Live-feed endpoint; "{}" is a placeholder for the game id
    # (presumably filled with str.format by the caller).
    GET_ALL_DATA_FOR_A_GIVEN_MATCH = (
        "https://statsapi.web.nhl.com/api/v1/game/{}/feed/live/"
    )


class CustomRegex:
    """Regular expressions that tell game ids apart by their game-type code."""

    # NOTE(review): "\d{0,4}" also accepts zero digits, so each pattern matches
    # a bare "02" / "03" anywhere in a string; presumably "\d{4}" on both sides
    # was intended -- confirm before relying on these for validation.

    # 02 for regular season
    REGULAR_GAME_ID = r"\d{0,4}02\d{0,4}"
    # 03 for playoffs
    PLAYOFFS_ID = r"\d{0,4}03\d{0,4}"


# Values of play["result"]["event"] that are kept when filtering play-by-play data.
TYPES_OF_SHOTS = ["Goal", "Shot"]




In [3]:
def get_json_path(game_id=0, season=0):
    """
    Build the local path(s) of the raw JSON dump(s) for a game or for a season.
    @param game_id: game id; when non-zero it takes precedence over ``season``.
        Its first four characters are used as the year folder and characters
        5-6 as the game-type code ("02" regular season, "03" playoffs).
    @param season: season year; only used when ``game_id`` is 0
    @return: for a game id, the tuple (path of the matching season file, "");
        for a season, the tuple (regular-season path, playoffs path);
        None when both arguments are 0
    """
    if game_id != 0:
        id_text = str(game_id)
        year, type_code = id_text[:4], id_text[4:6]
        # Unknown type codes fall back to an empty game-type label.
        game_type = {"02": "regular_season", "03": "playoffs"}.get(type_code, "")
        return Directory.DATA_DIR + year + os.path.sep + year + "_" + game_type + ".json", ""

    if season != 0:
        season_prefix = Directory.DATA_DIR + str(season) + os.path.sep + str(season)
        return season_prefix + "_regular_season.json", season_prefix + "_playoffs.json"

    return None


def flatten_player_data(player_list):
    """
    Encode a list of player entries as a single string of the form
    (Full Name)_(Player Type)|(Full Name)_(Player Type)|... with no trailing "|".
    @param player_list: list of player dicts, each holding ["player"]["fullName"]
        and ["playerType"]
    @return: the flattened string ("" for an empty list)
    """
    # The player id (entry["player"]["id"]) could be added to each chunk in
    # future if required.
    encoded_players = [
        "(" + entry["player"]["fullName"] + ")_" + "(" + entry["playerType"] + ")"
        for entry in player_list
    ]
    return "|".join(encoded_players)


def get_shooter_goalie(player_list):
    """
    Extract the names of the shooter and of the goalie from an event's players.

    Per the public NHL stats API documentation, the player who shot is tagged
    "Shooter" on shot events and "Scorer" on goal events (TODO confirm against
    the stored data); both tags are accepted so that goals do not end up with
    an empty shooter.
    @param player_list: list of player dicts, each holding ["playerType"] and
        ["player"]["fullName"]
    @return: tuple (shooter, goalie); a name is "" when no matching player is
        listed. If several players share a role, the last one wins.
    """
    shooter = ""
    goalie = ""
    for player in player_list:
        player_type = player["playerType"]
        if player_type in ("Shooter", "Scorer"):
            shooter = player["player"]["fullName"]
        elif player_type == "Goalie":
            goalie = player["player"]["fullName"]
    return shooter, goalie


def get_home_away_team(game_meta):
    """
    Collect the names and abbreviations of the home and away teams.
    @param game_meta: game metadata; read at ["gameData"]["teams"]
    @return: dictionary with the keys "home", "home_abv", "away" and "away_abv"
    """
    teams = game_meta["gameData"]["teams"]
    team_info = {}
    for side in ("home", "away"):
        team_info[side] = teams[side]["name"]
        team_info[side + "_abv"] = teams[side]["abbreviation"]
    return team_info


def get_side(game_meta):
    """
    Look up which rink side each team was on in every period.
    @param game_meta: game metadata; read at ["liveData"]["linescore"]["periods"]
    @return: dictionary keyed by 1-based period number, each value being
        {"home": side, "away": side}. Both sides are "Side Not Available" when
        the home entry of a period carries no "rinkSide". Empty when the game
        lists no periods.
    """
    missing_side = "Side Not Available"
    sides_by_period = {}
    periods = game_meta["liveData"]["linescore"]["periods"]
    for period_number, period in enumerate(periods, start=1):
        home_entry = period["home"]
        if "rinkSide" in home_entry:
            sides = {"home": home_entry["rinkSide"], "away": period["away"]["rinkSide"]}
        else:
            sides = {"home": missing_side, "away": missing_side}
        sides_by_period[period_number] = sides
    return sides_by_period


def get_coordinates(coordinates_data):
    """
    Return an event's coordinates as a tuple, or None if either one is missing.
    Args:
        coordinates_data: coordinate dict, expected to hold "x" and "y"
    Returns: (x, y) as a tuple when both keys are present, otherwise None
    """
    has_both = "x" in coordinates_data and "y" in coordinates_data
    return (coordinates_data["x"], coordinates_data["y"]) if has_both else None


def last_data_parsing(data):
    """
    Summarise the play that preceded a shot/goal.
    @param data: raw play dict of the previous event (with "result", "about"
        and "coordinates" entries), or None when there is no previous event
    @return: dictionary with the keys "event_code", "event_type_id",
        "coordinates", "about_period" and "game_time"; None when ``data`` is
        None, which data_parsing turns into NaN "last event" fields
    """
    # The callers start each game with last_event = None, so a shot that is
    # the very first play must not raise here.
    if data is None:
        return None

    result_data = data["result"]
    about_data = data["about"]
    return {
        "event_code": result_data["eventCode"],
        "event_type_id": result_data["eventTypeId"],
        "coordinates": get_coordinates(data["coordinates"]),
        "about_period": about_data["period"],
        "game_time": about_data["periodTime"],
    }
    


def data_parsing(data, id, event_type, period_dict, team_detail_dict,last_event_data):
    """
    Transform one raw shot/goal play into a flat record for the tidy table.
    @param data: raw play dict (with "players", "result", "about", "coordinates"
        and "team" entries)
    @param id: game id, stored as-is under "game_id" (the name shadows the
        builtin but is kept because callers pass it by keyword)
    @param event_type: type of event, "Shot" or "Goal"; the strength,
        game-winning-goal and empty-net fields are only read for "Goal"
    @param period_dict: rink sides per period number, as returned by get_side
    @param team_detail_dict: home/away names and abbreviations, as returned by
        get_home_away_team
    @param last_event_data: summary of the previous play as returned by
        last_data_parsing, or None when there is none (the "last_event_*"
        fields are then NaN)
    @return: dictionary with one entry per output column
    """
    players_data = data["players"]
    result_data = data["result"]
    about_data = data["about"]
    coordinates_data = data["coordinates"]
    team_data = data["team"]
    shooter, goalie = get_shooter_goalie(players_data)
    data_dict = {"game_id": id, "event_code": result_data["eventCode"],
                 "player_info": flatten_player_data(players_data),
                 "shooter": shooter, "goalie": goalie, "event": result_data["event"],
                 "event_type_id": result_data["eventTypeId"], "event_description": result_data["description"],

                 "home_team": team_detail_dict["home"], "home_team_abv": team_detail_dict["home_abv"],
                 "away_team": team_detail_dict["away"], "away_team_abv": team_detail_dict["away_abv"],

                 "about_event_id": about_data["eventId"], "about_period": about_data["period"],
                 "about_period_type": about_data["periodType"], "game_time": about_data["periodTime"],
                 "about_time_remaining": about_data["periodTimeRemaining"], "about_date_time": about_data["dateTime"],
                 "about_goal_away": about_data["goals"]["away"], "about_goal_home": about_data["goals"]["home"],
                 "action_team_name": team_data["name"]}

    # Both branches must write the same keys; otherwise records without a
    # previous event would add stray columns to the resulting DataFrame.
    if last_event_data is not None:
        data_dict["last_event_code"] = last_event_data["event_code"]
        data_dict["last_event_type_id"] = last_event_data["event_type_id"]
        data_dict["last_event_coordinates"] = last_event_data["coordinates"]
        data_dict["last_event_time"] = last_event_data["game_time"]
        data_dict["last_event_period"] = last_event_data["about_period"]
    else:
        data_dict["last_event_code"] = np.nan
        data_dict["last_event_type_id"] = np.nan
        data_dict["last_event_coordinates"] = np.nan
        data_dict["last_event_time"] = np.nan
        data_dict["last_event_period"] = np.nan

    if "secondaryType" not in result_data:
        data_dict["event_secondary_type"] = "NA"
    else:
        data_dict["event_secondary_type"] = result_data["secondaryType"]

    data_dict["coordinates"] = get_coordinates(coordinates_data)

    # Periods absent from period_dict are labelled as shootouts.
    period = about_data["period"]
    if period not in period_dict:
        data_dict["home_team_side"] = "NA-Shootout"
        data_dict["away_team_side"] = "NA-Shootout"
    else:
        data_dict["home_team_side"] = period_dict[period]["home"]
        data_dict["away_team_side"] = period_dict[period]["away"]

    if event_type == "Goal":
        data_dict["event_strength_name"] = result_data["strength"]["name"]
        data_dict["event_strength_code"] = result_data["strength"]["code"]
        data_dict["event_game_winning_goal"] = result_data["gameWinningGoal"]
        if "emptyNet" not in result_data:
            data_dict["event_empty_net"] = "Missing Data"
        else:
            data_dict["event_empty_net"] = result_data["emptyNet"]
    else:
        # Goal-only attributes do not apply to plain shots.
        data_dict["event_strength_name"] = "NA"
        data_dict["event_strength_code"] = "NA"
        data_dict["event_game_winning_goal"] = "NA"
        data_dict["event_empty_net"] = "NA"
    return data_dict


def get_goal_shots_data_by_game_id(game_id: int):
    """
    Build the tidy shots/goals table of a single game from the local JSON dump.

    Only plays whose result event is listed in TYPES_OF_SHOTS are kept. Each
    record also carries a summary of the play that immediately preceded it
    (NaN fields when the shot is the first play of the game).
    @param game_id: game id for which the data needs to be transformed
    @return: data frame with one row per shot/goal. If a play cannot be parsed
        the error is printed and the remaining plays of the game are skipped.
    """
    json_path, _ = get_json_path(game_id=game_id)
    with open(json_path, "r") as f:
        # The file holds every game of that season and game type, keyed by
        # game id as a string.
        games_by_id = json.load(f)
    game_data = games_by_id[str(game_id)]

    period_dict = get_side(game_meta=game_data)
    teams_type = get_home_away_team(game_meta=game_data)
    live_data = game_data["liveData"]["plays"]["allPlays"]
    final_list = []
    last_event = None
    for play in live_data:
        if play["result"]["event"] in TYPES_OF_SHOTS:
            try:
                # No previous play yet -> pass None so data_parsing writes NaNs.
                last_event_data = None if last_event is None else last_data_parsing(last_event)
                parsed_data = data_parsing(data=play, id=game_id, event_type=play["result"]["event"],
                                           period_dict=period_dict, team_detail_dict=teams_type,
                                           last_event_data=last_event_data)
                final_list.append(parsed_data)
            except Exception as e:
                print(e)
                # print_exc() writes the traceback itself and returns None, so
                # it must not be wrapped in print().
                traceback.print_exc()
                break
        last_event = play
    return pd.DataFrame(final_list)


def _collect_shot_events(games_by_id, records):
    """Append one tidy record per shot/goal of every game in ``games_by_id`` to ``records``."""
    for game_id, game_data in tqdm(games_by_id.items()):
        period_dict = get_side(game_meta=game_data)
        teams_type = get_home_away_team(game_meta=game_data)
        last_event = None
        for play in game_data["liveData"]["plays"]["allPlays"]:
            if play["result"]["event"] in TYPES_OF_SHOTS:
                try:
                    # No previous play yet -> pass None so data_parsing writes NaNs.
                    last_event_data = None if last_event is None else last_data_parsing(last_event)
                    records.append(data_parsing(data=play, id=game_id, event_type=play["result"]["event"],
                                                period_dict=period_dict, team_detail_dict=teams_type,
                                                last_event_data=last_event_data))
                except Exception as e:
                    print(game_id)
                    print(e)
                    # print_exc() writes the traceback itself and returns None,
                    # so it must not be wrapped in print().
                    traceback.print_exc()
                    # Give up on the rest of this game; later games still run.
                    break
            last_event = play


def get_goal_shots_by_season(season_year: int):
    """
    Get the tidy shots/goals table of a whole season (regular season + playoffs).

    The result is cached as a pickle in the season's data folder: when that
    file exists it is returned directly, otherwise the table is built from the
    two season JSON dumps and then written to the cache.
    @param season_year: the year for which we need to get the goals/shots data
    @return: dataframe for the entire season. Game ids in "game_id" are the
        JSON keys (strings), unlike get_goal_shots_data_by_game_id which stores
        the id it was given.
    """
    tidy_path = Directory.DATA_DIR + str(season_year) + os.path.sep + Directory.TIDY_DATA_PKL_FILENAME
    if os.path.exists(tidy_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook produced itself.
        return pd.read_pickle(tidy_path)

    # Load both dumps up front so a missing file fails before any parsing work.
    season_dumps = []
    for json_path in get_json_path(season=season_year):
        with open(json_path, "r") as f:
            season_dumps.append(json.load(f))

    total_game_list = []
    for games_by_id in season_dumps:
        _collect_shot_events(games_by_id, total_game_list)

    shots_goals_df = pd.DataFrame(total_game_list)
    shots_goals_df.to_pickle(tidy_path)
    return shots_goals_df

In [5]:
# Smoke test of the season pipeline on 2016; only the first five rows are kept.
df_temp = get_goal_shots_by_season(season_year=2016).head()
# df_temp is already limited to five rows, so it is shown directly.
df_temp

100%|█████████████████████████████████████| 1230/1230 [00:00<00:00, 1573.38it/s]
100%|█████████████████████████████████████████| 87/87 [00:00<00:00, 1321.10it/s]


Unnamed: 0,game_id,event_code,player_info,shooter,goalie,event,event_type_id,event_description,home_team,home_team_abv,away_team,away_team_abv,about_event_id,about_period,about_period_type,game_time,about_time_remaining,about_date_time,about_goal_away,about_goal_home,action_team_name,last_event_code,last_event_type_id,last_event_coordinates,last_event_time,last_event_period,event_secondary_type,coordinates,home_team_side,away_team_side,event_strength_name,event_strength_code,event_game_winning_goal,event_empty_net
0,2016020001,OTT8,(Mitchell Marner)_(Shooter)|(Craig Anderson)_(Goalie),Mitchell Marner,Craig Anderson,Shot,SHOT,Mitchell Marner Wrist Shot saved by Craig Anderson,Ottawa Senators,OTT,Toronto Maple Leafs,TOR,8,1,REGULAR,01:11,18:49,2016-10-12T23:19:59Z,0,0,Toronto Maple Leafs,OTT203,BLOCKED_SHOT,"(-61.0, 11.0)",01:10,1,Wrist Shot,"(-77.0, 5.0)",left,right,,,,
1,2016020001,OTT11,(Chris Kelly)_(Shooter)|(Frederik Andersen)_(Goalie),Chris Kelly,Frederik Andersen,Shot,SHOT,Chris Kelly Wrist Shot saved by Frederik Andersen,Ottawa Senators,OTT,Toronto Maple Leafs,TOR,11,1,REGULAR,02:53,17:07,2016-10-12T23:21:41Z,0,0,Ottawa Senators,OTT207,GIVEAWAY,"(54.0, -5.0)",02:48,1,Wrist Shot,"(86.0, 13.0)",left,right,,,,
2,2016020001,OTT15,(Cody Ceci)_(Shooter)|(Frederik Andersen)_(Goalie),Cody Ceci,Frederik Andersen,Shot,SHOT,Cody Ceci Wrist Shot saved by Frederik Andersen,Ottawa Senators,OTT,Toronto Maple Leafs,TOR,15,1,REGULAR,04:01,15:59,2016-10-12T23:23:17Z,0,0,Ottawa Senators,OTT209,MISSED_SHOT,"(-72.0, 0.0)",03:43,1,Wrist Shot,"(23.0, -38.0)",left,right,,,,
3,2016020001,OTT16,(Erik Karlsson)_(Shooter)|(Frederik Andersen)_(Goalie),Erik Karlsson,Frederik Andersen,Shot,SHOT,Erik Karlsson Slap Shot saved by Frederik Andersen,Ottawa Senators,OTT,Toronto Maple Leafs,TOR,16,1,REGULAR,04:46,15:14,2016-10-12T23:24:02Z,0,0,Ottawa Senators,OTT210,MISSED_SHOT,"(77.0, -2.0)",04:27,1,Slap Shot,"(33.0, -15.0)",left,right,,,,
4,2016020001,OTT24,(Martin Marincin)_(Shooter)|(Craig Anderson)_(Goalie),Martin Marincin,Craig Anderson,Shot,SHOT,Martin Marincin Wrist Shot saved by Craig Anderson,Ottawa Senators,OTT,Toronto Maple Leafs,TOR,24,1,REGULAR,06:46,13:14,2016-10-12T23:27:30Z,0,0,Toronto Maple Leafs,OTT23,HIT,"(47.0, 34.0)",06:30,1,Wrist Shot,"(-34.0, 28.0)",left,right,,,,


In [4]:
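# Optional single-game check (disabled). The table below this cell is output
# saved from an earlier run of these two lines.
# df = get_goal_shots_data_by_game_id(game_id=2016020607)
# df.head()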

Unnamed: 0,game_id,event_code,player_info,shooter,goalie,event,event_type_id,event_description,home_team,home_team_abv,away_team,away_team_abv,about_event_id,about_period,about_period_type,game_time,about_time_remaining,about_date_time,about_goal_away,about_goal_home,action_team_name,last_event_code,last_event_type_id,last_event_coordinates,last_event_time,last_event_period,event_secondary_type,coordinates,home_team_side,away_team_side,event_strength_name,event_strength_code,event_game_winning_goal,event_empty_net
0,2016020607,CHI8,(Duncan Keith)_(Shooter)|(Pekka Rinne)_(Goalie),Duncan Keith,Pekka Rinne,Shot,SHOT,Duncan Keith Slap Shot saved by Pekka Rinne,Chicago Blackhawks,CHI,Nashville Predators,NSH,8,1,REGULAR,01:22,18:38,2017-01-09T00:10:00Z,0,0,Chicago Blackhawks,CHI53,HIT,"(-51.0, -37.0)",01:19,1,Slap Shot,"(-33.0, -28.0)",right,left,,,,
1,2016020607,CHI10,(Ryan Hartman)_(Shooter)|(Pekka Rinne)_(Goalie),Ryan Hartman,Pekka Rinne,Shot,SHOT,Ryan Hartman Wrist Shot saved by Pekka Rinne,Chicago Blackhawks,CHI,Nashville Predators,NSH,10,1,REGULAR,01:30,18:30,2017-01-09T00:10:36Z,0,0,Chicago Blackhawks,CHI9,FACEOFF,"(-69.0, -22.0)",01:22,1,Wrist Shot,"(-52.0, -23.0)",right,left,,,,
2,2016020607,CHI11,(Ryan Hartman)_(Shooter)|(Pekka Rinne)_(Goalie),Ryan Hartman,Pekka Rinne,Shot,SHOT,Ryan Hartman Wrist Shot saved by Pekka Rinne,Chicago Blackhawks,CHI,Nashville Predators,NSH,11,1,REGULAR,01:45,18:15,2017-01-09T00:10:51Z,0,0,Chicago Blackhawks,CHI10,SHOT,"(-52.0, -23.0)",01:30,1,Wrist Shot,"(-59.0, 21.0)",right,left,,,,
3,2016020607,CHI12,(Mike Ribeiro)_(Shooter)|(Corey Crawford)_(Goalie),Mike Ribeiro,Corey Crawford,Shot,SHOT,Mike Ribeiro Slap Shot saved by Corey Crawford,Chicago Blackhawks,CHI,Nashville Predators,NSH,12,1,REGULAR,01:52,18:08,2017-01-09T00:10:57Z,0,0,Nashville Predators,CHI11,SHOT,"(-59.0, 21.0)",01:45,1,Slap Shot,"(60.0, -27.0)",right,left,,,,
4,2016020607,CHI21,(Kevin Fiala)_(Shooter)|(Corey Crawford)_(Goalie),Kevin Fiala,Corey Crawford,Shot,SHOT,Kevin Fiala Wrist Shot saved by Corey Crawford,Chicago Blackhawks,CHI,Nashville Predators,NSH,21,1,REGULAR,05:25,14:35,2017-01-09T00:15:11Z,0,0,Nashville Predators,CHI20,BLOCKED_SHOT,"(75.0, -3.0)",04:37,1,Wrist Shot,"(68.0, 22.0)",right,left,,,,


In [6]:
# Build (or load from cache) the tidy table of every season and stack them.
# Each season keeps its own 0-based row index, so index values repeat in final_df.
year = list(range(2016, 2021))
list_df = []
for y in year:
    season_df = get_goal_shots_by_season(season_year=y)
    list_df.append(season_df)
final_df = pd.concat(list_df)

100%|█████████████████████████████████████| 1230/1230 [00:00<00:00, 2256.33it/s]
100%|█████████████████████████████████████████| 87/87 [00:00<00:00, 2157.42it/s]
100%|█████████████████████████████████████| 1271/1271 [00:00<00:00, 2033.78it/s]
100%|█████████████████████████████████████████| 84/84 [00:00<00:00, 2217.87it/s]
100%|█████████████████████████████████████| 1271/1271 [00:00<00:00, 2057.53it/s]
100%|█████████████████████████████████████████| 87/87 [00:00<00:00, 2149.63it/s]
100%|█████████████████████████████████████| 1082/1082 [00:00<00:00, 2042.55it/s]
100%|███████████████████████████████████████| 130/130 [00:00<00:00, 2064.47it/s]
100%|███████████████████████████████████████| 952/952 [00:00<00:00, 2037.83it/s]
0it [00:00, ?it/s]


In [7]:
# Sanity check: (rows, columns) of the combined multi-season table.
final_df.shape

(387829, 34)

In [8]:
# Preview the first rows of the combined table.
final_df.head()

Unnamed: 0,game_id,event_code,player_info,shooter,goalie,event,event_type_id,event_description,home_team,home_team_abv,away_team,away_team_abv,about_event_id,about_period,about_period_type,game_time,about_time_remaining,about_date_time,about_goal_away,about_goal_home,action_team_name,last_event_code,last_event_type_id,last_event_coordinates,last_event_time,last_event_period,event_secondary_type,coordinates,home_team_side,away_team_side,event_strength_name,event_strength_code,event_game_winning_goal,event_empty_net
0,2016020001,OTT8,(Mitchell Marner)_(Shooter)|(Craig Anderson)_(Goalie),Mitchell Marner,Craig Anderson,Shot,SHOT,Mitchell Marner Wrist Shot saved by Craig Anderson,Ottawa Senators,OTT,Toronto Maple Leafs,TOR,8,1,REGULAR,01:11,18:49,2016-10-12T23:19:59Z,0,0,Toronto Maple Leafs,OTT203,BLOCKED_SHOT,"(-61.0, 11.0)",01:10,1,Wrist Shot,"(-77.0, 5.0)",left,right,,,,
1,2016020001,OTT11,(Chris Kelly)_(Shooter)|(Frederik Andersen)_(Goalie),Chris Kelly,Frederik Andersen,Shot,SHOT,Chris Kelly Wrist Shot saved by Frederik Andersen,Ottawa Senators,OTT,Toronto Maple Leafs,TOR,11,1,REGULAR,02:53,17:07,2016-10-12T23:21:41Z,0,0,Ottawa Senators,OTT207,GIVEAWAY,"(54.0, -5.0)",02:48,1,Wrist Shot,"(86.0, 13.0)",left,right,,,,
2,2016020001,OTT15,(Cody Ceci)_(Shooter)|(Frederik Andersen)_(Goalie),Cody Ceci,Frederik Andersen,Shot,SHOT,Cody Ceci Wrist Shot saved by Frederik Andersen,Ottawa Senators,OTT,Toronto Maple Leafs,TOR,15,1,REGULAR,04:01,15:59,2016-10-12T23:23:17Z,0,0,Ottawa Senators,OTT209,MISSED_SHOT,"(-72.0, 0.0)",03:43,1,Wrist Shot,"(23.0, -38.0)",left,right,,,,
3,2016020001,OTT16,(Erik Karlsson)_(Shooter)|(Frederik Andersen)_(Goalie),Erik Karlsson,Frederik Andersen,Shot,SHOT,Erik Karlsson Slap Shot saved by Frederik Andersen,Ottawa Senators,OTT,Toronto Maple Leafs,TOR,16,1,REGULAR,04:46,15:14,2016-10-12T23:24:02Z,0,0,Ottawa Senators,OTT210,MISSED_SHOT,"(77.0, -2.0)",04:27,1,Slap Shot,"(33.0, -15.0)",left,right,,,,
4,2016020001,OTT24,(Martin Marincin)_(Shooter)|(Craig Anderson)_(Goalie),Martin Marincin,Craig Anderson,Shot,SHOT,Martin Marincin Wrist Shot saved by Craig Anderson,Ottawa Senators,OTT,Toronto Maple Leafs,TOR,24,1,REGULAR,06:46,13:14,2016-10-12T23:27:30Z,0,0,Toronto Maple Leafs,OTT23,HIT,"(47.0, 34.0)",06:30,1,Wrist Shot,"(-34.0, 28.0)",left,right,,,,
