In [1]:
import copy
import re
import pandas as pd
from tqdm import tqdm

# Width of each snapshot window, in seconds of game time.
slice_length = 30

# Game-level table (iterated one row per game further down). Game_Id is cast
# to int64 at load time -- presumably so it compares cleanly against the
# Game_Id column of the play-by-play table; confirm against the parquet schema.
games = pd.read_parquet("../data/game_elo.parquet").astype({"Game_Id": "int64"})

# Play-by-play events, filtered per game by Game_Id and Date when slicing.
pbp = pd.read_parquet("../data/pbp_reduced.parquet")

def convert_strength_to_int(strength: str) -> int:
    """Turn an ``"<a>x<b>"`` strength string into the signed difference ``a - b``.

    The string is split on ``'x'``; the first field minus the second field is
    returned (home minus away, per the original author's note). Any fields
    after the second are ignored.

    Raises:
        ValueError: if either of the first two fields is not an integer.
        IndexError: if the string contains no ``'x'`` separator.
    """
    fields = strength.split('x')
    home_count = int(fields[0])
    away_count = int(fields[1])
    return home_count - away_count

In [2]:
slices = []

for idx, game in tqdm(games.iterrows(), total=len(games)):
    cur_cutoff = slice_length
    
    game_totals = {
        "game": game["Game_Id"],
        "season": game["Season"],
        "time_remaining": 3600,  # three periods of 20 minutes, in seconds
        "away_elo": game["Away_Starting_Elo"],
        "home_elo": game["Home_Starting_Elo"],
        "away_score": 0,
        "home_score": 0,
        "away_pim": 0,
        "home_pim": 0,
        "away_hits": 0,
        "home_hits": 0,
        "away_shots": 0,
        "home_shots": 0,
        "strength": 0,
        "winner": "home" if game["Home_Score"] > game["Away_Score"] else "away"
    }

    slices.append(game_totals)  # initial based purely on Elo

    reduced = pbp[(pbp["Game_Id"] == game["Game_Id"]) & (pbp["Date"] == game["Date"])]
    for idx, play in reduced.iterrows():
        elapsed = play["Seconds_Elapsed"] + (play["Period"] - 1) * 1200
        if elapsed > (1200 * 3):
            break  # ignoring overtime
        if elapsed >= cur_cutoff:
            # convert to time remaining
            game_totals["time_remaining"] = 3600 - cur_cutoff
            slices.append(copy.deepcopy(game_totals))
            cur_cutoff += 30

        if play["Ev_Team"] == game["Home_Team"]:
            team = "home"
        elif play["Ev_Team"] == game["Away_Team"]:
            team = "away"
        else:
            team = None

        if team:
            match play["Event"]:
                case "SHOT":
                    game_totals[f"{team}_shots"] += 1
                case "HIT":
                    game_totals[f"{team}_hits"] += 1
                case "PENL":
                    try:
                        text = play["Type"].split("(")[1]  # some penalty descriptions have player number
                        if "maj" in text:
                            game_totals[f"{team}_pim"] += 5
                        else:
                            mins = re.search(r'\d+', text)
                            game_totals[f"{team}_pim"] += int(mins.group())
                    except (IndexError, AttributeError):
                        pass  # some penalties are missing descriptions
                case "GOAL":
                    game_totals[f"{team}_score"] += 1

        game_totals["strength"] = convert_strength_to_int(play["Strength"])  # always update strength

100%|██████████| 21677/21677 [10:45<00:00, 33.58it/s]


In [3]:
# Materialise the accumulated slice records (one dict per row) as a table
# and persist it for downstream use.
df = pd.DataFrame(data=slices)
df.to_parquet(path="../data/time_slices.parquet")