## Imports

In [None]:
try:
    import pandas as pd
except ImportError:
    !pip install pandas
    import pandas as pd

try:
    import numpy as np
except ImportError:
    !pip install numpy
    import numpy as np

try:
    import requests
except ImportError:
    !pip install requests
    import requests
  
try: 
  from tqdm import tqdm
except ImportError:
  !pip install tqdm
  from tqdm import tqdm

# already included in python
from pprint import pprint

In [2]:
# API key placed in every feed URL built below (this is a public key)
key = '41b145a848f4bd67'

# Team name -> team ID used by the feed; IDs are kept as strings
team_id = {
    'Spokane Chiefs': '215',
    'Seattle Thunderbirds': '214',
    'Portland Winterhawks': '208',
    'Everett Silvertips': '226',
    'Tri-City Americans': '217',
    'Kamloops Blazers': '203',
    'Kelowna Rockets': '204',
    'Prince George Cougars': '210',
    'Brandon Wheat Kings': '201',
    'Swift Current Broncos': '216',
    'Vancouver Giants': '223',
    'Victoria Royals': '227',
    'Medicine Hat Tigers': '206',
    'Edmonton Oil Kings': '228',
    'Moose Jaw Warriors': '207',
    'Regina Pats': '212',
    'Saskatoon Blades': '213',
    'Prince Albert Raiders': '209',
    'Calgary Hitmen': '202',
    'Lethbridge Hurricanes': '205',
    'Red Deer Rebels': '211',
    'Wenatchee Wild': '222',
}

# Sanity check: the league has 22 teams, so this should print 22
print(len(team_id))

22


# URLs

In [3]:
def game_id_url_func(num_of_past_games, current_team_id):
    """Build the scorebar feed URL listing one team's past games.

    Args:
        num_of_past_games: Value sent as the feed's `numberofdaysback`
            query parameter.
            NOTE(review): despite the name, this presumably bounds the
            look-back window in days rather than a number of games - confirm
            against the feed's behaviour.
        current_team_id: Team ID sent as the `team_id` query parameter.

    Returns:
        The full URL as a string. The module-level `key` is embedded in it.
    """
    endpoint = 'https://lscluster.hockeytech.com/feed/'
    query = (
        f'?feed=modulekit&key={key}&view=scorebar&client_code=whl'
        f'&numberofdaysahead=0&numberofdaysback={num_of_past_games}'
        f'&season_id=&team_id={current_team_id}&lang_code=en&fmt=json'
    )
    return endpoint + query

def game_stats_url_func(game_id):
    """Build the feed URL that returns the stats for a single game.

    Args:
        game_id: Game ID sent as the `game_id` query parameter.

    Returns:
        The full URL as a string. The module-level `key` is embedded in it.
    """
    endpoint = 'https://lscluster.hockeytech.com/feed/'
    query = f'?feed=gc&key={key}&game_id={game_id}&client_code=whl&tab=clock&lang_code=en&fmt=json'
    return endpoint + query

# Get Past Game IDs for each team

In [4]:
def get_game_ids(url, timeout=30):
    """Download a scorebar feed and return its list of game entries.

    Args:
        url: Scorebar feed URL, such as one built by game_id_url_func.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on a stalled
            connection.

    Returns:
        The value stored under ['SiteKit']['Scorebar'] in the JSON response
        (the caller in this notebook reads an 'ID' from each entry), or None
        when the status code is not 200 or any exception is raised. A message
        is printed in both failure cases.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br"  # Request compressed responses
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)

        # Anything other than 200 is reported and treated as "no data"
        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            return None

        return response.json()['SiteKit']['Scorebar']
    except Exception as e:
        # Best effort: network errors, timeouts, invalid JSON and missing keys
        # are all reported and turned into None rather than raised.
        print(f"An error occurred: {e}")
        return None

## Get the Team ID of each opponent

### Get all teams' past game IDs

In [5]:
# Keeps track of the game IDs that each team has played
team_games = {}
num_of_past_games = 500
for team in team_id:    
  curr_game_id = game_id_url_func(num_of_past_games, team_id[team])
  game = get_game_ids(curr_game_id)
  team_games[team] = [x['ID'] for x in game]


In [6]:
# Every name listed here is reported as 'Wenatchee Wild'.
# The canonical name is listed too, in case a game already uses it.
team_name_mapping = dict.fromkeys(
    ('Kootenay ICE', 'Winnipeg ICE', 'Wenatchee Wild'),
    'Wenatchee Wild',
)

def get_canonical_team_name(team_name):
    """Return the standardized form of a team name.

    Names present in `team_name_mapping` are replaced by their mapped value;
    any other name is returned unchanged.
    """
    if team_name in team_name_mapping:
        return team_name_mapping[team_name]
    return team_name

# Sanity check: count the distinct game IDs collected across all teams
test = set()
for games in team_games.values():
    test.update(games)

print(len(test))



5598


# Get the stats from specific games

## Initialize the DataFrame

In [7]:
import pandas as pd

# Column layout: the game ID, then one Home_/Away_ pair per statistic
df_columns = [
    "Game_ID",
    "Home_Name", "Away_Name",
    "Home_Goals", "Away_Goals",
    "Home_PP%", "Away_PP%",
    "Home_SOG", "Away_SOG",
    "Home_FOW%", "Away_FOW%",
]

# Empty frame with those columns; rows are added as games are processed
dataset = pd.DataFrame(columns=df_columns)

## Function to fetch team stats, given a game ID

In [8]:
def get_game_stats(game_id, timeout=30):
    """Fetch the stats of one game from the Hockey Tech API.

    Args:
        game_id: Game ID; the request URL is built with game_stats_url_func.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response, or None when the status code is not 200 or
        any exception is raised while requesting/decoding. A message is
        printed in both failure cases, so callers must handle None.
    """
    game_stats_url = game_stats_url_func(game_id)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br"  # Request compressed responses
    }

    try:
        response = requests.get(game_stats_url, headers=headers, timeout=timeout)

        # Anything other than 200 is reported and treated as "no data"
        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            return None

        return response.json()
    except Exception as e:
        # Best effort: network errors, timeouts and invalid JSON are reported
        # and turned into None rather than raised.
        print(f"An error occurred: {e}")
        return None

## Parse through each game stats from each team

In [9]:
def _game_row(game_id, stats):
    """Flatten one payload returned by get_game_stats into a dataset row.

    Args:
        game_id: ID stored in the row's "Game_ID" column.
        stats: Decoded JSON for the game; every value is read from
            stats['GC']['Clock'].

    Returns:
        A dict with one entry per dataset column.

    Raises:
        KeyError, TypeError, ValueError or AttributeError when an expected
        field is missing or cannot be converted to a number.
    """
    clock = stats['GC']['Clock']

    # Power play %: goals divided by total power plays, 0 when there were none
    power_play = clock['power_play']
    home_pp_total = float(power_play['total']['home'])
    visitor_pp_total = float(power_play['total']['visiting'])
    home_ppp = float(power_play['goals']['home']) / home_pp_total if home_pp_total != 0 else 0.0
    visitor_ppp = float(power_play['goals']['visiting']) / visitor_pp_total if visitor_pp_total != 0 else 0.0

    # Faceoff win %: each side's share of all faceoffs won, 50/50 when none recorded
    home_fow = float(clock['fow']['home'])
    visitor_fow = float(clock['fow']['visiting'])
    fow_total = home_fow + visitor_fow
    if fow_total != 0:
        home_fowp, visitor_fowp = home_fow / fow_total, visitor_fow / fow_total
    else:
        home_fowp, visitor_fowp = .5, .5

    # Shots on goal are stored as a mapping per side; the total is the sum of its values
    shots = clock['shots_on_goal']

    return {
        "Game_ID": game_id,
        "Home_Name": get_canonical_team_name(clock['home_team']['name']),
        "Away_Name": get_canonical_team_name(clock['visiting_team']['name']),
        "Home_Goals": int(clock['home_goal_count']),
        "Away_Goals": int(clock['visiting_goal_count']),
        "Home_PP%": home_ppp,
        "Away_PP%": visitor_ppp,
        "Home_SOG": sum(shots['home'].values()),
        "Away_SOG": sum(shots['visiting'].values()),
        "Home_FOW%": home_fowp,
        "Away_FOW%": visitor_fowp,
    }


# Total number of (team, game) entries, duplicates included, for the progress bar
total_games = sum(len(game_ids) for game_ids in team_games.values())

# Game IDs already handled; a game can appear in more than one team's list
existing_game_ids = set()

# Rows are collected here and turned into a DataFrame once at the end, which
# avoids growing the frame row by row and makes re-running this cell rebuild
# `dataset` instead of appending duplicate rows to it.
game_rows = []

# Games whose stats could not be fetched or parsed
failed_game_ids = []

with tqdm(total=total_games, desc="Processing all games", unit="game") as overall_pbar:
    for game_ids in team_games.values():
        for game in game_ids:
            # Only fetch stats for games we have not seen yet
            if game not in existing_game_ids:
                existing_game_ids.add(game)

                # get_game_stats returns None when the request fails
                stats = get_game_stats(game)
                row = None
                if stats is not None:
                    try:
                        row = _game_row(game, stats)
                    except (KeyError, TypeError, ValueError, AttributeError):
                        # Incomplete or malformed payload: skip this game
                        # rather than aborting the whole run.
                        row = None

                if row is None:
                    failed_game_ids.append(game)
                else:
                    game_rows.append(row)

            # Advance the bar for every entry, including skipped duplicates
            overall_pbar.update(1)

# Build the dataset in one step, keeping the declared column order
dataset = pd.DataFrame(game_rows, columns=df_columns)

if failed_game_ids:
    print(f"Skipped {len(failed_game_ids)} game(s) with missing or malformed stats: {failed_game_ids}")


Processing all games: 100%|████████████████████████████████████████████████████| 11004/11004 [35:44<00:00,  5.13game/s]


## Display and Write CSV File

In [10]:
# Render the collected per-game stats table (one row per processed game ID)
display(dataset)


Unnamed: 0,Game_ID,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
0,1015005,Spokane Chiefs,Seattle Thunderbirds,5,4,0.166667,0.666667,42,29,0.550725,0.449275
1,1015015,Seattle Thunderbirds,Spokane Chiefs,5,4,0.000000,0.000000,26,33,0.584906,0.415094
2,1015027,Spokane Chiefs,Tri-City Americans,4,3,0.250000,0.000000,45,24,0.508475,0.491525
3,1015029,Tri-City Americans,Spokane Chiefs,4,3,0.500000,0.250000,26,46,0.507463,0.492537
4,1015052,Spokane Chiefs,Portland Winterhawks,9,3,0.750000,0.000000,50,19,0.596154,0.403846
...,...,...,...,...,...,...,...,...,...,...,...
5593,1020884,Red Deer Rebels,Wenatchee Wild,2,3,0.000000,0.333333,31,25,0.564516,0.435484
5594,1021382,Wenatchee Wild,Red Deer Rebels,5,2,0.166667,0.000000,39,35,0.514706,0.485294
5595,1015056,Saskatoon Blades,Wenatchee Wild,4,2,0.000000,0.000000,32,29,0.375000,0.625000
5596,1015102,Wenatchee Wild,Saskatoon Blades,7,2,0.250000,0.000000,33,30,0.545455,0.454545


In [11]:
# Save the dataset as a comma-separated values file, relative to the working
# directory. The DataFrame index is written as well (to_csv default).
dataset.to_csv(path_or_buf='All_teams_WHL_stats.csv')
print('Successfully wrote to CSV file')

Successfully wrote to CSV file
