## Import/Install Needed Libraries


In [75]:
try:
    import pandas as pd
except ImportError:
    !pip install pandas
    import pandas as pd

try:
    import numpy as np
except ImportError:
    !pip install numpy
    import numpy as np

try:
    import requests
except ImportError:
    !pip install requests
    import requests

from collections import deque

try:
    import sklearn
except ImportError:
    !pip install scikit-learn
    import sklearn

## Declare K


In [76]:
# 5 is recommended
k = 10

## Links and GET functions


In [77]:
key = '41b145a848f4bd67' # This is a public key
def game_id_url_func(num_of_ahead_games, num_of_past_games, current_team_id=''):
  return f'https://lscluster.hockeytech.com/feed/?feed=modulekit&key={key}&view=scorebar&client_code=whl&numberofdaysahead={num_of_ahead_games}&numberofdaysback={num_of_past_games}&season_id=&team_id={current_team_id}&lang_code=en&fmt=json'

def game_stats_url_func(game_id):
  return f'https://lscluster.hockeytech.com/feed/?feed=gc&key={key}&game_id={game_id}&client_code=whl&tab=clock&lang_code=en&fmt=json'

def get_game_ids(url):
  headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br"  # Request gzip compression
  }

  try:
      response = requests.get(url, headers=headers)

      # Check if the request was successful
      if response.status_code == 200:

          return response.json()['SiteKit']['Scorebar']
          #pprint(response)
      else:
          print(f"Failed to retrieve the page. Status code: {response.status_code}")
  except Exception as e:
      print(f"An error occurred: {e}")


'''
Expects a game_id, which is used to access the Hockey Tech API to get the stats of a game, returned in a JSON format
'''
def get_game_stats(game_id):
  game_stats_url = game_stats_url_func(game_id)
  headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br"  # Request gzip compression
  }
  
  try:
      response = requests.get(game_stats_url, headers=headers)
  
      # Check if the request was successful
      if response.status_code == 200:
  
          return response.json()
          pprint(response)
      else:
          print(f"Failed to retrieve the page. Status code: {response.status_code}")
  except Exception as e:
      print(f"An error occurred: {e}")

## Import the Dataset


In [78]:
# Read the CSV. ALso, dont create an index column
dataset = pd.read_csv('All_teams_WHL_stats.csv', index_col=0)
team_id = {'Spokane Chiefs': '215', 'Seattle Thunderbirds': '214', 'Portland Winterhawks': '208', 'Everett Silvertips': '226', 'Tri-City Americans': '217', 'Kamloops Blazers': '203', 'Kelowna Rockets': '204', 'Prince George Cougars': '210', 'Brandon Wheat Kings': '201', 'Swift Current Broncos': '216', 'Vancouver Giants': '223', 'Victoria Royals': '227', 'Medicine Hat Tigers': '206', 'Edmonton Oil Kings': '228', 'Moose Jaw Warriors': '207', 'Regina Pats': '212', 'Saskatoon Blades': '213', 'Prince Albert Raiders': '209', 'Calgary Hitmen': '202', 'Lethbridge Hurricanes': '205', 'Red Deer Rebels': '211', 'Wenatchee Wild': '222'}

In [79]:
dataset = dataset.sort_values(by="Game_ID")
dataset.reset_index(drop=True, inplace=True)
display(dataset)

Unnamed: 0,Game_ID,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
0,1014692,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
1,1014699,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
2,1014708,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
3,1014720,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
4,1014735,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...
6078,1022061,Medicine Hat Tigers,Spokane Chiefs,2,6,0.250000,0.000000,21,25,0.561644,0.438356
6079,1022062,Spokane Chiefs,Medicine Hat Tigers,0,6,0.000000,0.333333,28,45,0.720588,0.279412
6080,1022063,Spokane Chiefs,Medicine Hat Tigers,2,5,0.250000,0.750000,30,34,0.641791,0.358209
6081,1022064,Spokane Chiefs,Medicine Hat Tigers,2,4,0.200000,0.333333,36,36,0.469697,0.530303


## Update the WHL team stats dataset with recent games


In [80]:
# Get the last 10 days worth of games. Do this for reducency, incase we missed a day of running the script
update_games = get_game_ids(game_id_url_func(0, 10))

print([game['ID'] for game in update_games])
# We get a lot of games because the API calls num_of_game_to_update back, and multiple games are played on the same day

['1022054', '1022055', '1022047', '1022048', '1022060', '1022061', '1022062', '1022063', '1022064', '1022067', '1022069', '1022068', '1022070']


In [81]:
recorded_ids = set(dataset['Game_ID'])

# If there is a game that we havent seen, set this to true
# This will write the updated stats to the WHL dataset
update_dataset_condidion = False

if update_games:
  for game in update_games:
    # Because we got more games than we needed, we ignore the games we already have
    if int(game['ID']) in recorded_ids:
      continue

    # Need to update the WHL stats dataset
    update_dataset_condidion = True  
    
    # Get the game stats for each game
    stats = get_game_stats(game["ID"])
  
    print(stats)
    # Home and visitor team names
    visitor = stats['GC']['Clock']['visiting_team']['name']
    home = stats['GC']['Clock']['home_team']['name']
  
    # Number of goals
    home_goals = int(stats['GC']['Clock']['home_goal_count'])
    visitor_goals = int(stats['GC']['Clock']['visiting_goal_count'])
  
    # Calculate Power Play %
    home_ppp_total = float(stats['GC']['Clock']['power_play']['total']['home'])
    visitor_ppp_total = float(stats['GC']['Clock']['power_play']['total']['visiting'])
    # Avoid division by zero for home_ppp
    home_pp_goals = stats['GC']['Clock']['power_play']['goals']['home']
    home_pp_goals = float(home_pp_goals) if home_pp_goals is not None else 0.0

    if home_ppp_total != 0:
        home_ppp = home_pp_goals / home_ppp_total
    else:
        home_ppp = 0  # Default to 0 if no power plays

    visitor_pp_goals = float(stats['GC']['Clock']['power_play']['goals']['visiting'])
    visitor_pp_goals = float(visitor_pp_goals) if visitor_pp_goals is not None else 0.0
    # Avoid division by zero for visitor_ppp
    if visitor_ppp_total != 0:
        visitor_ppp = visitor_pp_goals / visitor_ppp_total
    else:
        visitor_ppp = 0  # Default to 0 if no power plays
  
    
    # Calculate Faceoff Win %
    home_fowp = float(stats['GC']['Clock']['fow']['home'])
    visitor_fowp = float(stats['GC']['Clock']['fow']['visiting'])
    fow_total = home_fowp + visitor_fowp
    if fow_total != 0:
        home_fowp /= fow_total
        visitor_fowp /= fow_total
    else:
        home_fowp, visitor_fowp = .5, .5
  
    # Shots on goal
    home_sog = sum(stats['GC']['Clock']['shots_on_goal']['home'].values())
    visitor_sog = sum(stats['GC']['Clock']['shots_on_goal']['visiting'].values())
    
    # Add data to the dataset
    dataset.loc[len(dataset)] = {
        "Game_ID": int(game['ID']),
        "Home_Name": home,
        "Away_Name": visitor,
        "Home_Goals": home_goals,
        "Away_Goals": visitor_goals,
        "Home_PP%": home_ppp,
        "Away_PP%": visitor_ppp,
        "Home_SOG": home_sog,
        "Away_SOG": visitor_sog,
        "Home_FOW%": home_fowp,
        "Away_FOW%": visitor_fowp
    }

{'GC': {'Parameters': {'feed': 'gc', 'key': '41b145a848f4bd67', 'game_id': '1022069', 'client_code': 'whl', 'tab': 'clock', 'lang_code': 'en', 'fmt': 'json', 'lang_id': 1, 'static': 1}, 'Clock': {'period': '3', 'game_clock': '00:00:00', 'game_date_iso_8601': '2025-09-02T14:00:00-06:00', 'timezone': 'Canada/Saskatchewan', 'timezone_short': 'ST', 'started': '1', 'game_number': '3', 'season_id': '290', 'season_name': '2025 - 26 Pre-Season', 'home_team': {'name': 'Saskatoon Blades', 'team_id': '213', 'team_code': 'SAS', 'team_nickname': 'Blades', 'team_city': 'Saskatoon'}, 'visiting_team': {'name': 'Prince Albert Raiders', 'team_id': '209', 'team_code': 'PA', 'team_nickname': 'Raiders', 'team_city': 'Prince Albert'}, 'text_game_summary': '/stats/text-game-report.php?game_id=1022069', 'official_game_report': '/stats/official-game-report.php?game_id=1022069', 'home_audio_url': '', 'home_video_url': '', 'home_webcast_url': '', 'visiting_audio_url': '', 'visiting_video_url': '', 'visiting_webc

### Update CSV


In [82]:
# Update the CSV for future use
# Sort by game_id
dataset = dataset.sort_values(by="Game_ID")
# Remove duplicate games, based on game_id
dataset = dataset.drop_duplicates(subset="Game_ID", keep="first")
dataset.reset_index(drop=True, inplace=True)
display(dataset)

Unnamed: 0,Game_ID,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
0,1014692,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
1,1014699,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
2,1014708,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
3,1014720,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
4,1014735,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...
6081,1022064,Spokane Chiefs,Medicine Hat Tigers,2,4,0.200000,0.333333,36,36,0.469697,0.530303
6082,1022067,Wenatchee Wild,Tri-City Americans,2,1,0.000000,0.333333,22,28,0.436364,0.563636
6083,1022068,Lethbridge Hurricanes,Medicine Hat Tigers,4,7,0.400000,0.500000,29,35,0.421875,0.578125
6084,1022069,Saskatoon Blades,Prince Albert Raiders,4,1,0.000000,0.000000,32,26,0.549020,0.450980


In [83]:
if update_dataset_condidion:
    dataset.to_csv('All_teams_WHL_stats.csv')

## Get Next Games


In [84]:
# We want to do a GET request getting the next games played
# Store this information within an [] of (Home Team, Visiting Team)

# Then get a DF and train the model on both teams playing

# This is the number of days ahead to predict games for
# 1 is recommended because anything past that then model is missing previous games played
num_of_future_days_predict = 1
next_games_response = get_game_ids(game_id_url_func(num_of_future_days_predict, 0))
next_games = []
for game in next_games_response:
  next_games.append((game['HomeLongName'], game["VisitorLongName"]))

print(next_games)

[('Prince Albert Raiders', 'Saskatoon Blades'), ('Lethbridge Hurricanes', 'Calgary Hitmen'), ('Swift Current Broncos', 'Regina Pats'), ('Kamloops Blazers', 'Vancouver Giants'), ('Kelowna Rockets', 'Penticton Vees')]


## Create features and dependencies


In [85]:
# Get default values for averages of goals, PP%, SOG, and FOW%
default_goals = (dataset['Home_Goals'].mean() + dataset['Away_Goals'].mean()) // 2
default_ppp = (dataset['Home_PP%'].mean() + dataset['Away_PP%'].mean()) / 2
default_sog = (dataset['Home_SOG'].mean() + dataset['Away_SOG'].mean()) // 2
default_fowp = (dataset['Home_FOW%'].mean() + dataset['Away_FOW%'].mean()) / 2

print('Default Values used:')
print(default_goals, default_ppp, default_sog, default_fowp)

Default Values used:
3.0 0.21749313827447803 31.0 0.5000000000008189


In [86]:
def get_opponent_stats(dataset, queue, opponent_name, row):
  """
  Calculates average statistics for a given opponent based on their last 'k' games prior to a specified game.
  
  Parameters:
  - dataset (pd.DataFrame): The dataset containing game records with statistics for each game.
  - queue: (Unused in this function but included as a parameter for compatibility).
  - opponent_name (str): The name of the opponent team whose stats are being calculated.
  - row (pd.Series): The current row of the dataset, representing the game for which the opponent's stats are calculated.

  Returns:
  - opponent_goals (float): Average goals scored by the opponent over the last 'k' games.
  - opponent_ppp (float): Average power-play percentage of the opponent over the last 'k' games.
  - opponent_fowp (float): Average face-off win percentage of the opponent over the last 'k' games.
  - opponent_sog (float): Average shots on goal by the opponent over the last 'k' games.

  Notes:
  - 'k' is assumed to be a predefined global variable representing the number of games to consider.
  - Default values (`default_goals`, `default_ppp`, `default_fowp`, `default_sog`) must also be predefined globally.
  - Uses `.where()` to select statistics based on whether the opponent is playing as home or away in each game.
  """
  
  # Get the last k games averages
  # This creates a new dataset of only opponent games, and which are before the current game
  opponent_games = dataset[
    ((dataset['Home_Name'] == opponent_name) | (dataset['Away_Name'] == opponent_name)) & 
    (dataset['Game_ID'] < row['Game_ID'])
  ].tail(k)

  # If the opponent hasnt played k games yet, use default
  if len(opponent_games) < k:
    opponent_goals = default_goals
    opponent_ppp = default_ppp
    opponent_fowp = default_fowp
    opponent_sog = default_sog
  else:
    # Get the mean of the entire column that the opponent played
    opponent_goals = opponent_games['Home_Goals'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_Goals']).mean()
    opponent_ppp = opponent_games['Home_PP%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_PP%']).mean()
    opponent_fowp = opponent_games['Home_FOW%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_FOW%']).mean()
    opponent_sog = opponent_games['Home_SOG'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_SOG']).mean()

  return opponent_goals, opponent_ppp, opponent_fowp, opponent_sog

### Get Avg Stats for each team


In [87]:
def generate_team_stats(dataset, team_a, k):
  """
  This function generates a DataFrame with stats comparing a target team (team_a) against its opponent (opponent_team)
  over the last 'k' games. It calculates average stats and differences between the target team and the opponent.

  Parameters:
  - dataset (DataFrame): The dataset containing the game statistics.
  - team_a (str): The name of the target team.
  - k (int): The number of recent games to consider for averaging stats.

  Returns:
  - DataFrame: A DataFrame containing the calculated statistics and differences between the target team and its opponent.
  """

  # Initialize variables and the deque to store the last k games
  target_queue = deque(maxlen=k)
  target_df = pd.DataFrame(columns = [
    "target_goals", 
    "opponent_goals", 
    "target_PP%", 
    "opponent_PP%", 
    "target_SOG", 
    "opponent_SOG",
    "target_FOW%", 
    "opponent_FOW%", 
    "Home/Away",
    "goals_diff",
    "ppp_diff",
    "sog_diff",
    "fowp_diff",
    "target_win"
  ])

  target_goals = 0
  target_ppp = 0
  target_fowp = 0
  target_sog = 0

  # Iterate through the dataset to track the last 'k' games
  for index, row in dataset.iterrows():
    if row['Home_Name'] == team_a or row['Away_Name'] == team_a:

      side = 'Home' if row['Home_Name'] == team_a else 'Away'
      opponent_name = row['Away_Name'] if side == 'Home' else row['Home_Name']
      
      # Update statistics for the target team
      target_goals += row[side + '_Goals']
      target_ppp += row[side + '_PP%']
      target_fowp += row[side + '_FOW%']
      target_sog += row[side + '_SOG']
      
      # If the queue is not full, add the current game to the queue
      # k-1 because we can continue of the queue is almost full
      if len(target_queue) < k - 1:
        target_queue.append(row.to_dict())
      else:
        # Calculate the opponent's stats
        # This function is the code cell above this
        opponent_goals, opponent_ppp, opponent_fowp, opponent_sog = get_opponent_stats(dataset, target_queue, opponent_name, row)
        
        # Calculate average stats for the target team. /k find the aggregate avg of the sum of stats
        avg_target_goals = target_goals / k
        avg_target_ppp = target_ppp / k
        avg_target_fowp = target_fowp / k
        avg_target_sog = target_sog / k

        # Calculate differences between the target team and the opponent
        goals_diff = avg_target_goals - opponent_goals
        ppp_diff = avg_target_ppp - opponent_ppp
        sog_diff = avg_target_sog - opponent_sog
        fowp_diff = avg_target_fowp - opponent_fowp

        target_win = 1 if row[side + '_Goals'] > row[('Away' if side == 'Home' else 'Home') + '_Goals'] else 0

        # 1 is Home, 0 is Away
        target_df.loc[len(target_df)] = {
            "target_goals": avg_target_goals,
            "opponent_goals": opponent_goals,
            "target_PP%": avg_target_ppp,
            "opponent_PP%": opponent_ppp,
            "target_SOG": avg_target_sog,
            "opponent_SOG": opponent_sog,
            "target_FOW%": avg_target_fowp,
            "opponent_FOW%": opponent_fowp,
            "Home/Away": 1.0 if side == 'Home' else 0.0,
            "goals_diff": goals_diff,
            "ppp_diff": ppp_diff,
            "sog_diff": sog_diff,
            "fowp_diff": fowp_diff,
            "target_win": target_win
        }

        # Remove the oldest game from the queue and update stats
        popped = target_queue.popleft()
        popped_side = 'Home' if popped['Home_Name'] == team_a else 'Away'
        target_goals -= popped[popped_side + '_Goals']
        target_ppp -= popped[popped_side + '_PP%']
        target_fowp -= popped[popped_side + '_FOW%']
        target_sog -= popped[popped_side + '_SOG']

        # Add the current game to the queue
        target_queue.append(row.to_dict())

  return target_df

### Generate DataFrames For Each Team Playing in Next Game


In [88]:
# loop through the next games, and get the dataframe for each team.
# Store it in a [] of (DF Home, DF, Away)

# This is an array that stores the stats DataFrame for each team playing
next_games_dfs = []
for game in next_games:
  home_team = game[0]
  away_team = game[1]
  home_team_stats_df = generate_team_stats(dataset, home_team, k)
  away_team_stats_df = generate_team_stats(dataset, away_team, k)
  
  next_games_dfs.append((home_team_stats_df, away_team_stats_df))


In [89]:
# Just display the first game to make sure
if len(next_games_dfs) > 0:
  display(next_games_dfs[0][0])

for i, game in enumerate(next_games_dfs):
  print(f'Game {i+1}: {next_games[i][0]} vs {next_games[i][1]}')
  print(f'Number of Home Team Rows: {len(game[0])}, Number of Away Team Rows: {len(game[1])}')

Unnamed: 0,target_goals,opponent_goals,target_PP%,opponent_PP%,target_SOG,opponent_SOG,target_FOW%,opponent_FOW%,Home/Away,goals_diff,ppp_diff,sog_diff,fowp_diff,target_win
0,3.6,3.2,0.248333,0.166667,33.7,30.4,0.462265,0.455354,1.0,0.4,0.081667,3.3,0.006910,1
1,3.4,3.0,0.198333,0.223333,35.3,27.4,0.487624,0.498731,1.0,0.4,-0.025000,7.9,-0.011107,1
2,3.0,2.7,0.215000,0.111667,35.3,36.0,0.485952,0.488623,0.0,0.3,0.103333,-0.7,-0.002671,0
3,2.6,3.6,0.165000,0.220000,32.9,29.1,0.486547,0.534352,1.0,-1.0,-0.055000,3.8,-0.047804,0
4,2.9,6.6,0.215000,0.288333,32.0,41.4,0.501541,0.535303,1.0,-3.7,-0.073333,-9.4,-0.033762,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
543,3.5,5.6,0.323333,0.316667,25.7,34.2,0.506901,0.500035,0.0,-2.1,0.006667,-8.5,0.006865,0
544,3.6,5.5,0.356667,0.376667,26.5,34.9,0.506624,0.494316,0.0,-1.9,-0.020000,-8.4,0.012308,0
545,3.3,5.4,0.306667,0.416667,25.5,35.3,0.499146,0.501350,1.0,-2.1,-0.110000,-9.8,-0.002204,0
546,3.2,5.3,0.306667,0.390000,24.1,35.6,0.501818,0.481495,1.0,-2.1,-0.083333,-11.5,0.020323,0


Game 1: Prince Albert Raiders vs Saskatoon Blades
Number of Home Team Rows: 548, Number of Away Team Rows: 553
Game 2: Lethbridge Hurricanes vs Calgary Hitmen
Number of Home Team Rows: 554, Number of Away Team Rows: 540
Game 3: Swift Current Broncos vs Regina Pats
Number of Home Team Rows: 539, Number of Away Team Rows: 529
Game 4: Kamloops Blazers vs Vancouver Giants
Number of Home Team Rows: 534, Number of Away Team Rows: 551
Game 5: Kelowna Rockets vs Penticton Vees
Number of Home Team Rows: 530, Number of Away Team Rows: 0


## Apply Classification Models


We go through 4 classification models for each team, train the model, and then store it for future use for a probablistic approach


In [90]:
def default_team_stats():
    return pd.DataFrame([
        {
            "target_goals": default_goals,
            "opponent_goals": default_goals,
            "target_PP%": default_ppp,
            "opponent_PP%": default_ppp,
            "target_SOG": default_sog,
            "opponent_SOG": default_sog,
            "target_FOW%": default_fowp,
            "opponent_FOW%": default_fowp,
            "Home/Away": 1.0,
            "goals_diff": 0,
            "ppp_diff": 0,
            "sog_diff": 0,
            "fowp_diff": 0,
            "target_win": 0
        },
        {
            "target_goals": default_goals,
            "opponent_goals": default_goals,
            "target_PP%": default_ppp,
            "opponent_PP%": default_ppp,
            "target_SOG": default_sog,
            "opponent_SOG": default_sog,
            "target_FOW%": default_fowp,
            "opponent_FOW%": default_fowp,
            "Home/Away": 1.0,
            "goals_diff": 0,
            "ppp_diff": 0,
            "sog_diff": 0,
            "fowp_diff": 0,
            "target_win": 1
        }
    ])


In [91]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score

# Going to store a dict of the best classifiers between each team
next_games_models = []

for game in next_games_dfs:
  
  # Loop through each team (home and away)
  for i, team in enumerate(game):
    if len(team) < k:
      team = default_team_stats()
      print("Using default stats for team")
    
    best_accuracy = 0
    best_classifier = 0
    best_classifier_name = ""
    
    # Features and dependencies
    X = team.iloc[:, :-1]
    y = team.iloc[:, -1]

    # Scale feature set
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Initialize models
    classifiers = {
        "Random Forest": RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0),
        "Naive Bayes": GaussianNB(),
        "Logistic Regression": LogisticRegression(random_state=0),
        "SVC": SVC(kernel='rbf', random_state=0, probability=True)
    }
    
    # WE NOT LONGER NEED TO KEEP TRACK OF ACCURACIES. WE ARE USING PROBABILITY
    # Train each classifier
    for classifier_name, classifier in classifiers.items():
      # Train the classifier for future use
      classifier.fit(X, y)
    
    
    # Append to the models list
    team_name = "Home" if i == 0 else "Away" 
    next_games_models.append({
        "Team": team_name,
        "Classifiers": classifiers, # classifiers is a dict
        "Scaler": scaler
    })

# Combine two teams into one game
combine_next_game_models = []
for i in range(0, len(next_games_models), 2):
  home = next_games_models[i]  # Home team entry
  away = next_games_models[i + 1]  # Away team entry

  combine_next_game_models.append((home, away))
  # Now, next_games_models will store all of the classifiers for each team playing a game


Using default stats for team


In [92]:
print(combine_next_game_models)

[({'Team': 'Home', 'Classifiers': {'Random Forest': RandomForestClassifier(criterion='entropy', random_state=0), 'Naive Bayes': GaussianNB(), 'Logistic Regression': LogisticRegression(random_state=0), 'SVC': SVC(probability=True, random_state=0)}, 'Scaler': StandardScaler()}, {'Team': 'Away', 'Classifiers': {'Random Forest': RandomForestClassifier(criterion='entropy', random_state=0), 'Naive Bayes': GaussianNB(), 'Logistic Regression': LogisticRegression(random_state=0), 'SVC': SVC(probability=True, random_state=0)}, 'Scaler': StandardScaler()}), ({'Team': 'Home', 'Classifiers': {'Random Forest': RandomForestClassifier(criterion='entropy', random_state=0), 'Naive Bayes': GaussianNB(), 'Logistic Regression': LogisticRegression(random_state=0), 'SVC': SVC(probability=True, random_state=0)}, 'Scaler': StandardScaler()}, {'Team': 'Away', 'Classifiers': {'Random Forest': RandomForestClassifier(criterion='entropy', random_state=0), 'Naive Bayes': GaussianNB(), 'Logistic Regression': Logistic

## Predict the Next Games Playing


In [93]:
def get_df_for_predicton(i, next_games_dfs, playing_side):
  """
  Prepares a feature set for predicting game outcomes based on team statistics.

  Parameters:
  - i (int): Index of the game in the `next_games_dfs` list.
  - next_games_dfs (list): List of DataFrames with game statistics.
  - playing_side (str): The side the target team is playing on ("Home" or "Away").

  Returns:
  - pd.DataFrame: DataFrame containing features for prediction, including goals, PP%, SOG, FOW%, and their differences.

  Function Details:
  - Extracts statistics for the target team and opponent.
  - Computes the differences in key statistics (goals, PP%, SOG, FOW%).
  - Returns a DataFrame with relevant features for prediction.
  """

  
  side = 0 if playing_side == "Home" else 1
  opponent_side = 0 if side else 1

  # Handle missing team data
  if next_games_dfs[i][side].empty:
      target_stats = {
          'target_goals': default_goals,
          'target_PP%': default_ppp,
          'target_SOG': default_sog,
          'target_FOW%': default_fowp
      }
  else:
      target_stats = next_games_dfs[i][side].iloc[-1]

  # Handle missing opponent data
  if next_games_dfs[i][opponent_side].empty:
      opponent_stats = {
          'target_goals': default_goals,
          'target_PP%': default_ppp,
          'target_SOG': default_sog,
          'target_FOW%': default_fowp
      }
  else:
      opponent_stats = next_games_dfs[i][opponent_side].iloc[-1]

  # Break down stats
  target_goals = target_stats['target_goals']
  opponent_goals = opponent_stats['target_goals']
  target_PPP = target_stats['target_PP%']
  opponent_PPP = opponent_stats['target_PP%']
  target_SOG = target_stats['target_SOG']
  opponent_SOG = opponent_stats['target_SOG']
  target_FOWP = target_stats['target_FOW%']
  opponent_FOWP = opponent_stats['target_FOW%']

  # Differences
  goals_diff = target_goals - opponent_goals
  ppp_diff = target_PPP - opponent_PPP
  sog_diff = target_SOG - opponent_SOG
  fowp_diff = target_FOWP - opponent_FOWP

  features = [[
      target_goals, opponent_goals, target_PPP, opponent_PPP,
      target_SOG, opponent_SOG, target_FOWP, opponent_FOWP,
      side, goals_diff, ppp_diff, sog_diff, fowp_diff
  ]]

  feature_names = [
      "target_goals", "opponent_goals",
      "target_PP%", "opponent_PP%",
      "target_SOG", "opponent_SOG",
      "target_FOW%", "opponent_FOW%",
      "Home/Away",
      "goals_diff", "ppp_diff", "sog_diff", "fowp_diff"
  ]

  return pd.DataFrame(features, columns=feature_names)


### Print Next Games Prediction


In [94]:
'''
This will give the confidence level of the multiple classifiers
This gives us a more comprehensive view of how likely a team is going to win based on teh confiendce of all classifiers
'''
def calculate_team_prob(index, game, classifiers, side):

  # prepares the features dataframe for prediction
  prediction_features_dataframe = get_df_for_predicton(index, next_games_dfs, game[side]["Team"])
  
  # scaled the features dataframe with the correct scaler for the team
  scaled_prediction_dataframe = game[side]['Scaler'].transform(prediction_features_dataframe)
  
  total_prob = 0
  # Goes through each classifier and sums the prediction probability of each
  for classifier in classifiers.values():
    # Get the predicted probability (probability that this team wins)
    prob = classifier.predict_proba(scaled_prediction_dataframe)[:, 1]  # assuming 1 is the 'home win' class
    # print(f'{'away' if side else 'home'} prob: {prob}')
    if not np.isnan(prob[0]) and np.isfinite(prob[0]):
      total_prob += prob[0]

  # return the averate of all probabilities
  return total_prob / len(classifiers)

In [95]:
import datetime

game_predicted = { 
    'GameID': [],  # Going to append with actual GameIDs
    'GameDate': [],  # Going to append with actual dates
    'Home Team': [],  # Going to append with actual teams
    'Away Team': [],  # Going to append with actual teams
    'Prediction': [],  # Going to append with actual predictions
}
for i, game in enumerate(combine_next_game_models):

  # this uses the function right above this code cell
  # This calculates the average probabilities that the home team will win across all classifiers
  # game[0] and 0 because home is in the first index
  home_avg_prob = calculate_team_prob(i, game, game[0]['Classifiers'], 0)
  
  # This calculates the average probabilities that the away team will win across all classifiers
  # game[1] and 1 because away is in the second index
  away_avg_prob = calculate_team_prob(i, game, game[1]['Classifiers'], 1)

  # The reason we do the home and away because it is not guaranteed that the probabilities will add to 1
  # The training for each classifier is slightly biased for the team it is training on

  print(f'{next_games[i][0]} (home) total prob: {home_avg_prob}, {next_games[i][1]} (away) total prob: {away_avg_prob}')
  # Make the prediction based on average probabilities
  # We will favor the home team if the probabilities are a tie (this is very unlikely)
  if home_avg_prob >= away_avg_prob:
    # winner is home (first index)
    winner = 0
    winner_accuracy = home_avg_prob
  else:
    # winner is away (second index)
    winner = 1
    winner_accuracy = away_avg_prob

  # print results based on the average probabilities
  print(f'{next_games[i][winner]} is predicted to win their next game against the {next_games[i][0 if winner else 1]} on {next_games_response[i]['GameDate']} with an accuracy of {winner_accuracy:.3f}')
  game_predicted['Prediction'].append(next_games[i][winner])

  print()


  # This is for the excel file
  game_predicted['GameDate'].append(next_games_response[i]['GameDate'] + ", " + str(datetime.datetime.now().year))
  game_predicted['GameID'].append(next_games_response[i]['ID'])
  game_predicted['Home Team'].append(next_games_response[i]['HomeLongName'])
  game_predicted['Away Team'].append(next_games_response[i]['VisitorLongName'])
  
  
  

Prince Albert Raiders (home) total prob: 0.36959847218997127, Saskatoon Blades (away) total prob: 0.6491849587308031
Saskatoon Blades is predicted to win their next game against the Prince Albert Raiders on Wed, Sep 3 with an accuracy of 0.649

Lethbridge Hurricanes (home) total prob: 0.3737940000828037, Calgary Hitmen (away) total prob: 0.5452327801255786
Calgary Hitmen is predicted to win their next game against the Lethbridge Hurricanes on Wed, Sep 3 with an accuracy of 0.545

Swift Current Broncos (home) total prob: 0.5766798877910468, Regina Pats (away) total prob: 0.24312833262124112
Swift Current Broncos is predicted to win their next game against the Regina Pats on Wed, Sep 3 with an accuracy of 0.577

Kamloops Blazers (home) total prob: 0.3430415022991903, Vancouver Giants (away) total prob: 0.4828163392242264
Vancouver Giants is predicted to win their next game against the Kamloops Blazers on Wed, Sep 3 with an accuracy of 0.483

Kelowna Rockets (home) total prob: 0.135028108

  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)


In [96]:
print(game_predicted)

{'GameID': ['1022074', '1022071', '1022075', '1022072', '1022073'], 'GameDate': ['Wed, Sep 3, 2025', 'Wed, Sep 3, 2025', 'Wed, Sep 3, 2025', 'Wed, Sep 3, 2025', 'Wed, Sep 3, 2025'], 'Home Team': ['Prince Albert Raiders', 'Lethbridge Hurricanes', 'Swift Current Broncos', 'Kamloops Blazers', 'Kelowna Rockets'], 'Away Team': ['Saskatoon Blades', 'Calgary Hitmen', 'Regina Pats', 'Vancouver Giants', 'Penticton Vees'], 'Prediction': ['Saskatoon Blades', 'Calgary Hitmen', 'Swift Current Broncos', 'Vancouver Giants', 'Penticton Vees']}


## Write Predictions to file


In [97]:
import pandas as pd
from openpyxl import load_workbook

# Define the file path
file_path = 'prediction_record.xlsx'

# New data to append (replace this with your actual DataFrame)
new_results = pd.DataFrame(game_predicted)

# Select only the relevant columns (GameID, GameDate, Home Team, Away Team, Prediction)
new_results = new_results[['GameID', 'GameDate', 'Home Team', 'Away Team', 'Prediction']]

# Load the existing Excel file to check for existing GameIDs
existing_data = pd.read_excel(file_path, engine='openpyxl')

# Ensure 'GameID' columns are of the same type for comparison (convert to string)
existing_game_ids = existing_data['GameID'].astype(int).values  # Ensure existing GameIDs are strings
new_results['GameID'] = new_results['GameID'].astype(int)  # Convert new GameIDs to strings

# Filter new_results to only include rows with GameID that does not already exist in the file
new_results_filtered = new_results[~new_results['GameID'].isin(existing_game_ids)]

# Check if there are rows to append
if not new_results_filtered.empty:
  # Load the existing workbook
  book = load_workbook(file_path)

  # Open the file in append mode using ExcelWriter
  with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
    # Write the new results (without overwriting existing data) to columns A:G
    new_results_filtered.to_excel(writer, index=False, header=False, startrow=book.active.max_row, sheet_name='Sheet1')

  print(f"Appended {len(new_results_filtered)} rows to the file.")
  update_cells_start = len(existing_data)
else:
  print("No new GameIDs to append.")
  update_cells_start = -1


Appended 5 rows to the file.


## Update With Actual Results


In [98]:
# Define the file path
file_path = 'prediction_record.xlsx'

# Read the existing Excel file into a Pandas DataFrame
df = pd.read_excel(file_path, engine='openpyxl', usecols="A:F")

# Assume 'dataset' contains your actual game data with 'HomeGoals' and 'AwayGoals'
# dataset = pd.DataFrame(...)  # Load your dataset here with actual goal values

# We wont update any cells if the 
if update_cells_start == -1:
  update_cells_start = len(df)


# Loop through the rows and update the 'Actual' column where it's empty
for index, row in df.iterrows():
  if pd.isna(row['Actual']):  # Check if 'Actual' is empty
    game_id = row['GameID']
    
    # Get the corresponding data for the game from 'dataset'
    game_data = dataset[dataset['Game_ID'] == game_id]
    
    if not game_data.empty:
      home_goals = game_data['Home_Goals'].iloc[0]
      away_goals = game_data['Away_Goals'].iloc[0]
      
      # Determine which team has the higher score
      if home_goals > away_goals:
        actual_team = row['Home Team']
      elif away_goals > home_goals:
        actual_team = row['Away Team']
  
      
      print(actual_team)
  
      # Update only the 'Actual' column
      df.at[index, 'Actual'] = actual_team


Saskatoon Blades
Medicine Hat Tigers
Everett Silvertips


In [99]:
from openpyxl import load_workbook

# Load the workbook using openpyxl
wb = load_workbook(file_path)

# Select the sheet where you want to write the data
sheet = wb['Sheet1']

# Start row (assuming the first row is header)
start_row = 2  # Data starts at row 2 (after the header)
# Start at where the predictions start
# df_subset = df.iloc[update_cells_start:]

# Update the cells in columns A:G
for index, row in df.iterrows():
  # We are writing columns A:G, so we update the appropriate cells
  sheet[f'A{start_row + index}'] = row['GameID']
  sheet[f'B{start_row + index}'] = row['GameDate']
  sheet[f'C{start_row + index}'] = row['Home Team']
  sheet[f'D{start_row + index}'] = row['Away Team']
  sheet[f'E{start_row + index}'] = row['Prediction']
  sheet[f'F{start_row + index}'] = row['Actual']
  if pd.notna(row['Actual']):
    sheet[f'G{start_row + index}'] = 1 if row['Prediction'] == row['Actual'] else 0

    

# Save the workbook with the updated values
wb.save(file_path)

In [None]:
csv_path = 'prediction_record.csv'
df.to_csv(csv_path, index=False)
print(f"CSV saved as {csv_path} with the same data as Excel.")


CSV saved as prediction_record.csv with the same data as Excel.
CSV saved as prediction_record.csv with the same data as Excel.
