## Import/Install Needed Libraries

In [69]:
try:
    import pandas as pd
except ImportError:
    !pip install pandas
    import pandas as pd

try:
    import numpy as np
except ImportError:
    !pip install numpy
    import numpy as np

try:
    import requests
except ImportError:
    !pip install requests
    import requests

from collections import deque

try:
    import sklearn
except ImportError:
    !pip install scikit-learn
    import sklearn

## Declare K

In [72]:
# 5 is recommended
k = 15

## Links and GET functions

In [75]:
key = '41b145a848f4bd67' # This is a public key
def game_id_url_func(num_of_ahead_games, num_of_past_games, current_team_id=''):
  return f'https://lscluster.hockeytech.com/feed/?feed=modulekit&key={key}&view=scorebar&client_code=whl&numberofdaysahead={num_of_ahead_games}&numberofdaysback={num_of_past_games}&season_id=&team_id={current_team_id}&lang_code=en&fmt=json'

def game_stats_url_func(game_id):
  return f'https://lscluster.hockeytech.com/feed/?feed=gc&key={key}&game_id={game_id}&client_code=whl&tab=clock&lang_code=en&fmt=json'

def get_game_ids(url):
  headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br"  # Request gzip compression
  }

  try:
      response = requests.get(url, headers=headers)

      # Check if the request was successful
      if response.status_code == 200:

          return response.json()['SiteKit']['Scorebar']
          #pprint(response)
      else:
          print(f"Failed to retrieve the page. Status code: {response.status_code}")
  except Exception as e:
      print(f"An error occurred: {e}")


'''
Expects a game_id, which is used to access the Hockey Tech API to get the stats of a game, returned in a JSON format
'''
def get_game_stats(game_id):
  game_stats_url = game_stats_url_func(game_id)
  headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br"  # Request gzip compression
  }
  
  try:
      response = requests.get(game_stats_url, headers=headers)
  
      # Check if the request was successful
      if response.status_code == 200:
  
          return response.json()
          pprint(response)
      else:
          print(f"Failed to retrieve the page. Status code: {response.status_code}")
  except Exception as e:
      print(f"An error occurred: {e}")

## Import the Dataset

In [78]:
# Read the CSV. ALso, dont create an index column
dataset = pd.read_csv('All_teams_WHL_stats.csv', index_col=0)
team_id = {'Spokane Chiefs': '215', 'Seattle Thunderbirds': '214', 'Portland Winterhawks': '208', 'Everett Silvertips': '226', 'Tri-City Americans': '217', 'Kamloops Blazers': '203', 'Kelowna Rockets': '204', 'Prince George Cougars': '210', 'Brandon Wheat Kings': '201', 'Swift Current Broncos': '216', 'Vancouver Giants': '223', 'Victoria Royals': '227', 'Medicine Hat Tigers': '206', 'Edmonton Oil Kings': '228', 'Moose Jaw Warriors': '207', 'Regina Pats': '212', 'Saskatoon Blades': '213', 'Prince Albert Raiders': '209', 'Calgary Hitmen': '202', 'Lethbridge Hurricanes': '205', 'Red Deer Rebels': '211', 'Wenatchee Wild': '222'}

In [80]:
dataset = dataset.sort_values(by="Game_ID")
dataset.reset_index(drop=True, inplace=True)
display(dataset)

Unnamed: 0,Game_ID,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
0,1014692,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
1,1014699,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
2,1014708,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
3,1014720,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
4,1014735,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...
6017,1022007,Swift Current Broncos,Medicine Hat Tigers,5,4,0.000000,1.000000,26,36,0.616667,0.383333
6018,1022008,Swift Current Broncos,Medicine Hat Tigers,3,4,0.000000,1.000000,28,48,0.610169,0.389831
6019,1022009,Medicine Hat Tigers,Swift Current Broncos,3,2,0.000000,1.000000,39,27,0.564516,0.435484
6020,1022012,Saskatoon Blades,Calgary Hitmen,1,4,0.000000,1.000000,21,30,0.491525,0.508475


## Update the WHL team stats dataset with recent games

In [151]:
# Get the last 10 days worth of games. Do this for reducency, incase we missed a day of running the script
update_games = get_game_ids(game_id_url_func(0, 10))

print([game['ID'] for game in update_games])
# We get a lot of games because the API calls num_of_game_to_update back, and multiple games are played on the same day

['1021928', '1021935', '1021932', '1021931', '1021934', '1021933', '1021930', '1021936', '1021929', '1021942', '1021946', '1021944', '1021938', '1021941', '1021940', '1021939', '1021943', '1021937', '1021945', '1021948', '1021949', '1021947', '1021891', '1021963', '1022005', '1021990', '1021965', '1021976', '1021997', '1021961', '1021974', '1021966', '1022006', '1021991', '1021977', '1021975', '1021962', '1021964', '1021998', '1022002', '1021992', '1022012', '1022007', '1021980', '1021985', '1021969', '1022003', '1021993', '1022008', '1022013', '1021981', '1021986', '1021978', '1021979', '1021967', '1022009', '1021994', '1021982', '1021987', '1021970']


In [153]:
recorded_ids = set(dataset['Game_ID'])

# If there is a game that we havent seen, set this to true
# This will write the updated stats to the WHL dataset
update_dataset_condidion = False

if update_games:
  for game in update_games:
    # Because we got more games than we needed, we ignore the games we already have
    if int(game['ID']) in recorded_ids:
      continue

    # Need to update the WHL stats dataset
    update_dataset_condidion = True  
    
    # Get the game stats for each game
    stats = get_game_stats(game["ID"])
  
    print(stats)
    # Home and visitor team names
    visitor = stats['GC']['Clock']['visiting_team']['name']
    home = stats['GC']['Clock']['home_team']['name']
  
    # Number of goals
    home_goals = int(stats['GC']['Clock']['home_goal_count'])
    visitor_goals = int(stats['GC']['Clock']['visiting_goal_count'])
  
    # Calculate Power Play %
    home_ppp_total = float(stats['GC']['Clock']['power_play']['total']['home'])
    visitor_ppp_total = float(stats['GC']['Clock']['power_play']['total']['visiting'])
    # Avoid division by zero for home_ppp
    home_pp_goals = stats['GC']['Clock']['power_play']['goals']['home']
    home_pp_goals = float(home_pp_goals) if home_pp_goals is not None else 0.0
    
    if home_ppp_total != 0:
        home_ppp = home_pp_goals / home_ppp_total
    else:
        home_ppp = 0  # Default to 0 if no power plays

    visitor_pp_goals = float(stats['GC']['Clock']['power_play']['total']['visiting'])
    visitor_pp_goals = float(visitor_pp_goals) if visitor_pp_goals is not None else 0.0
    # Avoid division by zero for visitor_ppp
    if visitor_ppp_total != 0:
        visitor_ppp = visitor_pp_goals / visitor_ppp_total
    else:
        visitor_ppp = 0  # Default to 0 if no power plays
  
    # Calculate Faceoff Win %
    home_fowp = float(stats['GC']['Clock']['fow']['home'])
    visitor_fowp = float(stats['GC']['Clock']['fow']['visiting'])
    fow_total = home_fowp + visitor_fowp
    if fow_total != 0:
        home_fowp /= fow_total
        visitor_fowp /= fow_total
    else:
        home_fowp, visitor_fowp = .5, .5
  
    # Shots on goal
    home_sog = sum(stats['GC']['Clock']['shots_on_goal']['home'].values())
    visitor_sog = sum(stats['GC']['Clock']['shots_on_goal']['visiting'].values())
    
    # Add data to the dataset
    dataset.loc[len(dataset)] = {
        "Game_ID": int(game['ID']),
        "Home_Name": home,
        "Away_Name": visitor,
        "Home_Goals": home_goals,
        "Away_Goals": visitor_goals,
        "Home_PP%": home_ppp,
        "Away_PP%": visitor_ppp,
        "Home_SOG": home_sog,
        "Away_SOG": visitor_sog,
        "Home_FOW%": home_fowp,
        "Away_FOW%": visitor_fowp
    }

### Update CSV

In [145]:
# Update the CSV for future use
# Sort by game_id
dataset = dataset.sort_values(by="Game_ID")
# Remove duplicate games, based on game_id
dataset = dataset.drop_duplicates(subset="Game_ID", keep="first")
dataset.reset_index(drop=True, inplace=True)
display(dataset)

Unnamed: 0,Game_ID,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
0,1014692,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
1,1014699,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
2,1014708,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
3,1014720,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
4,1014735,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...
6033,1022007,Swift Current Broncos,Medicine Hat Tigers,5,4,0.000000,1.000000,26,36,0.616667,0.383333
6034,1022008,Swift Current Broncos,Medicine Hat Tigers,3,4,0.000000,1.000000,28,48,0.610169,0.389831
6035,1022009,Medicine Hat Tigers,Swift Current Broncos,3,2,0.000000,1.000000,39,27,0.564516,0.435484
6036,1022012,Saskatoon Blades,Calgary Hitmen,1,4,0.000000,1.000000,21,30,0.491525,0.508475


In [147]:
if update_dataset_condidion:
  dataset.to_csv('All_teams_WHL_stats.csv')

## Get Next Games

In [19]:
# We want to do a GET request getting the next games played
# Store this information within an [] of (Home Team, Visiting Team)

# Then get a DF and train the model on both teams playing

# This is the number of days ahead to predict games for
# 1 is recommended because anything past that then model is missing previous games played
num_of_future_days_predict = 1
next_games_response = get_game_ids(game_id_url_func(num_of_future_days_predict, 0))
next_games = []
for game in next_games_response:
  next_games.append((game['HomeLongName'], game["VisitorLongName"]))

print(next_games)

[('Everett Silvertips', 'Seattle Thunderbirds'), ('Spokane Chiefs', 'Vancouver Giants')]


## Create features and dependencies

In [21]:
# Get default values for averages of goals, PP%, SOG, and FOW%
default_goals = (dataset['Home_Goals'].mean() + dataset['Away_Goals'].mean()) // 2
default_ppp = (dataset['Home_PP%'].mean() + dataset['Away_PP%'].mean()) / 2
default_sog = (dataset['Home_SOG'].mean() + dataset['Away_SOG'].mean()) // 2
default_fowp = (dataset['Home_FOW%'].mean() + dataset['Away_FOW%'].mean()) / 2

print('Default Values used:')
print(default_goals, default_ppp, default_sog, default_fowp)

Default Values used:
3.0 0.21860324073874388 31.0 0.5


In [22]:
def get_opponent_stats(dataset, queue, opponent_name, row):
  """
  Calculates average statistics for a given opponent based on their last 'k' games prior to a specified game.
  
  Parameters:
  - dataset (pd.DataFrame): The dataset containing game records with statistics for each game.
  - queue: (Unused in this function but included as a parameter for compatibility).
  - opponent_name (str): The name of the opponent team whose stats are being calculated.
  - row (pd.Series): The current row of the dataset, representing the game for which the opponent's stats are calculated.

  Returns:
  - opponent_goals (float): Average goals scored by the opponent over the last 'k' games.
  - opponent_ppp (float): Average power-play percentage of the opponent over the last 'k' games.
  - opponent_fowp (float): Average face-off win percentage of the opponent over the last 'k' games.
  - opponent_sog (float): Average shots on goal by the opponent over the last 'k' games.

  Notes:
  - 'k' is assumed to be a predefined global variable representing the number of games to consider.
  - Default values (`default_goals`, `default_ppp`, `default_fowp`, `default_sog`) must also be predefined globally.
  - Uses `.where()` to select statistics based on whether the opponent is playing as home or away in each game.
  """
  
  # Get the last k games averages
  # This creates a new dataset of only opponent games, and which are before the current game
  opponent_games = dataset[
    ((dataset['Home_Name'] == opponent_name) | (dataset['Away_Name'] == opponent_name)) & 
    (dataset['Game_ID'] < row['Game_ID'])
  ].tail(k)

  # If the opponent hasnt played k games yet, use default
  if len(opponent_games) < k:
    opponent_goals = default_goals
    opponent_ppp = default_ppp
    opponent_fowp = default_fowp
    opponent_sog = default_sog
  else:
    # Get the mean of the entire column that the opponent played
    opponent_goals = opponent_games['Home_Goals'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_Goals']).mean()
    opponent_ppp = opponent_games['Home_PP%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_PP%']).mean()
    opponent_fowp = opponent_games['Home_FOW%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_FOW%']).mean()
    opponent_sog = opponent_games['Home_SOG'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_SOG']).mean()

  return opponent_goals, opponent_ppp, opponent_fowp, opponent_sog

### Get Avg Stats for each team

In [24]:
def generate_team_stats(dataset, team_a, k):
  """
  This function generates a DataFrame with stats comparing a target team (team_a) against its opponent (opponent_team)
  over the last 'k' games. It calculates average stats and differences between the target team and the opponent.

  Parameters:
  - dataset (DataFrame): The dataset containing the game statistics.
  - team_a (str): The name of the target team.
  - k (int): The number of recent games to consider for averaging stats.

  Returns:
  - DataFrame: A DataFrame containing the calculated statistics and differences between the target team and its opponent.
  """

  # Initialize variables and the deque to store the last k games
  target_queue = deque(maxlen=k)
  target_df = pd.DataFrame(columns = [
    "target_goals", 
    "opponent_goals", 
    "target_PP%", 
    "opponent_PP%", 
    "target_SOG", 
    "opponent_SOG",
    "target_FOW%", 
    "opponent_FOW%", 
    "Home/Away",
    "goals_diff",
    "ppp_diff",
    "sog_diff",
    "fowp_diff",
    "target_win"
  ])

  target_goals = 0
  target_ppp = 0
  target_fowp = 0
  target_sog = 0

  # Iterate through the dataset to track the last 'k' games
  for index, row in dataset.iterrows():
    if row['Home_Name'] == team_a or row['Away_Name'] == team_a:

      side = 'Home' if row['Home_Name'] == team_a else 'Away'
      opponent_name = row['Away_Name'] if side == 'Home' else row['Home_Name']
      
      # Update statistics for the target team
      target_goals += row[side + '_Goals']
      target_ppp += row[side + '_PP%']
      target_fowp += row[side + '_FOW%']
      target_sog += row[side + '_SOG']
      
      # If the queue is not full, add the current game to the queue
      # k-1 because we can continue of the queue is almost full
      if len(target_queue) < k - 1:
        target_queue.append(row.to_dict())
      else:
        # Calculate the opponent's stats
        # This function is the code cell above this
        opponent_goals, opponent_ppp, opponent_fowp, opponent_sog = get_opponent_stats(dataset, target_queue, opponent_name, row)
        
        # Calculate average stats for the target team. /k find the aggregate avg of the sum of stats
        avg_target_goals = target_goals / k
        avg_target_ppp = target_ppp / k
        avg_target_fowp = target_fowp / k
        avg_target_sog = target_sog / k

        # Calculate differences between the target team and the opponent
        goals_diff = avg_target_goals - opponent_goals
        ppp_diff = avg_target_ppp - opponent_ppp
        sog_diff = avg_target_sog - opponent_sog
        fowp_diff = avg_target_fowp - opponent_fowp

        target_win = 1 if row[side + '_Goals'] > row[('Away' if side == 'Home' else 'Home') + '_Goals'] else 0

        # 1 is Home, 0 is Away
        target_df.loc[len(target_df)] = {
            "target_goals": avg_target_goals,
            "opponent_goals": opponent_goals,
            "target_PP%": avg_target_ppp,
            "opponent_PP%": opponent_ppp,
            "target_SOG": avg_target_sog,
            "opponent_SOG": opponent_sog,
            "target_FOW%": avg_target_fowp,
            "opponent_FOW%": opponent_fowp,
            "Home/Away": 1.0 if side == 'Home' else 0.0,
            "goals_diff": goals_diff,
            "ppp_diff": ppp_diff,
            "sog_diff": sog_diff,
            "fowp_diff": fowp_diff,
            "target_win": target_win
        }

        # Remove the oldest game from the queue and update stats
        popped = target_queue.popleft()
        popped_side = 'Home' if popped['Home_Name'] == team_a else 'Away'
        target_goals -= popped[popped_side + '_Goals']
        target_ppp -= popped[popped_side + '_PP%']
        target_fowp -= popped[popped_side + '_FOW%']
        target_sog -= popped[popped_side + '_SOG']

        # Add the current game to the queue
        target_queue.append(row.to_dict())

  return target_df

### Generate DataFrames For Each Team Playing in Next Game

In [26]:
# loop through the next games, and get the dataframe for each team.
# Store it in a [] of (DF Home, DF, Away)

# This is an array that stores the stats DataFrame for each team playing
next_games_dfs = []
for game in next_games:
  home_team = game[0]
  away_team = game[1]
  home_team_stats_df = generate_team_stats(dataset, home_team, k)
  away_team_stats_df = generate_team_stats(dataset, away_team, k)
  
  next_games_dfs.append((home_team_stats_df, away_team_stats_df))


In [27]:
# Just display the first game to make sure
if len(next_games_dfs) > 0:
  display(next_games_dfs[0][0])

Unnamed: 0,target_goals,opponent_goals,target_PP%,opponent_PP%,target_SOG,opponent_SOG,target_FOW%,opponent_FOW%,Home/Away,goals_diff,ppp_diff,sog_diff,fowp_diff,target_win
0,4.266667,4.000000,0.340370,0.204444,41.400000,37.866667,0.502832,0.523732,1.0,0.266667,0.135926,3.533333,-0.020900,0
1,4.533333,3.133333,0.329259,0.140000,42.133333,31.133333,0.505304,0.467841,1.0,1.400000,0.189259,11.000000,0.037463,1
2,4.466667,3.733333,0.302593,0.164444,42.066667,34.933333,0.505827,0.511171,0.0,0.733333,0.138148,7.133333,-0.005343,1
3,4.466667,4.066667,0.299259,0.206111,41.266667,34.200000,0.501489,0.524485,0.0,0.400000,0.093148,7.066667,-0.022996,1
4,4.266667,3.933333,0.272593,0.250556,41.600000,34.666667,0.505268,0.519338,1.0,0.333333,0.022037,6.933333,-0.014070,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,3.200000,3.400000,0.272222,0.252857,39.000000,35.533333,0.478184,0.455059,1.0,-0.200000,0.019365,3.466667,0.023125,1
532,3.000000,4.666667,0.272222,0.306667,37.733333,38.533333,0.475009,0.567959,0.0,-1.666667,-0.034444,-0.800000,-0.092950,1
533,3.066667,2.800000,0.272222,0.270000,38.600000,29.466667,0.478581,0.472748,1.0,0.266667,0.002222,9.133333,0.005832,1
534,3.133333,2.600000,0.338889,0.292222,38.466667,29.533333,0.485630,0.478271,0.0,0.533333,0.046667,8.933333,0.007358,1


## Apply Classification Models

We go through 4 classification models for each team, train the model, and then store it for future use for a probablistic approach 

In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score

# Going to store a dict of the best classifiers between each team
next_games_models = []

for game in next_games_dfs:
  
  # Loop through each team (home and away)
  for i, team in enumerate(game):
    best_accuracy = 0
    best_classifier = 0
    best_classifier_name = ""
    
    # Features and dependencies
    X = team.iloc[:, :-1]
    y = team.iloc[:, -1]

    
    # Scale feature set
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Initialize models
    classifiers = {
        "Random Forest": RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0),
        "Naive Bayes": GaussianNB(),
        "Logistic Regression": LogisticRegression(random_state=0),
        "SVC": SVC(kernel='rbf', random_state=0, probability=True)
    }
    
    # WE NOT LONGER NEED TO KEEP TRACK OF ACCURACIES. WE ARE USING PROBABILITY
    # Train each classifier
    for classifier_name, classifier in classifiers.items():
      # Train the classifier for future use
      classifier.fit(X, y)
    
    
    # Append to the models list
    team_name = "Home" if i == 0 else "Away" 
    next_games_models.append({
        "Team": team_name,
        "Classifiers": classifiers, # classifiers is a dict
        "Scaler": scaler
    })

# Combine two teams into one game
combine_next_game_models = []
for i in range(0, len(next_games_models), 2):
  home = next_games_models[i]  # Home team entry
  away = next_games_models[i + 1]  # Away team entry

  combine_next_game_models.append((home, away))
  # Now, next_games_models will store all of the classifiers for each team playing a game


In [31]:
print(combine_next_game_models)

[({'Team': 'Home', 'Classifiers': {'Random Forest': RandomForestClassifier(criterion='entropy', random_state=0), 'Naive Bayes': GaussianNB(), 'Logistic Regression': LogisticRegression(random_state=0), 'SVC': SVC(probability=True, random_state=0)}, 'Scaler': StandardScaler()}, {'Team': 'Away', 'Classifiers': {'Random Forest': RandomForestClassifier(criterion='entropy', random_state=0), 'Naive Bayes': GaussianNB(), 'Logistic Regression': LogisticRegression(random_state=0), 'SVC': SVC(probability=True, random_state=0)}, 'Scaler': StandardScaler()}), ({'Team': 'Home', 'Classifiers': {'Random Forest': RandomForestClassifier(criterion='entropy', random_state=0), 'Naive Bayes': GaussianNB(), 'Logistic Regression': LogisticRegression(random_state=0), 'SVC': SVC(probability=True, random_state=0)}, 'Scaler': StandardScaler()}, {'Team': 'Away', 'Classifiers': {'Random Forest': RandomForestClassifier(criterion='entropy', random_state=0), 'Naive Bayes': GaussianNB(), 'Logistic Regression': Logistic

## Predict the Next Games Playing

In [33]:
def get_df_for_predicton(i, next_games_dfs, playing_side):
  """
  Prepares a feature set for predicting game outcomes based on team statistics.

  Parameters:
  - i (int): Index of the game in the `next_games_dfs` list.
  - next_games_dfs (list): List of DataFrames with game statistics.
  - playing_side (str): The side the target team is playing on ("Home" or "Away").

  Returns:
  - pd.DataFrame: DataFrame containing features for prediction, including goals, PP%, SOG, FOW%, and their differences.

  Function Details:
  - Extracts statistics for the target team and opponent.
  - Computes the differences in key statistics (goals, PP%, SOG, FOW%).
  - Returns a DataFrame with relevant features for prediction.
  """
  
  side = 0 if playing_side == "Home" else 1
  opponent_side = 0 if side else 1
  target_stats = next_games_dfs[i][side].iloc[-1]
  
  # Break down the stats into different variables
  opponent_stats = next_games_dfs[i][opponent_side].iloc[-1]
  target_goals = target_stats['target_goals']
  opponent_goals = opponent_stats['target_goals']
  target_PPP = target_stats['target_PP%']
  opponent_PPP = opponent_stats['target_PP%']
  target_SOG = target_stats['target_SOG']
  opponent_SOG = opponent_stats['target_SOG']
  target_FOWP = target_stats['target_FOW%']
  opponent_FOWP = opponent_stats['target_FOW%']
  target_side = opponent_side # This is reverse because I ordered it weirdly
  goals_diff = target_goals - opponent_goals
  ppp_diff = target_PPP - opponent_PPP
  sog_diff = target_SOG - opponent_SOG
  fowp_diff = target_FOWP - opponent_FOWP

  features = [
    [
      target_goals, opponent_goals, target_PPP, opponent_PPP,
      target_SOG, opponent_SOG, target_FOWP, opponent_FOWP,
      side, goals_diff, ppp_diff, sog_diff, fowp_diff
      ]
  ]

  feature_names = ["target_goals", 
      "opponent_goals", 
      "target_PP%", 
      "opponent_PP%", 
      "target_SOG", 
      "opponent_SOG",
      "target_FOW%",
      "opponent_FOW%",
      "Home/Away",
      "goals_diff",
      "ppp_diff",
      "sog_diff",
      "fowp_diff"]


  # Convert features to a DataFrame with the correct column names
  return pd.DataFrame(features, columns=feature_names)
  

### Print Next Games Prediction

In [35]:
'''
This will give the confidence level of the multiple classifiers
This gives us a more comprehensive view of how likely a team is going to win based on teh confiendce of all classifiers
'''
def calculate_team_prob(index, game, classifiers, side):

  # prepares the features dataframe for prediction
  prediction_features_dataframe = get_df_for_predicton(index, next_games_dfs, game[side]["Team"])

  # scaled the features dataframe with the correct scaler for the team
  scaled_prediction_dataframe = game[side]['Scaler'].transform(prediction_features_dataframe)

  total_prob = 0
  # Goes through each classifier and sums the prediction probability of each
  for classifier in classifiers.values():
    # Get the predicted probability (probability that this team wins)
    prob = classifier.predict_proba(scaled_prediction_dataframe)[:, 1]  # assuming 1 is the 'home win' class
    # print(f'{'away' if side else 'home'} prob: {prob}')
    total_prob += prob[0]

  # return the averate of all probabilities
  return total_prob / len(classifiers)

In [36]:
import datetime

game_predicted = { 
    'GameID': [],  # Going to append with actual GameIDs
    'GameDate': [],  # Going to append with actual dates
    'Home Team': [],  # Going to append with actual teams
    'Away Team': [],  # Going to append with actual teams
    'Prediction': [],  # Going to append with actual predictions
}
for i, game in enumerate(combine_next_game_models):

  # this uses the function right above this code cell
  # This calculates the average probabilities that the home team will win across all classifiers
  # game[0] and 0 because home is in the first index
  home_avg_prob = calculate_team_prob(i, game, game[0]['Classifiers'], 0)
  
  # This calculates the average probabilities that the away team will win across all classifiers
  # game[1] and 1 because away is in the second index
  away_avg_prob = calculate_team_prob(i, game, game[1]['Classifiers'], 1)

  # The reason we do the home and away because it is not guaranteed that the probabilities will add to 1
  # The training for each classifier is slightly biased for the team it is training on

  print(f'{next_games[i][0]} (home) total prob: {home_avg_prob}, {next_games[i][1]} (away) total prob: {away_avg_prob}')
  # Make the prediction based on average probabilities
  # We will favor the home team if the probabilities are a tie (this is very unlikely)
  if home_avg_prob >= away_avg_prob:
    # winner is home (first index)
    winner = 0
    winner_accuracy = home_avg_prob
  else:
    # winner is away (second index)
    winner = 1
    winner_accuracy = away_avg_prob

  # print results based on the average probabilities
  print(f'{next_games[i][winner]} is predicted to win their next game against the {next_games[i][0 if winner else 1]} on {next_games_response[i]['GameDate']} with an accuracy of {winner_accuracy:.3f}')
  game_predicted['Prediction'].append(next_games[i][winner])

  print()


  # This is for the excel file
  game_predicted['GameDate'].append(next_games_response[i]['GameDate'] + ", " + str(datetime.datetime.now().year))
  game_predicted['GameID'].append(next_games_response[i]['ID'])
  game_predicted['Home Team'].append(next_games_response[i]['HomeLongName'])
  game_predicted['Away Team'].append(next_games_response[i]['VisitorLongName'])
  
  
  

Everett Silvertips (home) total prob: 0.47960301650265136, Seattle Thunderbirds (away) total prob: 0.5484440154190973
Seattle Thunderbirds is predicted to win their next game against the Everett Silvertips on Sat, Apr 5 with an accuracy of 0.548

Spokane Chiefs (home) total prob: 0.8430611367477777, Vancouver Giants (away) total prob: 0.2048403603832083
Spokane Chiefs is predicted to win their next game against the Vancouver Giants on Sat, Apr 5 with an accuracy of 0.843



In [37]:
print(game_predicted)

{'GameID': ['1021971', '1021999'], 'GameDate': ['Sat, Apr 5, 2025', 'Sat, Apr 5, 2025'], 'Home Team': ['Everett Silvertips', 'Spokane Chiefs'], 'Away Team': ['Seattle Thunderbirds', 'Vancouver Giants'], 'Prediction': ['Seattle Thunderbirds', 'Spokane Chiefs']}


## Write Predictions to file

In [39]:
import pandas as pd
from openpyxl import load_workbook

# Define the file path
file_path = 'prediction_record.xlsx'

# New data to append (replace this with your actual DataFrame)
new_results = pd.DataFrame(game_predicted)

# Select only the relevant columns (GameID, GameDate, Home Team, Away Team, Prediction)
new_results = new_results[['GameID', 'GameDate', 'Home Team', 'Away Team', 'Prediction']]

# Load the existing Excel file to check for existing GameIDs
existing_data = pd.read_excel(file_path, engine='openpyxl')

# Ensure 'GameID' columns are of the same type for comparison (convert to string)
existing_game_ids = existing_data['GameID'].astype(int).values  # Ensure existing GameIDs are strings
new_results['GameID'] = new_results['GameID'].astype(int)  # Convert new GameIDs to strings

# Filter new_results to only include rows with GameID that does not already exist in the file
new_results_filtered = new_results[~new_results['GameID'].isin(existing_game_ids)]

# Check if there are rows to append
if not new_results_filtered.empty:
  # Load the existing workbook
  book = load_workbook(file_path)

  # Open the file in append mode using ExcelWriter
  with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
    # Write the new results (without overwriting existing data) to columns A:G
    new_results_filtered.to_excel(writer, index=False, header=False, startrow=book.active.max_row, sheet_name='Sheet1')

  print(f"Appended {len(new_results_filtered)} rows to the file.")
  update_cells_start = len(existing_data)
else:
  print("No new GameIDs to append.")
  update_cells_start = -1


No new GameIDs to append.


## Update With Actual Results

In [41]:
# Define the file path
file_path = 'prediction_record.xlsx'

# Read the existing Excel file into a Pandas DataFrame
df = pd.read_excel(file_path, engine='openpyxl', usecols="A:F")

# Assume 'dataset' contains your actual game data with 'HomeGoals' and 'AwayGoals'
# dataset = pd.DataFrame(...)  # Load your dataset here with actual goal values

# We wont update any cells if the 
if update_cells_start == -1:
  update_cells_start = len(df)


# Loop through the rows and update the 'Actual' column where it's empty
for index, row in df.iterrows():
  if pd.isna(row['Actual']):  # Check if 'Actual' is empty
    game_id = row['GameID']
    
    # Get the corresponding data for the game from 'dataset'
    game_data = dataset[dataset['Game_ID'] == game_id]
    
    if not game_data.empty:
      home_goals = game_data['Home_Goals'].iloc[0]
      away_goals = game_data['Away_Goals'].iloc[0]
      
      # Determine which team has the higher score
      if home_goals > away_goals:
        actual_team = row['Home Team']
      elif away_goals > home_goals:
        actual_team = row['Away Team']
  
      
      print(actual_team)
  
      # Update only the 'Actual' column
      df.at[index, 'Actual'] = actual_team


Medicine Hat Tigers
Calgary Hitmen
Medicine Hat Tigers


In [42]:
from openpyxl import load_workbook

# Load the workbook using openpyxl
wb = load_workbook(file_path)

# Select the sheet where you want to write the data
sheet = wb['Sheet1']

# Start row (assuming the first row is header)
start_row = 2  # Data starts at row 2 (after the header)
# Start at where the predictions start
# df_subset = df.iloc[update_cells_start:]

# Update the cells in columns A:G
for index, row in df.iterrows():
  # We are writing columns A:G, so we update the appropriate cells
  sheet[f'A{start_row + index}'] = row['GameID']
  sheet[f'B{start_row + index}'] = row['GameDate']
  sheet[f'C{start_row + index}'] = row['Home Team']
  sheet[f'D{start_row + index}'] = row['Away Team']
  sheet[f'E{start_row + index}'] = row['Prediction']
  sheet[f'F{start_row + index}'] = row['Actual']
  if pd.notna(row['Actual']):
    sheet[f'G{start_row + index}'] = 1 if row['Prediction'] == row['Actual'] else 0

    

# Save the workbook with the updated values
wb.save(file_path)