## Import/Install Needed Libraries

In [3]:
try:
    import pandas as pd
except ImportError:
    !pip install pandas
    import pandas as pd

try:
    import numpy as np
except ImportError:
    !pip install numpy
    import numpy as np

try:
    import requests
except ImportError:
    !pip install requests
    import requests

from collections import deque

try:
    import sklearn
except ImportError:
    !pip install scikit-learn
    import sklearn

## Declare K

In [5]:
# 5 is recommended
k = 5

## Links and GET functions

In [7]:
key = '41b145a848f4bd67' # This is a public key
def game_id_url_func(num_of_ahead_games, num_of_past_games, current_team_id=''):
  return f'https://lscluster.hockeytech.com/feed/?feed=modulekit&key={key}&view=scorebar&client_code=whl&numberofdaysahead={num_of_ahead_games}&numberofdaysback={num_of_past_games}&season_id=&team_id={current_team_id}&lang_code=en&fmt=json'

def game_stats_url_func(game_id):
  return f'https://lscluster.hockeytech.com/feed/?feed=gc&key={key}&game_id={game_id}&client_code=whl&tab=clock&lang_code=en&fmt=json'

def get_game_ids(url):
  headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br"  # Request gzip compression
  }

  try:
      response = requests.get(url, headers=headers)

      # Check if the request was successful
      if response.status_code == 200:

          return response.json()['SiteKit']['Scorebar']
          #pprint(response)
      else:
          print(f"Failed to retrieve the page. Status code: {response.status_code}")
  except Exception as e:
      print(f"An error occurred: {e}")


'''
Expects a game_id, which is used to access the Hockey Tech API to get the stats of a game, returned in a JSON format
'''
def get_game_stats(game_id):
  game_stats_url = game_stats_url_func(game_id)
  headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br"  # Request gzip compression
  }
  
  try:
      response = requests.get(game_stats_url, headers=headers)
  
      # Check if the request was successful
      if response.status_code == 200:
  
          return response.json()
          pprint(response)
      else:
          print(f"Failed to retrieve the page. Status code: {response.status_code}")
  except Exception as e:
      print(f"An error occurred: {e}")

## Import the Dataset

In [9]:
# Read the CSV. ALso, dont create an index column
dataset = pd.read_csv('All_teams_WHL_stats.csv', index_col=0)
team_id = {'Spokane Chiefs': '215', 'Seattle Thunderbirds': '214', 'Portland Winterhawks': '208', 'Everett Silvertips': '226', 'Tri-City Americans': '217', 'Kamloops Blazers': '203', 'Kelowna Rockets': '204', 'Prince George Cougars': '210', 'Brandon Wheat Kings': '201', 'Swift Current Broncos': '216', 'Vancouver Giants': '223', 'Victoria Royals': '227', 'Medicine Hat Tigers': '206', 'Edmonton Oil Kings': '228', 'Moose Jaw Warriors': '207', 'Regina Pats': '212', 'Saskatoon Blades': '213', 'Prince Albert Raiders': '209', 'Calgary Hitmen': '202', 'Lethbridge Hurricanes': '205', 'Red Deer Rebels': '211', 'Wenatchee Wild': '222'}

In [10]:
dataset = dataset.sort_values(by="Game_ID")
dataset.reset_index(drop=True, inplace=True)
display(dataset)

Unnamed: 0,Game_ID,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
0,1014692,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
1,1014699,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
2,1014708,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
3,1014720,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
4,1014735,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...
5625,1021573,Portland Winterhawks,Everett Silvertips,3,7,0.200000,0.500000,30,36,0.650000,0.350000
5626,1021574,Seattle Thunderbirds,Prince George Cougars,4,6,0.250000,0.000000,31,38,0.522388,0.477612
5627,1021575,Tri-City Americans,Spokane Chiefs,0,4,0.000000,0.000000,33,38,0.521127,0.478873
5628,1021576,Victoria Royals,Vancouver Giants,6,7,0.000000,0.250000,20,32,0.491803,0.508197


## Update the dataset with recent games

In [12]:
# Get the most recent games, and check their game ID
last_game_id = get_game_ids(game_id_url_func(0, 1))
last_game_id = int(last_game_id[-1]["ID"])

# Get the latest game ID we have in the dataset
last_recorded_game_id = dataset.iloc[-1]['Game_ID']

# Get the difference between the last game and the last recorded game.
# this will give us how many games off our dataset is
num_of_game_to_update = last_game_id - last_recorded_game_id
print(f'We need to update {num_of_game_to_update} games in our dataset')

We need to update 5 games in our dataset


In [13]:
update_games = get_game_ids(game_id_url_func(0, num_of_game_to_update))
# We get a lot of games because the API calls num_of_game_to_update back, and multiple games are played on the same day

In [14]:

if update_games:
  for game in update_games:
    # Because we got more games than we needed, we ignore the games we already have
    if int(game['ID']) <= last_recorded_game_id:
      continue
  
    # Get the game stats for each game
    stats = get_game_stats(game["ID"])
  
    print(stats)
    # Home and visitor team names
    visitor = stats['GC']['Clock']['visiting_team']['name']
    home = stats['GC']['Clock']['home_team']['name']
  
    # Number of goals
    home_goals = int(stats['GC']['Clock']['home_goal_count'])
    visitor_goals = int(stats['GC']['Clock']['visiting_goal_count'])
  
    # Calculate Power Play %
    home_ppp_total = float(stats['GC']['Clock']['power_play']['total']['home'])
    visitor_ppp_total = float(stats['GC']['Clock']['power_play']['total']['visiting'])
    # Avoid division by zero for home_ppp
    if home_ppp_total != 0:
        home_ppp = float(stats['GC']['Clock']['power_play']['goals']['home']) / home_ppp_total
    else:
        home_ppp = 0  # Default to 0 if no power plays
    # Avoid division by zero for visitor_ppp
    if visitor_ppp_total != 0:
        visitor_ppp = float(stats['GC']['Clock']['power_play']['goals']['visiting']) / visitor_ppp_total
    else:
        visitor_ppp = 0  # Default to 0 if no power plays
  
    # Calculate Faceoff Win %
    home_fowp = float(stats['GC']['Clock']['fow']['home'])
    visitor_fowp = float(stats['GC']['Clock']['fow']['visiting'])
    fow_total = home_fowp + visitor_fowp
    if fow_total != 0:
        home_fowp /= fow_total
        visitor_fowp /= fow_total
    else:
        home_fowp, visitor_fowp = .5, .5
  
    # Shots on goal
    home_sog = sum(stats['GC']['Clock']['shots_on_goal']['home'].values())
    visitor_sog = sum(stats['GC']['Clock']['shots_on_goal']['visiting'].values())
    
    # Add data to the dataset
    dataset.loc[len(dataset)] = {
        "Game_ID": int(game['ID']),
        "Home_Name": home,
        "Away_Name": visitor,
        "Home_Goals": home_goals,
        "Away_Goals": visitor_goals,
        "Home_PP%": home_ppp,
        "Away_PP%": visitor_ppp,
        "Home_SOG": home_sog,
        "Away_SOG": visitor_sog,
        "Home_FOW%": home_fowp,
        "Away_FOW%": visitor_fowp
    }

{'GC': {'Parameters': {'feed': 'gc', 'key': '41b145a848f4bd67', 'game_id': '1021578', 'client_code': 'whl', 'tab': 'clock', 'lang_code': 'en', 'fmt': 'json', 'lang_id': 1, 'static': 1}, 'Clock': {'period': '3', 'game_clock': '00:00:00', 'game_date_iso_8601': '2025-01-01T14:00:00-06:00', 'timezone': 'Canada/Central', 'timezone_short': 'CST', 'started': '1', 'game_number': '377', 'season_id': '285', 'season_name': '2024 - 25 Regular Season', 'home_team': {'name': 'Brandon Wheat Kings', 'team_id': '201', 'team_code': 'BDN', 'team_nickname': 'Wheat Kings', 'team_city': 'Brandon'}, 'visiting_team': {'name': 'Swift Current Broncos', 'team_id': '216', 'team_code': 'SC', 'team_nickname': 'Broncos', 'team_city': 'Swift Current'}, 'text_game_summary': '/stats/text-game-report.php?game_id=1021578', 'official_game_report': '/stats/official-game-report.php?game_id=1021578', 'home_audio_url': 'https://qcountryfm.ca/player/?playerID=2907', 'home_video_url': '', 'home_webcast_url': '', 'visiting_audio

### Update CSV

In [16]:
# Update the CSV for future use
display(dataset)

Unnamed: 0,Game_ID,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
0,1014692,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
1,1014699,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
2,1014708,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
3,1014720,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
4,1014735,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...
5630,1021578,Brandon Wheat Kings,Swift Current Broncos,2,5,0.125000,0.250000,34,19,0.596774,0.403226
5631,1021579,Edmonton Oil Kings,Calgary Hitmen,1,4,0.000000,0.166667,26,29,0.523077,0.476923
5632,1021582,Vancouver Giants,Victoria Royals,2,5,0.333333,0.428571,19,46,0.537037,0.462963
5633,1021581,Saskatoon Blades,Prince Albert Raiders,3,6,0.000000,0.500000,47,32,0.509091,0.490909


In [17]:
if update_games:
  dataset.to_csv('All_teams_WHL_stats.csv')

## Get Next Games

In [19]:
# We want to do a GET request getting the next games played
# Store this information within an [] of (Home Team, Visiting Team)

# Then get a DF and train the model on both teams playing

# This is the number of days ahead to predict games for
# 1 is recommended because anything past that then model is missing previous games played
num_of_future_days_predict = 1
next_games_response = get_game_ids(game_id_url_func(num_of_future_days_predict, 0))
next_games = []
for game in next_games_response:
  next_games.append((game['HomeLongName'], game["VisitorLongName"]))

print(next_games)

[('Portland Winterhawks', 'Spokane Chiefs')]


## Create features and dependencies

In [21]:
# Get default values for averages of goals, PP%, SOG, and FOW%
default_goals = (dataset['Home_Goals'].mean() + dataset['Away_Goals'].mean()) // 2
default_ppp = (dataset['Home_PP%'].mean() + dataset['Away_PP%'].mean()) / 2
default_sog = (dataset['Home_SOG'].mean() + dataset['Away_SOG'].mean()) // 2
default_fowp = (dataset['Home_FOW%'].mean() + dataset['Away_FOW%'].mean()) / 2

print('Default Values used:')
print(default_goals, default_ppp, default_sog, default_fowp)

Default Values used:
3.0 0.2154162521398546 31.0 0.5


In [22]:
def get_opponent_stats(dataset, queue, opponent_name, row):
  """
  Calculates average statistics for a given opponent based on their last 'k' games prior to a specified game.
  
  Parameters:
  - dataset (pd.DataFrame): The dataset containing game records with statistics for each game.
  - queue: (Unused in this function but included as a parameter for compatibility).
  - opponent_name (str): The name of the opponent team whose stats are being calculated.
  - row (pd.Series): The current row of the dataset, representing the game for which the opponent's stats are calculated.

  Returns:
  - opponent_goals (float): Average goals scored by the opponent over the last 'k' games.
  - opponent_ppp (float): Average power-play percentage of the opponent over the last 'k' games.
  - opponent_fowp (float): Average face-off win percentage of the opponent over the last 'k' games.
  - opponent_sog (float): Average shots on goal by the opponent over the last 'k' games.

  Notes:
  - 'k' is assumed to be a predefined global variable representing the number of games to consider.
  - Default values (`default_goals`, `default_ppp`, `default_fowp`, `default_sog`) must also be predefined globally.
  - Uses `.where()` to select statistics based on whether the opponent is playing as home or away in each game.
  """
  
  # Get the last k games averages
  # This creates a new dataset of only opponent games, and which are before the current game
  opponent_games = dataset[
    ((dataset['Home_Name'] == opponent_name) | (dataset['Away_Name'] == opponent_name)) & 
    (dataset['Game_ID'] < row['Game_ID'])
  ].tail(k)

  # If the opponent hasnt played k games yet, use default
  if len(opponent_games) < k:
    opponent_goals = default_goals
    opponent_ppp = default_ppp
    opponent_fowp = default_fowp
    opponent_sog = default_sog
  else:
    # Get the mean of the entire column that the opponent played
    opponent_goals = opponent_games['Home_Goals'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_Goals']).mean()
    opponent_ppp = opponent_games['Home_PP%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_PP%']).mean()
    opponent_fowp = opponent_games['Home_FOW%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_FOW%']).mean()
    opponent_sog = opponent_games['Home_SOG'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_SOG']).mean()

  return opponent_goals, opponent_ppp, opponent_fowp, opponent_sog

### Get Avg Stats for each team

In [24]:
def generate_team_stats(dataset, team_a, k):
  """
  This function generates a DataFrame with stats comparing a target team (team_a) against its opponent (opponent_team)
  over the last 'k' games. It calculates average stats and differences between the target team and the opponent.

  Parameters:
  - dataset (DataFrame): The dataset containing the game statistics.
  - team_a (str): The name of the target team.
  - k (int): The number of recent games to consider for averaging stats.

  Returns:
  - DataFrame: A DataFrame containing the calculated statistics and differences between the target team and its opponent.
  """

  # Initialize variables and the deque to store the last k games
  target_queue = deque(maxlen=k)
  target_df = pd.DataFrame(columns = [
    "target_goals", 
    "opponent_goals", 
    "target_PP%", 
    "opponent_PP%", 
    "target_SOG", 
    "opponent_SOG",
    "target_FOW%", 
    "opponent_FOW%", 
    "Home/Away",
    "goals_diff",
    "ppp_diff",
    "sog_diff",
    "fowp_diff",
    "target_win"
  ])

  target_goals = 0
  target_ppp = 0
  target_fowp = 0
  target_sog = 0

  # Iterate through the dataset to track the last 'k' games
  for index, row in dataset.iterrows():
    if row['Home_Name'] == team_a or row['Away_Name'] == team_a:

      side = 'Home' if row['Home_Name'] == team_a else 'Away'
      opponent_name = row['Away_Name'] if side == 'Home' else row['Home_Name']
      
      # Update statistics for the target team
      target_goals += row[side + '_Goals']
      target_ppp += row[side + '_PP%']
      target_fowp += row[side + '_FOW%']
      target_sog += row[side + '_SOG']
      
      # If the queue is not full, add the current game to the queue
      # k-1 because we can continue of the queue is almost full
      if len(target_queue) < k - 1:
        target_queue.append(row.to_dict())
      else:
        # Calculate the opponent's stats
        # This function is the code cell above this
        opponent_goals, opponent_ppp, opponent_fowp, opponent_sog = get_opponent_stats(dataset, target_queue, opponent_name, row)
        
        # Calculate average stats for the target team. /k find the aggregate avg of the sum of stats
        avg_target_goals = target_goals / k
        avg_target_ppp = target_ppp / k
        avg_target_fowp = target_fowp / k
        avg_target_sog = target_sog / k

        # Calculate differences between the target team and the opponent
        goals_diff = avg_target_goals - opponent_goals
        ppp_diff = avg_target_ppp - opponent_ppp
        sog_diff = avg_target_sog - opponent_sog
        fowp_diff = avg_target_fowp - opponent_fowp

        target_win = 1 if row[side + '_Goals'] > row[('Away' if side == 'Home' else 'Home') + '_Goals'] else 0

        # 1 is Home, 0 is Away
        target_df.loc[len(target_df)] = {
            "target_goals": avg_target_goals,
            "opponent_goals": opponent_goals,
            "target_PP%": avg_target_ppp,
            "opponent_PP%": opponent_ppp,
            "target_SOG": avg_target_sog,
            "opponent_SOG": opponent_sog,
            "target_FOW%": avg_target_fowp,
            "opponent_FOW%": opponent_fowp,
            "Home/Away": 1.0 if side == 'Home' else 0.0,
            "goals_diff": goals_diff,
            "ppp_diff": ppp_diff,
            "sog_diff": sog_diff,
            "fowp_diff": fowp_diff,
            "target_win": target_win
        }

        # Remove the oldest game from the queue and update stats
        popped = target_queue.popleft()
        popped_side = 'Home' if popped['Home_Name'] == team_a else 'Away'
        target_goals -= popped[popped_side + '_Goals']
        target_ppp -= popped[popped_side + '_PP%']
        target_fowp -= popped[popped_side + '_FOW%']
        target_sog -= popped[popped_side + '_SOG']

        # Add the current game to the queue
        target_queue.append(row.to_dict())

  return target_df

### Generate DataFrames For Each Team Playing in Next Game

In [26]:
# loop through the next games, and get the dataframe for each team.
# Store it in a [] of (DF Home, DF, Away)

# This is an array that stores the stats DataFrame for each team playing
next_games_dfs = []
for game in next_games:
  home_team = game[0]
  away_team = game[1]
  home_team_stats_df = generate_team_stats(dataset, home_team, k)
  away_team_stats_df = generate_team_stats(dataset, away_team, k)
  
  next_games_dfs.append((home_team_stats_df, away_team_stats_df))


In [28]:
# Just display the first game to make sure
display(next_games_dfs[0][0])

Unnamed: 0,target_goals,opponent_goals,target_PP%,opponent_PP%,target_SOG,opponent_SOG,target_FOW%,opponent_FOW%,Home/Away,goals_diff,ppp_diff,sog_diff,fowp_diff,target_win
0,4.2,5.0,0.386667,0.369048,36.0,30.0,0.494254,0.477579,0.0,-0.8,0.017619,6.0,0.016675,1
1,4.2,4.0,0.320000,0.166667,33.4,42.8,0.476777,0.496223,0.0,0.2,0.153333,-9.4,-0.019446,0
2,3.8,3.4,0.250000,0.326667,34.0,32.0,0.492550,0.464044,0.0,0.4,-0.076667,2.0,0.028506,1
3,3.2,5.0,0.283333,0.270476,32.2,34.6,0.500861,0.563969,1.0,-1.8,0.012857,-2.4,-0.063108,0
4,3.2,5.0,0.250000,0.233333,33.0,31.0,0.503946,0.561687,1.0,-1.8,0.016667,2.0,-0.057741,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,4.4,3.4,0.293810,0.166667,34.4,26.6,0.525082,0.534508,1.0,1.0,0.127143,7.8,-0.009426,1
512,4.2,3.4,0.240476,0.340000,35.2,33.4,0.559073,0.474481,1.0,0.8,-0.099524,1.8,0.084592,1
513,4.8,2.6,0.190476,0.150000,36.0,33.2,0.571749,0.498036,1.0,2.2,0.040476,2.8,0.073714,1
514,4.8,2.8,0.123810,0.150000,37.2,34.0,0.585321,0.496129,0.0,2.0,-0.026190,3.2,0.089192,1


## Apply Classification Models

We go through 4 classification models for each team, and then select the most accurate model to use for our prediction

This is similar to a grid search, but we are using different models instead of different parameters

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Going to store a dict of the best classifiers between each team
next_games_models = []

for game in next_games_dfs:
  
  # Loop through each team (home and away)
  for i, team in enumerate(game):
    best_accuracy = 0
    best_classifier = 0
    best_classifier_name = ""
    
    # Features and dependencies
    X = team.iloc[:, :-1]
    y = team.iloc[:, -1]
    
    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    # Scale train and test sets
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Initialize models
    classifiers = {
        "Random Forest": RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0),
        "Naive Bayes": GaussianNB(),
        "Logistic Regression": LogisticRegression(random_state=0),
        "SVC": SVC(kernel='rbf', random_state=0)
    }
    
    classifiers_accuracies = {
        "Random Forest": 0,
        "Naive Bayes": 0,
        "Logistic Regression": 0,
        "SVC": 0
    }
    
    # Train each classifier and evaluate accuracy
    for classifier_name, classifier in classifiers.items():
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        classifiers_accuracies[classifier_name] = accuracy_score(y_test, y_pred)
    
    # Get the best classifier for this team
    best_classifier_name = max(classifiers_accuracies, key=classifiers_accuracies.get)
    best_accuracy = classifiers_accuracies[best_classifier_name]
    best_classifier = classifiers[best_classifier_name]
    
    # Append to the models list
    team_name = "Home" if i == 0 else "Away"  # You can label as "Home" or "Away"
    next_games_models.append({
        "team": team_name,
        "classifier": best_classifier,
        "accuracy": best_accuracy,
        "classifier_name": best_classifier_name,
        "scaler": scaler
    })

combine_next_game_models = []
for i in range(0, len(next_games_models), 2):
  home = next_games_models[i]  # Home team entry
  away = next_games_models[i + 1]  # Away team entry

  combine_next_game_models.append((home, away))
  # Now, next_games_models will store the best model and accuracy for both home and away teams in each game.


In [35]:
print(combine_next_game_models)

[({'team': 'Home', 'classifier': GaussianNB(), 'accuracy': 0.7596153846153846, 'classifier_name': 'Naive Bayes', 'scaler': StandardScaler()}, {'team': 'Away', 'classifier': LogisticRegression(random_state=0), 'accuracy': 0.6633663366336634, 'classifier_name': 'Logistic Regression', 'scaler': StandardScaler()})]


## Predict the Next Games Playing

In [38]:
def get_df_for_predicton(i, next_games_dfs, playing_side):
  """
  Prepares a feature set for predicting game outcomes based on team statistics.

  Parameters:
  - i (int): Index of the game in the `next_games_dfs` list.
  - next_games_dfs (list): List of DataFrames with game statistics.
  - playing_side (str): The side the target team is playing on ("Home" or "Away").

  Returns:
  - pd.DataFrame: DataFrame containing features for prediction, including goals, PP%, SOG, FOW%, and their differences.

  Function Details:
  - Extracts statistics for the target team and opponent.
  - Computes the differences in key statistics (goals, PP%, SOG, FOW%).
  - Returns a DataFrame with relevant features for prediction.
  """
  
  side = 0 if playing_side == "Home" else 1
  opponent_side = 0 if side else 1
  target_stats = next_games_dfs[i][side].iloc[-1]
  
  # Break down the stats into different variables
  opponent_stats = next_games_dfs[i][opponent_side].iloc[-1]
  target_goals = target_stats['target_goals']
  opponent_goals = opponent_stats['target_goals']
  target_PPP = target_stats['target_PP%']
  opponent_PPP = opponent_stats['target_PP%']
  target_SOG = target_stats['target_SOG']
  opponent_SOG = opponent_stats['target_SOG']
  target_FOWP = target_stats['target_FOW%']
  opponent_FOWP = opponent_stats['target_FOW%']
  target_side = opponent_side # This is reverse because I ordered it weirdly
  goals_diff = target_goals - opponent_goals
  ppp_diff = target_PPP - opponent_PPP
  sog_diff = target_SOG - opponent_SOG
  fowp_diff = target_FOWP - opponent_FOWP

  features = [
    [
      target_goals, opponent_goals, target_PPP, opponent_PPP,
      target_SOG, opponent_SOG, target_FOWP, opponent_FOWP,
      side, goals_diff, ppp_diff, sog_diff, fowp_diff
      ]
  ]

  feature_names = ["target_goals", 
      "opponent_goals", 
      "target_PP%", 
      "opponent_PP%", 
      "target_SOG", 
      "opponent_SOG",
      "target_FOW%",
      "opponent_FOW%",
      "Home/Away",
      "goals_diff",
      "ppp_diff",
      "sog_diff",
      "fowp_diff"]


  # Convert features to a DataFrame with the correct column names
  return pd.DataFrame(features, columns=feature_names)
  

### Print Next Games Prediction

In [52]:
import datetime
game_predicted = { 
    'GameID': [],  # Replace with actual GameIDs
    'GameDate': [],  # Replace with actual dates
    'Home Team': [],  # Replace with actual teams
    'Away Team': [],  # Replace with actual teams
    'Prediction': [],  # Replace with actual predictions
}
for i, game in enumerate(combine_next_game_models):
  # We want to compare the accuracy of the home and away training models
  if game[0]['accuracy'] < game[1]['accuracy']: 
    # Home is target
    prediction_features_dataframe = get_df_for_predicton(i, next_games_dfs, game[0]["team"])
    side = 0
  else: 
    # Away is target
    prediction_features_dataframe = get_df_for_predicton(i, next_games_dfs, game[1]["team"])
    side = 1

  # Scale the dataframe for classification model
  scaled_prediction_dataframe = game[side]['scaler'].transform(prediction_features_dataframe)
  # Predict using the most accurate model for the team
  prediction = game[side]['classifier'].predict(scaled_prediction_dataframe)
  
  # The team with the highest accuracy will be predicted as the target_team
  if prediction[0]:
    print(f'{next_games[i][side]} is predicted to with their next game against the {next_games[i][0 if side else 1]} on {next_games_response[i]['GameDate']} with an accuracy of {game[side]['accuracy']:.3f}')
    game_predicted['Prediction'].append(next_games[i][side])
  else:
    print(f'{next_games[i][0 if side else 1]} is predicted to with their next game against the {next_games[i][side]} on {next_games_response[i]['GameDate']} with an accuracy of {game[side]['accuracy']:.3f}')
    game_predicted['Prediction'].append(next_games[i][0 if side else 1])


  game_predicted['GameDate'].append(next_games_response[i]['GameDate'] + ", " + str(datetime.datetime.now().year))
  game_predicted['GameID'].append(next_games_response[i]['ID'])
  game_predicted['Home Team'].append(next_games_response[i]['HomeLongName'])
  game_predicted['Away Team'].append(next_games_response[i]['VisitorLongName'])
  
  
  

Portland Winterhawks is predicted to with their next game against the Spokane Chiefs on Thu, Jan 2 with an accuracy of 0.663


In [43]:
print(game_predicted)

{'GameID': ['1021583'], 'GameDate': ['Thu, Jan 2, 2025'], 'Home Team': ['Portland Winterhawks'], 'Away Team': ['Spokane Chiefs'], 'Prediction': ['Portland Winterhawks']}


## Write Predictions to file

In [46]:
import pandas as pd
from openpyxl import load_workbook

# Define the file path
file_path = 'prediction_record.xlsx'

# New data to append (replace this with your actual DataFrame)
new_results = pd.DataFrame(game_predicted)

# Select only the relevant columns (GameID, GameDate, Home Team, Away Team, Prediction)
new_results = new_results[['GameID', 'GameDate', 'Home Team', 'Away Team', 'Prediction']]

# Load the existing Excel file to check for existing GameIDs
existing_data = pd.read_excel(file_path, engine='openpyxl')
display(existing_data)

# Ensure 'GameID' columns are of the same type for comparison (convert to string)
existing_game_ids = existing_data['GameID'].astype(int).values  # Ensure existing GameIDs are strings
new_results['GameID'] = new_results['GameID'].astype(int)  # Convert new GameIDs to strings

# Filter new_results to only include rows with GameID that does not already exist in the file
new_results_filtered = new_results[~new_results['GameID'].isin(existing_game_ids)]

# Check if there are rows to append
if not new_results_filtered.empty:
    # Load the existing workbook
    book = load_workbook(file_path)

    # Open the file in append mode using ExcelWriter
    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
        # Write the new results (without overwriting existing data) to columns A:G
        new_results_filtered.to_excel(writer, index=False, header=False, startrow=book.active.max_row, sheet_name='Sheet1')

    print(f"Appended {len(new_results_filtered)} rows to the file.")
    update_cells_start = len(existing_data)
else:
    print("No new GameIDs to append.")
    update_cells_start = -1


Unnamed: 0,GameID,GameDate,Home Team,Away Team,Prediction,Actual,Correct,Unnamed: 7,Accuracy Score
0,1021546,"Fri, Dec 27, 2024",Brandon Wheat Kings,Regina Pats,Brandon Wheat Kings,Regina Pats,0.0,,60.00%
1,1021550,"Fri, Dec 27, 2024",Prince Albert Raiders,Swift Current Broncos,Prince Albert Raiders,Prince Albert Raiders,1.0,,
2,1021552,"Fri, Dec 27, 2024",Saskatoon Blades,Moose Jaw Warriors,Moose Jaw Warriors,Saskatoon Blades,0.0,,
3,1021551,"Fri, Dec 27, 2024",Red Deer Rebels,Edmonton Oil Kings,Edmonton Oil Kings,Edmonton Oil Kings,1.0,,
4,1021547,"Fri, Dec 27, 2024",Kamloops Blazers,Vancouver Giants,Kamloops Blazers,Vancouver Giants,0.0,,
5,1021549,"Fri, Dec 27, 2024",Portland Winterhawks,Tri-City Americans,Portland Winterhawks,Portland Winterhawks,1.0,,
6,1021553,"Fri, Dec 27, 2024",Seattle Thunderbirds,Everett Silvertips,Everett Silvertips,Everett Silvertips,1.0,,
7,1021554,"Fri, Dec 27, 2024",Spokane Chiefs,Wenatchee Wild,Spokane Chiefs,Spokane Chiefs,1.0,,
8,1021560,"Sat, Dec 28, 2024",Regina Pats,Saskatoon Blades,Saskatoon Blades,Saskatoon Blades,1.0,,
9,1021559,"Sat, Dec 28, 2024",Moose Jaw Warriors,Brandon Wheat Kings,Brandon Wheat Kings,Brandon Wheat Kings,1.0,,


Appended 1 rows to the file.


## Update With Actual Results

In [49]:
from openpyxl import load_workbook

# Define the file path
file_path = 'prediction_record.xlsx'

# Read the existing Excel file into a Pandas DataFrame
df = pd.read_excel(file_path, engine='openpyxl', usecols="A:F")

# Assume 'dataset' contains your actual game data with 'HomeGoals' and 'AwayGoals'
# dataset = pd.DataFrame(...)  # Load your dataset here with actual goal values

# We wont update any cells if the 
if update_cells_start == -1:
  update_cells_start = len(df)

print(update_cells_start)
# Loop through the rows and update the 'Actual' column where it's empty

for index, row in df.iterrows():
    if pd.isna(row['Actual']):  # Check if 'Actual' is empty
        game_id = row['GameID']
        
        # Get the corresponding data for the game from 'dataset' (replace with your actual dataset)
        game_data = dataset[dataset['Game_ID'] == game_id]
        
        if not game_data.empty:
            home_goals = game_data['Home_Goals'].iloc[0]
            away_goals = game_data['Away_Goals'].iloc[0]
            
            # Determine which team has the higher score
            if home_goals > away_goals:
                actual_team = row['Home Team']
            elif away_goals > home_goals:
                actual_team = row['Away Team']

            print(actual_team)

            # Update only the 'Actual' column
            df.at[index, 'Actual'] = actual_team


# Load the workbook using openpyxl
wb = load_workbook(file_path)

# Select the sheet where you want to write the data
sheet = wb['Sheet1']

# Start row (assuming the first row is header)
start_row = 2  # Data starts at row 2 (after the header)
# Start at where the predictions start
# df_subset = df.iloc[update_cells_start:]

# Update the cells in columns A:G
for index, row in df.iterrows():
    # We are writing columns A:G, so we update the appropriate cells
    sheet[f'A{start_row + index}'] = row['GameID']
    sheet[f'B{start_row + index}'] = row['GameDate']
    sheet[f'C{start_row + index}'] = row['Home Team']
    sheet[f'D{start_row + index}'] = row['Away Team']
    sheet[f'E{start_row + index}'] = row['Prediction']
    sheet[f'F{start_row + index}'] = row['Actual']
    if pd.notna(row['Actual']):
      sheet[f'G{start_row + index}'] = 1 if row['Prediction'] == row['Actual'] else 0

    

# Save the workbook with the updated values
wb.save(file_path)


35
Swift Current Broncos
Calgary Hitmen
Moose Jaw Warriors
Prince Albert Raiders
Victoria Royals
