In [2]:
# !pip install pandas
# !pip install skikit-learn
# !pip install --upgrade scikit-learn
# !pip install numpy

In [3]:
import pandas as pd
import numpy as np
import requests
from collections import deque

## Declare Target Team and K

In [5]:
target_team = 'Brandon Wheat Kings'
k = 5

## Links and GET functions

In [7]:
key = '41b145a848f4bd67'
def game_id_url_func(num_of_ahead_games, num_of_past_games, current_team_id=''):
  return f'https://lscluster.hockeytech.com/feed/?feed=modulekit&key={key}&view=scorebar&client_code=whl&numberofdaysahead={num_of_ahead_games}&numberofdaysback={num_of_past_games}&season_id=&team_id={current_team_id}&lang_code=en&fmt=json'

def game_stats_url_func(game_id):
  return f'https://lscluster.hockeytech.com/feed/?feed=gc&key={key}&game_id={game_id}&client_code=whl&tab=clock&lang_code=en&fmt=json'

def get_game_ids(url):
  headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br"  # Request gzip compression
  }

  try:
      response = requests.get(url, headers=headers)

      # Check if the request was successful
      if response.status_code == 200:

          return response.json()['SiteKit']['Scorebar']
          #pprint(response)
      else:
          print(f"Failed to retrieve the page. Status code: {response.status_code}")
  except Exception as e:
      print(f"An error occurred: {e}")


'''
Expects a game_id, which is used to access the Hockey Tech API to get the stats of a game, returned in a JSON format
'''
def get_game_stats(game_id):
  game_stats_url = game_stats_url_func(game_id)
  headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br"  # Request gzip compression
  }
  
  try:
      response = requests.get(game_stats_url, headers=headers)
  
      # Check if the request was successful
      if response.status_code == 200:
  
          return response.json()
          pprint(response)
      else:
          print(f"Failed to retrieve the page. Status code: {response.status_code}")
  except Exception as e:
      print(f"An error occurred: {e}")

## Import the Dataset

In [9]:
dataset = pd.read_csv('All_teams_WHL_stats.csv')
team_id = {'Spokane Chiefs': '215', 'Seattle Thunderbirds': '214', 'Portland Winterhawks': '208', 'Everett Silvertips': '226', 'Tri-City Americans': '217', 'Kamloops Blazers': '203', 'Kelowna Rockets': '204', 'Prince George Cougars': '210', 'Brandon Wheat Kings': '201', 'Swift Current Broncos': '216', 'Vancouver Giants': '223', 'Victoria Royals': '227', 'Medicine Hat Tigers': '206', 'Edmonton Oil Kings': '228', 'Moose Jaw Warriors': '207', 'Regina Pats': '212', 'Saskatoon Blades': '213', 'Prince Albert Raiders': '209', 'Calgary Hitmen': '202', 'Lethbridge Hurricanes': '205', 'Red Deer Rebels': '211', 'Wenatchee Wild': '222'}

In [10]:
dataset = dataset.sort_values(by="Game_ID")
dataset.reset_index(drop=True, inplace=True)
display(dataset)

Unnamed: 0,Unnamed: 0.11,Unnamed: 0.10,Unnamed: 0.9,Unnamed: 0.8,Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,...,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
0,0,0,0,0,0,0,0,0,0,0,...,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
1,1,1,1,1,1,1,1,1,1,1,...,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
2,2,2,2,2,2,2,2,2,2,2,...,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
3,3,3,3,3,3,3,3,3,3,3,...,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
4,4,4,4,4,4,4,4,4,4,4,...,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5603,5603,5603,5603,5603,5603,5603,5603,5603,5603,5603,...,Saskatoon Blades,Moose Jaw Warriors,5,2,0.666667,0.333333,27,30,0.593750,0.406250
5604,5604,5604,5604,5604,5604,5604,5604,5604,5604,5604,...,Seattle Thunderbirds,Everett Silvertips,3,5,0.500000,0.500000,27,40,0.507246,0.492754
5605,5605,5605,5605,5605,5605,5605,5605,5605,5605,5605,...,Spokane Chiefs,Wenatchee Wild,2,1,0.200000,0.200000,24,33,0.573770,0.426230
5606,5606,5606,5606,5606,5606,5606,5606,5606,5606,5606,...,Spokane Chiefs,Wenatchee Wild,2,1,0.200000,0.200000,24,33,0.573770,0.426230


## Update the dataset with recent games

In [12]:
# Get the most recent games, and check their game ID
last_game_id = get_game_ids(game_id_url_func(0, 1))
last_game_id = int(last_game_id[-1]["ID"])

# Get the latest game ID we have in the dataset
last_recorded_game_id = dataset.iloc[-1]['Game_ID']

# Get the difference between the last game and the last recorded game.
# this will give us how many games off our dataset is
num_of_game_to_update = last_game_id - last_recorded_game_id
print(f'We need to update {num_of_game_to_update} games in our dataset')

We need to update 0 games in our dataset


In [13]:
update_games = get_game_ids(game_id_url_func(0, num_of_game_to_update))
print(f'Number of games fetched {len(update_games)}')
# We get a lot of games because the API calls num_of_game_to_update back, and multiple games are played on the same day

Number of games fetched 0


In [14]:
update_games = get_game_ids(game_id_url_func(0, num_of_game_to_update))


for game in update_games:
  # Because we got more games than we needed, we ignore the games we already have
  if int(game['ID']) <= last_recorded_game_id:
    continue

  # Get the game stats for each game
  stats = get_game_stats(game["ID"])

  print(stats)
  # Home and visitor team names
  visitor = stats['GC']['Clock']['visiting_team']['name']
  home = stats['GC']['Clock']['home_team']['name']

  # Number of goals
  home_goals = int(stats['GC']['Clock']['home_goal_count'])
  visitor_goals = int(stats['GC']['Clock']['visiting_goal_count'])

  # Calculate Power Play %
  home_ppp_total = float(stats['GC']['Clock']['power_play']['total']['home'])
  visitor_ppp_total = float(stats['GC']['Clock']['power_play']['total']['visiting'])
  # Avoid division by zero for home_ppp
  if home_ppp_total != 0:
      home_ppp = float(stats['GC']['Clock']['power_play']['goals']['home']) / home_ppp_total
  else:
      home_ppp = 0  # Default to 0 if no power plays
  # Avoid division by zero for visitor_ppp
  if visitor_ppp_total != 0:
      visitor_ppp = float(stats['GC']['Clock']['power_play']['goals']['visiting']) / visitor_ppp_total
  else:
      visitor_ppp = 0  # Default to 0 if no power plays

  # Calculate Faceoff Win %
  home_fowp = float(stats['GC']['Clock']['fow']['home'])
  visitor_fowp = float(stats['GC']['Clock']['fow']['visiting'])
  fow_total = home_fowp + visitor_fowp
  if fow_total != 0:
      home_fowp /= fow_total
      visitor_fowp /= fow_total
  else:
      home_fowp, visitor_fowp = .5, .5

  # Shots on goal
  home_sog = sum(stats['GC']['Clock']['shots_on_goal']['home'].values())
  visitor_sog = sum(stats['GC']['Clock']['shots_on_goal']['visiting'].values())
  
  # Add data to the dataset
  dataset.loc[len(dataset)] = {
      "Game_ID": int(game['ID']),
      "Home_Name": home,
      "Away_Name": visitor,
      "Home_Goals": home_goals,
      "Away_Goals": visitor_goals,
      "Home_PP%": home_ppp,
      "Away_PP%": visitor_ppp,
      "Home_SOG": home_sog,
      "Away_SOG": visitor_sog,
      "Home_FOW%": home_fowp,
      "Away_FOW%": visitor_fowp
  }

In [15]:
# dataset = dataset.drop(columns=['Unnamed: 0'])
display(dataset)
# Update the CSV for future use
dataset.to_csv('All_teams_WHL_stats.csv')

Unnamed: 0,Unnamed: 0.11,Unnamed: 0.10,Unnamed: 0.9,Unnamed: 0.8,Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,...,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
0,0,0,0,0,0,0,0,0,0,0,...,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
1,1,1,1,1,1,1,1,1,1,1,...,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
2,2,2,2,2,2,2,2,2,2,2,...,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
3,3,3,3,3,3,3,3,3,3,3,...,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
4,4,4,4,4,4,4,4,4,4,4,...,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5603,5603,5603,5603,5603,5603,5603,5603,5603,5603,5603,...,Saskatoon Blades,Moose Jaw Warriors,5,2,0.666667,0.333333,27,30,0.593750,0.406250
5604,5604,5604,5604,5604,5604,5604,5604,5604,5604,5604,...,Seattle Thunderbirds,Everett Silvertips,3,5,0.500000,0.500000,27,40,0.507246,0.492754
5605,5605,5605,5605,5605,5605,5605,5605,5605,5605,5605,...,Spokane Chiefs,Wenatchee Wild,2,1,0.200000,0.200000,24,33,0.573770,0.426230
5606,5606,5606,5606,5606,5606,5606,5606,5606,5606,5606,...,Spokane Chiefs,Wenatchee Wild,2,1,0.200000,0.200000,24,33,0.573770,0.426230


## Get Next Games

In [17]:
# We want to do a GET request getting the next games played
# Maybe store this information within an [] of (Home Team, Visiting Team)

# Then get a DF and train the model on both teams playing 

next_games_response = get_game_ids(game_id_url_func(1, 0))
next_games = []
for game in next_games_response:
  next_games.append((game['HomeLongName'], game["VisitorLongName"]))

print(next_games)

[('Regina Pats', 'Saskatoon Blades'), ('Moose Jaw Warriors', 'Brandon Wheat Kings'), ('Swift Current Broncos', 'Prince Albert Raiders'), ('Medicine Hat Tigers', 'Lethbridge Hurricanes'), ('Wenatchee Wild', 'Spokane Chiefs'), ('Everett Silvertips', 'Seattle Thunderbirds'), ('Kelowna Rockets', 'Kamloops Blazers'), ('Tri-City Americans', 'Portland Winterhawks'), ('Victoria Royals', 'Prince George Cougars')]


## Create features and dependencies

In [19]:
# Get default values for averages of goals, PP%, SOG, and FOW%
default_goals = (dataset['Home_Goals'].mean() + dataset['Away_Goals'].mean()) // 2
default_ppp = (dataset['Home_PP%'].mean() + dataset['Away_PP%'].mean()) / 2
default_sog = (dataset['Home_SOG'].mean() + dataset['Away_SOG'].mean()) // 2
default_fowp = (dataset['Home_FOW%'].mean() + dataset['Away_FOW%'].mean()) / 2

print(default_goals, default_ppp, default_sog, default_fowp)

3.0 0.21534856410034725 31.0 0.5


In [20]:
def get_opponent_stats(dataset, queue, opponent_name, row):
  opponent_goals = 0
  opponent_ppp = 0
  opponent_fowp = 0
  opponent_sog = 0
  # Get the last k games averages
  # This creates a new dataset of only opponent games, and which are before the current game
  opponent_games = dataset[
    ((dataset['Home_Name'] == opponent_name) | (dataset['Away_Name'] == opponent_name)) & 
    (dataset['Game_ID'] < row['Game_ID'])
  ].tail(k)

  # If the opponent hasnt played k games yet, use default
  if len(opponent_games) < k:
    opponent_goals = default_goals
    opponent_ppp = default_ppp
    opponent_fowp = default_fowp
    opponent_sog = default_sog
  else:
    # Get the mean of the entire column that the opponent played
    opponent_goals = opponent_games['Home_Goals'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_Goals']).mean()
    opponent_ppp = opponent_games['Home_PP%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_PP%']).mean()
    opponent_fowp = opponent_games['Home_FOW%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_FOW%']).mean()
    opponent_sog = opponent_games['Home_SOG'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_SOG']).mean()

  return opponent_goals, opponent_ppp, opponent_fowp, opponent_sog

In [27]:
opponent_win_loss = {}
def generate_team_stats(dataset, team_a, k):
  """
  This function generates a DataFrame with stats comparing a target team (team_a) against its opponent (opponent_team)
  over the last 'k' games. It calculates average stats and differences between the target team and the opponent.

  Parameters:
  - dataset (DataFrame): The dataset containing the game statistics.
  - team_a (str): The name of the target team.
  - k (int): The number of recent games to consider for averaging stats.

  Returns:
  - DataFrame: A DataFrame containing the calculated statistics and differences between the target team and its opponent.
  """

  # Initialize variables and the deque to store the last k games
  target_queue = deque(maxlen=k)
  target_df = pd.DataFrame(columns = [
    "target_goals", 
    "opponent_goals", 
    "target_PP%", 
    "opponent_PP%", 
    "target_SOG", 
    "opponent_SOG",
    "target_FOW%", 
    "opponent_FOW%", 
    "Home/Away",
    # "opponent_win_loss",
    "goals_diff",
    "ppp_diff",
    "sog_diff",
    "fowp_diff",
    "target_win"
  ])

  target_goals = 0
  target_ppp = 0
  target_fowp = 0
  target_sog = 0

  

  # Iterate through the dataset to track the last 'k' games
  for index, row in dataset.iterrows():
    if row['Home_Name'] == team_a or row['Away_Name'] == team_a:

      side = 'Home' if row['Home_Name'] == team_a else 'Away'
      opponent_name = row['Away_Name'] if side == 'Home' else row['Home_Name']
      
      # Update statistics for the target team
      target_goals += row[side + '_Goals']
      target_ppp += row[side + '_PP%']
      target_fowp += row[side + '_FOW%']
      target_sog += row[side + '_SOG']
      
      # If the queue is not full, add the current game to the queue
      if len(target_queue) < k - 1:
        target_queue.append(row.to_dict())
      else:
        # Calculate the opponent's stats and update the win/loss balance
        opponent_goals, opponent_ppp, opponent_fowp, opponent_sog = get_opponent_stats(dataset, target_queue, opponent_name, row)

        target_win = 1 if row[side + '_Goals'] > row[('Away' if side == 'Home' else 'Home') + '_Goals'] else 0
        if opponent_name not in opponent_win_loss:
            opponent_win_loss[opponent_name] = 0.0
        
        # Update opponent win/loss balance
        if target_win:
            opponent_win_loss[opponent_name] -= 1.0
        else:
            opponent_win_loss[opponent_name] += 1.0

        
        # Calculate average stats for the target team
        avg_target_goals = target_goals / k
        avg_target_ppp = target_ppp / k
        avg_target_fowp = target_fowp / k
        avg_target_sog = target_sog / k

        # Calculate differences between the target team and the opponent
        goals_diff = avg_target_goals - opponent_goals
        ppp_diff = avg_target_ppp - opponent_ppp
        sog_diff = avg_target_sog - opponent_sog
        fowp_diff = avg_target_fowp - opponent_fowp

        # 1 is Home, 0 is Away
        target_df.loc[len(target_df)] = {
            "target_goals": avg_target_goals,
            "opponent_goals": opponent_goals,
            "target_PP%": avg_target_ppp,
            "opponent_PP%": opponent_ppp,
            "target_SOG": avg_target_sog,
            "opponent_SOG": opponent_sog,
            "target_FOW%": avg_target_fowp,
            "opponent_FOW%": opponent_fowp,
            "Home/Away": 1.0 if side == 'Home' else 0.0,
            # "opponent_win_loss": opponent_win_loss[opponent_name],
            "goals_diff": goals_diff,
            "ppp_diff": ppp_diff,
            "sog_diff": sog_diff,
            "fowp_diff": fowp_diff,
            "target_win": target_win
        }

        # Remove the oldest game from the queue and update stats
        popped = target_queue.popleft()
        popped_side = 'Home' if popped['Home_Name'] == team_a else 'Away'
        target_goals -= popped[popped_side + '_Goals']
        target_ppp -= popped[popped_side + '_PP%']
        target_fowp -= popped[popped_side + '_FOW%']
        target_sog -= popped[popped_side + '_SOG']

        # Add the current game to the queue
        target_queue.append(row.to_dict())

  return target_df

In [30]:
# loop through the next games, and get the dataframe for each team.
# Probably store it in a [] of (DF Home, DF, Away)
next_games_dfs = []
for game in next_games:
  home_team = game[0]
  away_team = game[1]
  home_team_stats_df = generate_team_stats(dataset, home_team, k)
  away_team_stats_df = generate_team_stats(dataset, away_team, k)
  
  next_games_dfs.append((home_team_stats_df, away_team_stats_df))

a = generate_team_stats(dataset, 'Brandon Wheat Kings', k)
print("K: " + str(k))


K: 5


In [32]:
display(next_games_dfs[1][0])

Unnamed: 0,target_goals,opponent_goals,target_PP%,opponent_PP%,target_SOG,opponent_SOG,target_FOW%,opponent_FOW%,Home/Away,goals_diff,ppp_diff,sog_diff,fowp_diff,target_win
0,4.6,2.6,0.403333,0.261905,35.4,38.4,0.490832,0.524757,1.0,2.0,0.141429,-3.0,-0.033924,1
1,5.2,3.2,0.436667,0.275000,35.0,28.6,0.493115,0.522733,0.0,2.0,0.161667,6.4,-0.029617,1
2,5.8,5.0,0.436667,0.340000,33.8,31.0,0.488636,0.471331,1.0,0.8,0.096667,2.8,0.017304,1
3,6.2,4.8,0.383333,0.256667,33.0,33.4,0.498306,0.477526,1.0,1.4,0.126667,-0.4,0.020780,1
4,5.6,3.4,0.396667,0.218571,33.2,25.6,0.493761,0.523317,0.0,2.2,0.178095,7.6,-0.029557,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,3.0,3.6,0.183333,0.213333,27.8,31.8,0.394573,0.544709,0.0,-0.6,-0.030000,-4.0,-0.150137,0
522,1.8,4.8,0.166667,0.333333,25.2,36.8,0.379481,0.509805,1.0,-3.0,-0.166667,-11.6,-0.130324,0
523,2.6,2.8,0.216667,0.223333,27.4,30.2,0.394551,0.517384,0.0,-0.2,-0.006667,-2.8,-0.122833,1
524,3.0,4.8,0.233333,0.250000,29.0,34.0,0.379551,0.526912,0.0,-1.8,-0.016667,-5.0,-0.147360,1


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Going to store a dict of the best classifiers between each team
next_games_models = []

for game in next_games_dfs:
  print(len(game))
  
  # Loop through each team (home and away)
  for i, team in enumerate(game):
    best_accuracy = 0
    best_classifier = 0
    best_classifier_name = ""
    
    # Features and dependencies
    X = team.iloc[:, :-1]
    y = team.iloc[:, -1]
    
    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    # Scale train and test sets
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Initialize models
    classifiers = {
        "Random Forest": RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0),
        "Naive Bayes": GaussianNB(),
        "Logistic Regression": LogisticRegression(random_state=0),
        "SVC": SVC(kernel='rbf', random_state=0)
    }
    
    classifiers_accuracies = {
        "Random Forest": 0,
        "Naive Bayes": 0,
        "Logistic Regression": 0,
        "SVC": 0
    }
    
    # Train each classifier and evaluate accuracy
    for classifier_name, classifier in classifiers.items():
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        classifiers_accuracies[classifier_name] = accuracy_score(y_test, y_pred)
        print(f"{classifier_name} accuracy: {accuracy_score(y_test, y_pred)}")
    
    # Get the best classifier for this team
    best_classifier_name = max(classifiers_accuracies, key=classifiers_accuracies.get)
    best_accuracy = classifiers_accuracies[best_classifier_name]
    best_classifier = classifiers[best_classifier_name]
    
    # Append to the models list
    team_name = "Home" if i == 0 else "Away"  # You can label as "Home" or "Away"
    next_games_models.append({
        "team": team_name,
        "classifier": best_classifier,
        "accuracy": best_accuracy,
        "classifier_name": best_classifier_name,
        "scaler": scaler
    })

combine_next_game_models = []
for i in range(0, len(next_games_models), 2):
  home = next_games_models[i]  # Home team entry
  away = next_games_models[i + 1]  # Away team entry

  combine_next_game_models.append((home, away))
  # Now, next_games_models will store the best model and accuracy for both home and away teams in each game.


2
Random Forest accuracy: 0.63
Naive Bayes accuracy: 0.63
Logistic Regression accuracy: 0.7
SVC accuracy: 0.65
Random Forest accuracy: 0.5288461538461539
Naive Bayes accuracy: 0.5673076923076923
Logistic Regression accuracy: 0.5865384615384616
SVC accuracy: 0.5673076923076923
2
Random Forest accuracy: 0.7075471698113207
Naive Bayes accuracy: 0.6886792452830188
Logistic Regression accuracy: 0.6981132075471698
SVC accuracy: 0.6509433962264151
Random Forest accuracy: 0.62
Naive Bayes accuracy: 0.67
Logistic Regression accuracy: 0.68
SVC accuracy: 0.62
2
Random Forest accuracy: 0.7128712871287128
Naive Bayes accuracy: 0.6732673267326733
Logistic Regression accuracy: 0.693069306930693
SVC accuracy: 0.6732673267326733
Random Forest accuracy: 0.6534653465346535
Naive Bayes accuracy: 0.6237623762376238
Logistic Regression accuracy: 0.6138613861386139
SVC accuracy: 0.6534653465346535
2
Random Forest accuracy: 0.65
Naive Bayes accuracy: 0.64
Logistic Regression accuracy: 0.66
SVC accuracy: 0.62


In [35]:
print(combine_next_game_models)

[({'team': 'Home', 'classifier': LogisticRegression(random_state=0), 'accuracy': 0.7, 'classifier_name': 'Logistic Regression', 'scaler': StandardScaler()}, {'team': 'Away', 'classifier': LogisticRegression(random_state=0), 'accuracy': 0.5865384615384616, 'classifier_name': 'Logistic Regression', 'scaler': StandardScaler()}), ({'team': 'Home', 'classifier': RandomForestClassifier(criterion='entropy', random_state=0), 'accuracy': 0.7075471698113207, 'classifier_name': 'Random Forest', 'scaler': StandardScaler()}, {'team': 'Away', 'classifier': LogisticRegression(random_state=0), 'accuracy': 0.68, 'classifier_name': 'Logistic Regression', 'scaler': StandardScaler()}), ({'team': 'Home', 'classifier': RandomForestClassifier(criterion='entropy', random_state=0), 'accuracy': 0.7128712871287128, 'classifier_name': 'Random Forest', 'scaler': StandardScaler()}, {'team': 'Away', 'classifier': RandomForestClassifier(criterion='entropy', random_state=0), 'accuracy': 0.6534653465346535, 'classifier

In [38]:
def get_df_for_predicton(i, next_games_dfs, playing_side):
  side = 0 if playing_side == "Home" else 1
  opponent_side = 0 if side else 1
  target_stats = next_games_dfs[i][side].iloc[-1]
  # print(next_games[i], target_stats)

  opponent_stats = next_games_dfs[i][opponent_side].iloc[-1]
  target_goals = target_stats['target_goals']
  opponent_goals = opponent_stats['target_goals']
  target_PPP = target_stats['target_PP%']
  opponent_PPP = opponent_stats['target_PP%']
  target_SOG = target_stats['target_SOG']
  opponent_SOG = opponent_stats['target_SOG']
  target_FOWP = target_stats['target_FOW%']
  opponent_FOWP = opponent_stats['target_FOW%']
  target_side = opponent_side # This is reverse because I ordered it weirdly
  # print(opponent_win_loss)
  # opponent_win_loss = opponent_win_loss[next_games[i][opponent_side]]
  goals_diff = target_goals - opponent_goals
  ppp_diff = target_PPP - opponent_PPP
  sog_diff = target_SOG - opponent_SOG
  fowp_diff = target_FOWP - opponent_FOWP

  features = [
    [
      target_goals, opponent_goals, target_PPP, opponent_PPP,
      target_SOG, opponent_SOG, target_FOWP, opponent_FOWP,
      side, goals_diff, ppp_diff, sog_diff, fowp_diff
      ]
  ]

  feature_names = ["target_goals", 
      "opponent_goals", 
      "target_PP%", 
      "opponent_PP%", 
      "target_SOG", 
      "opponent_SOG",
      "target_FOW%",
      "opponent_FOW%",
      "Home/Away",
      "goals_diff",
      "ppp_diff",
      "sog_diff",
      "fowp_diff"]


  # Convert features to a DataFrame with the correct column names
  return pd.DataFrame(features, columns=feature_names)
  

In [58]:

for i, game in enumerate(combine_next_game_models):
  # We want to compare the accuracy of the home and away training models
  if game[0]['accuracy'] > game[1]['accuracy']:
    prediction_features_dataframe = get_df_for_predicton(i, next_games_dfs, game[0]["team"])
    side = 0
    # Home is target
  else:
    # Away is target
    prediction_features_dataframe = get_df_for_predicton(i, next_games_dfs, game[1]["team"])
    side = 1

  scaled_prediction_dataframe = game[side]['scaler'].transform(prediction_features_dataframe)
  prediction = game[side]['classifier'].predict(scaled_prediction_dataframe)
  # print(prediction)

  if prediction[0]:
    print(f'{next_games[i][side]} is predicted to with their next game against the {next_games[i][0 if side else 1]}')
  else:
    print(f'{next_games[i][0 if side else 1]} is predicted to with their next game against the {next_games[i][side]}')
  # The team with the highest accuracy will be predicted as the target_team
  

Saskatoon Blades is predicted to with their next game against the Regina Pats
Brandon Wheat Kings is predicted to with their next game against the Moose Jaw Warriors
Prince Albert Raiders is predicted to with their next game against the Swift Current Broncos
Lethbridge Hurricanes is predicted to with their next game against the Medicine Hat Tigers
Spokane Chiefs is predicted to with their next game against the Wenatchee Wild
Everett Silvertips is predicted to with their next game against the Seattle Thunderbirds
Kelowna Rockets is predicted to with their next game against the Kamloops Blazers
Portland Winterhawks is predicted to with their next game against the Tri-City Americans
Prince George Cougars is predicted to with their next game against the Victoria Royals


## Split the dataset

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix, accuracy_score
# X = next_games_dfs[0][0].iloc[:, :-1]
# y = next_games_dfs[0][0].iloc[:, -1]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Scale the features

In [None]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

## Apply Classification Model

### Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
# This is an ensemble. (A bunch of smaller classification trees)
# Again, this has a random state
# Because our data set is so simple we will just to based on 40 trees
#   Even with a larger amount of esimators to results remain the same or worse
rfc_classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
rfc_classifier.fit(X_train, y_train)
y_pred = rfc_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

rfc_accuracy = accuracy_score(y_test, y_pred)

### Naive Bayes

In [None]:
# from sklearn.naive_bayes import GaussianNB
# # New library from sklearn
# #   Lookup sklearn Naive Bayes
# # This is non-linear and will consider probability
# nb_classifier = GaussianNB()
# nb_classifier.fit(X_train, y_train)

# y_pred = nb_classifier.predict(X_test)
# cm = confusion_matrix(y_test, y_pred)
# print(cm)

# nb_accuracy = accuracy_score(y_test, y_pred)

### Logistic Regression

In [None]:
# from sklearn.linear_model import LogisticRegression

# lg_classifier = LogisticRegression(random_state=0)
# lg_classifier.fit(X_train, y_train)

# y_pred = lg_classifier.predict(X_test)
# cm = confusion_matrix(y_test, y_pred)
# print(cm)

# lg_accuracy = accuracy_score(y_test, y_pred)

### Support Vector Machine

In [None]:
# from sklearn.svm import SVC
# # We are using the radial basis function
# # This is a non-linear function
# svc_classifier = SVC(kernel='rbf', random_state=0)
# svc_classifier.fit(X_train, y_train)

# y_pred = svc_classifier.predict(X_test)
# cm = confusion_matrix(y_test, y_pred)
# print(cm)

# svc_accuracy = accuracy_score(y_test, y_pred)

In [None]:
# count_target_win = target_df['target_win'].value_counts()[1]
# print(f"Number of times target_win is 1: {count_target_win}")

# count_target_loss = target_df['target_win'].value_counts()[0]
# print(f"Number of times target_win is 0: {count_target_loss}")

# target_total = count_target_loss + count_target_win
# print(f"Win %: {count_target_win / target_total}")
# print(f"Loss %: {count_target_loss / target_total}")

# # print(len(dataset['Home_Name'] == target_team))
# print((dataset['Away_Name'] == target_team).sum())
# print((dataset['Home_Name'] == target_team).sum())
