In [120]:
# !pip install pandas
# !pip install scikit-learn
# !pip install numpy

In [121]:
import pandas as pd
import numpy as np

In [122]:
dataset = pd.read_csv('All_teams_WHL_stats.csv')
target_team = 'Spokane Chiefs'
k = 5 # trained on the last k games

In [123]:
dataset = dataset.sort_values(by="Game_ID")
display(dataset)

Unnamed: 0.1,Unnamed: 0,Game_ID,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
5495,5495,1014692,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
5496,5496,1014699,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
5497,5497,1014708,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
5498,5498,1014720,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
5499,5499,1014735,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...,...
2568,2568,1021542,Kelowna Rockets,Prince George Cougars,3,5,0.000000,1.000000,20,34,0.464286,0.535714
618,618,1021543,Portland Winterhawks,Spokane Chiefs,6,4,0.333333,0.500000,47,30,0.677419,0.322581
3669,3669,1021544,Red Deer Rebels,Swift Current Broncos,4,5,0.250000,0.250000,33,35,0.514706,0.485294
5494,5494,1021545,Calgary Hitmen,Prince Albert Raiders,8,5,0.333333,0.250000,44,23,0.424242,0.575758


In [124]:
# Get default values for averages of goals, PP%, SOG, and FOW%
default_goals = (dataset['Home_Goals'].mean() + dataset['Away_Goals'].mean()) // 2
default_ppp = (dataset['Home_PP%'].mean() + dataset['Away_PP%'].mean()) / 2
default_sog = (dataset['Home_SOG'].mean() + dataset['Away_SOG'].mean()) // 2
default_fowp = (dataset['Home_FOW%'].mean() + dataset['Away_FOW%'].mean()) / 2

print(default_goals, default_ppp, default_sog, default_fowp)

3.0 0.22006456999060547 31.0 0.5


In [125]:
def get_opponent_stats(dataset, queue, opponent_name):
  opponent_goals = 0
  opponent_ppp = 0
  opponent_fowp = 0
  opponent_sog = 0
  # Get the last k games averages
  # This creates a new dataset of only opponent games, and which are before the current game
  opponent_games = dataset[
    ((dataset['Home_Name'] == opponent_name) | (dataset['Away_Name'] == opponent_name)) & 
    (dataset['Game_ID'] < row['Game_ID'])
  ].tail(k)

  # If the opponent hasnt played k games yet, use default
  if len(opponent_games) < k:
    opponent_goals = default_goals
    opponent_ppp = default_ppp
    opponent_fowp = default_fowp
    opponent_sog = default_sog
  else:
    # Get the mean of the entire column that the opponent played
    opponent_goals = opponent_games['Home_Goals'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_Goals']).mean()
    opponent_ppp = opponent_games['Home_PP%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_PP%']).mean()
    opponent_fowp = opponent_games['Home_FOW%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_FOW%']).mean()
    opponent_sog = opponent_games['Home_SOG'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_SOG']).mean()

  return opponent_goals, opponent_ppp, opponent_fowp, opponent_sog

In [126]:
from collections import deque
target_queue = deque(maxlen=k)
target_df = pd.DataFrame(columns = [
    "target_goals", 
    "opponent_goals", 
    "target_PP%", 
    "opponent_PP%", 
    "target_SOG", 
    "opponent_SOG",
    "target_FOW%",
    "opponent_FOW%",
    "Home/Away",
    "target_win"
])

target_goals = 0
target_ppp = 0
target_fowp = 0
target_sog = 0

# Iterate through the DataFrame to track the last 5 games
for index, row in dataset.iterrows():
    if row['Home_Name'] == target_team or row['Away_Name'] == target_team:

      side = 'Home' if row['Home_Name'] == target_team else 'Away'
      opponent_name = row['Away_Name'] if side == 'Home' else row['Home_Name']
      # now we are within the target team
      # If the queue is less than k, just add to queue
      if len(target_queue) < k:
        target_goals += row[side + '_Goals']
        target_ppp += row[side + '_PP%']
        target_fowp += row[side + '_FOW%']
        target_sog += row[side + '_SOG']
        target_queue.append(row.to_dict())
        #Maybe need to do somethine else here
      else: #If queue is full, then we can do some popping

        opponent_goals, opponent_ppp, opponent_fowp, opponent_sog = get_opponent_stats(dataset, target_queue, opponent_name)
        
        # print(target_goals / k, target_ppp / k, target_fowp / k, target_sog / k)
        # print(opponent_goals, opponent_ppp, opponent_fowp, opponent_sog)
        # Calculate average stats for the target team
        avg_target_goals = target_goals / k
        avg_target_ppp = target_ppp / k
        avg_target_fowp = target_fowp / k
        avg_target_sog = target_sog / k

        # 1 is Home, 0 is Away
        target_df.loc[len(target_df)] = {
          "target_goals": avg_target_goals,
          "opponent_goals": opponent_goals,
          "target_PP%": avg_target_ppp,
          "opponent_PP%": opponent_ppp,
          "target_SOG": avg_target_sog,
          "opponent_SOG": opponent_sog,
          "target_FOW%": avg_target_fowp,
          "opponent_FOW%": opponent_fowp,
          "Home/Away": 1 if side == 'Home' else 0,
          "target_win": 1 if row[side + '_Goals'] > row[('Away' if side == 'Home' else 'Home') + '_Goals'] else 0
        }

        popped = target_queue.popleft()
        popped_side ='Home' if popped['Home_Name'] == target_team else 'Away'
        target_goals -= popped[popped_side + '_Goals']
        target_ppp -= popped[popped_side + '_PP%']
        target_fowp -= popped[popped_side + '_FOW%']
        target_sog -= popped[popped_side + '_SOG']
        # Now get the average stats from the target team, and put them all into the target_df
        


In [127]:
display(target_df)

Unnamed: 0,target_goals,opponent_goals,target_PP%,opponent_PP%,target_SOG,opponent_SOG,target_FOW%,opponent_FOW%,Home/Away,target_win
0,3.6,3.4,0.123810,0.266667,38.4,24.2,0.455919,0.512211,0,0
1,2.8,4.2,0.106667,0.485000,38.6,31.8,0.472700,0.490992,1,1
2,3.2,4.0,0.106667,0.256667,37.4,30.2,0.495557,0.482006,1,0
3,3.4,4.0,0.156667,0.228571,34.8,35.2,0.500579,0.495670,0,0
4,2.8,3.6,0.090000,0.331667,32.0,34.4,0.485306,0.470147,0,0
...,...,...,...,...,...,...,...,...,...,...
269,5.2,3.0,0.233333,0.166667,27.8,28.6,0.470551,0.512972,0,0
270,4.8,3.2,0.273333,0.216667,29.0,33.6,0.456801,0.515239,0,0
271,4.4,2.6,0.213333,0.250000,28.2,33.4,0.466500,0.505173,1,1
272,3.8,3.6,0.246667,0.316667,29.2,31.4,0.483933,0.450503,0,0


# Machine Learning Classification

## Split the dataset

In [128]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
X = target_df.iloc[:, :-1]
y = target_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Scale the features

In [129]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

# X_train.iloc[:, :-1] = scaler.fit_transform(X_train.iloc[:, :-1])
# X_test.iloc[:, :-1] = scaler.transform(X_test.iloc[:, :-1])

## Apply Classification Model

In [130]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=3, criterion='entropy')
classifier.fit(X_train, y_train)


In [131]:
y_pred = classifier.predict(X_test)

In [132]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[23 12]
 [11  9]]


0.5818181818181818

In [133]:
count_target_win = target_df['target_win'].value_counts()[1]
print(f"Number of times target_win is 1: {count_target_win}")

count_target_loss = target_df['target_win'].value_counts()[0]
print(f"Number of times target_win is 0: {count_target_loss}")

target_total = count_target_loss + count_target_win
print(f"Win %: {count_target_win / target_total}")
print(f"Loss %: {count_target_loss / target_total}")

# print(len(dataset['Home_Name'] == target_team))
print((dataset['Away_Name'] == target_team).sum())
print((dataset['Home_Name'] == target_team).sum())


Number of times target_win is 1: 111
Number of times target_win is 0: 163
Win %: 0.4051094890510949
Loss %: 0.5948905109489051
291
261
