In [134]:
# !pip install pandas
# !pip install scikit-learn
# !pip install numpy

In [135]:
import pandas as pd
import numpy as np

In [136]:
dataset = pd.read_csv('All_teams_WHL_stats.csv')
target_team = 'Spokane Chiefs'
k = 5 # trained on the last k games

In [137]:
dataset = dataset.sort_values(by="Game_ID")
display(dataset)

Unnamed: 0.1,Unnamed: 0,Game_ID,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
5495,5495,1014692,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
5496,5496,1014699,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
5497,5497,1014708,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
5498,5498,1014720,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
5499,5499,1014735,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...,...
2568,2568,1021542,Kelowna Rockets,Prince George Cougars,3,5,0.000000,1.000000,20,34,0.464286,0.535714
618,618,1021543,Portland Winterhawks,Spokane Chiefs,6,4,0.333333,0.500000,47,30,0.677419,0.322581
3669,3669,1021544,Red Deer Rebels,Swift Current Broncos,4,5,0.250000,0.250000,33,35,0.514706,0.485294
5494,5494,1021545,Calgary Hitmen,Prince Albert Raiders,8,5,0.333333,0.250000,44,23,0.424242,0.575758


In [138]:
# Get default values for averages of goals, PP%, SOG, and FOW%
default_goals = (dataset['Home_Goals'].mean() + dataset['Away_Goals'].mean()) // 2
default_ppp = (dataset['Home_PP%'].mean() + dataset['Away_PP%'].mean()) / 2
default_sog = (dataset['Home_SOG'].mean() + dataset['Away_SOG'].mean()) // 2
default_fowp = (dataset['Home_FOW%'].mean() + dataset['Away_FOW%'].mean()) / 2

print(default_goals, default_ppp, default_sog, default_fowp)

3.0 0.22006456999060547 31.0 0.5


In [139]:
def get_opponent_stats(dataset, queue, opponent_name):
  opponent_goals = 0
  opponent_ppp = 0
  opponent_fowp = 0
  opponent_sog = 0
  # Get the last k games averages
  # This creates a new dataset of only opponent games, and which are before the current game
  opponent_games = dataset[
    ((dataset['Home_Name'] == opponent_name) | (dataset['Away_Name'] == opponent_name)) & 
    (dataset['Game_ID'] < row['Game_ID'])
  ].tail(k)

  # If the opponent hasnt played k games yet, use default
  if len(opponent_games) < k:
    opponent_goals = default_goals
    opponent_ppp = default_ppp
    opponent_fowp = default_fowp
    opponent_sog = default_sog
  else:
    # Get the mean of the entire column that the opponent played
    opponent_goals = opponent_games['Home_Goals'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_Goals']).mean()
    opponent_ppp = opponent_games['Home_PP%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_PP%']).mean()
    opponent_fowp = opponent_games['Home_FOW%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_FOW%']).mean()
    opponent_sog = opponent_games['Home_SOG'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_SOG']).mean()

  return opponent_goals, opponent_ppp, opponent_fowp, opponent_sog

In [140]:
from collections import deque
target_queue = deque(maxlen=k)
target_df = pd.DataFrame(columns = [
    "target_goals", 
    "opponent_goals", 
    "target_PP%", 
    "opponent_PP%", 
    "target_SOG", 
    "opponent_SOG",
    "target_FOW%",
    "opponent_FOW%",
    "Home/Away",
    "target_win"
])

target_goals = 0
target_ppp = 0
target_fowp = 0
target_sog = 0

# Iterate through the DataFrame to track the last 5 games
for index, row in dataset.iterrows():
    if row['Home_Name'] == target_team or row['Away_Name'] == target_team:

      side = 'Home' if row['Home_Name'] == target_team else 'Away'
      opponent_name = row['Away_Name'] if side == 'Home' else row['Home_Name']
      # now we are within the target team
      # If the queue is less than k, just add to queue
      if len(target_queue) < k:
        target_goals += row[side + '_Goals']
        target_ppp += row[side + '_PP%']
        target_fowp += row[side + '_FOW%']
        target_sog += row[side + '_SOG']
        target_queue.append(row.to_dict())
        #Maybe need to do somethine else here
      else: #If queue is full, then we can do some popping

        opponent_goals, opponent_ppp, opponent_fowp, opponent_sog = get_opponent_stats(dataset, target_queue, opponent_name)
        
        # print(target_goals / k, target_ppp / k, target_fowp / k, target_sog / k)
        # print(opponent_goals, opponent_ppp, opponent_fowp, opponent_sog)
        # Calculate average stats for the target team
        avg_target_goals = target_goals / k
        avg_target_ppp = target_ppp / k
        avg_target_fowp = target_fowp / k
        avg_target_sog = target_sog / k

        # 1 is Home, 0 is Away
        target_df.loc[len(target_df)] = {
          "target_goals": avg_target_goals,
          "opponent_goals": opponent_goals,
          "target_PP%": avg_target_ppp,
          "opponent_PP%": opponent_ppp,
          "target_SOG": avg_target_sog,
          "opponent_SOG": opponent_sog,
          "target_FOW%": avg_target_fowp,
          "opponent_FOW%": opponent_fowp,
          "Home/Away": 1 if side == 'Home' else 0,
          "target_win": 1 if row[side + '_Goals'] > row[('Away' if side == 'Home' else 'Home') + '_Goals'] else 0
        }

        popped = target_queue.popleft()
        popped_side ='Home' if popped['Home_Name'] == target_team else 'Away'
        target_goals -= popped[popped_side + '_Goals']
        target_ppp -= popped[popped_side + '_PP%']
        target_fowp -= popped[popped_side + '_FOW%']
        target_sog -= popped[popped_side + '_SOG']
        # Now get the average stats from the target team, and put them all into the target_df
        


In [141]:
display(target_df)

Unnamed: 0,target_goals,opponent_goals,target_PP%,opponent_PP%,target_SOG,opponent_SOG,target_FOW%,opponent_FOW%,Home/Away,target_win
0,4.2,3.0,0.288571,0.220065,33.6,31.0,0.523552,0.500000,0,1
1,4.0,3.8,0.221905,0.195238,34.4,33.4,0.537923,0.486391,1,1
2,5.0,3.8,0.255238,0.195238,39.0,33.4,0.539734,0.486391,1,1
3,4.8,3.8,0.288571,0.195238,38.2,33.4,0.554797,0.486391,1,1
4,4.4,3.8,0.161905,0.195238,40.2,33.4,0.560024,0.486391,1,1
...,...,...,...,...,...,...,...,...,...,...
742,4.8,2.8,0.550000,0.216667,37.0,27.6,0.543192,0.529250,1,1
743,5.0,4.6,0.533333,0.343333,36.6,38.6,0.532527,0.503171,1,1
744,5.0,3.2,0.533333,0.206667,37.2,26.8,0.494787,0.536488,1,1
745,4.4,3.2,0.333333,0.166667,34.2,28.8,0.482756,0.517612,0,1


# Machine Learning Classification

## Split the dataset

In [142]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
X = target_df.iloc[:, :-1]
y = target_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Scale the features

In [176]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train.iloc[:, :-1] = scaler.fit_transform(X_train.iloc[:, :-1])
X_test.iloc[:, :-1] = scaler.transform(X_test.iloc[:, :-1])

Unnamed: 0,target_goals,opponent_goals,target_PP%,opponent_PP%,target_SOG,opponent_SOG,target_FOW%,opponent_FOW%,Home/Away
97,-0.652053,0.251845,-0.937262,-0.519966,-0.547260,0.723886,-1.435289,-0.053163,0
516,-0.513669,0.587077,0.380476,-0.035887,-0.979070,-0.232057,-0.290605,0.642254,1
156,-0.513669,3.101317,-1.387221,1.242082,-0.314747,-1.665973,-0.445067,0.254268,0
395,-1.759130,0.419461,-1.105997,0.564371,-2.307718,0.172380,-0.840098,0.220997,1
704,0.039870,-0.418619,0.393868,-0.565147,0.316360,-0.599728,-1.559569,0.045659,1
...,...,...,...,...,...,...,...,...,...
707,0.039870,0.251845,-0.356065,0.622460,1.146765,-0.636495,-1.479189,-0.706941,1
192,0.731793,0.587077,2.174957,-1.468761,-1.244800,0.907721,-0.693113,0.751617,0
629,-0.375284,1.760389,0.650987,1.053291,-1.344448,0.834187,-0.595544,0.587660,1
559,-0.652053,-1.089083,-1.065822,-0.452195,-0.248315,-0.084989,1.028105,-0.000861,1


## Apply Classification Model

In [174]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=3, criterion='entropy')
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[59 17]
 [13 61]]


0.8

In [147]:
count_target_win = target_df['target_win'].value_counts()[1]
print(f"Number of times target_win is 1: {count_target_win}")

count_target_loss = target_df['target_win'].value_counts()[0]
print(f"Number of times target_win is 0: {count_target_loss}")

target_total = count_target_loss + count_target_win
print(f"Win %: {count_target_win / target_total}")
print(f"Loss %: {count_target_loss / target_total}")

# print(len(dataset['Home_Name'] == target_team))
print((dataset['Away_Name'] == target_team).sum())
print((dataset['Home_Name'] == target_team).sum())


Number of times target_win is 1: 402
Number of times target_win is 0: 345
Win %: 0.5381526104417671
Loss %: 0.46184738955823296
777
722
