In [154]:
# !pip install pandas
# !pip install scikit-learn
# !pip install numpy

In [155]:
import pandas as pd
import numpy as np

In [156]:
dataset = pd.read_csv('All_teams_WHL_stats.csv')
target_team = 'Spokane Chiefs'
k = 5 # trained on the last k games

In [157]:
dataset = dataset.sort_values(by="Game_ID")
dataset.reset_index(drop=True, inplace=True)
display(dataset)



Unnamed: 0.1,Unnamed: 0,Game_ID,Home_Name,Away_Name,Home_Goals,Away_Goals,Home_PP%,Away_PP%,Home_SOG,Away_SOG,Home_FOW%,Away_FOW%
0,5302,1014692,Prince George Cougars,Calgary Hitmen,7,1,0.285714,0.250000,34,17,0.433333,0.566667
1,5303,1014699,Kamloops Blazers,Calgary Hitmen,2,4,0.000000,0.000000,32,56,0.492958,0.507042
2,5304,1014708,Kelowna Rockets,Calgary Hitmen,3,4,0.166667,0.000000,34,26,0.461538,0.538462
3,5305,1014720,Calgary Hitmen,Lethbridge Hurricanes,1,3,0.000000,0.285714,31,37,0.500000,0.500000
4,5306,1014735,Red Deer Rebels,Calgary Hitmen,4,2,0.000000,0.000000,31,26,0.576271,0.423729
...,...,...,...,...,...,...,...,...,...,...,...,...
5593,2449,1021542,Kelowna Rockets,Prince George Cougars,3,5,0.000000,1.000000,20,34,0.464286,0.535714
5594,499,1021543,Portland Winterhawks,Spokane Chiefs,6,4,0.333333,0.500000,47,30,0.677419,0.322581
5595,3550,1021544,Red Deer Rebels,Swift Current Broncos,4,5,0.250000,0.250000,33,35,0.514706,0.485294
5596,5301,1021545,Calgary Hitmen,Prince Albert Raiders,8,5,0.333333,0.250000,44,23,0.424242,0.575758


In [158]:
# Get default values for averages of goals, PP%, SOG, and FOW%
default_goals = (dataset['Home_Goals'].mean() + dataset['Away_Goals'].mean()) // 2
default_ppp = (dataset['Home_PP%'].mean() + dataset['Away_PP%'].mean()) / 2
default_sog = (dataset['Home_SOG'].mean() + dataset['Away_SOG'].mean()) // 2
default_fowp = (dataset['Home_FOW%'].mean() + dataset['Away_FOW%'].mean()) / 2

print(default_goals, default_ppp, default_sog, default_fowp)

3.0 0.21539831144600702 31.0 0.5


In [159]:
def get_opponent_stats(dataset, queue, opponent_name):
  opponent_goals = 0
  opponent_ppp = 0
  opponent_fowp = 0
  opponent_sog = 0
  # Get the last k games averages
  # This creates a new dataset of only opponent games, and which are before the current game
  opponent_games = dataset[
    ((dataset['Home_Name'] == opponent_name) | (dataset['Away_Name'] == opponent_name)) & 
    (dataset['Game_ID'] < row['Game_ID'])
  ].tail(k)

  # If the opponent hasnt played k games yet, use default
  if len(opponent_games) < k:
    opponent_goals = default_goals
    opponent_ppp = default_ppp
    opponent_fowp = default_fowp
    opponent_sog = default_sog
  else:
    # Get the mean of the entire column that the opponent played
    opponent_goals = opponent_games['Home_Goals'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_Goals']).mean()
    opponent_ppp = opponent_games['Home_PP%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_PP%']).mean()
    opponent_fowp = opponent_games['Home_FOW%'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_FOW%']).mean()
    opponent_sog = opponent_games['Home_SOG'].where(opponent_games['Home_Name'] == opponent_name, opponent_games['Away_SOG']).mean()

  return opponent_goals, opponent_ppp, opponent_fowp, opponent_sog

In [160]:
from collections import deque
target_queue = deque(maxlen=k)
target_df = pd.DataFrame(columns = [
    "target_goals", 
    "opponent_goals", 
    "target_PP%", 
    "opponent_PP%", 
    "target_SOG", 
    "opponent_SOG",
    "target_FOW%",
    "opponent_FOW%",
    "Home/Away",
    "target_win"
])

target_goals = 0
target_ppp = 0
target_fowp = 0
target_sog = 0

# Iterate through the DataFrame to track the last 5 games
for index, row in dataset.iterrows():
    if row['Home_Name'] == target_team or row['Away_Name'] == target_team:

      side = 'Home' if row['Home_Name'] == target_team else 'Away'
      opponent_name = row['Away_Name'] if side == 'Home' else row['Home_Name']
      # now we are within the target team
      # If the queue is less than k, just add to queue
      target_goals += row[side + '_Goals']
      target_ppp += row[side + '_PP%']
      target_fowp += row[side + '_FOW%']
      target_sog += row[side + '_SOG']
     
      
      if len(target_queue) < k:
        target_queue.append(row.to_dict())
      else: #If queue is full, then we can do some popping

        opponent_goals, opponent_ppp, opponent_fowp, opponent_sog = get_opponent_stats(dataset, target_queue, opponent_name)
        
        #print(target_goals / k, target_ppp / k, target_fowp / k, target_sog / k)
        # print(opponent_goals, opponent_ppp, opponent_fowp, opponent_sog)
        # Calculate average stats for the target team
        avg_target_goals = target_goals / k
        avg_target_ppp = target_ppp / k
        avg_target_fowp = target_fowp / k
        avg_target_sog = target_sog / k

        # 1 is Home, 0 is Away
        target_df.loc[len(target_df)] = {
          "target_goals": avg_target_goals,
          "opponent_goals": opponent_goals,
          "target_PP%": avg_target_ppp,
          "opponent_PP%": opponent_ppp,
          "target_SOG": avg_target_sog,
          "opponent_SOG": opponent_sog,
          "target_FOW%": avg_target_fowp,
          "opponent_FOW%": opponent_fowp,
          "Home/Away": 1 if side == 'Home' else 0,
          "target_win": 1 if row[side + '_Goals'] > row[('Away' if side == 'Home' else 'Home') + '_Goals'] else 0
        }

        popped = target_queue.popleft()
        popped_side ='Home' if popped['Home_Name'] == target_team else 'Away'
        target_goals -= popped[popped_side + '_Goals']
        target_ppp -= popped[popped_side + '_PP%']
        target_fowp -= popped[popped_side + '_FOW%']
        target_sog -= popped[popped_side + '_SOG']

        target_queue.append(row.to_dict())
        
        


In [161]:
display(target_df)

Unnamed: 0,target_goals,opponent_goals,target_PP%,opponent_PP%,target_SOG,opponent_SOG,target_FOW%,opponent_FOW%,Home/Away,target_win
0,5.0,3.0,0.321905,0.215398,43.2,31.0,0.626410,0.500000,0,1
1,4.8,3.8,0.255238,0.195238,44.0,33.4,0.640780,0.486391,1,1
2,5.6,4.0,0.255238,0.328571,46.8,30.0,0.615465,0.486960,0,0
3,5.2,4.8,0.305238,0.383333,46.6,31.8,0.622078,0.452677,1,1
4,4.4,4.2,0.195238,0.316667,49.4,31.0,0.615668,0.474982,0,0
...,...,...,...,...,...,...,...,...,...,...
496,6.0,2.4,0.399048,0.183333,42.0,27.6,0.615378,0.450405,0,1
497,5.6,3.2,0.353333,0.166667,44.2,28.8,0.604702,0.517612,0,1
498,4.8,3.0,0.420000,0.206667,41.4,33.2,0.598011,0.483446,1,0
499,4.2,4.4,0.440000,0.293810,39.4,34.4,0.538997,0.525082,0,0


# Machine Learning Classification

## Split the dataset

In [162]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
X = target_df.iloc[:, :-1]
y = target_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Scale the features

In [163]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train.iloc[:, :-1] = scaler.fit_transform(X_train.iloc[:, :-1])
X_test.iloc[:, :-1] = scaler.transform(X_test.iloc[:, :-1])

## Apply Classification Model

In [164]:
from sklearn.ensemble import RandomForestClassifier
# This is an ensemble. (A bunch of smaller classification trees)
# Again, this has a random state
# Because our data set is so simple we will just to based on 40 trees
#   Even with a larger amount of esimators to results remain the same or worse
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[41 16]
 [15 29]]


0.693069306930693

In [165]:
from sklearn.naive_bayes import GaussianNB
# New library from sklearn
#   Lookup sklearn Naive Bayes
# This is non-linear and will consider probability
classifier = GaussianNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[30 27]
 [ 7 37]]


0.6633663366336634

In [166]:
count_target_win = target_df['target_win'].value_counts()[1]
print(f"Number of times target_win is 1: {count_target_win}")

count_target_loss = target_df['target_win'].value_counts()[0]
print(f"Number of times target_win is 0: {count_target_loss}")

target_total = count_target_loss + count_target_win
print(f"Win %: {count_target_win / target_total}")
print(f"Loss %: {count_target_loss / target_total}")

# print(len(dataset['Home_Name'] == target_team))
print((dataset['Away_Name'] == target_team).sum())
print((dataset['Home_Name'] == target_team).sum())


Number of times target_win is 1: 232
Number of times target_win is 0: 269
Win %: 0.4630738522954092
Loss %: 0.5369261477045908
261
245
