# Load Data and Clean 

In [46]:
# How it works:
# After calling "calculate_averages" you have the teamAveragesList list.
# This is a list where each element in the list is a "TeamAverages" object.
# There is one of these objects for each team, so the list has like 40 elements or whatever.
# The object has a teamName, and then a bunch of lists holding data.
# Each element in these lists represents a game, and holds the AVERAGE team performance up until and including that game (minus some exceptions)
# For example, if the team name is "Calgary Flames" and you look at the 10th element in the "GameGDs" list, you will find the avg GD of that team for the first 10 games.
# Finally, if you call the "retrieveGameAverages" function with a team name, a date (datetime object) and the list of team objects, it will return two dictionaries.
# The first dictionary holds the specific averages for that team on the given date. The second holds the specific averages for the opponent team on the given date.

import sys
import pandas as pd
import numpy as np
import datetime

gameData = pd.DataFrame()
class TeamAverages:
    teamName = ""
    gameOpponents = []
    gameDates = []
    gameLocations = []
    gameResults = []
    gamePoints = []
    gameGDs = []
    gameSDs = []
    gameSHs = []
    gameFOs = []
    gameSVs = []

    def __init__(self, name):
        self.teamName = name
        self.gameOpponents = []
        self.gameDates = []
        self.gameLocations = []
        self.gameResults = []
        self.gamePoints = []
        self.gameGDs = []
        self.gameSDs = []
        self.gameSHs = []
        self.gameFOs = []
        self.gameSVs = []
    
    def __eq__(self, other):
        return self.teamName == other

def main():
    filename = "GAME_LOGS.xlsx"
    gameSheets = pd.read_excel(filename, sheet_name=None)
    teamAveragesList = calculate_averages(gameSheets)

    #FOR TESTING    
    getGameData(teamAveragesList)
    gameData = createDataset(teamAveragesList)
    
    print(gameData)

def getGameData(teamAveragesList):
  global gameData
  gameData = createDataset(teamAveragesList)

def calculate_averages(teams):
    teamAveragesList = []
    for teamName in teams:
        teamDataGram = teams[teamName]
        teamDataGram = teamDataGram.dropna()
        newTeam = TeamAverages(teamName)
        totalGames = len(teamDataGram["Rank"])
        datesList = teamDataGram["Date"].tolist()
        opponentsList = teamDataGram["Opponent"].tolist()
        locationsList = convertLocationList(teamDataGram["Location"].tolist())
        resultsList = convertResultList(teamDataGram["Result"].tolist())
        pointsList = teamDataGram["Points"].tolist()
        gdList = teamDataGram["GD"].tolist()
        sdList = teamDataGram["SD"].tolist()
        shList = teamDataGram["SH%"].tolist()
        foList = teamDataGram["FO%"].tolist()
        svList = teamDataGram["SV%"].tolist()
        totalPoints = 0.0
        totalGD = 0.0
        totalSD = 0.0
        totalSH = 0.0 
        totalFO = 0.0
        totalSV = 0.0

        for gameNum in range(totalGames):
            totalPoints += pointsList[gameNum]
            totalGD += gdList[gameNum]
            totalSD += sdList[gameNum]
            totalSH += shList[gameNum]
            totalFO += foList[gameNum]
            totalSV += svList[gameNum]
            newTeam.gameDates.append(datesList[gameNum])
            newTeam.gameOpponents.append(opponentsList[gameNum])
            newTeam.gameLocations.append(locationsList[gameNum])
            newTeam.gameResults.append(resultsList[gameNum])
            newTeam.gamePoints.append((totalPoints / (gameNum + 1) / 2))
            newTeam.gameGDs.append(totalGD / (gameNum + 1))
            newTeam.gameSDs.append(totalSD / (gameNum + 1))
            newTeam.gameSHs.append(totalSH / (gameNum + 1))
            newTeam.gameFOs.append(totalFO / (gameNum + 1))
            newTeam.gameSVs.append(totalSV / (gameNum + 1))
        teamAveragesList.append(newTeam)
    return teamAveragesList

def convertResultList(resultsList):
    binaryList = []
    for result in resultsList:
        if (result == "Win" or result == "OT Win" or result == "SO Win"):
            binaryList.append(1)
        else:
            binaryList.append(0)
    return binaryList

def convertLocationList(locationsList):
    binaryList = []
    for location in locationsList:
        if (location == "Home"):
            binaryList.append(1)
        else:
            binaryList.append(0)
    return binaryList

def retrieveGameAverages(teamName, date, teamAveragesList):
    team = teamAveragesList[teamAveragesList.index(teamName)]
    dateIndex = team.gameDates.index(date)
    opponentTeamName = team.gameOpponents[dateIndex]
    opponentTeam = teamAveragesList[teamAveragesList.index(opponentTeamName)]
    opponentDateIndex = opponentTeam.gameDates.index(date)
    teamAverages = {"teamName": teamName}
    teamAverages["gameDate"] = date
    teamAverages["gameLocation"] = team.gameLocations[dateIndex]
    teamAverages["gameResult"] = team.gameResults[dateIndex]
    teamAverages["gamePoints"] = team.gamePoints[dateIndex]
    teamAverages["gameGD"] = team.gameGDs[dateIndex]
    teamAverages["gameSD"] = team.gameSDs[dateIndex]
    teamAverages["gameSH"] = team.gameSHs[dateIndex]
    teamAverages["gameFO"] = team.gameFOs[dateIndex]
    teamAverages["gameSV"] = team.gameSVs[dateIndex]
    opponentAverages = {"teamName": opponentTeamName}
    opponentAverages["gameDate"] = date
    opponentAverages["gameLocation"] = opponentTeam.gameLocations[opponentDateIndex]
    opponentAverages["gameResult"] = opponentTeam.gameResults[opponentDateIndex]
    opponentAverages["gamePoints"] = opponentTeam.gamePoints[opponentDateIndex]
    opponentAverages["gameGD"] = opponentTeam.gameGDs[opponentDateIndex]
    opponentAverages["gameSD"] = opponentTeam.gameSDs[opponentDateIndex]
    opponentAverages["gameSH"] = opponentTeam.gameSHs[opponentDateIndex]
    opponentAverages["gameFO"] = opponentTeam.gameFOs[opponentDateIndex]
    opponentAverages["gameSV"] = opponentTeam.gameSVs[opponentDateIndex]
    return teamAverages, opponentAverages


def createDataset(teamAverages):

    gameObject = [] #Will be list of tuples where each tuple is a row in the DataFrame
    games = [] 
    dates = []

    for teamAvg in teamAverages:
      for date in teamAvg.gameDates:
        if date not in dates:
          dates.append(date)

    
    for date in dates:
      for teamAvg in teamAverages:
        if date in teamAvg.gameDates:
          games.append(retrieveGameAverages(teamAvg.teamName, date, teamAverages))

    for game in games:  
      if (game[0]['gameLocation'] == 1) :
        homeTeam = game[0]
        awayTeam = game[1]
      else:
        awayTeam = game[0]
        homeTeam = game[1]

      goalDiff = homeTeam['gameGD']
      shotDiff = homeTeam['gameSD']

      shotPercentDiff = homeTeam['gameSH'] - awayTeam['gameSH']
      foPercentDiff = homeTeam['gameFO'] - awayTeam['gameFO']
      savePercentDiff = homeTeam['gameSV'] - awayTeam['gameSV']
      pointsPercentDiff = homeTeam['gamePoints'] - awayTeam['gamePoints']
      homeResult = homeTeam['gameResult']

      gameTuple = (homeTeam['teamName'], awayTeam['teamName'], homeTeam['gameDate'], goalDiff, shotDiff, shotPercentDiff, foPercentDiff, savePercentDiff, pointsPercentDiff, homeResult)
      gameObject.append(gameTuple)




    
    dfObj = pd.DataFrame(gameObject,columns=['HomeTeam', 'AwayTeam', 'Date', 'GoalDiff', 'ShotDiff','ShotPercentDiff','foPercentDiff','savePercentDiff','pointsPercentDiff','HomeTeamResult'])


    return dfObj
    

    

main()


                 HomeTeam           AwayTeam  ... pointsPercentDiff  HomeTeamResult
0           Anaheim Ducks    Arizona Coyotes  ...          1.000000               1
1           Anaheim Ducks    Arizona Coyotes  ...          1.000000               1
2            Dallas Stars      Boston Bruins  ...         -1.000000               0
3     Pittsburgh Penguins     Buffalo Sabres  ...         -1.000000               0
4      Colorado Avalanche     Calgary Flames  ...          1.000000               1
...                   ...                ...  ...               ...             ...
2159      Ottawa Senators   New York Rangers  ...          0.000000               1
2160      Ottawa Senators   New York Rangers  ...          0.000000               1
2161  Pittsburgh Penguins  New Jersey Devils  ...          0.180124               1
2162  Pittsburgh Penguins    Ottawa Senators  ...          0.204167               1
2163  Pittsburgh Penguins    Ottawa Senators  ...          0.204167         

calculate teams overall states 

GD = Goal Differential 

SD = Standard Deviation 

SH = Total Shots By Athlete 

FO = Percentage of Shots for that team that were Goals 

SV = Percentage of Shots against that team that were not Goals




# Logical Regression 


Calculate average score excluding the other team they are facing 

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import KFold, cross_val_score


from sklearn.model_selection import train_test_split
from sklearn import metrics
#from sklearn.metrics import accuracy_score
#from sklearn.metrics import confusion_matrix, classification_report
#import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [31]:
inputs = gameData.drop(columns=['Date','HomeTeam','AwayTeam','HomeTeamResult'])

target = gameData.HomeTeamResult

X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2, random_state=1, shuffle=False)#inputs = X, target = y

#clf = LogisticRegression(random_state=1, solver='lbfgs', max_iter=10000, penalty="l2")
clf = LogisticRegression(random_state=1)
#solver-newton-cg_c-2_max_iter-1000

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)


print('Logical Regression')
#Training Results
print(f'Train Accuracy: {clf.score(X_train,y_train)}');
print(confusion_matrix(y_train,y_pred_train))
print(classification_report(y_train,y_pred_train));


#Testing Results
print(f'Test Accuracy: {clf.score(X_test,y_test)}');
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred));

Logical Regression
Train Accuracy: 0.6239168110918544
[[462 363]
 [288 618]]
              precision    recall  f1-score   support

           0       0.62      0.56      0.59       825
           1       0.63      0.68      0.66       906

    accuracy                           0.62      1731
   macro avg       0.62      0.62      0.62      1731
weighted avg       0.62      0.62      0.62      1731

Test Accuracy: 0.628175519630485
[[108  77]
 [ 84 164]]
              precision    recall  f1-score   support

           0       0.56      0.58      0.57       185
           1       0.68      0.66      0.67       248

    accuracy                           0.63       433
   macro avg       0.62      0.62      0.62       433
weighted avg       0.63      0.63      0.63       433



# Hyperparameter

Hyperparemeter to find the best params for our model 


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

def logical_regression_grid_search(data, target, nfolds):

    param = { 
        'solver':['newton-cg', 'lbfgs', 'liblinear','sag', 'saga'],
        'C': [0.001,.01,.1,1,2,3,5,10],
        'penalty': ['l2','l1','elasticnet'],
        'max_iter':[100,1000,3000], 
        'multi_class': ['auto', 'ovr', 'multinomial'], 
        'class_weight':['dict', 'balanced']
    }
    LogReg=LogisticRegression()
    
    #gridsearch has built in kfold, could choose to implment if wanted 
    #kfold = KFold(5, True, 1) 
    #cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    gridsearch = GridSearchCV(LogReg, param, cv=nfolds, scoring='accuracy', error_score=0,verbose=1)
    gridsearch.fit(data, target)

    return gridsearch

hype_results = logical_regression_grid_search(X_train, y_train, 10)

Fitting 10 folds for each of 2160 candidates, totalling 21600 fits


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None, got: 'dict'

ValueError: class_weight must be dict, 'balanced', or None,

In [None]:
c

Best: 0.633652 using {'C': 5, 'class_weight': 'balanced', 'max_iter': 1000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'sag'}
Accuracy 0.559185 (0.034427) with: {'C': 0.001, 'class_weight': 'dict', 'max_iter': 100, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy 0.559185 (0.034427) with: {'C': 0.001, 'class_weight': 'dict', 'max_iter': 100, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy 0.559185 (0.034427) with: {'C': 0.001, 'class_weight': 'dict', 'max_iter': 100, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'sag'}
Accuracy 0.559763 (0.034204) with: {'C': 0.001, 'class_weight': 'dict', 'max_iter': 100, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'saga'}
Accuracy 0.523397 (0.002728) with: {'C': 0.001, 'class_weight': 'dict', 'max_iter': 100, 'multi_class': 'auto', 'penalty': 'l1', 'solver': 'saga'}
Accuracy 0.559185 (0.034427) with: {'C': 0.001, 'class_weight': 'dict', 'max_iter': 100, 'multi_class': 'ovr', 'penalty'

Testing out recommendations 

In [None]:
hype_results.best_estimator_

LogisticRegression(C=5, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
hype_results.best_score_

0.6336522490199987

In [None]:
hype_results.n_splits_

10

In [18]:
# Normalizing continuous variables

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range = (0,1))

scaler.fit(X_train)
X_train_fit = scaler.transform(X_train)
X_test_fit = scaler.transform(X_test)



In [57]:
#test_results = LogisticRegression(random_state=1, solver='saga', max_iter=100, penalty="none", C=0.001)

#test_results_2 = LogisticRegression(random_state=1, solver='liblinear', max_iter=100, penalty="l1", C=1) 

test_results_3 = LogisticRegression(C=10, class_weight='balanced', max_iter=3000, multi_class='multinomial', penalty='l1', solver='saga') #one to bet 

''' 
test_results_3.fit(X_train_fit, y_train)
y_pred = test_results_3.predict(X_test_fit)

#Train Accuracy: 0.6314269208549971
#Test Accuracy: 0.6304849884526559
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred));
print(f'Train Accuracy: {test_results_3.score(X_train_fit,y_train)}');
print(f'Test Accuracy: {test_results_3.score(X_test_fit,y_test)}');

''' 
test_results_3.fit(X_train, y_train)
y_pred = test_results_3.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred));
print(f'Train Accuracy: {test_results_3.score(X_train,y_train)}');
print(f'Test Accuracy: {test_results_3.score(X_test,y_test)}');


[[120  65]
 [ 94 154]]
              precision    recall  f1-score   support

           0       0.56      0.65      0.60       185
           1       0.70      0.62      0.66       248

    accuracy                           0.63       433
   macro avg       0.63      0.63      0.63       433
weighted avg       0.64      0.63      0.63       433

Train Accuracy: 0.634315424610052
Test Accuracy: 0.6327944572748267


In [None]:
'C': 5, 'class_weight': 'balanced', 'max_iter': 100, 'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear' 

# Predicting the outcome of the Game

In [11]:
import sys
from termcolor import colored, cprint

text = colored('WIN!', 'green')
vs = colored('VS', 'red')

#CREATE A TABLE OF OUTCOMES WIN/LOSS THAT CORRESPONDS TO 1/0
game_outcomes = np.where(y_pred==1, "Team 1 wins","Team 2 wins")

#GET THE TEAMS FOR THE TEST DATA (LAST 20%)
team_1 = gameData.HomeTeam[1730:2163]
team_2 = gameData.AwayTeam[1730:2163]

#STACK THE COLUNS ON TOP OF EACH OTHER
games = np.vstack((team_1, team_2)).T
results = np.column_stack((games,game_outcomes))

#DISPLAY THE RESULTS
from termcolor import colored, cprint
for c in results[0:7]:
    if(c[2] == "Team 2 wins"):
      print(c[0] + " " +  vs + " " + c[1] + " = " + colored(c[1], 'green') + " " + text)
      print()
    else:
      print(c[0] + " " +  vs + " "  + c[1] + " = " + colored(c[0], 'green')+ " " +  text)
      print()

Ottawa Senators [31mVS[0m Montreal Canadiens = [32mOttawa Senators[0m [32mWIN![0m

Nashville Predators [31mVS[0m Columbus Blue Jackets = [32mColumbus Blue Jackets[0m [32mWIN![0m

New Jersey Devils [31mVS[0m Washington Capitals = [32mNew Jersey Devils[0m [32mWIN![0m

New York Rangers [31mVS[0m San Jose Sharks = [32mSan Jose Sharks[0m [32mWIN![0m

Ottawa Senators [31mVS[0m Montreal Canadiens = [32mOttawa Senators[0m [32mWIN![0m

Philadelphia Flyers [31mVS[0m Winnipeg Jets = [32mPhiladelphia Flyers[0m [32mWIN![0m

Pittsburgh Penguins [31mVS[0m Buffalo Sabres = [32mPittsburgh Penguins[0m [32mWIN![0m



**cool team implmentation guesser**



**K-fold validation (Anar Implementation)**

In [None]:
from sklearn.model_selection import cross_val_score


solvers = ['newton-cg', 'lbfgs', 'liblinear','sag', 'saga']
C_params = [0.001,.01,.1,1,2,3,5,10]
#penalties = ['l2','elasticnet']
max_iterations = [1000,1500,2000]



avg_kfcv_scores = {}
for s in solvers:
  for c in C_params:
    for m in max_iterations:
      model=LogisticRegression(C=c, solver=s,max_iter=m) #pass params
      kfcv_scores = cross_val_score(model,X_train, y_train,cv=5)
      avg_kfcv_scores["solver-"+s+"_c-"+str(c)+"_max_iter-"+str(m)] = np.average(kfcv_scores)


max_value = max(avg_kfcv_scores.values())  # maximum value
max_keys = [k for k, v in avg_kfcv_scores.items() if v == max_value] # getting all keys containing the maximum

print(max_value, max_keys)


#we use thezz value of k=5 to have 80/20 split which is the most recommended one.
# so 1 of the 5 sets created is used as a training set




0.632564841498559 ['solver-newton-cg_c-2_max_iter-1000', 'solver-newton-cg_c-2_max_iter-1500', 'solver-newton-cg_c-2_max_iter-2000', 'solver-newton-cg_c-3_max_iter-1000', 'solver-newton-cg_c-3_max_iter-1500', 'solver-newton-cg_c-3_max_iter-2000', 'solver-lbfgs_c-2_max_iter-1000', 'solver-lbfgs_c-2_max_iter-1500', 'solver-lbfgs_c-2_max_iter-2000', 'solver-lbfgs_c-3_max_iter-1000', 'solver-lbfgs_c-3_max_iter-1500', 'solver-lbfgs_c-3_max_iter-2000', 'solver-liblinear_c-3_max_iter-1000', 'solver-liblinear_c-3_max_iter-1500', 'solver-liblinear_c-3_max_iter-2000', 'solver-sag_c-3_max_iter-1000', 'solver-sag_c-3_max_iter-1500', 'solver-sag_c-3_max_iter-2000']


# Ploting
