In [None]:
import math
import pickle
import random
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import swifter
from IPython.display import HTML
from matplotlib import animation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from statsmodels.nonparametric.smoothers_lowess import lowess
%matplotlib inline

In [None]:
# Lift pandas' display limits: show every column and never truncate cell text.
for displayOption in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(displayOption, None)

In [None]:
# Download the three Big Data Cup 2021 event files, in the same order as the
# names on the left-hand side.
hackathon_womens, hackathon_scouting, hackathon_nwhl = (
    pd.read_csv(csvUrl)
    for csvUrl in (
        "https://raw.githubusercontent.com/bigdatacup/Big-Data-Cup-2021/main/hackathon_womens.csv",
        "https://raw.githubusercontent.com/bigdatacup/Big-Data-Cup-2021/main/hackathon_scouting.csv",
        "https://raw.githubusercontent.com/bigdatacup/Big-Data-Cup-2021/main/hackathon_nwhl.csv",
    )
)

In [None]:
# Keep the downloaded scouting frame pristine; everything below works on a copy.
start_data = hackathon_scouting.copy(deep=True)
start_data.shape

In [None]:
# `data` is the working frame that the feature-engineering cells mutate;
# re-running this cell resets it from `start_data`.
data = start_data.copy(deep=True)
data.head(n=5)

In [None]:
# 0/1 indicator columns derived from the event type:
#   goalScored    -> 1 only for 'Goal' events
#   shotAttempted -> 1 for 'Goal' and 'Shot' events
data['goalScored'] = data.Event.eq('Goal').astype('int64')
data['shotAttempted'] = data.Event.isin(['Goal', 'Shot']).astype('int64')
data.head()

In [None]:
def calcAngleOnGoal(data):
    """Compute the angles (in degrees) a shooter has on the goal mouth.

    The goal mouth is the segment between the posts at (189, 39.5) and
    (189, 45.5). For each post the angle is ``atan(|dy| / |dx|)`` between the
    shooter and that post, or 90 when the shooter is exactly on the goal line
    (``dx == 0``).

    Parameters
    ----------
    data : mapping-like (e.g. a DataFrame row)
        Must provide ``'X Coordinate'`` and ``'Y Coordinate'``.

    Returns
    -------
    pd.Series
        ``[angleToClosePost, angleToFarPost, goalAngle]``. ``goalAngle`` is the
        sum of the two post angles when the shooter's y lies between the posts
        and their difference (far - close) otherwise. When the shooter is
        beyond the goal line (``x > 189``) all three values are 0.
    """
    goalLineX = 189
    lowPost = (goalLineX, 39.5)
    highPost = (goalLineX, 45.5)

    x = data['X Coordinate']
    y = data['Y Coordinate']

    # Always return a Series (never a bare tuple) so DataFrame.apply(axis=1)
    # sees one consistent result type for every row.
    if x > goalLineX:
        return pd.Series([0.0, 0.0, 0.0])

    def distToPost(post):
        return np.sqrt((x - post[0]) ** 2 + (y - post[1]) ** 2)

    def angleToPost(post):
        deltaX = np.abs(x - post[0])
        deltaY = np.abs(y - post[1])
        # On the goal line the ratio is undefined; treat the angle as 90.
        if deltaX == 0:
            return 90.0
        return math.degrees(np.arctan(deltaY / deltaX))

    # Ties go to the low post, matching the `<=` comparison.
    if distToPost(lowPost) <= distToPost(highPost):
        closePost, farPost = lowPost, highPost
    else:
        closePost, farPost = highPost, lowPost

    angleToClosePost = angleToPost(closePost)
    angleToFarPost = angleToPost(farPost)

    outsidePosts = (y < lowPost[1]) or (y > highPost[1])
    if outsidePosts:
        goalAngle = angleToFarPost - angleToClosePost
    else:
        goalAngle = angleToFarPost + angleToClosePost

    return pd.Series([angleToClosePost, angleToFarPost, goalAngle])

In [None]:
# Placeholder value for rows that are not shots. The placeholders are floats
# (-1.0, not -1) because the real angles/distances written into these columns
# later are floats; writing floats into an integer column forces an implicit
# upcast that recent pandas versions warn about.
for placeholderCol in ('angleToClosePost', 'angleToFarPost', 'overallGoalAngle', 'distToGoal'):
    data[placeholderCol] = -1.0

# One row per shot/goal event; columns 0, 1, 2 hold the three values returned
# by calcAngleOnGoal for that row.
shotAngle = data[data.Event.isin(['Shot', 'Goal'])].apply(calcAngleOnGoal, axis=1)
shotAngle.head()

In [None]:
# Rows that are shot attempts (shots and goals); only these get real features.
isShotRow = data.Event.isin(['Shot', 'Goal'])

# shotAngle column k -> (raw feature column, mean-normalised feature column).
anglePairs = [
    ('angleToClosePost', 'normAngleToClosePost'),
    ('angleToFarPost', 'normAngleToFarPost'),
    ('overallGoalAngle', 'normOverallGoalAngle'),
]
for angleIdx, (rawCol, normCol) in enumerate(anglePairs):
    angleValues = shotAngle[angleIdx]
    data.loc[isShotRow, rawCol] = angleValues
    data.loc[isShotRow, normCol] = angleValues / angleValues.mean()

# Euclidean distance from the shot location to the point (189, 42.5).
shotX = data.loc[isShotRow, 'X Coordinate']
shotY = data.loc[isShotRow, 'Y Coordinate']
data.loc[isShotRow, 'distToGoal'] = np.sqrt((shotX - 189)**2 + (shotY - 42.5)**2)

shotDist = data.loc[isShotRow, 'distToGoal']
data.loc[isShotRow, 'normDistToGoal'] = shotDist / shotDist.mean()
data.loc[isShotRow].head()

In [None]:
def calcROC(thresholds, phatGoal, yactual, pRecall=False):
    """Compute ROC (or precision/recall) points over a set of thresholds.

    A prediction is classified as 1 unless it is strictly below the
    threshold. A label counts as positive when it equals 1.

    Parameters
    ----------
    thresholds : iterable of float
        Decision thresholds to evaluate, in the order the results are returned.
    phatGoal : array-like of float
        Predicted probabilities, one per observation.
    yactual : array-like
        True labels, same length as ``phatGoal``. Read positionally, so a
        pandas Series with a non-default index is fine.
    pRecall : bool, default False
        If True return ``(precisions, tpRates)``; otherwise
        ``(fpRates, tpRates)``.

    Returns
    -------
    tuple of two lists of float
        One entry per threshold. Precision is reported as 0 when nothing is
        predicted positive at a threshold (tp + fp == 0).

    Raises
    ------
    ValueError
        If ``phatGoal`` and ``yactual`` differ in length.
    ZeroDivisionError
        If ``yactual`` contains no positives or no negatives, because the
        corresponding rate has an empty denominator.
    """
    predicted = np.asarray(phatGoal)
    actual = np.asarray(yactual)
    if len(predicted) != len(actual):
        raise ValueError("phatGoal and yactual must have the same length")

    isPositive = actual == 1

    fpRates = []
    precisions = []
    tpRates = []
    for threshold in thresholds:
        yhat = np.where(predicted < threshold, 0, 1)
        isCorrect = actual == yhat

        tp = int(np.sum(isCorrect & isPositive))
        tn = int(np.sum(isCorrect & ~isPositive))
        fn = int(np.sum(~isCorrect & isPositive))
        fp = int(np.sum(~isCorrect & ~isPositive))

        fpRates.append(fp / (fp + tn))
        tpRates.append(tp / (tp + fn))
        # Precision is defined whenever something was predicted positive; the
        # guard must be on tp + fp, otherwise a threshold with no false
        # positives would wrongly report precision 0 instead of 1.
        precisions.append(tp / (tp + fp) if (tp + fp) != 0 else 0.0)

    if pRecall:
        return precisions, tpRates
    return fpRates, tpRates

In [None]:
# Columns carried into the shot-level modelling frame.
# NOTE(review): several of these (e.g. 'newClock', 'shotType', 'posTeamScore',
# 'x', 'y') are not created in the cells shown here — presumably they are
# engineered elsewhere; confirm before running on a fresh kernel.
shotModelColumns = [
    'Period', 'newClock', 'Event', 'X Coordinate', 'Y Coordinate',
    'assistedShot', 'shotType', 'shotTarget', 'shotTraffic', 'oneShot',
    'posTeamScore', 'defTeamScore', 'x', 'y',
    'angleToClosePost', 'angleToFarPost', 'overallGoalAngle', 'distToGoal',
    'normAngleToClosePost', 'normAngleToFarPost', 'normOverallGoalAngle',
    'normDistToGoal', 'goalScored',
]
shotRegr = data.loc[data.Event.isin(['Shot', 'Goal']), shotModelColumns].copy()

# Replace each categorical column by integer codes. The single encoder is
# re-fitted per column, so afterwards `lb` only remembers the last one.
lb = LabelEncoder()
for categoricalCol in ['shotType', 'shotTarget', 'oneShot', 'assistedShot', 'shotTraffic']:
    shotRegr[categoricalCol] = lb.fit_transform(shotRegr[categoricalCol])

In [None]:
def calcAUC(phat, yactual, plotROC=False):
    """Area under the ROC curve, evaluated on 100 thresholds in [0, 1].

    Parameters
    ----------
    phat : array-like of float
        Predicted probabilities of the positive class.
    yactual : array-like
        True labels, passed through to ``calcROC``.
    plotROC : bool, default False
        If True, also plot the true positive rate and the true negative rate
        (1 - false positive rate) against the threshold.

    Returns
    -------
    float
        ``auc(fpRates, tpRates)`` for the points returned by ``calcROC``.
    """
    thresholds = np.linspace(0, 1, 100)
    fpRates, tpRates = calcROC(thresholds, phat, yactual)
    if plotROC:
        # The red curve is 1 - FPR, i.e. the true negative rate, so it is
        # labelled as such rather than as the false positive rate.
        plt.plot(thresholds, [1 - rate for rate in fpRates], c='r',
                 label='True Negative Rate (1 - FPR)')
        plt.plot(thresholds, tpRates, label='True Positive Rate')
        plt.xlabel('Thresholds')
        plt.ylabel('Rate')
        plt.legend()
        plt.show()
    return auc(fpRates, tpRates)
def doCVShots(XInput, yInput, model, iters, random_state=None):
    """Repeated random 80/20 hold-out validation, scored with ``calcAUC``.

    Parameters
    ----------
    XInput, yInput : array-like
        Features and labels to split.
    model : estimator
        Object with ``fit`` and ``predict_proba``. It is fitted in place on
        every iteration, so after the call it holds the fit from the last
        split.
    iters : int
        Number of random splits to evaluate.
    random_state : int or None, default None
        If given, iteration ``i`` uses ``random_state + i`` as the split seed,
        making the set of splits reproducible. ``None`` keeps the splits
        unseeded.

    Returns
    -------
    list of float
        One AUC per iteration.
    """
    aucs = []
    for i in range(iters):
        splitSeed = None if random_state is None else random_state + i
        Xtrain, Xval, ytrain, yval = train_test_split(
            XInput, yInput, test_size=0.2, random_state=splitSeed)
        model.fit(Xtrain, ytrain)
        phat = model.predict_proba(Xval)
        # Column 1 of predict_proba is the probability of the second class.
        aucs.append(calcAUC(phat[:, 1], np.array(yval), False))
    return aucs

In [None]:
# Features used by the expected-goals model.
xgFeatureColumns = ['Period',
                    'shotType', 'oneShot', 'shotTraffic', 'posTeamScore', 'x', 'y',
                    'defTeamScore', 'normAngleToClosePost', 'normAngleToFarPost',
                    'normOverallGoalAngle', 'normDistToGoal']

# Seed the hold-out split (same seed the forests below use) so the tuning
# numbers and the final test AUC are reproducible across kernel restarts.
Xtrain3, Xtest3, ytrain3, ytest3 = train_test_split(
    shotRegr[xgFeatureColumns], shotRegr['goalScored'],
    test_size=0.2, random_state=2013)

In [None]:
# Baseline forest with default hyper-parameters.
forest3 = RandomForestClassifier(n_jobs=-1, random_state=2013)
cvAUCs = doCVShots(Xtrain3, ytrain3, forest3, 5)
meanCvAuc = sum(cvAUCs) / len(cvAUCs)
print("CV AUC:", meanCvAuc)

# doCVShots fits the model it is given, so forest3 carries the importances
# from its most recent fit.
plt.barh(y=Xtrain3.columns, width=forest3.feature_importances_)
plt.show()

In [None]:
# Sweep the number of trees. The grid ends at 1000: the trailing value was a
# second 100, which only repeated an earlier setting.
for i in [10, 20, 50, 70, 100, 250, 500, 1000]:
    newForest = RandomForestClassifier(random_state=2013, n_estimators=i, n_jobs=-1)
    cvAUCs = doCVShots(Xtrain3, ytrain3, newForest, 5)
    print("Estimators:", i, "CV AUC:", sum(cvAUCs) / len(cvAUCs))

In [None]:
# Sweep the maximum tree depth (None = unlimited) with 500 trees.
for maxDepth in (None, 5, 10, 15, 20, 30):
    newForest = RandomForestClassifier(
        n_estimators=500,
        max_depth=maxDepth,
        random_state=2013,
        n_jobs=-1,
    )
    cvAUCs = doCVShots(Xtrain3, ytrain3, newForest, 5)
    meanCvAuc = sum(cvAUCs) / len(cvAUCs)
    print("Max Depth:", maxDepth, "CV AUC:", meanCvAuc)

In [None]:
# Compare split criteria at 500 trees and depth 5.
for splitCriterion in ('gini', 'entropy'):
    newForest = RandomForestClassifier(
        n_estimators=500,
        max_depth=5,
        criterion=splitCriterion,
        random_state=2013,
        n_jobs=-1,
    )
    cvAUCs = doCVShots(Xtrain3, ytrain3, newForest, 5)
    meanCvAuc = sum(cvAUCs) / len(cvAUCs)
    print("Criterion:", splitCriterion, "CV AUC:", meanCvAuc)

In [None]:
# Sweep the number of features considered per split.
# 'auto' is no longer accepted by current scikit-learn releases and, for
# classifiers, meant the same as 'sqrt', so it is dropped from the grid.
for i in ['sqrt', 'log2']:
    newForest = RandomForestClassifier(random_state=2013, n_estimators=500, max_depth=5, criterion='entropy',
                                       max_features=i, n_jobs=-1)
    cvAUCs = doCVShots(Xtrain3, ytrain3, newForest, 5)
    print("Max Features:", i, "CV AUC:", sum(cvAUCs) / len(cvAUCs))

In [None]:
# Sweep the minimum number of samples required to split a node.
for minSplit in (2, 3, 4, 7, 10):
    newForest = RandomForestClassifier(
        n_estimators=500,
        max_depth=5,
        criterion='entropy',
        max_features='log2',
        min_samples_split=minSplit,
        random_state=2013,
        n_jobs=-1,
    )
    cvAUCs = doCVShots(Xtrain3, ytrain3, newForest, 5)
    meanCvAuc = sum(cvAUCs) / len(cvAUCs)
    print("Min Samples Split:", minSplit, "CV AUC:", meanCvAuc)

In [None]:
# Fit the chosen configuration on the training split and score the hold-out set.
finalExpectedGoalModel = RandomForestClassifier(random_state=2013, n_estimators=70, max_depth=5, criterion='gini',
                                                max_features='log2', min_samples_split=3, n_jobs=-1)
finalExpectedGoalModel.fit(Xtrain3, ytrain3)
phat3 = finalExpectedGoalModel.predict_proba(Xtest3)
aucVal = calcAUC(phat3[:, 1], np.array(ytest3), True)

thresholds = np.linspace(0, 1, 100)
precisions, tpRates = calcROC(thresholds, phat3[:, 1], np.array(ytest3), True)
# The geometric mean below needs the false positive rates, which the
# precision/recall call does not return, so fetch the ROC points explicitly.
fpRates, tpRates = calcROC(thresholds, phat3[:, 1], np.array(ytest3))

# Pick the threshold maximising sqrt(TPR * (1 - FPR)); argmax returns the
# first maximum.
gMeans = np.sqrt(np.array(tpRates) * (1 - np.array(fpRates)))
bestThreshLoc = int(np.argmax(gMeans))
print("Final AUC:", aucVal, 'Best Threshold:', thresholds[bestThreshLoc])