In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
fig_width = 5.5
fig_height = 3.5
fig_format = 'pdf'
fig_dpi = 300

# matplotlib defaults / format
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected"
except Exception:
  pass

# enable pandas latex repr when targeting pdfs
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass



# output kernel dependencies
kernel_deps = dict()
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  path = getattr(module, "__file__", None)
  if not path:
    continue
  if path.endswith(".pyc") or path.endswith(".pyo"):
    path = path[:-1]
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
if r'/Users/giovanni-lunetta/uconn_masters/hockey/hockey_repo/presentations':
  os.chdir(r'/Users/giovanni-lunetta/uconn_masters/hockey/hockey_repo/presentations')

# reset state
%reset

def ojs_define(**kwargs):
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except:
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(v):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return v
    if type(v) == pd.Series:
      v = pd.DataFrame(v)
    if type(v) == pd.DataFrame:
      j = json.loads(v.T.to_json(orient='split'))
      return dict((k,v) for (k,v) in zip(j["index"], j["data"]))
    else:
      return v
  
  v = dict(contents=list(dict(name=key, value=convert(value)) for (key, value) in kwargs.items()))
  display(HTML('<script type="ojs-define">' + json.dumps(v) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


  set_matplotlib_formats(fig_format)




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

df = pd.read_csv('/Users/giovanni-lunetta/uconn_masters/hockey/hockey_repo/data/final_data.csv', low_memory=False)


pd.set_option('display.max_columns', None)
df.sample(5)

Unnamed: 0,Date,Opponent,Team_Total_UConn_Score,Team_Total_Opponent_Score,Home,Win,Team_Score_On_PP,Team_PP_Faceoffs_won_in_OZ_percent,Team_PP_Faceoffs_won_percent,Team_Shots_OT_Behind_Net,Team_Shots_Behind_Net,Team_Goals_Behind_Net,Team_Shots_OT_at_Net,Team_Shots_at_Net,Team_Goals_at_Net,Team_Shots_OT_Left_Close,Team_Shots_Left_Close,Team_Goals_Left_Close,Team_Shots_OT_Center_Close,Team_Shots_Center_Close,Team_Goals_Center_Close,Team_Shots_OT_Right_Close,Team_Shots_Right_Close,Team_Goals_Right_Close,Team_Shots_OT_Left_Far,Team_Shots_Left_Far,Team_Goals_Left_Far,Team_Shots_OT_Center_Far,Team_Shots_Center_Far,Team_Goals_Center_Far,Team_Shots_OT_Right_Far,Team_Shots_Right_Far,Team_Goals_Right_Far,Team_Shots_OT_Not_in_Offensive_Zone,Team_Shots_Not_in_Offensive_Zone,Team_Goals_Not_in_Offensive_Zone,Team_PP_OZ_possession_percent,Team_PP_NZ_possession_percent,Team_PP_DZ_possession_percent
30,2022-10-01,Vermont Catamounts,4.0,1.0,0,1,1,62.5,62.5,0,0,0,1,1,1,1,1,1,3,6,0,2,2,0,0,0,0,0,5,0,1,2,0,0,0,0,61.83,7.68,15.35
64,2021-02-09,UMass-Lowell River Hawks,2.0,3.0,0,0,0,50.0,50.0,1,1,0,1,1,0,1,3,0,2,5,0,0,0,0,1,1,0,0,2,0,1,1,0,0,0,0,58.61,9.17,14.72
21,2022-11-05,Maine Black Bears,3.0,2.0,1,1,0,71.4,75.0,0,0,0,0,0,0,0,2,0,2,3,0,3,4,0,0,0,0,1,2,0,0,0,0,0,0,0,54.17,6.11,16.39
70,2021-01-16,New Hampshire Wildcats,8.0,3.0,0,1,1,76.9,71.4,0,0,0,4,4,3,1,1,1,5,7,0,2,4,0,0,0,0,1,8,0,0,3,0,0,0,0,71.68,4.24,6.55
35,2022-02-26,Northeastern Huskies,2.0,5.0,1,0,1,50.0,57.1,0,0,0,1,2,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,50.48,6.01,9.86


In [3]:
df.describe()

Unnamed: 0,Team_Total_UConn_Score,Team_Total_Opponent_Score,Home,Win,Team_Score_On_PP,Team_PP_Faceoffs_won_in_OZ_percent,Team_PP_Faceoffs_won_percent,Team_Shots_OT_Behind_Net,Team_Shots_Behind_Net,Team_Goals_Behind_Net,Team_Shots_OT_at_Net,Team_Shots_at_Net,Team_Goals_at_Net,Team_Shots_OT_Left_Close,Team_Shots_Left_Close,Team_Goals_Left_Close,Team_Shots_OT_Center_Close,Team_Shots_Center_Close,Team_Goals_Center_Close,Team_Shots_OT_Right_Close,Team_Shots_Right_Close,Team_Goals_Right_Close,Team_Shots_OT_Left_Far,Team_Shots_Left_Far,Team_Goals_Left_Far,Team_Shots_OT_Center_Far,Team_Shots_Center_Far,Team_Goals_Center_Far,Team_Shots_OT_Right_Far,Team_Shots_Right_Far,Team_Goals_Right_Far,Team_Shots_OT_Not_in_Offensive_Zone,Team_Shots_Not_in_Offensive_Zone,Team_Goals_Not_in_Offensive_Zone,Team_PP_OZ_possession_percent,Team_PP_NZ_possession_percent,Team_PP_DZ_possession_percent
count,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0,110.0
mean,3.127273,2.827273,0.481818,0.554545,0.4,59.192727,59.770909,0.018182,0.018182,0.0,1.136364,1.454545,0.327273,0.7,1.354545,0.027273,1.154545,2.181818,0.145455,0.727273,1.290909,0.009091,0.3,0.672727,0.009091,0.8,2.172727,0.027273,0.336364,0.763636,0.009091,0.045455,0.054545,0.0,53.706,9.161909,14.732
std,1.877482,1.630327,0.501956,0.499291,0.49214,20.064313,18.401767,0.13422,0.13422,0.0,1.252354,1.481608,0.636923,0.893914,1.154062,0.163622,1.110305,1.813213,0.40266,0.907755,1.175791,0.095346,0.58341,0.879002,0.095346,0.926669,1.924535,0.163622,0.609967,0.966667,0.095346,0.209252,0.22813,0.0,12.934222,4.289807,5.996944
min,0.0,0.0,0.0,0.0,0.0,16.7,16.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.05,1.8,4.33
25%,2.0,1.0,0.0,0.0,0.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.5675,6.27,11.1375
50%,3.0,3.0,0.0,1.0,0.0,60.0,60.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.275,8.425,14.17
75%,4.0,4.0,1.0,1.0,1.0,71.4,75.0,0.0,0.0,0.0,2.0,2.0,0.75,1.0,2.0,0.0,2.0,3.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,3.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,60.345,10.97,17.15
max,8.0,7.0,1.0,1.0,1.0,100.0,100.0,1.0,1.0,0.0,5.0,6.0,3.0,3.0,4.0,1.0,5.0,8.0,2.0,4.0,4.0,1.0,2.0,4.0,1.0,4.0,8.0,1.0,2.0,4.0,1.0,1.0,1.0,0.0,97.89,25.0,44.51


In [4]:
# Plotting the distribution of the dependent variable 'Win'
plt.figure(figsize=(8, 5))
sns.countplot(x='Win', data=df)
plt.title('Distribution of Wins')
plt.xlabel('Win (0 = Loss, 1 = Win)')
plt.ylabel('Count')
plt.show()

<Figure size 2400x1500 with 1 Axes>

In [5]:
# Removing the duplicate column 'Team_Shots_Behind_Net'
df = df.drop('Team_Shots_Behind_Net', axis=1)

# Checking for columns that have all zeros
columns_all_zeros = df.columns[(df == 0).all()]
columns_all_zeros

# Dropping columns with all zeros
df = df.drop(columns=columns_all_zeros)

# Dropping non-numeric columns for simplicity
df = df.drop(columns=['Date', 'Opponent'])
df = df.drop(columns=['Team_Total_UConn_Score', 'Team_Total_Opponent_Score'])
df = df.drop(columns=['Team_Shots_OT_Behind_Net'])
df = df.drop(columns=['Team_PP_Faceoffs_won_in_OZ_percent'])

In [6]:
import statsmodels.formula.api as smf

# Joining all column names with '+' except 'Win', 'Date', and 'Opponent'
formula = 'Win ~ ' + ' + '.join(df.columns.drop('Win'))

# Fitting the logistic regression model using smf.logit
logit_model = smf.logit(formula=formula, data=df).fit()

# Displaying the summary of the model
logit_model_summary = logit_model.summary()
print(logit_model_summary)

         Current function value: 0.501595
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                    Win   No. Observations:                  110
Model:                          Logit   Df Residuals:                       80
Method:                           MLE   Df Model:                           29
Date:                Mon, 11 Dec 2023   Pseudo R-squ.:                  0.2701
Time:                        13:46:19   Log-Likelihood:                -55.175
converged:                      False   LL-Null:                       -75.590
Covariance Type:            nonrobust   LLR p-value:                   0.07126
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
Intercept                              -4.0347      1.819     -2.218      0.027      -7.600   



In [7]:
import matplotlib.pyplot as plt

residuals = logit_model.resid_generalized
plt.hist(residuals, bins=20)

(array([ 4.,  2.,  3.,  3.,  7.,  7.,  3.,  6.,  7.,  4., 11., 11.,  9.,
         8.,  5.,  9.,  3.,  3.,  3.,  2.]),
 array([-0.88656364, -0.79971038, -0.71285711, -0.62600385, -0.53915058,
        -0.45229731, -0.36544405, -0.27859078, -0.19173752, -0.10488425,
        -0.01803098,  0.06882228,  0.15567555,  0.24252881,  0.32938208,
         0.41623535,  0.50308861,  0.58994188,  0.67679514,  0.76364841,
         0.85050168]),
 <BarContainer object of 20 artists>)

<Figure size 1650x1050 with 1 Axes>

In [8]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Selecting only the independent variables
X = df.drop('Win', axis=1)

# Calculating VIF for each feature
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

vif_data.sort_values(by='VIF', ascending=False)

df = df.drop(columns=['Team_Shots_OT_at_Net'])

In [9]:
df = df.drop(columns=['Team_Goals_Right_Close', 'Team_Goals_Left_Far', 'Team_Goals_Center_Far', 'Team_Goals_Right_Far', 'Team_Shots_OT_Not_in_Offensive_Zone', 'Team_Shots_Not_in_Offensive_Zone'])

In [10]:
import statsmodels.formula.api as smf

# Joining all column names with '+' except 'Win', 'Date', and 'Opponent'
formula = 'Win ~ ' + ' + '.join(df.columns.drop('Win'))

# Fitting the logistic regression model using smf.logit
logit_model = smf.logit(formula=formula, data=df).fit()

# Displaying the summary of the model
logit_model_summary = logit_model.summary()
print(logit_model_summary)

Optimization terminated successfully.
         Current function value: 0.576667
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    Win   No. Observations:                  110
Model:                          Logit   Df Residuals:                       87
Method:                           MLE   Df Model:                           22
Date:                Mon, 11 Dec 2023   Pseudo R-squ.:                  0.1608
Time:                        13:46:19   Log-Likelihood:                -63.433
converged:                       True   LL-Null:                       -75.590
Covariance Type:            nonrobust   LLR p-value:                    0.3310
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept                        -3.8386      1.679     -2.286      0.

In [11]:
from sklearn.metrics import roc_curve, auc

predictions = logit_model.predict(df)

fpr, tpr, thresholds = roc_curve(df['Win'], predictions)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

<Figure size 1650x1050 with 1 Axes>

In [12]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Convert probabilities to 0/1 predictions based on a chosen threshold (e.g., 0.5)
pred_labels = predictions > 0.5
cm = confusion_matrix(df['Win'], pred_labels)

sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

<Figure size 1650x1050 with 2 Axes>

In [13]:
import numpy as np
import statsmodels.api as sm

# Assuming logit_model is your fitted logistic regression model
# And X_test is your test features dataset

predicted_probabilities = logit_model.predict(X)
predicted_labels = np.where(predicted_probabilities > 0.5, 1, 0)  # Using 0.5 as the threshold


from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

precision = precision_score(df['Win'], predicted_labels)
recall = recall_score(df['Win'], predicted_labels)
f1 = f1_score(df['Win'], predicted_labels)
accuracy = accuracy_score(df['Win'], predicted_labels)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}")

Precision: 0.7121212121212122
Recall: 0.7704918032786885
F1-Score: 0.7401574803149605
Accuracy: 0.7
