In [1]:
# Read the NHL shot data file (shots_2023.csv) into a DataFrame
import pandas as pd

shots_2023 = pd.read_csv('shots_2023.csv')

# Sanity-check the load by printing the top five rows (head()'s default count)
print(shots_2023.head(n=5))

   shotID  arenaAdjustedShotDistance  arenaAdjustedXCord  \
0       0                  39.698866                59.0   
1       1                  11.313708                81.0   
2       2                  45.343136                55.0   
3       3                  43.139309                58.0   
4       4                  42.201896               -64.0   

   arenaAdjustedXCordABS  arenaAdjustedYCord  arenaAdjustedYCordAbs  \
0                   59.0               -26.0                   26.0   
1                   81.0                 8.0                    8.0   
2                   55.0                30.0                   30.0   
3                   58.0               -30.0                   30.0   
4                   64.0                34.0                   34.0   

   averageRestDifference  awayEmptyNet  awayPenalty1Length  \
0                   -3.4             0                   0   
1                   -3.4             0                   0   
2                   -3.8  

In [2]:
# Build a regression tree model for predicting the xGoals of a shot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Columns of shots_2023 used as predictors of xGoals.
# The order here is the column order of the feature matrix built from this list,
# so keep it stable.
features = [
    'shotDistance',
    'shotAngle',
    'shotType',                    # categorical; one-hot encoded during data preparation
    'shotRebound',
    'shotRush',
    'playerPositionThatDidEvent',  # categorical; one-hot encoded during data preparation
    'shotOnEmptyNet',
    'arenaAdjustedShotDistance',
    'offWing',
    'lastEventCategory',           # categorical; one-hot encoded during data preparation
    'timeSinceLastEvent',
]

In [3]:
# Build the modelling inputs: target vector, encoded feature matrix, train/test split

# Target: the xGoal column of the shot table
y = shots_2023['xGoal']

# Predictors: take the selected columns and one-hot encode the three categorical
# ones in a single step; drop_first=True omits the first level of each category
X = pd.get_dummies(
    shots_2023[features],
    columns=['shotType', 'playerPositionThatDidEvent', 'lastEventCategory'],
    drop_first=True,
)

# From here on `features` names the encoded columns of X.
# NOTE: because `features` is rebound, re-running this cell on its own would try to
# look up the dummy column names in shots_2023; re-run the cell that defines the raw
# feature list first.
features = list(X.columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [4]:
# Fit a regression tree on the training split (no max_depth is set here;
# fit() returns the estimator itself, so construction and fitting are chained)
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict the target for the held-out test rows
y_pred = model.predict(X_test)

In [6]:
# Evaluate the original tree, fit and evaluate a depth-limited tree for comparison,
# then save a rendering of the smaller tree to small_decision_tree.png
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt


def report_tree(fitted_tree, y_true, y_hat, feature_names, heading, importance_heading):
    """Print test metrics and ranked feature importances for one fitted tree.

    Parameters
    ----------
    fitted_tree : fitted estimator exposing ``feature_importances_``.
    y_true : true target values for the evaluated rows.
    y_hat : predictions of ``fitted_tree`` for the same rows.
    feature_names : column names, in the order the tree was trained on.
    heading : line printed above the metrics.
    importance_heading : line printed above the importance table.

    Returns
    -------
    tuple
        ``(mse, r2, importance_table)``; ``importance_table`` is a DataFrame with
        columns 'feature' and 'importance', sorted by importance descending.
    """
    mse_value = mean_squared_error(y_true, y_hat)
    r2_value = r2_score(y_true, y_hat)

    print(heading)
    print(f"Mean Squared Error: {mse_value:.4f}")
    print(f"R-squared Score: {r2_value:.4f}")

    importance_table = pd.DataFrame(
        {'feature': feature_names, 'importance': fitted_tree.feature_importances_}
    ).sort_values('importance', ascending=False)
    print(importance_heading)
    print(importance_table)

    return mse_value, r2_value, importance_table


# Evaluate the original model
mse, r2, feature_importance = report_tree(
    model, y_test, y_pred, features,
    "Original Model:",
    "\nFeature Importances (Original Model):",
)

# Create a smaller model with max_depth=5 and predict on the same test rows
small_model = DecisionTreeRegressor(max_depth=5, random_state=42)
small_model.fit(X_train, y_train)
y_pred_small = small_model.predict(X_test)

# Evaluate the smaller model with the same report so the two are directly comparable
mse_small, r2_small, feature_importance_small = report_tree(
    small_model, y_test, y_pred_small, features,
    "\nSmaller Model (max_depth=5):",
    "\nFeature Importances (Smaller Model):",
)

# Visualize the smaller decision tree. An explicit figure/axes pair is used so that
# savefig and close act on this figure rather than on whichever figure is current.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(small_model, feature_names=features, filled=True, rounded=True, fontsize=10, ax=ax)
fig.savefig('small_decision_tree.png', dpi=300, bbox_inches='tight')
plt.close(fig)  # written to disk only; closing avoids an inline render and frees memory

Original Model:
Mean Squared Error: 0.0079
R-squared Score: 0.2803

Feature Importances (Original Model):
                         feature  importance
5      arenaAdjustedShotDistance    0.314134
4                 shotOnEmptyNet    0.183755
7             timeSinceLastEvent    0.141396
1                      shotAngle    0.134242
0                   shotDistance    0.074764
6                        offWing    0.016001
16  playerPositionThatDidEvent_L    0.014827
20         lastEventCategory_FAC    0.013937
17  playerPositionThatDidEvent_R    0.012327
13                shotType_WRIST    0.012285
23        lastEventCategory_MISS    0.010244
2                    shotRebound    0.009822
10                 shotType_SNAP    0.008900
24        lastEventCategory_SHOT    0.008826
22         lastEventCategory_HIT    0.007462
12                 shotType_WRAP    0.006670
14  playerPositionThatDidEvent_D    0.006521
11                  shotType_TIP    0.006412
21        lastEventCategory_GIVE    0.0