In [52]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import Lasso

season_22_23 = pd.read_csv('2022-23_data.csv')
season_23_24 = pd.read_csv('2023-24_data.csv')

### Data pre-processing

In [56]:
# Drop rows with na
s22_23 = season_22_23.dropna()
s23_24 = season_23_24.dropna()

# Drop non-numeric columns
s22_23 = s22_23.select_dtypes(include='number')
s23_24 = s23_24.select_dtypes(include='number')

# Drop feature 'WIN'
s22_23 = s22_23.drop(columns=['WON'])
s23_24 = s23_24.drop(columns=['WON'])

In [57]:
# Split X and y
s22_23_y = s22_23['PTS']
s22_23_X = s22_23.drop(columns=['PTS'])
s23_24_y = s23_24['PTS']
s23_24_X = s23_24.drop(columns=['PTS'])

In [96]:
# Generate importance df
def get_feature_importance(features, importances):
    importance_df = pd.DataFrame({
        'Feature': features,
        'Importance': importances
    })
    return importance_df.sort_values(by='Importance', ascending=False)

# Filtered features
def filter_features_by_importance(importance_df, top_30_features):
    # Desired features to ensure inclusion
    additional_features = ['REF_BIAS', 'CLOSE_WIN_PCT', 'REST_DAYS', 'IS_BACK_TO_BACK', 'DISTANCE', 'RECENT_WIN_PCT']
    
    combined_features = set(top_30_features) | set(additional_features)
    filtered_df = importance_df[importance_df['Feature'].isin(combined_features)]
    return filtered_df

### Random forest regressor

In [97]:
# season22_23
rf22_23 = RandomForestRegressor(n_estimators=100, random_state=42)
rf22_23.fit(s22_23_X, s22_23_y)

feature_importances = rf22_23.feature_importances_
importance_df = get_feature_importance(s22_23_X.columns, feature_importances)

In [98]:
top_30_features = set(importance_df.head(30)['Feature'])
filtered_df = filter_features_by_importance(importance_df, top_30_features)
filtered_df

Unnamed: 0,Feature,Importance
18,fieldGoalPercentage_my_player_1,0.091964
38,fieldGoalPercentage_my_player_2,0.042626
78,fieldGoalPercentage_my_player_4,0.038956
58,fieldGoalPercentage_my_player_3,0.037627
98,fieldGoalPercentage_my_player_5,0.025223
118,fieldGoalPercentage_my_player_6,0.022885
138,fieldGoalPercentage_my_player_7,0.016619
183,speed_opposing_player_3,0.013345
163,speed_opposing_player_2,0.011931
144,distance_opposing_player_1,0.011512


In [99]:
# season23_24
rf23_24 = RandomForestRegressor(n_estimators=100, random_state=42)
rf23_24.fit(s23_24_X, s23_24_y)

feature_importances = rf23_24.feature_importances_
importance_df = get_feature_importance(s23_24_X.columns, feature_importances)

In [100]:
top_30_features = set(importance_df.head(30)['Feature'])
filtered_df = filter_features_by_importance(importance_df, top_30_features)
filtered_df

Unnamed: 0,Feature,Importance
37,fieldGoalPercentage_my_player_2,0.078819
17,fieldGoalPercentage_my_player_1,0.067253
77,fieldGoalPercentage_my_player_4,0.061427
57,fieldGoalPercentage_my_player_3,0.048865
97,fieldGoalPercentage_my_player_5,0.025233
117,fieldGoalPercentage_my_player_6,0.022338
16,uncontestedFieldGoalsPercentage_my_player_1,0.017601
14,uncontestedFieldGoalsMade_my_player_1,0.013409
137,fieldGoalPercentage_my_player_7,0.012749
3,distance_my_player_1,0.011042


### XGBoost regressor

In [101]:
# season22_23
xgb22_23 = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb22_23.fit(s22_23_X, s22_23_y)

feature_importances = xgb22_23.feature_importances_
importance_df = get_feature_importance(s22_23_X.columns, feature_importances)

In [102]:
top_30_features = set(importance_df.head(30)['Feature'])
filtered_df = filter_features_by_importance(importance_df, top_30_features)
filtered_df

Unnamed: 0,Feature,Importance
18,fieldGoalPercentage_my_player_1,0.046796
38,fieldGoalPercentage_my_player_2,0.027023
138,fieldGoalPercentage_my_player_7,0.025815
58,fieldGoalPercentage_my_player_3,0.02276
78,fieldGoalPercentage_my_player_4,0.022692
98,fieldGoalPercentage_my_player_5,0.02093
118,fieldGoalPercentage_my_player_6,0.020805
169,freeThrowAssists_opposing_player_2,0.020269
95,uncontestedFieldGoalsMade_my_player_5,0.019406
117,uncontestedFieldGoalsPercentage_my_player_6,0.018307


In [103]:
# season23_24
xgb23_24 = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb23_24.fit(s23_24_X, s23_24_y)

feature_importances = xgb23_24.feature_importances_
importance_df = get_feature_importance(s23_24_X.columns, feature_importances)

In [104]:
top_30_features = set(importance_df.head(30)['Feature'])
filtered_df = filter_features_by_importance(importance_df, top_30_features)
filtered_df

Unnamed: 0,Feature,Importance
77,fieldGoalPercentage_my_player_4,0.048408
37,fieldGoalPercentage_my_player_2,0.042299
17,fieldGoalPercentage_my_player_1,0.034087
57,fieldGoalPercentage_my_player_3,0.028426
117,fieldGoalPercentage_my_player_6,0.02216
97,fieldGoalPercentage_my_player_5,0.021378
14,uncontestedFieldGoalsMade_my_player_1,0.020233
137,fieldGoalPercentage_my_player_7,0.017881
54,uncontestedFieldGoalsMade_my_player_3,0.015118
284,CLOSE,0.014443


### Lasso Regression

In [105]:
# season22_23
lasso22_23 = Lasso(alpha=0.01)
lasso22_23.fit(s22_23_X, s22_23_y)

coef = lasso22_23.coef_
importance_df = get_feature_importance(s22_23_X.columns, coef)

  model = cd_fast.enet_coordinate_descent(


In [106]:
top_30_features = set(importance_df.head(15)['Feature']) | set(importance_df.tail(15)['Feature'])
filtered_df = filter_features_by_importance(importance_df, top_30_features)
filtered_df

Unnamed: 0,Feature,Importance
18,fieldGoalPercentage_my_player_1,10.703713
24,distance_my_player_2,4.211129
158,fieldGoalPercentage_opposing_player_1,3.47368
184,distance_opposing_player_3,2.913906
164,distance_opposing_player_2,2.236538
64,distance_my_player_4,2.050179
75,uncontestedFieldGoalsMade_my_player_4,1.953096
95,uncontestedFieldGoalsMade_my_player_5,1.944218
115,uncontestedFieldGoalsMade_my_player_6,1.835284
38,fieldGoalPercentage_my_player_2,1.76011


In [108]:
# season23_24
lasso23_24 = Lasso(alpha=0.01)
lasso23_24.fit(s23_24_X, s23_24_y)

coef = lasso23_24.coef_
importance_df = get_feature_importance(s23_24_X.columns, coef)

  model = cd_fast.enet_coordinate_descent(


In [109]:
top_30_features = set(importance_df.head(15)['Feature']) | set(importance_df.tail(15)['Feature'])
filtered_df = filter_features_by_importance(importance_df, top_30_features)
filtered_df

Unnamed: 0,Feature,Importance
37,fieldGoalPercentage_my_player_2,3.128156
291,RECENT_WIN_PCT,2.700272
54,uncontestedFieldGoalsMade_my_player_3,2.267065
14,uncontestedFieldGoalsMade_my_player_1,1.984862
97,fieldGoalPercentage_my_player_5,1.84214
74,uncontestedFieldGoalsMade_my_player_4,1.82254
111,contestedFieldGoalsMade_my_player_6,1.70615
34,uncontestedFieldGoalsMade_my_player_2,1.604114
114,uncontestedFieldGoalsMade_my_player_6,1.5725
134,uncontestedFieldGoalsMade_my_player_7,1.458111
