In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

# Import Data

In [24]:
# Read the stats workbook (relative path) into a DataFrame.
DATA_PATH = "../spring21_data/fa_stats_v2.xlsx"
df = pd.read_excel(DATA_PATH)

# Feature Cleaning

In [25]:
# Numeric input columns; these are the ones standardized before modelling.
numerical = [
    'age',
    'games_played', 'minutes_played',
    'player_efficiency_rating', 'true_shooting_percentage',
    'three_point_attempt_rate', 'free_throw_attempt_rate',
    'offensive_rebound_percentage', 'defensive_rebound_percentage',
    'total_rebound_percentage', 'assist_percentage', 'steal_percentage',
    'block_percentage', 'turnover_percentage', 'usage_percentage',
    'offensive_win_shares', 'defensive_win_shares', 'win_shares',
    'win_shares_per_48_minutes', 'offensive_box_plus_minus',
    'defensive_box_plus_minus', 'box_plus_minus',
    'value_over_replacement_player', 'O-LEBRON', 'D-LEBRON', 'Wins Added', 'ORPM', 'DRPM', 'poss',
    'raptor_offense', 'raptor_defense', 'war_total', 'war_reg_season',
    'war_playoffs', 'predator_offense', 'predator_defense', 'pace_impact',
    'LA_RAPM__Def', 'LA_RAPM__Off', 'RA_EFG__Def', 'RA_EFG__Off',
    'RA_FTR__Def', 'RA_FTR__Off', 'RA_ORBD__Def', 'RA_ORBD__Off',
    'RA_TOV__Def', 'RA_TOV__Off', 'RAPM__Def', 'RAPM__Off',
]

# Categorical columns that get one-hot encoded below.
categorical = ["Type", "positions"]

# Everything that is neither a feature nor the target is a non-feature column.
# The lookup set is built once instead of re-concatenating the lists for every column.
keep_columns = set(categorical + numerical + ["cap_space_usage"])
other = [col for col in df.columns.values if col not in keep_columns]

# Convert the categorical features into dummy (indicator) columns.
dum_df = pd.get_dummies(df, columns=categorical)

In [31]:
# Keep only the feature columns and the target, then discard any row that
# still contains a null value (none are expected).
df = dum_df.drop(columns=other).dropna()

Split the dataset into a training set, a validation set, and a test set (this is for the final run).

In [30]:
from sklearn.model_selection import train_test_split

In [32]:
# Target column vs. everything else.
y = df["cap_space_usage"]
X = df.drop(columns=["cap_space_usage"])

# Stage 1: carve off the test split. Stage 2: split what remains into
# training and validation. Both stages use the same fraction and seed.
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.13, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.13, random_state=42)

In [36]:
# scale the numerical features only (this leaves out the dummy features from earlier)
from sklearn.preprocessing import StandardScaler
features = numerical
scaler = StandardScaler()

# The split frames are derived from df; take explicit copies so the column
# assignments below write to independent frames instead of triggering
# pandas' SettingWithCopyWarning.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# Fit the scaler on the training split only, then reuse the fitted scaler on
# the other splits so they are transformed with the training statistics.
X_train[features] = scaler.fit_transform(X_train[features])
X_test[features] = scaler.transform(X_test[features])
X_val[features] = scaler.transform(X_val[features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[features] = scaler.transform(X_test[features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


# Feature Selection

In [38]:
from sklearn.feature_selection import SelectKBest, f_regression
# Score each input column against the target with f_regression.
select = SelectKBest(score_func=f_regression, k=20)
best_features = select.fit_transform(X_train, y_train)

# One row per input column (in X_train's column order) with its score.
feature_scores = pd.DataFrame({
    "features": X_train.columns.to_numpy(),
    "score": select.scores_,
})

In [39]:
# Lift pandas' display limits so frames are rendered without truncation.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

In [40]:
# Show the features ranked from highest to lowest score.
feature_scores.sort_values(by="score", ascending=False)

Unnamed: 0,features,score
2,Wins Added,436.424487
48,value_over_replacement_player,397.784298
8,war_total,375.565552
43,win_shares,348.976592
9,war_reg_season,341.832047
41,offensive_win_shares,290.549865
5,poss,245.789636
45,offensive_box_plus_minus,235.206834
47,box_plus_minus,219.198711
0,O-LEBRON,195.321237


Since this is the baseline model, I decided to just use the top ten of these features and call it a day for now.

In [41]:
# Names of the ten highest-scoring features, best first.
ranked_scores = feature_scores.sort_values(by="score", ascending=False)
features = ranked_scores["features"].iloc[:10].tolist()

In [42]:
# Restrict every split to the selected feature columns.
X_train, X_test, X_val = (split[features] for split in (X_train, X_test, X_val))

### helper functions

run(model, model_name, x_train, x_val) fits the model on the training set, then evaluates the fit on both the training set and the validation set.

evaluate(y_pred, y_test) prints out the evaluation metrics of a prediction.

In [43]:
def run(model, model_name, x_train, x_val, train_targets=None, val_targets=None):
    """ Fits the model and prints out results from calling evaluate on the training and validation set
        parameters:
            model
                the unfitted model, from sklearn (has to have a fit and predict method)
            model_name
                String, the name of the model
            x_train
                DataFrame, the training set
            x_val
                DataFrame, the validation set
            train_targets
                optional, the target values matching x_train; when omitted the
                notebook-level y_train is used
            val_targets
                optional, the target values matching x_val; when omitted the
                notebook-level y_val is used

        returns:
            nothing

    """
    # Fall back to the notebook's global splits so existing four-argument calls keep working.
    if train_targets is None:
        train_targets = y_train
    if val_targets is None:
        val_targets = y_val

    model = model.fit(x_train, train_targets)

    # Predictions are clipped to [0, 1000] since negative results are meaningless.
    y_pred = np.clip(model.predict(x_train), 0, 1000)

    print("[{0}] These are the results for the training set.".format(model_name))

    evaluate(y_pred, train_targets)
    print("--------------------------------------")

    print("[{0}] These are the results for the validation set.".format(model_name))
    y_pred = np.clip(model.predict(x_val), 0, 1000)

    evaluate(y_pred, val_targets)

    print("_____________________________________________")

In [44]:
def evaluate(y_pred, y_test):
    """ Prints one line per evaluation metric for a set of predictions
        parameters:
            y_pred
                DataFrame or array-like, the predictions
            y_test
                DataFrame or array-like, the actual y-values

            both parameters must have the same dimensions

        returns:
            nothing
    """
    # (label, metric function) pairs, printed in this order.
    scorers = (
        ("r^2: ", r2_score),
        ("mse: ", mean_squared_error),
        ("variance_score: ", explained_variance_score),
        ("mse-log: ", mean_squared_log_error),
    )
    for label, scorer in scorers:
        # Every metric takes the actual values first and the predictions second.
        print(label, scorer(y_test, y_pred))

Here I'm trying five linear models without any tuning and testing their performance.

In [45]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LassoLarsCV
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_squared_log_error

# Instantiate the five untuned linear models.
lstsq = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV(max_iter=5000, random_state = 0)
elasticNet = ElasticNetCV(n_alphas = 1000)
lassoLars = LassoLarsCV()

# Fit and report each model on the selected features, in this order.
baseline_models = [
    ("Ridge", ridge),
    ("Least Squares", lstsq),
    ("Lasso", lasso),
    ("ElasticNetCV", elasticNet),
    ("Lasso LARS", lassoLars),
]
for label, estimator in baseline_models:
    run(estimator, label, X_train, X_val)

[Ridge] These are the results for the training set.
r^2:  0.6601766124386279
mse:  0.002230673453217625
variance_score:  0.6601771516641841
mse-log:  0.0017913397771962916
--------------------------------------
[Ridge] These are the results for the validation set.
r^2:  0.6922526813426049
mse:  0.002009355336751259
variance_score:  0.6927736845900008
mse-log:  0.0015412465501453499
_____________________________________________
[Least Squares] These are the results for the training set.
r^2:  0.6638162047868044
mse:  0.002206782390010138
variance_score:  0.6638183234228525
mse-log:  0.0017653642500892555
--------------------------------------
[Least Squares] These are the results for the validation set.
r^2:  0.6949054441307768
mse:  0.001992034818448116
variance_score:  0.695271559458994
mse-log:  0.0015275200832865219
_____________________________________________
[Lasso] These are the results for the training set.
r^2:  0.6561452713641313
mse:  0.0022571360389162668
variance_score:  0

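Here I'm transforming the datasets to include quadratic features (eecs16b). This means that for a feature such as points, the PolynomialFeatures transformer will add a column of points-squared to the dataset (with degree=2 it also adds the pairwise products of different features and a constant bias column).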

In [46]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the selected features to degree-2 polynomial terms. The transformer is
# fitted on the training split and then applied to the other splits.
poly = PolynomialFeatures(degree=2)
X_trainPF = poly.fit_transform(X_train)
X_valPF = poly.transform(X_val)
X_testPF = poly.transform(X_test)

# Fresh, unfitted instances of the same five linear models.
lstsq = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV(max_iter=50000, random_state = 0)
elasticNet = ElasticNetCV(max_iter=50000, n_alphas = 1000)
lassoLars = LassoLarsCV()

# Fit and report each model on the polynomial features, in this order.
poly_models = [
    ("Ridge", ridge),
    ("Least Squares", lstsq),
    ("Lasso", lasso),
    ("ElasticNetCV", elasticNet),
    ("Lasso LARS", lassoLars),
]
for label, estimator in poly_models:
    run(estimator, label, X_trainPF, X_valPF)

[Ridge] These are the results for the training set.
r^2:  0.7317587349256864
mse:  0.0017607930794660798
variance_score:  0.7317587553306708
mse-log:  0.0014306599960247797
--------------------------------------
[Ridge] These are the results for the validation set.
r^2:  0.680538475912156
mse:  0.002085840166254118
variance_score:  0.6832934589832045
mse-log:  0.0015996389355668042
_____________________________________________
[Least Squares] These are the results for the training set.
r^2:  0.7660364852454948
mse:  0.0015357865894092134
variance_score:  0.7660371601979861
mse-log:  0.0012636390849380728
--------------------------------------
[Least Squares] These are the results for the validation set.
r^2:  0.6415235287596459
mse:  0.0023405780226746927
variance_score:  0.6502472263006186
mse-log:  0.001818071494088916
_____________________________________________
[Lasso] These are the results for the training set.
r^2:  0.7052103745929592
mse:  0.0019350621992157476
variance_score: 

Since these results are much better (except least squares), we're probably going to need to look beyond just linear models.

**To put the results of the baseline model in context, we also need to look at individual data entries manually as a sanity check**

In [47]:
# Index labels of the training-split rows; used below to look the same rows up in dum_df.
train_indexes = X_train.index

In [48]:
# Context columns for the training rows. train_indexes holds index *labels*, so
# the lookup uses .loc rather than positional .iloc, and .copy() yields an
# independent frame so the new columns below are not written to a slice of dum_df.
sanity_columns = ['key', 'cap_space_usage', 'Wins Added', 'value_over_replacement_player',
                  'war_total', 'win_shares']
sanity_check_train_df = dum_df.loc[train_indexes, sanity_columns].copy()

# Predictions come from the ElasticNetCV model fitted on the polynomial features.
# NOTE: the column is called 'val_cap_usage' even though these are training-set predictions.
sanity_check_train_df['val_cap_usage'] = elasticNet.predict(X_trainPF)

# Positive diff = the model predicted more cap usage than the actual value.
sanity_check_train_df['diff'] = sanity_check_train_df['val_cap_usage'] - sanity_check_train_df['cap_space_usage']

In [49]:
# Largest over-predictions (predicted minus actual) first.
sanity_check_train_df.sort_values(by='diff', ascending=False)

Unnamed: 0,key,cap_space_usage,Wins Added,value_over_replacement_player,war_total,win_shares,val_cap_usage,diff
293,Willie Cauley-Stein2019,0.019951,5.851237,1.8,2.426611,6.7,0.144533,0.124582
163,Derrick Rose2017,0.013691,2.521421,0.5,2.694905,3.0,0.105402,0.091711
301,Al-Farouq Aminu2019,0.084827,6.805854,1.5,5.025886,5.8,0.173193,0.088367
230,DeMarcus Cousins2018,0.052391,6.525764,2.9,4.722472,4.7,0.138727,0.086336
236,Jeff Green2018,0.0235,2.246076,0.1,2.527616,3.9,0.109136,0.085636
294,Brook Lopez2019,0.110803,8.400811,2.4,5.568199,6.7,0.192836,0.082033
12,Zaza Pachulia2016,0.030783,3.256223,0.9,2.82302,6.0,0.104859,0.074076
317,Jeff Green2019,0.0235,1.534547,0.3,2.131536,3.6,0.096193,0.072694
182,Brandon Jennings2017,0.002999,2.20805,0.0,0.691147,1.5,0.073999,0.071001
229,Dirk Nowitzki2018,0.049083,4.006983,2.1,2.855006,4.8,0.118354,0.069272


In [50]:
# Index labels of the validation-split rows; used below to look the same rows up in dum_df.
validation_indexes = X_val.index

In [51]:
# Context columns for the validation rows. validation_indexes holds index
# *labels*, so the lookup uses .loc rather than positional .iloc, and .copy()
# yields an independent frame so the new columns below are not written to a
# slice of dum_df.
sanity_columns = ['key', 'cap_space_usage', 'Wins Added', 'value_over_replacement_player',
                  'war_total', 'win_shares']
sanity_check_val_df = dum_df.loc[validation_indexes, sanity_columns].copy()

# Predictions come from the ElasticNetCV model fitted on the polynomial features.
sanity_check_val_df['val_cap_usage'] = elasticNet.predict(X_valPF)

# Positive diff = the model predicted more cap usage than the actual value.
sanity_check_val_df['diff'] = sanity_check_val_df['val_cap_usage'] - sanity_check_val_df['cap_space_usage']

In [52]:
# Largest over-predictions (predicted minus actual) first.
sanity_check_val_df.sort_values(by='diff', ascending=False)

Unnamed: 0,key,cap_space_usage,Wins Added,value_over_replacement_player,war_total,win_shares,val_cap_usage,diff
70,Dion Waiters2016,0.030783,0.775636,-0.4,2.183525,2.1,0.100577,0.069794
142,Marreese Speights2017,0.014848,2.32335,0.9,2.274486,4.2,0.078146,0.063298
299,Danny Green2019,0.134086,6.081768,2.2,10.105761,5.9,0.19583,0.061744
237,Brook Lopez2018,0.0332,3.104935,1.3,1.943188,3.8,0.093386,0.060187
243,Shabazz Napier2018,0.019068,2.740723,1.0,3.733121,3.2,0.068639,0.049571
233,Tyreke Evans2018,0.117798,4.682775,2.6,5.391138,4.2,0.163802,0.046003
367,Austin Rivers2019,0.019922,0.72755,-0.6,0.23264,-0.2,0.064009,0.044087
302,Ed Davis2019,0.043678,2.497605,0.7,2.616817,5.7,0.076778,0.0331
96,Willie Reed2016,0.010789,0.701533,0.2,0.942895,1.2,0.043365,0.032577
92,Chris Andersen2016,0.016482,0.666488,0.1,0.526616,0.1,0.041801,0.025319
