In [3]:
#!pip install sklearn-pandas

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

# Import Data

In [5]:
# Location of the source workbook, given relative to the working directory.
FA_STATS_PATH = "../spring21_data/FA_STATS_DF.xlsx"

# Read the workbook into the raw DataFrame used by the rest of the notebook.
df = pd.read_excel(FA_STATS_PATH)

# Feature Cleaning

In [6]:
# Columns treated as continuous model inputs (order matters: it fixes the column order
# seen by the scaler and by the feature-score table).
numerical = [
    "age",
    "games_played",
    "games_started",
    "minutes_played",
    "made_field_goals",
    "attempted_field_goals",
    "made_three_point_field_goals",
    "attempted_three_point_field_goals",
    "made_free_throws",
    "attempted_free_throws",
    "offensive_rebounds",
    "defensive_rebounds",
    "assists",
    "steals",
    "blocks",
    "turnovers",
    "personal_fouls",
    "points",
    "player_efficiency_rating",
    "true_shooting_percentage",
    "three_point_attempt_rate",
    "free_throw_attempt_rate",
    "offensive_rebound_percentage",
    "defensive_rebound_percentage",
    "total_rebound_percentage",
    "assist_percentage",
    "steal_percentage",
    "block_percentage",
    "turnover_percentage",
    "usage_percentage",
    "offensive_win_shares",
    "defensive_win_shares",
    "win_shares",
    "win_shares_per_48_minutes",
    "offensive_box_plus_minus",
    "defensive_box_plus_minus",
    "box_plus_minus",
    "value_over_replacement_player",
    "O-LEBRON",
    "D-LEBRON",
    "Wins Added",
    "ORPM",
    "DRPM",
    "poss",
    "raptor_offense",
    "raptor_defense",
    "war_total",
    "war_reg_season",
    "war_playoffs",
    "predator_offense",
    "predator_defense",
    "pace_impact",
    "LA_RAPM__Def",
    "LA_RAPM__Off",
    "RA_EFG__Def",
    "RA_EFG__Off",
    "RA_FTR__Def",
    "RA_FTR__Off",
    "RA_ORBD__Def",
    "RA_ORBD__Off",
    "RA_TOV__Def",
    "RA_TOV__Off",
    "RAPM__Def",
    "RAPM__Off",
]

# Columns that get one-hot encoded below.
categorical = ["Type", "positions"]

# Everything that is neither a feature nor the target (cap_usage) is collected in `other`
# so it can be dropped before modelling. The lookup set is built once, not per column.
model_columns = set(categorical) | set(numerical) | {"cap_usage"}
other = [col for col in df.columns.values if col not in model_columns]

# Convert the categorical features into dummy (numerical) features.
dum_df = pd.get_dummies(df, columns=categorical)

In [7]:
# Keep only the feature/target columns, then discard any row containing a null
# (none are expected, so dropna() should leave the frame unchanged).
df_train = dum_df.drop(columns=other).dropna()

Split the dataset into training set, validation set, and test set (this is for the final run)

In [8]:
# Fixed seed so the split -- and every metric computed from it -- is identical on each
# Restart & Run All. Without it, sample() draws different rows on every execution.
# Outputs saved from an earlier unseeded run will not match until the notebook is re-run.
SPLIT_SEED = 0

# Hold out 15% of all rows as the test set and remove them from the training pool.
df_test = df_train.sample(frac=0.15, random_state=SPLIT_SEED)
df_train = df_train.drop(df_test.index)

# Hold out 15% of the *remaining* rows as the validation set.
df_val = df_train.sample(frac=0.15, random_state=SPLIT_SEED)
df_train = df_train.drop(df_val.index)

In [9]:
# For each split, the target column becomes y and every remaining column becomes x.
TARGET = "cap_usage"

x_train, y_train = df_train.drop(columns=[TARGET]), df_train[TARGET]
x_test, y_test = df_test.drop(columns=[TARGET]), df_test[TARGET]
x_val, y_val = df_val.drop(columns=[TARGET]), df_val[TARGET]

In [10]:
# scale the numerical features only (this leaves out the dummy features from earlier)
from sklearn.preprocessing import StandardScaler
features = numerical

# Learn the scaling statistics from the training split only ...
scaler = StandardScaler().fit(x_train[features])

# ... and apply that same transformation to all three splits.
x_train[features] = scaler.transform(x_train[features])
x_test[features] = scaler.transform(x_test[features])
x_val[features] = scaler.transform(x_val[features])

# Feature Selection

In [11]:
from sklearn.feature_selection import SelectKBest, f_regression
# Score every column against the target with f_regression and keep the 20 best.
select = SelectKBest(f_regression, k=20)
best_features = select.fit_transform(x_train, y_train)

# One row per candidate column, paired with its score from the selector.
feature_scores = pd.DataFrame({"features": x_train.columns, "score": select.scores_})

In [12]:
# Remove pandas' row/column display limits so frames are rendered in full.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [13]:
# Show the candidate features ranked from highest to lowest score.
feature_scores.sort_values("score", ascending=False)

Unnamed: 0,features,score
4,made_field_goals,220.166302
17,points,217.285434
40,Wins Added,200.028339
37,value_over_replacement_player,197.866496
2,games_started,189.119233
11,defensive_rebounds,187.977156
43,poss,186.621125
46,war_total,180.661047
5,attempted_field_goals,178.101984
32,win_shares,162.979624


Since this is the baseline model I decided just to use these features and call it a day for now.

In [14]:
# Hand-picked columns used by the baseline models.
features = [
    "points",
    "poss",
    "minutes_played",
    "attempted_free_throws",
    "Wins Added",
    "value_over_replacement_player",
    "war_total",
    "defensive_rebounds",
    "war_reg_season",
    "win_shares",
    "assists",
]

In [15]:
# Restrict every split to the selected baseline columns.
x_train, x_test, x_val = (split[features] for split in (x_train, x_test, x_val))

### helper functions

run(model, model_name, x_train, x_val) fits the model on the training set, then evaluates the fit on both the training set and the validation set.

evaluate(y_pred, y_test) prints out the evaluation metrics of a prediction.

In [16]:
def run(model, model_name, x_train, x_val, y_train=None, y_val=None):
    """ Fits the model and prints out results from calling evaluate on the training and validation set
        parameters:
            model
                the unfitted model, from sklearn (has to have a fit and predict method)
            model_name
                String, the name of the model (used only in the printed headers)
            x_train
                DataFrame or array-like, the training features
            x_val
                DataFrame or array-like, the validation features
            y_train
                optional, the training targets; when omitted, the notebook-level
                ``y_train`` variable is used (the original behaviour)
            y_val
                optional, the validation targets; when omitted, the notebook-level
                ``y_val`` variable is used (the original behaviour)

        returns:
            nothing (the model passed in is fitted in place by its own fit method)

    """
    # Explicit targets make the function usable without hidden notebook state; falling
    # back to the globals keeps every existing four-argument call working unchanged.
    if y_train is None:
        y_train = globals()["y_train"]
    if y_val is None:
        y_val = globals()["y_val"]

    model = model.fit(x_train, y_train)

    def _report(split_name, x, y):
        # Print the header for one split, then score the clipped predictions against y.
        print("[{0}] These are the results for the {1} set.".format(model_name, split_name))
        y_pred = np.clip(model.predict(x), 0, 1000)  # since negative results are meaningless
        evaluate(y_pred, y)

    _report("training", x_train, y_train)
    print("--------------------------------------")
    _report("validation", x_val, y_val)
    print("_____________________________________________")

In [17]:
def evaluate(y_pred, y_test):
    """ Prints four regression metrics comparing predictions with the true values
        parameters:
            y_pred
                DataFrame or array-like, the predictions
            y_test
                DataFrame or array-like, the actual y-values

            both parameters must have the same dimensions

        returns:
            nothing, the metrics are only printed
    """
    # (printed label, metric function) pairs, reported in this order; every metric is
    # called as metric(actual, predicted).
    metrics = (
        ("r^2: ", r2_score),
        ("mse: ", mean_squared_error),
        ("variance_score: ", explained_variance_score),
        ("mse-log: ", mean_squared_log_error),
    )
    for label, metric in metrics:
        print(label, metric(y_test, y_pred))

Here I'm trying five linear models without any tuning and comparing their performance.

In [18]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LassoLarsCV
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_squared_log_error

# Untuned estimators for the baseline comparison.
lstsq = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV(max_iter=5000, random_state=0)
elasticNet = ElasticNetCV(n_alphas=1000)
lassoLars = LassoLarsCV()

# Fit and report each estimator on the same train/validation split, in this order.
baseline_models = [
    (ridge, "Ridge"),
    (lstsq, "Least Squares"),
    (lasso, "Lasso"),
    (elasticNet, "ElasticNetCV"),
    (lassoLars, "Lasso LARS"),
]
for estimator, label in baseline_models:
    run(estimator, label, x_train, x_val)

[Ridge] These are the results for the training set.
r^2:  0.38476998908921733
mse:  0.002227358958069786
variance_score:  0.38476998912616056
mse-log:  0.001836512572164665
--------------------------------------
[Ridge] These are the results for the validation set.
r^2:  0.5452749487851896
mse:  0.0017056239818189283
variance_score:  0.5453629458578279
mse-log:  0.0014027605439937154
_____________________________________________
[Least Squares] These are the results for the training set.
r^2:  0.38487902757404713
mse:  0.0022269641986437942
variance_score:  0.3848790276384342
mse-log:  0.0018365412929568553
--------------------------------------
[Least Squares] These are the results for the validation set.
r^2:  0.5434642342940376
mse:  0.0017124157740284955
variance_score:  0.5435586922646147
mse-log:  0.0014071324460311605
_____________________________________________
[Lasso] These are the results for the training set.
r^2:  0.3818563070634793
mse:  0.002237907558830316
variance_scor

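Here I'm transforming the datasets to include quadratic features (eecs16b). This means that for a feature such as points, the PolynomialFeatures transformer will add a column of points-squared to the dataset. With degree=2 it also adds a constant (bias) column and the product of every pair of features (for example points × poss).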

In [19]:
from sklearn.preprocessing import PolynomialFeatures

# Fit the degree-2 expansion on the training split, then expand all three splits with it.
poly = PolynomialFeatures(degree=2).fit(x_train)

x_trainPF = poly.transform(x_train)
x_testPF = poly.transform(x_test)
x_valPF = poly.transform(x_val)

# Fresh, untuned estimators so nothing fitted on the un-expanded features is reused.
lstsq = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV(max_iter=5000, random_state=0)
elasticNet = ElasticNetCV(max_iter=5000, n_alphas=1000)
lassoLars = LassoLarsCV()

# Fit and report each estimator on the expanded train/validation features, in this order.
poly_models = [
    (ridge, "Ridge"),
    (lstsq, "Least Squares"),
    (lasso, "Lasso"),
    (elasticNet, "ElasticNetCV"),
    (lassoLars, "Lasso LARS"),
]
for estimator, label in poly_models:
    run(estimator, label, x_trainPF, x_valPF)

[Ridge] These are the results for the training set.
r^2:  0.497788953039786
mse:  0.0018181887334014538
variance_score:  0.4977891803289257
mse-log:  0.0015098935061078186
--------------------------------------
[Ridge] These are the results for the validation set.
r^2:  0.5567606869443618
mse:  0.0016625422329668675
variance_score:  0.5572716924241671
mse-log:  0.0013471878146051222
_____________________________________________
[Least Squares] These are the results for the training set.
r^2:  0.4599255079478811
mse:  0.0019552683330848115
variance_score:  0.4599867045188414
mse-log:  0.0016492975316784247
--------------------------------------
[Least Squares] These are the results for the validation set.
r^2:  0.47962520806422526
mse:  0.0019518689860797012
variance_score:  0.4805009925445216
mse-log:  0.0016149888320426175
_____________________________________________


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


[Lasso] These are the results for the training set.
r^2:  0.4050581772644699
mse:  0.0021539082536604598
variance_score:  0.4050581772644699
mse-log:  0.00176993132270257
--------------------------------------
[Lasso] These are the results for the validation set.
r^2:  0.5844297496240483
mse:  0.0015587586021005954
variance_score:  0.5857673906772698
mse-log:  0.0012927249776600264
_____________________________________________
[ElasticNetCV] These are the results for the training set.
r^2:  0.40463167336426564
mse:  0.0021554523546729663
variance_score:  0.40463167336426564
mse-log:  0.0017711774567162014
--------------------------------------
[ElasticNetCV] These are the results for the validation set.
r^2:  0.5843483025888141
mse:  0.0015590641010304928
variance_score:  0.585684933078229
mse-log:  0.0012929703100226678
_____________________________________________
[Lasso LARS] These are the results for the training set.
r^2:  0.39340268120104716
mse:  0.0021961054370019865
variance_s

Since these results are much better (except for least squares), we're probably going to need to look beyond purely linear models.

**To put the results of the baseline model into context, we also need to look at individual data entries manually as a sanity check.**

In [21]:
# Row labels of the training split; these are index *labels* carried over from the
# cleaned frame, not positional offsets.
train_indexes = x_train.index

In [22]:
# train_indexes holds index *labels*, so the rows must be selected by label with .loc.
# .iloc would treat them as positions, which only gives the same rows while dum_df has
# a default 0..n-1 index. .copy() gives an independent frame to add columns to.
sanity_check_cols = ['key', 'cap_usage', 'Wins Added', 'value_over_replacement_player',
                     'war_total', 'win_shares']
sanity_check_train_df = dum_df.loc[train_indexes, sanity_check_cols].copy()

# NOTE: despite the "val_" prefix, this column holds the model's predictions for the
# *training* rows. The predictions are not clipped here, unlike in run().
sanity_check_train_df['val_cap_usage'] = elasticNet.predict(x_trainPF)

# Positive diff = the model predicted a higher cap usage than the actual value.
sanity_check_train_df['diff'] = sanity_check_train_df['val_cap_usage'] - sanity_check_train_df['cap_usage']

In [24]:
# Largest over-predictions (predicted minus actual) first.
sanity_check_train_df.sort_values(by='diff', ascending=False)

Unnamed: 0,key,cap_usage,Wins Added,value_over_replacement_player,war_total,win_shares,val_cap_usage,diff
6,Andre Drummond2015-16,0.046744,11.192345,2.0,5.046424,7.4,0.141802,0.095058
2,Hassan Whiteside2015-16,0.014019,8.504416,3.3,6.836313,10.3,0.105857,0.091838
64,Joe Johnson2015-16,0.005921,3.481817,0.4,3.330284,2.0,0.094276,0.088355
62,Joe Johnson2015-16,0.005921,3.481817,-0.3,3.330284,0.7,0.092914,0.086993
392,Clint Capela2017-18,0.023559,6.914961,2.8,8.439726,10.2,0.110227,0.086668
400,Will Barton2017-18,0.035657,3.605126,2.1,4.940574,6.2,0.116498,0.080841
649,Malcolm Brogdon2018-19,0.015166,4.825757,1.9,6.769914,6.5,0.095858,0.080692
321,Andrew Bogut2016-17,0.002573,1.133597,-0.1,0.298903,0.7,0.081291,0.078718
81,Jordan Clarkson2015-16,0.012072,-0.949777,0.2,0.14372,2.1,0.088291,0.076219
413,Tyreke Evans2017-18,0.033201,4.682775,2.6,5.391138,4.2,0.107525,0.074324


In [20]:
# Row labels of the validation split; these are index *labels* carried over from the
# cleaned frame, not positional offsets.
validation_indexes = x_val.index

In [45]:
# validation_indexes holds index *labels*, so the rows must be selected by label with
# .loc. .iloc would treat them as positions, which only gives the same rows while dum_df
# has a default 0..n-1 index. .copy() gives an independent frame to add columns to.
sanity_check_cols = ['key', 'cap_usage', 'Wins Added', 'value_over_replacement_player',
                     'war_total', 'win_shares']
sanity_check_val_df = dum_df.loc[validation_indexes, sanity_check_cols].copy()

# Model predictions for the validation rows (not clipped here, unlike in run()).
sanity_check_val_df['val_cap_usage'] = elasticNet.predict(x_valPF)

# Positive diff = the model predicted a higher cap usage than the actual value.
sanity_check_val_df['diff'] = sanity_check_val_df['val_cap_usage'] - sanity_check_val_df['cap_usage']

In [47]:
# Largest over-predictions (predicted minus actual) first.
sanity_check_val_df.sort_values(by='diff', ascending=False)

Unnamed: 0,key,cap_usage,Wins Added,value_over_replacement_player,war_total,win_shares,val_cap_usage,diff
64,Joe Johnson2015-16,0.005921,3.481817,0.4,3.330284,2.0,0.081736,0.075814
213,Mason Plumlee2016-17,0.024734,6.096758,1.8,3.069865,4.6,0.094591,0.069857
657,Thomas Bryant2018-19,0.01353,3.403753,1.5,0.979397,5.6,0.066547,0.053018
102,Kris Humphries2015-16,0.005543,0.720786,0.0,0.615665,0.0,0.058487,0.052944
420,Marco Belinelli2017-18,0.007833,2.560616,0.5,2.242331,1.9,0.060728,0.052894
20,Jared Sullinger2015-16,0.032418,5.13359,1.3,3.805603,4.8,0.084601,0.052183
14,Evan Fournier2015-16,0.032689,0.979105,1.1,3.2989,5.7,0.080238,0.04755
214,Mason Plumlee2016-17,0.024734,6.096758,0.3,3.069865,1.4,0.071999,0.047265
505,Joe Johnson2017-18,0.004782,0.322302,-0.1,-1.699268,0.6,0.049845,0.045063
217,David Lee2016-17,0.016482,2.81791,1.4,4.473241,5.7,0.058684,0.042202
