In [3]:
#!pip install sklearn-pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

# Import Data

In [2]:
df = pd.read_excel("../spring21_data/fa_stats_v2.xlsx")

# Feature Cleaning

In [3]:
numerical = ['age',
       'games_played', 'games_started', 'minutes_played', 'made_field_goals',
       'attempted_field_goals', 'made_three_point_field_goals',
       'attempted_three_point_field_goals', 'made_free_throws',
       'attempted_free_throws', 'offensive_rebounds', 'defensive_rebounds',
       'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'points',
       'player_efficiency_rating', 'true_shooting_percentage',
       'three_point_attempt_rate', 'free_throw_attempt_rate',
       'offensive_rebound_percentage', 'defensive_rebound_percentage',
       'total_rebound_percentage', 'assist_percentage', 'steal_percentage',
       'block_percentage', 'turnover_percentage', 'usage_percentage',
       'offensive_win_shares', 'defensive_win_shares', 'win_shares',
       'win_shares_per_48_minutes', 'offensive_box_plus_minus',
       'defensive_box_plus_minus', 'box_plus_minus',
       'value_over_replacement_player','O-LEBRON', 'D-LEBRON', 'Wins Added', 'ORPM', 'DRPM', 'poss',
       'raptor_offense', 'raptor_defense', 'war_total', 'war_reg_season',
       'war_playoffs', 'predator_offense', 'predator_defense', 'pace_impact',
       'LA_RAPM__Def', 'LA_RAPM__Off', 'RA_EFG__Def', 'RA_EFG__Off',
       'RA_FTR__Def', 'RA_FTR__Off', 'RA_ORBD__Def', 'RA_ORBD__Off',
       'RA_TOV__Def', 'RA_TOV__Off', 'RAPM__Def', 'RAPM__Off']

categorical = ["Type", "positions"]
other = [col for col in df.columns.values if (col not in categorical+numerical+["cap_usage"])]
dum_df = pd.get_dummies(df, columns=categorical) # convert the categorical features into dummy features (numerical)

In [4]:
df_train = dum_df.drop(columns=other) # drop the non-feature columns
df_train.dropna(inplace=True) # drop all rows with a null value (there should be none)

Split the dataset into training set, validation set, and test set (this is for the final run)

In [5]:
df_test = df_train.sample(frac=0.15)
df_train = df_train.drop(df_test.index)
df_val = df_train.sample(frac=0.15)
df_train = df_train.drop(df_val.index)

In [6]:
# assign the y dataframes cap_usage and drop it from the x features dataset
x_train = df_train.drop(columns=["cap_usage"])
y_train = df_train["cap_usage"]

x_test = df_test.drop(columns=["cap_usage"])
y_test = df_test["cap_usage"]

x_val = df_val.drop(columns=["cap_usage"])
y_val = df_val["cap_usage"]

In [7]:
# scale the numerical features only (this leaves out the dummy features from earlier)
from sklearn.preprocessing import StandardScaler
features = numerical
scaler = StandardScaler()

x_train[features] = scaler.fit_transform(x_train[features])
x_test[features] = scaler.transform(x_test[features])
x_val[features] = scaler.transform(x_val[features])

# Feature Selection

In [8]:
from sklearn.feature_selection import SelectKBest, f_regression
select = SelectKBest(f_regression, k=20)

best_features = select.fit_transform(x_train, y_train)
feature_scores = pd.concat([pd.Series(x_train.columns), pd.Series(select.scores_)], axis=1)
feature_scores.columns = ["features", "score"]

In [9]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [10]:
feature_scores.sort_values(by=["score"], ascending=False)

Unnamed: 0,features,score
63,value_over_replacement_player,322.044636
58,win_shares,316.729761
2,Wins Added,311.627731
8,war_total,275.29301
43,points,267.106213
30,made_field_goals,260.608996
9,war_reg_season,259.46857
56,offensive_win_shares,242.818752
31,attempted_field_goals,202.228605
5,poss,187.408303


Since this is the baseline model I decided just to use these features and call it a day for now.

In [11]:
features = ["points", "poss", "minutes_played", "attempted_free_throws", "Wins Added", "value_over_replacement_player", 
            "war_total", "defensive_rebounds", "war_reg_season", "win_shares", "assists"]

In [12]:
x_train = x_train[features]
x_test = x_test[features]
x_val = x_val[features]

### helper functions

run(model, model_name, x_train, x_val) fits the model on the training set, evaluates the fit on the training set and on the validation set. 

evaluate(y_pred, y_test) prints out the evaluation metrics of a prediction.

In [13]:
def run(model, model_name, x_train, x_val):
    """ Fits the model and prints out results from calling evaluate on the training and validation set
        parameters:
            model
                the unfitted model, from sklearn (has to have a fit and predict method)
            model_name
                String, the name of the model
            x_train
                DataFrame, the training set
            x_val
                DataFrame, the validation set
                
        returns:
            nothing
        
    """
    model = model.fit(x_train, y_train)
    y_pred = np.clip(model.predict(x_train), 0, 1000) # since negative results are meaningless
    
    print("[{0}] These are the results for the training set.".format(model_name))
    
    evaluate(y_pred, y_train)
    print("--------------------------------------")
    
    print("[{0}] These are the results for the validation set.".format(model_name))
    y_pred = np.clip(model.predict(x_val), 0, 1000) # since negative results are meaningless
    
    evaluate(y_pred, y_val)
    
    print("_____________________________________________")

In [14]:
def evaluate(y_pred, y_test):
    """ Prints out the evaluation metrics of the experiment
        parameters:
            y_pred
                DataFrame or array-like, the predictions
            y_test
                DataFrame or array-like, the actual y-values
                
            these two parameters must have the same dimensions
    """
    print("r^2: ", r2_score(y_test, y_pred))
    print("mse: ", mean_squared_error(y_test, y_pred))
    print("variance_score: ", explained_variance_score(y_test, y_pred))
    print("mse-log: ", mean_squared_log_error(y_test, y_pred))

Here I'm trying those five linear models without any tuning and testing the performance.

In [15]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LassoLarsCV
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_squared_log_error

lstsq = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV(max_iter=5000, random_state = 0)
elasticNet = ElasticNetCV(n_alphas = 1000)
lassoLars = LassoLarsCV()

run(ridge, "Ridge", x_train, x_val)
run(lstsq, "Least Squares", x_train,  x_val)
run(lasso, "Lasso", x_train,  x_val)
run(elasticNet, "ElasticNetCV", x_train,  x_val)
run(lassoLars, "Lasso LARS", x_train,  x_val)

[Ridge] These are the results for the training set.
r^2:  0.6893276932211938
mse:  0.00195069573062744
variance_score:  0.689377377037312
mse-log:  0.001578911662499847
--------------------------------------
[Ridge] These are the results for the validation set.
r^2:  0.6161684611364733
mse:  0.0029901682877407974
variance_score:  0.6636892671120789
mse-log:  0.002346858817097546
_____________________________________________
[Least Squares] These are the results for the training set.
r^2:  0.6920611098805709
mse:  0.0019335327454139907
variance_score:  0.6921268581027036
mse-log:  0.0015623919403044226
--------------------------------------
[Least Squares] These are the results for the validation set.
r^2:  0.6030433127882685
mse:  0.0030924173172991775
variance_score:  0.6530520216621603
mse-log:  0.002416289923297044
_____________________________________________
[Lasso] These are the results for the training set.
r^2:  0.6846923453823348
mse:  0.001979800845701869
variance_score:  0.6

Here I'm transforming the datasets to include quadratic features (eecs16b). This means that for feature: points, the PolynomialFeatures transformer will add a column of points-squared to the dataset.

In [25]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)

x_trainPF = poly.fit_transform(x_train)
x_testPF = poly.transform(x_test)
x_valPF = poly.transform(x_val)

lstsq = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV(max_iter=50000, random_state = 0)
elasticNet = ElasticNetCV(max_iter=50000, n_alphas = 1000)
lassoLars = LassoLarsCV()

run(ridge, "Ridge", x_trainPF, x_valPF)
run(lstsq, "Least Squares", x_trainPF, x_valPF)
run(lasso, "Lasso", x_trainPF, x_valPF)
run(elasticNet, "ElasticNetCV", x_trainPF, x_valPF)
run(lassoLars, "Lasso LARS", x_trainPF, x_valPF)

[Ridge] These are the results for the training set.
r^2:  0.7939829194131968
mse:  0.0012935708486663928
variance_score:  0.7939841055909175
mse-log:  0.0010646690600285467
--------------------------------------
[Ridge] These are the results for the validation set.
r^2:  0.5762557649201332
mse:  0.003301100731848175
variance_score:  0.5903320890827108
mse-log:  0.002635385181714802
_____________________________________________
[Least Squares] These are the results for the training set.
r^2:  0.7052683795396719
mse:  0.001850604965965671
variance_score:  0.7065793179238371
mse-log:  0.0015498211065933763
--------------------------------------
[Least Squares] These are the results for the validation set.
r^2:  -0.6621143989835931
mse:  0.0129483934049651
variance_score:  -0.6306189219338898
mse-log:  0.007943458727131595
_____________________________________________
[Lasso] These are the results for the training set.
r^2:  0.7289288304119772
mse:  0.0017020421893864648
variance_score:  0

Since these results are much better (except least squares), we're probably going to need to look beyond just linear-models.

**To put the result of baseline model into contexts, we also need to look into data entries manually to do some sanity checks**

In [17]:
train_indexes = x_train.index

In [18]:
sanity_check_train_df = dum_df.iloc[train_indexes][['key', 'cap_usage', 'Wins Added', 'value_over_replacement_player',
                                                      'war_total', 'win_shares']]
sanity_check_train_df['val_cap_usage'] = elasticNet.predict(x_trainPF)
sanity_check_train_df['diff'] = sanity_check_train_df['val_cap_usage'] - sanity_check_train_df['cap_usage']

In [19]:
sanity_check_train_df.sort_values('diff', ascending=False)

Unnamed: 0,key,cap_usage,Wins Added,value_over_replacement_player,war_total,win_shares,val_cap_usage,diff
178,rosede0130.511.925.7,0.007566,2.521421,0.5,2.694905,3.0,0.122046,0.114479
177,rosede0130.511.925.7,0.013691,2.521421,0.5,2.694905,3.0,0.122046,0.108354
256,couside014.72.918.931.9,0.052391,6.525764,2.9,4.722472,4.7,0.148871,0.09648
245,howardw016.81.215.124.2,0.052391,6.497057,1.2,0.547699,6.8,0.141554,0.089163
172,allento013.10.513.317.9,0.014848,2.92064,0.5,4.148434,3.1,0.087253,0.072404
53,feltora012.70.213.718.4,0.016482,0.458465,0.2,2.565857,2.7,0.082119,0.065637
36,willide013.70.914.822,0.058422,3.198578,0.9,2.130199,3.7,0.116517,0.058095
262,greenje023.90.19.419.3,0.0235,2.246076,0.1,2.527616,3.9,0.081068,0.057568
116,jackja010.90.318.821.7,0.00097,1.841893,0.3,0.588331,0.9,0.057303,0.056334
202,jennibr011.5017.818.1,0.002999,2.20805,0.0,0.691147,1.5,0.059007,0.056008


In [20]:
validation_indexes = x_val.index

In [21]:
sanity_check_val_df = dum_df.iloc[validation_indexes][['key', 'cap_usage', 'Wins Added', 'value_over_replacement_player',
                                                      'war_total', 'win_shares']]
sanity_check_val_df['val_cap_usage'] = elasticNet.predict(x_valPF)
sanity_check_val_df['diff'] = sanity_check_val_df['val_cap_usage'] - sanity_check_val_df['cap_usage']

In [22]:
sanity_check_val_df.sort_values('diff', ascending=False)

Unnamed: 0,key,cap_usage,Wins Added,value_over_replacement_player,war_total,win_shares,val_cap_usage,diff
60,barnema022.60.613.817.9,0.004072,1.056068,0.6,0.163583,2.6,0.086582,0.08251
255,nowitdi014.82.1620.5,0.049083,4.006983,2.1,2.855006,4.8,0.107221,0.058138
94,jennibr010.80.212.119,0.012747,1.921673,0.2,1.428483,0.8,0.038234,0.025488
59,barnema022.60.613.817.9,0.065061,1.056068,0.6,0.163583,2.6,0.086582,0.021522
222,casspom010.4-0.111.916.3,0.014848,0.514966,-0.1,-0.338762,0.4,0.035697,0.020849
190,tollian012.30.412.613.6,0.033201,0.214893,0.4,0.813447,2.3,0.05361,0.020409
192,terryja012.10.412.39.8,0.014848,0.476538,0.4,3.183763,2.1,0.034258,0.019409
127,woodch010.30.13.119.4,0.009291,0.221274,0.1,0.050021,0.3,0.025775,0.016484
85,speigma011.8-0.411.529.7,0.014909,0.392331,-0.4,-0.141095,1.8,0.031018,0.016108
298,clarkia011.1-0.69.917.2,0.016782,0.016477,-0.6,-0.668282,1.1,0.032346,0.015564
