In [37]:
#!pip install sklearn-pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

# Import Data

In [2]:
df = pd.read_excel("../spring21_data/FA_STATS_DF.xlsx")

# Feature Cleaning

In [23]:
numerical = ['age',
       'games_played', 'games_started', 'minutes_played', 'made_field_goals',
       'attempted_field_goals', 'made_three_point_field_goals',
       'attempted_three_point_field_goals', 'made_free_throws',
       'attempted_free_throws', 'offensive_rebounds', 'defensive_rebounds',
       'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'points',
       'player_efficiency_rating', 'true_shooting_percentage',
       'three_point_attempt_rate', 'free_throw_attempt_rate',
       'offensive_rebound_percentage', 'defensive_rebound_percentage',
       'total_rebound_percentage', 'assist_percentage', 'steal_percentage',
       'block_percentage', 'turnover_percentage', 'usage_percentage',
       'offensive_win_shares', 'defensive_win_shares', 'win_shares',
       'win_shares_per_48_minutes', 'offensive_box_plus_minus',
       'defensive_box_plus_minus', 'box_plus_minus',
       'value_over_replacement_player','O-LEBRON', 'D-LEBRON', 'Wins Added', 'ORPM', 'DRPM', 'poss',
       'raptor_offense', 'raptor_defense', 'war_total', 'war_reg_season',
       'war_playoffs', 'predator_offense', 'predator_defense', 'pace_impact',
       'LA_RAPM__Def', 'LA_RAPM__Off', 'RA_EFG__Def', 'RA_EFG__Off',
       'RA_FTR__Def', 'RA_FTR__Off', 'RA_ORBD__Def', 'RA_ORBD__Off',
       'RA_TOV__Def', 'RA_TOV__Off', 'RAPM__Def', 'RAPM__Off']

categorical = ["Type", "positions"]
other = [col for col in df.columns.values if (col not in categorical+numerical+["cap_usage"])]
dum_df = pd.get_dummies(df, columns=categorical) # convert the categorical features into dummy features (numerical)

In [24]:
df_train = dum_df.drop(columns=other) # drop the non-feature columns
df_train.dropna(inplace=True) # drop all rows with a null value (there should be none)

Split the dataset into training set, validation set, and test set (this is for the final run)

In [25]:
df_test = df_train.sample(frac=0.15)
df_train = df_train.drop(df_test.index)
df_val = df_train.sample(frac=0.15)
df_train = df_train.drop(df_val.index)

In [43]:
# assign the y dataframes cap_usage and drop it from the x features dataset
x_train = df_train.drop(columns=["cap_usage"])
y_train = df_train["cap_usage"]

x_test = df_test.drop(columns=["cap_usage"])
y_test = df_test["cap_usage"]

x_val = df_val.drop(columns=["cap_usage"])
y_val = df_val["cap_usage"]

In [44]:
# scale the numerical features only (this leaves out the dummy features from earlier)
from sklearn.preprocessing import StandardScaler
features = numerical
scaler = StandardScaler()

x_train[features] = scaler.fit_transform(x_train[features])
x_test[features] = scaler.transform(x_test[features])
x_val[features] = scaler.transform(x_val[features])

# Feature Selection

In [65]:
from sklearn.feature_selection import SelectKBest, f_regression
select = SelectKBest(f_regression, k=20)

best_features = select.fit_transform(x_train, y_train)
feature_scores = pd.concat([pd.Series(x_train.columns), pd.Series(select.scores_)], axis=1)
feature_scores.columns = ["features", "score"]

                    features       score
0                        age   36.621545
1               games_played   19.719607
2              games_started  199.945033
3             minutes_played  208.078240
4           made_field_goals  261.484532
..                       ...         ...
66          positions_CENTER    3.009382
67     positions_POINT GUARD    2.269737
68   positions_POWER FORWARD    0.013183
69  positions_SHOOTING GUARD    6.815906
70   positions_SMALL FORWARD    7.694446

[71 rows x 2 columns]


In [75]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [79]:
feature_scores.sort_values(by=["score"], ascending=False)

Unnamed: 0,features,score
4,made_field_goals,261.484532
17,points,256.55058
43,poss,223.839901
5,attempted_field_goals,222.142619
3,minutes_played,208.07824
2,games_started,199.945033
9,attempted_free_throws,197.862968
8,made_free_throws,196.255188
40,Wins Added,193.90472
37,value_over_replacement_player,177.714494


Since this is the baseline model I decided just to use these features and call it a day for now.

In [80]:
features = ["points", "poss", "minutes_played", "attempted_free_throws", "Wins Added", "value_over_replacement_player", 
            "war_total", "defensive_rebounds", "war_reg_season", "win_shares", "assists"]

In [81]:
x_train = x_train[features]
x_test = x_test[features]
x_val = x_val[features]

### helper functions

run(model, model_name, x_train, x_val) fits the model on the training set, evaluates the fit on the training set and on the validation set. 

evaluate(y_pred, y_test) prints out the evaluation metrics of a prediction.

In [131]:
def run(model, model_name, x_train, x_val):
    """ Fits the model and prints out results from calling evaluate on the training and validation set
        parameters:
            model
                the unfitted model, from sklearn (has to have a fit and predict method)
            model_name
                String, the name of the model
            x_train
                DataFrame, the training set
            x_val
                DataFrame, the validation set
                
        returns:
            nothing
        
    """
    model = model.fit(x_train, y_train)
    y_pred = np.clip(model.predict(x_train), 0, 1000) # since negative results are meaningless
    
    print("[{0}] These are the results for the training set.".format(model_name))
    
    evaluate(y_pred, y_train)
    print("--------------------------------------")
    
    print("[{0}] These are the results for the validation set.".format(model_name))
    y_pred = np.clip(model.predict(x_val), 0, 1000) # since negative results are meaningless
    
    evaluate(y_pred, y_val)
    
    print("_____________________________________________")

In [130]:
def evaluate(y_pred, y_test):
    """ Prints out the evaluation metrics of the experiment
        parameters:
            y_pred
                DataFrame or array-like, the predictions
            y_test
                DataFrame or array-like, the actual y-values
                
            these two parameters must have the same dimensions
    """
    print("r^2: ", r2_score(y_test, y_pred))
    print("mse: ", mean_squared_error(y_test, y_pred))
    print("variance_score: ", explained_variance_score(y_test, y_pred))
    print("mse-log: ", mean_squared_log_error(y_test, y_pred))

Here I'm trying those five linear models without any tuning and testing the performance.

In [133]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LassoLarsCV
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_squared_log_error

lstsq = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV(max_iter=5000, random_state = 0)
elasticNet = ElasticNetCV(n_alphas = 1000)
lassoLars = LassoLarsCV()

run(ridge, "Ridge", x_train, x_val)
run(lstsq, "Least Squares", x_train,  x_val)
run(lasso, "Lasso", x_train,  x_val)
run(elasticNet, "ElasticNetCV", x_train,  x_val)
run(lassoLars, "Lasso LARS", x_train,  x_val)

[Ridge] These are the results for the training set.
r^2:  0.3878503203509137
mse:  0.0023271522344735034
variance_score:  0.3878503213623087
mse-log:  0.0018998198052741376
--------------------------------------
[Ridge] These are the results for the validation set.
r^2:  0.4635061038289867
mse:  0.0016138947347056473
variance_score:  0.4659608850463195
mse-log:  0.001347073974183212
_____________________________________________
[Least Squares] These are the results for the training set.
r^2:  0.3918153479559673
mse:  0.0023120787595414588
variance_score:  0.39181535149804003
mse-log:  0.0018903452689006822
--------------------------------------
[Least Squares] These are the results for the validation set.
r^2:  0.45249262622491715
mse:  0.0016470257612520157
variance_score:  0.4549686507664955
mse-log:  0.0013721863527366691
_____________________________________________
[Lasso] These are the results for the training set.
r^2:  0.39164467010264514
mse:  0.002312727609587352
variance_sco

Here I'm transforming the datasets to include quadratic features (eecs16b). This means that for feature: points, the PolynomialFeatures transformer will add a column of points-squared to the dataset.

In [135]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)

x_trainPF = poly.fit_transform(x_train)
x_testPF = poly.transform(x_test)
x_valPF = poly.transform(x_val)

lstsq = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV(max_iter=5000, random_state = 0)
elasticNet = ElasticNetCV(max_iter=5000, n_alphas = 1000)
lassoLars = LassoLarsCV()

run(ridge, "Ridge", x_trainPF, x_valPF)
run(lstsq, "Least Squares", x_trainPF, x_valPF)
run(lasso, "Lasso", x_trainPF, x_valPF)
run(elasticNet, "ElasticNetCV", x_trainPF, x_valPF)
run(lassoLars, "Lasso LARS", x_trainPF, x_valPF)

[Ridge] These are the results for the training set.
r^2:  0.5041816102608213
mse:  0.0018849064402615117
variance_score:  0.504181615770992
mse-log:  0.0015526937131791947
--------------------------------------
[Ridge] These are the results for the validation set.
r^2:  0.4762108056855261
mse:  0.0015756761238722107
variance_score:  0.48312369374684316
mse-log:  0.0013219457252041315
_____________________________________________
[Least Squares] These are the results for the training set.
r^2:  0.47468865346401323
mse:  0.0019970270581310943
variance_score:  0.47473718251036356
mse-log:  0.0016602011077021981
--------------------------------------
[Least Squares] These are the results for the validation set.
r^2:  0.1912429723486917
mse:  0.002432923688225101
variance_score:  0.21467978636858953
mse-log:  0.0019154784116936466
_____________________________________________
[Lasso] These are the results for the training set.
r^2:  0.42620697666204244
mse:  0.002181335318433386
variance_sc

Since these results are much better (except least squares), we're probably going to need to look beyond just linear-models.