In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LassoLarsCV
import pickle

import warnings
warnings.filterwarnings('ignore')
import smogn

In [3]:
sns.set(rc={'figure.figsize':(14, 7)})

In [4]:
with open('../scraping/stats_salaries_cap.pickle', 'rb') as handle:
    data = pickle.load(handle)

In [5]:
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_squared_log_error
def evaluate(y_pred, y_test):
    """ Prints out the evaluation metrics of the experiment
        parameters:
            y_pred
                DataFrame or array-like, the predictions
            y_test
                DataFrame or array-like, the actual y-values
                
            these two parameters must have the same dimensions
    """
    print("r^2: ", r2_score(y_test, y_pred))
    print("mse: ", mean_squared_error(y_test, y_pred))
    print("variance_score: ", explained_variance_score(y_test, y_pred))

In [7]:
seed = 77

# Use smogn

In [8]:
X = data.drop(columns=['Player', 'previous season', 'Next season', 'Salary',
       'salary_cap', 'cap_usage', 'salary_category', 'season_x', 'season_y'])
X = pd.concat([X.drop('Pos', axis=1), pd.get_dummies(X.Pos)], axis=1)

y = data['cap_usage']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=seed)
both = pd.concat([X_train, y_train], axis=1).reset_index().drop(columns=["index"])
smote_data_high = smogn.smoter(data = both,
                           y = 'cap_usage',
                           rel_xtrm_type = 'high',
                           rel_thres = 0.85,
                           k = 11,
                           drop_na_col = True,       ## boolean (True or False)
                           drop_na_row = True,
                           samp_method = 'balance')
smote_data_mid = smogn.smoter(data = both,
                           y = 'cap_usage',
                           rel_xtrm_type = 'both',
                           rel_thres = 0.2,
                           k = 11,
                           drop_na_col = True,       ## boolean (True or False)
                           drop_na_row = True,
                           samp_method = 'balance')
smote_data_low = smogn.smoter(data = both,
                           y = 'cap_usage',
                           rel_xtrm_type = 'both',
                           rel_thres = 0.01,
                           k = 11,
                           drop_na_col = True,       ## boolean (True or False)
                           drop_na_row = True,
                           samp_method = 'extreme')
smote_data = pd.concat([smote_data_high, smote_data_mid, smote_data_low], ignore_index=True).drop_duplicates()
sm = smote_data.sample(frac=1)
y = sm[['cap_usage']]
X = sm.drop(columns=['cap_usage'])

dist_matrix: 100%|##########| 109/109 [00:19<00:00,  5.56it/s]
synth_matrix: 100%|##########| 109/109 [00:02<00:00, 39.78it/s]
r_index: 100%|##########| 106/106 [00:00<00:00, 113.22it/s]
dist_matrix: 100%|##########| 295/295 [02:13<00:00,  2.22it/s]
r_index: 100%|##########| 248/248 [00:02<00:00, 118.16it/s]
dist_matrix: 100%|##########| 481/481 [05:53<00:00,  1.36it/s]
r_index: 100%|##########| 144/144 [00:01<00:00, 120.13it/s]


# Feature Selection

In [10]:
select = SelectKBest(f_regression, k=90)

In [17]:
best_features = select.fit_transform(X, y)
feature_scores = pd.concat([pd.Series(X.columns), pd.Series(select.scores_)], axis=1)
feature_scores.columns = ["features", "score"]
features = list(feature_scores.sort_values(by=["score"], ascending=False).head(30)['features'])

In [16]:
feature_scores.sort_values(by=["score"], ascending=False).head(35)

Unnamed: 0,features,score
3,MPG,2604.84526
107,win_shares,2318.493313
112,value_over_replacement_player,2244.2705
113,poss,2171.007019
118,war_reg_season,2035.39244
117,war_total,1988.707306
105,offensive_win_shares,1870.777191
2,GS,1782.442765
109,offensive_box_plus_minus,1392.578305
93,player_efficiency_rating,1350.647447


In [18]:
X_train = X_train[features]
X_test = X_test[features]
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=seed)

In [19]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [20]:
evaluate(lr.predict(X_val), y_val)

r^2:  0.6921567441166873
mse:  0.0012572652043942182
variance_score:  0.6938715622050913


In [21]:
evaluate(lr.predict(X_test), y_test)

r^2:  0.6476358541784755
mse:  0.0017429092926665134
variance_score:  0.649601651067561


In [22]:
ridge_cv = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=5)
ridge_cv.fit(X_train, y_train)

RidgeCV(alphas=array([0.001, 0.01 , 0.1  , 1.   ]), cv=5)

In [23]:
evaluate(ridge_cv.predict(X_val), y_val)

r^2:  0.6838177440954116
mse:  0.0012913225838099506
variance_score:  0.685863068389853


In [24]:
evaluate(ridge_cv.predict(X_test), y_test)

r^2:  0.6565882964911789
mse:  0.0016986275600217428
variance_score:  0.6581752737623041


# Light GBM

In [25]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [26]:
import lightgbm as lgb

In [27]:
lgb_toy = lgb.LGBMRegressor(boosting_type = 'dart', 
                            n_estimators=2500)

In [28]:
lgb_toy.fit(X_train, y_train)

LGBMRegressor(boosting_type='dart', n_estimators=2500)

In [29]:
r2_score(y_val, lgb_toy.predict(X_val))

0.7085240442943114

In [30]:
r2_score(y_test, lgb_toy.predict(X_test))

0.6314890085252827

In [63]:
boosting_type = ['gbdt', 'dart', 'goss', 'rf']

objective = ['fair', 'poisson', 'quantile']

num_leaves = range(1, 24, 3)

min_child_weight = range(1, 25, 3)

learning_rate = [1e-4, 5e-3, 1e-3, 5e-2, 1e-2, 5e-2, 1e-1, 0.2, 0.3]

min_split_gain = np.arange(0.005, 0.02, 0.001)

lightgbm_grid = {'num_leaves': num_leaves, 
                 'min_child_weight': min_child_weight,
                 'min_split_gain': min_split_gain,
                 'objective': objective,
                 'learning_rate': learning_rate}

In [64]:
lightgbm = lgb.LGBMRegressor(boosting_type = 'dart', n_estimators = 3000)

lightgbm_random = RandomizedSearchCV(estimator = lightgbm,
                                     param_distributions = lightgbm_grid, n_iter = 250, cv = 5,
                                     verbose=3, n_jobs = -1, random_state = seed)
# Fit the random search model
lightgbm_random.fit(X_train, y_train)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits


RandomizedSearchCV(cv=5,
                   estimator=LGBMRegressor(boosting_type='dart',
                                           n_estimators=3000),
                   n_iter=250, n_jobs=-1,
                   param_distributions={'learning_rate': [0.0001, 0.005, 0.001,
                                                          0.05, 0.01, 0.05, 0.1,
                                                          0.2, 0.3],
                                        'min_child_weight': range(1, 25, 3),
                                        'min_split_gain': array([0.005, 0.006, 0.007, 0.008, 0.009, 0.01 , 0.011, 0.012, 0.013,
       0.014, 0.015, 0.016, 0.017, 0.018, 0.019]),
                                        'num_leaves': range(1, 24, 3),
                                        'objective': ['fair', 'poisson',
                                                      'quantile']},
                   random_state=77, verbose=3)

In [65]:
r2_score(y_val, lightgbm_random.predict(X_val))

0.6887491515612504

In [66]:
r2_score(y_test, lightgbm_random.predict(X_test))

0.6689888185463069

In [67]:
lightgbm_random.best_estimator_

LGBMRegressor(boosting_type='dart', learning_rate=0.3, min_child_weight=22,
              min_split_gain=0.015, n_estimators=3000, num_leaves=22,
              objective='poisson')

In [68]:
with open('../scraping/salary_categories_dict.pickle', 'rb') as handle:
    salary_categories_dict = pickle.load(handle)

In [69]:
def assign_category(row, index='pred_salary'):
    season_dict = salary_categories_dict[row['Next season']]
    sorted_season_dict = dict(sorted(season_dict.items(), key=lambda item: abs(item[0] - row[index])))
    return list(sorted_season_dict.values())[0]

In [70]:
tmp = data.loc[y_test.index][['Player', 'previous season', 'cap_usage', 'salary_cap',  'Salary', 'salary_category', 'Next season']]
tmp['pred_cap_usage'] = lightgbm_random.predict(X_test)
tmp['pred_salary'] = tmp['pred_cap_usage'] * tmp['salary_cap']
tmp['pred_salary'] = tmp['pred_salary'].apply(int)
tmp = tmp[['Player', 'previous season', 'salary_cap', 'cap_usage', 'pred_cap_usage', 'Salary', 'salary_category', 'pred_salary', 'Next season']]
tmp['Salary'] = tmp['Salary'].apply(int)
tmp['salary_category_pred'] = tmp.apply(assign_category, axis=1)
tmp.drop(['Next season'], inplace=True, axis=1)

In [62]:
tmp[150:200]

Unnamed: 0,Player,previous season,salary_cap,cap_usage,pred_cap_usage,Salary,salary_category,pred_salary,salary_category_pred
560,Manu Ginobili,2012-13,58679000,0.127814,0.111624,7500000,MLE,6549976,MLE
719,Jack Cooley,2014-15,70000000,0.012072,0.019519,845059,MIN,1366337,MIN
782,Aaron Brooks,2015-16,94143000,0.02868,0.022138,2700000,MLE,2084154,BAE
872,Zaza Pachulia,2015-16,94143000,0.030783,0.087223,2898000,MLE,8211445,MLE
284,Mario West,2008-09,57700000,0.007827,0.020601,451595,MIN,1188679,MIN
1170,Oshae Brissett,2019-20,115000000,0.006587,0.020131,757453,MIN,2315083,MIN
965,Doug McDermott,2017-18,101869000,0.071988,0.039206,7333333,MLE,3993921,MLE
103,Justin Reed,2005-06,53135000,0.028792,0.020773,1529873,BAE,1103791,MIN
677,Wesley Johnson,2013-14,63065000,0.014513,0.048198,915243,MIN,3039621,MLE
341,Brad Miller,2009-10,58044000,0.075805,0.055385,4400000,MLE,3214789,BAE


In [60]:
tmp['salary_category'] = tmp['salary_category'].apply(lambda x: x[-3:])
tmp['salary_category_pred'] = tmp['salary_category_pred'].apply(lambda x: x[-3:])

In [61]:
np.sum(tmp['salary_category'] == tmp['salary_category_pred']) / tmp.shape[0]

0.625