In [47]:
#create a machine learning model that predicts a players PPG from their previous-season output

def dataclean(data):
    #imports
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
    
    #remove rows that have a null ppg feature (meaning they did not play in the nba the year prior)
    data = data[data['PPG'].notna()]
    #remove player name from data (Player)
#     data = data.drop(['Player','RPG','APG'],axis=1)
    
    #transform data
    
    y = data['PPG']
    X = data.drop('PPG', axis=1)
    
    #convert categorical text to integers (Pos)
    jobs_encoder = OneHotEncoder()
#     X['Pos'] = label_encoder.fit_transform(X['Pos'])
    
    transformed = jobs_encoder.fit_transform(X['Pos'].to_numpy().reshape(-1, 1))
    #Create a Pandas DataFrame of the hot encoded column
    ohe_df = pd.DataFrame(transformed, columns=jobs_encoder.get_feature_names())
    #concat with original data
    X = pd.concat([X, ohe_df], axis=1).drop(['Pos'], axis=1)
    
    transformed = jobs_encoder.fit_transform(X['Player'].to_numpy().reshape(-1, 1))
    #Create a Pandas DataFrame of the hot encoded column
    ohe_df = pd.DataFrame(transformed, columns=jobs_encoder.get_feature_names())
    #concat with original data
    X = pd.concat([X, ohe_df], axis=1).drop(['Player'], axis=1)
    
    #train test split
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.7,random_state=42)
    
    #standard scaler on test set
    sc = StandardScaler()
    
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    return X_train, X_test, y_train, y_test

In [5]:
def featureselection(X_train,X_test,y_train,y_test):
    
    #imports
    from sklearn.feature_selection import SelectKBest, chi2
    
    #select k best features
    skb = SelectKBest(k=20)
    X_train = skb.fit_transform(X_train, y_train)
    X_test = skb.transform(X_test)
    
    return X_train, X_test

In [8]:
def model(X_train, X_test, y_train, y_test):
    
    #import
    from numpy import mean, std
    from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
    from lightgbm import LGBMRegressor
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import RepeatedKFold

    model = LGBMRegressor()
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
    print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
    # fit the model on the whole dataset
    model = LGBMRegressor()
    model.fit(X_train, y_train)
    
    #tuner
    def tuner(X_train,y_train):
        
        #imports
        from sklearn.model_selection import GridSearchCV
        from sklearn.linear_model import Ridge
        
        param_grid = { 'alpha': [0.5, 1.0, 1.5],  
                       'tol': [1, 0.1, 0.01, 0.001, 0.0001], 
                       'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']  }
        
        grid = GridSearchCV(Ridge(),param_grid)
        
        # fitting the model for grid search 
        grid.fit(X_train, y_train) 

        # print best parameter after tuning 
        print(grid.best_params_)
    
#     tuner(X_train,y_train)
    
    y_pred = model.predict(X_test)
    
    #r2
    r2 = r2_score(y_test,y_pred)
    print('r squared')
    print(r2)
    #rmse
    rmse = mean_squared_error(y_test,y_pred, squared=False)
    print('root mean square error')
    print(rmse)
    #mae
    mae = mean_absolute_error(y_test,y_pred)
    print('mean absolute error')
    print(mae)

In [11]:
def model_svr(X_train, X_test, y_train, y_test):
    
    #import
    from sklearn import svm
    from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
    
    reg = svm.SVR(kernel='linear', gamma='auto', shrinking=True, tol=1)
    
    #tuner
    def tuner(X_train,y_train):
        
        #imports
        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVR
        
        param_grid = { 'gamma': ['scale','auto'],  
                       'tol': [1, 0.1, 0.01, 0.001, 0.0001], 
                        'shrinking':[True, False]}
        
        grid = GridSearchCV(SVR(),param_grid)
        
        # fitting the model for grid search 
        grid.fit(X_train, y_train) 

        # print best parameter after tuning 
        print(grid.best_params_)
    
#     tuner(X_train,y_train)
    reg.fit(X_train,y_train)
    
    y_pred = reg.predict(X_test)
    
    #r2
    r2 = r2_score(y_test,y_pred)
    print('r squared')
    print(r2)
    #rmse
    rmse = mean_squared_error(y_test,y_pred, squared=False)
    print('root mean square error')
    print(rmse)
    #mae
    mae = mean_absolute_error(y_test,y_pred)
    print('mean absolute error')
    print(mae)

In [12]:
def model_tree(X_train, X_test, y_train, y_test):
    
    #import
    from sklearn import ensemble
    from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
    
    reg = ensemble.RandomForestRegressor()
    
    #tuner
    def tuner(X_train,y_train):
        
        #imports
        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVR
        
        param_grid = { 'gamma': ['scale','auto'],  
                       'tol': [1, 0.1, 0.01, 0.001, 0.0001], 
                        'shrinking':[True, False]}
        
        grid = GridSearchCV(SVR(),param_grid)
        
        # fitting the model for grid search 
        grid.fit(X_train, y_train) 

        # print best parameter after tuning 
        print(grid.best_params_)
    
#     tuner(X_train,y_train)
    reg.fit(X_train,y_train)
    
    y_pred = reg.predict(X_test)
    
    #r2
    r2 = r2_score(y_test,y_pred)
    print('r squared')
    print(r2)
    #rmse
    rmse = mean_squared_error(y_test,y_pred, squared=False)
    print('root mean square error')
    print(rmse)
    #mae
    mae = mean_absolute_error(y_test,y_pred)
    print('mean absolute error')
    print(mae)

In [13]:
def main():

    import pandas as pd
    data = pd.read_csv('Seasons_Stats_Final_allYears.csv')

    #prepares dataset for machine learning model
    X_train, X_test, y_train, y_test= dataclean(data)
    
    #feature selection
    X_train_, X_test_ = featureselection(X_train,X_test,y_train,y_test)
    
    

In [14]:
if __name__ == "__main__":
    main()

In [9]:
import pandas as pd
data = pd.read_csv('Seasons_Stats_Final_allYears.csv')

X_train, X_test, y_train, y_test= dataclean(data)

X_train_, X_test_ = featureselection(X_train,X_test,y_train,y_test)

model(X_train,X_test,y_train,y_test)

MAE: -2.454 (0.095)
r squared
0.7407206184756521
root mean square error
3.119553044214539
mean absolute error
2.389346450215549


In [49]:
#base model --> ridge regression model (best after tuning)

# r squared
# 0.7468619977556834
# root mean square error
# 3.082386243742886
# mean absolute error
# 2.3656133505320853

#not great results

#svr linear kernel (better) --> tune
# r squared
# 0.7424675749817459
# root mean square error
# 3.1090259166567464
# mean absolute error
# 2.355901330681439

In [38]:
import pandas as pd

data = pd.read_csv('Seasons_Stats_Final_allYears.csv')
# #remove rows that have a null ppg feature (meaning they did not play in the nba the year prior)
data = data[data['PPG'].notna()]
# #remove player name from data (Player)
# data = data.drop(['Player','RPG','APG','Pos'],axis=1)

In [39]:
data = data[data['Year']>= 2000]

In [40]:
data.head()

Unnamed: 0,Year,Player,Pos,Age,G,MP,PER,TS%,3PAr,FTr,...,TRB,AST,STL,BLK,TOV,PF,PTS,PPG,RPG,APG
7100,2000,Shareef Abdur-Rahim,SF,23,82,3223,20.2,0.547,0.075,0.431,...,825,271,89,87,249,244,1663,20.53,9.07,3.09
7101,2000,Cory Alexander,PG,26,29,329,8.8,0.381,0.357,0.224,...,42,58,24,2,28,39,82,2.0,0.96,1.38
7102,2000,Ray Allen,SG,24,82,3070,20.6,0.57,0.288,0.282,...,359,308,110,19,183,187,1809,22.02,5.22,4.56
7103,2000,Rafer Alston,PG,23,27,361,4.3,0.31,0.147,0.042,...,23,70,12,0,29,29,60,2.08,0.84,1.84
7104,2000,John Amaechi,C,29,80,1684,13.2,0.505,0.009,0.416,...,266,95,35,37,139,161,836,7.93,3.27,0.9


In [31]:
import smogn

In [32]:
data.head()

Unnamed: 0,Year,Player,Pos,Age,G,MP,PER,TS%,3PAr,FTr,...,TRB,AST,STL,BLK,TOV,PF,PTS,PPG,RPG,APG
0,1980,Kareem Abdul-Jabbar,C,32,82,3143,25.3,0.639,0.001,0.344,...,886,371,81,280,297,216,2034,26.19,10.26,3.4
1,1980,Tom Abernethy,PF,25,67,1222,11.0,0.511,0.003,0.258,...,191,87,35,12,39,118,362,1.62,1.23,0.49
2,1980,Alvan Adams,C,25,75,2168,19.2,0.571,0.002,0.27,...,609,322,108,55,218,237,1118,14.87,7.28,4.59
3,1980,Tiny Archibald,PG,31,80,2864,15.3,0.574,0.023,0.548,...,197,671,106,10,242,218,1131,13.83,2.2,7.73
4,1980,Dennis Awtrey,C,31,26,560,7.4,0.524,0.0,0.833,...,115,40,12,15,27,66,86,2.17,2.3,1.15


In [41]:
from pycaret.regression import *

In [42]:
exp_reg = setup(data, target = 'PPG')#, normalize=True, feature_selection=True, feature_selection_threshold=0.5)

Unnamed: 0,Description,Value
0,session_id,7929
1,Target,PPG
2,Original Data,"(6139, 51)"
3,Missing Values,False
4,Numeric Features,47
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(4297, 1213)"


In [43]:
best = compare_models() #best model - light gradient boosting machine (lightgbm) - r2 89.39 MAE 1.455 MSE 3.954

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,1.4866,4.0609,2.0113,0.8867,0.2132,0.2241,0.364
br,Bayesian Ridge,1.4955,4.0595,2.0129,0.8866,0.2323,0.2461,5.127
ridge,Ridge Regression,1.5127,4.0637,2.0139,0.8866,0.2373,0.2509,1.394
gbr,Gradient Boosting Regressor,1.5211,4.2167,2.0493,0.8825,0.2244,0.2385,3.363
omp,Orthogonal Matching Pursuit,1.5371,4.3327,2.0783,0.8792,0.2358,0.2485,0.173
rf,Random Forest Regressor,1.6039,4.8192,2.1904,0.8656,0.2303,0.2484,8.743
et,Extra Trees Regressor,1.5958,4.9127,2.2112,0.8631,0.2291,0.2459,10.507
en,Elastic Net,1.7022,5.1484,2.2665,0.8563,0.2681,0.3022,2.464
lasso,Lasso Regression,1.7947,5.7059,2.3859,0.8408,0.2833,0.3271,2.943
ada,AdaBoost Regressor,2.1884,7.1359,2.6693,0.8007,0.3727,0.5174,6.003


In [44]:
best_tuned = tune_model(best)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1.7083,5.7397,2.3958,0.8572,0.2335,0.2371
1,1.675,5.176,2.2751,0.8567,0.2522,0.2628
2,1.5655,4.3176,2.0779,0.8798,0.2302,0.2474
3,1.5886,4.6016,2.1451,0.8811,0.2191,0.2191
4,1.5602,4.5308,2.1286,0.8685,0.2138,0.2125
5,1.6286,4.7876,2.1881,0.8712,0.2398,0.258
6,1.4765,3.7728,1.9424,0.8888,0.2251,0.2461
7,1.5078,4.1631,2.0404,0.883,0.2279,0.2261
8,1.4827,3.9609,1.9902,0.8849,0.2386,0.2599
9,1.5566,4.5721,2.1382,0.8568,0.2219,0.2144


In [49]:
best_tuned

LGBMRegressor(bagging_fraction=0.8, bagging_freq=4, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.7,
              importance_type='split', learning_rate=0.3, max_depth=-1,
              min_child_samples=86, min_child_weight=0.001, min_split_gain=0.8,
              n_estimators=140, n_jobs=-1, num_leaves=10, objective=None,
              random_state=7929, reg_alpha=0.1, reg_lambda=0.5, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [23]:
# gbm = create_model('lightgbm')
# tuned_gbm = tune_model(gbm)

# tune multiple models dynamically
top2 = compare_models(n_select = 2, turbo=True)
tuned_top3 = [tune_model(i) for i in top3]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1.5806,4.3847,2.094,0.8944,0.2554,0.2673
1,1.4682,3.9176,1.9793,0.8967,0.2262,0.2319
2,1.6404,4.8162,2.1946,0.8676,0.2309,0.2423
3,1.6392,4.6498,2.1564,0.8795,0.2515,0.2695
4,1.525,4.068,2.0169,0.8926,0.2357,0.2314
5,1.549,4.3748,2.0916,0.8958,0.2404,0.2358
6,1.489,4.0931,2.0231,0.8853,0.238,0.2526
7,1.5616,4.3185,2.0781,0.8746,0.248,0.2552
8,1.5155,3.8624,1.9653,0.889,0.2418,0.2337
9,1.4919,4.0886,2.022,0.888,0.237,0.2492


In [29]:
lgm = tuned_top3[0]

# bagged_lgm = ensemble_model(lgm)

In [30]:
evaluate_model(lgm)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [1]:
!pip install lightgbm



In [3]:
import lightgbm
print(lightgbm.__version__)

3.2.1


In [25]:
def model_lightgbm(X_train, X_test, y_train, y_test):
    import pandas as pd
    from sklearn.metrics import mean_squared_error, r2_score

    import lightgbm as lgb

    print('Loading data...')
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # specify your configurations as a dict
    params = {
        'boosting_type': 'gbdt',
        'objective': None, #previously 'regression'
        'metric': {'l2', 'l1'},
        'num_leaves': 50,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.7,
        'bagging_freq': 6,
        'verbose': 0,
        'colsample_bytree':1.0,
        'min_child_samples':66,
        'min_child_weight':0.001,
        'min_split_gain':0.9,
        'n_estimators':230,
        'n_jobs':-1,
        'random_state':7251,
        'reg_alpha':0.1,
        'reg_lambda':2,
        'silent':True,
        'subsample':1.0,
        'subsample_for_bin':200000,
        'subsample_freq':0
    }
    
#     LGBMRegressor(bagging_fraction=0.7, bagging_freq=6, boosting_type='gbdt',
#                class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
#                importance_type='split', learning_rate=0.1, max_depth=-1,
#                min_child_samples=66, min_child_weight=0.001, min_split_gain=0.9,
#                n_estimators=230, n_jobs=-1, num_leaves=50, objective=None,
#                random_state=7251, reg_alpha=0.1, reg_lambda=2, silent=True,
#                subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

    print('Starting training...')
    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=100,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=3)

    print('Saving model...')
    # save model to file
    gbm.save_model('model.txt')

    print('Starting predicting...')
    # predict
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # eval
    print('The r2 of prediction is:', r2_score(y_test, y_pred))

In [48]:
import pandas as pd
data = pd.read_csv('Seasons_Stats_Final_allYears.csv')

X_train, X_test, y_train, y_test= dataclean(data)

# X_train_, X_test_ = featureselection(X_train,X_test,y_train,y_test)

# model_lightgbm(X_train,X_test,y_train,y_test)

ValueError: Shape of passed values is (11771, 1), indices imply (11771, 15)