In [None]:
 # Essentials
import os
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Tools/Metrics 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import scipy

# Modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor




In [None]:
# Mount Google Drive at /content/drive so the project folder can be reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder so the relative CSV path used when loading the data resolves.
# NOTE(review): this is a hard-coded absolute Drive path; presumably it only exists for the notebook's authors -- confirm before sharing.
import os
path = '/content/drive/MyDrive/CME538 Project 2'
os.chdir(path)

In [None]:
# Load the DPOY voting/statistics table, order it newest season first,
# and renumber the rows from 0 so the index follows that order.
df_dpoy = (
    pd.read_csv('dpoy_player_stats_1984_2022.csv')
    .sort_values('Year', ascending=False)
    .reset_index(drop=True)
)
df_dpoy.head(10)

Unnamed: 0.1,Unnamed: 0,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,DRB%,BPM,PER,OWS,3PAr,ORB%,OBPM,TS%,TRB%,TOV%
0,556,Matisse Thybulle,24,PHI,0.0,1.0,500,0.002,66,25.5,...,7.1,1.1,11.2,1.2,0.48,2.8,-2.2,0.594,5.0,10.9
1,551,Giannis Antetokounmpo,27,MIL,5.0,58.0,500,0.116,67,32.9,...,30.4,11.2,32.1,9.2,0.194,6.6,7.6,0.633,18.7,12.2
2,546,Marcus Smart,27,BOS,37.0,257.0,500,0.514,71,32.3,...,10.5,0.5,13.6,1.9,0.501,2.0,-1.2,0.54,6.4,16.6
3,547,Mikal Bridges,25,PHO,22.0,202.0,500,0.404,82,34.8,...,10.0,1.6,14.4,5.2,0.364,2.9,0.5,0.627,6.5,6.8
4,548,Rudy Gobert,29,UTA,12.0,136.0,500,0.272,66,32.1,...,36.3,4.6,24.7,7.3,0.008,12.9,2.9,0.732,25.0,14.5
5,550,Jaren Jackson Jr.,22,MEM,10.0,99.0,500,0.198,78,27.3,...,16.9,-0.1,17.0,1.7,0.387,5.7,-0.9,0.535,11.2,9.9
6,549,Bam Adebayo,24,MIA,13.0,128.0,500,0.256,56,32.6,...,26.1,3.8,21.8,3.6,0.008,8.7,1.7,0.608,17.5,14.4
7,552,Robert Williams,24,BOS,1.0,8.0,500,0.016,61,29.6,...,20.5,5.5,22.1,6.0,0.003,14.6,2.4,0.745,17.6,13.4
8,553,Jrue Holiday,31,MIL,0.0,6.0,500,0.012,67,32.9,...,10.9,3.2,19.8,4.5,0.336,3.4,2.9,0.593,7.2,15.0
9,554,Al Horford,35,BOS,0.0,3.0,500,0.006,69,29.1,...,22.2,4.3,16.7,3.7,0.466,6.0,1.4,0.574,14.3,9.6


In [None]:
# Print column names, dtypes, and non-null counts to check for missing values before modelling.
df_dpoy.info()

# Model Testing, Evaluation, and Selection

Candidate regression models:
*   Random Forest
*   XGBoost
*   LightGBM

Metrics used to evaluate each model:
*   Mean Absolute Error (MAE) --> lower is better
*   Root Mean Squared Error (RMSE) --> lower is better
*   Coefficient of Determination (R^2) --> higher is better
*   Accuracy (fraction of seasons in which the player with the highest predicted vote share is the actual DPOY winner) --> higher is better













In [None]:
# Hold out the 2022 season as the test set; every other season is used for training.

holdout_mask = df_dpoy['Year'] == 2022
test = df_dpoy[holdout_mask]
train = df_dpoy[~holdout_mask]

# Report how the rows are divided between the two sets.
total_rows = df_dpoy.shape[0]
print('Train {}%'.format(train.shape[0] / total_rows * 100))
print('Test {}%'.format(test.shape[0] / total_rows * 100))

Train 98.02513464991023%
Test 1.9748653500897666%


In [None]:
# Testing Abdullah's features: nine predictors plus the target column 'Share'.
# The column list is written once so the train and test frames cannot drift apart.
abdullah_columns = ['G', 'MP', 'PTS','TRB', 'BLK','WS','GS','VORP', 'BPM','Share']
train_abdullah = train.loc[:, abdullah_columns]
test_abdullah = test.loc[:, abdullah_columns]
train_abdullah.head(10)

Unnamed: 0,G,MP,PTS,TRB,BLK,WS,GS,VORP,BPM,Share
11,63,30.1,15.2,14.3,2.0,8.2,63,2.2,2.7,0.02
12,71,30.8,14.3,13.5,2.7,11.3,71,3.8,4.9,0.928
13,58,32.4,14.3,7.2,0.6,6.0,58,2.3,2.9,0.574
14,63,31.5,7.0,7.1,0.8,4.6,63,1.9,1.7,0.152
15,64,33.5,18.7,9.0,1.0,8.8,64,3.7,4.9,0.062
16,61,33.0,28.1,11.0,1.2,10.2,61,5.6,9.0,0.024
17,52,33.6,21.5,6.9,0.3,9.3,52,4.3,7.7,0.004
18,51,31.1,28.5,10.6,1.4,8.8,51,3.8,7.5,0.014
19,59,32.3,17.7,4.5,0.6,6.6,56,2.6,3.4,0.012
20,47,31.0,12.6,6.5,3.4,3.3,47,0.9,0.4,0.006


In [None]:
# Keep each row's player name, season, and actual vote share. model_summary() reads
# train_for_accuracy to find the real winner of a season and compare it with the prediction.

# Disabled: earlier approach that dropped the identifier / vote-count columns to leave a numeric-only frame.
# train_clean = train.drop(['Player', 'Tm', 'First', 'Pts Won', 'Pts Max', 'Unnamed: 0'], axis = 1)
# test_clean = test.drop(['Player', 'Tm', 'First', 'Pts Won', 'Pts Max', 'Unnamed: 0'], axis = 1)

train_for_accuracy = train[['Player', 'Year', 'Share']]
test_for_accuracy = test[['Player', 'Year', 'Share']]

In [None]:
# Preview the lookup frame; its index labels are the ones carried over from `train`.
train_for_accuracy.head(10)


Unnamed: 0,Player,Year,Share
11,Clint Capela,2021,0.02
12,Rudy Gobert,2021,0.928
13,Ben Simmons,2021,0.574
14,Draymond Green,2021,0.152
15,Bam Adebayo,2021,0.062
16,Giannis Antetokounmpo,2021,0.024
17,Jimmy Butler,2021,0.004
18,Joel Embiid,2021,0.014
19,Jrue Holiday,2021,0.012
20,Myles Turner,2021,0.006


In [None]:
# # Next, let's clean the data (according to Kaison's code)
# def clean_data(df):
#   # impute missing values in 3P% by median
#   df['3P%'] = df['3P%'].fillna(df['3P%'].median())
#   return df

# train = clean_data(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['3P%'] = df['3P%'].fillna(df['3P%'].median())


In [None]:
# train_clean.head(10)

Unnamed: 0,Age,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,...,DRB%,BPM,PER,OWS,3PAr,ORB%,OBPM,TS%,TRB%,TOV%
11,26,0.02,63,30.1,15.2,14.3,0.8,0.7,2.0,0.594,...,34.3,2.7,24.3,4.9,0.0,17.5,2.7,0.601,26.1,8.4
12,28,0.928,71,30.8,14.3,13.5,1.3,0.6,2.7,0.675,...,33.5,4.9,23.5,6.1,0.007,12.2,2.1,0.683,23.3,13.7
13,24,0.574,58,32.4,14.3,7.2,6.9,1.6,0.6,0.557,...,18.7,2.9,18.3,2.7,0.017,5.6,0.7,0.584,12.2,19.6
14,30,0.152,63,31.5,7.0,7.1,8.9,1.7,0.8,0.447,...,20.9,1.7,13.3,1.2,0.332,3.0,-1.6,0.53,12.1,31.0
15,23,0.062,64,33.5,18.7,9.0,5.4,1.2,1.0,0.57,...,22.6,4.9,22.7,5.6,0.01,7.7,2.9,0.626,15.3,15.0
16,26,0.024,61,33.0,28.1,11.0,5.9,1.2,1.2,0.569,...,28.9,9.0,29.2,6.9,0.201,5.3,6.2,0.633,17.5,13.2
17,31,0.004,52,33.6,21.5,6.9,7.1,2.1,0.3,0.497,...,17.1,7.7,26.5,6.6,0.139,6.3,5.4,0.607,11.8,10.6
18,26,0.014,51,31.1,28.5,10.6,2.8,1.0,1.4,0.513,...,29.1,7.5,30.3,5.6,0.171,8.0,6.3,0.636,18.7,12.2
19,30,0.012,59,32.3,17.7,4.5,6.1,1.6,0.6,0.503,...,10.4,3.4,20.0,4.3,0.344,4.2,2.8,0.592,7.4,12.6
20,24,0.006,47,31.0,12.6,6.5,1.0,0.9,3.4,0.477,...,18.0,0.4,15.3,1.1,0.477,4.7,-1.6,0.599,11.4,11.9


In [None]:
# Separate the predictors (train_X) from the regression target 'Share' (train_y).

train_y = train_abdullah['Share']
train_X = train_abdullah.drop(columns = ['Share'])


In [None]:
# Let's create a function that will allow us perform cross-validation on different models to see their different metric scores

from sklearn.model_selection import KFold # Do we want to use KStratifiedFold here, or is regular KFold fine?
from sklearn.base import clone

def cross_validate(model, x, y):

  """
  Run 5-fold cross-validation for a regression model and print its scores.

  A clone of `model` is refit on each training fold and scored on the matching
  validation fold, so the estimator passed in is left untouched.

  Input:
    model: Regression model of choice (unfitted estimator with fit / predict)
    x: X train data set (features only); rows are selected by position (.iloc)
    y: y train data set (target variable only); rows are selected by position (.iloc)

  Output:
    None. Prints, for the validation folds, the list of RMSE scores and their
    mean, the list of MAE scores and their mean, and the list of R2 scores and
    their mean (means rounded to 4 decimals).

  """

  model = clone(model)
  # Fixed seed: every model passed to this function is scored on the same folds.
  five_fold = KFold(n_splits = 5, shuffle = True, random_state = 0)
  RMSE_scores = []
  MAE_scores = []
  R2_scores = []

  for train_index, val_index in five_fold.split(x,y):

    X_train, y_train = x.iloc[train_index], y.iloc[train_index]
    X_val, y_val = x.iloc[val_index], y.iloc[val_index]

    # Fit on the training fold, predict the held-out fold
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_val)

    # RMSE is computed as sqrt(MSE) instead of passing `squared = False`, which
    # newer scikit-learn releases no longer accept. float() keeps the printed
    # lists as plain numbers regardless of the NumPy version.
    RMSE_scores.append(float(np.sqrt(mean_squared_error(y_val, y_predicted))))
    MAE_scores.append(float(mean_absolute_error(y_val, y_predicted)))
    R2_scores.append(float(r2_score(y_val, y_predicted)))

  print('CV RMSE scores: {}\n'.format(RMSE_scores), 'CV RMSE scores mean: {}\n'.format(np.mean(RMSE_scores).round(4)),
        'CV MAE scores: {}\n'.format(MAE_scores), 'CV MAE scores mean: {}\n'.format(np.mean(MAE_scores).round(4)),
        'CV R2 scores: {}\n'.format(R2_scores), 'CV R2 scores mean: {}\n'.format(np.mean(R2_scores).round(4)))

  

In [None]:
# Baseline check: run cross_validate on each of the three candidate regressors without any hyperparameter tuning

# Random Forest Regressor, default hyperparameters with a fixed seed.
# NOTE(review): the "_RS" name suggests Random Search settings, but no tuned values are applied here.

random_forest_RS = RandomForestRegressor(random_state = 0)
cross_validate(random_forest_RS, train_X, train_y)



CV RMSE scores: [0.15881084401948695, 0.1685885891232524, 0.14845593313051308, 0.1695186554211064, 0.1241945696636731]
 CV RMSE scores mean: 0.1539
 CV MAE scores: [0.09678072727272727, 0.10784559633027523, 0.09624559633027523, 0.10399357798165133, 0.08657045871559634]
 CV MAE scores mean: 0.0983
 CV R2 scores: [0.17801472733979296, 0.16471319119141292, 0.3245839462591317, 0.39350097339810397, 0.28587978987152685]
 CV R2 scores mean: 0.2693



In [None]:
# Random Forest Regressor --> slot for the Grid Search hyperparameters.
# NOTE(review): no tuned values are set yet, so this is still a default forest. It is seeded
# (same seed as random_forest_RS) so that its CV scores are reproducible between runs.

random_forest_GS = RandomForestRegressor(random_state = 0)
cross_validate(random_forest_GS, train_X, train_y)

CV RMSE scores: [0.1558554073696631, 0.17080521129492682, 0.150224202645747, 0.1757448139081103, 0.12140189221353809]
 CV RMSE scores mean: 0.1548
 CV MAE scores: [0.09259227272727269, 0.10828825688073396, 0.0994216513761468, 0.10453825688073391, 0.08502880733944954]
 CV MAE scores mean: 0.098
 CV R2 scores: [0.20832400245318305, 0.14260389853211164, 0.30839826213056865, 0.3481312729557118, 0.317634561544788]
 CV R2 scores mean: 0.265



In [None]:
# Let's create a hyperparameter tuning pipeline under each model --> we will use both Random Search to find range of the best parameters,
# then Grid Search to further narrow it down

# Random Forest Regressor - Random Search

# Same fold definition as cross_validate, so search scores are comparable with the baseline scores.
cv = KFold(n_splits = 5, shuffle = True, random_state = 0)

rf_n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# 1.0 = use every feature at each split, which is what 'auto' meant for regressors.
# The 'auto' string itself is no longer accepted by current scikit-learn releases.
rf_max_features = [1.0, 'sqrt']

rf_max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]

# None is offered as one more candidate next to the explicit depths.
rf_max_depth.append(None)

min_samples_split = [2,5,10]

min_samples_leaf = [1,2,3,4]

bootstrap = [True, False]

rf_param_random_grid = {'n_estimators': rf_n_estimators,
               'max_features': rf_max_features,
               'max_depth': rf_max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Both the estimator and the search are seeded so the sampled candidates and the reported
# best_params_ are the same on every run. No `scoring` is given, so candidates are ranked
# by the estimator's own score method.
rf_random = RandomizedSearchCV(estimator = RandomForestRegressor(random_state = 0),
                  param_distributions = rf_param_random_grid,
                  n_iter = 100, cv = cv, n_jobs = -1,
                  random_state = 0)

rf_random_result = rf_random.fit(train_X, train_y)

print(rf_random_result.best_params_)

{'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}


In [None]:
# RF Regressor - Grid Search (based on parameters we get from Random Search)

cv = KFold(n_splits = 5, shuffle = True, random_state = 0)

# NOTE(review): this grid has 2 * 7 * 2 * 4 * 5 * 5 = 2,800 candidates (14,000 fits with 5 folds),
# each a forest of 400-800 trees -- expect a long run. The max_depth values are also not
# centred on the depth printed by the Random Search cell; confirm the intended range.
rf_param_gs = {
    'bootstrap': [True, False],
    'max_depth': [90, 100, 110, 120, 130, 140, 150],
    # 1.0 replaces 'auto' (all features), which current scikit-learn no longer accepts.
    'max_features': [1.0, 'sqrt'],
    'min_samples_leaf': [1,2,3,4],
    # min_samples_split must be at least 2 when given as an integer, so 1 is not a valid candidate.
    'min_samples_split': [2,3,4,5,6],
    'n_estimators': [400, 500, 600, 700, 800]
    }

# Seeded estimator so the selected best_params_ are reproducible.
rf_grid_search = GridSearchCV(estimator = RandomForestRegressor(random_state = 0), param_grid = rf_param_gs,
                          cv = cv, n_jobs = -1)

rf_gs_result = rf_grid_search.fit(train_X, train_y)

print(rf_gs_result.best_params_)

In [None]:
# XGB Regressor --> baseline cross-validation scores.
# NOTE(review): the "_rs" name suggests Random Search settings, but only the objective is set here.

XGB_rs = XGBRegressor(objective = 'reg:squarederror') 
cross_validate(XGB_rs, train_X, train_y)


CV RMSE scores: [0.16568225643040144, 0.18598794106682154, 0.15368641026714755, 0.17265594113208818, 0.12143518473189883]
 CV RMSE scores mean: 0.1599
 CV MAE scores: [0.09202844374613331, 0.11241864825826174, 0.10022447824040684, 0.09636973120750637, 0.07849042027149726]
 CV MAE scores mean: 0.0959
 CV R2 scores: [0.10534471888729746, -0.01659704731523881, 0.2761523077483864, 0.3708442581979624, 0.31726025471230956]
 CV R2 scores mean: 0.2106



In [None]:
# XGB Regressor --> cross-validation scores with hand-entered hyperparameters.
# NOTE(review): presumably these are the best values reported by the XGB searches below -- confirm against their output.

XGB_gs = XGBRegressor(objective = 'reg:squarederror', n_estimators= 1400, min_child_weight= 3, max_depth= 70, learning_rate= 0.15) 
cross_validate(XGB_gs, train_X, train_y)

CV RMSE scores: [0.1636804526627463, 0.1408895451533741, 0.16262987131592507, 0.18180024310693155, 0.13115348841959026]
 CV RMSE scores mean: 0.156
 CV MAE scores: [0.09589273048314181, 0.08408553837636194, 0.09487696735793297, 0.10352697982700593, 0.08243357177174421]
 CV MAE scores mean: 0.0922
 CV R2 scores: [0.1268329000709013, 0.4166397881641225, 0.18945543598393644, 0.3024360480493643, 0.2036099909626694]
 CV R2 scores mean: 0.2478



In [None]:
# XGB Regressor - Random Search

# Same fold definition as cross_validate, so search scores are comparable with the baseline scores.
cv = KFold(n_splits = 5, shuffle = True, random_state = 0)

XGB_n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

learning_rate = [0.00001,0.0001,0.001,0.01,0.015,0.02,0.05,0.1,0.15,0.2]

XGB_max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]

# None is offered as one more candidate next to the explicit depths.
XGB_max_depth.append(None)

min_child_weight = [1,2,3,4]

XGB_param_random_grid = {
               'n_estimators': XGB_n_estimators,
               'learning_rate': learning_rate,
               'max_depth': XGB_max_depth,
               'min_child_weight': min_child_weight}

# The search is seeded so the 100 sampled candidates -- and therefore best_params_ --
# are the same on every run.
XGB_random = RandomizedSearchCV(estimator = XGBRegressor(objective = 'reg:squarederror'),
                  param_distributions = XGB_param_random_grid,
                  n_iter = 100, cv = cv, n_jobs = -1,
                  random_state = 0)

XGB_random_result = XGB_random.fit(train_X, train_y)

print(XGB_random_result.best_params_)

In [None]:
# XGB Regressor - Grid Search (based on parameters we get from Random Search)

# Same fold definition as cross_validate.
cv = KFold(n_splits = 5, shuffle = True, random_state = 0)

# NOTE(review): 5 * 7 * 6 * 4 = 840 candidates, i.e. 4,200 fits with 5 folds, each with
# 1,200-1,600 boosting rounds -- expect a long run.
XGB_param_gs = {
    'n_estimators': [1200, 1300, 1400, 1500, 1600],
    'max_depth': [40, 50, 60, 70, 80, 90, 100],
    'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'min_child_weight': [1,2,3,4]
    }

# No `scoring` is given, so candidates are ranked by the estimator's own score method.
XGB_grid_search = GridSearchCV(estimator = XGBRegressor(objective = 'reg:squarederror'), param_grid = XGB_param_gs, 
                          cv = cv, n_jobs = -1)

XGB_gs_result = XGB_grid_search.fit(train_X, train_y)

print(XGB_gs_result.best_params_)

In [None]:
# LightGBM --> baseline cross-validation scores.
# NOTE(review): the "_rs" name suggests Random Search settings, but the model is built with default hyperparameters.

LGBM_rs = LGBMRegressor()
cross_validate(LGBM_rs, train_X, train_y)

CV RMSE scores: [0.1682315827933756, 0.17056854073368632, 0.15597338740050562, 0.173408620371851, 0.13161788738551894]
 CV RMSE scores mean: 0.16
 CV MAE scores: [0.0967524883371388, 0.11054709466985728, 0.10264725316144727, 0.10301167323665258, 0.08649762951151747]
 CV MAE scores mean: 0.0999
 CV R2 scores: [0.07760106966634583, 0.1449782971823489, 0.2544491509155934, 0.3653467969620826, 0.19796016058871757]
 CV R2 scores mean: 0.2081



In [None]:
# LightGBM --> slot for the Grid Search hyperparameters.
# NOTE(review): no tuned values are set, so this is constructed exactly like LGBM_rs (default LGBMRegressor).

LGBM_gs = LGBMRegressor()
cross_validate(LGBM_gs, train_X, train_y)

In [None]:
# LGBM Regressor - Random Search

# Same fold definition as cross_validate, so search scores are comparable with the baseline scores.
cv = KFold(n_splits = 5, shuffle = True, random_state = 0)

LGBM_n_estimators =  [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

LGBM_learning_rate = [0.00001,0.0001,0.001,0.01,0.015,0.02,0.05,0.1,0.15,0.2]

LGBM_max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]

# None is offered as one more candidate next to the explicit depths.
LGBM_max_depth.append(None)

lambda_l1 = np.arange(0,0.8,0.05)

lambda_l2 = np.arange(0,0.8,0.05)

extra_trees = [True,False]

subsample = np.arange(0.3,1.0,0.05)

bagging_freq = [1,10,15, 20, 25, 30, 40 , 50, 60, 70, 80, 90, 100]

colsample_bytree = np.arange(0.3,1.0,0.05)

num_leaves = np.arange(10,100,10)

boosting = ['gbdt','dart']

drop_rate = np.arange(0.1,0.8,0.1)

# NOTE(review): skip_drop is defined but is not part of the search space below (nor of the
# Grid Search cell) -- either add it to both or drop it.
skip_drop = np.arange(0,0.7,0.1)

LGBM_param_random_grid = {'n_estimators':LGBM_n_estimators,'max_depth':LGBM_max_depth, 'subsample':subsample,
          'colsample_bytree':colsample_bytree,
          'learning_rate': LGBM_learning_rate,
          'num_leaves':num_leaves,
          'boosting':boosting,
          'extra_trees':extra_trees,
          'lambda_l1':lambda_l1,
          'lambda_l2':lambda_l2,
          'bagging_freq':bagging_freq,
          'drop_rate':drop_rate}

# The search is seeded so the 100 sampled candidates -- and therefore best_params_ --
# are the same on every run.
LGBM_random = RandomizedSearchCV(estimator = LGBMRegressor(objective = 'regression'),
                  param_distributions = LGBM_param_random_grid,
                  n_iter = 100, cv = cv, n_jobs = -1,
                  random_state = 0)

LGBM_random_result = LGBM_random.fit(train_X, train_y)

print(LGBM_random_result.best_params_)


<scipy.stats._distn_infrastructure.rv_frozen at 0x7fb629198810>

In [None]:
# LGBM Regressor - Grid Search (based on parameters we get from Random Search)
# Same fold definition as cross_validate.
cv = KFold(n_splits = 5, shuffle = True, random_state = 0)

# NOTE(review): the value lists below multiply out to
# 4 * 3 * 3 * 4 * 4 * 1 * 4 * 3 * 3 * 3 * 1 * 3 = 186,624 candidates, i.e. 933,120 fits with
# 5 folds, each with 1,000-4,000 boosting rounds. Narrow the grid before running this cell.
LGBM_param_gs = {
    'n_estimators': [1000,2000,3000,4000],
    'max_depth': [30,40,50],
    'learning_rate': [0.015,0.02,0.025],
    'lambda_l1': [0, 0.05, 0.10, 0.15],
    'lambda_l2': [0, 0.05, 0.10, 0.15],
    'extra_trees': [True],
    'subsample': [0.85,0.90,0.95,1],
    'bagging_freq': [35,40,45],
    'colsample_bytree': [0.6, 0.65, 0.7],
    'num_leaves': [40, 50, 60],
    'boosting' : ['dart'],
    'drop_rate': [0.2, 0.3, 0.4],
    }

# No `scoring` is given, so candidates are ranked by the estimator's own score method.
LGBM_grid_search = GridSearchCV(estimator = LGBMRegressor(objective = 'regression'), param_grid = LGBM_param_gs, 
                          cv = cv, n_jobs = -1)

LGBM_gs_result = LGBM_grid_search.fit(train_X, train_y)

print(LGBM_gs_result.best_params_)

In [None]:
# Accuracy Score Function Pipeline

def model_summary(model, x, y, year):

  """
  Fit a clone of `model` on (x, y) and name the predicted and actual DPOY
  winner for one season.

  Input:
    model: Regression model of choice (the original object is not refit)
    x: feature rows; its index must line up with train_for_accuracy
    y: target values ('Share') for the rows of x
    year: season whose candidates are compared

  Output:
    Tuple (predicted winner, actual winner) of player names: the player with
    the highest predicted share and the player with the highest actual share
    among that season's rows of the module-level train_for_accuracy frame.

  """

  fitted = clone(model)
  fitted.fit(x, y)

  # Predictions are made for the same rows the model was fit on.
  predictions = pd.DataFrame(fitted.predict(x), index = x.index, columns = ['Predicted Share'])

  # Attach the predictions to this season's candidates through the shared index.
  season_rows = train_for_accuracy.loc[train_for_accuracy['Year'] == year]
  race = pd.merge(season_rows, predictions, left_index = True, right_index = True)

  # Best actual share first (ties ordered by predicted share), so .iloc[0] below
  # resolves ties in a fixed way.
  race = race.sort_values(['Share', 'Predicted Share'], ascending = (False, False))

  top_actual = race.loc[race['Share'] == race['Share'].max(), 'Player']
  top_predicted = race.loc[race['Predicted Share'] == race['Predicted Share'].max(), 'Player']

  return(top_predicted.iloc[0], top_actual.iloc[0])

def model_accuracy(model, x, y, years = None):

  """
  Compare the predicted and actual DPOY winner for each season and report the
  fraction of seasons predicted correctly.

  Input:
    model: Regression model of choice; passed to model_summary for every season
    x: feature rows passed to model_summary
    y: target values passed to model_summary
    years: optional iterable of seasons to evaluate; defaults to 1984-2021
      inclusive

  Output:
    Tuple (model_accuracy_DF, accuracy). model_accuracy_DF has one row per
    season with columns 'Year', 'Predicted DPOY', 'Actual DPOY', 'Label'
    ('Correct' / 'Incorrect'). accuracy is the share of rows labelled
    'Correct' as a float (nan when no seasons are given).

  """

  if years is None:
    years = range(1984,2022)
  years = list(years)

  # NOTE(review): model_summary refits the model on (x, y) and predicts those same rows
  # on every call, so this is an in-sample accuracy and costs one full fit per season.
  rows = []
  for year in years:
    predicted_winner, actual_winner = model_summary(model, x, y, year)
    rows.append({
        'Year': year,
        'Predicted DPOY': predicted_winner,
        'Actual DPOY': actual_winner,
        'Label': 'Correct' if predicted_winner == actual_winner else 'Incorrect'
    })

  model_accuracy_DF = pd.DataFrame(rows, columns = ['Year', 'Predicted DPOY', 'Actual DPOY', 'Label'])

  # Count 'Correct' explicitly. Taking value_counts() by position returns the most frequent
  # label first (not necessarily 'Correct') and fails when only one label is present.
  accuracy = float((model_accuracy_DF['Label'] == 'Correct').mean())

  return(model_accuracy_DF, accuracy)

In [None]:
# RandomForest winner-prediction accuracy for random_forest_RS.
# NOTE(review): model_summary fits on train_X / train_y and predicts those same rows, so this is an in-sample score.

RF_RS_summary_DF, RF_RS_accuracy = model_accuracy(random_forest_RS, train_X, train_y)
RF_RS_accuracy

0.8157894736842105

In [None]:
# RandomForest winner-prediction accuracy for random_forest_GS (in-sample; the value is stored but not displayed).
RF_GS_summary_DF, RF_GS_accuracy = model_accuracy(random_forest_GS, train_X, train_y)

In [None]:
# XGB winner-prediction accuracy for XGB_rs (in-sample: predictions are made on the rows the model was fit on).
XGB_RS_summary_DF, XGB_RS_accuracy = model_accuracy(XGB_rs, train_X, train_y)
XGB_RS_accuracy

0.6842105263157895

In [None]:
# XGB winner-prediction accuracy for XGB_gs (in-sample; the value is stored but not displayed).
XGB_GS_summary_DF, XGB_GS_accuracy = model_accuracy(XGB_gs, train_X, train_y)

In [None]:
# LGBM winner-prediction accuracy for LGBM_rs (in-sample: predictions are made on the rows the model was fit on).

LGBM_RS_summary_DF, LGBM_RS_accuracy = model_accuracy(LGBM_rs, train_X, train_y)
LGBM_RS_accuracy

0.7894736842105263

In [None]:
# LGBM winner-prediction accuracy for LGBM_gs (in-sample; the value is stored but not displayed).
# NOTE(review): LGBM__GS_summary_DF has a double underscore, unlike the other *_summary_DF names.

LGBM__GS_summary_DF, LGBM_GS_accuracy = model_accuracy(LGBM_gs, train_X, train_y)

In [None]:
# Now we test our final model on the test dataset to predict the 2022 DPOY winner

# TODO: apply to the test dataset exactly the same preprocessing and feature selection that was applied to the train dataset

