In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import neighbors
import plotly.express as px
import sklearn.metrics as metrics



<h2> Retrieving the Data </h2>

In [None]:
# Load the historical and current-season player tables.
# Players that were traded show up more than once, so only their first row is kept:
# one row per NAME for the current season, one row per (NAME, AGE) for past seasons.
past_df = (
    pd.read_csv('csv_files/Past_Player_Data.csv')
    .drop_duplicates(subset=['NAME', 'AGE'], keep='first')
    .reset_index(drop=True)
)
current_df = (
    pd.read_csv('csv_files/Current_players.csv')
    .drop_duplicates(subset='NAME', keep='first')
    .reset_index(drop=True)
)

# Optional filter (disabled): keep only players with PTS > 18.
# current_df = current_df[current_df['PTS'] > 18]
# past_df = past_df[past_df['PTS'] > 18]

# Missing values and single-space cells are both replaced with 0.
past_df = past_df.fillna(0.000).replace(' ', 0)
current_df = current_df.fillna(0.000).replace(' ', 0)
current_df

In [None]:
# Show the column labels of the historical table.
past_df.columns

<h2> Preparing data and methods </h2>

In [None]:
# Predictor columns passed to every model.
features = [
    'TEAM_STANDING',
    'ALLSTAR',
    'PTS',
    'TRB',
    'AST',
    'TS%',
    'VORP',
    'WS',
]

In [None]:
# Hold out 25% of the historical rows for evaluation; the fixed seed keeps the
# split reproducible.
train, test = train_test_split(past_df, test_size=0.25, random_state=10)

# Coerce predictors and target to numeric on BOTH splits. Previously only the
# training split was converted, so the test split could reach the models and the
# metrics with different dtypes than the data the models were fitted on.
xtrain = train[features].apply(pd.to_numeric)
ytrain = train[['ALL_NBA']].apply(pd.to_numeric)

xtest = test[features].apply(pd.to_numeric)
ytest = test[['ALL_NBA']].apply(pd.to_numeric)

In [None]:
def score(model):
    """Fit ``model`` on the training split and print two R2 summaries.

    Reads the notebook-level ``xtrain``, ``ytrain``, ``xtest`` and ``ytest``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place on
        ``xtrain`` / ``ytrain``, so it can be used for prediction afterwards.

    Prints
    ------
    * R2 of the fitted model on the held-out test split.
    * Mean R2 of a 4-fold cross-validation on the training split.
    """
    # Flatten the single-column target frame to the 1-D array estimators expect.
    y_train = ytrain.values.ravel()

    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)

    print("R2 value is: " + str(r2_score(ytest, y_pred)))

    # Cross-validate on the training split, not the test split: the test rows are
    # reserved for the hold-out R2 above and must not be used to refit the model.
    # cross_val_score works on clones, so `model` keeps its fit from above.
    cvs = cross_val_score(model, xtrain, y_train, cv=4, scoring='r2')
    print("Cross Validation Score is: " + str(cvs.mean()))

In [None]:
def print_all_nba_teams(df):
    """Print the first, second and third All-NBA teams from a ranked table.

    Each team is printed as a list of names: two guards, two forwards and one
    centre, taken in the row order of ``df``. The caller is expected to pass the
    table already sorted best-first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``NAME`` column and a ``POS`` column. Positions are matched
        exactly against ' PG', ' SG', ' SF', ' PF' and ' C' (with the leading
        space); rows with any other position are ignored.

    Notes
    -----
    If a position group has fewer players than the three teams need, the
    affected teams are printed with the players that are available (possibly an
    empty list) instead of raising ``IndexError``.
    """
    positions = df['POS']
    guards = df.loc[positions.isin([' PG', ' SG']), 'NAME']
    forwards = df.loc[positions.isin([' SF', ' PF']), 'NAME']
    centres = df.loc[positions == ' C', 'NAME']

    # Team t takes guards/forwards 2t and 2t+1 and centre t. Slices never raise
    # on short groups, unlike single-position .iloc lookups.
    for team in range(3):
        roster = (
            guards.iloc[2 * team:2 * team + 2].tolist()
            + forwards.iloc[2 * team:2 * team + 2].tolist()
            + centres.iloc[team:team + 1].tolist()
        )
        print(roster)


In [None]:
# Display the current-season table.
current_df

In [None]:
# Split the current-season table into the pieces every model's prediction cell uses.
# NOTE(review): names and positions are picked by column position (0 and 2) —
# presumably the NAME and POS columns; confirm against the CSV layout.
current_name_df, current_position_df = current_df.iloc[:, 0], current_df.iloc[:, 2]

# Predictor matrix for the current players, with any missing value set to 0.
current_players_df = current_df[features].fillna(0)

<h2> Support Vector Regression  </h2>


In [None]:
# RBF-kernel support vector regressor; score() fits it and prints its R2 figures.
svr = SVR(
    kernel='rbf',
    gamma=1e-4,
    C=100,
    epsilon=.1,
)

score(svr)

In [None]:
# Predicted All-NBA score for every current player, from the fitted SVR model.
predict_svr = svr.predict(current_players_df).tolist()

players_svr = []  # not used in this cell; kept so any other reference still resolves

# Build the results table in one step. DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies the whole frame on every iteration.
svr_df = pd.DataFrame(
    list(zip(current_name_df, current_position_df, predict_svr)),
    columns=['NAME', 'POS', 'ALL_NBA_SCORE'],
)

# Highest predicted score first.
svr_df = svr_df.sort_values(by='ALL_NBA_SCORE', ascending=False)
svr_df

In [None]:
# Bar chart of the first 15 rows of the SVR results table.
fig = px.bar(
    svr_df.head(15),
    x='NAME',
    y='ALL_NBA_SCORE',
    title="SVR All-NBA Prediction",
    labels={"NAME": "Player", 'ALL_NBA_SCORE': "All-NBA Score"},
)
fig.show()


In [None]:
# Print the three All-NBA teams picked from the SVR results table.
print_all_nba_teams(svr_df)

<h2> Linear Regression </h2>

In [None]:
# Ordinary least-squares baseline; score() fits it and prints its R2 figures.
lr = LinearRegression()

score(lr)

In [None]:
# Predicted All-NBA score for every current player, from the fitted linear model.
predict_lr = lr.predict(current_players_df).tolist()

players_lr = []  # not used in this cell; kept so any other reference still resolves

# Build the results table in one step. DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies the whole frame on every iteration.
lr_df = pd.DataFrame(
    list(zip(current_name_df, current_position_df, predict_lr)),
    columns=['NAME', 'POS', 'ALL_NBA_SCORE'],
)

# Highest predicted score first; show the top 30 rows.
lr_df = lr_df.sort_values(by='ALL_NBA_SCORE', ascending=False)
lr_df.head(30)

In [None]:
# Bar chart of the first 15 rows of the linear-regression results table.
fig = px.bar(
    lr_df.head(15),
    x='NAME',
    y='ALL_NBA_SCORE',
    title="LR All-NBA Prediction",
    labels={"NAME": "Player", 'ALL_NBA_SCORE': "All-NBA Score"},
)
fig.show()

In [None]:
# Print the three All-NBA teams picked from the linear-regression results table.
print_all_nba_teams(lr_df)

<h2> Decision Tree Regression </h2>

In [None]:
# Hyper-parameter grid for the decision tree.
# max_features=None (use every feature) replaces 'auto': for regressors the two
# were equivalent, and 'auto' is rejected by scikit-learn 1.3 and later.
parameters = {
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [None, 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
}

# A fixed seed makes the search repeatable (max_features='sqrt' samples features
# at random). The target is flattened to 1-D, as score() does.
gs_clf = GridSearchCV(DecisionTreeRegressor(random_state=31), parameters, n_jobs=-1, verbose=10)
gs_clf = gs_clf.fit(xtrain, ytrain.values.ravel())
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
#I prefer the output of the model without hyperparameter tuning; the tuned model is the one enabled in the next cell, so swap in the commented-out line there to use the untuned model

In [None]:
# Tuned decision tree (active). The commented line below is the untuned alternative.
dtr = DecisionTreeRegressor(
    random_state=31,
    max_depth=80,
    max_features='sqrt',
    min_samples_leaf=4,
)
# dtr = DecisionTreeRegressor(random_state = 31)

score(dtr)


In [None]:
# Predicted All-NBA score for every current player, from the fitted decision tree.
predict_dtr = dtr.predict(current_players_df).tolist()

players_dtr = []  # not used in this cell; kept so any other reference still resolves

# Build the results table in one step. DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies the whole frame on every iteration.
dtr_df = pd.DataFrame(
    list(zip(current_name_df, current_position_df, predict_dtr)),
    columns=['NAME', 'POS', 'ALL_NBA_SCORE'],
)

# Highest predicted score first.
dtr_df = dtr_df.sort_values(by='ALL_NBA_SCORE', ascending=False)
dtr_df

In [None]:
# Bar chart of the first 15 rows of the decision-tree results table.
fig = px.bar(
    dtr_df.head(15),
    x='NAME',
    y='ALL_NBA_SCORE',
    title="DTR All-NBA Prediction",
    labels={"NAME": "Player", 'ALL_NBA_SCORE': "All-NBA Score"},
)
fig.show()

In [None]:
# Print the three All-NBA teams picked from the decision-tree results table.
print_all_nba_teams(dtr_df)

<h2> Random Forest </h2>

In [None]:
# Hyper-parameter grid for the random forest.
# max_features=None (use every feature) replaces 'auto': for regressors the two
# were equivalent, and 'auto' is rejected by scikit-learn 1.3 and later.
parameters = {
    'n_estimators': [100, 200, 400, 600],
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': [None, 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
}

# A fixed seed makes the search repeatable. The target is flattened to 1-D, as
# score() does; passing the single-column frame makes the forest emit a
# DataConversionWarning on every fit.
gs_clf = GridSearchCV(RandomForestRegressor(random_state=8), parameters, n_jobs=-1, verbose=10)
gs_clf = gs_clf.fit(xtrain, ytrain.values.ravel())
print(gs_clf.best_score_)
print(gs_clf.best_params_)

<p> When running grid search on different parameters, I found that n_estimators should be 100 and criterion should be mse. </p>

In [None]:
# Random forest regressor with fixed hyper-parameters.
# criterion is left at its default, which is the squared-error (MSE) criterion in
# every scikit-learn version: the explicit 'mse' spelling was removed in 1.2, and
# its replacement 'squared_error' does not exist before 1.0.
rf = RandomForestRegressor(
    bootstrap=True,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=2,
    random_state=8,
    n_estimators=200,
)
score(rf)

In [None]:
# Predicted All-NBA score for every current player, from the fitted random forest.
predict_rf = rf.predict(current_players_df).tolist()

players_rf = []  # not used in this cell; kept so any other reference still resolves

# Build the results table in one step. DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies the whole frame on every iteration.
rf_df = pd.DataFrame(
    list(zip(current_name_df, current_position_df, predict_rf)),
    columns=['NAME', 'POS', 'ALL_NBA_SCORE'],
)

# Highest predicted score first.
rf_df = rf_df.sort_values(by='ALL_NBA_SCORE', ascending=False)
rf_df

In [None]:
# Bar chart of the first 15 rows of the random-forest results table.
fig = px.bar(
    rf_df.head(15),
    x='NAME',
    y='ALL_NBA_SCORE',
    title="RF All-NBA Prediction",
    labels={"NAME": "Player", 'ALL_NBA_SCORE': "All-NBA Score"},
)
fig.show()

In [None]:
# Print the three All-NBA teams picked from the random-forest results table.
print_all_nba_teams(rf_df)

<h2> K Nearest Neighbours </h2>

In [None]:
# Search over the neighbour weighting scheme and every k from 1 to 30.
parameters = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': list(range(1, 31)),
}

gs_clf = GridSearchCV(
    neighbors.KNeighborsRegressor(),
    parameters,
    n_jobs=-1,
    verbose=10,
)
gs_clf = gs_clf.fit(xtrain, ytrain)
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
# Distance-weighted 19-nearest-neighbour regressor; score() fits it and prints its R2 figures.
knn = neighbors.KNeighborsRegressor(
    n_neighbors=19,
    weights='distance',
)

score(knn)

In [None]:
# Predicted All-NBA score for every current player, from the fitted KNN model.
predict_knn = knn.predict(current_players_df).tolist()

players_knn = []  # not used in this cell; kept so any other reference still resolves

# Build the results table in one step. DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies the whole frame on every iteration.
knn_df = pd.DataFrame(
    list(zip(current_name_df, current_position_df, predict_knn)),
    columns=['NAME', 'POS', 'ALL_NBA_SCORE'],
)

# Highest predicted score first.
knn_df = knn_df.sort_values(by='ALL_NBA_SCORE', ascending=False)
knn_df

In [None]:
# Bar chart of the first 15 rows of the KNN results table.
fig = px.bar(
    knn_df.head(15),
    x='NAME',
    y='ALL_NBA_SCORE',
    title="KNN All-NBA Prediction",
    labels={"NAME": "Player", 'ALL_NBA_SCORE': "All-NBA Score"},
)
fig.show()

In [None]:
# Print the three All-NBA teams picked from the KNN results table.
print_all_nba_teams(knn_df)