## NBA Position Predictor Project

### Created 16 November 2021

## Creating a Model

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from get_stats import *

In [None]:
# Load the full player-statistics table.
# `get_full_dataframe` is not defined in this notebook; presumably it is
# provided by the `from get_stats import *` star import — confirm.
data = get_full_dataframe()

In [None]:
# Work under the name `stats_df` (this is the same object as `data`, not a copy).
stats_df = data

# Sanity check: list the distinct position labels recorded for each season.
[
    (season, list(set(stats_df.loc[stats_df['Year'] == season, 'POS'])))
    for season in range(2017, 2022)
]

In [None]:
# Keep every season except 2017, and take an independent copy so the column
# assignments made in later cells do not write into a slice of `data`.
stats_df = stats_df.loc[stats_df['Year'].ne(2017)].copy()
stats_df

In [None]:
# Integer codes for the position labels, plus the inverse lookup built from them.
# (An alternative PG/SG/SF/PF/C labelling is not used in this notebook.)
pos_encode = {"G": 0, "F": 1, "C": 2, "GF": 3, "FC": 4}
pos_decode = {code: label for label, code in pos_encode.items()}

# Two label granularities are derived from the raw "POS" column:
#   POS3 in {"G", "F", "C"}             - a hybrid collapses to its first-listed position
#   POS5 in {"G", "GF", "F", "FC", "C"} - a hybrid becomes its own combined class
to_pos3 = {"G-F": "G", "F-G": "F", "C-F": "C", "F-C": "F"}
to_pos5 = {"G-F": "GF", "F-G": "GF", "C-F": "FC", "F-C": "FC"}

stats_df.loc[:, "POS3"] = stats_df["POS"].replace(to_pos3)
stats_df.loc[:, "POS5"] = stats_df["POS"].replace(to_pos5)
stats_df

In [None]:
# Class balance: number of rows per label for each of the two label schemes.
for label_col in ('POS3', 'POS5'):
    print(stats_df.groupby(label_col)[label_col].count())

In [None]:
# Summary statistics of the prepared frame (displayed as the cell output).
stats_df.describe()

In [None]:
# Print the frame's column names, dtypes and non-null counts.
stats_df.info()

### Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [None]:
# Split the frame into the model inputs X and the labels y.
# Features are chosen by position: the first four and the last three columns are
# left out (an extra filter dropping 'MPG' and 'PPG' is currently disabled).
# NOTE(review): presumably the skipped columns are identifiers and the position
# labels — confirm against the actual column order.
# Explicit alternative, kept for reference:
#   ['MIN%', 'USG%', 'TO%', 'FTA', 'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%', 'PPG',
#    'RPG', 'TRB%', 'APG', 'AST%', 'SPG', 'BPG', 'TOPG', 'VI', 'ORTG', 'DRTG']
selected_features = list(stats_df.columns[4:-3])
X = stats_df[selected_features]  # observations
y = stats_df['POS3']             # three-class position label

In [None]:
def SupportVector(X, y, target_names, test_size=0.3, random_state=6):
    """Train an SVM on a stratified split and print its test-set metrics.

    The features are standardised with statistics learned from the training
    portion only, an ``SVC`` with default settings (RBF kernel) is fitted, and
    the test accuracy plus a per-class classification report are printed.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and ``StandardScaler``.
    y : class labels aligned with ``X``; also used to stratify the split.
    target_names : display names handed to ``classification_report``. They are
        matched to the labels in sorted order, so list them in that order.
    test_size : fraction of rows held out for evaluation (default 0.3).
    random_state : seed that makes the split reproducible (default 6).

    Returns
    -------
    None. The results are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Fit the scaler on the training rows only so no test-set statistics leak in.
    scaler = StandardScaler().fit(X_train)
    X_train_std = scaler.transform(X_train)
    X_test_std = scaler.transform(X_test)

    classifier = SVC()  # the default kernel is rbf
    classifier.fit(X_train_std, y_train)

    # Predict once and reuse the predictions for both metrics; this prints the
    # same accuracy that classifier.score(X_test_std, y_test) would compute.
    y_pred = classifier.predict(X_test_std)
    print('The accuracy of the model is: {}'.format(accuracy_score(y_test, y_pred)))

    print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# Three-class run; the display names are listed in the sorted order of the labels.
# For the five-class variant, pass stats_df['POS5'] together with
# target_names=['C', 'F', 'FC', 'G', 'GF'].
SupportVector(X=X, y=stats_df['POS3'], target_names=['C', 'F', 'G'])

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Convert the string position labels to their integer codes from `pos_encode`.
# Values that are already codes pass through unchanged, so re-running this cell
# is a no-op instead of raising KeyError. The int cast still fails loudly if a
# label without a code (or a missing value) is present.
stats_df["POS3"] = stats_df["POS3"].map(lambda pos: pos_encode.get(pos, pos)).astype(int)
stats_df["POS5"] = stats_df["POS5"].map(lambda pos: pos_encode.get(pos, pos)).astype(int)

In [None]:
# Rebuild the inputs X and labels y for the random forest.
# Same positional rule as for the SVM: skip the first four and the last three
# columns (the optional 'MPG' / 'PPG' exclusion is disabled).
# Explicit alternative, kept for reference:
#   ['MIN%', 'USG%', 'TO%', 'FTA', 'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%', 'PPG',
#    'RPG', 'TRB%', 'APG', 'AST%', 'SPG', 'BPG', 'TOPG', 'VI', 'ORTG', 'DRTG']
selected_features = list(stats_df.columns[4:-3])
X = stats_df[selected_features]  # observations
y = stats_df['POS3']             # position label, integer-coded by the preceding cell

In [None]:
# Stratified 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=6, stratify=y
)

# Standardise using statistics learned from the training rows only.
# NOTE(review): the forest cells below fit and predict on the unscaled
# X_train / X_test, so X_train_std / X_test_std appear unused in this section.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)


In [None]:
# Fit a 10-tree random forest (fixed seed) on the raw, unscaled training features.
# NOTE(review): this is a regressor fitted to the integer position codes, so the
# codes are treated as ordered numbers — confirm this is intended rather than
# a classifier.
rf = RandomForestRegressor(n_estimators=10, random_state=7)
rf.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows, then score the predictions with mean squared
# error (computed against the label values in y_test).
y_pred = rf.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('The mse of the model is: {}'.format(mse))

In [None]:
# Report the feature with the largest importance (the first one wins on ties).
importances = rf.feature_importances_
n = importances.max()
param = selected_features[importances.argmax()]
print(f"The most important parameter was '{param}'.")

In [None]:
# sorted([(n, selected_features[list(rf.feature_importances_).index(n)]) for n in rf.feature_importances_], key = lambda x: x[0])