## NBA Position Predictor Project

### Created 16 November 2021

## Creating a Model

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from get_stats import nbastuffer_dataframe

In [2]:
data = nbastuffer_dataframe(playoffs=False)

In [3]:
stats_df = data
[(i, list(set(stats_df[stats_df['Year'] == i]['POS']))) for i in range(2018, 2022)]  # test

[(2018, ['C-F', 'G', 'F', 'F-C', 'F-G', 'G-F', 'C']),
 (2019, ['C-F', 'G', 'F', 'F-C', '0', 'F-G', 'G-F', 'C']),
 (2020, ['C-F', 'G', 'F', 'F-C', 'F-G', 'G-F', 'C']),
 (2021, ['C-F', 'G', 'F', 'F-C', 'F-G', 'G-F', 'C'])]

In [4]:
stats_df = stats_df[stats_df['POS'] != "0"].copy()  # Nicolo Melli (F) - 2019
stats_df = stats_df[stats_df["MPG"] >= 10 ].copy()  # players that played at least 10 minutes
stats_df = stats_df.drop(["RANK", "NAME", "TEAM", "Year"], axis=1)
stats_df

Unnamed: 0,POS,AGE,GP,MPG,MIN%,USG%,TO%,FTA,FT%,2PA,...,RPG,TRB%,APG,AST%,SPG,BPG,TOPG,VI,ORTG,DRTG
0,G,25.69,31,19.0,39.5,12.2,7.9,13,0.923,30,...,1.5,4.2,0.6,4.3,0.55,0.19,0.45,3.5,103.1,103.9
1,F,28.51,10,12.3,25.6,9.2,15.2,10,0.700,3,...,2.5,11.3,0.8,8.2,0.10,0.40,0.40,4.9,87.1,98.5
2,G,22.93,34,12.6,26.2,13.5,19.7,9,0.778,36,...,1.8,7.5,1.9,20.2,0.38,0.15,0.82,7.0,99.5,108.1
3,C,25.73,80,33.4,69.5,16.4,12.8,292,0.500,807,...,9.5,14.7,1.6,6.6,1.49,0.96,1.73,7.1,119.9,102.7
4,C-F,21.73,82,23.3,48.6,15.8,17.1,226,0.735,471,...,7.3,16.6,2.2,14.2,0.88,0.79,1.48,9.0,120.0,97.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2550,F,33.80,26,18.3,38.1,16.0,12.4,27,0.481,101,...,4.4,13.0,1.7,13.6,1.19,0.42,0.85,7.9,112.8,102.0
2551,G,23.56,76,34.9,72.7,34.4,14.5,553,0.904,933,...,3.7,5.9,9.7,46.8,0.96,0.09,3.99,11.5,119.5,114.3
2552,C,23.81,56,12.6,26.3,19.8,13.0,61,0.623,236,...,5.3,23.6,0.9,11.0,0.30,0.36,0.73,9.3,113.3,94.7
2553,F-C,29.51,27,13.1,27.4,16.0,14.6,49,0.776,86,...,4.6,19.3,0.8,9.4,0.30,0.22,0.70,8.2,128.3,101.8


In [5]:
#plt.hist(stats_df["AGE"])
print(stats_df.groupby('POS')['POS'].count()) #print(list(set(stats_df['POS'])))

POS
C      172
C-F     80
F      577
F-C    139
F-G     81
G      786
G-F    197
Name: POS, dtype: int64


In [6]:
# s2 is just G, F, and C; s3 is everything else
s2 = stats_df[(stats_df["POS"] =="G") | (stats_df["POS"] =="F") | (stats_df["POS"] =="C")]
s3 = stats_df[~((stats_df["POS"] =="G") | (stats_df["POS"] =="F") | (stats_df["POS"] =="C"))]

In [7]:
#stats_df.describe()
#stats_df.info()

In [8]:
#divide the data into the input 'X' and the labels 'y'
features = list(s2)[1:]
X        = s2[features] #the observations
y        = s2['POS'] #the label

In [32]:
def get_split(X, y):
    """Get the scaled train-test split for the data"""
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=6, stratify = y)
    
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)    
    
    return X_train_std, X_test_std, y_train, y_test

X_train_std, X_test_std, y_train, y_test = get_split(X, y)

### Support Vector Machine

In [9]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [31]:
def SupportVector(X, y, target_names):
    X_train_std, X_test_std, y_train, y_test = get_split(X, y)

    # initialize the classifier
    svm = SVC() #the default kernel is rbf
    svm.fit(X_train_std, y_train) #fit the data
    
    # cross validation
    scores = cross_val_score(svm, X_train_std, y_train, cv=5)
    print(f"Mean Validation accuracy: {scores.mean()}")
    #print(f"Validation accuracy std : {scores.std()}")
    
    # evaluate using test data
    y_pred = svm.predict(X_test_std)
    print(f"Test data model accuracy: {svm.score(X_test_std, y_test)}")
    print("\n", classification_report(y_test, y_pred))

SupportVector(s2[features], s2["POS"], target_names = ['C', 'F', 'G'])

Mean Validation accuracy: 0.823086285590089
Test data model accuracy: 0.8503253796095445

               precision    recall  f1-score   support

           C       0.91      0.60      0.72        52
           F       0.81      0.80      0.81       173
           G       0.87      0.94      0.90       236

    accuracy                           0.85       461
   macro avg       0.86      0.78      0.81       461
weighted avg       0.85      0.85      0.85       461



### Random Forest

In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [38]:
#X_train_std, X_test_std, y_train, y_test = get_split(X, y)
#rf = RandomForestRegressor(n_estimators=10, random_state=7)
#rf.fit(X_train_std, y_train)

#predict the labels for the test set
#y_pred   = rf.predict(X_test_std)
# print('The prediction is: {}'.format(y_pred))

# Evaluate the Predictions
#mse = mean_squared_error(y_test, y_pred)
#print('The mse of the model is: {}'.format(mse))

#n = max(rf.feature_importances_)
#param = selected_features[list(rf.feature_importances_).index(n)]
#print(f"The most important parameter was '{param}'.")

#sorted([(n, selected_features[list(rf.feature_importances_).index(n)]) for n in rf.feature_importances_], key = lambda x: x[0])

## Decision Tree

In [39]:
from sklearn import tree

In [40]:
X_train_std, X_test_std, y_train, y_test = get_split(X, y)
dt = tree.DecisionTreeClassifier()
dt = dt.fit(X_train_std, y_train)

In [41]:
r = tree.export_text(dt)
print(r)

|--- feature_16 <= -0.07
|   |--- feature_18 <= -0.22
|   |   |--- feature_16 <= -0.63
|   |   |   |--- feature_9 <= 1.19
|   |   |   |   |--- feature_19 <= 1.86
|   |   |   |   |   |--- feature_21 <= -1.46
|   |   |   |   |   |   |--- feature_18 <= -0.86
|   |   |   |   |   |   |   |--- class: G
|   |   |   |   |   |   |--- feature_18 >  -0.86
|   |   |   |   |   |   |   |--- class: F
|   |   |   |   |   |--- feature_21 >  -1.46
|   |   |   |   |   |   |--- feature_10 <= 3.12
|   |   |   |   |   |   |   |--- feature_11 <= 1.59
|   |   |   |   |   |   |   |   |--- feature_4 <= -0.06
|   |   |   |   |   |   |   |   |   |--- feature_14 <= 0.21
|   |   |   |   |   |   |   |   |   |   |--- feature_20 <= -0.27
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 4
|   |   |   |   |   |   |   |   |   |   |--- feature_20 >  -0.27
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |   |--- feature_14 >  0.21
|   | 

In [45]:
y_pred = dt.predict(X_test_std)
print(classification_report(y_test, y_pred))
#dt.get_params()

              precision    recall  f1-score   support

           C       0.68      0.62      0.65        52
           F       0.70      0.70      0.70       173
           G       0.84      0.86      0.85       236

    accuracy                           0.77       461
   macro avg       0.74      0.72      0.73       461
weighted avg       0.77      0.77      0.77       461



## Bagging

In [46]:
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier

In [47]:
X_train_std, X_test_std, y_train, y_test = get_split(X, y)
bc = BaggingClassifier(base_estimator=KNeighborsClassifier(), n_estimators=10, random_state=0)
bc = bc.fit(X_train_std, y_train)

In [48]:
y_pred = bc.predict(X_test_std)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           C       0.88      0.56      0.68        52
           F       0.76      0.74      0.75       173
           G       0.83      0.91      0.87       236

    accuracy                           0.81       461
   macro avg       0.82      0.74      0.77       461
weighted avg       0.81      0.81      0.80       461



## Adaboost

In [49]:
from sklearn.ensemble import AdaBoostClassifier

In [50]:
X_train_std, X_test_std, y_train, y_test = get_split(X, y)
abc = AdaBoostClassifier(n_estimators=10)
abc.fit(X_train_std, y_train)

AdaBoostClassifier(n_estimators=10)

In [51]:
y_pred = abc.predict(X_test_std)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           C       0.64      0.69      0.67        52
           F       0.75      0.56      0.64       173
           G       0.79      0.93      0.86       236

    accuracy                           0.76       461
   macro avg       0.73      0.73      0.72       461
weighted avg       0.76      0.76      0.75       461



## Gradient Tree Boosting

In [52]:
from sklearn.ensemble import GradientBoostingClassifier

In [53]:
X_train_std, X_test_std, y_train, y_test = get_split(X, y)
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gbc.fit(X_train_std, y_train)

GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=0)

In [54]:
y_pred = gbc.predict(X_test_std)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           C       0.80      0.63      0.71        52
           F       0.76      0.80      0.78       173
           G       0.88      0.89      0.88       236

    accuracy                           0.83       461
   macro avg       0.82      0.77      0.79       461
weighted avg       0.83      0.83      0.83       461

