## NBA Position Predictor Project

### Created 16 November 2021

## Creating a Model

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from get_stats import nbastuffer_dataframe

In [2]:
data = nbastuffer_dataframe(playoffs=False)

In [3]:
stats_df = data
[(i, list(set(stats_df[stats_df['Year'] == i]['POS']))) for i in range(2018, 2022)]  # test

[(2018, ['G', 'C', 'F', 'C-F', 'F-G', 'G-F', 'F-C']),
 (2019, ['G', 'C', 'F', 'C-F', 'F-G', 'G-F', '0', 'F-C']),
 (2020, ['G', 'C', 'F', 'F-G', 'C-F', 'G-F', 'F-C']),
 (2021, ['G', 'C', 'F', 'C-F', 'F-G', 'G-F', 'F-C'])]

In [4]:
stats_df = stats_df[stats_df['POS'] != "0"].copy()  # Nicolo Melli (F) - 2019
stats_df = stats_df[stats_df["MPG"] >= 10 ].copy()  # players that played at least 10 minutes
stats_df = stats_df.drop(["RANK", "NAME", "TEAM", "Year"], axis=1)
stats_df

Unnamed: 0,POS,AGE,GP,MPG,MIN%,USG%,TO%,FTA,FT%,2PA,...,RPG,TRB%,APG,AST%,SPG,BPG,TOPG,VI,ORTG,DRTG
0,G,25.69,31,19.0,39.5,12.2,7.9,13,0.923,30,...,1.5,4.2,0.6,4.3,0.55,0.19,0.45,3.5,103.1,103.9
1,F,28.51,10,12.3,25.6,9.2,15.2,10,0.700,3,...,2.5,11.3,0.8,8.2,0.10,0.40,0.40,4.9,87.1,98.5
2,G,22.93,34,12.6,26.2,13.5,19.7,9,0.778,36,...,1.8,7.5,1.9,20.2,0.38,0.15,0.82,7.0,99.5,108.1
3,C,25.73,80,33.4,69.5,16.4,12.8,292,0.500,807,...,9.5,14.7,1.6,6.6,1.49,0.96,1.73,7.1,119.9,102.7
4,C-F,21.73,82,23.3,48.6,15.8,17.1,226,0.735,471,...,7.3,16.6,2.2,14.2,0.88,0.79,1.48,9.0,120.0,97.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2550,F,33.80,26,18.3,38.1,16.0,12.4,27,0.481,101,...,4.4,13.0,1.7,13.6,1.19,0.42,0.85,7.9,112.8,102.0
2551,G,23.56,76,34.9,72.7,34.4,14.5,553,0.904,933,...,3.7,5.9,9.7,46.8,0.96,0.09,3.99,11.5,119.5,114.3
2552,C,23.81,56,12.6,26.3,19.8,13.0,61,0.623,236,...,5.3,23.6,0.9,11.0,0.30,0.36,0.73,9.3,113.3,94.7
2553,F-C,29.51,27,13.1,27.4,16.0,14.6,49,0.776,86,...,4.6,19.3,0.8,9.4,0.30,0.22,0.70,8.2,128.3,101.8


In [5]:
#plt.hist(stats_df["AGE"])
print(stats_df.groupby('POS')['POS'].count()) #print(list(set(stats_df['POS'])))

POS
C      172
C-F     80
F      577
F-C    139
F-G     81
G      786
G-F    197
Name: POS, dtype: int64


In [6]:
# s2 is just G, F, and C; s3 is everything else
s2 = stats_df[(stats_df["POS"] =="G") | (stats_df["POS"] =="F") | (stats_df["POS"] =="C")]
s3 = stats_df[~((stats_df["POS"] =="G") | (stats_df["POS"] =="F") | (stats_df["POS"] =="C"))]

In [7]:
#stats_df.describe()
#stats_df.info()

### k-Nearest Neighbor Classification

In [8]:
#divide the data into the input 'X' and the labels 'y'
features = list(stats_df)[1:]
X        = stats_df[features] #the observations
y        = stats_df['POS'] #the label

In [9]:
def knn_model(X, y):
    """Create a knn model"""
    # split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=6, stratify = y)
    # Enter appropriate values to partition data, and ensure results are reproducible and obtain stratified samples

    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)  
    knn = KNeighborsClassifier()
    
    # use grid search to find optimal value for k
    params = {"n_neighbors": list(range(1,50))}  # for grid search
    grid = GridSearchCV(knn, params)
    grid.fit(X_train_std, y_train)
    best_k = grid.best_params_["n_neighbors"]
    print(f"Optimal k value: {best_k}")
    
    # feature selection (select important featues)
    #knn = KNeighborsClassifier(n_neighbors=best_k)
    #sfs = SequentialFeatureSelector(knn, n_features_to_select=num_features)
    #sfs.fit(X_train_std, y_train)
    #selected_features = list(sfs.get_feature_names_out())
    #print(selected_features)
    #X_train_std, X_test_std = sfs.transform(X_train_std), sfs.transform(X_test_std)

    # build model with optimal value for k
    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_train_std, y_train)
    
    # cross validation
    scores = cross_val_score(knn, X_train_std, y_train, cv=5)
    print(f"Mean Validation accuracy: {scores.mean()}")
    #print(f"Validation accuracy std : {scores.std()}")
    
    # evaluate using test data
    y_pred = knn.predict(X_test_std)
    print(f"Test data accuracy      : {knn.score(X_test_std, y_test)}")
    print("\n", classification_report(y_test, y_pred))
    return

knn_model(s2[features], s2["POS"])

Optimal k value: 16
Mean Validation accuracy: 0.7979526189958704
Test data accuracy      : 0.7939262472885033

               precision    recall  f1-score   support

           C       0.88      0.54      0.67        52
           F       0.73      0.72      0.72       173
           G       0.83      0.90      0.86       236

    accuracy                           0.79       461
   macro avg       0.81      0.72      0.75       461
weighted avg       0.80      0.79      0.79       461

