## NBA Position Predictor Project

In [1]:
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from get_stats import clean_data

In [2]:
stats_df = clean_data() # preprocessed, clean data
stats_df

Unnamed: 0,POS,AGE,GP,MPG,USG%,FTA,FT%,2PA,2P%,3PA,...,TS%,PPG,RPG,APG,SPG,BPG,TPG,VI,ORTG,DRTG
0,G,25.69,31,19.0,12.2,13,0.923,30,0.500,127,...,0.507,5.3,1.5,0.6,0.55,0.19,0.45,3.5,103.1,103.9
1,F,28.51,10,12.3,9.2,10,0.700,3,0.667,15,...,0.379,1.7,2.5,0.8,0.10,0.40,0.40,4.9,87.1,98.5
2,G,22.93,34,12.6,13.5,9,0.778,36,0.361,74,...,0.474,3.2,1.8,1.9,0.38,0.15,0.82,7.0,99.5,108.1
3,C,25.73,80,33.4,16.4,292,0.500,807,0.596,2,...,0.591,13.9,9.5,1.6,1.49,0.96,1.73,7.1,119.9,102.7
5,F,22.19,19,10.2,9.9,4,1.000,13,0.385,23,...,0.424,1.7,1.0,0.3,0.05,0.21,0.32,3.1,85.3,115.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2957,F,20.30,19,11.6,11.0,4,0.750,18,0.500,35,...,0.411,2.4,0.9,0.6,0.40,0.50,0.10,3.8,97.3,113.2
2958,G,25.90,25,10.3,10.9,9,0.667,41,0.488,16,...,0.500,2.4,0.9,1.2,0.40,0.10,0.20,5.3,117.6,114.5
2963,G,24.80,23,12.0,11.6,3,0.667,58,0.431,2,...,0.424,2.3,1.6,1.7,0.30,0.10,0.70,6.1,93.2,114.2
2970,G,25.30,5,13.4,10.0,0,0.000,8,0.500,6,...,0.393,2.2,1.8,1.0,0.80,0.60,0.20,4.7,94.4,98.0


In [3]:
features = list(stats_df)[1:]
X, y = stats_df[features], stats_df['POS']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0, stratify = y)

In [4]:
from sklearn.svm import SVC

sc = StandardScaler()
fs = SelectKBest()
svm = SVC()

pipe = Pipeline([('scaler', sc), 
                 ("feature_selection", fs),
                  ("svc", svm)])

param_grid = {'svc__C': [1],
          'feature_selection__k': [4, 6, 8, 10, 12, 20]}

search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)

model = search.best_estimator_
print("Validation accuracy     :", search.best_score_)

y_pred = model.predict(X_test)
print(f"Test data model accuracy: {model.score(X_test, y_test)}")
print("\n", classification_report(y_test, y_pred))

{'feature_selection__k': 20, 'svc__C': 1}
Validation accuracy     : 0.8152523897367097
Test data model accuracy: 0.8222996515679443

               precision    recall  f1-score   support

           C       0.89      0.66      0.76        62
           F       0.75      0.80      0.77       217
           G       0.87      0.87      0.87       295

    accuracy                           0.82       574
   macro avg       0.84      0.78      0.80       574
weighted avg       0.83      0.82      0.82       574



In [5]:
from sklearn import tree

sc = StandardScaler()
fs = SelectKBest()
dt = tree.DecisionTreeClassifier()

pipe = Pipeline([('scaler', sc), 
                 ("feature_selection", fs),
                  ("dt", dt)])

param_grid = {'dt__max_depth': [1, 3, 5, 7, 11, 20],
          'feature_selection__k': [4, 6, 8, 10, 12, 20]}

search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)

model = search.best_estimator_
print("Validation accuracy     :", search.best_score_)

y_pred = model.predict(X_test)
print(f"Test data model accuracy: {model.score(X_test, y_test)}")
print("\n", classification_report(y_test, y_pred))

{'dt__max_depth': 7, 'feature_selection__k': 6}
Validation accuracy     : 0.7785678349823913
Test data model accuracy: 0.7578397212543554

               precision    recall  f1-score   support

           C       0.70      0.50      0.58        62
           F       0.66      0.77      0.71       217
           G       0.86      0.80      0.83       295

    accuracy                           0.76       574
   macro avg       0.74      0.69      0.71       574
weighted avg       0.77      0.76      0.76       574



In [6]:
from sklearn.ensemble import RandomForestClassifier

sc = StandardScaler()
fs = SelectKBest()
rf = RandomForestClassifier()

pipe = Pipeline([('scaler', sc), 
                 ("feature_selection", fs),
                  ("rf", rf)])

param_grid = {'feature_selection__k': [4, 6, 8, 10, 12, 20]}

search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)

model = search.best_estimator_
print("Validation accuracy     :", search.best_score_)

y_pred = model.predict(X_test)
print(f"Test data model accuracy: {model.score(X_test, y_test)}")
print("\n", classification_report(y_test, y_pred))

{'feature_selection__k': 12}
Validation accuracy     : 0.8107468276594556
Test data model accuracy: 0.8066202090592335

               precision    recall  f1-score   support

           C       0.86      0.60      0.70        62
           F       0.73      0.77      0.75       217
           G       0.86      0.87      0.87       295

    accuracy                           0.81       574
   macro avg       0.82      0.75      0.77       574
weighted avg       0.81      0.81      0.81       574



## Creating a Model

In [3]:
[(i, list(set(stats_df[stats_df['Year'] == i]['POS']))) for i in set(stats_df["Year"])]

[(2019, ['F-C', 'C-F', 'F-G', 'G-F', 'C', 'F', 'G']),
 (2020, [0, 'F-C', 'C-F', 'F-G', 'G-F', 'C', 'F', 'G']),
 (2021, ['F-C', 'C-F', 'F-G', 'G-F', 'C', 'F', 'G']),
 (2022, ['F-C', 'C-F', 'F-G', 'G-F', 'C', 'F', 'G']),
 (2023, ['F-C', 'C-F', 'F-G', 'G-F', 'C', 'F', 'G'])]

In [4]:
stats_df = stats_df[stats_df['POS'] != 0].copy()  # Nicolo Melli (F) - 2019
stats_df = stats_df[stats_df["MPG"] >= 10 ].copy()  # players that played at least 10 minutes
stats_df = stats_df.drop(["RANK", "NAME", "TEAM", "Year"], axis=1)
stats_df

Unnamed: 0,POS,AGE,GP,MPG,USG%,FTA,FT%,2PA,2P%,3PA,...,TS%,PPG,RPG,APG,SPG,BPG,TPG,VI,ORTG,DRTG
0,G,25.69,31,19.0,12.2,13,0.923,30,0.500,127,...,0.507,5.3,1.5,0.6,0.55,0.19,0.45,3.5,103.1,103.9
1,F,28.51,10,12.3,9.2,10,0.700,3,0.667,15,...,0.379,1.7,2.5,0.8,0.10,0.40,0.40,4.9,87.1,98.5
2,G,22.93,34,12.6,13.5,9,0.778,36,0.361,74,...,0.474,3.2,1.8,1.9,0.38,0.15,0.82,7.0,99.5,108.1
3,C,25.73,80,33.4,16.4,292,0.500,807,0.596,2,...,0.591,13.9,9.5,1.6,1.49,0.96,1.73,7.1,119.9,102.7
4,C-F,21.73,82,23.3,15.8,226,0.735,471,0.588,15,...,0.623,8.9,7.3,2.2,0.88,0.79,1.48,9.0,120.0,97.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2964,C-F,29.00,14,10.8,11.9,10,0.700,28,0.393,2,...,0.465,2.3,2.6,0.5,0.90,0.60,0.60,5.3,92.6,99.1
2970,G,25.30,5,13.4,10.0,0,0.000,8,0.500,6,...,0.393,2.2,1.8,1.0,0.80,0.60,0.20,4.7,94.4,98.0
2972,G-F,39.20,8,14.1,9.4,3,0.667,6,1.000,9,...,0.521,2.1,2.1,2.4,0.50,0.40,1.10,6.2,102.3,108.3
2987,F,21.50,6,10.2,14.8,2,1.000,8,0.250,10,...,0.238,1.5,2.0,0.7,0.20,0.00,0.30,5.0,64.8,112.2


In [5]:
#plt.hist(stats_df["AGE"])
print(stats_df.groupby('POS')['POS'].count()) #print(list(set(stats_df['POS'])))

POS
C      208
C-F    101
F      722
F-C    178
F-G    101
G      981
G-F    238
Name: POS, dtype: int64


In [6]:
stats_df

Unnamed: 0,POS,AGE,GP,MPG,USG%,FTA,FT%,2PA,2P%,3PA,...,TS%,PPG,RPG,APG,SPG,BPG,TPG,VI,ORTG,DRTG
0,G,25.69,31,19.0,12.2,13,0.923,30,0.500,127,...,0.507,5.3,1.5,0.6,0.55,0.19,0.45,3.5,103.1,103.9
1,F,28.51,10,12.3,9.2,10,0.700,3,0.667,15,...,0.379,1.7,2.5,0.8,0.10,0.40,0.40,4.9,87.1,98.5
2,G,22.93,34,12.6,13.5,9,0.778,36,0.361,74,...,0.474,3.2,1.8,1.9,0.38,0.15,0.82,7.0,99.5,108.1
3,C,25.73,80,33.4,16.4,292,0.500,807,0.596,2,...,0.591,13.9,9.5,1.6,1.49,0.96,1.73,7.1,119.9,102.7
4,C-F,21.73,82,23.3,15.8,226,0.735,471,0.588,15,...,0.623,8.9,7.3,2.2,0.88,0.79,1.48,9.0,120.0,97.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2964,C-F,29.00,14,10.8,11.9,10,0.700,28,0.393,2,...,0.465,2.3,2.6,0.5,0.90,0.60,0.60,5.3,92.6,99.1
2970,G,25.30,5,13.4,10.0,0,0.000,8,0.500,6,...,0.393,2.2,1.8,1.0,0.80,0.60,0.20,4.7,94.4,98.0
2972,G-F,39.20,8,14.1,9.4,3,0.667,6,1.000,9,...,0.521,2.1,2.1,2.4,0.50,0.40,1.10,6.2,102.3,108.3
2987,F,21.50,6,10.2,14.8,2,1.000,8,0.250,10,...,0.238,1.5,2.0,0.7,0.20,0.00,0.30,5.0,64.8,112.2


## Bagging

In [27]:
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier

In [28]:
X_train_std, X_test_std, y_train, y_test = get_split(X, y)
bc = BaggingClassifier(base_estimator=KNeighborsClassifier(), n_estimators=10, random_state=0)
bc = bc.fit(X_train_std, y_train)

In [29]:
y_pred = bc.predict(X_test_std)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.56      0.64        62
           1       0.73      0.72      0.73       217
           2       0.84      0.88      0.86       295

    accuracy                           0.79       574
   macro avg       0.77      0.72      0.74       574
weighted avg       0.79      0.79      0.79       574



## Adaboost

In [30]:
from sklearn.ensemble import AdaBoostClassifier

In [31]:
X_train_std, X_test_std, y_train, y_test = get_split(X, y)
abc = AdaBoostClassifier(n_estimators=10)
abc.fit(X_train_std, y_train)

AdaBoostClassifier(n_estimators=10)

In [32]:
y_pred = abc.predict(X_test_std)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.71      0.61        62
           1       0.63      0.54      0.58       217
           2       0.78      0.81      0.79       295

    accuracy                           0.70       574
   macro avg       0.65      0.69      0.66       574
weighted avg       0.70      0.70      0.69       574



## Gradient Tree Boosting

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

In [34]:
X_train_std, X_test_std, y_train, y_test = get_split(X, y)
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gbc.fit(X_train_std, y_train)

GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=0)

In [35]:
y_pred = gbc.predict(X_test_std)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.58      0.62        62
           1       0.72      0.75      0.73       217
           2       0.87      0.87      0.87       295

    accuracy                           0.79       574
   macro avg       0.75      0.73      0.74       574
weighted avg       0.79      0.79      0.79       574

