In [1]:
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from get_data import get_data, filter_data

In [2]:
# Load the full table once, then derive the working frame from a copy so
# that `full_df` itself is never handed to the filtering helper.
full_df = get_data()

# Options forwarded to filter_data; their exact semantics are defined in
# get_data.py (presumably thresholds on games/minutes played — TODO confirm).
filter_options = {
    "five_pos": False,
    "games_played": 5,
    "minutes_played": 12,
}
stats_df = filter_data(full_df.copy(), **filter_options)

# Last expression of the cell: rich display of the filtered frame.
stats_df

Unnamed: 0,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,G,28,14,0,14.6,1.9,4.1,0.466,0.0,0.1,...,0.500,1.0,1.9,2.9,1.5,0.4,0.2,0.5,1.9,4.1
1,F,26,81,81,38.1,7.0,14.6,0.478,0.3,0.7,...,0.841,2.2,6.2,8.4,3.0,1.1,0.5,2.6,3.0,19.9
2,G,25,66,7,20.6,2.9,7.7,0.382,0.3,0.9,...,0.808,0.6,1.2,1.8,1.2,0.5,0.1,1.0,1.9,7.9
3,F,24,80,73,29.0,4.2,9.9,0.424,0.0,0.1,...,0.802,1.7,3.6,5.3,0.7,0.5,1.0,1.6,2.9,9.6
4,G,27,76,75,37.9,7.9,17.9,0.439,2.6,7.0,...,0.916,1.2,3.8,5.0,4.4,1.4,0.2,2.6,2.9,22.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8455,G,24,27,1,12.4,1.7,3.6,0.469,0.3,1.0,...,0.684,0.3,1.4,1.7,2.1,0.3,0.2,0.7,0.9,4.2
8456,F,34,54,9,14.7,2.0,3.7,0.545,0.1,0.6,...,0.692,1.3,1.8,3.1,1.4,1.0,0.1,0.8,1.6,4.4
8457,G,24,73,73,34.8,8.2,19.0,0.429,2.1,6.3,...,0.886,0.8,2.2,3.0,10.2,1.1,0.1,4.1,1.4,26.2
8458,C,30,15,2,14.5,2.5,3.9,0.627,0.0,0.1,...,0.686,1.7,2.6,4.3,0.7,0.2,0.3,0.9,2.2,6.5


In [3]:
# Split the filtered stats into features/target and hold out 20% for testing.
# The target is picked by name and every other column becomes a feature, so
# the split no longer depends on 'Pos' happening to be the first column
# (a positional slice would silently put the label into X, and drop a real
# feature, if the column order ever changed).
target_col = 'Pos'
features = [col for col in stats_df.columns if col != target_col]
X, y = stats_df[features], stats_df[target_col]

# stratify=y keeps the class proportions equal in both splits;
# random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Class balance of the training split.
print(y_train.value_counts())

# Optional resampling experiments, currently disabled.
# NOTE(review): the RandomUnderSampler strategy below keys on "PG", which is
# not one of the labels printed above for this run — adjust before enabling.
# NOTE(review): resampling X_train here and then cross-validating on it lets
# resampled rows reach the validation folds; putting the sampler inside an
# imblearn Pipeline would avoid that — confirm before re-enabling.
#ros = SMOTE(sampling_strategy={"C": 600}, random_state=0)
#rus = RandomUnderSampler(sampling_strategy={"PG": 1200}, random_state=0)
#rus = TomekLinks()
#X_train, y_train = ros.fit_resample(X_train, y_train)
#X_train, y_train = rus.fit_resample(X_train, y_train)
#print(y_train.value_counts())

G    3116
F    2686
C     966
Name: Pos, dtype: int64


In [4]:
# Pipeline stages: scale features, keep the k best-scoring ones, then
# classify with k-nearest neighbours.
sc = MinMaxScaler()
fs = SelectKBest()
knn = KNeighborsClassifier()

steps = [
    ('scaler', sc),
    ("feature_selection", fs),
    ("knn", knn),
]
pipe = Pipeline(steps)

# Hyper-parameters to tune, addressed as <step name>__<parameter>.
candidate_values = [5, 9, 13, 17, 21]
param_grid = {
    'knn__n_neighbors': list(candidate_values),
    'feature_selection__k': list(candidate_values),
}

# 5-fold cross-validated search over every combination in the grid,
# fitted on the training split only.
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)

# Best pipeline found by the search, together with its cross-validation score.
model = search.best_estimator_
print("Validation accuracy     :", search.best_score_)

# Evaluate on the held-out test split.
y_pred = model.predict(X_test)
print(f"Test data model accuracy: {model.score(X_test, y_test)}")

# Optional per-class diagnostics (disabled):
#print("\n", classification_report(y_test, y_pred))
#confusion_matrix(y_test, y_pred).T

{'feature_selection__k': 17, 'knn__n_neighbors': 9}
Validation accuracy     : 0.7928491966536424
Test data model accuracy: 0.8091016548463357


In [5]:
"""
{'feature_selection__k': 17, 'knn__n_neighbors': 9}
Validation accuracy     : 0.7928491966536424
Test data model accuracy: 0.8091016548463357

               precision    recall  f1-score   support

           C       0.74      0.67      0.70       241
           F       0.75      0.78      0.77       672
           G       0.88      0.87      0.88       779

    accuracy                           0.81      1692
   macro avg       0.79      0.78      0.78      1692
weighted avg       0.81      0.81      0.81      1692

array([[162,  54,   4],
       [ 79, 526,  94],
       [  0,  92, 681]])
"""

"\n{'feature_selection__k': 17, 'knn__n_neighbors': 9}\nValidation accuracy     : 0.7928491966536424\nTest data model accuracy: 0.8091016548463357\n\n               precision    recall  f1-score   support\n\n           C       0.74      0.67      0.70       241\n           F       0.75      0.78      0.77       672\n           G       0.88      0.87      0.88       779\n\n    accuracy                           0.81      1692\n   macro avg       0.79      0.78      0.78      1692\nweighted avg       0.81      0.81      0.81      1692\n\narray([[162,  54,   4],\n       [ 79, 526,  94],\n       [  0,  92, 681]])\n"

In [6]:
"""
{'feature_selection__k': 17, 'knn__n_neighbors': 9}
Validation accuracy     : 0.6300271512182021
Test data model accuracy: 0.6459810874704491

               precision    recall  f1-score   support

           C       0.73      0.72      0.72       241
          PF       0.57      0.57      0.57       329
          PG       0.80      0.85      0.82       382
          SF       0.52      0.52      0.52       343
          SG       0.62      0.58      0.60       397

    accuracy                           0.65      1692
   macro avg       0.65      0.65      0.65      1692
weighted avg       0.64      0.65      0.64      1692

array([[173,  53,   2,   8,   1],
       [ 58, 189,   1,  66,  16],
       [  0,   1, 323,  11,  71],
       [ 10,  71,   7, 179,  80],
       [  0,  15,  49,  79, 229]])
"""



"\n{'feature_selection__k': 17, 'knn__n_neighbors': 9}\nValidation accuracy     : 0.6300271512182021\nTest data model accuracy: 0.6459810874704491\n\n               precision    recall  f1-score   support\n\n           C       0.73      0.72      0.72       241\n          PF       0.57      0.57      0.57       329\n          PG       0.80      0.85      0.82       382\n          SF       0.52      0.52      0.52       343\n          SG       0.62      0.58      0.60       397\n\n    accuracy                           0.65      1692\n   macro avg       0.65      0.65      0.65      1692\nweighted avg       0.64      0.65      0.64      1692\n\narray([[173,  53,   2,   8,   1],\n       [ 58, 189,   1,  66,  16],\n       [  0,   1, 323,  11,  71],\n       [ 10,  71,   7, 179,  80],\n       [  0,  15,  49,  79, 229]])\n"