In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, cross_validation
from sklearn.cross_validation import *
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression



In [2]:
# Work from the ml_logistic folder so the relative data/output paths used in
# later cells resolve against it.
# NOTE(review): absolute, machine-specific path - this cell raises on any
# machine where the directory does not exist.
project_dir = "C:/Users/Cynthia/Desktop/Bootcamp_Github/Basketball-Analysis-Repo/ml_logistic"
os.chdir(project_dir)

In [3]:
# Load the cleaned draft data; the path is relative to the working directory
# set in the previous cell.
draft_csv_path = "../data/clean/draft_nba.csv"
orig_data = pd.read_csv(draft_csv_path)

In [4]:
# Candidate predictor columns
draft_predictors = [
    "draft_All_NBA",
    "draft_All.Star",
    "draft_Pk",
    "draft_Games",
    "draft_Minutes.Played",
    "draft_PTS",
    "draft_Win.Share",
    "draft_VORP",
    "draft_WS_per_game",
    "draft_attend_college",
]

# All features: the predictors followed by "Roster", the column later used as
# the prediction target
all_feat = draft_predictors + ["Roster"]

## Feature selection

In [5]:
# Keep only the candidate features plus the Roster column
data = orig_data.loc[:, all_feat]

In [6]:
# Correlations with "Roster", computed on the training part of each fold

# Re-import explicitly: the wildcard import from sklearn.cross_validation in
# the import cell can rebind StratifiedKFold to the legacy class, whose
# constructor takes the labels and n_folds directly.
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 10)

# Fold label -> Series of feature correlations; assembled into a frame once
# the loop is done instead of growing a DataFrame row by row.
fold_corrs = {}

for fold_count, (train_idx, _) in enumerate(skf.split(data, data["Roster"]), start = 1):

    # Log
    print(f"\n>>> Running fold {fold_count} <<<\n")

    # Training rows only; the held-out rows play no part in feature selection
    train_fold = data.iloc[train_idx]

    # Correlation of every column with Roster. The target's own entry is
    # dropped by label rather than by comparing the value to 1.
    corr = train_fold.corr()["Roster"].drop("Roster")
    fold_corrs[f"Fold {fold_count}"] = corr
    print(corr)

# One row per fold, one column per feature. Columns are sorted by name so the
# layout matches the recorded output of the cell that displays `corrs`.
corrs = pd.DataFrame(fold_corrs).T.sort_index(axis = 1)


>>> Running fold 1 <<<

draft_All_NBA           0.081221
draft_All.Star          0.092333
draft_Pk               -0.476737
draft_Games             0.425231
draft_Minutes.Played    0.359540
draft_PTS               0.322347
draft_Win.Share         0.258547
draft_VORP              0.154236
draft_WS_per_game       0.214060
draft_attend_college    0.394295
Name: Roster, dtype: float64

>>> Running fold 2 <<<

draft_All_NBA           0.075172
draft_All.Star          0.087764
draft_Pk               -0.472424
draft_Games             0.423274
draft_Minutes.Played    0.356813
draft_PTS               0.322062
draft_Win.Share         0.262194
draft_VORP              0.154821
draft_WS_per_game       0.222859
draft_attend_college    0.392000
Name: Roster, dtype: float64

>>> Running fold 3 <<<

draft_All_NBA           0.079252
draft_All.Star          0.090852
draft_Pk               -0.473138
draft_Games             0.412525
draft_Minutes.Played    0.343076
draft_PTS               0.313548
draft_Win

In [7]:
# Per-fold correlations with "Roster": one row per fold, one column per feature
corrs

Unnamed: 0,draft_All.Star,draft_All_NBA,draft_Games,draft_Minutes.Played,draft_PTS,draft_Pk,draft_VORP,draft_WS_per_game,draft_Win.Share,draft_attend_college
Fold 1,0.092333,0.081221,0.425231,0.35954,0.322347,-0.476737,0.154236,0.21406,0.258547,0.394295
Fold 2,0.087764,0.075172,0.423274,0.356813,0.322062,-0.472424,0.154821,0.222859,0.262194,0.392
Fold 3,0.090852,0.079252,0.412525,0.343076,0.313548,-0.473138,0.154413,0.206054,0.255142,0.394334
Fold 4,0.09302,0.080739,0.411227,0.346426,0.311299,-0.475965,0.155018,0.208127,0.257448,0.376509
Fold 5,0.090485,0.083803,0.410928,0.349561,0.314959,-0.480477,0.158346,0.208149,0.260038,0.355815
Fold 6,0.090935,0.078666,0.412884,0.348459,0.313178,-0.481536,0.154057,0.21284,0.256029,0.364547
Fold 7,0.104137,0.091895,0.406784,0.345066,0.312609,-0.484338,0.155775,0.216417,0.256753,0.383185
Fold 8,0.093933,0.081421,0.422848,0.356401,0.320499,-0.491569,0.155129,0.211037,0.261862,0.345254
Fold 9,0.094896,0.082639,0.424135,0.35976,0.323905,-0.475576,0.160469,0.230046,0.264547,0.364104
Fold 10,0.098792,0.086542,0.435905,0.3668,0.330893,-0.482906,0.164172,0.229397,0.272318,0.371106


In [8]:
# Average correlations: mean over the folds, one row per feature
avg_corrs = corrs.mean().to_frame(name = "Avg corr")

# Absolute value, so strongly negative features rank alongside positive ones
avg_corrs["Avg corr (Abs)"] = avg_corrs["Avg corr"].abs()

# Displayed sorted by strength; the sorted frame is not assigned, so
# avg_corrs itself keeps its original row order
avg_corrs.sort_values(by = "Avg corr (Abs)", ascending = False)

Unnamed: 0,Avg corr,Avg corr (Abs)
draft_Pk,-0.479467,0.479467
draft_Games,0.418574,0.418574
draft_attend_college,0.374115,0.374115
draft_Minutes.Played,0.35319,0.35319
draft_PTS,0.31853,0.31853
draft_Win.Share,0.260488,0.260488
draft_WS_per_game,0.215899,0.215899
draft_VORP,0.156643,0.156643
draft_All.Star,0.093715,0.093715
draft_All_NBA,0.082135,0.082135


In [9]:
# Top features - average absolute corr w Roster greater than .3
is_strong = avg_corrs["Avg corr (Abs)"] > .3
sel_feat = avg_corrs.loc[is_strong].index.tolist()
sel_feat

['draft_Games',
 'draft_Minutes.Played',
 'draft_PTS',
 'draft_Pk',
 'draft_attend_college']

## Cross-Validation

In [10]:
# Model inputs: only the features that passed the correlation threshold
X = data.loc[:, sel_feat]

# Target: the Roster column
y = data.loc[:, "Roster"]

In [11]:
# Display names for the two Roster classes, in label order
roster_vars = ["Did not play", "Played"]

# 10-fold cross-validated predictions for every row, scored against the truth
predicted = cross_val_predict(LogisticRegression(), X, y, cv = 10)
acc_score = metrics.accuracy_score(y, predicted)

print(f"Accuracy score: {round(acc_score, 2)}\n")
print("Classification report")
cv_report = metrics.classification_report(y, predicted, target_names = roster_vars)
print(cv_report)

Accuracy score: 0.86

Classification report
              precision    recall  f1-score   support

Did not play       0.76      0.69      0.72       234
      Played       0.89      0.92      0.91       655

 avg / total       0.86      0.86      0.86       889



In [12]:
# Fit one classifier on all rows and show its coefficients
# (fit returns the estimator itself, so construction and fitting can be chained)
clf = LogisticRegression().fit(X, y)
clf.coef_

array([[ 1.81319323e-02, -7.66725676e-04,  6.39905653e-04,
        -4.72543116e-02,  1.50353549e+00]])

In [13]:
# Predicted probabilities of all players
# NOTE(review): "Roster (Pred)" comes from the cross-validated `predicted`
# array, while the probability columns come from `clf` fitted on all rows, so
# the two can disagree for a given player - confirm this mix is intended.

# predict_proba returns a bare array; give the frame X's index so the
# column-wise concat below aligns rows by label rather than by position.
pp = pd.DataFrame(clf.predict_proba(X), index = X.index)

pl = orig_data[["draft_Player", "draft_Draft_Yr"]]

pl_sel_feat = orig_data[sel_feat]

# Same reasoning as for pp: `predicted` is a bare array, so index it like y
pred = pd.DataFrame({"Roster (Pred)": predicted}, index = y.index)

pl_rost = orig_data["Roster"]

# Integer column labels 0 and 1 are the probability columns from pp
pl_pp = pd.concat([pl, pl_sel_feat, pl_rost, pred, pp], axis = 1).rename(
    columns = {0: "Did not play (PP)",
               1: "Played (PP)",
               "Roster": "Roster (Actual)"})

# to_csv does not create missing directories, so make sure the target exists
os.makedirs("predictions", exist_ok = True)
pl_pp.to_csv("predictions/draft00_15_preds.csv")

pl_pp.head()

Unnamed: 0,draft_Player,draft_Draft_Yr,draft_Games,draft_Minutes.Played,draft_PTS,draft_Pk,draft_attend_college,Roster (Actual),Roster (Pred),Did not play (PP),Played (PP)
0,Speedy Claxton,2000,334,8548,3096,20,1,0,1,0.075174,0.924826
1,Mark Karcher,2000,0,0,0,48,1,0,0,0.573603,0.426397
2,Stromile Swift,2000,547,10804,4582,2,1,1,1,0.001588,0.998412
3,Jamaal Magloire,2000,680,14621,4917,19,1,1,1,0.004775,0.995225
4,Erick Barkley,2000,27,266,77,28,1,1,1,0.272217,0.727783


In [14]:
# Save classifier
import pickle

# The context manager flushes and closes the file even if dump raises
with open("final_classifier.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

## Testing Classifier on New Data

In [15]:
# Columns needed from the new file: the selected features plus player name and
# target. A fresh list is built rather than extending sel_feat in place, which
# would also add these two columns to sel_feat (and again on every re-run).
var_list = sel_feat + ["draft_Player", "Roster"]

# Read, keep only the needed columns, and drop any row with a missing value
new_data = (pd.read_csv("../data/clean/draft16_nba17.csv")[var_list]
            .dropna(axis = 0, how = "any"))

In [16]:
# Split the new data into target and model inputs (everything except the
# target and the player name)
y_new = new_data["Roster"]
X_new = new_data.drop(["Roster", "draft_Player"], axis = "columns")

In [17]:
# Reload the classifier written by the "Save classifier" cell.
# NOTE: unpickling can execute code stored in the file - only load pickles
# that this notebook (or another trusted source) produced.
with open("final_classifier.pkl", "rb") as model_file:
    classifier = pickle.load(model_file)

In [18]:
# Prediction
new_pred = classifier.predict(X_new)

# Predicted and actual labels side by side; the frame takes its index from y_new
pred_act_cols = {"Roster (Pred)": new_pred,
                 "Roster (Actual)": y_new}
pred_act = pd.DataFrame(pred_act_cols)

In [19]:
# New predicted probabilities
# predict_proba returns a bare array. Without an explicit index the frame gets
# 0..n-1, while new_data keeps the labels that survived dropna, so the
# column-wise concat below would pair probabilities with the wrong players and
# add all-NaN rows. Reusing X_new's index keeps every row aligned.
new_pp = pd.DataFrame(classifier.predict_proba(X_new), index = X_new.index)

new_pl_sel_feat = new_data[var_list].drop(["Roster"], axis = 1)

# Integer column labels 0 and 1 are the probability columns from new_pp
new_pl_pp = pd.concat([new_pl_sel_feat, pred_act, new_pp], axis = 1).rename(
    columns = {0: "Did not play (PP)",
               1: "Played (PP)"})

col_order = ['draft_Player',
 'draft_Games',
 'draft_Minutes.Played',
 'draft_PTS',
 'draft_Pk',
 'draft_attend_college',
 'Roster (Actual)',
 'Roster (Pred)',
 'Did not play (PP)',
 'Played (PP)']
new_pl_pp = new_pl_pp[col_order]

# to_csv does not create missing directories, so make sure the target exists
os.makedirs("predictions", exist_ok = True)
new_pl_pp.to_csv("predictions/draft16_preds.csv")

new_pl_pp.head()

Unnamed: 0,draft_Player,draft_Games,draft_Minutes.Played,draft_PTS,draft_Pk,draft_attend_college,Roster (Actual),Roster (Pred),Did not play (PP),Played (PP)
0,Ben Simmons,81.0,2732.0,1279.0,1.0,1.0,1.0,1.0,0.107473,0.892527
1,Brandon Ingram,138.0,4254.0,1689.0,2.0,1.0,1.0,1.0,0.099886,0.900114
2,Jaylen Brown,148.0,3493.0,1532.0,3.0,1.0,1.0,1.0,0.056489,0.943511
3,Dragan Bender,125.0,2643.0,677.0,4.0,0.0,1.0,1.0,0.278415,0.721585
4,Kris Dunn,130.0,2858.0,992.0,5.0,1.0,1.0,1.0,0.073372,0.926628


In [20]:
# Display names for the two Roster classes, in label order
roster_vars = ["Did not play", "Played"]

# Score the reloaded classifier's predictions on the new data
new_acc_score = metrics.accuracy_score(y_new, new_pred)

print(f"Accuracy score: {round(new_acc_score, 2)}\n")
print("Classification report")
new_report = metrics.classification_report(y_new, new_pred, target_names = roster_vars)
print(new_report)

Accuracy score: 0.69

Classification report
              precision    recall  f1-score   support

Did not play       0.17      0.22      0.19         9
      Played       0.83      0.78      0.80        45

 avg / total       0.72      0.69      0.70        54

