In [39]:
%matplotlib inline
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20 and
# looks unused in this notebook - consider dropping it once confirmed.
from sklearn import metrics, cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict

In [2]:
# Move into the notebook's folder so the relative data paths used below resolve.
# The location is specific to one machine, so only switch when it exists;
# elsewhere the notebook is assumed to be started from its own directory
# (an unconditional chdir would raise FileNotFoundError there).
notebook_dir = "C:/Users/Cynthia/Desktop/Bootcamp_Github/Basketball-Analysis-Repo/ml_logistic"
if os.path.isdir(notebook_dir):
    os.chdir(notebook_dir)

In [3]:
# Raw input table; the path is resolved against the current working directory
orig_data = pd.read_csv(filepath_or_buffer = "../data/clean/draft_nba.csv")

In [4]:
# All features: the draft_* predictor columns, followed by the "Roster"
# column that the later cells correlate against and predict
draft_predictors = [
    "draft_All_NBA", "draft_All.Star", "draft_Pk",
    "draft_Games", "draft_Minutes.Played", "draft_Minutes.per.Game",
    "draft_Points.per.Game", "draft_Win.Share", "draft_VORP",
    "draft_WS_per_game", "draft_attend_college",
]
all_feat = draft_predictors + ["Roster"]

## Feature selection

In [5]:
# Keep every row, but only the columns listed in all_feat
data = orig_data.loc[:, all_feat]

In [18]:
# Stratified 10-fold splitter. With the sklearn.model_selection API the
# constructor takes n_splits, and the labels are handed to .split() instead
# of to the constructor.
skf = StratifiedKFold(n_splits = 10)
fold_count = 1

# One Series of correlations per fold; assembled into `corrs` after the loop
# (growing a DataFrame with .append() per iteration is quadratic, and
# DataFrame.append no longer exists in pandas >= 2.0).
fold_rows = []

# Correlations with "Roster"

for train, test in skf.split(data, data["Roster"]):

    # Log
    print(f"\n>>> Running fold {fold_count} <<<\n")

    # Training & testing dfs
    train_fold = data.iloc[train]
    test_fold = data.iloc[test]

    # Best features: correlation of every column with "Roster" on the training
    # rows only. The matrix is computed once; the `< 1` filter removes
    # Roster's correlation with itself.
    roster_corr = train_fold.corr()["Roster"]
    corr = roster_corr[roster_corr < 1]
    fold_rows.append(corr.rename(f"Fold {fold_count}"))
    print(corr)

    # Increment fold
    fold_count += 1

# Rows are labelled "Fold 1" ... "Fold 10", columns are the feature names
corrs = pd.DataFrame(fold_rows)


>>> Processing fold 1 <<<

draft_All_NBA             0.081221
draft_All.Star            0.092333
draft_Pk                 -0.476737
draft_Games               0.425231
draft_Minutes.Played      0.359540
draft_Minutes.per.Game    0.535775
draft_Points.per.Game     0.451963
draft_Win.Share           0.258547
draft_VORP                0.154236
draft_WS_per_game         0.214060
draft_attend_college      0.394295
Name: Roster, dtype: float64

>>> Processing fold 2 <<<

draft_All_NBA             0.075172
draft_All.Star            0.087764
draft_Pk                 -0.472424
draft_Games               0.423274
draft_Minutes.Played      0.356813
draft_Minutes.per.Game    0.536525
draft_Points.per.Game     0.453936
draft_Win.Share           0.262194
draft_VORP                0.154821
draft_WS_per_game         0.222859
draft_attend_college      0.392000
Name: Roster, dtype: float64

>>> Processing fold 3 <<<

draft_All_NBA             0.079252
draft_All.Star            0.090852
draft_Pk          

In [7]:
# Corrs for each fold: one row per fold ("Fold 1", "Fold 2", ...), one column
# per feature; each value is that feature's correlation with "Roster"
# computed on the fold's training rows
corrs

Unnamed: 0,draft_All.Star,draft_All_NBA,draft_Games,draft_Minutes.Played,draft_Minutes.per.Game,draft_Pk,draft_Points.per.Game,draft_VORP,draft_WS_per_game,draft_Win.Share,draft_attend_college
Fold 1,0.092333,0.081221,0.425231,0.35954,0.535775,-0.476737,0.451963,0.154236,0.21406,0.258547,0.394295
Fold 2,0.087764,0.075172,0.423274,0.356813,0.536525,-0.472424,0.453936,0.154821,0.222859,0.262194,0.392
Fold 3,0.090852,0.079252,0.412525,0.343076,0.521336,-0.473138,0.437825,0.154413,0.206054,0.255142,0.394334
Fold 4,0.09302,0.080739,0.411227,0.346426,0.523727,-0.475965,0.439891,0.155018,0.208127,0.257448,0.376509
Fold 5,0.090485,0.083803,0.410928,0.349561,0.531973,-0.480477,0.447342,0.158346,0.208149,0.260038,0.355815
Fold 6,0.090935,0.078666,0.412884,0.348459,0.540216,-0.481536,0.452624,0.154057,0.21284,0.256029,0.364547
Fold 7,0.104137,0.091895,0.406784,0.345066,0.530192,-0.484338,0.446856,0.155775,0.216417,0.256753,0.383185
Fold 8,0.093933,0.081421,0.422848,0.356401,0.537646,-0.491569,0.452797,0.155129,0.211037,0.261862,0.345254
Fold 9,0.094896,0.082639,0.424135,0.35976,0.538593,-0.475576,0.449768,0.160469,0.230046,0.264547,0.364104
Fold 10,0.098792,0.086542,0.435905,0.3668,0.526627,-0.482906,0.444397,0.164172,0.229397,0.272318,0.371106


In [8]:
# Average correlations: per-feature mean over the folds, next to its magnitude
mean_corr = corrs.mean()
avg_corrs = pd.DataFrame({
    "Avg corr": mean_corr,
    "Avg corr (Abs)": mean_corr.abs(),
})
# Shown strongest-first; the sorted copy is only displayed, not stored
avg_corrs.sort_values(by = "Avg corr (Abs)", ascending = False)

Unnamed: 0,Avg corr,Avg corr (Abs)
draft_Minutes.per.Game,0.532261,0.532261
draft_Pk,-0.479467,0.479467
draft_Points.per.Game,0.44774,0.44774
draft_Games,0.418574,0.418574
draft_attend_college,0.374115,0.374115
draft_Minutes.Played,0.35319,0.35319
draft_Win.Share,0.260488,0.260488
draft_WS_per_game,0.215899,0.215899
draft_VORP,0.156643,0.156643
draft_All.Star,0.093715,0.093715


In [9]:
# Top features - average absolute corr w Roster above .3
is_strong = avg_corrs["Avg corr (Abs)"] > .3
sel_feat = avg_corrs.loc[is_strong].index.tolist()
sel_feat

['draft_Games',
 'draft_Minutes.Played',
 'draft_Minutes.per.Game',
 'draft_Pk',
 'draft_Points.per.Game',
 'draft_attend_college']

## Cross-Validation

In [None]:
# Design matrix (selected features only) and the "Roster" target
X, y = data[sel_feat], data["Roster"]

In [58]:
# Out-of-fold predictions from 10-fold cross-validation: each row's prediction
# comes from a model fitted without that row's fold.
# cross_val_predict comes from sklearn.model_selection (see the import cell).
predicted = cross_val_predict(LogisticRegression(), X, y, cv = 10)
acc_score = metrics.accuracy_score(y, predicted)

# Display names for the two classes, in sorted label order.
# NOTE(review): assumes Roster is coded 0 = did not play, 1 = played - confirm.
roster_vars = ["Did not play", "Played"]

print(f"Accuracy score: {round(acc_score, 2)}\n")
print("Classification report")
print(metrics.classification_report(y, predicted, target_names = roster_vars))

Accuracy score: 0.87

Classification report
              precision    recall  f1-score   support

Did not play       0.79      0.71      0.75       234
      Played       0.90      0.93      0.92       655

 avg / total       0.87      0.87      0.87       889



## Testing Classifier on New Data

In [None]:
# Fit the final classifier on all rows, using the selected features
clf = LogisticRegression()
clf.fit(X, y)

# Persist the fitted model so it can be reused without refitting.
# The file is written to the current working directory.
filename = "roster_logreg.pkl"
with open(filename, "wb") as model_file:
    pickle.dump(clf, model_file)

# Reload the stored model. Only unpickle files written by this notebook:
# pickle.load executes arbitrary code from the file it reads.
with open(filename, "rb") as model_file:
    clf = pickle.load(model_file)

# TODO: `new_draft_class` is not defined anywhere in this notebook yet. Load
# the new draft class first; it must hold the same columns, in the same order,
# as X (i.e. sel_feat) for the prediction below to work.
new_pred = clf.predict_proba(new_draft_class)