In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn import svm
import itertools
from sklearn.metrics import confusion_matrix

from sklearn.svm import SVR
pd.set_option('display.max_rows', 500)


## Join Data

In [None]:
# Read the three input tables. From the RSCI file only the 'rsci' and 'id'
# columns are kept.
pre_draft = pd.read_csv("pre_draft_data_2002_to_2015.csv")
nba_stats = pd.read_csv("draft_data_2002_to_2015.csv")
rsci = pd.read_csv("rsci_2002_to_2015.csv")[['rsci','id']]
print(nba_stats.columns)

# Keep the NBA columns used further down and give them an 'nba_' prefix;
# 'id' stays untouched because it is the join key.
# (An earlier variant of this cell also carried the 'WS' and 'TRB' columns.)
nba_stats = nba_stats[['Yrs', 'G', 'MP','BPM','id','PTS']].rename(
    columns={'Yrs': 'nba_yrs',
             'G': 'nba_gp',
             'MP': 'nba_mp',
             'BPM': 'nba_bpm',
             'PTS': 'nba_pts'})

# Inner-join the pre-draft rows with the NBA rows, then left-join the RSCI
# value so that players without an RSCI entry are kept.
df = pre_draft.merge(nba_stats, on='id').merge(rsci, how='left', on='id')

# Players with no RSCI match get the placeholder value 150.
df['rsci'] = df['rsci'].fillna(150)

# One row per player name.
df = df.drop_duplicates(subset=['name'])
df.columns

# Classify Players into Bust (0) or not bust (1)

In [None]:
# Classify a player as a bust (0) or not a bust (1).
def classify_role(yrs, gp, minutes, nba_bpm, draft_yr, pts):
    """Return 0 if the player is considered a bust, otherwise 1.

    Rules, in order:
      * ``yrs == 0`` is always a bust.
      * Drafted before 2012: bust when the player has fewer than 5 years and
        averages under 20 minutes per game, where the average is computed as
        ``minutes / (gp + 1)``.
      * Drafted in 2012 or later: bust when the player has fewer than 60
        games per year and averages under 15 minutes per game.

    A player with zero games has no meaningful minutes-per-game figure; the
    average is treated as 0 so the 2012+ rule labels him a bust instead of
    dividing by zero.

    Parameters
    ----------
    yrs : number of NBA years.
    gp : number of NBA games played.
    minutes : total NBA minutes played.
    nba_bpm : accepted for call compatibility; not used by the rules.
    draft_yr : draft year.
    pts : accepted for call compatibility; not used by the rules.

    Returns
    -------
    int : 0 (bust) or 1 (not a bust).
    """
    if yrs == 0:
        return 0

    if draft_yr < 2012:
        is_bust = yrs < 5 and minutes / (gp * 1.0 + 1) < 20
    elif draft_yr >= 2012:
        # Guard the division: gp can be 0 even when yrs is not.
        minutes_per_game = minutes / (gp * 1.0) if gp else 0.0
        is_bust = gp < yrs * 60 and minutes_per_game < 15
    else:
        # draft_yr compares false against both bounds (e.g. NaN): no rule applies.
        is_bust = False

    return 0 if is_bust else 1


# Label every row with classify_role (0 = bust, 1 = not a bust). The columns
# are passed in the order of the function's parameters.
role_inputs = ['nba_yrs', 'nba_gp', 'nba_mp', 'nba_bpm', 'Draft_Yr', 'nba_pts']
df['class'] = df.apply(
    lambda row: classify_role(*[row[col] for col in role_inputs]),
    axis=1)

# Share of players labelled "not a bust": overall, among picks 1-30 ("first")
# and among picks above 30 ("second").
a = df['class']
b = df['Pk'] > 30
c = df['Pk'] <31
dat = pd.DataFrame(list(zip(a, c, b)), columns=['class', 'first', 'second'])

stuck = dat['class'] != 0
in_first = dat['first'] == 1
in_second = dat['second'] == 1

print("%s %s" % ("Percentage of all draftees that stick in the NBA:",
                 len(dat[stuck]) / (1.0 * len(a))))
print("%s %s" % ("first rounder success %",
                 len(dat[in_first & stuck]) / (1.0 * len(dat[in_first]))))
print("%s %s" % ("second rounder success %",
                 len(dat[in_second & stuck]) / (1.0 * len(dat[in_second]))))


# Data - pull out relevant columns for prediction

In [None]:
# Predictor columns, listed in the order they are fed to the models:
# first the body/athletic measurements, then the box-score statistics.
measurement_cols = ['height', 'weight', 'wing_span', 'standing_reach',
                    'body_fat', 'no_step_vert', 'max_vert']
stat_cols = ['gp', 'min', 'pts',
             'fg', 'fga', 'fg_perc',
             '2pt', '2pta', '2p_perc',
             '3pt', '3pta', '3p_perc',
             'FTM', 'FTA', 'FT_perc',
             'off_reb', 'def_reb', 'tot_reb',
             'ast', 'stl', 'blks', 'to', 'pf']
x_cols = measurement_cols + stat_cols

# Feature matrix, player names (kept as a one-column frame) and the 0/1 target.
X = df[x_cols]
names = df[['name']]
c = df['class']


# SVM: cross-validate to choose the best kernel, C, and gamma


In [None]:
from sklearn import svm, datasets
from sklearn import model_selection
# Hyper-parameter grid for the SVM. The gamma list previously contained
# .0001 twice and skipped .001, so one value was fitted twice and one was
# never tried.
parameters = {'kernel': ('linear', 'rbf'),
              'C': [.1, .5, 1, 10],
              'gamma': [.0001, .0005, .001, .01, .1]}

# Note: despite its name, `svr` is a classifier (SVC), not a regressor.
svr = svm.SVC()

# Exhaustive search over the grid, ranking candidates by average precision.
# The search is fitted on every row of X, so later evaluation on the same
# rows is not independent of this parameter choice.
clf = model_selection.GridSearchCV(svr, parameters, scoring = "average_precision")
clf.fit(X, c)


In [None]:
# Rebuild an SVM from the grid-search winner, this time with probability
# estimates switched on, fit it on all rows and show the class probabilities.
best = clf.best_params_
M = svm.SVC(kernel=best['kernel'],
            C=best['C'],
            gamma=best['gamma'],
            probability=True)
M.fit(X, c)
M.predict_proba(X)

In [None]:
# 'odds' is the second column of predict_proba, i.e. the probability of the
# second entry of M.classes_ (label 1 when both labels occur in `c`).
df['odds'] = M.predict_proba(X)[:, 1]

In [None]:
# Second-round picks (Pk > 30) ranked by predicted odds, highest first.
# DataFrame.sort() is deprecated and absent from current pandas;
# sort_values() is the supported equivalent.
df.loc[df['Pk'] > 30, ['name', 'odds', 'Pk']].sort_values('odds', ascending=False)


# 10-fold cross-validation to evaluate the model using the parameters found above

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from scipy.io import loadmat as load
from numpy import argsort, reshape, transpose, array, zeros,delete
from matplotlib.pyplot import imshow, xlabel, ylabel, title, figure, savefig
from numpy.random import permutation, seed
from pydotplus import graph_from_dot_data
from sklearn.externals.six import StringIO 


# ---- counters -------------------------------------------------------------
# Confusion matrices are w x h = 2 x 2 and indexed [true label][predicted label].
# One set of counters covers all players, one covers picks 1-30 ("round 1")
# and one covers the remaining picks ("round 2").
x = X[:]
w = h = 2
k = 10  # number of folds
correct = 0
round_1_correct = 0
round_2_correct = 0
conf = [[0] * w for _row in range(h)]
round_1_conf = [[0] * w for _row in range(h)]
round_2_conf = [[0] * w for _row in range(h)]

# ---- shuffle --------------------------------------------------------------
# The generator is re-seeded with the same value before each permutation so
# that features, labels and pick numbers are all shuffled from the same
# random stream.
seed(3)
x_ = permutation(x[:])
seed(3)
Y = permutation(c[:])
seed(3)
pick = permutation(list(df['Pk']))

# ---- fold assignment ------------------------------------------------------
# Consecutive shuffled rows share a fold. When the row count is not a
# multiple of k, the last `num_bigger_sets` folds get one extra row each.
split_size = len(Y) * 1.0 / k
base_size = int(split_size)
num_bigger_sets = len(Y) - k * base_size
fold_sizes = [base_size] * (k - num_bigger_sets) + [base_size + 1] * num_bigger_sets
folds = [fold for fold, size in enumerate(fold_sizes) for _ in range(size)]
fold_of = np.array(folds)

# ---- train / evaluate each fold -------------------------------------------
for j in range(k):
    held_out = fold_of == j

    X_train, Y_train = x_[~held_out], Y[~held_out]
    X_test, Y_test = x_[held_out], Y[held_out]

    # Draft round of every held-out player: 1 for picks below 31, else 2.
    rnd = [1 if p < 31 else 2 for p in pick[held_out]]

    best = clf.best_params_
    M = svm.SVC(kernel=best['kernel'], C=best['C'], gamma=best['gamma'],
                probability=True)
    M = M.fit(X_train, Y_train)

    predicted = M.predict(X_test)
    for truth, guess, draft_round in zip(Y_test, predicted, rnd):
        hit = 1 if guess == truth else 0

        correct += hit
        conf[truth][guess] += 1

        if draft_round == 1:
            round_1_correct += hit
            round_1_conf[truth][guess] += 1
        else:
            round_2_correct += hit
            round_2_conf[truth][guess] += 1


In [None]:
from sklearn.metrics import classification_report
# Report the grid-search winner and the per-class precision of the
# cross-validated SVM, then draw its confusion matrix.
print("gamma %s C %s kernel %s" % (clf.best_params_['gamma'],
                                   clf.best_params_['C'],
                                   clf.best_params_['kernel']))

# conf is indexed [true label][predicted label], so a column sum is the
# number of times a label was predicted. If a label was never predicted its
# precision is undefined; report nan instead of dividing by zero.
predicted_bust = conf[0][0] + conf[1][0]
predicted_player = conf[0][1] + conf[1][1]
bust_precision = conf[0][0] / (1.0 * predicted_bust) if predicted_bust else float('nan')
player_precision = conf[1][1] / (1.0 * predicted_player) if predicted_player else float('nan')
print("Bust Precision %s" % bust_precision)
print("Player Precision %s" % player_precision)

num_classes = 2
classes = range(num_classes)
class_names = ["Bust", "Player"]
title = "SVM Confusion Matrix"

# Cells darker than half the largest count get white text for contrast.
thresh = max(max(row) for row in conf) / 2.0
plt.imshow(conf, interpolation='nearest', cmap=plt.cm.Reds)
for i, j in itertools.product(classes, classes):
    plt.text(j, i, conf[i][j],
             horizontalalignment="center",
             color="white" if conf[i][j] > thresh else "black")

tick_marks = np.arange(num_classes)
plt.xticks(tick_marks, class_names, rotation=0)
plt.yticks(tick_marks, class_names)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title(title)
plt.show()



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate models to compare. Each entry is
# (hyper-parameter grid, estimator class, display name).
classifiers = [
    ({'kernel':('linear', 'rbf'), 'C':[.1, .5, 1, 10],
      'gamma':[.000005,.00001,.0001,.0005,.001,.01,.1]},
     svm.SVC, "Support Vector Machine"),
    ({'n_estimators':[5,10,15,20], 'criterion':('gini', 'entropy')},
     RandomForestClassifier, "Random Forest"),
    ({}, GaussianNB, "Naive Bayes"),
    ({'n_neighbors':[3,5,10,15,20], 'weights':('uniform', 'distance'),
      'algorithm':('ball_tree', 'kd_tree'), 'metric':('chebyshev', 'euclidean')},
     KNeighborsClassifier, "K Nearest Neighbors"),
]

for parameters, estimator, name in classifiers:
    # Choose hyper-parameters by grid search on all rows, ranked by average
    # precision. (`svr` is simply an unfitted instance of the estimator.)
    svr = estimator()
    clf = model_selection.GridSearchCV(svr, parameters, scoring = "average_precision")
    clf.fit(X, c)

    # Counters. Confusion matrices are 2 x 2, indexed [true][predicted];
    # one set for all players, one for picks 1-30, one for later picks.
    x = X[:]
    w = h = 2
    k = 10  # number of folds
    correct = 0
    round_1_correct = 0
    round_2_correct = 0
    conf = [[0] * w for _row in range(h)]
    round_1_conf = [[0] * w for _row in range(h)]
    round_2_conf = [[0] * w for _row in range(h)]

    # Shuffle features, labels and pick numbers. The generator is re-seeded
    # with the same value before each permutation so all three are shuffled
    # from the same random stream. This stays inside the loop on purpose:
    # it also fixes the generator state each model's folds start from.
    seed(3)
    x_ = permutation(x[:])
    seed(3)
    Y = permutation(c[:])
    seed(3)
    pick = permutation(list(df['Pk']))

    # Fold assignment: consecutive shuffled rows share a fold; when the row
    # count is not a multiple of k the last `num_bigger_sets` folds get one
    # extra row each.
    split_size = len(Y) * 1.0 / k
    base_size = int(split_size)
    num_bigger_sets = len(Y) - k * base_size
    fold_sizes = [base_size] * (k - num_bigger_sets) + [base_size + 1] * num_bigger_sets
    folds = [fold for fold, size in enumerate(fold_sizes) for _ in range(size)]
    fold_of = np.array(folds)

    for j in range(k):
        held_out = fold_of == j

        X_train, Y_train = x_[~held_out], Y[~held_out]
        X_test, Y_test = x_[held_out], Y[held_out]

        # Draft round of each held-out player: 1 for picks below 31, else 2.
        rnd = [1 if p < 31 else 2 for p in pick[held_out]]

        M = estimator(**clf.best_params_)
        M = M.fit(X_train, Y_train)

        predicted = M.predict(X_test)
        for truth, guess, draft_round in zip(Y_test, predicted, rnd):
            hit = 1 if guess == truth else 0

            correct += hit
            conf[truth][guess] += 1

            if draft_round == 1:
                round_1_correct += hit
                round_1_conf[truth][guess] += 1
            else:
                round_2_correct += hit
                round_2_conf[truth][guess] += 1

    print("best params %s %s" % (list(clf.best_params_.keys()),
                                 list(clf.best_params_.values())))

    # A column sum of conf is how often a label was predicted. A model that
    # never predicts a label has undefined precision for it; report nan
    # rather than aborting the whole comparison with a division by zero.
    predicted_bust = conf[0][0] + conf[1][0]
    predicted_player = conf[0][1] + conf[1][1]
    bust_precision = conf[0][0] / (1.0 * predicted_bust) if predicted_bust else float('nan')
    player_precision = conf[1][1] / (1.0 * predicted_player) if predicted_player else float('nan')
    print("Bust Precision %s" % bust_precision)
    print("Player Precision %s" % player_precision)

    num_classes = 2
    classes = range(num_classes)
    class_names = ["Bust", "Player"]
    title =  name + " Confusion Matrix"

    # Cells darker than half the largest count get white text. The threshold
    # is derived from this model's own matrix instead of a fixed count.
    thresh = max(max(row) for row in conf) / 2.0
    plt.imshow(conf, interpolation='nearest', cmap=plt.cm.Reds)
    for i, j in itertools.product(classes, classes):
        plt.text(j, i, conf[i][j],
                 horizontalalignment="center",
                 color="white" if conf[i][j] > thresh else "black")

    tick_marks = np.arange(num_classes)
    plt.xticks(tick_marks, class_names, rotation=0)
    plt.yticks(tick_marks, class_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title(title)
    plt.show()



In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit one entropy-based decision tree on all rows and list the predictor
# columns from most to least important.
#
# The tree is fitted on X directly. The previous version used `x`, which is
# only defined as a side effect of the cross-validation cells, and then
# reused `x` (and `title`) as loop variables, so running this cell a second
# time tried to fit the tree on a (name, importance) tuple.
classifier = DecisionTreeClassifier(criterion='entropy', splitter='best')
classifier.fit(X, c)

# (column name, importance) pairs, highest importance first; ties keep the
# order of x_cols.
results = sorted(zip(x_cols, classifier.feature_importances_),
                 key=lambda pair: pair[1], reverse=True)

for entry in results:
    print(entry)