In [1]:
import pandas as pd
import sklearn.model_selection as model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#Importing NCAA and draft merge data
ncaa_draft_data = pd.read_csv('../static/data/ncaa_model_in.csv')
ncaa_draft_data.head()

Unnamed: 0,Player,Class,Season,Pos,School,Conf,G,MP,FG,FGA,...,Rupp Trophy,Sporting News Player of the Year,UPI Player of the Year,USBWA Freshman of the Year,Year Drafted,Rd,Pk,Tm,Projected Pk,Team Score
0,Joe Adkins,JR,1999,G,Oklahoma State,Big 12,33,27.697,3.242,8.606,...,0,0,0,0,0,0,0,,0,0
1,Brian Montonati,JR,1999,F,Oklahoma State,Big 12,34,18.0,2.5,4.912,...,0,0,0,0,0,0,0,,0,0
2,Jerome Moiso,FR,1999,F,UCLA,Pac-10,29,23.759,4.517,9.276,...,0,0,0,0,0,0,0,,0,0
3,Jason Miskiri,SR,1999,G,George Mason,CAA,29,34.345,5.414,13.552,...,0,0,0,0,0,0,0,,0,0
4,Andre Miller,SR,1999,G,Utah,WAC,33,33.091,5.758,11.727,...,0,0,0,0,0,0,0,,0,0


In [3]:
# Cleaning data -- Replacing undrafted players with pick 61 and team to blank
ncaa_draft_data['Pk'].replace(0, 61,inplace=True)
ncaa_draft_data['Projected Pk'].replace(0, 61,inplace=True)
ncaa_draft_data.Tm = ncaa_draft_data.Tm.fillna('')
ncaa_draft_data.head(8)

Unnamed: 0,Player,Class,Season,Pos,School,Conf,G,MP,FG,FGA,...,Rupp Trophy,Sporting News Player of the Year,UPI Player of the Year,USBWA Freshman of the Year,Year Drafted,Rd,Pk,Tm,Projected Pk,Team Score
0,Joe Adkins,JR,1999,G,Oklahoma State,Big 12,33,27.697,3.242,8.606,...,0,0,0,0,0,0,61,,61,0
1,Brian Montonati,JR,1999,F,Oklahoma State,Big 12,34,18.0,2.5,4.912,...,0,0,0,0,0,0,61,,61,0
2,Jerome Moiso,FR,1999,F,UCLA,Pac-10,29,23.759,4.517,9.276,...,0,0,0,0,0,0,61,,61,0
3,Jason Miskiri,SR,1999,G,George Mason,CAA,29,34.345,5.414,13.552,...,0,0,0,0,0,0,61,,61,0
4,Andre Miller,SR,1999,G,Utah,WAC,33,33.091,5.758,11.727,...,0,0,0,0,0,0,61,,61,0
5,Chris Mihm,SO,1999,C,Texas,Big 12,32,32.094,4.5,10.031,...,0,0,0,0,2000,1,7,CHI,7,0
6,Greg McQuay,JR,1999,C,Purdue,Big Ten,33,25.545,4.182,7.364,...,0,0,0,0,0,0,61,,61,0
7,Dan McClintock,JR,1999,C,Northern Arizona,Big Sky,27,16.741,4.63,6.926,...,0,0,0,0,2000,2,53,DEN,53,0


In [4]:
# number of rows
ncaa_draft_data.shape[0]

31896

In [5]:
# deleting rows on PER less than 0 and undrafted 
delete_perlessthan0 = ncaa_draft_data[(ncaa_draft_data.PER < 0) & (ncaa_draft_data.Pk == 61)].index
ncaa_draft_data.drop(delete_perlessthan0 , inplace=True)
ncaa_draft_data.shape[0]

30625

In [6]:
# deleting rows on PER greater than 50 and undrafted 
delete_pergreaterthan50=ncaa_draft_data[(ncaa_draft_data.PER > 40) & (ncaa_draft_data.Pk == 61)].index
ncaa_draft_data.drop(delete_pergreaterthan50 , inplace=True)
ncaa_draft_data.shape[0]

30362

In [7]:
ncaa_draft_data.head(8)

Unnamed: 0,Player,Class,Season,Pos,School,Conf,G,MP,FG,FGA,...,Rupp Trophy,Sporting News Player of the Year,UPI Player of the Year,USBWA Freshman of the Year,Year Drafted,Rd,Pk,Tm,Projected Pk,Team Score
0,Joe Adkins,JR,1999,G,Oklahoma State,Big 12,33,27.697,3.242,8.606,...,0,0,0,0,0,0,61,,61,0
1,Brian Montonati,JR,1999,F,Oklahoma State,Big 12,34,18.0,2.5,4.912,...,0,0,0,0,0,0,61,,61,0
2,Jerome Moiso,FR,1999,F,UCLA,Pac-10,29,23.759,4.517,9.276,...,0,0,0,0,0,0,61,,61,0
3,Jason Miskiri,SR,1999,G,George Mason,CAA,29,34.345,5.414,13.552,...,0,0,0,0,0,0,61,,61,0
4,Andre Miller,SR,1999,G,Utah,WAC,33,33.091,5.758,11.727,...,0,0,0,0,0,0,61,,61,0
5,Chris Mihm,SO,1999,C,Texas,Big 12,32,32.094,4.5,10.031,...,0,0,0,0,2000,1,7,CHI,7,0
6,Greg McQuay,JR,1999,C,Purdue,Big Ten,33,25.545,4.182,7.364,...,0,0,0,0,0,0,61,,61,0
7,Dan McClintock,JR,1999,C,Northern Arizona,Big Sky,27,16.741,4.63,6.926,...,0,0,0,0,2000,2,53,DEN,53,0


Modeling


In [8]:
#columns:
#Player,Class,Season,Pos,School,Conf,G,MP,FG,FGA,2P,2PA,3P,3PA,FT,FTA,ORB,DRB,TRB,AST,STL,BLK,
#TOV,PF,PTS,PER,eFG%,WS,AP Player of the Year,NABC Defensive Player of the Year,NABC Player of the Year,
#Naismith Award,NCAA Tournament Most Outstanding Player,NIT Most Valuable Player,Rupp Trophy,
#Sporting News Player of the Year,UPI Player of the Year,USBWA Freshman of the Year,Year Drafted,Rd,Pk,Tm,,Projected Pk,Team Score

In [9]:
#features  33
#G,MP,FG,FGA,2P,2PA,3P,3PA,FT,FTA,ORB,DRB,TRB,AST,STL,BLK,
#TOV,PF,PTS,PER,eFG%,WS,AP Player of the Year,NABC Defensive Player of the Year,NABC Player of the Year,
#Naismith Award,NCAA Tournament Most Outstanding Player,NIT Most Valuable Player,Rupp Trophy,
#Sporting News Player of the Year,UPI Player of the Year,USBWA Freshman of the Year,Pk

In [11]:
#features and target
features = ncaa_draft_data.drop(['Player','Class','Season','Pos','School','Conf','Year Drafted','Rd','Tm','Projected Pk','Team Score'],1)
target = ncaa_draft_data['Pk']

In [12]:
#split training and testing data
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.40)

In [13]:
#standardize the features
scaler = preprocessing.StandardScaler().fit(features_train)
scaler.transform(features_train)
scaler.transform(features_test)

array([[ 1.30932308,  1.4897731 ,  1.3782121 , ..., -0.00740924,
        -0.02223259,  0.1548577 ],
       [-1.70651084, -1.30969511, -1.07819824, ..., -0.00740924,
        -0.02223259,  0.1548577 ],
       [ 0.4337584 ,  1.2415435 ,  1.41705073, ..., -0.00740924,
        -0.02223259,  0.1548577 ],
       ...,
       [ 0.72561329,  0.93722968,  0.41203467, ..., -0.00740924,
        -0.02223259,  0.1548577 ],
       [ 0.82289826,  0.27953978,  0.3460622 , ..., -0.00740924,
        -0.02223259,  0.1548577 ],
       [-1.41465594, -0.87907784, -0.2210882 , ..., -0.00740924,
        -0.02223259,  0.1548577 ]])

In [14]:
#Fit model by a multi-layer perceptron neural network using lbfgs optimization
model_MLP = MLPRegressor(hidden_layer_sizes = (33,), activation='identity', solver='lbfgs', alpha = 0.005, max_iter = 500, shuffle=True)
model_MLP.fit(features_train, target_train)

MLPRegressor(activation='identity', alpha=0.005, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(33,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)

In [15]:
#Fit a model by Ridge Regression
model_ridge = Ridge(alpha=1.0)
model_ridge.fit(features_train, target_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [16]:
#Fit a model by a linear SVR
model_SVR = SVR(kernel = 'linear')
model_SVR.fit(features_train, target_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
#Fit a model by Lasso Regression
model_lasso = linear_model.Lasso()
model_lasso.fit(features_train, target_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [18]:
#Fit model by linear regression
model_LR = LinearRegression()
model_LR.fit(features_train, target_train)
print(model_LR.intercept_)
print(model_LR.coef_)

0.0
[ 5.48456232e-16 -7.21644966e-16 -8.46241480e-14 -1.44533691e-13
  9.14589585e-14  1.42986317e-13  9.88350027e-14  1.42217835e-13
  2.69272452e-15  6.90419943e-16  4.49943902e-16  5.55111512e-17
 -2.13370988e-16 -3.33066907e-16  4.04494147e-15  4.12582295e-15
  6.98226199e-17 -2.01184555e-15 -3.55965257e-15 -3.79904441e-16
 -9.64614673e-16  1.77288739e-15 -3.31709122e-14  9.36803024e-15
  6.00231992e-14 -6.54231731e-15  1.32195577e-14  4.85008524e-16
  6.63597629e-15 -2.59426964e-15 -1.10932876e-14  2.21942855e-14
  1.00000000e+00]


In [19]:
#Predict target on testing data
target_pred_MLP = model_MLP.predict(features_test)
target_pred_LR = model_LR.predict(features_test)
target_pred_ridge = model_ridge.predict(features_test)
target_pred_SVR = model_SVR.predict(features_test)
target_pred_lasso = model_lasso.predict(features_test)

In [20]:
#Importing NCAA and draft merge data
nba_draft_data = ncaa_draft_data
pro_features = nba_draft_data.drop(['Player','Class','Season','Pos','School','Conf','Year Drafted','Rd','Tm','Projected Pk','Team Score'], 1)
scaler.transform(pro_features)

array([[ 0.92018322,  0.92728956,  0.55781254, ..., -0.00740924,
        -0.02223259,  0.1548577 ],
       [ 1.01746819,  0.04298298,  0.16304181, ..., -0.00740924,
        -0.02223259,  0.1548577 ],
       [ 0.53104336,  0.56816825,  1.23615848, ..., -0.00740924,
        -0.02223259,  0.1548577 ],
       ...,
       [-0.05266643, -0.50418   , -0.17214089, ..., -0.00740924,
        -0.02223259,  0.1548577 ],
       [-2.19293566, -1.59850597, -1.16704826, ..., -0.00740924,
        -0.02223259,  0.1548577 ],
       [ 0.62832833,  0.38651014,  0.37585622, ..., -0.00740924,
        -0.02223259,  0.1548577 ]])

In [21]:
#Using the multi-layer perceptron neural network model to predict current prospects
MLPPrediction = model_MLP.predict(pro_features)
nba_draft_data['Projected Pk'] = MLPPrediction
nba_draft_data['Projected Pk'] = nba_draft_data['Projected Pk'].round()
MLPResults = nba_draft_data[['Player','Class','Season','Pos','School','Conf','Year Drafted','Rd','Pk','Tm','Projected Pk']]
#MLPResults.head(8)
mlpdiff = MLPResults[(MLPResults['Pk'] != MLPResults['Projected Pk'])]
mlpdiff.head()

Unnamed: 0,Player,Class,Season,Pos,School,Conf,Year Drafted,Rd,Pk,Tm,Projected Pk


In [22]:
#Using the Linear regression perceptron neural network model to predict current prospects
LRPrediction = model_LR.predict(pro_features)
nba_draft_data['Projected Pk'] = LRPrediction
nba_draft_data['Projected Pk'] = nba_draft_data['Projected Pk'].round()
LRResults = nba_draft_data[['Player','Class','Season','Pos','School','Conf','Year Drafted','Rd','Pk','Tm','Projected Pk']]
lrdiff = LRResults[(LRResults['Pk'] != LRResults['Projected Pk'])]
lrdiff.shape[0]


0

In [23]:
#Using the Ridge perceptron neural network model to predict current prospects
RidgePrediction = model_ridge.predict(pro_features)
nba_draft_data['Projected Pk'] = RidgePrediction
nba_draft_data['Projected Pk'] = nba_draft_data['Projected Pk'].round()
RidgeResults = nba_draft_data[['Player','Class','Season','Pos','School','Conf','Year Drafted','Rd','Pk','Tm','Projected Pk']]
ridgediff = RidgeResults[(LRResults['Pk'] != RidgeResults['Projected Pk'])]
ridgediff.shape[0]

0

In [24]:
#Using the lasso perceptron neural network model to predict current prospects
lassoPrediction = model_lasso.predict(pro_features)
nba_draft_data['Projected Pk'] = lassoPrediction
nba_draft_data['Projected Pk'] = nba_draft_data['Projected Pk'].round()
lassoResults = nba_draft_data[['Player','Class','Season','Pos','School','Conf','Year Drafted','Rd','Pk','Tm','Projected Pk']]
lassodiff = lassoResults[(lassoResults['Pk'] != lassoResults['Projected Pk'])]
lassodiff.shape[0]
lassodiff.head(647)

Unnamed: 0,Player,Class,Season,Pos,School,Conf,Year Drafted,Rd,Pk,Tm,Projected Pk
5,Chris Mihm,SO,1999,C,Texas,Big 12,2000,1,7,CHI,8.0
19,Mamadou N'Diaye,JR,1999,C,Auburn,SEC,2000,1,26,DEN,27.0
21,Desmond Mason,JR,1999,F,Oklahoma State,Big 12,2000,1,17,SEA,18.0
27,Dan Langhi,JR,1999,F,Vanderbilt,SEC,2000,2,31,DAL,32.0
28,Kenyon Martin,JR,1999,C,Cincinnati,CUSA,2000,1,1,NJN,3.0
30,Jamaal Magloire,JR,1999,C,Kentucky,SEC,2000,1,19,CHH,20.0
32,Mark Madsen,JR,1999,F,Stanford,Pac-10,2000,1,29,LAL,30.0
38,Morris Peterson,JR,1999,F,Michigan State,Big Ten,2000,1,21,TOR,22.0
43,Etan Thomas,JR,1999,C,Syracuse,Big East,2000,1,12,DAL,13.0
46,Stromile Swift,FR,1999,F,Louisiana State,SEC,2000,1,2,VAN,4.0


In [25]:
#Using the SVR perceptron neural network model to predict current prospects
SVRPrediction = model_SVR.predict(pro_features)
nba_draft_data['Projected Pk'] = SVRPrediction
nba_draft_data['Projected Pk'] = nba_draft_data['Projected Pk'].round()
SVRResults = nba_draft_data[['Player','Class','Season','Pos','School','Conf','Year Drafted','Rd','Pk','Tm','Projected Pk']]
svrdiff = SVRResults[(SVRResults['Pk'] != SVRResults['Projected Pk'])]
svrdiff.shape[0]

0