In [50]:
import numpy as np 
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
import xgboost as xgb
from sklearn.utils import resample
from sklearn.ensemble import VotingClassifier
import os
import itertools
os.environ['KMP_DUPLICATE_LIB_OK']='True'
plt.style.use('seaborn')
pd.options.display.max_rows = 4000
%matplotlib inline
pd.set_option('display.max_columns', 100)

Cleaning Up Data

In [51]:
df_NBA = pd.read_csv('1985_2000_draft_class_NBA.csv',index_col=0)
df_NCAA= pd.read_csv('NCAA_Stats.csv',index_col=0)

In [52]:
df_NCAA.fillna(0, inplace=True)
df_NBA.fillna(0, inplace=True)

In [53]:
df_NCAA['2P']= np.where(df_NCAA['2P']==0, df_NCAA['FG'], df_NCAA['2P'])
df_NCAA['2PA']= np.where(df_NCAA['2PA']==0, df_NCAA['FGA'], df_NCAA['2PA'])
df_NCAA['2P%']= np.where(df_NCAA['2P%']==0, df_NCAA['FG%'], df_NCAA['2P%'])

Combining Data for EDA, Feature Creation and Modeling

In [54]:
columns2drop= ['Pick', 'Team', 'College', 'G', 'MP', 'PTS', 'TRB',
       'AST', 'FG%', '3P%', 'FT%', 'MP.1', 'PTS.1', 'TRB.1', 'AST.1', 'WS',
       'WS/48', 'BPM', 'VORP', 'Position3']
df_NBA.drop(columns= columns2drop, inplace= True)

In [55]:
df = df_NCAA.merge(df_NBA, on='Name', how='inner')

Feature Creation

In [56]:
df['Birthday']= pd.to_datetime(df['Birthday'])
df['Birthday']= df['Birthday'].dt.year

In [57]:
df['Age'] = df['Year']-df['Birthday']

In [58]:
df.drop(columns= ['Year','Birthday'], inplace= True)

In [59]:
df['Height'] = (df['Height'].str.strip('cm').astype(int))
df['Weight'] = (df['Weight'].str.strip('kg').astype(int))

In [60]:
# engineered a metric called assist to turnover ratio which is popular today
df['AST_TOV'] = df['AST']/df['TOV']
# engineered true fg% which takes into account pts, fga, fta
df['True_FG%'] = df['PTS']/(2*df['FGA'] + (.44*df['FTA']))*100
# engineered points per possession metric which is also very widely used today
df['PPP'] = df['PTS']/(df['FGA']+(0.44*df['FTA'])+df['TOV'])
# engineered free throw rate
df['FTR'] = (df['FTA']/df['FGA'])*100
# engineered a players turn over rate%
df['TOV%'] = df['TOV']/(df['FGA']+0.44 * df['FTA']+df['TOV'])*100
# engineered the hollinger assist ratio
df['Hasst%'] = df['AST']/(df['FGA']+.475 * df['FTA']+df['AST']+df['TOV'])*100

In [61]:
#Fix infinite numbers created
df['AST_TOV']= np.where(df['AST_TOV']== np.inf, (df['AST']/1), df['AST_TOV'])
df= df.fillna(0)

In [62]:
df.isna().sum()

Name             0
Yrs_College      0
School           0
Conf             0
G                0
MP               0
FG               0
FGA              0
FG%              0
2P               0
2PA              0
2P%              0
3P               0
3PA              0
3P%              0
FT               0
FTA              0
FT%              0
TRB              0
AST              0
STL              0
BLK              0
TOV              0
PF               0
PTS              0
SOS              0
Yrs              0
Height           0
Weight           0
Position1        0
Position2        0
All_Star_apps    0
All_NBA_apps     0
All_Def_apps     0
HOF              0
Age              0
AST_TOV          0
True_FG%         0
PPP              0
FTR              0
TOV%             0
Hasst%           0
dtype: int64

In [63]:
#create value to predict
df['Status']= np.where(df['Yrs']>4,1,0)
df['Status']= np.where(df['All_Star_apps']>0, 2, df['Status'])
df['Status']= np.where(df['All_Def_apps']>0, 2, df['Status'])
df['Status']= np.where(df['All_NBA_apps']>0, 3, df['Status'])
df['Status']= np.where(df['HOF']=='Hall of Fame', 4, df['Status'])

In [64]:
df.drop(columns=['All_Star_apps','All_Def_apps', 'All_NBA_apps', 'HOF', 'Name','Yrs'], inplace= True)

Modeling

In [65]:
df2.isna().sum()

Yrs_College                        0
G                                  0
MP                                 0
FG                                 0
FGA                                0
FG%                                0
2P                                 0
2PA                                0
2P%                                0
3P                                 0
3PA                                0
3P%                                0
FT                                 0
FTA                                0
FT%                                0
TRB                                0
AST                                0
STL                                0
BLK                                0
TOV                                0
PF                                 0
PTS                                0
SOS                                0
Height                             0
Weight                             0
Age                                0
AST_TOV                            1
T

In [87]:
df2= df.drop(columns=['School','Conf','Position1','Position2', 'Height'])

In [88]:
target = df2['Status']
features = df2.drop(['Status'], axis=1)

In [89]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=25,test_size=0.30)

log = LogisticRegression(class_weight= 'balanced')
log.fit(X_train, y_train)
log_preds = log.predict(X_test)

log_acc = metrics.accuracy_score(y_test, log_preds)

print('Test Accuracy score: ', log_acc)

Test Accuracy score:  0.33783783783783783


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [91]:
# A for loop to find out what columns would help us most if removed
for x in X_train.columns:
    X_train1= X_train.drop(columns=[x])
    X_test1=X_test.drop(columns=[x])
    y_train1=y_train.drop(columns=[x])
    y_test1=y_test.drop(columns=[x])
    lr = LogisticRegression(class_weight='balanced')
    lr.fit(X_train1, y_train1)
    lr_preds = lr.predict(X_test1)
    lr_acc = metrics.accuracy_score(y_test1, lr_preds)
    if lr_acc > log_acc:
        print(x,'LR Test Accuracy score: ', lr_acc, (lr_acc-log_acc))
    else:
        continue

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

FT% LR Test Accuracy score:  0.34234234234234234 0.004504504504504514
STL LR Test Accuracy score:  0.35135135135135137 0.013513513513513542


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

SOS LR Test Accuracy score:  0.34234234234234234 0.004504504504504514
Weight LR Test Accuracy score:  0.34234234234234234 0.004504504504504514
AST_TOV LR Test Accuracy score:  0.34234234234234234 0.004504504504504514
True_FG% LR Test Accuracy score:  0.34684684684684686 0.009009009009009028


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Hasst% LR Test Accuracy score:  0.35135135135135137 0.013513513513513542


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Seemingly poor performance by Logistic regression model

F-Test Feature Selection:

In [93]:
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression

scaler = StandardScaler()
# fit the scaler to the training data
scaler.fit(X_train)
#transform the training data
X_train1 = pd.DataFrame(data = scaler.transform(X_train), columns = features.columns)
#transform the testing dat
X_test1 = pd.DataFrame(data = scaler.transform(X_test), columns = features.columns)

selector = SelectKBest(f_regression, k=22)

selector.fit(X_train1, y_train)

selected_columns = X_train.columns[selector.get_support()]
removed_columns = X_train.columns[~selector.get_support()]

log_kbest = LogisticRegression(class_weight='balanced')
log_kbest = log_kbest.fit(X_train1[selected_columns], y_train)

log_kpreds = log_kbest.predict(X_test1[selected_columns])

log_kacc = metrics.accuracy_score(y_test, log_kpreds)

print('Test Accuracy score: ', log_kacc)

Test Accuracy score:  0.34684684684684686


Slightly Better, Lets see if gridsearch can give us better parameters to help out

In [96]:
logreg = LogisticRegression(class_weight = 'balanced')
parameters = {'C': [0.25,0.35,0.5,0.75,1.0],
             'max_iter': [100,200,300,400,500],
             'penalty': ['l1', 'l2']}
grid_tree1=GridSearchCV(logreg, parameters, cv=5, scoring='accuracy',verbose =1,n_jobs=-1)

grid_tree1.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    6.0s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=5, estimator=LogisticRegression(class_weight='balanced'),
             n_jobs=-1,
             param_grid={'C': [0.25, 0.35, 0.5, 0.75, 1.0],
                         'max_iter': [100, 200, 300, 400, 500],
                         'penalty': ['l1', 'l2']},
             scoring='accuracy', verbose=1)

In [97]:
# Single best score achieved across all params (min_samples_split)
print(grid_tree1.best_score_)

# Dictionary containing the parameters (min_samples_split) used to generate that score
print(grid_tree1.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid_tree1.best_estimator_)

0.38237490664675133
{'C': 0.5, 'max_iter': 400, 'penalty': 'l2'}
LogisticRegression(C=0.5, class_weight='balanced', max_iter=400)


In [98]:
#Predict the response for test dataset
y_pred = grid_tree1.best_estimator_.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Test Accuracy Score:",metrics.accuracy_score(y_test, y_pred))

Test Accuracy Score: 0.32882882882882886


No Bueno, How about if we gridsearch our Log Reg with F-Test?

In [101]:
logreg_kbest = LogisticRegression(class_weight = 'balanced')
parameters = {'C': [0.25,0.35,0.5,0.75,1.0],
             'max_iter': [100,200,300,400,500]}
grid_tree2=GridSearchCV(logreg_kbest, parameters, cv=5, scoring='accuracy',verbose =1,n_jobs=-1)

grid_tree2.fit(X_train1[selected_columns], y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 110 out of 125 | elapsed:    2.6s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    2.7s finished


GridSearchCV(cv=5, estimator=LogisticRegression(class_weight='balanced'),
             n_jobs=-1,
             param_grid={'C': [0.25, 0.35, 0.5, 0.75, 1.0],
                         'max_iter': [100, 200, 300, 400, 500]},
             scoring='accuracy', verbose=1)

In [102]:
#Predict the response for test dataset
y_pred = grid_tree2.best_estimator_.predict(X_test1[selected_columns])

# Model Accuracy, how often is the classifier correct?
print("Test Accuracy Score:",metrics.accuracy_score(y_test, y_pred))

Test Accuracy Score: 0.34684684684684686


No difference from our original F-Test Model

Maybe KNN will work better?