In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn import metrics
#pd.set_option('display.mpl_style','default')
plt.rcParams['figure.figsize'] = (15,5)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
#Viewing Summary()
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq,df)

In [3]:
raw_data = pd.read_csv('nba.csv')

In [15]:
raw_data.isnull().sum()

Name            0
GP              0
MIN             0
PTS             0
FGM             0
FGA             0
FG%             0
3P Made         0
3PA             0
3P%            11
FTM             0
FTA             0
FT%             0
OREB            0
DREB            0
REB             0
AST             0
STL             0
BLK             0
TOV             0
TARGET_5Yrs     0
dtype: int64

In [17]:
data = raw_data.dropna(axis=1)
data.isnull().sum()

Name           0
GP             0
MIN            0
PTS            0
FGM            0
FGA            0
FG%            0
3P Made        0
3PA            0
FTM            0
FTA            0
FT%            0
OREB           0
DREB           0
REB            0
AST            0
STL            0
BLK            0
TOV            0
TARGET_5Yrs    0
dtype: int64

In [18]:
# the independent variables set 
X = data[['GP','3P Made', '3PA', 'AST', 'BLK','TOV']]
# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = X.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X.values, i) 
                          for i in range(len(X.columns))] 
  
print(vif_data)

   feature        VIF
0       GP   5.621731
1  3P Made  41.542536
2      3PA  46.371652
3      AST   6.391056
4      BLK   2.541630
5      TOV  12.631371


In [19]:
# the independent variables set 
X3 = data[['GP','3P Made', 'AST', 'BLK']]
# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = X3.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X3.values, i) 
                          for i in range(len(X3.columns))] 
  
print(vif_data)

   feature       VIF
0       GP  4.088346
1  3P Made  1.664277
2      AST  2.863462
3      BLK  2.004278


### Using Statsmodel

In [20]:
inputs = X3.copy()
target = raw_data['TARGET_5Yrs']

In [21]:
x = sm.add_constant(inputs)
results = sm.Logit(target,inputs).fit()

Optimization terminated successfully.
         Current function value: 0.622146
         Iterations 6


In [22]:
results.summary()

0,1,2,3
Dep. Variable:,TARGET_5Yrs,No. Observations:,1340.0
Model:,Logit,Df Residuals:,1336.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 06 Jan 2021",Pseudo R-squ.:,0.06302
Time:,09:16:05,Log-Likelihood:,-833.68
converged:,True,LL-Null:,-889.75
Covariance Type:,nonrobust,LLR p-value:,3.791e-24

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
GP,0.0030,0.002,1.489,0.137,-0.001,0.007
3P Made,-0.2353,0.166,-1.414,0.157,-0.562,0.091
AST,0.1715,0.052,3.316,0.001,0.070,0.273
BLK,0.8710,0.188,4.625,0.000,0.502,1.240


In [35]:
pred = results.pred_table()
pred

array([[  0., 509.],
       [  0., 831.]])

In [24]:
tab = pd.DataFrame(pred)
tab.columns = ['Predicted 0', 'Predicted 1']
tab = tab.rename(index={0:'Actual 0',1:'Actual 1'})
tab

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0.0,509.0
Actual 1,0.0,831.0


In [25]:
cm = np.array(pred)
accuracy = (cm[0,0]+cm[1,1])/cm.sum()
accuracy

0.6201492537313433

#### P-value Rejections

In [58]:
inputs1 = data[['AST', 'BLK']]
target = raw_data['TARGET_5Yrs']

In [60]:
x = sm.add_constant(inputs1)
results1 = sm.Logit(target,inputs1).fit()
results1.summary()

Optimization terminated successfully.
         Current function value: 0.623471
         Iterations 5


0,1,2,3
Dep. Variable:,TARGET_5Yrs,No. Observations:,1340.0
Model:,Logit,Df Residuals:,1338.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 06 Jan 2021",Pseudo R-squ.:,0.06102
Time:,10:02:15,Log-Likelihood:,-835.45
converged:,True,LL-Null:,-889.75
Covariance Type:,nonrobust,LLR p-value:,1.993e-25

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
AST,0.1916,0.034,5.662,0.000,0.125,0.258
BLK,1.0509,0.147,7.134,0.000,0.762,1.340


In [61]:
pred_tab = results1.pred_table()
pred_tab

array([[  0., 509.],
       [  0., 831.]])

#### Using Scikit

In [26]:
inputs = X3.copy()
target = raw_data['TARGET_5Yrs']
model = LogisticRegression(solver='liblinear',random_state=0).fit(inputs,target)

In [27]:
model.classes_

array([0., 1.])

In [29]:
model

LogisticRegression(random_state=0, solver='liblinear')

In [30]:
model.coef_

array([[ 0.04062254, -0.06124681,  0.12978818,  0.92064447]])

In [31]:
results.params

GP         0.002966
3P Made   -0.235340
AST        0.171550
BLK        0.871041
dtype: float64

In [33]:
sum_table = pd.DataFrame(inputs.columns.values, columns=['Features'])
#sum_table['Weights'] = model.coef_
sum_table['F-stats'] = f_regression(inputs,target)[0].round(3)
sum_table['P-values'] = f_regression(inputs,target)[1].round(3)
#sum_table['Weights'] = model.coef_
sum_table

Unnamed: 0,Features,F-stats,P-values
0,GP,250.086,0.0
1,3P Made,1.797,0.18
2,AST,42.447,0.0
3,BLK,61.798,0.0


In [34]:
cm = pd.DataFrame(confusion_matrix(target, model.predict(inputs)))
cm.columns = ['Predicted 0', 'Predicted 1']
cm = cm.rename(index={0:'Actual 0', 1:'Actual 1'})
cm

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,259,250
Actual 1,153,678


In [36]:
cm1 = np.array(cm)
accuracy_train = (cm1[0,0]+cm1[1,1])/cm1.sum()
accuracy_train

0.6992537313432836

In [37]:
## Classification Report
print(classification_report(target, model.predict(inputs)))

              precision    recall  f1-score   support

         0.0       0.63      0.51      0.56       509
         1.0       0.73      0.82      0.77       831

    accuracy                           0.70      1340
   macro avg       0.68      0.66      0.67      1340
weighted avg       0.69      0.70      0.69      1340



### Improving the Model

In [81]:
inputs = X3.copy()
target = raw_data['TARGET_5Yrs']
model1 = LogisticRegression(solver='liblinear', C= 10.0, random_state=0).fit(inputs,target)

In [41]:
model1

LogisticRegression(C=10.0, random_state=0, solver='liblinear')

In [42]:
pred1 = confusion_matrix(target, model1.predict(inputs))
pred1

array([[268, 241],
       [155, 676]], dtype=int64)

In [46]:
class1 = classification_report(target, model1.predict(inputs))
print(class1)

              precision    recall  f1-score   support

         0.0       0.63      0.53      0.58       509
         1.0       0.74      0.81      0.77       831

    accuracy                           0.70      1340
   macro avg       0.69      0.67      0.67      1340
weighted avg       0.70      0.70      0.70      1340



In [47]:
print(classification_report(target, model.predict(inputs)))

              precision    recall  f1-score   support

         0.0       0.63      0.53      0.58       509
         1.0       0.74      0.81      0.77       831

    accuracy                           0.70      1340
   macro avg       0.69      0.67      0.67      1340
weighted avg       0.70      0.70      0.70      1340



In [48]:
table = pd.DataFrame(pred1)
table.columns = ['Preidcted 0','Predicted 1']
table = table.rename(index={0:'Actual 0', 1:'Actual 1'})
table

Unnamed: 0,Preidcted 0,Predicted 1
Actual 0,268,241
Actual 1,155,676


In [50]:
cmm = np.array(pred1)
accuracy = (cmm[0,0]+cmm[1,1])/cmm.sum()
accuracy

0.7044776119402985

In [51]:
sum_table = pd.DataFrame(inputs.columns.values, columns=['Features'])
#sum_table['Weights'] = model.coef_
sum_table['F-stats'] = f_regression(inputs,target)[0].round(3)
sum_table['P-values'] = f_regression(inputs,target)[1].round(3)
#sum_table['Weights'] = model.coef_
sum_table

Unnamed: 0,Features,F-stats,P-values
0,GP,250.086,0.0
1,3P Made,1.797,0.18
2,AST,42.447,0.0
3,BLK,61.798,0.0


### P-Value Rejections

In [62]:
inputs1 = data[['GP', 'AST', 'BLK']]
target = raw_data['TARGET_5Yrs']

In [63]:
#model1 = LogisticRegression(solver='liblinear', C= 10.0, random_state=0).fit(inputs,target)
model2 = LogisticRegression(solver='liblinear', random_state=0).fit(inputs1,target)
model2

LogisticRegression(random_state=0, solver='liblinear')

In [64]:
model2.coef_

array([[0.04061629, 0.12291352, 0.92098441]])

In [71]:
sum_table1 = pd.DataFrame(inputs1.columns.values, columns=['Features'])
#sum_table['Weights'] = model.coef_
sum_table1['F-stats'] = f_regression(inputs1,target)[0].round(3)
sum_table1['P-values'] = f_regression(inputs1,target)[1].round(3)
#sum_table['Weights'] = model.coef_
sum_table1

Unnamed: 0,Features,F-stats,P-values
0,GP,250.086,0.0
1,AST,42.447,0.0
2,BLK,61.798,0.0


In [66]:
con_m = confusion_matrix(target, model2.predict(inputs1))
con_m

array([[256, 253],
       [152, 679]], dtype=int64)

In [67]:
cm_df = pd.DataFrame(con_m)
cm_df.columns = ['Predicted 0','Preidcted 1']
cm_df = cm_df.rename(index={0:'Actual 0', 1:'Actual 1'})
cm_df

Unnamed: 0,Predicted 0,Preidcted 1
Actual 0,256,253
Actual 1,152,679


In [68]:
xx = np.array(cm_df)
acc = (xx[0,0]+xx[1,1])/xx.sum()
acc

0.6977611940298507

In [70]:
print(classification_report(target, model2.predict(inputs1)))

              precision    recall  f1-score   support

         0.0       0.63      0.50      0.56       509
         1.0       0.73      0.82      0.77       831

    accuracy                           0.70      1340
   macro avg       0.68      0.66      0.66      1340
weighted avg       0.69      0.70      0.69      1340



### Improving the model

In [72]:
model3 = LogisticRegression(solver='liblinear',C=10.0, random_state=0).fit(inputs1,target)
model3

LogisticRegression(C=10.0, random_state=0, solver='liblinear')

In [73]:
con_m1 = confusion_matrix(target, model3.predict(inputs1))
con_m1

array([[267, 242],
       [156, 675]], dtype=int64)

In [74]:
cm_df1 = pd.DataFrame(con_m1)
cm_df1.columns = ['Predicted 0','Preidcted 1']
cm_df1 = cm_df1.rename(index={0:'Actual 0', 1:'Actual 1'})
cm_df1

Unnamed: 0,Predicted 0,Preidcted 1
Actual 0,267,242
Actual 1,156,675


In [75]:
cm_df

Unnamed: 0,Predicted 0,Preidcted 1
Actual 0,256,253
Actual 1,152,679


In [77]:
v = np.array(cm_df1)
accy = (v[0,0]+v[1,1])/v.sum()
accy

0.7029850746268657

In [86]:
print(classification_report(target, model3.predict(inputs1))), print(model3.coef_),print(model3.intercept_)

              precision    recall  f1-score   support

         0.0       0.63      0.52      0.57       509
         1.0       0.74      0.81      0.77       831

    accuracy                           0.70      1340
   macro avg       0.68      0.67      0.67      1340
weighted avg       0.70      0.70      0.70      1340

[[0.04210668 0.12421621 0.96052345]]
[-2.49097052]


(None, None, None)

In [87]:
print(classification_report(target, model2.predict(inputs1))),print(model2.coef_),print(model2.intercept_)

              precision    recall  f1-score   support

         0.0       0.63      0.50      0.56       509
         1.0       0.73      0.82      0.77       831

    accuracy                           0.70      1340
   macro avg       0.68      0.66      0.66      1340
weighted avg       0.69      0.70      0.69      1340

[[0.04061629 0.12291352 0.92098441]]
[-2.38316987]


(None, None, None)

In [88]:
print(classification_report(target, model1.predict(inputs))), print(model1.coef_),print(model1.intercept_)

              precision    recall  f1-score   support

         0.0       0.63      0.53      0.58       509
         1.0       0.74      0.81      0.77       831

    accuracy                           0.70      1340
   macro avg       0.69      0.67      0.67      1340
weighted avg       0.70      0.70      0.70      1340

[[ 0.0421443  -0.05357211  0.13032459  0.96067439]]
[-2.48904151]


(None, None, None)