In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn import metrics
#pd.set_option('display.mpl_style','default')
plt.rcParams['figure.figsize'] = (15,5)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
#Viewing Summary()
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq,df)

In [2]:
raw_data = pd.read_csv('nba.csv')

In [3]:
raw_data.isnull().sum()

Name            0
GP              0
MIN             0
PTS             0
FGM             0
FGA             0
FG%             0
3P Made         0
3PA             0
3P%            11
FTM             0
FTA             0
FT%             0
OREB            0
DREB            0
REB             0
AST             0
STL             0
BLK             0
TOV             0
TARGET_5Yrs     0
dtype: int64

In [4]:
data = raw_data.dropna(axis=0)

In [5]:
data.isnull().sum()

Name           0
GP             0
MIN            0
PTS            0
FGM            0
FGA            0
FG%            0
3P Made        0
3PA            0
3P%            0
FTM            0
FTA            0
FT%            0
OREB           0
DREB           0
REB            0
AST            0
STL            0
BLK            0
TOV            0
TARGET_5Yrs    0
dtype: int64

In [10]:
# the independent variables set 
X = data[['GP','3P Made', '3PA', 'AST', 'BLK','TOV']]
# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = X.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X.values, i) 
                          for i in range(len(X.columns))] 
  
print(vif_data)

   feature        VIF
0       GP   5.612784
1  3P Made  41.563028
2      3PA  46.409596
3      AST   6.409053
4      BLK   2.529858
5      TOV  12.634854


### Logistics Regressions

In [11]:
inputs = data[['GP','3P Made', '3PA', 'AST', 'BLK','TOV']]
target = data['TARGET_5Yrs']

#### Using Statsmodel

In [12]:
y = target.copy()
x = sm.add_constant(inputs)
results = sm.Logit(y,x).fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.564353
         Iterations 6


0,1,2,3
Dep. Variable:,TARGET_5Yrs,No. Observations:,1329.0
Model:,Logit,Df Residuals:,1322.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 05 Jan 2021",Pseudo R-squ.:,0.1492
Time:,14:30:07,Log-Likelihood:,-750.02
converged:,True,LL-Null:,-881.55
Covariance Type:,nonrobust,LLR p-value:,6.679999999999999e-54

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.3683,0.233,-10.178,0.000,-2.824,-1.912
GP,0.0393,0.004,9.019,0.000,0.031,0.048
3P Made,3.4200,0.949,3.602,0.000,1.559,5.281
3PA,-1.3042,0.345,-3.778,0.000,-1.981,-0.628
AST,0.0767,0.084,0.914,0.361,-0.088,0.241
BLK,0.8130,0.227,3.583,0.000,0.368,1.258
TOV,0.3034,0.179,1.698,0.089,-0.047,0.653


In [13]:
pred = results.pred_table()
pred

array([[252., 251.],
       [145., 681.]])

In [14]:
cm_df = pd.DataFrame(pred)
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0:'Actual 0', 1:'Actual 1'})
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,252.0,251.0
Actual 1,145.0,681.0


In [15]:
#Accuracy
cm = np.array(cm_df)
accuracy_train = (cm[0,0]+cm[1,1])/cm.sum()
accuracy_train

0.7020316027088036

#### Using scikit 

In [16]:
inputs = data[['GP','3P Made', '3PA', 'AST', 'BLK','TOV']]
target = data['TARGET_5Yrs']

In [43]:
model = LogisticRegression(solver='liblinear', random_state=0).fit(inputs,target)

In [44]:
model.classes_

array([0., 1.])

In [45]:
model.coef_

array([[ 0.03827173,  1.7320358 , -0.70188932,  0.06511893,  0.78805927,
         0.27765873]])

In [46]:
sum_table = pd.DataFrame(inputs.columns.values, columns=['Features'])
#sum_table['Weights'] = model.coef_
sum_table

Unnamed: 0,Features
0,GP
1,3P Made
2,3PA
3,AST
4,BLK
5,TOV


In [47]:
model.intercept_

array([-2.29969361])

In [48]:
sum_table['F-stats'] = f_regression(inputs,target)[0].round(3)

In [50]:
sum_table['P-values'] = f_regression(inputs,target)[1].round(3)
#sum_table['Weights'] = model.coef_
sum_table

Unnamed: 0,Features,F-stats,P-values
0,GP,249.442,0.0
1,3P Made,1.63,0.202
2,3PA,0.346,0.556
3,AST,41.327,0.0
4,BLK,62.294,0.0
5,TOV,104.986,0.0


In [51]:
confusion_matrix(target, model.predict(inputs))

array([[257, 246],
       [146, 680]], dtype=int64)

In [52]:
cm = pd.DataFrame(confusion_matrix(target, model.predict(inputs)))
cm.columns = ['Predicted 0', 'Predicted 1']
cm = cm.rename(index={0:'Actual 0', 1:'Actual 1'})
cm

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,257,246
Actual 1,146,680


In [53]:
#Accuracy
cm1 = np.array(cm)
accuracy_train = (cm1[0,0]+cm1[1,1])/cm1.sum()
accuracy_train

0.7050413844996237

In [54]:
inputs = data[['GP','3P Made', '3PA', 'AST', 'BLK','TOV']]
target = data['TARGET_5Yrs']

#### Improving Our Model

In [81]:
## Classification Report
print(classification_report(target, model.predict(inputs)))

              precision    recall  f1-score   support

         0.0       0.64      0.51      0.57       503
         1.0       0.73      0.82      0.78       826

    accuracy                           0.71      1329
   macro avg       0.69      0.67      0.67      1329
weighted avg       0.70      0.71      0.70      1329



In [56]:
# Improving the mode
model2 = LogisticRegression(solver='liblinear', C=10.0, random_state =0)
model2.fit(inputs,target)

LogisticRegression(C=10.0, random_state=0, solver='liblinear')

In [57]:
model.coef_, model2.coef_

(array([[ 0.03827173,  1.7320358 , -0.70188932,  0.06511893,  0.78805927,
          0.27765873]]),
 array([[ 0.0394139 ,  3.08267745, -1.18466325,  0.06860854,  0.79176866,
          0.31114604]]))

In [58]:
model.intercept_,model2.intercept_

(array([-2.29969361]), array([-2.37269254]))

In [59]:
sum_tab = pd.DataFrame(inputs.columns.values, columns=['Features'])
#sum_table['Weights'] = model.coef_
sum_tab

Unnamed: 0,Features
0,GP
1,3P Made
2,3PA
3,AST
4,BLK
5,TOV


#### Predictions with two models

In [63]:
md1_pred = model.predict_proba(inputs)

In [64]:
md2_pred = model2.predict_proba(inputs)

In [75]:
tab = inputs.copy()
#tab['Predictions(First Model)'] = md1_pred
#tab['Predictions(Second Model)'] = md2_pred
tab['Targets'] = target

In [76]:
confusion_matrix(target, model2.predict(inputs))

array([[256, 247],
       [147, 679]], dtype=int64)

In [78]:
cm1 = pd.DataFrame(confusion_matrix(target, model2.predict(inputs)))
cm1.columns = ['Predicted 0', 'Predicted 1']
cm1 = cm1.rename(index={0:'Actual 0', 1:'Actual 1'})
cm1

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,256,247
Actual 1,147,679


In [79]:
#Accuracy
cm1 = np.array(cm)
accuracy_train = (cm1[0,0]+cm1[1,1])/cm1.sum()
accuracy_train

0.7050413844996237

In [80]:
## Classification Report
print(classification_report(target, model2.predict(inputs)))

              precision    recall  f1-score   support

         0.0       0.64      0.51      0.57       503
         1.0       0.73      0.82      0.78       826

    accuracy                           0.70      1329
   macro avg       0.68      0.67      0.67      1329
weighted avg       0.70      0.70      0.70      1329



## Using scikit (standardization)

In [82]:
Scaler = StandardScaler()

In [83]:
inputs = data[['GP','3P Made', '3PA', 'AST', 'BLK','TOV']]
target = data['TARGET_5Yrs']

In [84]:
Scaler.fit(inputs)

StandardScaler()

In [85]:
input_scaled = Scaler.transform(inputs)
input_scaled

array([[-1.39646564,  0.65113757,  1.23594448,  0.23141107,  0.07904333,
         0.14359073],
       [-1.45369184,  1.17134305,  1.89417619,  1.45290614,  0.31248463,
         0.55784271],
       [ 0.77812989,  0.39103482,  0.85981208, -0.37933646, -0.15439797,
        -0.27066125],
       ...,
       [-0.99588225, -0.64937615, -0.73875065, -0.85436233,  0.07904333,
        -0.40874525],
       [-0.48084647, -0.64937615, -0.55068445,  0.43499358, -0.62128057,
        -0.54682924],
       [-0.76697746,  0.39103482,  0.48367967, -0.10789311, -0.62128057,
        -0.40874525]])

In [86]:
model3 = LogisticRegression(solver='liblinear', random_state =0)
model3.fit(input_scaled,target)

LogisticRegression(random_state=0, solver='liblinear')

In [87]:
model3.coef_

array([[ 0.68834621,  1.03650023, -1.10629414,  0.10278053,  0.34881685,
         0.21256348]])

In [97]:
m3_tab = pd.DataFrame(inputs.columns.values, columns=['Features'])

In [None]:
#m3_tab['Weights'] = model3.coef_
m3_tab

In [101]:
md3_pred = model3.predict_proba(input_scaled)
md3_pred

array([[0.72302331, 0.27697669],
       [0.70948318, 0.29051682],
       [0.38897401, 0.61102599],
       ...,
       [0.51903079, 0.48096921],
       [0.51769493, 0.48230507],
       [0.58907379, 0.41092621]])

In [102]:
md1_pred,md2_pred,md3_pred

(array([[0.67479537, 0.32520463],
        [0.65335611, 0.34664389],
        [0.35178265, 0.64821735],
        ...,
        [0.51733226, 0.48266774],
        [0.50143669, 0.49856331],
        [0.5746133 , 0.4253867 ]]),
 array([[0.74052379, 0.25947621],
        [0.73200507, 0.26799493],
        [0.40603896, 0.59396104],
        ...,
        [0.51513873, 0.48486127],
        [0.5202564 , 0.4797436 ],
        [0.59193498, 0.40806502]]),
 array([[0.72302331, 0.27697669],
        [0.70948318, 0.29051682],
        [0.38897401, 0.61102599],
        ...,
        [0.51903079, 0.48096921],
        [0.51769493, 0.48230507],
        [0.58907379, 0.41092621]]))

In [103]:
confusion_matrix(target, model3.predict(input_scaled))

array([[261, 242],
       [148, 678]], dtype=int64)

In [104]:
cm2 = pd.DataFrame(confusion_matrix(target, model3.predict(inputs)))
cm2.columns = ['Predicted 0', 'Predicted 1']
cm2 = cm2.rename(index={0:'Actual 0', 1:'Actual 1'})
cm2

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,503
Actual 1,0,826


In [105]:
#Accuracy
cm4 = np.array(cm2)
accuracy_train = (cm4[0,0]+cm4[1,1])/cm4.sum()
accuracy_train

0.6215199398043642

In [106]:
## Classification Report
print(classification_report(target, model3.predict(input_scaled)))

              precision    recall  f1-score   support

         0.0       0.64      0.52      0.57       503
         1.0       0.74      0.82      0.78       826

    accuracy                           0.71      1329
   macro avg       0.69      0.67      0.67      1329
weighted avg       0.70      0.71      0.70      1329



In [108]:
model.intercept_,model2.intercept_,model3.intercept_

(array([-2.29969361]), array([-2.37269254]), array([0.6122793]))