In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn import metrics
#pd.set_option('display.mpl_style','default')
plt.rcParams['figure.figsize'] = (15,5)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
#Viewing Summary()
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq,df)

In [2]:
raw_data = pd.read_csv('Data/nba.csv')

In [3]:
# the independent variables set 
X1 = raw_data[['GP', 'AST', 'BLK']]
# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = X1.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X1.values, i) 
                          for i in range(len(X1.columns))] 
  
print(vif_data)

  feature       VIF
0      GP  3.922528
1     AST  2.552487
2     BLK  1.967612


In [4]:
inputs = X1.copy()
target = raw_data['TARGET_5Yrs']

### Using Statsmodel

In [6]:
x = sm.add_constant(inputs)
results = sm.Logit(target,x).fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.571926
         Iterations 6


0,1,2,3
Dep. Variable:,TARGET_5Yrs,No. Observations:,1340.0
Model:,Logit,Df Residuals:,1336.0
Method:,MLE,Df Model:,3.0
Date:,"Tue, 05 Jan 2021",Pseudo R-squ.:,0.1387
Time:,18:25:42,Log-Likelihood:,-766.38
converged:,True,LL-Null:,-889.75
Covariance Type:,nonrobust,LLR p-value:,3.33e-53

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.5090,0.227,-11.060,0.000,-2.954,-2.064
GP,0.0423,0.004,10.153,0.000,0.034,0.050
AST,0.1248,0.052,2.404,0.016,0.023,0.227
BLK,0.9716,0.205,4.736,0.000,0.569,1.374


In [9]:
pred = results.pred_table()

In [10]:
cm = pd.DataFrame(pred)

In [11]:
cm.columns = ['Predicted 0', 'Predicted 1']
cm = cm.rename(index={0:'Actual 0', 1:'Actual 1'})
cm

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,268.0,241.0
Actual 1,156.0,675.0


In [14]:
cm1 = np.array(cm)
accuracy = (cm1[0,0]+cm1[1,1])/cm1.sum()
accuracy

0.7037313432835821

### Using Scikit

In [15]:
inputs = X1.copy()
target = raw_data['TARGET_5Yrs']
model = LogisticRegression(solver='liblinear',random_state=0).fit(inputs,target)

In [17]:
model.coef_, model.intercept_

(array([[0.04061629, 0.12291352, 0.92098441]]), array([-2.38316987]))

In [25]:
results.params

const   -2.509047
GP       0.042325
AST      0.124837
BLK      0.971555
dtype: float64

In [27]:
sum_table = pd.DataFrame(inputs.columns.values, columns=['Features'])
#sum_table['Weights'] = model.coef_
sum_table['F-stats'] = f_regression(inputs,target)[0].round(3)
sum_table['P-values'] = f_regression(inputs,target)[1].round(4)
#sum_table['Weights'] = model.coef_
sum_table
sum_table

Unnamed: 0,Features,F-stats,P-values
0,GP,250.086,0.0
1,AST,42.447,0.0
2,BLK,61.798,0.0


In [29]:
cm = pd.DataFrame(confusion_matrix(target, model.predict(inputs)))
cm.columns = ['Predicted 0', 'Predicted 1']
cm = cm.rename(index={0:'Actual 0', 1:'Actual 1'})
cm

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,256,253
Actual 1,152,679


In [30]:
#Accuracy
cm1 = np.array(cm)
accuracy_train = (cm1[0,0]+cm1[1,1])/cm1.sum()
accuracy_train

0.6977611940298507

In [32]:
## Classification Report
print(classification_report(target, model.predict(inputs)))

              precision    recall  f1-score   support

         0.0       0.63      0.50      0.56       509
         1.0       0.73      0.82      0.77       831

    accuracy                           0.70      1340
   macro avg       0.68      0.66      0.66      1340
weighted avg       0.69      0.70      0.69      1340

