# Generalized Linear Models 

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
from sklearn.metrics import confusion_matrix

In [3]:
# Import Data
# Both CSVs live in the same folder. Keeping that folder in a single constant
# means the notebook can be pointed at another copy of the data with one edit
# instead of two; the resolved file paths are unchanged.
DATA_DIR = "/Users/datascience/Desktop/ADS 502 Data Sets/Website Data Sets"

sales_train = pd.read_csv(f"{DATA_DIR}/clothing_sales_training.csv")
sales_test = pd.read_csv(f"{DATA_DIR}/clothing_sales_test.csv")

## 13. Create a logistic regression model to predict whether or not a customer has a store credit card, based on whether they have a web account and the days between purchases. Obtain the summary of the model.

In [4]:
# Response: the store-credit-card column, kept as a one-column frame
y = pd.DataFrame(sales_train[['CC']])
# Predictors: days between purchases and the web-account flag, plus the
# constant column that statsmodels needs in order to fit an intercept
X = sm.add_constant(pd.DataFrame(sales_train[['Days', 'Web']]))

In [5]:
# Specify the logistic regression on the training data, then fit it
training_logit = sm.Logit(y, X)
logreg01 = training_logit.fit()

Optimization terminated successfully.
         Current function value: 0.655955
         Iterations 5


In [6]:
# Obtain Summary of Logistic Regression Model
# summary2() is left as the cell's final expression so the notebook renders
# the fit statistics and coefficient table for the training-data model.
logreg01.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.053
Dependent Variable:,CC,AIC:,1909.5825
Date:,2021-11-22 19:52,BIC:,1925.4226
No. Observations:,1451,Log-Likelihood:,-951.79
Df Model:,2,LL-Null:,-1004.9
Df Residuals:,1448,LLR p-value:,8.3668e-24
Converged:,1.0000,Scale:,1.0
No. Iterations:,5.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,0.4962,0.0887,5.5968,0.0000,0.3224,0.6699
Days,-0.0037,0.0004,-8.4491,0.0000,-0.0046,-0.0028
Web,1.2537,0.3307,3.7914,0.0001,0.6056,1.9018


## 14. Are there any variables that should be removed from the model? If so, remove them and rerun the model.

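According to the p-values shown in the model summary, both predictor variables belong in the model: the p-values for Days (≈ 0.0000) and Web (0.0001) are well below 0.05, so neither variable should be removed and the model does not need to be rerun.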

## 15. Write the descriptive form of the logistic regression model using the coefficients obtained from Question 13.

$ \hat{p}(\text{credit card}) = \frac{\exp(0.4962 - 0.0037(\text{Days Between Purchases}) + 1.2537(\text{Web Account}))}{1 + \exp(0.4962 - 0.0037(\text{Days Between Purchases}) + 1.2537(\text{Web Account}))}$

## 16. Validate the model using the test data set.

In [7]:
# Response for the test set: the store-credit-card column as a one-column frame
y_test = pd.DataFrame(sales_test[['CC']])
# Test-set predictors with the same layout as the training design matrix:
# Days and Web, plus the constant column for the intercept
X_test = sm.add_constant(pd.DataFrame(sales_test[['Days', 'Web']]))

In [8]:
# Refit the same logistic regression specification on the test data so its
# coefficients and p-values can be compared with the training-data fit
test_logit = sm.Logit(y_test, X_test)
logreg01_test = test_logit.fit()

Optimization terminated successfully.
         Current function value: 0.656885
         Iterations 5


In [9]:
# Obtain Summary of Logistic Regression Model
# This is the summary of the model refit on the test data (logreg01_test);
# it is left as the cell's final expression so the notebook renders it.
logreg01_test.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.052
Dependent Variable:,CC,AIC:,1838.7104
Date:,2021-11-22 19:52,BIC:,1854.4324
No. Observations:,1395,Log-Likelihood:,-916.36
Df Model:,2,LL-Null:,-966.4
Df Residuals:,1392,LLR p-value:,1.8533999999999998e-22
Converged:,1.0000,Scale:,1.0
No. Iterations:,5.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,0.4634,0.0873,5.3105,0.0000,0.2924,0.6345
Days,-0.0035,0.0004,-8.2261,0.0000,-0.0043,-0.0026
Web,1.0973,0.2830,3.8780,0.0001,0.5427,1.6519


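The model refit on the test data confirms that both predictor variables belong in the model: the p-values for Days and Web remain well below 0.05, and the coefficient estimates are close to those obtained from the training data.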

## 17. Obtain the predicted values of the response variable for each record in the data set.

In [10]:
# Score every test record with the model that was fit on the training data
predictions_prob = logreg01.predict(exog=X_test)

# Display the predicted probability for each test record
predictions_prob

0       0.463090
1       0.542853
2       0.578054
3       0.556706
4       0.382003
          ...   
1390    0.538220
1391    0.575651
1392    0.462169
1393    0.544112
1394    0.551676
Length: 1395, dtype: float64

In [11]:
# Convert the predicted probabilities into 0/1 class labels: a record is
# assigned to the positive class when its probability exceeds 0.5
predictions = logreg01.predict(X_test).gt(0.5).astype(int)

# Show the predicted class for each test record
predictions

0       0
1       1
2       1
3       1
4       0
       ..
1390    1
1391    1
1392    0
1393    1
1394    1
Length: 1395, dtype: int64

In [12]:
# Validate Model using a confusion Matrix
# Rows are the actual classes and columns are the predicted classes.
# Pinning labels=[0, 1] guarantees a 2x2 matrix in that order even if one
# class were absent from both y_test and predictions, so the unpacking below
# cannot fail or attach a count to the wrong cell.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
# ravel() flattens the 2x2 matrix row by row: [[TN, FP], [FN, TP]]
TN, FP, FN, TP = cm.ravel()
print('TN: ', TN, '\nFP: ', FP, '\nFN: ', FN, '\nTP: ', TP)
cm

TN:  405 
FP:  312 
FN:  215 
TP:  463


array([[405, 312],
       [215, 463]])

In [13]:
### Evaluation Measures ###
# Marginal totals of the confusion matrix
TAN = TN + FP    # total actually negative
TAP = FN + TP    # total actually positive
TPN = TN + FN    # total predicted negative
TPP = FP + TP    # total predicted positive
GT = TAN + TAP   # grand total of records

# Share of all records classified correctly, and its complement
Acc = (TN + TP) / GT
Error = 1 - Acc

# Share of actual positives that were predicted positive (sensitivity / recall)
Sens = TP / TAP
# Share of actual negatives that were predicted negative (specificity)
Spec = TN / TAN
# Share of predicted positives that are actually positive (precision)
Prec = TP / TPP

# Labels and values alternate so print() emits one measure per line
report = ('Accuracy: ', Acc, '\nError Rate: ', Error,
          '\nSensitivity/Recall: ', Sens, '\nSpecificity: ', Spec,
          '\nPrecision: ', Prec)
print(*report)


Accuracy:  0.6222222222222222 
Error Rate:  0.37777777777777777 
Sensitivity/Recall:  0.6828908554572272 
Specificity:  0.5648535564853556 
Precision:  0.5974193548387097
