Import relevant libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

Load data file

In [None]:
# Read the Simmons workbook into a DataFrame, then preview its first five rows.
data = pd.read_excel(io='Simmons.xls')
data.head(n=5)

In [None]:
# describe() reports summary statistics for the columns of `data`.
data.describe()

In [None]:
# Distinct values that occur in the Coupon column.
data.loc[:, 'Coupon'].unique()

In [None]:
# Number of rows for each Coupon value.
data.loc[:, 'Coupon'].value_counts()

In [None]:
# Baseline method: accuracy of always predicting the most frequent Coupon class.
# Computed from the data rather than hand-copied counts, so it stays correct
# if the input file changes.
data['Coupon'].value_counts(normalize=True).max()

Build the model

Import libraries

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

splitting the dataset

In [None]:
# Predictors: the Card and Spending columns. Target: Coupon as an (n, 1) array.
x = data.loc[:, ['Card', 'Spending']]
y = data['Coupon'].to_numpy().reshape(-1, 1)

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Row counts of the four pieces produced by the split, in the order
# (x_train, y_train, x_test, y_test).
tuple(len(part) for part in (x_train, y_train, x_test, y_test))

Fit the model

In [None]:
# Logistic regression fitted with the lbfgs solver.
Lreg = LogisticRegression(solver='lbfgs')
# y_train is a column vector; flatten it to the 1-D target that fit() receives.
Lreg.fit(
    x_train,
    y_train.reshape(-1),
)

Predict the dependent variable for both the test set and the training set of independent variables

In [None]:
# Predicted class labels for the held-out test rows.
y_predict = Lreg.predict(X=x_test)
y_predict

In [None]:
# Predicted class labels for the training rows.
y_predict_train = Lreg.predict(X=x_train)
y_predict_train

In [None]:
# Keep only the second predict_proba column for the training rows.
train_proba = Lreg.predict_proba(x_train)
y_prob_train = train_proba[:, 1]
# Display as a single row of shape (1, n).
y_prob_train[None, :]

In [None]:
# Keep only the second predict_proba column for the test rows.
# (The former bare `y_prob.reshape(1,-1)` statement was removed: reshape returns
# a new array and the result was discarded, so it had no effect.)
y_prob = Lreg.predict_proba(x_test)[:, 1]
y_prob

In [None]:
# Estimate the same relationship with statsmodels to obtain a full summary table.
import statsmodels.api as sm

x = data.loc[:, ['Spending', 'Card']]
y = data.loc[:, 'Coupon']

# add_constant supplies the intercept column for the design matrix.
x1 = sm.add_constant(x)
logit_model = sm.Logit(endog=y, exog=x1)
result = logit_model.fit()
print(result.summary())

Checking Accuracy of Model

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
score = accuracy_score(y_true=y_test, y_pred=y_predict)
score

Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Flatten the 2x2 matrix row by row: (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).reshape(-1)
for _label, _count in (
    ("True Negatives: ", tn),
    ("False Positives: ", fp),
    ("False Negatives: ", fn),
    ("True Positives: ", tp),
):
    print(_label, _count)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the model's default predictions.
report_text = classification_report(y_true=y_test, y_pred=y_predict)
print(report_text)

#Recall tells us: when the actual class is yes, how often does the model predict yes?
#Precision tells us: when the model predicts yes, how often is it correct?
#Precision = tp / (tp + fp), for class 1
#Accuracy = (tp + tn) / (tp + tn + fp + fn)
#Recall = tp / (tp + fn), for class 1
#F-measure = (2 * Recall * Precision) / (Recall + Precision)

The F1 score is the harmonic mean of precision and recall: it balances the two by giving each equal weight, and it ranges from 0 to 1. It reaches its best value at 1 (perfect precision and recall) and its worst at 0.

In [None]:
# Correct predictions (tp + tn) divided by all predictions.
Accuracy = (tp + tn) / sum((tp, tn, fp, fn))
print("Accuracy {:0.2f}".format(Accuracy))

#True Negative Rate

In [None]:
# Share of actual negatives (tn + fp) that were predicted negative.
Specificity = tn / (fp + tn)
print("Specificity {:0.2f}".format(Specificity))

#True Positive Rate

In [None]:
# Share of actual positives (tp + fn) that were predicted positive.
Sensitivity = tp / (fn + tp)
print("Sensitivity {:0.2f}".format(Sensitivity))

ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
# ROC AUC is a ranking metric and must be computed from continuous scores.
# Hard 0/1 predictions (y_predict_train) reduce the curve to a single operating
# point, so the predicted probabilities are used here.
log_ROC_AUC1 = roc_auc_score(y_train, y_prob_train)
# Points of the training-set ROC curve and the area under it.
fpr1, tpr1, thresholds1 = roc_curve(y_train, y_prob_train)
roc_auc1 = auc(fpr1, tpr1)

In [None]:
# Training-set ROC curve, with the diagonal drawn as a dashed red reference line.
plt.figure()
ax = plt.gca()
ax.plot(fpr1, tpr1, color='blue', label='ROC curve (area = %0.2f)' % roc_auc1)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()

In [None]:
# ROC AUC is a ranking metric and must be computed from continuous scores.
# Hard 0/1 predictions (y_predict) reduce the curve to a single operating
# point, so the predicted probabilities are used here.
log_ROC_AUC = roc_auc_score(y_test, y_prob)
# Points of the test-set ROC curve and the area under it.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

In [None]:
# Test-set ROC curve, with the diagonal drawn as a dashed red reference line.
plt.figure()
ax = plt.gca()
ax.plot(fpr, tpr, color='blue', label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()

Set the threshold at 0.35

In [None]:
from sklearn.preprocessing import binarize
# `threshold` is keyword-only in current scikit-learn; passing it positionally
# raises TypeError. binarize expects a 2-D input, hence the (1, n) reshape;
# [0] takes the single row back out. Values strictly above 0.35 become 1.
y_predict_class1 = binarize(y_prob.reshape(1,-1), threshold=0.35)[0]
y_predict_class1

In [None]:
# binarize returned floats; cast them to integer class labels.
y_predict_class1 = y_predict_class1.astype(dtype=int)
y_predict_class1

In [None]:
# Confusion matrix for the 0.35-cutoff predictions.
confusion_matrix_2 = confusion_matrix(y_true=y_test, y_pred=y_predict_class1)
print(confusion_matrix_2)

In [None]:
# Flatten the 2x2 matrix for the 0.35 cutoff row by row: (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, y_predict_class1).reshape(-1)
for _label, _count in (
    ("True Negatives: ", tn),
    ("False Positives: ", fp),
    ("False Negatives: ", fn),
    ("True Positives: ", tp),
):
    print(_label, _count)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the 0.35-cutoff predictions.
report_text = classification_report(y_true=y_test, y_pred=y_predict_class1)
print(report_text)

Set the threshold at 0.50

In [None]:
from sklearn.preprocessing import binarize
# `threshold` is keyword-only in current scikit-learn; passing it positionally
# raises TypeError. binarize expects a 2-D input, hence the (1, n) reshape;
# [0] takes the single row back out. Values strictly above 0.50 become 1.
y_predict_class2 = binarize(y_prob.reshape(1,-1), threshold=0.50)[0]
y_predict_class2

In [None]:
# Confusion matrix for the 0.50-cutoff predictions.
confusion_matrix_3 = confusion_matrix(y_true=y_test, y_pred=y_predict_class2)
print(confusion_matrix_3)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the 0.50-cutoff predictions.
report_text = classification_report(y_true=y_test, y_pred=y_predict_class2)
print(report_text)

Set the threshold at 0.70

In [None]:
from sklearn.preprocessing import binarize
# `threshold` is keyword-only in current scikit-learn; passing it positionally
# raises TypeError. binarize expects a 2-D input, hence the (1, n) reshape;
# [0] takes the single row back out. Values strictly above 0.70 become 1.
y_predict_class3 = binarize(y_prob.reshape(1,-1), threshold=0.70)[0]
y_predict_class3

In [None]:
# Confusion matrix for the 0.70-cutoff predictions.
confusion_matrix_4 = confusion_matrix(y_true=y_test, y_pred=y_predict_class3)
print(confusion_matrix_4)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the 0.70-cutoff predictions.
report_text = classification_report(y_true=y_test, y_pred=y_predict_class3)
print(report_text)

Find the optimal cutoff point (threshold value)

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# Recompute the test-set ROC points and the area beneath them.
roc_points = roc_curve(y_test, y_prob)
fpr, tpr, thresholds = roc_points
roc_auc = auc(x=fpr, y=tpr)

In [None]:
# Report the test-set AUC with six decimal places.
print("Area under the ROC curve : %f" % (roc_auc,))

In [None]:
import numpy as np

# One row per ROC threshold. 'tf' = tpr - (1 - fpr) is zero where the true
# positive rate equals 1 - fpr, so the row with the smallest |tf| marks the
# cutoff where the two are closest.
i = np.arange(len(tpr))  # shared integer index for every column
roc = pd.DataFrame(
    {
        'fpr': fpr,
        'tpr': tpr,
        '1-fpr': 1 - fpr,
        'tf': tpr - (1 - fpr),
        'thresholds': thresholds,
    },
    index=i,
)
# Show the single row whose 'tf' is closest to zero.
roc.iloc[roc['tf'].abs().argsort()[:1]]

In [None]:
# Plot the true positive rate and 1 - false positive rate against the row index
# of `roc`; the curves cross near the cutoff where the two rates are equal.
# The x-axis is the threshold's row position, not a rate, so it is labelled as
# such, and a legend identifies the two curves.
fig, ax = plt.subplots()
ax.plot(roc['tpr'], label='True Positive Rate')
ax.plot(roc['1-fpr'], color='red', label='1 - False Positive Rate')
ax.set_xlabel('Threshold index')
ax.set_ylabel('Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc='best')
# Row positions carry no meaning on their own, so hide the tick labels.
ax.set_xticklabels([])

In [None]:
from sklearn.preprocessing import binarize
# `threshold` is keyword-only in current scikit-learn; passing it positionally
# raises TypeError. binarize expects a 2-D input, hence the (1, n) reshape;
# [0] takes the single row back out. Values strictly above 0.45 become 1.
# NOTE(review): 0.45 is presumably the cutoff read from the `roc` table — confirm.
y_predict_class4 = binarize(y_prob.reshape(1,-1), threshold=0.45)[0]
y_predict_class4

In [None]:
# Confusion matrix for the 0.45-cutoff predictions.
confusion_matrix_5 = confusion_matrix(y_true=y_test, y_pred=y_predict_class4)
print(confusion_matrix_5)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the 0.45-cutoff predictions.
report_text = classification_report(y_true=y_test, y_pred=y_predict_class4)
print(report_text)