# Calculating the Accuracy of the Model

Using the same dataset, expand the model by including all the other features in the regression.

Then calculate the accuracy of the model and create a confusion matrix.

## Import the relevant libraries

In [1]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Load the data

Load the ‘Bank-data.csv’ dataset.

In [2]:
data = pd.read_csv('Bank-data.csv')

In [5]:
# Drop the 'Unnamed: 0' column (presumably a row index saved with the CSV).
# errors='ignore' keeps the cell idempotent: re-running it once the column is
# already gone no longer raises a KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
data.head()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,4.12,0.0,0.0,0.0,0.0,686.0,yes
4,4.856,0.0,1.0,0.0,0.0,157.0,no


In [7]:
# Encode the target as 1/0. The identity entries (1 -> 1, 0 -> 0) keep the cell
# idempotent: with only the 'yes'/'no' keys, running it a second time would map
# the already-encoded values to NaN.
data['y'] = data['y'].map({'yes': 1, 'no': 0, 1: 1, 0: 0})

In [8]:
data.head()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,0
1,0.767,0.0,0.0,2.0,1.0,274.0,1
2,4.858,0.0,1.0,0.0,0.0,167.0,0
3,4.12,0.0,0.0,0.0,0.0,686.0,1
4,4.856,0.0,1.0,0.0,0.0,157.0,0


### Declare the dependent and independent variables

In [9]:
# Target (y) and the single predictor (x) for the simple logistic regression.
y, x = data['y'], data['duration']

Use 'duration' as the independent variable.

In [11]:
x1 = sm.add_constant(x)

  return ptp(axis=axis, out=out, **kwargs)


### Simple Logistic Regression

Run the regression and graph the scatter plot.

In [13]:
l_reg = sm.Logit(y,x1).fit()

Optimization terminated successfully.
         Current function value: 0.546118
         Iterations 7


In [14]:
l_reg.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,516.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 10 Aug 2020",Pseudo R-squ.:,0.2121
Time:,22:07:04,Log-Likelihood:,-282.89
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,5.387e-35

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.7001,0.192,-8.863,0.000,-2.076,-1.324
duration,0.0051,0.001,9.159,0.000,0.004,0.006


In [15]:
confusion_matrix(x1,y,l_reg)

(array([[204.,  55.],
        [104., 155.]]), 0.693050193050193)

## Expand the model

We may be omitting many causal factors in our simple logistic model, so we switch to a multivariate logistic regression model instead. Add the ‘interest_rate’, ‘march’, ‘credit’ and ‘previous’ estimators to our model and run the regression again.

### Declare the independent variable(s)

In [46]:
# Keep every column as a predictor except the target 'y' and 'may'.
x = data.drop(columns=['y', 'may'])

In [47]:
x.head()

Unnamed: 0,interest_rate,credit,march,previous,duration
0,1.334,0.0,1.0,0.0,117.0
1,0.767,0.0,0.0,1.0,274.0
2,4.858,0.0,1.0,0.0,167.0
3,4.12,0.0,0.0,0.0,686.0
4,4.856,0.0,1.0,0.0,157.0


In [19]:
x1 = sm.add_constant(x)

  return ptp(axis=axis, out=out, **kwargs)


In [57]:
 # NOTE(review): this fits on `x`, which has no constant column, so the model has
 # no intercept (the summary below lists no `const` row). `x1`, built with
 # sm.add_constant, is what the other Logit fits in this notebook use -- confirm
 # the omission is intended; if not, both this fit and the later
 # confusion_matrix call need `x1` instead of `x`.
 l_reg = sm.Logit(y,x).fit()

Optimization terminated successfully.
         Current function value: 0.336668
         Iterations 7


In [58]:
l_reg.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,513.0
Method:,MLE,Df Model:,4.0
Date:,"Mon, 10 Aug 2020",Pseudo R-squ.:,0.5143
Time:,22:37:45,Log-Likelihood:,-174.39
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,1.185e-78

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
interest_rate,-0.8030,0.079,-10.201,0.000,-0.957,-0.649
credit,2.3459,1.071,2.190,0.029,0.246,4.445
march,-1.8387,0.315,-5.831,0.000,-2.457,-1.221
previous,1.5262,0.478,3.190,0.001,0.588,2.464
duration,0.0069,0.001,10.365,0.000,0.006,0.008


In [60]:
confusion_matrix(x,y,l_reg)

(array([[218.,  41.],
        [ 28., 231.]]), 0.8667953667953668)

### Confusion Matrix

Create the confusion matrix of the model and estimate its accuracy. 

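<i>For convenience we have already provided you with a function that finds the confusion matrix and the model accuracy. It is defined in the cell below, so run that cell before any of the <code>confusion_matrix(...)</code> calls earlier in the notebook.</i>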

In [10]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Return the confusion matrix and the accuracy of a fitted binary model.

    Parameters
    ----------
    data : data frame or array
        Inputs formatted in the same way as the data the model was fitted on
        (e.g. const, var1, var2, ... -- the column order is very important),
        without the actual values.
    actual_values : data frame or array
        The observed outcomes. For a logistic regression this should be a
        single column of 0s and 1s.
    model : fitted results object
        The fitted model, e.g. the object returned by ``sm.Logit(...).fit()``.
        Only its ``predict(data)`` method is used.
    threshold : float, optional
        Cut-off probability, strictly between 0 and 1. Predictions below it
        are counted as class 0, predictions at or above it as class 1.
        Defaults to 0.5.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual value is ``i``
        and whose predicted class is ``j``.
    accuracy : float
        ``(cm[0, 0] + cm[1, 1]) / cm.sum()``; NaN if nothing was counted.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    # Predict the values using the fitted model.
    pred_values = model.predict(data)

    # The 2-D histogram does the counting: actual values are split at 0.5
    # (0 vs 1) and predicted values at `threshold`. Values that fall outside
    # the [0, 1] range on either axis are not counted.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]

    # Accuracy = share of observations on the main diagonal. Guard the empty
    # case so it yields NaN without a 0/0 runtime warning.
    total = cm.sum()
    accuracy = (cm[0, 0] + cm[1, 1]) / total if total else float("nan")
    return cm, accuracy

In [48]:
from sklearn.preprocessing import StandardScaler

In [49]:
scaler = StandardScaler()

In [50]:
scaler.fit(x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [51]:
# Wrap the scaled array back into a DataFrame, restoring both the column names
# and the row index of x so that x_scaled stays aligned with y.
x_scaled = pd.DataFrame(scaler.transform(x), columns=x.columns, index=x.index)

In [52]:
x1 = sm.add_constant(x_scaled)

  return ptp(axis=axis, out=out, **kwargs)


In [53]:
x1.head()

Unnamed: 0,const,interest_rate,credit,march,previous,duration
0,1.0,-0.800908,-0.189737,1.659404,-0.382123,-0.770947
1,1.0,-1.103294,-0.189737,-0.602626,2.616961,-0.314503
2,1.0,1.078467,-0.189737,1.659404,-0.382123,-0.625583
3,1.0,0.684886,-0.189737,-0.602626,-0.382123,0.883298
4,1.0,1.077401,-0.189737,1.659404,-0.382123,-0.654656


In [54]:
l_reg = sm.Logit(y,x1).fit()

Optimization terminated successfully.
         Current function value: 0.336664
         Iterations 7


In [55]:
l_reg.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,512.0
Method:,MLE,Df Model:,5.0
Date:,"Mon, 10 Aug 2020",Pseudo R-squ.:,0.5143
Time:,22:35:19,Log-Likelihood:,-174.39
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,1.211e-77

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.1585,0.145,1.092,0.275,-0.126,0.443
interest_rate,-1.5003,0.168,-8.943,0.000,-1.829,-1.172
credit,0.4319,0.199,2.169,0.030,0.042,0.822
march,-0.8100,0.146,-5.556,0.000,-1.096,-0.524
previous,0.5123,0.167,3.067,0.002,0.185,0.840
duration,2.3931,0.255,9.381,0.000,1.893,2.893


In [56]:
confusion_matrix(x1,y,l_reg)

(array([[218.,  41.],
        [ 30., 229.]]), 0.862934362934363)

Unnamed: 0,const,interest_rate,credit,march,may,previous,duration
const,,,,,,,
interest_rate,,1.0,-0.156611,-0.059636,-0.44383,-0.385421,-0.020202
credit,,-0.156611,1.0,-0.11434,0.129737,0.117176,-0.028476
march,,-0.059636,-0.11434,1.0,-0.067339,-0.060022,0.022097
may,,-0.44383,0.129737,-0.067339,1.0,0.678697,-0.048609
previous,,-0.385421,0.117176,-0.060022,0.678697,1.0,-0.016794
duration,,-0.020202,-0.028476,0.022097,-0.048609,-0.016794,1.0
