# Logistic Regression Using Statsmodels

### Last Notebook

In [1]:
#######################################
##1. Importing Libraries
#######################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#######################################
##2. Setting Options
#######################################
# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 500

#######################################
##3. Reading Dataset
#######################################
# The CSV uses ';' as its field separator.
data = pd.read_csv(
    '../datasets/bank/bank.csv',
    sep=';',
)

#######################################
##3. Cleaning Features
#######################################
# Group the three "basic.*" levels into a single 'Basic' level and give the
# remaining levels display-friendly labels. Values that are not listed here
# are left untouched.
# NOTE: the 'Illitirate' spelling is kept as-is because the label becomes part
# of a generated dummy column name (cat_education_Illitirate).
education_labels = {
    # Grouping
    'basic.4y': 'Basic',
    'basic.6y': 'Basic',
    'basic.9y': 'Basic',
    # Renaming
    'high.school': 'High School',
    'professional.course': 'Professional Course',
    'university.degree': 'University Degree',
    'illiterate': 'Illitirate',
    'unknown': 'Unknown',
}
data['education'] = data['education'].replace(education_labels)

# Response to dummy feature: yes -> 1, no -> 0.
# Mapping and casting in one step guarantees an integer column; assigning the
# labels one by one with .loc may leave the column with object dtype. Any label
# other than 'yes'/'no' becomes NaN and makes the cast fail loudly.
data['y'] = data['y'].map({'yes': 1, 'no': 0}).astype(int)

# From categorical features to dummy features.
# get_dummies(columns=...) drops each original categorical column and appends
# its indicator columns (named cat_<feature>_<level>) after the remaining
# columns, in the order given by `categories`.
# dtype=int keeps the indicators as 0/1 integers; recent pandas versions
# return booleans by default.
categories = ['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']
data = pd.get_dummies(
    data,
    columns=categories,
    prefix={category: 'cat_' + category for category in categories},
    dtype=int,
)

# Splitting X and y: every column except the response is a candidate predictor.
datacols = data.columns.tolist()
subset_y = ['y']
subset_X = [v for v in datacols if v != 'y']

#######################################
##4. Feature Selection
#######################################
# Importing Libraries
from sklearn.feature_selection import RFE
from sklearn.svm import SVR  # NOTE(review): not used in the visible cells
from sklearn.linear_model import LogisticRegression
# Number of features to keep
n = 12
# Instantiating
# 'liblinear' is the solver shown in the LogisticRegression repr recorded
# further down in this notebook; pinning it keeps the selection reproducible on
# scikit-learn versions whose default solver is different.
logreg = LogisticRegression(solver='liblinear')
# n_features_to_select is passed by keyword: current scikit-learn versions no
# longer accept it as a positional argument.
rfe = RFE(logreg, n_features_to_select=n)
# Training (the target is flattened to 1-D, as expected by fit)
rfe.fit(data[subset_X], data[subset_y].values.ravel())
# Choosing the new features: rfe.support_ is a boolean mask aligned with subset_X
final_subset_X = [feature for feature, keep in zip(subset_X, rfe.support_) if keep]
# X and y used to develop the model
X = data[final_subset_X]
y = data['y']

In [2]:
# Preview the first five rows of the selected predictors.
X.head(n=5)

Unnamed: 0,previous,euribor3m,cat_job_entrepreneur,cat_job_self-employed,cat_month_dec,cat_month_jul,cat_month_jun,cat_month_mar,cat_month_may,cat_month_oct,cat_poutcome_failure,cat_poutcome_success
0,0,1.313,0,0,0,0,0,0,1,0,0,0
1,0,4.855,0,0,0,0,0,0,1,0,0,0
2,0,4.962,0,0,0,0,1,0,0,0,0,0
3,0,4.959,0,0,0,0,1,0,0,0,0,0
4,0,4.191,0,0,0,0,0,0,0,0,0,0


In [3]:
# Preview the first five values of the response.
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

### Statsmodels

In [8]:
# The array-based Logit class is exposed by statsmodels.api. In current
# statsmodels releases statsmodels.formula.api only offers the lowercase,
# formula-based constructors (e.g. logit("y ~ x", data=...)).
import statsmodels.api as sm

# NOTE: X is passed without a constant column, so the model is fit without an
# intercept; statsmodels does not add one automatically. Use sm.add_constant(X)
# if an intercept is wanted.
logreg = sm.Logit(y, X)

In [9]:
# Fit the Logit model; the optimizer's termination report is printed as the
# cell output.
result = logreg.fit()

Optimization terminated successfully.
         Current function value: 0.279180
         Iterations 7


In [10]:
# Display the fit statistics and the per-feature coefficient table
# (coefficient, standard error, z, p-value and 95% confidence interval).
result.summary2()

0,1,2,3
Model:,Logit,No. Iterations:,7.0
Dependent Variable:,y,Pseudo R-squared:,0.192
Date:,2019-03-21 15:34,AIC:,2323.8813
No. Observations:,4119,BIC:,2399.7617
Df Model:,11,Log-Likelihood:,-1149.9
Df Residuals:,4107,LL-Null:,-1422.9
Converged:,1.0000,Scale:,1.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
previous,0.3493,0.1411,2.4750,0.0133,0.0727,0.6258
euribor3m,-0.6253,0.0265,-23.6151,0.0000,-0.6772,-0.5734
cat_job_entrepreneur,-0.4842,0.3817,-1.2685,0.2046,-1.2323,0.2639
cat_job_self-employed,-0.4626,0.3275,-1.4125,0.1578,-1.1046,0.1793
cat_month_dec,0.7147,0.4662,1.5331,0.1253,-0.1990,1.6284
cat_month_jul,0.3280,0.1842,1.7808,0.0750,-0.0330,0.6889
cat_month_jun,0.3431,0.1689,2.0310,0.0423,0.0120,0.6741
cat_month_mar,1.0612,0.3152,3.3666,0.0008,0.4434,1.6789
cat_month_may,-1.0490,0.1264,-8.3016,0.0000,-1.2967,-0.8014


- Este summary es muy útil para refinar el modelo antes de llevarlo a Sklearn
- Cuanto menor sea el p-value, más significativa será la variable
- Las variables con p-values altos deberían eliminarse

### Sklearn

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
# 'liblinear' is pinned because it is the solver shown in this cell's recorded
# output; scikit-learn versions with a different default solver would
# otherwise produce slightly different coefficients.
log_reg = LogisticRegression(solver='liblinear')
# Fit on the full dataset (no train/test split is made in this notebook).
log_reg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
# Mean accuracy on the same data the model was fitted on (training accuracy).
log_reg.score(X,y)

0.9021607186210245

In [14]:
# Null accuracy: the accuracy obtained by always predicting the most frequent
# class. With a 0/1 response, y.mean() is the share of the positive class.
positive_rate = y.mean()
max(positive_rate, 1 - positive_rate)

0.8905074047098811

- La accuracy del modelo supera solo ligeramente a la null accuracy
- Podríamos escoger otras variables, añadir más o eliminar algunas
- También podríamos hacer cross validation para obtener una estimación más fiable del score, ya que aquí se calcula sobre los mismos datos de entrenamiento

In [16]:
# Pair every feature name with its fitted coefficient. coef_ is transposed so
# that iterating yields one row per feature; each row is a length-1 array,
# which is why the values are displayed inside brackets.
pd.DataFrame([*zip(X.columns, log_reg.coef_.T)])

Unnamed: 0,0,1
0,previous,[0.3798316142096139]
1,euribor3m,[-0.5027490712653142]
2,cat_job_entrepreneur,[-0.34306615561163367]
3,cat_job_self-employed,[-0.33506416426093977]
4,cat_month_dec,[0.8733167999349573]
5,cat_month_jul,[0.3820874456812563]
6,cat_month_jun,[0.5096949858004949]
7,cat_month_mar,[1.270361228864782]
8,cat_month_may,[-0.7430896290668391]
9,cat_month_oct,[0.41185574423171945]
