In [1]:
# Identifying Customer Targets (Python)

# prepare for Python version 3x features and functions
from __future__ import division, print_function

# import packages for data handling, plotting, and statistical modeling
import pandas as pd  # DataFrame structure and operations
import numpy as np  # arrays and numerical processing
import matplotlib.pyplot as plt  # 2D plotting
import statsmodels.api as sm  # logistic regression
import statsmodels.formula.api as smf  # R-like model specification
import patsy  # translate model specification into design matrices

# import user-defined module (note: the alias `eval` shadows Python's built-in eval())
import evaluate_classifier as eval

# Load the bank marketing data into a DataFrame.
# The file is semicolon-delimited, so the separator is passed explicitly.
# Categorical fields arrive as plain strings and are recoded in later cells.
bank = pd.read_csv('bank.csv', delimiter=';')
# preview the first rows
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,response
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [2]:
# Collapse the detailed job categories into a three-level jobtype variable.
# Jobs absent from the lookup table (e.g. 'unemployed') come back from map()
# as NaN and are then labelled 'Other/Unknown'.
job_to_jobtype = dict.fromkeys(
    ['admin.', 'entrepreneur', 'management', 'self-employed'], 'White Collar')
job_to_jobtype.update(dict.fromkeys(
    ['blue-collar', 'services', 'technician'], 'Blue Collar'))
bank['jobtype'] = bank['job'].map(job_to_jobtype).fillna('Other/Unknown')
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,response,jobtype
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no,Other/Unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no,Blue Collar
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no,White Collar
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no,White Collar
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no,Blue Collar


In [3]:
# Recode the categorical fields with display labels.
# Series.map() returns NaN for any value missing from its lookup table, so
# every map() is followed by fillna() with the default for that variable.
# NOTE: this cell overwrites its own input columns; running it a second time
# would send the already-relabelled values to the fillna() defaults.

# set marital variable
marital_to_label = {'divorced': 'Divorced',
                    'married': 'Married',
                    'single': 'Single'}
bank['marital'] = bank['marital'].map(marital_to_label).fillna('Unknown')

# set education variable
education_to_label = {'primary': 'Primary',
                      'secondary': 'Secondary',
                      'tertiary': 'Tertiary'}
bank['education'] = bank['education'].map(education_to_label).fillna('Unknown')

# set no/yes variable labels (values outside no/yes default to 'No')
noyes_to_label = {'no': 'No', 'yes': 'Yes'}
for noyes_column in ['default', 'housing', 'loan']:
    bank[noyes_column] = bank[noyes_column].map(noyes_to_label).fillna('No')

# code response as binary variable
# The fallback has to be numeric: a string such as 'No' would leave text in a
# column that is otherwise 0/1 and is used as the regression target. Unmapped
# responses are treated as 0 (no), and the column is kept as integers.
noyes_to_binary = {'no': 0, 'yes': 1}
bank['response'] = (bank['response'].map(noyes_to_binary)
                    .fillna(0).astype('int64'))

bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,response,jobtype
0,30,unemployed,Married,Primary,No,1787,No,No,cellular,19,oct,79,1,-1,0,unknown,0,Other/Unknown
1,33,services,Married,Secondary,No,4789,Yes,Yes,cellular,11,may,220,1,339,4,failure,0,Blue Collar
2,35,management,Single,Tertiary,No,1350,Yes,No,cellular,16,apr,185,1,330,1,failure,0,White Collar
3,30,management,Married,Tertiary,No,1476,Yes,Yes,unknown,3,jun,199,4,-1,0,unknown,0,White Collar
4,59,blue-collar,Married,Secondary,No,0,Yes,No,unknown,5,may,226,1,-1,0,unknown,0,Blue Collar


In [4]:
# Work only with clients being approached for the first time, which the data
# marks with pdays == -1.
# NOTE(review): the name `filter` shadows Python's built-in filter() for the
# rest of the session; it is kept so that existing references still work.
filter = bank['pdays'] == -1
# keep those rows and only the columns needed for the targeting model;
# copy() makes bankwork independent of bank so columns can be added to it later
bankwork = bank.loc[filter, ['response', 'age', 'jobtype', 'education',
                             'marital', 'default', 'balance', 'housing',
                             'loan']].copy()
bankwork.head()

Unnamed: 0,response,age,jobtype,education,marital,default,balance,housing,loan
0,0,30,Other/Unknown,Primary,Married,No,1787,No,No
3,0,30,White Collar,Tertiary,Married,No,1476,Yes,Yes
4,0,59,Blue Collar,Secondary,Married,No,0,Yes,No
7,0,39,Blue Collar,Secondary,Married,No,147,Yes,No
8,0,41,White Collar,Tertiary,Married,No,221,Yes,No


In [5]:
# descriptive statistics for the numeric columns, then a frequency table for
# each categorical variable in the model
print(bankwork.describe())
for column_name in ['jobtype', 'marital', 'education',
                    'default', 'housing', 'loan']:
    print('\n' + column_name + ':\n', bankwork[column_name].value_counts())

# means of the continuous explanatory variables within each response group
for column_name in ['age', 'balance']:
    print(bankwork.pivot_table([column_name], index=['response']))

# baseline response rate: share of rows with response == 1 (used later for lift)
filter_took_offer = bankwork['response'] == 1
baseline_response_rate = len(bankwork[filter_took_offer]) / len(bankwork)
print('\nBaseline proportion of clients responding to offer: ',
      round(baseline_response_rate, 5), '\n')

# proportion responding (mean of the 0/1 response) across the levels of each
# categorical variable
for column_name in ['jobtype', 'education', 'marital',
                    'default', 'housing', 'loan']:
    print(bankwork.pivot_table(['response'], index=[column_name]))

          response          age       balance
count  3705.000000  3705.000000   3705.000000
mean      0.090958    41.083671   1374.862078
std       0.287588    10.373818   3008.524207
min       0.000000    19.000000  -3313.000000
25%       0.000000    33.000000     60.000000
50%       0.000000    39.000000    415.000000
75%       0.000000    49.000000   1412.000000
max       1.000000    87.000000  71188.000000

jobtype:
 Blue Collar      1776
White Collar     1453
Other/Unknown     476
Name: jobtype, dtype: int64

marital:
 Married     2305
Single       957
Divorced     443
Name: marital, dtype: int64

education:
 Secondary    1891
Tertiary     1084
Primary       580
Unknown       150
Name: education, dtype: int64

default:
 No     3634
Yes      71
Name: default, dtype: int64

housing:
 Yes    2043
No     1662
Name: housing, dtype: int64

loan:
 No     3113
Yes     592
Name: loan, dtype: int64
                age
response           
0         40.983076
1         42.089021
             

In [7]:
# specify model for logistic regression as an R-like formula
bank_spec = ('response ~ age + jobtype + education + marital +'
             '    default + balance + housing + loan')

# ----------------------------------
# fit logistic regression model
# ----------------------------------
# patsy converts the formula into the response (y) and design matrix (x),
# both returned as DataFrames, which is the form passed to sm.Logit below
y, x = patsy.dmatrices(bank_spec, data=bankwork, return_type='dataframe')

# set up the model, fit it to the full data set, and report the estimates
my_logit_model = sm.Logit(endog=y, exog=x)
my_logit_model_fit = my_logit_model.fit()
print(my_logit_model_fit.summary())

Optimization terminated successfully.
         Current function value: 0.293877
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               response   No. Observations:                 3705
Model:                          Logit   Df Residuals:                     3692
Method:                           MLE   Df Model:                           12
Date:                Sun, 17 Feb 2019   Pseudo R-squ.:                 0.03568
Time:                        12:56:14   Log-Likelihood:                -1088.8
converged:                       True   LL-Null:                       -1129.1
                                        LLR p-value:                 3.223e-12
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                   -2.3937      0.390     -6.132      0.000      -3.159

In [8]:
# predicted probability of responding to the offer
# predict() with no arguments scores the rows the model was fitted on and
# returns probabilities by default. The `linear` keyword is deprecated in newer
# statsmodels releases, so it is no longer passed.
bankwork['pred_logit_prob'] = my_logit_model_fit.predict()

# map target from probability cutoff specified
def prob_to_pred(x, cutoff):
    """Return 1 when x is strictly greater than cutoff, otherwise 0."""
    return 1 if x > cutoff else 0

# try cutoff set at 0.50
# (extra keyword arguments given to Series.apply are forwarded to prob_to_pred)
bankwork['pred_logit_50'] = bankwork['pred_logit_prob'].apply(
    prob_to_pred, cutoff=0.50)
# predicted class in rows, observed response in columns, with totals
confusion_50 = pd.crosstab(bankwork['pred_logit_50'], bankwork['response'],
                           margins=True)
print('\nConfusion matrix for 0.50 cutoff\n', confusion_50)
# cutoff 0.50 does not work for targeting... all predictions 0 or No


Confusion matrix for 0.50 cutoff
 response          0    1   All
pred_logit_50                 
0              3368  337  3705
All            3368  337  3705


In [9]:
# try cutoff set at 0.10
bankwork['pred_logit_10'] = bankwork['pred_logit_prob'].apply(
    lambda d: prob_to_pred(d, cutoff=0.10))
# predicted class in rows, observed response in columns, with totals
print('\nConfusion matrix for 0.10 cutoff\n',
      pd.crosstab(bankwork.pred_logit_10, bankwork.response, margins=True))

# Scale to a percentage before rounding: rounding first and multiplying by 100
# afterwards can print binary floating-point noise (100 * 0.57 displays as
# 56.99999999999999).
# NOTE(review): element [4] of evaluate_classifier's result is assumed to be
# the statistic named in the label. The value recorded for this cell, 65.9,
# equals overall accuracy (2262 + 178) / 3705 -- confirm against the
# evaluate_classifier module.
print('\n Logistic Regression Performance (0.10 cutoff)\n',
      'Percentage of Targets Correctly Classified:',
      round(100 * eval.evaluate_classifier(bankwork['pred_logit_10'],
                                           bankwork['response'])[4], 1), '\n')


Confusion matrix for 0.10 cutoff
 response          0    1   All
pred_logit_10                 
0              2262  159  2421
1              1106  178  1284
All            3368  337  3705

 Logistic Regression Performance (0.10 cutoff)
 Percentage of Targets Correctly Classified: 65.9 



In [10]:
# direct calculation of lift
# Decile labels, listed from Decile_10 down to Decile_1. They are handed to
# pd.qcut further down, which assigns labels to bins in ascending order, so
# Decile_10 marks the lowest predicted probabilities and Decile_1 the highest.
decile_label = ['Decile_' + str(rank) for rank in range(10, 0, -1)]
# draws on baseline response rate computed earlier
def lift(x):
    """Express the response rate x as a multiple of baseline_response_rate."""
    baseline = baseline_response_rate
    return x / baseline

# Cut the predicted probabilities into ten quantile bins. qcut assigns the
# labels in ascending bin order, so with decile_label running from Decile_10
# to Decile_1 the highest-probability bin is labelled Decile_1.
prediction_deciles = pd.qcut(bankwork.pred_logit_prob, 10, labels=decile_label)
# observed=False is stated explicitly: it keeps every decile label in the
# result and avoids the warning newer pandas raises when a categorical
# grouper relies on the default.
decile_groups = bankwork.response.groupby(prediction_deciles, observed=False)
# observed response rate within each decile (mean of the 0/1 response)
decile_response_rates = decile_groups.mean()
print(decile_response_rates)
# lift = decile response rate relative to the baseline response rate
lift_values = lift(decile_response_rates)
print('\nLift Chart Values by Decile:\n', lift_values, '\n')

pred_logit_prob
Decile_10    0.043127
Decile_9     0.048649
Decile_8     0.053908
Decile_7     0.072973
Decile_6     0.078167
Decile_5     0.078378
Decile_4     0.110811
Decile_3     0.118598
Decile_2     0.102703
Decile_1     0.202156
Name: response, dtype: float64

Lift Chart Values by Decile:
 pred_logit_prob
Decile_10    0.474138
Decile_9     0.534846
Decile_8     0.592672
Decile_7     0.802270
Decile_6     0.859374
Decile_5     0.861697
Decile_4     1.218261
Decile_3     1.303878
Decile_2     1.129120
Decile_1     2.222520
Name: response, dtype: float64 

