## Predicting Direct Marketing Effectiveness

Given *data about bank customers*, we need to predict, for a given customer, whether the bank's marketing technique was effective.

We will be using Logistic Regression to make our predictions.

Data source: https://www.kaggle.com/datasets/psvishnu/bank-direct-marketing

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [2]:
# Read the raw dataset; its fields are separated by semicolons, not commas.
data = pd.read_csv("bank-full.csv", sep=";")
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


### Preprocessing

In [4]:
df = data.copy()

In [5]:
# Separate the feature columns from the target column 'y'.
X = df.drop(columns='y')
y = df['y']

In [6]:
def get_categorical_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    The names are listed in the same order as ``df.columns``.
    """
    return [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [7]:
get_categorical_features(X)

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [8]:
def get_uniques(df, columns):
    """Map each name in ``columns`` to the distinct values of that column in ``df``.

    Each dictionary value is whatever ``Series.unique`` returns for the column.
    """
    uniques = {}
    for name in columns:
        uniques[name] = df[name].unique()
    return uniques

In [9]:
get_uniques(X, X.columns)

{'age': array([58, 44, 33, 47, 35, 28, 42, 43, 41, 29, 53, 57, 51, 45, 60, 56, 32,
        25, 40, 39, 52, 46, 36, 49, 59, 37, 50, 54, 55, 48, 24, 38, 31, 30,
        27, 34, 23, 26, 61, 22, 21, 20, 66, 62, 83, 75, 67, 70, 65, 68, 64,
        69, 72, 71, 19, 76, 85, 63, 90, 82, 73, 74, 78, 80, 94, 79, 77, 86,
        95, 81, 18, 89, 84, 87, 92, 93, 88]),
 'job': array(['management', 'technician', 'entrepreneur', 'blue-collar',
        'unknown', 'retired', 'admin.', 'services', 'self-employed',
        'unemployed', 'housemaid', 'student'], dtype=object),
 'marital': array(['married', 'single', 'divorced'], dtype=object),
 'education': array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object),
 'default': array(['no', 'yes'], dtype=object),
 'balance': array([ 2143,    29,     2, ...,  8205, 14204, 16353]),
 'housing': array(['yes', 'no'], dtype=object),
 'loan': array(['no', 'yes'], dtype=object),
 'contact': array(['unknown', 'cellular', 'telephone'], dtype=object),
 'day'

In [10]:
# Treat the literal string 'unknown' as a missing value so that pandas'
# missing-data tools (isna, fillna, ...) recognise it.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
X = X.replace('unknown', np.nan)
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,


In [11]:
X.isna().sum()

age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
dtype: int64

In [12]:
# 'poutcome' is missing in 36,959 of the 45,211 rows, so the column is
# discarded instead of being imputed.
X = X.drop(columns='poutcome')
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0


In [13]:
get_uniques(X, X.columns)

{'age': array([58, 44, 33, 47, 35, 28, 42, 43, 41, 29, 53, 57, 51, 45, 60, 56, 32,
        25, 40, 39, 52, 46, 36, 49, 59, 37, 50, 54, 55, 48, 24, 38, 31, 30,
        27, 34, 23, 26, 61, 22, 21, 20, 66, 62, 83, 75, 67, 70, 65, 68, 64,
        69, 72, 71, 19, 76, 85, 63, 90, 82, 73, 74, 78, 80, 94, 79, 77, 86,
        95, 81, 18, 89, 84, 87, 92, 93, 88]),
 'job': array(['management', 'technician', 'entrepreneur', 'blue-collar', nan,
        'retired', 'admin.', 'services', 'self-employed', 'unemployed',
        'housemaid', 'student'], dtype=object),
 'marital': array(['married', 'single', 'divorced'], dtype=object),
 'education': array(['tertiary', 'secondary', nan, 'primary'], dtype=object),
 'default': array(['no', 'yes'], dtype=object),
 'balance': array([ 2143,    29,     2, ...,  8205, 14204, 16353]),
 'housing': array(['yes', 'no'], dtype=object),
 'loan': array(['no', 'yes'], dtype=object),
 'contact': array([nan, 'cellular', 'telephone'], dtype=object),
 'day': array([ 5,  6,  

In [14]:
# Group the remaining categorical columns by the encoding each will receive.

# Two-valued yes/no columns, each turned into a single 0/1 indicator.
binary_features = ['default', 'loan', 'housing']

# Columns encoded as integer ranks according to an explicit ordering.
ordinal_features = ['education', 'month']

# Unordered categories, expanded into one-hot (dummy) columns.
nominal_features = ['job', 'marital', 'contact']

In [15]:
def binary_encode(df, columns, positive_label):
    """Return a copy of ``df`` with each of ``columns`` recoded as 0/1 integers.

    A cell becomes 1 when it equals ``positive_label`` and 0 otherwise, so
    missing values are encoded as 0. The input frame is left unmodified.
    """
    encoded = df.copy()
    for name in columns:
        is_positive = encoded[name] == positive_label
        encoded[name] = is_positive.astype('int64')
    return encoded

In [16]:
X = binary_encode(X, binary_features, 'yes')
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous
0,58,management,married,tertiary,0,2143,1,0,,5,may,261,1,-1,0
1,44,technician,single,secondary,0,29,1,0,,5,may,151,1,-1,0
2,33,entrepreneur,married,secondary,0,2,1,1,,5,may,76,1,-1,0
3,47,blue-collar,married,,0,1506,1,0,,5,may,92,1,-1,0
4,33,,single,,0,1,0,0,,5,may,198,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,0,825,0,0,cellular,17,nov,977,3,-1,0
45207,71,retired,divorced,primary,0,1729,0,0,cellular,17,nov,456,2,-1,0
45208,72,retired,married,secondary,0,5715,0,0,cellular,17,nov,1127,5,184,3
45209,57,blue-collar,married,secondary,0,668,0,0,telephone,17,nov,508,4,-1,0


In [17]:
# Category ranks for the ordinal columns: a value is encoded as its 0-based
# position in the corresponding list.
education_ordering = ['primary', 'secondary', 'tertiary']
month_ordering = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

In [18]:
def ordinal_encode(df, columns, orderings):
    """Return a copy of ``df`` with each of ``columns`` replaced by integer ranks.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : sequence of str
        Names of the columns to encode.
    orderings : sequence of sequences
        One ordering per entry of ``columns`` (same order). A value is
        encoded as its 0-based position in the matching ordering.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the encoded columns. Missing values (NaN, None,
        pd.NA) stay missing, so a column containing them becomes float.

    Raises
    ------
    ValueError
        If ``columns`` and ``orderings`` differ in length, or if a column
        holds a non-missing value that is absent from its ordering.
    """
    columns = list(columns)
    orderings = list(orderings)
    if len(columns) != len(orderings):
        raise ValueError(
            f"expected one ordering per column, got {len(columns)} columns "
            f"and {len(orderings)} orderings"
        )

    df = df.copy()
    for column, ordering in zip(columns, orderings):
        # Build the lookup once per column instead of scanning the list for
        # every row; setdefault keeps the first position of a repeated entry,
        # matching list.index semantics.
        positions = {}
        for position, category in enumerate(ordering):
            positions.setdefault(category, position)

        values = df[column]
        # isna() recognises every missing-value marker, not only float NaN.
        unexpected = ~values.isna() & ~values.isin(list(positions))
        if unexpected.any():
            raise ValueError(
                f"column {column!r} contains values not in its ordering: "
                f"{values[unexpected].unique().tolist()}"
            )
        # Missing entries have no key in the lookup and therefore map to NaN.
        df[column] = values.map(positions)
    return df

In [19]:
# One ordering per ordinal column, listed in the same order as ordinal_features
# (education first, then month), because ordinal_encode pairs them positionally.
orderings = [education_ordering, month_ordering]

X = ordinal_encode(X, ordinal_features, orderings)

In [20]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous
0,58,management,married,2.0,0,2143,1,0,,5,4,261,1,-1,0
1,44,technician,single,1.0,0,29,1,0,,5,4,151,1,-1,0
2,33,entrepreneur,married,1.0,0,2,1,1,,5,4,76,1,-1,0
3,47,blue-collar,married,,0,1506,1,0,,5,4,92,1,-1,0
4,33,,single,,0,1,0,0,,5,4,198,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,2.0,0,825,0,0,cellular,17,10,977,3,-1,0
45207,71,retired,divorced,0.0,0,1729,0,0,cellular,17,10,456,2,-1,0
45208,72,retired,married,1.0,0,5715,0,0,cellular,17,10,1127,5,184,3
45209,57,blue-collar,married,1.0,0,668,0,0,telephone,17,10,508,4,-1,0


In [21]:
def onehot_encode(df, columns):
    """Replace each of ``columns`` in ``df`` with integer one-hot indicator columns.

    Indicator columns are named ``<column>_<value>`` and are appended after
    the columns already present, one group per encoded column in the order
    given by ``columns``. Missing values get no indicator of their own, so
    such rows are all zeros within that group.
    """
    result = df
    for name in columns:
        indicators = pd.get_dummies(result[name], prefix=name, dtype=int)
        result = pd.concat([result.drop(columns=name), indicators], axis=1)
    return result

In [22]:
X = onehot_encode(X, nominal_features)
X

Unnamed: 0,age,education,default,balance,housing,loan,day,month,duration,campaign,...,job_self-employed,job_services,job_student,job_technician,job_unemployed,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone
0,58,2.0,0,2143,1,0,5,4,261,1,...,0,0,0,0,0,0,1,0,0,0
1,44,1.0,0,29,1,0,5,4,151,1,...,0,0,0,1,0,0,0,1,0,0
2,33,1.0,0,2,1,1,5,4,76,1,...,0,0,0,0,0,0,1,0,0,0
3,47,,0,1506,1,0,5,4,92,1,...,0,0,0,0,0,0,1,0,0,0
4,33,,0,1,0,0,5,4,198,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,2.0,0,825,0,0,17,10,977,3,...,0,0,0,1,0,0,1,0,1,0
45207,71,0.0,0,1729,0,0,17,10,456,2,...,0,0,0,0,0,1,0,0,1,0
45208,72,1.0,0,5715,0,0,17,10,1127,5,...,0,0,0,0,0,0,1,0,1,0
45209,57,1.0,0,668,0,0,17,10,508,4,...,0,0,0,0,0,0,1,0,0,1


In [23]:
X.isna().sum()

age                     0
education            1857
default                 0
balance                 0
housing                 0
loan                    0
day                     0
month                   0
duration                0
campaign                0
pdays                   0
previous                0
job_admin.              0
job_blue-collar         0
job_entrepreneur        0
job_housemaid           0
job_management          0
job_retired             0
job_self-employed       0
job_services            0
job_student             0
job_technician          0
job_unemployed          0
marital_divorced        0
marital_married         0
marital_single          0
contact_cellular        0
contact_telephone       0
dtype: int64

In [24]:
# Fill the missing education ranks with the column median.
# NOTE(review): the median is computed over all rows, before the train/test
# split further down, so test rows influence the imputed value. Consider
# deriving it from the training split only — TODO confirm whether this matters here.
X['education'] = X['education'].fillna(X['education'].median())

In [25]:
X.isna().sum().sum()

0

In [26]:
y

0         no
1         no
2         no
3         no
4         no
        ... 
45206    yes
45207    yes
45208    yes
45209     no
45210     no
Name: y, Length: 45211, dtype: object

In [27]:
# Encode the target labels as integers.
label_encoder = LabelEncoder()

# NOTE: y is overwritten here, so this cell is meant to be run once per kernel session.
y = label_encoder.fit_transform(y)

# classes_ lists the original labels; a label's position is its integer code.
label_encoder.classes_

array(['no', 'yes'], dtype=object)

In [28]:
# Train Test Split
# 70% of the rows go to training; random_state fixes the shuffle so the split
# is reproducible.
# NOTE(review): the split is not stratified on y. If the classes are
# imbalanced, consider passing stratify=y — TODO confirm the class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

In [29]:
X_train.shape, X_test.shape

((31647, 28), (13564, 28))

In [30]:
# Scale X
# The scaler is fitted on the training split only and then applied to both
# splits; the results are wrapped back into DataFrames so the row index and
# the column names are preserved.
scaler = StandardScaler().fit(X_train)

X_train = pd.DataFrame(
    data=scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    data=scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [31]:
X_train

Unnamed: 0,age,education,default,balance,housing,loan,day,month,duration,campaign,...,job_self-employed,job_services,job_student,job_technician,job_unemployed,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone
6149,-0.841435,-0.215155,7.351072,-0.513906,0.892293,-0.435537,1.347581,-0.476213,0.654165,-0.563213,...,-0.187879,-0.319662,-0.144354,-0.447443,-0.172356,-0.359110,0.815594,-0.631721,-1.359412,-0.264482
12403,-0.652995,-0.215155,-0.136035,-0.590185,0.892293,2.296018,1.347581,-0.061419,-0.572683,0.391184,...,-0.187879,-0.319662,-0.144354,-0.447443,-0.172356,-0.359110,-1.226100,1.582977,-1.359412,-0.264482
21645,-0.841435,-0.215155,-0.136035,-0.353719,0.892293,-0.435537,0.385408,0.768170,-0.351385,-0.245081,...,-0.187879,-0.319662,-0.144354,2.234924,-0.172356,-0.359110,0.815594,-0.631721,0.735612,-0.264482
29580,-0.464554,-0.215155,7.351072,-0.434130,-1.120709,2.296018,-1.538940,-1.720596,-0.421268,0.391184,...,-0.187879,-0.319662,-0.144354,-0.447443,-0.172356,-0.359110,0.815594,-0.631721,0.735612,-0.264482
31245,-1.689418,-0.215155,-0.136035,-0.283797,-1.120709,-0.435537,-1.538940,-1.305802,-0.665862,-0.563213,...,-0.187879,-0.319662,6.927421,-0.447443,-0.172356,-0.359110,-1.226100,1.582977,0.735612,-0.264482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43723,0.854529,1.319789,-0.136035,0.143366,-1.120709,-0.435537,-0.215951,-0.476213,0.230980,0.073052,...,-0.187879,-0.319662,-0.144354,2.234924,-0.172356,-0.359110,0.815594,-0.631721,0.735612,-0.264482
32511,-0.652995,1.319789,-0.136035,-0.270448,0.892293,-0.435537,0.144864,-0.891007,-0.099027,-0.245081,...,-0.187879,-0.319662,-0.144354,-0.447443,-0.172356,2.784665,-1.226100,-0.631721,0.735612,-0.264482
5192,0.666089,1.319789,-0.136035,0.893761,-1.120709,-0.435537,0.625951,-0.476213,0.405689,1.027448,...,-0.187879,-0.319662,-0.144354,-0.447443,-0.172356,-0.359110,0.815594,-0.631721,-1.359412,-0.264482
12172,0.383428,-0.215155,-0.136035,0.128428,-1.120709,-0.435537,0.505679,-0.061419,-0.960926,2.618110,...,-0.187879,-0.319662,-0.144354,-0.447443,-0.172356,-0.359110,0.815594,-0.631721,-1.359412,-0.264482


In [33]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

### Training

In [34]:
# Logistic regression with scikit-learn's default hyperparameters, fitted on
# the scaled training split.
model = LogisticRegression()

model.fit(X_train, y_train)

In [35]:
# For a classifier, score() reports the mean accuracy on the given data; here
# it is evaluated on the held-out test split.
# NOTE(review): accuracy alone can look good on imbalanced classes. Consider
# comparing against a majority-class baseline or reporting precision/recall —
# TODO confirm the class balance.
model_acc = model.score(X_test, y_test)
print("Model Accuracy: ", model_acc)

Model Accuracy:  0.8899292244175759
