# Task for Today
***  

## Marketing Effectiveness Prediction

Given data about subjects' responses to a bank's marketing campaign, let's try to predict whether a given subject will place a deposit or not.
We will use TensorFlow ANN to make our predictions

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

from sklearn.metrics import confusion_matrix, classification_report

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/bank-marketing-campaigns-dataset/bank-additional-full.csv


In [5]:
tf.random.set_seed(100)

In [6]:
data = pd.read_csv('/kaggle/input/bank-marketing-campaigns-dataset/bank-additional-full.csv', delimiter=';')

In [7]:
pd.set_option('max_columns', None)

In [8]:
data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,334,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,383,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,189,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,442,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

## Encoding Labels

In [10]:
data.y = data.y.apply(lambda x: 1 if x == 'yes' else 0)

In [11]:
data.y.sum()

4640

## Encoding Categorical, Sequential, Binary Features

In [12]:
data.select_dtypes('object')

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent
...,...,...,...,...,...,...,...,...,...,...
41183,retired,married,professional.course,no,yes,no,cellular,nov,fri,nonexistent
41184,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,nonexistent
41185,retired,married,university.degree,no,yes,no,cellular,nov,fri,nonexistent
41186,technician,married,professional.course,no,no,no,cellular,nov,fri,nonexistent


In [13]:
{column: len(data[column].unique()) for column in data.select_dtypes('object').columns}

{'job': 12,
 'marital': 4,
 'education': 8,
 'default': 3,
 'housing': 3,
 'loan': 3,
 'contact': 2,
 'month': 10,
 'day_of_week': 5,
 'poutcome': 3}

In [14]:
{column: list(data[column].unique()) for column in data.select_dtypes('object').columns}

{'job': ['housemaid',
  'services',
  'admin.',
  'blue-collar',
  'technician',
  'retired',
  'management',
  'unemployed',
  'self-employed',
  'unknown',
  'entrepreneur',
  'student'],
 'marital': ['married', 'single', 'divorced', 'unknown'],
 'education': ['basic.4y',
  'high.school',
  'basic.6y',
  'basic.9y',
  'professional.course',
  'unknown',
  'university.degree',
  'illiterate'],
 'default': ['no', 'unknown', 'yes'],
 'housing': ['no', 'yes', 'unknown'],
 'loan': ['no', 'yes', 'unknown'],
 'contact': ['telephone', 'cellular'],
 'month': ['may',
  'jun',
  'jul',
  'aug',
  'oct',
  'nov',
  'dec',
  'mar',
  'apr',
  'sep'],
 'day_of_week': ['mon', 'tue', 'wed', 'thu', 'fri'],
 'poutcome': ['nonexistent', 'failure', 'success']}

In [15]:
data.replace('unknown', np.nan, inplace=True)

In [16]:
# Encoding Helper Functions

def encode_onehot(df, columns, prefixes):
    df = df.copy()
    for column, prefix in zip(columns, prefixes):
        dumdums = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dumdums], axis=1)
        df = df.drop(column, axis=1)        
    return df

def encode_ordinal(df, columns, orderings):
    df = df.copy()
    for column, ordering in zip(columns, orderings):
        df[column] = df[column].apply(lambda x: ordering.index(x))
    return df

def encode_binary(df, columns, positive_values):
    df = df.copy()
    for column, positive_value in zip(columns, positive_values):
        df[column] = df[column].apply(lambda x: 1 if x == positive_value else x)
        df[column] = df[column].apply(lambda x: 0 if str(x) != 'nan' else x)
    return df

In [17]:
nominal_features = [
    'job',
    'marital',
    'education',
    'day_of_week',
    'poutcome'
]

ordinal_features = [
    'month'
]

binary_features = [
    'default',
    'housing',
    'loan',
    'contact'
]

In [18]:
prefixes = ['J', 'M', 'E', 'D', 'P']

orderings = [
    ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
]

positive_values = [
    'yes',    
    'yes',
    'yes',
    'cellular'
]

In [19]:
data1 = data.copy()
data1 = encode_onehot(data1, nominal_features, prefixes)
data1 = encode_ordinal(data1, ordinal_features, orderings)
data1 = encode_binary(data1, binary_features, positive_values)
data1

Unnamed: 0,age,default,housing,loan,contact,month,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,J_admin.,J_blue-collar,J_entrepreneur,J_housemaid,J_management,J_retired,J_self-employed,J_services,J_student,J_technician,J_unemployed,M_divorced,M_married,M_single,E_basic.4y,E_basic.6y,E_basic.9y,E_high.school,E_illiterate,E_professional.course,E_university.degree,D_fri,D_mon,D_thu,D_tue,D_wed,P_failure,P_nonexistent,P_success
0,56,0.0,0.0,0.0,0,4,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,57,,0.0,0.0,0,4,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
2,37,0.0,0.0,0.0,0,4,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
3,40,0.0,0.0,0.0,0,4,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
4,56,0.0,0.0,0.0,0,4,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,0.0,0.0,0.0,0,10,334,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0
41184,46,0.0,0.0,0.0,0,10,383,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0
41185,56,0.0,0.0,0.0,0,10,189,2,999,0,-1.1,94.767,-50.8,1.028,4963.6,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0
41186,44,0.0,0.0,0.0,0,10,442,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0


## Missing Values Sanity Check

In [20]:
for column in binary_features:
    data1[column]=data1[column].fillna(data1[column].mean())

In [21]:
print('Remaining Missing Values: ', data1.isna().sum().sum())

Remaining Missing Values:  0


## Split and Scale the Data

In [22]:
y = data1.y
X = data1.drop('y', axis=1)
X, y

(       age  default  housing  loan  contact  month  duration  campaign  pdays  \
 0       56      0.0      0.0   0.0        0      4       261         1    999   
 1       57      0.0      0.0   0.0        0      4       149         1    999   
 2       37      0.0      0.0   0.0        0      4       226         1    999   
 3       40      0.0      0.0   0.0        0      4       151         1    999   
 4       56      0.0      0.0   0.0        0      4       307         1    999   
 ...    ...      ...      ...   ...      ...    ...       ...       ...    ...   
 41183   73      0.0      0.0   0.0        0     10       334         1    999   
 41184   46      0.0      0.0   0.0        0     10       383         1    999   
 41185   56      0.0      0.0   0.0        0     10       189         2    999   
 41186   44      0.0      0.0   0.0        0     10       442         1    999   
 41187   74      0.0      0.0   0.0        0     10       239         3    999   
 
        previo

In [23]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7, random_state=123)

## Modeling / Training

In [25]:
print('Positive Examples: {}'.format(y.sum()))
print('Negative Examples: {}'.format(len(y) - y.sum()))

print("\nClass Distribution: {:.1f} / {:.1f}".format(y.mean() * 100, (1-y.mean())*100) )

Positive Examples: 4640
Negative Examples: 36548

Class Distribution: 11.3 / 88.7


In [26]:
X.shape

(41188, 44)

In [27]:
inputs = tf.keras.Input(shape=(X.shape[1]))
#Create 2 hidden layers - 64 activations each - relu activ. fucn.
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)

batch_size=32
epochs = 100

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size = batch_size,
    epochs = epochs,
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


## Results

In [28]:
model.evaluate(X_test, y_test)



[0.19203051924705505, 0.9079873561859131, 0.9334160089492798]

In [30]:
y_true = np.array(y_test)
y_pred = np.squeeze(np.array(model.predict(X_test) >= 0.9, dtype=np.int))

In [31]:
print("Confusion Matrix: \n ", confusion_matrix(y_true, y_pred))

Confusion Matrix: 
  [[10958     3]
 [ 1365    31]]


In [None]:
print("Classification Report: \n", classification_report(y_true, y_pred))