# Bank Marketing Classification

## Data Collection/Preprocessing

Carolyn
- Load data
- Feature analysis (check for imbalanced data, heatmap feature correlation, etc.)

Jonathan
- Feature engineering (feature scaling, normalization, standardization, one-hot encoding of features/targets, checking for missing values)
- Train test validation split

In [1]:
import sklearn
import tensorflow
import pandas as pd
import numpy as np
import matplotlib as plt

# !pip install ucimlrepo
from ucimlrepo import fetch_ucirepo 

In [2]:
# Download the UCI "Bank Marketing" dataset (repository id 222).
# Requires the `ucimlrepo` package (pip install ucimlrepo).
bank_marketing = fetch_ucirepo(id=222)

# Feature and target tables exposed by the fetched dataset object.
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Show the dataset-level metadata, then the per-variable description.
print(bank_marketing.metadata)
print(bank_marketing.variables)

{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'title': 'A data-driven approach to predict the success of bank telemarketing'

In [3]:
# Random seed passed as `random_state` to the train/validation/test splits
# further down, so the splits are reproducible across runs.
seed = 42

In [4]:
# Preprocessing: clean, bin, encode and split the bank-marketing data.
#
# Work from the fetched tables rather than from the `X` / `y` names this cell
# rebinds, so the cell gives the same result every time it is re-run.
features_raw = bank_marketing.data.features
targets_raw = bank_marketing.data.targets

# Drop duplicated feature rows (none in this dataset). The same row mask is
# applied to the targets so X and y can never end up with different lengths.
keep_rows = (~features_raw.duplicated()).to_numpy()
X = features_raw.loc[keep_rows].copy()
y = targets_raw.loc[keep_rows]

# Report every column that contains missing values.
for col in X.columns:
    if X[col].isna().any():
        print(f'NA for {col}')
        print(X[col].value_counts(dropna=False))

# Missing-value handling:
#   1) job       (288 NaN)  -> imputed with the most frequent value, 'blue-collar'
#   2) education (1857 NaN) -> imputed with the most frequent value, 'secondary'
#   3) contact   NaN        -> new category 'other'
#   4) poutcome  NaN and 'other' -> merged into 'unknown'
X['job'] = X['job'].fillna('blue-collar')
X['education'] = X['education'].fillna('secondary')
X['contact'] = X['contact'].fillna('other')
X['poutcome'] = X['poutcome'].fillna('unknown')
X['poutcome'] = X['poutcome'].mask(X['poutcome'] == 'other', 'unknown')
X.info()
print(X['job'].value_counts(dropna=False))

# Binarize target: 'yes' -> 1.0, everything else -> 0.0.
# Result is a float64 array of shape (n_samples, 1).
y = np.where(y == 'yes', 1.0, 0.0)


def _bin_feature(values, edges, include_lowest=False):
    """Cut a numeric column into right-closed intervals labelled 0, 1, 2, ...

    Parameters
    ----------
    values : pandas.Series
        Numeric column to discretise.
    edges : list of float
        Monotonically increasing bin edges; produces ``len(edges) - 1`` bins.
    include_lowest : bool, default False
        If True the first interval also contains its left edge.

    Returns
    -------
    pandas.Series
        Categorical series of integer bin labels; values outside the edges
        become NaN.
    """
    bin_labels = list(range(len(edges) - 1))
    return pd.cut(values, bins=edges, labels=bin_labels, include_lowest=include_lowest)


# The *_b2l dictionaries map each integer bin label to a human-readable range.
# Bins are right-closed, e.g. edges [.., 0, 500, ..] give (.., 0] and (0, 500].

# pdays binning: -1 marks clients that were never previously contacted.
pdays_b2l = {0: 'never prev contacted', 1: '1-90 days', 2: '91-180 days', 3: '181-365 days', 4: '> 365 days'}
X['pdays_bin'] = _bin_feature(X['pdays'], [float('-inf'), -1, 90, 180, 365, float('inf')])

# age binning: [18, 30], (30, 40], (40, 50], (50, 60], (60, inf)
age_b2l = {0: '18-30', 1: '31-40', 2: '41-50', 3: '51-60', 4: '61+'}
X['age_bin'] = _bin_feature(X['age'], [18, 30, 40, 50, 60, float('inf')], include_lowest=True)

# balance binning: the first bin is (-inf, 0], so a balance of exactly 0 falls
# in bin 0. Labels corrected to describe the intervals actually produced.
bal_b2l = {0: '<= 0', 1: '1-500', 2: '501-1000', 3: '1001-5000', 4: '5001-10000', 5: '10001+'}
X['bal_bin'] = _bin_feature(X['balance'], [float('-inf'), 0, 500, 1000, 5000, 10000, float('inf')])

# campaign binning: the first bin is [1, 2], so 2 contacts fall in bin 0.
# Labels corrected to describe the intervals actually produced.
camp_b2l = {0: '1-2', 1: '3-5', 2: '6-10', 3: '11+'}
X['camp_bin'] = _bin_feature(X['campaign'], [1, 2, 5, 10, float('inf')], include_lowest=True)

# previous binning: (-inf, 0], (0, 1], (1, 5], (5, inf)
prev_b2l = {0: '0', 1: '1', 2: '2-5', 3: '6+'}
X['prev_bin'] = _bin_feature(X['previous'], [float('-inf'), 0, 1, 5, float('inf')])

# Keep a snapshot that still has the raw numeric columns next to their bins.
X_archive = X.copy()

# Drop month, day_of_week and duration (judged not useful for meaningful
# prediction) together with the raw columns that were just replaced by bins.
X = X.drop(columns=['month', 'duration', 'day_of_week', 'age', 'campaign', 'balance', 'pdays', 'previous'])

# Recode 'yes'/'no' as 1/0 and make every remaining column categorical.
for col in X.columns:
    recoded = X[col].mask(X[col] == 'yes', 1)
    recoded = recoded.mask(recoded == 'no', 0)
    X[col] = recoded.astype('category')

# Check distribution and data types of each column
for col in X.columns:
    print(X[col].value_counts(dropna=False))
X.info()

# One-hot encode every column into a dense array.
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
X_ohe = ohe.fit_transform(X).toarray()

# Features and targets must still describe the same rows.
assert len(X_ohe) == len(y), "features and targets have different lengths"

# Train / validation / test split.
# 15% of the rows are held out for testing; 12.75% of the remaining 85%
# (about 10.8% of all rows) become the validation set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_ohe, y, test_size=0.15, random_state=seed)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1275, random_state=seed)

NA for job
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
NaN               288
Name: job, dtype: int64
NA for education
secondary    23202
tertiary     13301
primary       6851
NaN           1857
Name: education, dtype: int64
NA for contact
cellular     29285
NaN          13020
telephone     2906
Name: contact, dtype: int64
NA for poutcome
NaN        36959
failure     4901
other       1840
success     1511
Name: poutcome, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 45211 entries, 0 to 45210
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          45211 non-null  int64 
 1   job          45211 non-null  object
 2   marital      45211 non-null  object
 3   education    45211 non-null  object
 4   defau

In [5]:
# Class counts per split, printed in the order test, validation, train.
# In each printed pair the first number is the 'no' class (0) and the second
# is the 'yes' class (1). The classes are imbalanced; it would be worth
# looking into ways of mitigating that.
for split_labels in (y_test, y_val, y_train):
    _, counts = np.unique(split_labels, return_counts=True)
    print(counts)

[5970  812]
[4349  551]
[29603  3926]


## Baselines

Kevin
- Logistic regression

Megan

- Random guessing

In [6]:
import numpy as np
from sklearn.metrics import accuracy_score

# Random-guess baseline: draw each test prediction independently, predicting
# class 1 with the same frequency at which it occurs in the training labels.
# The generator is seeded with the notebook-wide `seed` so the reported
# accuracy is reproducible from run to run.
rng = np.random.default_rng(seed)
positive_rate = np.mean(y_train)
random_predictions = rng.choice([0, 1], size=len(y_test), p=[1 - positive_rate, positive_rate])

# Evaluate the performance of the random guessing model
accuracy = accuracy_score(y_test, random_predictions)
print("Accuracy of random guessing model:", accuracy)


Accuracy of random guessing model: 0.7900324388086111


In [7]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# Baseline that ignores the features: the 'stratified' strategy draws random
# predictions following the class distribution seen during fit.
# Seeded with the notebook-wide `seed` (instead of a separate hard-coded
# value) so every stochastic step follows the same setting.
dummy_classifier = DummyClassifier(strategy='stratified', random_state=seed)

# Fit on the training split; only the label distribution is learned.
dummy_classifier.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = dummy_classifier.predict(X_test)

# Evaluate the performance of the DummyClassifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of DummyClassifier (random guessing):", accuracy)

Accuracy of DummyClassifier (random guessing): 0.7959304040106163


## Training Model

Kevin
- Ensemble (random forest, XGBoost)

Megan
- Simple neural network

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Small feed-forward binary classifier over the one-hot encoded features:
# two hidden ReLU layers (64 and 32 units), each followed by 20% dropout,
# and a single sigmoid output unit.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Adam optimizer with binary cross-entropy loss; accuracy is tracked as a metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once the monitored quantity has not improved for 3 epochs in a row and
# roll the model back to the best weights seen.
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

# Train for at most 20 epochs, validating on the validation split each epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Final score on the held-out test split.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Train on 33529 samples, validate on 4900 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


Test Accuracy: 0.8925096


## Evaluating Model

Michelle
- Metric 1: F1 Score
- Metric 2: Area Under the Receiver Operating Characteristic Curve (ROC AUC)
- Also responsible for conclusion in report