In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import scipy
from scipy.stats import pearsonr
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Silence downcasting warning
pd.set_option('future.no_silent_downcasting', True)

In [2]:
raw_df = pd.read_csv('MarketingData/bank-full.csv', sep=';')
raw_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
base_df = raw_df.rename(columns={
    'default': 'has_defaulted',
    'pdays': 'days_since_last_contact',
    'previous': 'prev_contact_count',
    'poutcome': 'prev_outcome',
    'y': 'outcome'})

base_df.head()

Unnamed: 0,age,job,marital,education,has_defaulted,balance,housing,loan,contact,day,month,duration,campaign,days_since_last_contact,prev_contact_count,prev_outcome,outcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Splitting the data by type of column for ease of processing.

# Boolean columns
bool_cols = ['has_defaulted', 'housing', 'loan', 'outcome']
bool_df = base_df[bool_cols]

# Categorical columns including boolean
cat_cols = ['job', 'marital', 'education', 'contact', 'month', 'prev_outcome'] + bool_cols
cat_df = base_df[cat_cols]

# Numeric columns
num_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'days_since_last_contact', 'prev_contact_count']
num_df = base_df[num_cols]

# List of unique values in each categorical column.
cat_labels = {
    'job': ['unknown', 'unemployed', 'student', 'retired', 'self-employed', 'housemaid', 'technician', 'entrepreneur', 'blue-collar', 'admin.', 'services', 'management'],
    'marital': ['unknown', 'single', 'divorced', 'married'],
    'education': ['unknown', 'primary', 'secondary', 'tertiary'],
    'contact': ['unknown', 'cellular', 'telephone'],
    #TODO: Should month be numeric?
    #TODO: Should month/date even be included? it's unlikely to be relevant to
    # bank balance. We could actually strip out a lot of these featues without
    # losing much.
    'month': ['unknown', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
    'prev_outcome': ['unknown', 'failure', 'success', 'other']
}
for col in bool_cols:
    cat_labels[col] = ['no', 'yes']

# Map equivalent of cat_labels.
cat_map = {}
for col in cat_labels:
    cat_map[col] = {cat: i for i, cat in enumerate(cat_labels[col])}

In [5]:
# Replace -1's with N/A
for col in num_cols:
    # Balance may be negative.
    if col in ['balance']: continue
    # -1's mean that the data isn't available. In those cases it's common to
    # substitute the mean value for the column rather than having a hard-coded
    # value. This is needed for simpler models like linear regression to have
    # good accuracy as we want the presence of absence of each point to have as
    # little effect on the learnt model as possible.

    # We want to replace all -1's the the mean value. This is done in two steps:
    #   1. Replace -1's with N/A's.
    #   2. Replace the N/A's with the calculated means.
    # This can't be done in one step because we must remove the -1's from the
    # column to calculate the mean.
    num_df[col] = num_df[col].replace(-1, pd.NA)
    num_df[col] = num_df[col].fillna(base_df[col].mean())

num_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_df[col] = num_df[col].replace(-1, pd.NA)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_df[col] = num_df[col].fillna(base_df[col].mean())


Unnamed: 0,age,balance,day,duration,campaign,days_since_last_contact,prev_contact_count
0,58,2143,5,261,1,40.197828,0
1,44,29,5,151,1,40.197828,0
2,33,2,5,76,1,40.197828,0
3,47,1506,5,92,1,40.197828,0
4,33,1,5,198,1,40.197828,0


In [6]:
# ord_df is has all categorical columns as 'ordinal' numbers.
#TODO: This probably won't be used. Should ipd.set_option('future.no_silent_downcasting', True)t be removed?
#ord_df = base_df.copy()
#for col in cat_cols:
#    ord_df[col] = ord_df[col].map(cat_map[col])
#
#ord_df.head()

In [7]:
# hot_df is the same as base_df but with all categorical columns replaced with
# one-hot encodings as floats
hot_df = pd.get_dummies(cat_df, dtype=float)
hot_df.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,prev_outcome_success,prev_outcome_unknown,has_defaulted_no,has_defaulted_yes,housing_no,housing_yes,loan_no,loan_yes,outcome_no,outcome_yes
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [8]:
# Taking y as the balance column from the dataset because that's what we'll end up predicting.
# X_num is then just num_df (the numeric columns) but with that output column ('balance') removed.
# num_df is the dataframe of just the numeric columns from the dataset.
y = num_df['balance']
X_num = num_df.drop('balance', axis=1)
X_cat = pd.concat([X_num, cat_df], axis=1)
scaler = sklearn.preprocessing.StandardScaler()
Xs_num = pd.DataFrame(scaler.fit_transform(X_num), columns=X_num.columns)
X_hot = pd.concat([Xs_num, hot_df], axis=1)
X_hot.head()

Unnamed: 0,age,day,duration,campaign,days_since_last_contact,prev_contact_count,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,prev_outcome_success,prev_outcome_unknown,has_defaulted_no,has_defaulted_yes,housing_no,housing_yes,loan_no,loan_yes,outcome_no,outcome_yes
0,1.606965,-1.298476,0.011016,-0.569351,-0.388719,-0.25194,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,0.288529,-1.298476,-0.416127,-0.569351,-0.388719,-0.25194,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,-0.747384,-1.298476,-0.707361,-0.569351,-0.388719,-0.25194,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,0.571051,-1.298476,-0.645231,-0.569351,-0.388719,-0.25194,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,-0.747384,-1.298476,-0.23362,-0.569351,-0.388719,-0.25194,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
