In [126]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

import os
import pandas_profiling

import scipy.stats as ss
from collections import Counter
import math

import warnings
warnings.filterwarnings('ignore')

In [101]:
# Load the full cleaned CSV data file from the interim data folder
bank_data = pd.read_csv('../data/interim/bank_data_clean.csv')


In [102]:
# Preview the first rows to sanity-check the load
bank_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,subscribed
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,other,no,no,telephone,may,mon,...,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [103]:
# Rename the target column to 'response'. Reassigning the result instead of
# using inplace=True keeps the data flow explicit, as pandas recommends.
bank_data = bank_data.rename(columns={'subscribed': 'response'})

In [104]:
# List the rows that exactly repeat an earlier row (first occurrences are not flagged)
bank_data[bank_data.duplicated(keep='first')]

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,response
1266,39,blue-collar,married,basic.6y,no,no,no,telephone,may,thu,...,1,0,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
12261,36,retired,married,other,no,no,no,telephone,jul,thu,...,1,0,0,nonexistent,1.4,93.918,-42.7,4.966,5228.1,no
14234,27,technician,single,professional.course,no,no,no,cellular,jul,mon,...,2,0,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no
16956,47,technician,divorced,high.school,no,yes,no,cellular,jul,thu,...,3,0,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no
18465,32,technician,single,professional.course,no,yes,no,cellular,jul,thu,...,1,0,0,nonexistent,1.4,93.918,-42.7,4.968,5228.1,no
20216,55,services,married,high.school,other,no,no,cellular,aug,mon,...,1,0,0,nonexistent,1.4,93.444,-36.1,4.965,5228.1,no
20534,41,technician,married,professional.course,no,yes,no,cellular,aug,tue,...,1,0,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1,no
25217,39,admin.,married,university.degree,no,no,no,cellular,nov,tue,...,2,0,0,nonexistent,-0.1,93.2,-42.0,4.153,5195.8,no
28477,24,services,single,high.school,no,yes,no,cellular,apr,tue,...,1,0,0,nonexistent,-1.8,93.075,-47.1,1.423,5099.1,no
32516,35,admin.,married,university.degree,no,yes,no,cellular,may,fri,...,4,0,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no


In [105]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# Reassignment replaces inplace=True so the cell does not mutate the frame in place;
# the original row labels are kept (the index is not reset).
bank_data = bank_data.drop_duplicates()

In [106]:
# Collect the names of the object-typed (categorical) columns
cat_cols = bank_data.select_dtypes(include='object').columns

In [107]:
# Report the number of distinct values, and the sorted values themselves, per categorical column
for col_name in cat_cols:
    distinct = np.unique(bank_data[col_name])
    summary = '{}: {} distinct values -  {}'.format(col_name, distinct.size, distinct)
    print(summary)

job: 12 distinct values -  ['admin.' 'blue-collar' 'entrepreneur' 'housemaid' 'management' 'other'
 'retired' 'self-employed' 'services' 'student' 'technician' 'unemployed']
marital: 4 distinct values -  ['divorced' 'married' 'other' 'single']
education: 8 distinct values -  ['basic.4y' 'basic.6y' 'basic.9y' 'high.school' 'illiterate' 'other'
 'professional.course' 'university.degree']
default: 3 distinct values -  ['no' 'other' 'yes']
housing: 3 distinct values -  ['no' 'other' 'yes']
loan: 3 distinct values -  ['no' 'other' 'yes']
contact: 2 distinct values -  ['cellular' 'telephone']
month: 10 distinct values -  ['apr' 'aug' 'dec' 'jul' 'jun' 'mar' 'may' 'nov' 'oct' 'sep']
day_of_week: 5 distinct values -  ['fri' 'mon' 'thu' 'tue' 'wed']
poutcome: 3 distinct values -  ['failure' 'nonexistent' 'success']
response: 2 distinct values -  ['no' 'yes']


In [108]:
def categories_counts(feature, df=None):
    '''Calculates the count and percentage of each value of a categorical variable.

    Parameters
    ----------
    feature : str
        Name of the column to summarise.
    df : pandas.DataFrame, optional
        Frame holding the column. When omitted, the notebook-level ``bank_data``
        is looked up at call time (not at definition time), so the function
        always sees the current frame.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, most frequent first, with the columns
        ``feature`` (the value), ``count`` and ``%``. The percentage is relative
        to the total number of rows in ``df`` and rounded to two decimals.
    '''
    if df is None:
        df = bank_data

    # Compute the frequencies once and derive the percentage from them
    freq = df[feature].value_counts()
    counts = freq.to_frame(name='count')
    counts['%'] = (100 * freq / df.shape[0]).round(2)

    # Name the index explicitly so reset_index() yields a column called `feature`
    # regardless of how the installed pandas version names value_counts() output
    return counts.rename_axis(feature).reset_index()

In [109]:
# Builds one multi-indexed summary table from the per-feature count tables
def cat_aggergator(df, features):
    '''Stack the value-count tables of several categorical columns into one data frame.

    For every name in ``features`` the count/percentage table produced by
    ``categories_counts`` is tagged with the feature name (column ``idx``) and its
    value column is renamed to ``categories``. The tables are concatenated and
    indexed by (``idx``, ``categories``), sorted by that index.
    '''
    per_feature = [
        categories_counts(name, df)
        .assign(idx=name)
        .rename(columns={name: 'categories'})
        for name in features
    ]
    stacked = pd.concat(per_feature)
    return stacked.set_index(['idx', 'categories']).sort_index()


In [111]:
# Summarise every categorical column (count and % per category) in one table
all_cat_aggregated = cat_aggergator(bank_data, cat_cols)
print('The aggregated categories count for the enire data set is: \n')
# Bare last expression so the notebook renders the table
all_cat_aggregated

The aggregated categories count for the enire data set is: 



Unnamed: 0_level_0,Unnamed: 1_level_0,count,%
idx,categories,Unnamed: 2_level_1,Unnamed: 3_level_1
contact,cellular,26135,63.47
contact,telephone,15041,36.53
day_of_week,fri,7826,19.01
day_of_week,mon,8512,20.67
day_of_week,thu,8618,20.93
day_of_week,tue,8086,19.64
day_of_week,wed,8134,19.75
default,no,32577,79.12
default,other,8596,20.88
default,yes,3,0.01


## Using Label Encoder to Transform the Response Variable to Numeric (Binary)

The values of the target variable **response** should be changed from [yes, no] to [1, 0]. Scikit-learn's `LabelEncoder` can do the job.

In [114]:
# Work on a copy so the encoding steps below leave bank_data untouched
df = bank_data.copy()

In [128]:
# Bring in scikit-learn's LabelEncoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Fit on the original string labels and store the resulting integer codes on the copy
df['response'] = le.fit_transform(bank_data['response'])

print('The new data type fo the response varuable is:', df['response'].dtypes)

The new data type fo the response varuable is: int64


In [148]:
# Class balance of the encoded target. Uses `df`, the frame encoded just above;
# the previous name `df_ecoded` was misspelled and not defined at this point.
df['response'].value_counts()

0    36537
1     4639
Name: response, dtype: int64

In [129]:
# Update the list of the categorical columns: recompute it from df, where
# response is now numeric and therefore no longer selected
cat_cols = df.select_dtypes(include=[object]).columns
cat_cols

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')

## Use One-Hot Encoding to Transform the Remaining Categorical Variables

Here we use the implementation of OneHotEncoder in the [**Category Encoders**](https://contrib.scikit-learn.org/category_encoders/) package.

In [135]:
from category_encoders import OneHotEncoder

In [136]:
# Configure one-hot encoding for the remaining categorical columns.
# use_cat_names=True names each new column <feature>_<category> (e.g. job_housemaid).
# NOTE(review): handle_unknown='return_nan' presumably yields NaN for categories not seen during fit — confirm in the category_encoders docs
encoder = OneHotEncoder(cols=cat_cols ,handle_unknown='return_nan',return_df=True,use_cat_names=True)

In [137]:
# Display the configured encoder, including parameters left at their defaults
encoder

OneHotEncoder(cols=Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object'),
              drop_invariant=False, handle_missing='value',
              handle_unknown='return_nan', return_df=True, use_cat_names=True,
              verbose=0)

In [149]:
# Fit the encoder on df and transform it in one step; columns not listed in
# `cols` (e.g. age, response) are carried over unchanged
df_encoded = encoder.fit_transform(df)

In [151]:
# Read the names from the encoded frame itself rather than encoder.get_feature_names(),
# which newer category_encoders releases deprecate in favour of get_feature_names_out();
# the column list is the same and works with any version.
print('The new features are {}:\n\n {}'.format(len(df_encoded.columns),  df_encoded.columns.tolist()))

The new features are 64:

 ['age', 'job_housemaid', 'job_services', 'job_admin.', 'job_blue-collar', 'job_technician', 'job_retired', 'job_management', 'job_unemployed', 'job_self-employed', 'job_other', 'job_entrepreneur', 'job_student', 'marital_married', 'marital_single', 'marital_divorced', 'marital_other', 'education_basic.4y', 'education_high.school', 'education_basic.6y', 'education_basic.9y', 'education_professional.course', 'education_other', 'education_university.degree', 'education_illiterate', 'default_no', 'default_other', 'default_yes', 'housing_no', 'housing_yes', 'housing_other', 'loan_no', 'loan_yes', 'loan_other', 'contact_telephone', 'contact_cellular', 'month_may', 'month_jun', 'month_jul', 'month_aug', 'month_oct', 'month_nov', 'month_dec', 'month_mar', 'month_apr', 'month_sep', 'day_of_week_mon', 'day_of_week_tue', 'day_of_week_wed', 'day_of_week_thu', 'day_of_week_fri', 'duration', 'campaign', 'pdays', 'previous', 'poutcome_nonexistent', 'poutcome_failure', 'pout

## Prepare the Data for Modeling: Train_Test_Split

In [170]:
# Get the name of dataframe
def get_df_name(df, namespace=None):
    '''Function to get the name of DataFrame.

    Looks for the first public variable in ``namespace`` that is bound to the
    very same object as ``df`` (identity, not equality).

    Parameters
    ----------
    df : object
        The object (typically a DataFrame or Series) whose variable name is wanted.
    namespace : dict, optional
        Mapping of names to objects to search. Defaults to the module/notebook
        globals, which is what the function always searched before.

    Returns
    -------
    str
        The first matching variable name, in namespace insertion order.

    Raises
    ------
    IndexError
        If no public variable refers to ``df`` (same exception type as before,
        now with an explanatory message).
    '''
    if namespace is None:
        namespace = globals()

    # Iterate over a snapshot so the scan is independent of later namespace changes
    for name, value in list(namespace.items()):
        # Skip underscore names: IPython's output cache (`_`, `__`, `_12`, ...) can
        # alias the same object and would otherwise be reported as "the" name
        if value is df and not name.startswith('_'):
            return name

    raise IndexError('no public variable in the namespace refers to the given object')

In [169]:
# Separate the target from the predictors
y = df_encoded.response
X = df_encoded.drop('response', axis=1)

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the target is imbalanced (36537 vs 4639); consider stratify=y — left unchanged here
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

data_sets =[X_train, X_test, y_train, y_test]
# Label each split explicitly instead of recovering the name through a globals()
# identity lookup, which depends on whatever else the kernel namespace holds
data_set_names = ['X_train', 'X_test', 'y_train', 'y_test']

for name, d in zip(data_set_names, data_sets):
    print(f'The shape of {name}: {d.shape} {round(d.shape[0]/df_encoded.shape[0] *100, 0)}% \n')

The shape of X_train: (32940, 63) 80.0% 

The shape of X_test: (8236, 63) 20.0% 

The shape of y_train: (32940,) 80.0% 

The shape of y_test: (8236,) 20.0% 

