In [2]:
# Notebook setup: enable the autoreload extension in mode 2 (re-import edited
# modules before each cell executes) and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
# library imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [34]:
# scikit imports
from sklearn.preprocessing import Imputer

In [4]:
# Load the raw bank marketing data; the file uses ';' as field separator.
bank_data_set = pd.read_csv(
    "dataset/bank-additional.csv",
    sep=";",
)

In [6]:
# Dimensions of the raw frame as (rows, columns).
bank_data_set.shape

(4119, 21)

In [7]:
# Column labels of the raw frame.
bank_data_set.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [8]:
# Dtype pandas inferred for each column when reading the CSV.
bank_data_set.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [9]:
# Summary statistics for every column; include='all' also covers the
# object-typed (string) columns, not just the numeric ones.
bank_data_set.describe(include='all')

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
count,4119.0,4119,4119,4119,4119,4119,4119,4119,4119,4119,...,4119.0,4119.0,4119.0,4119,4119.0,4119.0,4119.0,4119.0,4119.0,4119
unique,,12,4,8,3,3,3,2,10,5,...,,,,3,,,,,,2
top,,admin.,married,university.degree,no,yes,no,cellular,may,thu,...,,,,nonexistent,,,,,,no
freq,,1012,2509,1264,3315,2175,3349,2652,1378,860,...,,,,3523,,,,,,3668
mean,40.11362,,,,,,,,,,...,2.537266,960.42219,0.190337,,0.084972,93.579704,-40.499102,3.621356,5166.481695,
std,10.313362,,,,,,,,,,...,2.568159,191.922786,0.541788,,1.563114,0.579349,4.594578,1.733591,73.667904,
min,18.0,,,,,,,,,,...,1.0,0.0,0.0,,-3.4,92.201,-50.8,0.635,4963.6,
25%,32.0,,,,,,,,,,...,1.0,999.0,0.0,,-1.8,93.075,-42.7,1.334,5099.1,
50%,38.0,,,,,,,,,,...,2.0,999.0,0.0,,1.1,93.749,-41.8,4.857,5191.0,
75%,47.0,,,,,,,,,,...,3.0,999.0,0.0,,1.4,93.994,-36.4,4.961,5228.1,


In [24]:
# pdays == 999 marks clients who were not previously contacted.
# More than 95% of the rows carry that marker, so this feature should
# probably be dropped or turned into a binary flag.
bank_data_set["pdays"].eq(999).mean()

0.9611556202961884

In [27]:
# Fraction of rows whose `previous` value is 0.
bank_data_set["previous"].eq(0).mean()

0.85530468560330175

In [26]:
# Fraction of rows whose `poutcome` value is "nonexistent".
bank_data_set["poutcome"].eq("nonexistent").mean()

0.85530468560330175

# Cleaning the dataset 

In [32]:
# String-valued (object dtype) predictor columns.
nominal_features = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
]

# Numeric predictor columns.
# NOTE(review): "duration" and "cons.conf.idx" are numeric columns of the
# frame as well but are not listed here -- confirm the omission is intended.
numeric_features = [
    "age",
    "campaign",
    "pdays",
    "previous",
    "emp.var.rate",
    "cons.price.idx",
    "euribor3m",
    "nr.employed",
]

In [33]:
# For each nominal column, print how many rows hold the "unknown"
# placeholder and what share of the whole frame that represents.
for feature in nominal_features:
    is_unknown = bank_data_set[feature].eq("unknown")
    nr_missing = is_unknown.sum()
    ratio = nr_missing / len(bank_data_set)
    print(feature, nr_missing, ratio)

job 39 0.0094683175528
marital 11 0.00267055110464
education 167 0.0405438213159
default 803 0.194950230639
housing 105 0.0254916241806
loan 105 0.0254916241806
contact 0 0.0
month 0 0.0
day_of_week 0 0.0
poutcome 0 0.0


In [85]:
# Keep only the rows where none of these columns holds the "unknown"
# placeholder, then renumber the index from 0. `default` is deliberately
# not part of this filter.
required_known = ["job", "marital", "education", "housing", "loan"]
has_no_unknown = bank_data_set[required_known].ne("unknown").all(axis=1)
bank_df = bank_data_set.loc[has_no_unknown].copy().reset_index(drop=True)

In [96]:
# Impute the "unknown" entries of `default` with the most frequent value.
# The mode is taken over the known values only, so the "unknown" placeholder
# can never be selected as its own replacement (which would silently leave
# the column un-imputed).
known_default = bank_df.loc[bank_df["default"] != "unknown", "default"]
mode = known_default.mode()
# Series.mode() can return several tied values; use the first one. If the
# column has no known value at all, fall back to leaving it unchanged.
actual_mode = mode[0] if not mode.empty else "unknown"
bank_df["default"] = bank_df["default"].replace("unknown", actual_mode)

In [97]:
# Summary statistics for every column of the cleaned frame; include="all"
# also covers the object-typed (string) columns.
bank_df.describe(include="all")

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
count,3811.0,3811,3811,3811,3811,3811,3811,3811,3811,3811,...,3811.0,3811.0,3811.0,3811,3811.0,3811.0,3811.0,3811.0,3811.0,3811
unique,,11,3,7,2,2,2,2,10,5,...,,,,3,,,,,,2
top,,admin.,married,university.degree,no,yes,no,cellular,may,mon,...,,,,nonexistent,,,,,,no
freq,,970,2330,1239,3810,2072,3177,2470,1275,801,...,,,,3268,,,,,,3398
mean,39.961952,,,,,,,,,,...,2.539491,961.467331,0.184204,,0.088113,93.575567,-40.560798,3.626035,5167.004749,
std,10.215878,,,,,,,,,,...,2.590149,189.42784,0.523136,,1.55716,0.578653,4.602293,1.727428,73.160799,
min,19.0,,,,,,,,,,...,1.0,0.0,0.0,,-3.4,92.201,-50.8,0.635,4963.6,
25%,32.0,,,,,,,,,,...,1.0,999.0,0.0,,-1.8,93.075,-42.7,1.334,5099.1,
50%,38.0,,,,,,,,,,...,2.0,999.0,0.0,,1.1,93.444,-41.8,4.857,5191.0,
75%,47.0,,,,,,,,,,...,3.0,999.0,0.0,,1.4,93.994,-36.4,4.961,5228.1,
