# Data Preprocessing

In [1]:
import csv
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing

### Bank Dataset

In [2]:
# Load the semicolon-delimited CSV with every column kept as a raw string, so the
# explicit numeric / categorical conversions in the following cells operate on
# unmodified text.
#   dtype=str              -> disable pandas' type inference
#   keep_default_na=False  -> do not turn tokens such as "NA" or "" into NaN
df = pd.read_csv('bank/bank-full.csv', sep=';', dtype=str, keep_default_na=False)

In [3]:
# Columns holding quantities. They are cast to float here so they can be
# rescaled later on.
numeric = [
    'age',
    'balance',
    'duration',
    'campaign',
    'pdays',
    'previous',
]
df = df.astype({column: float for column in numeric})

In [4]:
# Columns treated as categories (the target 'y' included).
categorical = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'day', 'month', 'poutcome', 'y',
]

# Integer-encode each categorical column. pd.factorize numbers the labels in
# order of first appearance, so code 0 is whichever label occurs first.
# (One-hot encoding with pd.get_dummies is the alternative to this step.)
for column in categorical:
    codes = pd.factorize(df[column])[0]
    df[column] = codes

# Bring the whole frame to a single float dtype.
df = df.astype(float)

In [5]:
# Rescale every numeric column to the [0, 1] range.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
rescaled_df = scaler.fit_transform(df[numeric])
# Reattach the frame's own index: assigning a DataFrame aligns on index labels,
# so a freshly built default RangeIndex would yield NaNs if `df` were ever
# filtered or re-indexed before this cell.
df[numeric] = pd.DataFrame(rescaled_df, columns=numeric, index=df.index)

In [6]:
# Store the encoded columns as pandas categoricals over integer codes.
df[categorical] = df[categorical].astype('int').astype('category')
# 'y' is part of `categorical`, so it was just made a category; cast the target
# back to a plain integer column.
df['y'] = df['y'].astype(int)

In [7]:
# Preview the first rows to sanity-check the encoded and rescaled columns.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0.519481,0,0,0,0,0.092259,0,0,0,0,0,0.05307,0.0,0.0,0.0,0,0
1,0.337662,1,1,1,0,0.073067,0,0,0,0,0,0.030704,0.0,0.0,0.0,0,0
2,0.194805,2,0,1,0,0.072822,0,1,0,0,0,0.015453,0.0,0.0,0.0,0,0
3,0.376623,3,0,2,0,0.086476,0,0,0,0,0,0.018707,0.0,0.0,0.0,0,0
4,0.194805,4,1,2,0,0.072812,1,0,0,0,0,0.04026,0.0,0.0,0.0,0,0


In [8]:
# Show per-column dtypes and non-null counts after the conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null float64
job          45211 non-null category
marital      45211 non-null category
education    45211 non-null category
default      45211 non-null category
balance      45211 non-null float64
housing      45211 non-null category
loan         45211 non-null category
contact      45211 non-null category
day          45211 non-null category
month        45211 non-null category
duration     45211 non-null float64
campaign     45211 non-null float64
pdays        45211 non-null float64
previous     45211 non-null float64
poutcome     45211 non-null category
y            45211 non-null int64
dtypes: category(10), float64(6), int64(1)
memory usage: 2.8 MB


In [9]:
# Summary statistics; the rescaled numeric columns should span 0 to 1.
df.describe()

Unnamed: 0,age,balance,duration,campaign,pdays,previous,y
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,0.297873,0.085171,0.052494,0.028449,0.047245,0.00211,0.116985
std,0.137906,0.027643,0.052364,0.049968,0.114827,0.008376,0.321406
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.194805,0.073457,0.020943,0.0,0.0,0.0,0.0
50%,0.272727,0.076871,0.0366,0.016129,0.0,0.0,0.0
75%,0.38961,0.085768,0.064864,0.032258,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Save the preprocessed frame as a pickle file.
# Create the output directory first so a run from a fresh checkout does not fail
# when 'data/' is missing.
os.makedirs('data', exist_ok=True)
df.to_pickle('data/bank_data.pkl')

### Bank Additional Dataset

In [11]:
# Load the semicolon-delimited CSV with every column kept as a raw string, so the
# explicit numeric / categorical conversions in the following cells operate on
# unmodified text.
#   dtype=str              -> disable pandas' type inference
#   keep_default_na=False  -> do not turn tokens such as "NA" or "" into NaN
df = pd.read_csv(
    'bank-additional/bank-additional-full.csv',
    sep=';',
    dtype=str,
    keep_default_na=False,
)

In [12]:
# Columns holding quantities. They are cast to float here so they can be
# rescaled later on.
numeric = [
    'age',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
]
df = df.astype({column: float for column in numeric})

In [13]:
# Columns to one-hot encode (the target 'y' is handled separately below).
categorical = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome',
]

# Expand each categorical column into indicator columns named
# "<column>_<label>", integer-encode the target in order of first appearance,
# then bring the whole frame to a single float dtype.
df = (
    pd.get_dummies(df, columns=categorical, prefix=categorical)
    .assign(y=lambda frame: pd.factorize(frame['y'])[0])
    .astype(float)
)

In [14]:
# Rescale every numeric column to the [0, 1] range.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
rescaled_df = scaler.fit_transform(df[numeric])
# Reattach the frame's own index: assigning a DataFrame aligns on index labels,
# so a freshly built default RangeIndex would yield NaNs if `df` were ever
# filtered or re-indexed before this cell.
df[numeric] = pd.DataFrame(rescaled_df, columns=numeric, index=df.index)

In [15]:
# The one-hot indicator columns stay as floats; only the target is cast back to
# a plain integer column.
df = df.astype({'y': int})

In [16]:
# Preview only the first ten rows instead of rendering the entire frame.
df.head(10)

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0.481481,0.053070,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.493827,0.030297,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.246914,0.045954,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.283951,0.030704,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.481481,0.062424,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.345679,0.040260,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6,0.518519,0.028264,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.296296,0.044124,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.086420,0.077267,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.098765,0.010167,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [17]:
# Show per-column dtypes and non-null counts after one-hot encoding and scaling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 64 columns):
age                              41188 non-null float64
duration                         41188 non-null float64
campaign                         41188 non-null float64
pdays                            41188 non-null float64
previous                         41188 non-null float64
emp.var.rate                     41188 non-null float64
cons.price.idx                   41188 non-null float64
cons.conf.idx                    41188 non-null float64
euribor3m                        41188 non-null float64
nr.employed                      41188 non-null float64
y                                41188 non-null int64
job_admin.                       41188 non-null float64
job_blue-collar                  41188 non-null float64
job_entrepreneur                 41188 non-null float64
job_housemaid                    41188 non-null float64
job_management                   41188 non-null float

In [18]:
# Summary statistics; the rescaled numeric columns should span 0 to 1.
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,...,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,0.284248,0.052518,0.028502,0.963439,0.024709,0.725393,0.535723,0.430854,0.677237,0.769134,...,0.017432,0.013839,0.190031,0.206711,0.209357,0.196416,0.197485,0.103234,0.863431,0.033335
std,0.128657,0.05272,0.050364,0.187098,0.0707,0.327283,0.225581,0.193648,0.39321,0.273163,...,0.130877,0.116824,0.39233,0.404951,0.406855,0.397292,0.398106,0.304268,0.343396,0.179512
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.185185,0.02074,0.0,1.0,0.0,0.333333,0.340608,0.338912,0.160961,0.512287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.259259,0.0366,0.018182,1.0,0.0,0.9375,0.603274,0.376569,0.957379,0.859735,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.37037,0.064864,0.036364,1.0,0.0,1.0,0.698753,0.60251,0.980957,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Save the preprocessed frame as a pickle file.
# Create the output directory first so a run from a fresh checkout does not fail
# when 'data/' is missing.
os.makedirs('data', exist_ok=True)
df.to_pickle('data/bank_additional_data.pkl')