# Data Preprocessing

In [1]:
import csv
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA

### Bank Dataset

In [2]:
# Load the semicolon-delimited CSV straight into a DataFrame instead of going
# through csv.reader -> list -> numpy array -> DataFrame.
# Every column is deliberately kept as text (dtype=str, no NA inference), which
# is what the manual reader produced: the following cells cast the numeric
# columns to float and encode the remaining ones themselves.
df = pd.read_csv(
    'bank/bank-full.csv',
    sep=';',
    dtype=str,
    keep_default_na=False,
)

In [3]:
# Columns that hold numeric measurements. They arrive as text from the CSV
# load, so convert them to floating point before any arithmetic is done.
numeric = [
    'age', 'balance', 'duration',
    'campaign', 'pdays', 'previous',
]
numeric_as_float = df[numeric].astype(float)
df[numeric] = numeric_as_float

In [4]:
# Columns treated as categorical labels (the target 'y' is included here).
categorical = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'day', 'month', 'poutcome', 'y',
]
# Swap every label for the integer code that pd.factorize assigns to it.
for c in categorical:
    codes, uniques = pd.factorize(df[c])
    df[c] = codes
# Finish with a frame in which every column is float-typed.
df = df.astype(float)

In [5]:
# Rescale the numeric columns to the [0, 1] range.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
rescaled_df = scaler.fit_transform(df[numeric])
# Write the scaled array back positionally. Wrapping it in a fresh DataFrame
# first would make the assignment depend on index alignment and silently
# produce NaNs if df ever carried a non-default index.
df[numeric] = rescaled_df

In [6]:
# Cast the categorical columns in two passes: first to integers, then to the
# pandas 'category' dtype.
for stage_dtype in ('int', 'category'):
    df[categorical] = df[categorical].astype(stage_dtype)
# 'y' is part of `categorical` as well; turn the target back into plain integers.
df['y'] = df['y'].astype(int)

In [7]:
# Set the target aside as a single-column 2-D array, then remove it from the
# feature frame so that it does not take part in the PCA below.
target_column = ['y']
y_values = df[target_column].values
df = df.drop(columns=target_column)

In [8]:
# Project the feature frame onto its first 10 principal components.
# random_state is pinned because the default svd_solver='auto' may select the
# randomized solver (depending on the scikit-learn version and data shape),
# whose output would otherwise vary from run to run.
# NOTE(review): at this point df mixes [0, 1]-scaled numeric columns with raw
# integer category codes, so the codes with the widest range probably dominate
# the leading components — confirm this is intended.
pca = PCA(n_components=10, random_state=0)
components = pca.fit_transform(df)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)
# From here on df holds the component scores, with columns labelled 0..9.
df = pd.DataFrame(data=components)

[7.60086438e-01 1.08527446e-01 1.04558368e-01 9.54929996e-03
 5.20428638e-03 5.04546440e-03 2.90294453e-03 2.17195029e-03
 1.44598174e-03 1.92228735e-04]
[1778.59152381  672.07001271  659.66603757  199.35654308  147.17204333
  144.90897992  109.91680468   95.07572966   77.57577633   28.28485004]


In [9]:
# Re-attach the target that was set aside before the PCA.
df['y'] = y_values
# Preview only the first rows rather than rendering the entire frame.
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y
0,-13.155044,-3.254776,-2.726941,-0.713065,-0.417566,-0.549436,0.546195,0.158517,-0.167477,-0.137266,0
1,-13.165865,-2.903241,-1.694928,-0.023202,-0.108032,0.561258,0.502090,0.123444,-0.155127,-0.053390,0
2,-13.169500,-2.566877,-0.764428,0.082567,-0.366043,-0.418679,0.558158,0.024895,0.827360,-0.003980,0
3,-13.181295,-2.211983,0.262563,0.973091,-0.246153,-0.262979,0.505369,0.145883,-0.162723,-0.046545,0
4,-13.181052,-1.835840,1.197616,0.662110,-0.084628,0.680732,-0.240304,0.771522,-0.062366,0.045949,0
5,-13.155045,-3.255453,-2.726821,-0.722778,-0.417996,-0.544516,0.561955,0.150715,-0.167162,0.002459,0
6,-13.154071,-3.258176,-2.721919,-0.914596,-0.227924,0.405974,0.556228,0.001837,0.834847,0.000888,0
7,-13.164386,-2.563589,-0.848230,-1.275418,-0.012334,1.331475,0.518755,0.085982,-0.114253,0.813970,0
8,-13.196764,-1.518061,2.224567,1.780289,-0.166348,-0.133151,0.477374,0.142406,-0.160178,-0.101384,0
9,-13.165863,-2.903276,-1.694925,-0.023622,-0.108075,0.561442,0.502735,0.123110,-0.155122,-0.047374,0


In [10]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 11 columns):
0    45211 non-null float64
1    45211 non-null float64
2    45211 non-null float64
3    45211 non-null float64
4    45211 non-null float64
5    45211 non-null float64
6    45211 non-null float64
7    45211 non-null float64
8    45211 non-null float64
9    45211 non-null float64
y    45211 non-null int64
dtypes: float64(10), int64(1)
memory usage: 3.8 MB


In [11]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,-7.0272e-15,-4.342178e-15,-2.712735e-15,-3.374466e-16,9.187709e-16,4.831438e-16,2.324467e-16,-4.394063e-16,1.987387e-18,-1.486112e-17,0.116985
std,8.364866,3.160802,3.102465,0.9375906,0.6921625,0.6815191,0.516948,0.4471492,0.3648454,0.133026,0.321406
min,-13.22028,-4.954643,-6.523825,-2.043013,-1.104038,-1.460906,-1.591856,-1.476874,-0.3358457,-0.3391016,0.0
25%,-7.178002,-2.388503,-2.541101,-0.6443455,-0.3898403,-0.5108169,-0.4680137,-0.402339,-0.1824998,-0.05777683,0.0
50%,-1.001801,-0.9325416,-0.08804741,-0.185908,-0.2062441,-0.2494826,0.004614677,0.1234765,-0.147554,-0.005847476,0.0
75%,7.287384,1.530531,2.511246,0.2555477,0.05973223,0.4454036,0.5315995,0.2130232,-0.07983021,0.03330908,0.0
max,17.38722,10.8671,8.526788,2.614165,3.468577,1.983172,1.75458,1.587914,1.038904,1.024932,1.0


In [12]:
# Save the PCA component frame (with its target column) as a pickle file.
# Create the output directory first: to_pickle does not create missing parent
# directories, so the write would fail when 'data/' is absent.
os.makedirs('data', exist_ok=True)
df.to_pickle('data/bank_data.pkl')

### Bank Additional Dataset

In [13]:
# Load the semicolon-delimited CSV straight into a DataFrame instead of going
# through csv.reader -> list -> numpy array -> DataFrame.
# Every column is deliberately kept as text (dtype=str, no NA inference), which
# is what the manual reader produced: the following cells cast the numeric
# columns to float and encode the remaining ones themselves.
df = pd.read_csv(
    'bank-additional/bank-additional-full.csv',
    sep=';',
    dtype=str,
    keep_default_na=False,
)

In [14]:
# Columns that hold numeric measurements. They arrive as text from the CSV
# load, so convert them to floating point before any arithmetic is done.
numeric = [
    'age', 'duration', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
    'euribor3m', 'nr.employed',
]
numeric_as_float = df[numeric].astype(float)
df[numeric] = numeric_as_float

In [15]:
# Columns to expand into one indicator column per category value.
categorical = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]
# One-hot encode the categorical columns, replace the target labels with the
# integer codes from pd.factorize, and end with a uniformly float-typed frame.
df = (
    pd.get_dummies(df, columns=categorical, prefix=categorical)
    .assign(y=lambda frame: pd.factorize(frame['y'])[0])
    .astype(float)
)

In [16]:
# Rescale the numeric columns to the [0, 1] range.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
rescaled_df = scaler.fit_transform(df[numeric])
# Write the scaled array back positionally. Wrapping it in a fresh DataFrame
# first would make the assignment depend on index alignment and silently
# produce NaNs if df ever carried a non-default index.
df[numeric] = rescaled_df

In [17]:
# The whole frame was cast to float earlier; store the target as integers again.
df = df.astype({'y': int})

In [18]:
# Preview only the first rows rather than rendering the entire frame.
df.head(10)

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0.481481,0.053070,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.493827,0.030297,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.246914,0.045954,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.283951,0.030704,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.481481,0.062424,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.345679,0.040260,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6,0.518519,0.028264,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.296296,0.044124,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.086420,0.077267,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.098765,0.010167,0.000000,1.000000,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 64 columns):
age                              41188 non-null float64
duration                         41188 non-null float64
campaign                         41188 non-null float64
pdays                            41188 non-null float64
previous                         41188 non-null float64
emp.var.rate                     41188 non-null float64
cons.price.idx                   41188 non-null float64
cons.conf.idx                    41188 non-null float64
euribor3m                        41188 non-null float64
nr.employed                      41188 non-null float64
y                                41188 non-null int64
job_admin.                       41188 non-null float64
job_blue-collar                  41188 non-null float64
job_entrepreneur                 41188 non-null float64
job_housemaid                    41188 non-null float64
job_management                   41188 non-null float

In [20]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,...,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,0.284248,0.052518,0.028502,0.963439,0.024709,0.725393,0.535723,0.430854,0.677237,0.769134,...,0.017432,0.013839,0.190031,0.206711,0.209357,0.196416,0.197485,0.103234,0.863431,0.033335
std,0.128657,0.05272,0.050364,0.187098,0.0707,0.327283,0.225581,0.193648,0.39321,0.273163,...,0.130877,0.116824,0.39233,0.404951,0.406855,0.397292,0.398106,0.304268,0.343396,0.179512
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.185185,0.02074,0.0,1.0,0.0,0.333333,0.340608,0.338912,0.160961,0.512287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.259259,0.0366,0.018182,1.0,0.0,0.9375,0.603274,0.376569,0.957379,0.859735,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.37037,0.064864,0.036364,1.0,0.0,1.0,0.698753,0.60251,0.980957,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Save the encoded and rescaled frame as a pickle file.
# Create the output directory first: to_pickle does not create missing parent
# directories, so the write would fail when 'data/' is absent.
os.makedirs('data', exist_ok=True)
df.to_pickle('data/bank_additional_data.pkl')