# Data Preprocessing

In [40]:
import numpy as np
import pandas as pd
import csv
from sklearn import preprocessing
from sklearn.decomposition import PCA

## Bank Dataset

In [41]:
# Read data from file

# The file is ';'-delimited and every field is kept as a string at this stage.
# newline='' follows the csv module documentation: the reader, not the file
# object, should be the one interpreting line endings.
with open('bank/bank-full.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=';')
    header, *records = reader

# First row supplies the column names; the remaining rows are the samples.
# Building the frame directly from the row lists avoids an intermediate
# fixed-width NumPy string array of the whole file.
df = pd.DataFrame(records, columns=header)
    

In [42]:
# Inspecting the unknown values in the dataset

# Each count is a vectorised comparison of a column against the placeholder
# string, summed over the boolean result (True counts as 1).
unknown_jobs = int((df['job'] == 'unknown').sum())
unknown_education = int((df['education'] == 'unknown').sum())
unknown_default = int((df['default'] == 'unknown').sum())    # computed but not reported below
unknown_contact = int((df['contact'] == 'unknown').sum())
unknown_pday = int((df['pdays'] == '-1').sum())      # Treating never contacted as unknown
unknown_pouts = int((df['poutcome'] == 'unknown').sum())

print("Unknown values\n")
print("Unknown jobs :", unknown_jobs)
print("Unknown education :", unknown_education)
print("Unknown contacts :", unknown_contact)
print("Unknown pdays :", unknown_pday)
print("Unknown pouts :", unknown_pouts)



Unknown values

Unknown jobs : 288
Unknown education : 1857
Unknown contacts : 13020
Unknown pdays : 36954
Unknown pouts : 36959


### Observations from Data Set

- Total number of rows: 45211
- Total number of attributes: 16 + output attribute (y)
- Yes samples - 5289
- No samples - 39922
- The number of yes samples is much smaller than the number of no samples, indicating that the data set is highly skewed (class-imbalanced).

- Unknown values :-
    - job - 288
    - education - 1857
    - contact - 13020
    - pdays - 36954
    - poutcome - 36959
- A very large number of rows have pdays = -1 and poutcome = unknown. These represent clients that have not been contacted in the past

In [43]:
# Removing duration attribute
df = df.drop(columns=['duration'])

# Separating columns as numerical and categorical
numeric = [
    'age', 'balance', 'campaign', 'pdays', 'previous',
]
categorical = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'day', 'month', 'poutcome',
]


In [44]:
# Substituting pdays = 999 for pdays = -1
# (the replacement is done on the string form of the values, as quoted below)
df['pdays'] = df['pdays'].replace({'-1': '999'})
pdays = df['pdays']


# Substituting pdays = 999 and poutcome = failure for pdays = -1 and poutcome = unknown
# prev_list = ['pdays', 'poutcome']
# prev_df = df[prev_list]
# prev_df = prev_df.replace({'pdays':'-1', 'poutcome':'unknown'}, {'pdays':'999', 'poutcome':'failure'})
# df[prev_list] = prev_df


In [45]:
# Taking care of categorical values

# factorize assigns unique numeric values to each category
for c in categorical:
    df[c] = pd.factorize(df[c])[0]

# Cast the numeric columns once, outside the loop: the conversion does not
# depend on the loop variable, so repeating it per categorical column was
# wasted work.
df[numeric] = df[numeric].astype('float')
df[categorical] = df[categorical].astype('category')

# Encode the output variable; `un` holds the original labels in code order.
df['y'], un = pd.factorize(df['y'])
df['y'] = df['y'].astype(int) # Converting output variable to integers

# Use dummy attributes for categorical values.
# df1 is derived from df, so its numeric columns are already float and its
# 'y' column is already integer-coded; no second cast / factorize is needed.
df1 = pd.get_dummies(df, columns=categorical, prefix=categorical)


In [46]:
# Normalizing numerical values

# Standardise the numeric columns of both encodings (dummies first, then
# factorize), each with its own freshly fitted StandardScaler.
# The scaled array is wrapped with the target frame's own index so that the
# column assignment, which aligns on index labels, cannot introduce NaNs if
# the frame's index is ever not the default 0..n-1 range.
for frame in (df1, df):
    scaler = preprocessing.StandardScaler()
    rescaled_df = scaler.fit_transform(frame[numeric])
    frame[numeric] = pd.DataFrame(rescaled_df, columns=numeric, index=frame.index)

In [47]:
#Saving as pickle file

# Both frames have pdays recoded; they differ only in the categorical encoding.
bank_outputs = (
    (df1, 'data/bank_data_dummies.pkl'),     # dummy (one-hot) encoding
    (df, 'data/bank_data_factorize.pkl'),    # factorize (integer code) encoding
)
for frame, path in bank_outputs:
    frame.to_pickle(path)

In [48]:
# Principal Component Analysis

# PCA is fitted on every column except the target. The features are taken
# from the new frame returned by drop(), so df itself keeps its 'y' column
# even if this cell fails part-way or is run again.
y_values = df.loc[:, ['y']].values
features = df.drop(columns=['y'])

pca = PCA()
components = pca.fit_transform(features)
var_ratios = pca.explained_variance_ratio_

# Running total of explained variance: entry i covers the first i + 1 components.
cumulative = np.cumsum(var_ratios)

# Calculating the number of attributes for 80% variance
# searchsorted gives the first position whose running total is >= the target;
# min() caps the answer at the number of components if the target is never met.
count = min(int(np.searchsorted(cumulative, 0.8)) + 1, len(var_ratios))
print(count)

# Calculating the number of attributes for 95% variance
count1 = min(int(np.searchsorted(cumulative, 0.95)) + 1, len(var_ratios))
print(count1)

# NOTE(review): `components` is not written back into df, so df still holds the
# scaled, factorized attributes after this cell. Confirm whether the PCA
# projection was meant to replace them before the "_pca" pickle is saved.

2
5


### Observations from PCA

- There were 15 attributes (after removing duration) subjected to PCA
- 80% of the variance can be captured by 2 attributes after PCA
- 95% of the variance can be captured by 5 attributes after PCA

In [49]:
# Show a bounded preview rather than rendering the whole 45211-row frame
df.head(10)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,1.606965,0,0,0,0,0.256419,0,0,0,0,0,-0.569351,0.466408,-0.251940,0,0
1,0.288529,1,1,1,0,-0.437895,0,0,0,0,0,-0.569351,0.466408,-0.251940,0,0
2,-0.747384,2,0,1,0,-0.446762,0,1,0,0,0,-0.569351,0.466408,-0.251940,0,0
3,0.571051,3,0,2,0,0.047205,0,0,0,0,0,-0.569351,0.466408,-0.251940,0,0
4,-0.747384,4,1,2,0,-0.447091,1,0,0,0,0,-0.569351,0.466408,-0.251940,0,0
5,-0.559037,0,0,0,0,-0.371551,0,0,0,0,0,-0.569351,0.466408,-0.251940,0,0
6,-1.218254,0,1,0,0,-0.300608,0,1,0,0,0,-0.569351,0.466408,-0.251940,0,0
7,0.100181,2,2,0,1,-0.446762,0,0,0,0,0,-0.569351,0.466408,-0.251940,0,0
8,1.606965,5,0,3,0,-0.407679,0,0,0,0,0,-0.569351,0.466408,-0.251940,0,0
9,0.194355,1,1,1,0,-0.252657,0,0,0,0,0,-0.569351,0.466408,-0.251940,0,0


In [50]:
# Column dtypes, non-null counts and memory usage of the processed bank frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 16 columns):
age          45211 non-null float64
job          45211 non-null category
marital      45211 non-null category
education    45211 non-null category
default      45211 non-null category
balance      45211 non-null float64
housing      45211 non-null category
loan         45211 non-null category
contact      45211 non-null category
day          45211 non-null category
month        45211 non-null category
campaign     45211 non-null float64
pdays        45211 non-null float64
previous     45211 non-null float64
poutcome     45211 non-null category
y            45211 non-null int32
dtypes: category(10), float64(5), int32(1)
memory usage: 2.3 MB


In [51]:
# Summary statistics of the numeric columns (category columns are not included)
df.describe()

Unnamed: 0,age,balance,campaign,pdays,previous,y
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,2.11225e-16,1.760208e-17,3.0175000000000005e-17,-8.046667000000001e-17,4.0233340000000005e-17,0.116985
std,1.000011,1.000011,1.000011,1.000011,1.000011,0.321406
min,-2.159994,-3.081149,-0.5693506,-2.824689,-0.2519404,0.0
25%,-0.7473845,-0.4237719,-0.5693506,0.4664082,-0.2519404,0.0
50%,-0.1823406,-0.30028,-0.2465603,0.4664082,-0.2519404,0.0
75%,0.6652252,0.02158743,0.07622994,0.4664082,-0.2519404,0.0
max,5.091402,33.09478,19.44365,0.4664082,119.136,1.0


In [52]:
#Saving as pickle file

# NOTE(review): the PCA cell does not write its components back into df
# (df.info() above still lists the original attribute columns), so this file
# appears to hold the same scaled, factorized data as bank_data_factorize.pkl.
# Confirm whether the PCA components were meant to be saved here.
df.to_pickle('data/bank_data_factorize_pca.pkl')    # factorize encoding with pdays recoded


## Bank Additional Dataset

In [53]:
# Reading data from file

# The file is ';'-delimited and every field is kept as a string at this stage.
# newline='' follows the csv module documentation: the reader, not the file
# object, should be the one interpreting line endings.
with open('bank-additional/bank-additional-full.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=';')
    header, *records = reader

# First row supplies the column names; the remaining rows are the samples.
# Building the frame directly from the row lists avoids an intermediate
# fixed-width NumPy string array of the whole file.
df = pd.DataFrame(records, columns=header)

In [54]:
# Inspecting the unknown values in the dataset

# Each count is a vectorised comparison of a column against the placeholder
# string, summed over the boolean result (True counts as 1).
unknown_jobs = int((df['job'] == 'unknown').sum())
unknown_maritals = int((df['marital'] == 'unknown').sum())
unknown_education = int((df['education'] == 'unknown').sum())
unknown_default = int((df['default'] == 'unknown').sum())
unknown_housing = int((df['housing'] == 'unknown').sum())
unknown_loans = int((df['loan'] == 'unknown').sum())
unknown_pday = int((df['pdays'] == '999').sum())      # Treating never contacted as unknown
unknown_pouts = int((df['poutcome'] == 'nonexistent').sum())

print("Unknown values\n")
print("Unknown jobs :", unknown_jobs)
print("Unknown marital status :", unknown_maritals)
print("Unknown education :", unknown_education)
print("Unknown default :", unknown_default)
print("Unknown housing :", unknown_housing)
print("Unknown loans :", unknown_loans)
print("Unknown pdays :", unknown_pday)
print("Unknown pouts :", unknown_pouts)


Unknown values

Unknown jobs : 330
Unknown marital status : 80
Unknown education : 1731
Unknown default : 8597
Unknown housing : 990
Unknown loans : 990
Unknown pdays : 39673
Unknown pouts : 35563


### Observations from Data Set

- Total number of rows: 41188
- Total number of attributes: 20 + output attribute (y)
- Yes samples - 4640
- No samples - 36548

- The number of yes samples is much smaller than the number of no samples, indicating that the data set is highly skewed (class-imbalanced).

- Unknown values :-
    - job - 330
    - marital - 80
    - education - 1731
    - default - 8597
    - housing - 990
    - loan - 990
    - pdays - 39673
    - poutcome - 35563
- A very large number of rows have pdays = 999 and poutcome = nonexistent. These represent clients that have not been contacted in the past

In [55]:
# Removing duration attribute
df = df.drop(columns=['duration'])

# Separating columns as numerical and categorical
numeric = [
    'age', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
]
categorical = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]


In [56]:
# Taking care of categorical values

# factorize assigns unique numeric values to each category
for c in categorical:
    df[c] = pd.factorize(df[c])[0]

df[numeric] = df[numeric].astype('float')
df[categorical] = df[categorical].astype('category')

# Encode the output variable; `un` holds the original labels in code order.
df['y'], un = pd.factorize(df['y'])
df['y'] = df['y'].astype(int) # Converting output variable to integers

# Use dummy attributes for categorical values.
# df1 is derived from df, whose 'y' column is already integer-coded, so it is
# not factorized a second time (doing so would also overwrite `un` with the
# integer codes instead of the original labels).
df1 = pd.get_dummies(df, columns=categorical, prefix=categorical)




In [57]:
# Normalizing numerical values

# Standardise the numeric columns of both encodings (dummies first, then
# factorize), each with its own freshly fitted StandardScaler.
# The scaled array is wrapped with the target frame's own index so that the
# column assignment, which aligns on index labels, cannot introduce NaNs if
# the frame's index is ever not the default 0..n-1 range.
for frame in (df1, df):
    scaler = preprocessing.StandardScaler()
    rescaled_df = scaler.fit_transform(frame[numeric])
    frame[numeric] = pd.DataFrame(rescaled_df, columns=numeric, index=frame.index)

In [58]:
#Saving as pickle file

# The two frames differ only in the categorical encoding; no pdays recoding is
# applied to this dataset in the cells above.
bank_additional_outputs = (
    (df1, 'data/bank_additional_data_dummies.pkl'),     # dummy (one-hot) encoding
    (df, 'data/bank_additional_data_factorize.pkl'),    # factorize (integer code) encoding
)
for frame, path in bank_additional_outputs:
    frame.to_pickle(path)

In [59]:
# Principal Component Analysis

N_COMPONENTS = 10    # number of leading principal components kept in df

# PCA is fitted on every column except the target. The features are taken
# from the new frame returned by drop(), so df is not left without its 'y'
# column if this cell fails part-way.
y_values = df.loc[:, ['y']].values
features = df.drop(columns=['y'])

# Fit the full decomposition so the explained-variance ratios cover every
# component. With a fit truncated to N_COMPONENTS the ratios stop early, and a
# variance target that is not reached within them would be silently reported
# as N_COMPONENTS.
pca = PCA()
all_components = pca.fit_transform(features)
var_ratios = pca.explained_variance_ratio_

# Running total of explained variance: entry i covers the first i + 1 components.
cumulative = np.cumsum(var_ratios)

# Calculating the number of attributes for 80% variance
# searchsorted gives the first position whose running total is >= the target.
count = min(int(np.searchsorted(cumulative, 0.8)) + 1, len(var_ratios))
print(count)

# Calculating the number of attributes for 95% variance
count1 = min(int(np.searchsorted(cumulative, 0.95)) + 1, len(var_ratios))
print(count1)

# Keep only the leading components as the new feature frame, plus the target.
components = all_components[:, :N_COMPONENTS]
df = pd.DataFrame(data=components)
df['y'] = y_values

6
10


### Observations from PCA

- There were 19 attributes (after removing duration) subjected to PCA
- 80% of the variance can be captured by 6 attributes after PCA
- 95% of the variance can be captured by 10 attributes after PCA


In [60]:
# Show a bounded preview rather than rendering the whole 41188-row frame
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y
0,-5.085238,1.976573,1.192552,-0.215155,-2.003972,0.848267,1.607368,-0.085110,-0.451382,0.102161,0
1,-4.245930,0.958810,0.738160,-0.208968,-2.013441,0.868691,1.707951,-0.000635,-0.638437,0.060624,0
2,-4.239274,0.939066,0.595273,-0.055896,-1.964206,0.416226,0.453556,-0.795599,0.563273,-0.107539,0
3,-3.379089,-0.068760,0.134065,-0.036606,-1.974160,0.474288,0.637311,-0.663590,0.327532,-0.052078,0
4,-4.220810,0.969717,0.712080,-0.169690,-2.008423,0.826473,1.603210,-0.065391,-0.520651,0.086857,0
5,-3.569816,0.414489,-1.113143,0.100325,-2.003916,0.617640,1.020843,-0.409641,-0.105124,-0.048261,0
6,-2.653864,-0.550303,-1.523710,0.097162,-2.043962,0.868999,1.832070,0.121289,-0.910122,0.109335,0
7,-1.844656,-1.606542,-2.096716,0.210928,-2.004811,0.493633,0.779250,-0.525972,-0.011746,-0.097869,0
8,-1.651643,-2.102220,-0.944987,0.173770,-1.949430,0.052176,-0.447901,-1.280976,1.170616,0.526923,0
9,-4.247304,0.919201,0.499158,0.036175,-1.935292,0.134031,-0.389039,-1.299040,1.316189,0.557737,0


In [61]:
# Column dtypes, non-null counts and memory usage of the PCA component frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 11 columns):
0    41188 non-null float64
1    41188 non-null float64
2    41188 non-null float64
3    41188 non-null float64
4    41188 non-null float64
5    41188 non-null float64
6    41188 non-null float64
7    41188 non-null float64
8    41188 non-null float64
9    41188 non-null float64
y    41188 non-null int32
dtypes: float64(10), int32(1)
memory usage: 3.3 MB


In [62]:
# Summary statistics of the component columns and the target
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,7.176502e-17,2.704989e-16,1.276589e-16,-1.048873e-16,-6.762473e-17,9.384657000000001e-17,-4.968348e-17,4.6750770000000005e-17,-1.269689e-16,1.269689e-16,0.112654
std,2.740139,2.419473,2.082649,1.766707,1.411823,1.197357,1.041102,0.9870682,0.9125983,0.6911039,0.316173
min,-5.298805,-8.018556,-4.95598,-2.759779,-2.203425,-2.462962,-5.909771,-1.594202,-4.811094,-2.119874,0.0
25%,-2.091327,-1.094463,-1.561693,-1.246718,-1.018946,-0.5593388,-0.6508199,-0.5815768,-0.5711372,-0.5726443,0.0
50%,-0.5036308,0.05298918,0.03463561,-0.7347074,0.02002136,0.1128615,-0.0307372,-0.1905246,-0.03701894,-0.06741632,0.0
75%,1.675105,1.31529,1.390102,0.4631161,1.044035,0.4397546,0.5737027,0.304414,0.5338705,0.3261301,0.0
max,11.09295,8.698616,7.036056,7.612489,2.326363,9.746579,6.25463,17.0783,6.439887,3.61061,1.0


In [63]:
#Saving as pickle file

# df here is the frame of PCA components plus the 'y' column built in the PCA cell.
# (No pdays replacement is performed for this dataset in the cells above.)
df.to_pickle('data/bank_additional_data_factorize_pca.pkl')    # factorize encoding followed by PCA