# Data Preprocessing

In [4]:
import numpy as np
import pandas as pd
import csv
from sklearn import preprocessing
from sklearn.decomposition import PCA

## Bank Dataset

In [104]:
# Read data from file

# The file is ';'-delimited. csv.reader yields every field as a string, and the
# first row holds the column names.
with open('bank/bank-full.csv', 'r') as bank_file:
    bank_rows = np.asarray(list(csv.reader(bank_file, delimiter=';')))

# Build the frame after the file has been closed: header row -> columns, rest -> data.
df = pd.DataFrame(data=bank_rows[1:], columns=bank_rows[0])
    

In [105]:
# Inspecting the unknown values in the dataset

# Each count is a vectorized comparison of a column against its "unknown" marker;
# summing the resulting booleans gives the number of matching rows. All columns are
# still strings at this point, so the markers are compared as strings.
unknown_jobs = (df['job'] == 'unknown').sum()
unknown_education = (df['education'] == 'unknown').sum()
unknown_default = (df['default'] == 'unknown').sum()    # counted but not reported below
unknown_contact = (df['contact'] == 'unknown').sum()
unknown_pday = (df['pdays'] == '-1').sum()              # Treating never contacted as unknown
unknown_pouts = (df['poutcome'] == 'unknown').sum()

print("Unknown values\n")
print("Unknown jobs :", unknown_jobs)
print("Unknown education :", unknown_education)
print("Unknown contacts :", unknown_contact)
print("Unknown pdays :", unknown_pday)
print("Unknown pouts :", unknown_pouts)



Unknown values

Unknown jobs : 288
Unknown education : 1857
Unknown contacts : 13020
Unknown pdays : 36954
Unknown pouts : 36959


### Observations from Data Set

- Total number of rows: 45211
- Total number of attributes: 16 + output attribute (y)
- Yes samples - 5289
- No samples - 39922
- The number of yes samples is much smaller than the number of no samples, indicating that the data set is highly skewed (class-imbalanced).

- Unknown values :-
    - job - 288
    - education - 1857
    - contact - 13020
    - pdays - 36954
    - poutcome - 36959
- A very large number of rows have pdays = -1 and poutcome = unknown. These represent clients that have not been contacted in the past.

In [106]:
# Removing duration attribute
df = df.drop(columns=['duration'])

# Separating columns as numerical and categorical
numeric = [
    'age', 'balance', 'campaign', 'pdays', 'previous',
]
categorical = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'day', 'month', 'poutcome',
]


In [107]:
# Substituting pdays = 999 for pdays = -1
# (the column still holds strings here, hence the quoted values)
pdays = df['pdays'].replace({'-1': '999'})
df['pdays'] = pdays

# Alternative that is NOT applied: in addition to the pdays substitution above,
# also map poutcome = unknown to poutcome = failure for the same rows.


In [108]:
# Taking care of categorical values

# Use dummy attributes for categorical values
# (an alternative, not used here, is pd.factorize, which assigns one integer code per category)
df = pd.get_dummies(df, columns=categorical, prefix=categorical)

# Converting output variable to integers with an explicit mapping.
# pd.factorize numbers labels in order of first appearance, so the meaning of 0/1
# would depend on which label happens to occur in the first row; a fixed mapping
# keeps no -> 0 and yes -> 1 regardless of row order.
label_codes = {'no': 0, 'yes': 1}
unexpected_labels = set(df['y'].unique()) - set(label_codes)
if unexpected_labels:
    # Fail loudly instead of silently producing NaN / extra class codes.
    raise ValueError("Unexpected values in output column y: %s" % sorted(unexpected_labels))
df['y'] = df['y'].map(label_codes).astype(int)


In [109]:
# Normalizing numerical values

# The numeric columns are still strings as read from the CSV, so convert them to
# float explicitly instead of relying on the scaler's implicit conversion.
scaler = preprocessing.StandardScaler()
rescaled_df = scaler.fit_transform(df[numeric].astype(float))

# Re-attach the scaled values using df's own index so the assignment lines up
# row-for-row even if df does not carry a default 0..n-1 index.
df[numeric] = pd.DataFrame(rescaled_df, columns=numeric, index=df.index)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [62]:
# Saving as pickle file

# Variant saved here: dummy-encoded categoricals, with only pdays replaced
bank_dummies_path = 'data/bank_data_dummies.pkl'
df.to_pickle(bank_dummies_path)

In [63]:
# Principal Component Analysis

# Split off the output column; PCA is fitted on the feature columns only.
y_values = df['y'].values
df = df.drop(columns=['y'])

# No component limit: every component is kept, so the variance ratios cover all of them.
pca = PCA()
components = pca.fit_transform(df)
var_ratios = pca.explained_variance_ratio_

# Running total of explained variance; entry i covers the first i + 1 components.
cumulative_variance = np.cumsum(var_ratios)

# Calculating the number of attributes for 80% variance
# searchsorted returns the first index whose cumulative value is >= the threshold;
# min() caps the answer at the number of components if the threshold is never reached.
count = min(int(np.searchsorted(cumulative_variance, 0.8)) + 1, len(var_ratios))
print(count)

# Calculating the number of attributes for 95% variance
count1 = min(int(np.searchsorted(cumulative_variance, 0.95)) + 1, len(var_ratios))
print(count1)

# NOTE(review): the projected `components` are not written back into df, so df keeps
# the original feature columns and only gets y re-attached as the last column.
# Confirm this is intended before relying on anything saved from df as "PCA" data.
df['y'] = y_values

15
44


### Observations from PCA

- The 15 original attributes (after removing duration) expand to 80 columns once the categorical attributes are dummy-encoded; these 80 columns were subjected to PCA
- 80% of the variance can be captured by 15 components after PCA
- 95% of the variance can be captured by 44 components after PCA

In [64]:
# Preview only the first 10 rows instead of rendering the whole frame
df.head(10)


Unnamed: 0,age,balance,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
0,1.606965,0.256419,-0.569351,0.466408,-0.251940,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
1,0.288529,-0.437895,-0.569351,0.466408,-0.251940,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,-0.747384,-0.446762,-0.569351,0.466408,-0.251940,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
3,0.571051,0.047205,-0.569351,0.466408,-0.251940,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,-0.747384,-0.447091,-0.569351,0.466408,-0.251940,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
5,-0.559037,-0.371551,-0.569351,0.466408,-0.251940,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
6,-1.218254,-0.300608,-0.569351,0.466408,-0.251940,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
7,0.100181,-0.446762,-0.569351,0.466408,-0.251940,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
8,1.606965,-0.407679,-0.569351,0.466408,-0.251940,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
9,0.194355,-0.252657,-0.569351,0.466408,-0.251940,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [65]:
# Print the column names, non-null counts and dtypes of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 81 columns):
age                    45211 non-null float64
balance                45211 non-null float64
campaign               45211 non-null float64
pdays                  45211 non-null float64
previous               45211 non-null float64
job_admin.             45211 non-null uint8
job_blue-collar        45211 non-null uint8
job_entrepreneur       45211 non-null uint8
job_housemaid          45211 non-null uint8
job_management         45211 non-null uint8
job_retired            45211 non-null uint8
job_self-employed      45211 non-null uint8
job_services           45211 non-null uint8
job_student            45211 non-null uint8
job_technician         45211 non-null uint8
job_unemployed         45211 non-null uint8
job_unknown            45211 non-null uint8
marital_divorced       45211 non-null uint8
marital_married        45211 non-null uint8
marital_single         45211 non-null uint8
e

In [66]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
df.describe()

Unnamed: 0,age,balance,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,...,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,2.11225e-16,1.760208e-17,3.0175000000000005e-17,-8.046667000000001e-17,4.0233340000000005e-17,0.114375,0.215257,0.03289,0.027427,0.209197,...,0.010551,0.304483,0.08781,0.016323,0.012807,0.108403,0.040698,0.033421,0.817478,0.116985
std,1.000011,1.000011,1.000011,1.000011,1.000011,0.318269,0.411005,0.178351,0.163326,0.40674,...,0.102174,0.460193,0.283022,0.126718,0.112441,0.310892,0.197592,0.179735,0.386278,0.321406
min,-2.159994,-3.081149,-0.5693506,-2.824689,-0.2519404,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7473845,-0.4237719,-0.5693506,0.4664082,-0.2519404,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,-0.1823406,-0.30028,-0.2465603,0.4664082,-0.2519404,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.6652252,0.02158743,0.07622994,0.4664082,-0.2519404,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,5.091402,33.09478,19.44365,0.4664082,119.136,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [67]:
# Saving as pickle file

# Variant saved here: dummies, pdays replaced, taken after the PCA cell.
# NOTE(review): the PCA cell leaves its projection unused (the line that would store it
# in df is commented out), so this file holds the original feature columns plus y,
# not principal components. Confirm that is intended given the file name.
bank_pca_path = 'data/bank_data_dummies_pca.pkl'
df.to_pickle(bank_pca_path)


## Bank Additional Dataset

In [5]:
# Reading data from file

# The file is ';'-delimited. csv.reader yields every field as a string, and the
# first row holds the column names.
with open('bank-additional/bank-additional-full.csv', 'r') as additional_file:
    additional_rows = np.asarray(list(csv.reader(additional_file, delimiter=';')))

# Build the frame after the file has been closed: header row -> columns, rest -> data.
df = pd.DataFrame(data=additional_rows[1:], columns=additional_rows[0])

In [6]:
# Inspecting the unknown values in the dataset

# Each count is a vectorized comparison of a column against its "unknown" marker;
# summing the resulting booleans gives the number of matching rows. All columns are
# still strings at this point, so the markers are compared as strings.
unknown_jobs = (df['job'] == 'unknown').sum()
unknown_maritals = (df['marital'] == 'unknown').sum()
unknown_education = (df['education'] == 'unknown').sum()
unknown_default = (df['default'] == 'unknown').sum()
unknown_housing = (df['housing'] == 'unknown').sum()
unknown_loans = (df['loan'] == 'unknown').sum()
unknown_pday = (df['pdays'] == '999').sum()              # Treating never contacted as unknown
unknown_pouts = (df['poutcome'] == 'nonexistent').sum()  # this dataset marks "no previous outcome" as nonexistent

print("Unknown values\n")
print("Unknown jobs :", unknown_jobs)
print("Unknown marital status :", unknown_maritals)
print("Unknown education :", unknown_education)
print("Unknown default :", unknown_default)
print("Unknown housing :", unknown_housing)
print("Unknown loans :", unknown_loans)
print("Unknown pdays :", unknown_pday)
print("Unknown pouts :", unknown_pouts)


Unknown values

Unknown jobs : 330
Unknown marital status : 80
Unknown education : 1731
Unknown default : 8597
Unknown housing : 990
Unknown loans : 990
Unknown pdays : 39673
Unknown pouts : 35563


### Observations from Data Set

- Total number of rows: 41188
- Total number of attributes: 20 + output attribute (y)
- Yes samples - 4640
- No samples - 36548

- The number of yes samples is much smaller than the number of no samples, indicating that the data set is highly skewed (class-imbalanced).

- Unknown values :-
    - job - 330
    - marital - 80
    - education - 1731
    - default - 8597
    - housing - 990
    - loan - 990
    - pdays - 39673
    - poutcome - 35563
- A very large number of rows have pdays = 999 and poutcome = nonexistent. These represent clients that have not been contacted in the past.

In [7]:
# Removing duration attribute
df = df.drop(columns=['duration'])

# Separating columns as numerical and categorical
numeric = [
    'age', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
]
categorical = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]


In [8]:
# Taking care of categorical values

# Use dummy attributes for categorical values
# (an alternative, not used here, is pd.factorize, which assigns one integer code per category)
df = pd.get_dummies(df, columns=categorical, prefix=categorical)

# Converting output variable to integers with an explicit mapping.
# pd.factorize numbers labels in order of first appearance, so the meaning of 0/1
# would depend on which label happens to occur in the first row; a fixed mapping
# keeps no -> 0 and yes -> 1 regardless of row order.
label_codes = {'no': 0, 'yes': 1}
unexpected_labels = set(df['y'].unique()) - set(label_codes)
if unexpected_labels:
    # Fail loudly instead of silently producing NaN / extra class codes.
    raise ValueError("Unexpected values in output column y: %s" % sorted(unexpected_labels))
df['y'] = df['y'].map(label_codes).astype(int)

In [9]:
# Normalizing numerical values

# The numeric columns are still strings as read from the CSV, so convert them to
# float explicitly instead of relying on the scaler's implicit conversion.
scaler = preprocessing.StandardScaler()
rescaled_df = scaler.fit_transform(df[numeric].astype(float))

# Re-attach the scaled values using df's own index so the assignment lines up
# row-for-row even if df does not carry a default 0..n-1 index.
df[numeric] = pd.DataFrame(rescaled_df, columns=numeric, index=df.index)

In [10]:
# Saving as pickle file

# Variant saved here: dummy-encoded categoricals with standardized numeric columns.
# (A factorize-based variant would go to 'data/bank_additional_data_factorize.pkl'; it is not written here.)
additional_dummies_path = 'data/bank_additional_data_dummies.pkl'
df.to_pickle(additional_dummies_path)

In [74]:
# Principal Component Analysis

# Split off the output column; PCA is fitted on the feature columns only.
y_values = df['y'].values
features = df.drop(columns=['y'])

# Only the first 10 principal components are retained
# (PCA() without a limit would keep all of them).
pca = PCA(n_components=10)
components = pca.fit_transform(features)
var_ratios = pca.explained_variance_ratio_

# Running total of explained variance over the retained components;
# entry i covers the first i + 1 components.
cumulative_variance = np.cumsum(var_ratios)
n_retained = len(var_ratios)

# Calculating the number of attributes for 80% and 95% variance.
# Because only n_retained components are kept, their cumulative variance may never
# reach a threshold. The count is then capped at n_retained (as before), but the
# printout says so explicitly instead of reporting a misleading bare number.
threshold_counts = []
for threshold in (0.8, 0.95):
    # First index whose cumulative value is >= threshold (== n_retained if none is).
    first_reaching = int(np.searchsorted(cumulative_variance, threshold))
    needed = min(first_reaching + 1, n_retained)
    if first_reaching < n_retained:
        print(needed)
    else:
        print(needed, "(threshold %.2f not reached with %d retained components)" % (threshold, n_retained))
    threshold_counts.append(needed)
count, count1 = threshold_counts

# Replace df by the projected data: one column per retained component, plus y.
df = pd.DataFrame(data=components)
df['y'] = y_values

10
10


### Observations from PCA

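- There were 19 attributes (after removing duration); after dummy-encoding the categorical attributes, the resulting columns were subjected to PCA with only 10 components retained
- The run above printed 10 for both the 80% and the 95% variance thresholds
- Because only 10 components were retained, 10 is the largest value the count can take, so these figures do not show whether either threshold was actually reached; a PCA without a component limit is needed to obtain the true counts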


In [75]:
# Preview only the first 10 rows instead of rendering the whole frame
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y
0,-1.410808,1.044855,-1.504839,-0.035479,-0.674420,0.942602,0.541676,-0.151953,-0.144953,-0.208269,0
1,-1.492236,1.062992,-1.537888,0.062077,-0.800902,1.043149,0.556289,-0.026847,0.300815,0.556425,0
2,-1.359959,0.434188,-0.129926,-0.921168,-0.060613,0.852819,-0.717362,0.472941,0.194496,-0.438338,0
3,-1.392443,0.580200,-0.350919,-0.819922,-0.098910,0.966864,0.573939,-0.095172,-0.086614,-0.624038,0
4,-1.401831,0.979216,-1.358230,-0.096896,-0.633943,0.944694,0.465331,-0.100547,-0.264690,-0.210969,0
5,-1.497946,0.744891,-0.782792,-0.430506,-0.482862,1.122621,0.571079,0.059140,0.810783,0.326774,0
6,-1.397250,1.094465,-1.608873,-0.010656,-0.637127,0.827676,0.542967,-0.211509,-0.589107,-0.379958,0
7,-1.503121,0.653115,-0.550704,-0.575031,-0.395866,1.173157,0.573862,0.091377,1.055316,0.333709,0
8,-1.315879,0.013649,0.969964,-1.719263,0.567090,0.783060,-0.751126,0.426102,-0.097041,0.338668,0
9,-1.306606,0.004941,0.976022,-1.650575,0.484451,0.897293,-0.745559,0.445872,-0.118813,0.474080,0


In [76]:
# Print the column names, non-null counts and dtypes of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 11 columns):
0    41188 non-null float64
1    41188 non-null float64
2    41188 non-null float64
3    41188 non-null float64
4    41188 non-null float64
5    41188 non-null float64
6    41188 non-null float64
7    41188 non-null float64
8    41188 non-null float64
9    41188 non-null float64
y    41188 non-null int64
dtypes: float64(10), int64(1)
memory usage: 3.5 MB


In [77]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,-5.465183e-16,1.849329e-16,8.00456e-17,9.798686000000001e-17,-4.2782990000000006e-17,-4.2782990000000006e-17,1.359395e-16,1.826903e-16,-1.255888e-16,2.027017e-16,0.112654
std,2.028186,1.196995,1.080178,1.002392,0.9596998,0.8361479,0.6926512,0.683819,0.6371992,0.5555453,0.316173
min,-3.364377,-2.555953,-6.489687,-2.721793,-4.058649,-2.691589,-1.86625,-2.916052,-2.632857,-1.266601,0.0
25%,-1.475767,-0.6181178,-0.6143786,-0.6234059,-0.6071674,-0.8306479,-0.6684158,-0.3556995,-0.4445745,-0.4776802,0.0
50%,-1.293096,0.0476645,0.04366873,-0.09751535,-0.1317384,0.05813364,-0.04308235,-0.0563862,-0.02330935,0.06292974,0.0
75%,2.087327,0.4749275,0.6633857,0.4720083,0.4486139,0.7608282,0.616389,0.2463296,0.4325032,0.4106837,0.0
max,7.426492,9.846715,6.705459,13.34327,13.08671,2.893686,2.85263,7.211624,2.820911,1.802491,1.0


In [78]:
# Saving as pickle file

# Variant saved here: dummy-encoded data after PCA (the retained components plus y)
additional_pca_path = 'data/bank_additional_data_dummies_pca.pkl'
df.to_pickle(additional_pca_path)