# Bank Project - Preprocessing

In [3]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from collections import Counter

## Fetch Data

In [4]:
# Load the bank dataset; the file uses ';' (not ',') as its field separator.
df = pd.read_csv('./data/bank-full.csv', sep=';')

## Transform 'y' into 'target'

 + Encoding the yes/no outcome as 1/0 makes the target usable in numeric computations

In [5]:
# Binary-encode the outcome: 1 where y == 'yes', otherwise 0.
# Vectorized comparison instead of a Python-level loop over the Series.
df['target'] = (df['y'] == 'yes').astype('int64')

Preprocessing Function

In [247]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
class TargetTransformer(BaseEstimator, TransformerMixin):
    """Encode a yes/no label column as a 0/1 integer column vector.

    Values equal to the string ``'yes'`` map to 1; every other value
    (including missing values) maps to 0. The transformer is stateless:
    ``fit`` learns nothing from the data.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        ``y`` is accepted and ignored so that ``fit(X, y)`` and
        ``fit_transform(X, y)`` work as the scikit-learn API expects.
        """
        return self

    def transform(self, X):
        """Return an ``(n_samples, 1)`` integer array of 0/1 flags.

        Parameters
        ----------
        X : array-like of labels (Series, list, ndarray or single-column frame).

        Returns
        -------
        numpy.ndarray of shape ``(n_samples, 1)``.
        """
        # Object dtype guarantees an element-wise comparison; ravel() lets a
        # single-column 2-D selection behave like the equivalent 1-D one.
        labels = np.asarray(X, dtype=object).ravel()
        return (labels == 'yes').astype(int).reshape(-1, 1)

class LogPlus1Transformer(BaseEstimator, TransformerMixin):
    """Apply ``log(1 + x)`` element-wise to the selected numeric columns.

    The transformer is stateless: ``fit`` learns nothing from the data.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        ``y`` is accepted and ignored so that ``fit(X, y)`` and
        ``fit_transform(X, y)`` work as the scikit-learn API expects.
        """
        return self

    def transform(self, X):
        """Return ``log(1 + X)`` as a float ndarray with the same shape as ``X``.

        Accepts a DataFrame, Series, list or ndarray; the input is not modified.
        """
        # np.asarray makes this work for plain ndarrays too (no `.values`
        # needed), and log1p is the numerically accurate form of log(1 + x).
        return np.log1p(np.asarray(X, dtype=float))

class CampaignTransformer(BaseEstimator, TransformerMixin):
    """Bin a 1-D numeric column into quartile intervals.

    The bin edges (0%, 25%, 50%, 75% and 100% quantiles) are learned in
    ``fit`` and reused by ``transform``, so data transformed later is binned
    with the same edges as the data the transformer was fitted on.
    Duplicate edges are dropped by ``pd.cut`` and the lowest edge is inclusive.
    """

    # Quantile levels that define the bin edges.
    _QUANTILES = (0, 0.25, 0.5, 0.75, 1)

    def fit(self, X, y=None):
        """Learn the quantile bin edges from ``X`` and return ``self``.

        ``y`` is accepted and ignored for scikit-learn API compatibility.
        """
        self.bin_edges_ = np.quantile(np.asarray(X).ravel(), self._QUANTILES)
        return self

    def transform(self, X):
        """Return an ``(n_samples, 1)`` object array of ``pandas.Interval`` bins.

        Values that fall outside the fitted edges are returned as NaN.
        If the transformer has not been fitted, the edges are computed from
        ``X`` itself, which matches the previous stateless behaviour.
        """
        values = np.asarray(X).ravel()
        edges = getattr(self, 'bin_edges_', None)
        if edges is None:
            edges = np.quantile(values, self._QUANTILES)
        binned = pd.cut(values, edges, duplicates='drop', include_lowest=True)
        return np.asarray(binned, dtype=object).reshape(-1, 1)

In [248]:
# Smoke test: run the transformer on the campaign column and display the output shape.
cti = CampaignTransformer()
cti.fit_transform(df.campaign).shape

(45211, 1)

In [250]:
from sklearn.compose import ColumnTransformer

# Each entry is (name, transformer, column selection). Columns that are not
# listed are dropped, because ColumnTransformer defaults to remainder='drop'.
# NOTE(review): OneHotEncoder's `sparse` keyword was renamed `sparse_output`
# in scikit-learn 1.2 -- confirm against the installed version.
ct = ColumnTransformer(
    [
        ("target", TargetTransformer(), 'y'),
        ("logplus1", LogPlus1Transformer(), ['duration', 'age']),
        ("campaign", CampaignTransformer(), 'campaign'),
        (
            'ohe',
            OneHotEncoder(sparse=False),
            [
                'month', 'day', 'job', 'marital', 'education',
                'previous', 'default', 'housing', 'loan',
            ],
        ),
    ]
)

# Fit every transformer on the full frame and show the combined array.
df_trans = ct.fit_transform(df)
df_trans

array([[0, 5.568344503761097, 4.07753744390572, ..., 1.0, 1.0, 0.0],
       [0, 5.0238805208462765, 3.8066624897703196, ..., 1.0, 1.0, 0.0],
       [0, 4.343805421853684, 3.5263605246161616, ..., 1.0, 0.0, 1.0],
       ...,
       [1, 7.028201432058005, 4.290459441148391, ..., 0.0, 1.0, 0.0],
       [0, 6.2324480165505225, 4.060443010546419, ..., 0.0, 1.0, 0.0],
       [0, 5.8916442118257715, 3.6375861597263857, ..., 0.0, 1.0, 0.0]],
      dtype=object)

In [151]:
# Custom transformer that engineers two optional features (bathrooms per
# bedroom and the age of the house in 2019), each switched on or off by a
# boolean constructor argument.
class NumericalTransformer(BaseEstimator, TransformerMixin):
    """Add ``bath_per_bed`` and/or ``years_old`` and drop their source columns.

    Parameters
    ----------
    bath_per_bed : bool, default True
        Add ``bathrooms / bedrooms`` as ``bath_per_bed`` and drop ``bathrooms``.
    years_old : bool, default True
        Add ``2019 - yr_built`` as ``years_old`` and drop ``yr_built``.
    """

    def __init__(self, bath_per_bed=True, years_old=True):
        # Stored under the same names as the constructor arguments, which is
        # what BaseEstimator.get_params() / clone() look up.
        self.bath_per_bed = bath_per_bed
        self.years_old = years_old

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X, y=None):
        """Return the engineered features as a numpy array.

        ``X`` must be a DataFrame containing the columns required by the
        enabled options. The caller's frame is left untouched.
        """
        # Work on a copy so the caller's DataFrame is not mutated.
        X = X.copy()
        if self.bath_per_bed:
            X['bath_per_bed'] = X['bathrooms'] / X['bedrooms']
            # drop() returns a new frame; the result must be kept.
            X = X.drop('bathrooms', axis=1)
        if self.years_old:
            X['years_old'] = 2019 - X['yr_built']
            X = X.drop('yr_built', axis=1)

        # A division by zero bedrooms yields +/-inf; turn those into NaN.
        X = X.replace([np.inf, -np.inf], np.nan)
        return X.values

# Scratch check: quartile-bin the campaign column. The abandoned `def j`
# stub was removed and the missing closing parenthesis added, because
# together they made the whole cell a syntax error.
pd.cut(df.campaign, np.quantile(df.campaign, [0,0.25,0.5,0.75,1]), duplicates='drop', include_lowest=True)

In [6]:
# Preview the first rows of the frame, including the derived 'target' column.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,target
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,0


In [8]:
# String-valued columns of the bank dataset.
# 'loan' was previously misspelled 'load', which is not a column of df.
cat_features = ['job','marital','education','default','housing','loan','contact','poutcome','month']

In [9]:
# Numeric columns of interest. This is a subset: other numeric columns such
# as campaign and pdays are not listed here.
num_features = [
    'age',
    'balance',
    'previous',
    'duration',
]

In [204]:
# Summary statistics for the campaign column.
df.campaign.describe()

count    45211.000000
mean         2.763841
std          3.098021
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         63.000000
Name: campaign, dtype: float64

In [220]:
# Quantile at q=0, i.e. the minimum of the campaign column.
np.quantile(df.campaign, [0])

array([1], dtype=int64)

In [237]:
# Quartile-bin the campaign column and display the shape of the resulting values.
pd.cut(df['campaign'], np.quantile(df['campaign'], [0,0.25,0.5,0.75,1]), duplicates='drop', include_lowest=True).values.shape

(45211,)