### Loading the data

In [None]:
#Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.base import BaseEstimator,TransformerMixin

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the semicolon-separated bank-marketing CSV from GitHub and preview the first rows
path = 'https://raw.githubusercontent.com/deepthikarun/banking_data/main/bank-additional-full.csv'
data = pd.read_csv(path, sep=';')
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:
# Column dtypes and non-null counts, to check for missing values before cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

### Data Cleaning Pipeline

In [None]:
# Renaming columns -- disabled: the names below are assigned by ArrayToDf after the cleaning step instead
# data.rename(columns = {'default':'credit_default','housing':'housing_loan', 'loan':'personal_loan', 
#                                'day':'last_contacted_day', 'month':'last_contacted_month', 'duration':'last_call_duration' ,
#                               'campaign':'contacts_during_campaign' ,'pdays':'days_passed' ,'previous':'contacts_before_campaign' ,          
#                               'y': 'deposit'}, inplace = True)


In [None]:
# Columns removed from the data set; passed to the 'drop' entry of ct_dataCleaning
drop_cols = [
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
    'contact',
    'poutcome',
]

In [None]:
class DataCleaning(BaseEstimator,TransformerMixin):
    """Collapse raw bank-marketing columns into coarser categories.

    The transformer is stateless: ``fit`` learns nothing and returns ``self``.
    ``transform`` expects a DataFrame containing the columns ``job``,
    ``education``, ``pdays``, ``month``, ``previous`` and ``duration`` and
    returns a modified *copy*, so the caller's frame is never altered.
    """

    # Job titles folded into broader employment groups. Any title that is not
    # listed (for example 'retired') is kept unchanged.
    _JOB_GROUPS = {
        'services': 'salaried',
        'admin.': 'salaried',
        'blue-collar': 'salaried',
        'technician': 'salaried',
        'management': 'salaried',
        'housemaid': 'salaried',
        'self-employed': 'self-employed',
        'entrepreneur': 'self-employed',
    }

    # Education levels folded into primary / secondary / tertiary. Unlisted
    # values are kept unchanged.
    _EDUCATION_GROUPS = {
        'basic.4y': 'primary',
        'basic.6y': 'primary',
        'high.school': 'secondary',
        'basic.9y': 'secondary',
        'professional.course': 'tertiary',
        'university.degree': 'tertiary',
    }

    # Calendar months folded into three four-month periods.
    _MONTH_GROUPS = {
        'jan': 'jan-april', 'feb': 'jan-april', 'mar': 'jan-april', 'apr': 'jan-april',
        'may': 'may-aug', 'jun': 'may-aug', 'jul': 'may-aug', 'aug': 'may-aug',
        'sep': 'sep-dec', 'oct': 'sep-dec', 'nov': 'sep-dec', 'dec': 'sep-dec',
    }

    # pdays strictly below this value is labelled 'recent', anything else
    # 'never_contacted'.
    # NOTE(review): the sample rows show pdays == 999 together with
    # previous == 0, so 999 is presumably a "never contacted" sentinel -- confirm.
    _RECENT_PDAYS_LIMIT = 50

    # Boundary between the '<10' and '>=10' buckets of the 'previous' column.
    _MANY_CONTACTS = 10

    def __init__(self):
        pass

    def fit(self,X,y=None):
        """Nothing to learn; present only to satisfy the transformer API."""
        return self

    def transform(self,X,y=None):
        """Return a copy of ``X`` with the six handled columns recoded.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``job``, ``education``, ``pdays``, ``month``,
            ``previous`` and ``duration``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Same rows and column order as ``X``; ``duration`` is converted
            from seconds to minutes and the other five columns hold category
            labels.
        """
        cleaned = X.copy()

        cleaned['job'] = self._regroup(cleaned['job'], self._JOB_GROUPS)
        cleaned['education'] = self._regroup(cleaned['education'], self._EDUCATION_GROUPS)
        # Unrecognised month labels are passed through unchanged so that the
        # column always keeps exactly one value per row.
        cleaned['month'] = self._regroup(cleaned['month'], self._MONTH_GROUPS)

        cleaned['pdays'] = np.where(
            cleaned['pdays'] < self._RECENT_PDAYS_LIMIT, 'recent', 'never_contacted'
        )

        previous = cleaned['previous']
        cleaned['previous'] = np.select(
            [(previous > 0) & (previous < self._MANY_CONTACTS),
             previous >= self._MANY_CONTACTS],
            ['<10', '>=10'],
            default='0',  # zero (or non-positive / missing) previous contacts
        )

        # Call duration: seconds -> minutes
        cleaned['duration'] = cleaned['duration'] / 60
        return cleaned

    @staticmethod
    def _regroup(column, groups):
        """Map each value through ``groups``, keeping unlisted values as they are."""
        return column.map(lambda value: groups.get(value, value))
  


class LogTransform(BaseEstimator,TransformerMixin):
    """Apply ``log(x + 0.1)`` to the three numeric feature columns.

    The transformer is stateless: ``fit`` learns nothing. ``transform``
    expects a DataFrame containing ``last_call_duration``, ``age`` and
    ``contacts_during_campaign`` and returns a modified *copy*; the caller's
    frame is left untouched.
    """

    # Columns that are log-transformed, in the order they are processed.
    _LOG_COLUMNS = ('last_call_duration', 'age', 'contacts_during_campaign')

    # Added before taking the logarithm so that a value of 0 maps to
    # log(0.1) rather than -inf.
    _OFFSET = 0.1

    def __init__(self):
        pass

    def fit(self,X,y=None):
        """Nothing to learn; present only to satisfy the transformer API."""
        return self

    def transform(self,X,y=None):
        """Return a copy of ``X`` with the numeric columns log-transformed.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns listed in ``_LOG_COLUMNS``; their values
            must be numeric or convertible to float.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Same rows and column order as ``X``.
        """
        transformed = X.copy()
        for column in self._LOG_COLUMNS:
            # The frame built from a ColumnTransformer array can carry object
            # dtype, so cast to float before calling the NumPy ufunc.
            transformed[column] = np.log(transformed[column].astype(float) + self._OFFSET)
        return transformed


#### ArrayToDfUpdated: array -> DataFrame, using the column order produced after the log-transform / scaling steps
class ArrayToDfUpdated(BaseEstimator,TransformerMixin):
    """Wrap a 2-D array in a DataFrame with a fixed set of 14 column names.

    The names follow the column order used after ``ct_LogTransform`` and
    ``ct_scaling``: the three transformed numeric columns first, then the
    passthrough columns.
    """

    # Column labels, in the order the incoming array is expected to have.
    _COLUMN_NAMES = (
        'last_call_duration', 'age', 'contacts_during_campaign',
        'days_passed', 'job', 'education', 'last_contacted_month',
        'contacts_before_campaign', 'marital', 'credit_default',
        'housing_loan', 'personal_loan', 'day_of_week', 'deposit',
    )

    def __init__(self):
        """No configuration is required."""

    def fit(self,X,y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Return ``X`` as a DataFrame indexed 0..n-1 with the fixed column names."""
        row_count = X.shape[0]
        return pd.DataFrame(
            X,
            index=list(range(row_count)),
            columns=list(self._COLUMN_NAMES),
        )

#### ArrayToDf: array -> DataFrame, using the column order produced by ct_dataCleaning
class ArrayToDf(BaseEstimator,TransformerMixin):
    """Wrap a 2-D array in a DataFrame with a fixed set of 14 column names.

    The names follow the column order emitted by ``ct_dataCleaning``: the six
    columns handled by ``DataCleaning`` first, then the passthrough columns.
    """

    # Column labels, in the order the incoming array is expected to have.
    _COLUMN_NAMES = (
        'days_passed', 'job', 'education', 'last_contacted_month',
        'contacts_before_campaign', 'last_call_duration',
        'age', 'marital', 'credit_default', 'housing_loan',
        'personal_loan', 'day_of_week', 'contacts_during_campaign',
        'deposit',
    )

    def __init__(self):
        """No configuration is required."""

    def fit(self,X,y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Return ``X`` as a DataFrame indexed 0..n-1 with the fixed column names."""
        row_count = X.shape[0]
        return pd.DataFrame(
            X,
            index=list(range(row_count)),
            columns=list(self._COLUMN_NAMES),
        )

# Stage 1 - cleaning.
# DataCleaning receives its columns by name (rather than by position) in the
# order ArrayToDf expects them; drop_cols are removed and every other column
# is passed through unchanged after the cleaned ones.
# NOTE: `data` is overwritten below, so this cell cannot be re-run without
# first re-running the cell that loads the raw CSV.
ct_dataCleaning=make_column_transformer(
    (DataCleaning(),['pdays','job','education','month','previous','duration']),
    ('drop',drop_cols),
    remainder='passthrough',
)
dataCleaning_pipeline=make_pipeline(ct_dataCleaning,ArrayToDf())
data=dataCleaning_pipeline.fit_transform(data)

# Stage 2 - log-transform the three numeric columns; the transformed columns
# come first in the output, which is the order ArrayToDfUpdated names.
ct_LogTransform=make_column_transformer(
    (LogTransform(),['last_call_duration','age','contacts_during_campaign']),
    remainder='passthrough',
)
featureEngg_pipeline=make_pipeline(ct_LogTransform,ArrayToDfUpdated())
data=featureEngg_pipeline.fit_transform(data)



In [None]:
# Inspect the cleaned, log-transformed frame (pandas truncates the display of long frames)
data

Unnamed: 0,last_call_duration,age,contacts_during_campaign,days_passed,job,education,last_contacted_month,contacts_before_campaign,marital,credit_default,housing_loan,personal_loan,day_of_week,deposit
0,1.4929,4.02714,0.0953102,never_contacted,salaried,primary,may-aug,0,married,no,no,no,mon,no
1,0.949081,4.0448,0.0953102,never_contacted,salaried,secondary,may-aug,0,married,unknown,no,no,mon,no
2,1.35239,3.61362,0.0953102,never_contacted,salaried,secondary,may-aug,0,married,no,yes,no,mon,no
3,0.961901,3.69138,0.0953102,never_contacted,salaried,primary,may-aug,0,married,no,no,no,mon,no
4,1.65186,4.02714,0.0953102,never_contacted,salaried,secondary,may-aug,0,married,no,no,yes,mon,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,1.7346,4.29183,0.0953102,never_contacted,retired,tertiary,sep-dec,0,married,no,yes,no,fri,yes
41184,1.86923,3.83081,0.0953102,never_contacted,salaried,tertiary,sep-dec,0,married,no,no,no,fri,no
41185,1.17865,4.02714,0.741937,never_contacted,retired,tertiary,sep-dec,0,married,no,yes,no,fri,no
41186,2.01045,3.78646,0.0953102,never_contacted,salaried,tertiary,sep-dec,0,married,no,no,no,fri,yes


In [None]:
from sklearn.preprocessing import StandardScaler
# from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder



# Standardise the three numeric columns (selected by name); the remaining
# columns pass through, keeping the column order ArrayToDfUpdated names.
# NOTE(review): the scaler is fitted on the whole data set, i.e. before any
# train/test split -- confirm this is intended.
ct_scaling=make_column_transformer(
    (StandardScaler(),['last_call_duration','age','contacts_during_campaign']),
    remainder='passthrough',
)
scaling_pipeline=make_pipeline(ct_scaling,ArrayToDfUpdated())
data=scaling_pipeline.fit_transform(data)

# One-hot encode every non-numeric column.
# NOTE(review): the list includes 'deposit', which looks like the target
# column -- confirm it should be encoded together with the features.
ct_encoding=make_column_transformer(
    (OneHotEncoder(),['days_passed','job','education','last_contacted_month',
                      'contacts_before_campaign','marital','credit_default',
                      'housing_loan','personal_loan','day_of_week','deposit']),
    remainder='passthrough',
)
# encoding_pipeline=make_pipeline(ct_encoding,ArrayToDfUpdated())
# After this call `data` is the encoder output (an array or sparse matrix), no longer a DataFrame.
data=ct_encoding.fit_transform(data)





In [35]:
# final_pipeline=make_pipeline(ct_dataCleaning,ArrayToDf(),ct_LogTransform,ArrayToDfUpdated(),
#                                        ct_scaling,ArrayToDfUpdated(),ct_encoding)
# final_pipeline.fit_transform(data)

In [None]:
from sklearn.preprocessing import StandardScaler
# from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder



# Build the train-set preprocessing: scale numeric columns, one-hot encode categorical ones.
# NOTE(review): x_train and num_cols are not defined in any cell visible here --
# confirm they are created before this cell when running on a fresh kernel.
#for numerical variables
x_train_num=x_train[num_cols]

# Numeric branch: a single StandardScaler step
num_pipeline=Pipeline([
                       ('std_scaler',StandardScaler()),
                      
                      ])

# Scaled numeric subset; in the visible code it is only referenced by the
# commented-out concatenate further down.
x_train_num_prepared=num_pipeline.fit_transform(x_train_num)

#for categorical variables

# Categorical columns to one-hot encode, using the renamed column names
cat_cols_for_train=['job', 'marital', 'education', 'credit_default', 'housing_loan',
       'personal_loan', 'last_contacted_month', 'day_of_week', 'days_passed',
       'contacts_before_campaign']
x_train_cat=x_train[cat_cols_for_train]
# Categorical branch: a single OneHotEncoder step
cat_pipeline=Pipeline([('encoder',OneHotEncoder())])

# Encoded categorical subset; like x_train_num_prepared, only referenced by the
# commented-out concatenate below in the visible code.
x_train_cat_prepared=cat_pipeline.fit_transform(x_train_cat)

# x_train_prepared=np.concatenate((x_train_num_prepared,x_train_cat_prepared.toarray()),axis=1)

from sklearn.compose import ColumnTransformer

# Column labels of each subset (presumably DataFrames, whose iteration yields column names -- confirm)
num_attribs=list(x_train_num)
cat_attribs=list(x_train_cat)

# Route each column group to its pipeline; no `remainder` is given, so
# ColumnTransformer's default applies to any column not listed here.
full_pipeline=ColumnTransformer([
                                 ('num',num_pipeline,num_attribs),
                                 ('cat',cat_pipeline,cat_attribs),
])

# Final prepared training matrix: both branches fitted and applied to x_train in one call
x_train_prepared=full_pipeline.fit_transform(x_train)