In [1]:
import pandas as pd

In [2]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import OneHotEncoder

from feature_engine.imputation import DropMissingData

from sklearn import set_config
set_config(display="diagram")
set_config(transform_output="pandas")
import numpy as np
import pprint

In [3]:
df = pd.read_csv("data/dataset_raw/banking.csv")

In [4]:
def edu_trans(df):
    df['education'] = df['education'].replace(['basic.4y', 'basic.6y', 'basic.9y'], 'basic')
    return df 

In [5]:
tran = FunctionTransformer(edu_trans)

In [6]:
trans = tran.fit_transform(df)

In [9]:
trans['education'].unique()

array(['basic', 'unknown', 'university.degree', 'high.school',
       'professional.course', 'illiterate'], dtype=object)

In [116]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


In [23]:
df.job.unique().tolist()

['blue-collar',
 'technician',
 'management',
 'services',
 'retired',
 'admin.',
 'housemaid',
 'unemployed',
 'entrepreneur',
 'self-employed',
 'unknown',
 'student']

In [39]:
cols = ['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'contact',
  'month',
  'day_of_week',
  'poutcome']

In [93]:

feature_dict = {}
for feature in cols:
    idict = {}
    for index, item in enumerate(df[f'{feature}'].unique().tolist()):
#     print(f'{item} : {index}')
        idict[index+1] = item
        feature_dict[feature] = idict
pprint.pprint(feature_dict)

{'contact': {1: 'cellular', 2: 'telephone'},
 'day_of_week': {1: 'thu', 2: 'fri', 3: 'tue', 4: 'mon', 5: 'wed'},
 'default': {1: 'unknown', 2: 'no', 3: 'yes'},
 'education': {1: 'basic.4y',
               2: 'unknown',
               3: 'university.degree',
               4: 'high.school',
               5: 'basic.9y',
               6: 'professional.course',
               7: 'basic.6y',
               8: 'illiterate'},
 'housing': {1: 'yes', 2: 'no', 3: 'unknown'},
 'job': {1: 'blue-collar',
         2: 'technician',
         3: 'management',
         4: 'services',
         5: 'retired',
         6: 'admin.',
         7: 'housemaid',
         8: 'unemployed',
         9: 'entrepreneur',
         10: 'self-employed',
         11: 'unknown',
         12: 'student'},
 'loan': {1: 'no', 2: 'yes', 3: 'unknown'},
 'marital': {1: 'married', 2: 'single', 3: 'divorced', 4: 'unknown'},
 'month': {1: 'aug',
           2: 'nov',
           3: 'jun',
           4: 'apr',
           5: 'jul',
   

In [94]:
encode = {'contact': {1: 'cellular', 2: 'telephone'},
 'day_of_week': {1: 'thu', 2: 'fri', 3: 'tue', 4: 'mon', 5: 'wed'},
 'default': {1: 'unknown', 2: 'no', 3: 'yes'},
 'education': {1: 'unknown',
               2: 'illiterate',
               3: 'basic',
               4: 'high.school',
               5: 'university.degree',           
               6: 'professional.course',
               },
 'housing': {1: 'yes', 2: 'no', 3: 'unknown'},
 'job': {1: 'blue-collar',
         2: 'technician',
         3: 'management',
         4: 'services',
         5: 'retired',
         6: 'admin.',
         7: 'housemaid',
         8: 'unemployed',
         9: 'entrepreneur',
         10: 'self-employed',
         11: 'unknown',
         12: 'student'},
 'loan': {1: 'no', 2: 'yes', 3: 'unknown'},
 'marital': {1: 'married', 2: 'single', 3: 'divorced', 4: 'unknown'},
 'month': {1: 'mar',
           2: 'apr',
           3: 'may',
           4: 'jun',
           5: 'jul',
           6: 'aug',
           7: 'sep',
           8: 'oct',
           9: 'nov',
           10: 'dec'},
 'poutcome': {1: 'nonexistent', 2: 'success', 3: 'failure'}}

In [6]:
df.marital.unique()

array(['married', 'single', 'divorced', 'unknown'], dtype=object)

In [7]:
df.education.unique()

array(['basic.4y', 'unknown', 'university.degree', 'high.school',
       'basic.9y', 'professional.course', 'basic.6y', 'illiterate'],
      dtype=object)

In [8]:
df.default.unique() # - Ordinal

array(['unknown', 'no', 'yes'], dtype=object)

In [9]:
df.housing.unique() # - Ordinal

array(['yes', 'no', 'unknown'], dtype=object)

In [10]:
df.loan.unique()  # - Ordinal Encode

array(['no', 'yes', 'unknown'], dtype=object)

In [11]:
df.contact.unique() # - OneHot

array(['cellular', 'telephone'], dtype=object)

In [12]:
df.month.unique() # - Ordinal

array(['aug', 'nov', 'jun', 'apr', 'jul', 'may', 'oct', 'mar', 'sep',
       'dec'], dtype=object)

In [13]:
df.day_of_week.unique() # - Ordinal

array(['thu', 'fri', 'tue', 'mon', 'wed'], dtype=object)

In [14]:
df.poutcome.unique() # - Ordinal

array(['nonexistent', 'success', 'failure'], dtype=object)

In [49]:
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(
            categories=[['nonexistent', 'success', 'failure']],
            handle_unknown='use_encoded_value',
            unknown_value=-1,           
)

# data_train = encoder.fit_transform(df[['poutcome']])



In [79]:
trans = ColumnTransformer(
        [('ordinal Encoder - poutcome', OrdinalEncoder(
            categories=[['nonexistent', 'success', 'failure']],
            handle_unknown='use_encoded_value',
            unknown_value=-1,), ['poutcome']),
         
         ('ordinal Encoder - Day of weel', OrdinalEncoder(
            categories=[['mon', 'tue', 'wed', 'thu', 'fri']],
            handle_unknown='use_encoded_value',
            unknown_value=-1,), ['day_of_week']),
         
         ('ordinal Encoder - loan', OrdinalEncoder(
            categories=[['no', 'yes', 'unknown']],
            handle_unknown='use_encoded_value',
            unknown_value=-1,), ['loan']),
         
        ('OneHot - contact', OneHotEncoder(sparse_output=False, 
                                           handle_unknown='ignore'), ['contact'])
        
        
        ], remainder='drop', verbose_feature_names_out=False
)

In [80]:
df_transed = trans.fit_transform(df)

In [92]:
trans.transformers_

[('ordinal Encoder - poutcome',
  OrdinalEncoder(categories=[['nonexistent', 'success', 'failure']],
                 handle_unknown='use_encoded_value', unknown_value=-1),
  ['poutcome']),
 ('ordinal Encoder - Day of weel',
  OrdinalEncoder(categories=[['mon', 'tue', 'wed', 'thu', 'fri']],
                 handle_unknown='use_encoded_value', unknown_value=-1),
  ['day_of_week']),
 ('ordinal Encoder - loan',
  OrdinalEncoder(categories=[['no', 'yes', 'unknown']],
                 handle_unknown='use_encoded_value', unknown_value=-1),
  ['loan']),
 ('OneHot - contact',
  OneHotEncoder(handle_unknown='ignore', sparse_output=False),
  ['contact']),
 ('remainder',
  'drop',
  [0, 1, 2, 3, 4, 5, 8, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20])]

In [81]:
df_transed

Unnamed: 0,poutcome,day_of_week,loan,contact_cellular,contact_telephone
0,0.0,3.0,0.0,1.0,0.0
1,0.0,4.0,0.0,1.0,0.0
2,1.0,3.0,0.0,1.0,0.0
3,0.0,4.0,0.0,1.0,0.0
4,1.0,4.0,0.0,1.0,0.0
...,...,...,...,...,...
41183,0.0,3.0,1.0,0.0,1.0
41184,0.0,3.0,0.0,0.0,1.0
41185,0.0,2.0,1.0,0.0,1.0
41186,0.0,1.0,1.0,0.0,1.0


In [18]:
encoder.categories

[['nonexistent', 'success', 'failure']]

In [17]:
data_train

Unnamed: 0,poutcome
0,0.0
1,0.0
2,1.0
3,0.0
4,1.0
...,...
41183,0.0
41184,0.0
41185,0.0
41186,0.0


In [85]:
#  ('ordinal Encoder - Day of weel', OrdinalEncoder(
#             categories=[['mon', 'tue', 'wed', 'thu', 'fri']],
#             handle_unknown='use_encoded_value',
#             unknown_value=-1,), ['day_of_week']),

# ('ordinal Encoder - loan', OrdinalEncoder(
#             categories=[['no', 'yes', 'unknown']],
#             handle_unknown='use_encoded_value',
#             unknown_value=-1,), ['loan']),

In [123]:
cat_trans = ColumnTransformer(
        [('ordinal Encoder', OrdinalEncoder(
            categories=[list(encode['day_of_week'].values()),
                        list(encode['education'].values()),
                        list(encode['job'].values()),
                        list(encode['marital'].values()),
                        list(encode['month'].values()),
                       ],
            handle_unknown='use_encoded_value',
            unknown_value=-1,), 
          ['day_of_week', 'education', 'job', 'marital', 'month']),
         
        
         
         
         
        ('OneHot Encoder', OneHotEncoder(sparse_output=False, 
                                           handle_unknown='ignore'), 
         ['contact', 'default', 'housing', 'loan', 'poutcome'])
        
        
        ], remainder='passthrough', verbose_feature_names_out=False
)

In [110]:
tr

In [111]:
df_transed = tr.fit_transform(df)

In [114]:
df_transed

Unnamed: 0,day_of_week,education,job,marital,month,contact_cellular,contact_telephone,default_no,default_unknown,default_yes,...,duration,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,0.0,-1.0,0.0,0.0,5.0,1.0,0.0,0.0,1.0,0.0,...,210,1,999,0,1.4,93.444,-36.1,4.963,5228.1,0
1,1.0,0.0,1.0,0.0,8.0,1.0,0.0,1.0,0.0,0.0,...,138,1,999,0,-0.1,93.200,-42.0,4.021,5195.8,0
2,0.0,4.0,2.0,1.0,3.0,1.0,0.0,1.0,0.0,0.0,...,339,3,6,2,-1.7,94.055,-39.8,0.729,4991.6,1
3,1.0,3.0,3.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,185,2,999,0,-1.8,93.075,-47.1,1.405,5099.1,0
4,1.0,-1.0,4.0,0.0,5.0,1.0,0.0,1.0,0.0,0.0,...,137,1,3,1,-2.9,92.201,-31.4,0.869,5076.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,0.0,3.0,4.0,0.0,3.0,0.0,1.0,0.0,1.0,0.0,...,222,1,999,0,1.4,94.465,-41.8,4.866,5228.1,0
41184,0.0,-1.0,6.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,...,196,2,999,0,1.1,93.994,-36.4,4.860,5191.0,0
41185,4.0,4.0,5.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,...,62,3,999,0,1.1,93.994,-36.4,4.857,5191.0,0
41186,2.0,5.0,1.0,0.0,7.0,0.0,1.0,1.0,0.0,0.0,...,200,2,999,0,-3.4,92.431,-26.9,0.742,5017.5,0


In [136]:
num_col = ColumnTransformer(
    
    [
        ('Scaling Numerical features', StandardScaler(), ['age',
  'duration',
  'campaign',
  'pdays',
  'previous',
  'emp_var_rate',
  'cons_price_idx',
  'cons_conf_idx',
  'euribor3m',
  'nr_employed']),
        
        
    ],
            
            remainder='passthrough', verbose_feature_names_out=False)

In [137]:
num_col

In [138]:
# DropMissingData()

In [141]:
pipe = Pipeline(
    [
        ('DropMissingValues', DropMissingData()),
        ('categorical Transformer', cat_trans),
        ('Numerical Transformer', num_col)
        
    ]
)

In [142]:
pipe

In [143]:
from sklearn.model_selection import train_test_split 

X = df.drop('y', axis=1)
y = df['y']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [130]:
X.shape

(41188, 20)

In [131]:
X_train.shape

(32950, 20)

In [132]:
y_train.shape

(32950,)

In [122]:
X_test.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed
1703,29,admin.,single,university.degree,no,yes,no,cellular,may,tue,159,8,999,0,nonexistent,-1.8,92.893,-46.2,1.344,5099.1
23509,34,technician,married,professional.course,no,no,no,cellular,may,tue,278,2,999,0,nonexistent,-1.8,92.893,-46.2,1.291,5099.1
19057,35,admin.,married,university.degree,no,no,no,telephone,may,thu,420,4,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0
4710,31,technician,single,professional.course,no,yes,no,cellular,aug,thu,360,3,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1
13162,49,entrepreneur,married,university.degree,no,no,no,cellular,may,tue,242,2,999,0,nonexistent,-1.8,92.893,-46.2,1.291,5099.1


In [144]:
transformed_x = pipe.fit_transform(X)

In [147]:
transformed_x

Unnamed: 0,age,duration,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,...,default_yes,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0.381527,-0.186230,-0.565922,0.195414,-0.349494,0.839061,-0.227465,0.951267,0.773575,0.845170,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.245157,-0.463926,-0.565922,0.195414,-0.349494,-0.115781,-0.649003,-0.323542,0.230456,0.398115,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.153816,0.311309,0.156105,-5.117342,3.691766,-1.134279,0.828107,0.151810,-1.667578,-2.428157,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,-0.098268,-0.282652,-0.204909,0.195414,-0.349494,-1.197935,-0.864955,-1.425496,-1.277824,-0.940281,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.437075,-0.467783,-0.565922,-5.133393,1.671136,-1.898153,-2.374889,1.966794,-1.586859,-1.257233,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,1.820911,-0.139947,-0.565922,0.195414,-0.349494,0.839061,1.536429,-0.280328,0.717649,0.845170,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
41184,-0.865939,-0.240227,-0.204909,0.195414,-0.349494,0.648092,0.722722,0.886447,0.714190,0.331680,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
41185,0.189609,-0.757050,0.156105,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
41186,0.765363,-0.224799,-0.204909,0.195414,-0.349494,-2.216433,-1.977538,2.939106,-1.660082,-2.069683,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [148]:
import joblib

In [149]:
joblib.dump(pipe, 'preprocessor.pkl')

['preprocessor.pkl']

In [150]:
processor = joblib.load('preprocessor.pkl')

In [151]:
processor

In [152]:
dat = processor.fit_transform(X[:200])

In [153]:
dat

Unnamed: 0,age,duration,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,...,default_unknown,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0.475734,-0.108814,-0.525583,0.229415,-0.351952,0.820652,-0.235512,0.949796,0.765148,0.855822,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.410582,-0.427163,-0.525583,0.229415,-0.351952,-0.124074,-0.650796,-0.307506,0.221374,0.383380,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.186218,0.461562,0.135528,-4.350591,3.257810,-1.131781,0.804400,0.161318,-1.678947,-2.603389,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,-0.043626,-0.219351,-0.195028,0.229415,-0.351952,-1.194763,-0.863544,-1.394328,-1.288723,-1.031021,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.618326,-0.431584,-0.525583,-4.364428,1.452929,-1.887561,-2.351079,1.951377,-1.598132,-1.365972,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.202838,-0.966588,4.763305,0.229415,-0.351952,0.820652,1.502214,-0.264886,0.764571,0.855822,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
196,-0.355242,-0.838364,1.788305,0.229415,-0.351952,0.820652,-0.235512,0.949796,0.764571,0.855822,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
197,-1.705578,-0.497907,-0.525583,0.229415,-0.351952,-1.194763,-0.863544,-1.394328,-1.296228,-1.031021,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
198,-0.874602,-0.718983,-0.525583,0.229415,-0.351952,0.820652,-0.235512,0.949796,0.766302,0.855822,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
