In [1]:
import pandas as pd

In [13]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from feature_engine.imputation import DropMissingData

from sklearn import set_config
set_config(display="diagram")
set_config(transform_output="pandas")
import numpy as np
import pprint

In [85]:
df = pd.read_csv("data/dataset_raw/banking.csv")

In [116]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


In [39]:
# Names of the categorical (string-valued) columns whose distinct values are
# inspected in the cells below.
cols = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]

In [93]:

# Map every categorical column to {1-based position: category label}, where
# the position follows the order in which `Series.unique()` returns the values.
# Each feature's entry is built in one expression, so it is assigned exactly
# once and is present (as an empty dict) even if a column has no values.
feature_dict = {
    feature: dict(enumerate(df[feature].unique().tolist(), start=1))
    for feature in cols
}
pprint.pprint(feature_dict)

{'contact': {1: 'cellular', 2: 'telephone'},
 'day_of_week': {1: 'thu', 2: 'fri', 3: 'tue', 4: 'mon', 5: 'wed'},
 'default': {1: 'unknown', 2: 'no', 3: 'yes'},
 'education': {1: 'basic.4y',
               2: 'unknown',
               3: 'university.degree',
               4: 'high.school',
               5: 'basic.9y',
               6: 'professional.course',
               7: 'basic.6y',
               8: 'illiterate'},
 'housing': {1: 'yes', 2: 'no', 3: 'unknown'},
 'job': {1: 'blue-collar',
         2: 'technician',
         3: 'management',
         4: 'services',
         5: 'retired',
         6: 'admin.',
         7: 'housemaid',
         8: 'unemployed',
         9: 'entrepreneur',
         10: 'self-employed',
         11: 'unknown',
         12: 'student'},
 'loan': {1: 'no', 2: 'yes', 3: 'unknown'},
 'marital': {1: 'married', 2: 'single', 3: 'divorced', 4: 'unknown'},
 'month': {1: 'aug',
           2: 'nov',
           3: 'jun',
           4: 'apr',
           5: 'jul',
   

In [6]:
df.marital.unique()

array(['married', 'single', 'divorced', 'unknown'], dtype=object)

In [7]:
df.education.unique()

array(['basic.4y', 'unknown', 'university.degree', 'high.school',
       'basic.9y', 'professional.course', 'basic.6y', 'illiterate'],
      dtype=object)

In [8]:
df.default.unique() # - Ordinal

array(['unknown', 'no', 'yes'], dtype=object)

In [9]:
df.housing.unique() # - Ordinal

array(['yes', 'no', 'unknown'], dtype=object)

In [10]:
df.loan.unique()  # - Ordinal Encode

array(['no', 'yes', 'unknown'], dtype=object)

In [11]:
df.contact.unique() # - OneHot

array(['cellular', 'telephone'], dtype=object)

In [12]:
df.month.unique() # - Ordinal

array(['aug', 'nov', 'jun', 'apr', 'jul', 'may', 'oct', 'mar', 'sep',
       'dec'], dtype=object)

In [13]:
df.day_of_week.unique() # - Ordinal

array(['thu', 'fri', 'tue', 'mon', 'wed'], dtype=object)

In [14]:
df.poutcome.unique() # - Ordinal

array(['nonexistent', 'success', 'failure'], dtype=object)

In [79]:
# trans = ColumnTransformer(
#         [('ordinal Encoder - poutcome', OrdinalEncoder(
#             categories=[['nonexistent', 'success', 'failure']],
#             handle_unknown='use_encoded_value',
#             unknown_value=-1,), ['poutcome']),
         
#          ('ordinal Encoder - Day of weel', OrdinalEncoder(
#             categories=[['mon', 'tue', 'wed', 'thu', 'fri']],
#             handle_unknown='use_encoded_value',
#             unknown_value=-1,), ['day_of_week']),
         
#          ('ordinal Encoder - loan', OrdinalEncoder(
#             categories=[['no', 'yes', 'unknown']],
#             handle_unknown='use_encoded_value',
#             unknown_value=-1,), ['loan']),
         
#         ('OneHot - contact', OneHotEncoder(sparse_output=False, 
#                                            handle_unknown='ignore'), ['contact'])
        
        
#         ], remainder='drop', verbose_feature_names_out=False
# )

In [80]:
# df_transed = trans.fit_transform(df)

In [92]:
# trans.transformers_

[('ordinal Encoder - poutcome',
  OrdinalEncoder(categories=[['nonexistent', 'success', 'failure']],
                 handle_unknown='use_encoded_value', unknown_value=-1),
  ['poutcome']),
 ('ordinal Encoder - Day of weel',
  OrdinalEncoder(categories=[['mon', 'tue', 'wed', 'thu', 'fri']],
                 handle_unknown='use_encoded_value', unknown_value=-1),
  ['day_of_week']),
 ('ordinal Encoder - loan',
  OrdinalEncoder(categories=[['no', 'yes', 'unknown']],
                 handle_unknown='use_encoded_value', unknown_value=-1),
  ['loan']),
 ('OneHot - contact',
  OneHotEncoder(handle_unknown='ignore', sparse_output=False),
  ['contact']),
 ('remainder',
  'drop',
  [0, 1, 2, 3, 4, 5, 8, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20])]

In [81]:
# df_transed

Unnamed: 0,poutcome,day_of_week,loan,contact_cellular,contact_telephone
0,0.0,3.0,0.0,1.0,0.0
1,0.0,4.0,0.0,1.0,0.0
2,1.0,3.0,0.0,1.0,0.0
3,0.0,4.0,0.0,1.0,0.0
4,1.0,4.0,0.0,1.0,0.0
...,...,...,...,...,...
41183,0.0,3.0,1.0,0.0,1.0
41184,0.0,3.0,0.0,0.0,1.0
41185,0.0,2.0,1.0,0.0,1.0
41186,0.0,1.0,1.0,0.0,1.0


In [85]:
#  ('ordinal Encoder - Day of weel', OrdinalEncoder(
#             categories=[['mon', 'tue', 'wed', 'thu', 'fri']],
#             handle_unknown='use_encoded_value',
#             unknown_value=-1,), ['day_of_week']),

# ('ordinal Encoder - loan', OrdinalEncoder(
#             categories=[['no', 'yes', 'unknown']],
#             handle_unknown='use_encoded_value',
#             unknown_value=-1,), ['loan']),

### Preprocessing Pipeline

In [113]:
# Category labels per categorical feature, keyed by a 1-based rank.
# For the features handed to the OrdinalEncoder (day_of_week, education, job,
# marital, month) the label order here becomes the integer code order, so
# features with a natural order must be listed in that order.
encode = {
    'contact': {1: 'cellular', 2: 'telephone'},
    # Calendar order (Mon -> Fri) instead of first-appearance order, so the
    # ordinal code reflects the position of the day within the week.
    'day_of_week': {1: 'mon', 2: 'tue', 3: 'wed', 4: 'thu', 5: 'fri'},
    'default': {1: 'unknown', 2: 'no', 3: 'yes'},
    # 'basic' stands for basic.4y / basic.6y / basic.9y, which `edu_trans`
    # collapses into a single label before encoding.
    'education': {
        1: 'unknown',
        2: 'illiterate',
        3: 'basic',
        4: 'high.school',
        5: 'university.degree',
        6: 'professional.course',
    },
    'housing': {1: 'yes', 2: 'no', 3: 'unknown'},
    'job': {
        1: 'blue-collar',
        2: 'technician',
        3: 'management',
        4: 'services',
        5: 'retired',
        6: 'admin.',
        7: 'housemaid',
        8: 'unemployed',
        9: 'entrepreneur',
        10: 'self-employed',
        11: 'unknown',
        12: 'student',
    },
    'loan': {1: 'no', 2: 'yes', 3: 'unknown'},
    'marital': {1: 'married', 2: 'single', 3: 'divorced', 4: 'unknown'},
    # Calendar order; only the months listed here are given a rank.
    'month': {
        1: 'mar',
        2: 'apr',
        3: 'may',
        4: 'jun',
        5: 'jul',
        6: 'aug',
        7: 'sep',
        8: 'oct',
        9: 'nov',
        10: 'dec',
    },
    'poutcome': {1: 'nonexistent', 2: 'success', 3: 'failure'},
}

In [114]:
def edu_trans(df):
    """Collapse the three ``basic.*`` education levels into one ``'basic'`` label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``education`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``'basic.4y'``, ``'basic.6y'`` and ``'basic.9y'``
        in ``education`` are replaced by ``'basic'``. Other values, other
        columns and the column order are unchanged.

    Raises
    ------
    KeyError
        If ``df`` has no ``education`` column.
    """
    merged = df['education'].replace(['basic.4y', 'basic.6y', 'basic.9y'], 'basic')
    # Return a new frame rather than assigning into `df`, so the caller's data
    # is not modified as a side effect of running the transformer.
    return df.assign(education=merged)

In [115]:
# Columns encoded as integer codes, using the label order stored in `encode`.
ordinal_features = ['day_of_week', 'education', 'job', 'marital', 'month']
# Columns handled by the one-hot encoder.
onehot_features = ['contact', 'default', 'housing', 'loan', 'poutcome']

ordinal_encoder = OrdinalEncoder(
    # One category list per ordinal feature, in the same order as the columns.
    categories=[list(encode[name].values()) for name in ordinal_features],
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Categorical stage of the preprocessing: encode the columns above and pass
# every remaining column through.
cat_trans = ColumnTransformer(
    transformers=[
        ('ordinal Encoder', ordinal_encoder, ordinal_features),
        ('OneHot Encoder', onehot_encoder, onehot_features),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [116]:
cat_trans

In [117]:
# Numeric columns that go through StandardScaler.
numeric_features = [
    'age', 'duration', 'campaign', 'pdays', 'previous',
    'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed',
]

# Numeric stage of the preprocessing: scale the columns above and pass every
# remaining column through.
num_col = ColumnTransformer(
    transformers=[
        ('Scaling Numerical features', StandardScaler(), numeric_features),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [118]:
num_col

In [119]:
# DropMissingData()

In [120]:
# Preprocessing steps, applied in this order:
#   1. feature_engine's DropMissingData (default settings),
#   2. merge the basic.* education levels via `edu_trans`,
#   3. encode the categorical columns (`cat_trans`),
#   4. scale the numeric columns (`num_col`).
preprocessing_steps = [
    ('DropMissingValues', DropMissingData()),
    ('handling - education feature', FunctionTransformer(edu_trans)),
    ('categorical Transformer', cat_trans),
    ('Numerical Transformer', num_col),
]
pipe = Pipeline(steps=preprocessing_steps)

In [121]:
pipe

In [122]:
import joblib 

In [123]:
# Persist the preprocessing pipeline to 'preprocessor.pkl' (relative path).
# NOTE(review): in file order `pipe` has not been fitted at this point (the
# `fit_transform` call appears in a later cell), so the saved object looks
# unfitted -- confirm that is intended, or dump again after fitting.
# NOTE(review): the pipeline wraps `edu_trans`, a function defined in this
# notebook; presumably it must be importable wherever the pickle is loaded -- verify.
joblib.dump(pipe, 'preprocessor.pkl')

['preprocessor.pkl']

In [112]:
test_pipe = joblib.load('preprocessor.pkl')
test_pipe

In [86]:
from sklearn.model_selection import train_test_split 

# Separate the target column `y` from the feature columns.
y = df['y']
X = df.drop(columns='y')
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [87]:
X.shape

(41188, 20)

In [88]:
X.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,210,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,138,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,339,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6
3,39,services,married,high.school,no,no,no,cellular,apr,fri,185,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,137,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2


In [105]:
transformed_x = pipe.fit_transform(X)

In [109]:
# `info()` prints its summary and returns None, so call it bare; wrapping the
# call in a list only adds a spurious `[None]` cell output.
transformed_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   41188 non-null  float64
 1   duration              41188 non-null  float64
 2   campaign              41188 non-null  float64
 3   pdays                 41188 non-null  float64
 4   previous              41188 non-null  float64
 5   emp_var_rate          41188 non-null  float64
 6   cons_price_idx        41188 non-null  float64
 7   cons_conf_idx         41188 non-null  float64
 8   euribor3m             41188 non-null  float64
 9   nr_employed           41188 non-null  float64
 10  day_of_week           41188 non-null  float64
 11  education             41188 non-null  float64
 12  job                   41188 non-null  float64
 13  marital               41188 non-null  float64
 14  month                 41188 non-null  float64
 15  contact_cellular   

[None]