In [1]:
import pandas as pd
import numpy as np

In [2]:
from ucimlrepo import fetch_ucirepo

In [9]:
# Fetch dataset id 222 through ucimlrepo.
# NOTE(review): id 222 is presumably the UCI "Bank Marketing" dataset (per the
# variable name) and the call presumably needs network access — confirm.
bankMarketing = fetch_ucirepo(id=222)

In [10]:
# Work on a copy of the fetched frame. Later cells modify bankMarketingData
# (target recoding, column drop); without the copy those edits would also
# change `bankMarketing.data.original`, so re-running this cell would hand back
# already-modified data instead of the raw download.
bankMarketingData = bankMarketing.data.original.copy()

In [11]:
# Peek at the first rows to sanity-check column names and raw values.
bankMarketingData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


In [12]:
# (rows, columns) of the raw frame.
bankMarketingData.shape

(45211, 17)

In [13]:
bankMarketingData.y.value_counts() # imbalanced dataset: 'no' heavily outnumbers 'yes' (39,922 vs 5,289 in this run)

y
no     39922
yes     5289
Name: count, dtype: int64

In [14]:
# Encode the target as -1 ('no') / +1 ('yes').
# An explicit mapping replaces np.where(y == 'no', -1, 1), which turned every
# label into 1 when the cell was re-run (the column no longer contains 'no')
# and silently mapped missing or unexpected labels to 1.
# The numeric keys make the cell idempotent; any other value maps to NaN, and
# the int64 cast then fails loudly instead of producing a wrong label.
bankMarketingData['y'] = (
    bankMarketingData['y']
    .map({'no': -1, 'yes': 1, -1: -1, 1: 1})
    .astype('int64')
)

In [15]:
# Confirm that the target column now holds the numeric encoding.
bankMarketingData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,-1
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,-1
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,-1
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,-1
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,-1


In [16]:
# The last rows include both classes, so both encoded values (-1 and 1) are visible.
bankMarketingData.tail()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,,1
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,,1
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,1
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,,-1
45210,37,entrepreneur,married,secondary,no,2971,no,no,cellular,17,nov,361,2,188,11,other,-1


In [17]:
# Column dtypes: the object columns are later treated as categorical and the
# int64 columns as numeric.
bankMarketingData.dtypes

age             int64
job            object
marital        object
education      object
default        object
balance         int64
housing        object
loan           object
contact        object
day_of_week     int64
month          object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
y               int64
dtype: object

In [18]:
# Count missing values per column.
bankMarketingData.isna().sum()

age                0
job              288
marital            0
education       1857
default            0
balance            0
housing            0
loan               0
contact        13020
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
poutcome       36959
y                  0
dtype: int64

In [None]:
# poutcome is missing for 36,959 of the 45,211 rows (about 82%), so drop the column instead of imputing it

In [19]:
# Drop poutcome, which is missing for most rows.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError.
bankMarketingData = bankMarketingData.drop(columns='poutcome', errors='ignore')

In [20]:
# Re-check null counts after the drop; job, education and contact still
# contain missing values that the imputers handle later.
bankMarketingData.isna().sum()

age                0
job              288
marital            0
education       1857
default            0
balance            0
housing            0
loan               0
contact        13020
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
y                  0
dtype: int64

In [21]:
# Export the "untransformed" dataset: features are still raw (no scaling or
# encoding), but by this point the target has been recoded to -1/1 and the
# poutcome column has been removed. The row index is not written.
bankMarketingData.to_csv(path_or_buf='untransformed_BANKMARKETING.csv', index=False)

In [None]:
# transformed dataset: impute missing values, scale numeric features, one-hot encode categorical features

In [22]:
# Start the transformed dataset from an independent (deep) copy so the frame
# exported above is left untouched.
transformedBankMarketingData = bankMarketingData.copy(deep=True)

In [23]:
# one hot encoding as opposed to label encoding
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [24]:
# Partition the columns by dtype: object columns are categorical, int64/float64
# columns are numeric. The target 'y' is int64, so it is still part of
# numerical_cols here and has to be excluded separately.
categorical_cols = list(transformedBankMarketingData.select_dtypes(include='object').columns)
numerical_cols = list(transformedBankMarketingData.select_dtypes(include=['int64', 'float64']).columns)

In [25]:
# Inspect the split; the target 'y' is still listed among the numeric columns
# at this point.
categorical_cols, numerical_cols

(['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'contact',
  'month'],
 ['age',
  'balance',
  'day_of_week',
  'duration',
  'campaign',
  'pdays',
  'previous',
  'y'])

In [26]:
# Exclude the target from the numeric feature list.
# Filtering into a new list (rather than list.remove) keeps the cell safe to
# re-run: remove() raises ValueError once 'y' is already gone.
numerical_cols = [col for col in numerical_cols if col != 'y']

In [27]:
# Feature matrix: every column except the target.
X = transformedBankMarketingData.drop(columns=['y'])

In [28]:
# Confirm that 'y' is no longer among the feature columns.
X.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day_of_week', 'month', 'duration', 'campaign',
       'pdays', 'previous'],
      dtype='object')

In [29]:
# Numeric branch: fill gaps with the column median, then standardise.
# (The null counts shown earlier report no missing numeric values, so the
# imputer only acts as a safeguard on this data.)
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [30]:
# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode into a dense array. Categories not seen during fit are ignored at
# transform time.
# NOTE(review): 'contact' has 13,020 missing values per the null counts above,
# all of which will be replaced by its most frequent category — confirm this is
# intended rather than keeping "missing" as its own category.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

In [31]:
# Route each column group through its own pipeline. The output places the
# numeric block first and the one-hot block second, which is the order the
# hand-built feature names rely on.
preprocessor = ColumnTransformer([
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
])

In [32]:
# Fit the imputers, scaler and encoder on every row of X and transform it in
# one step.
# NOTE(review): the fitted statistics therefore come from the full dataset; if
# a train/test split is made later from the exported file, the test rows will
# already have influenced the scaling/imputation — confirm this is acceptable.
transformedX = preprocessor.fit_transform(X)

In [33]:
# Output column names, in the same order the preprocessor emits them:
#   - numeric columns keep their name plus a "_scaled" suffix
#   - each categorical column expands to "<column>_<category>", using the
#     categories learned by the fitted one-hot encoder
numeric_features = ["{}_scaled".format(col) for col in numerical_cols]
onehot_categories = preprocessor.named_transformers_['cat'].named_steps['onehot'].categories_
categorical_features = [
    f"{col}_{cat}"
    for col, categories in zip(categorical_cols, onehot_categories)
    for cat in categories
]

In [34]:
# Names given to the scaled numeric columns.
numeric_features

['age_scaled',
 'balance_scaled',
 'day_of_week_scaled',
 'duration_scaled',
 'campaign_scaled',
 'pdays_scaled',
 'previous_scaled']

In [35]:
# Names given to the one-hot columns, one per (column, category) pair.
categorical_features

['job_admin.',
 'job_blue-collar',
 'job_entrepreneur',
 'job_housemaid',
 'job_management',
 'job_retired',
 'job_self-employed',
 'job_services',
 'job_student',
 'job_technician',
 'job_unemployed',
 'marital_divorced',
 'marital_married',
 'marital_single',
 'education_primary',
 'education_secondary',
 'education_tertiary',
 'default_no',
 'default_yes',
 'housing_no',
 'housing_yes',
 'loan_no',
 'loan_yes',
 'contact_cellular',
 'contact_telephone',
 'month_apr',
 'month_aug',
 'month_dec',
 'month_feb',
 'month_jan',
 'month_jul',
 'month_jun',
 'month_mar',
 'month_may',
 'month_nov',
 'month_oct',
 'month_sep']

In [36]:
# Full column order: numeric block first, then the one-hot block.
feature_names = [*numeric_features, *categorical_features]

In [37]:
# Wrap the transformed array in a labelled frame that shares X's row index.
# NOTE(review): this rebinds transformedBankMarketingData, which previously
# held a copy of the raw frame; re-running the dtype-selection cell after this
# point would operate on the encoded frame instead.
transformedBankMarketingData = pd.DataFrame(data=transformedX, index=X.index, columns=feature_names)

In [38]:
# Preview the scaled and one-hot encoded features.
transformedBankMarketingData.head()

Unnamed: 0,age_scaled,balance_scaled,day_of_week_scaled,duration_scaled,campaign_scaled,pdays_scaled,previous_scaled,job_admin.,job_blue-collar,job_entrepreneur,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,1.606965,0.256419,-1.298476,0.011016,-0.569351,-0.411453,-0.25194,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.288529,-0.437895,-1.298476,-0.416127,-0.569351,-0.411453,-0.25194,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.747384,-0.446762,-1.298476,-0.707361,-0.569351,-0.411453,-0.25194,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.571051,0.047205,-1.298476,-0.645231,-0.569351,-0.411453,-0.25194,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.747384,-0.447091,-1.298476,-0.23362,-0.569351,-0.411453,-0.25194,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [39]:
# Re-attach the encoded target as the last column; rows are matched on the
# shared index.
transformedBankMarketingData['y'] = bankMarketingData['y']

In [40]:
# Confirm that 'y' was appended as the final column.
transformedBankMarketingData.head()

Unnamed: 0,age_scaled,balance_scaled,day_of_week_scaled,duration_scaled,campaign_scaled,pdays_scaled,previous_scaled,job_admin.,job_blue-collar,job_entrepreneur,...,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,y
0,1.606965,0.256419,-1.298476,0.011016,-0.569351,-0.411453,-0.25194,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1
1,0.288529,-0.437895,-1.298476,-0.416127,-0.569351,-0.411453,-0.25194,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1
2,-0.747384,-0.446762,-1.298476,-0.707361,-0.569351,-0.411453,-0.25194,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1
3,0.571051,0.047205,-1.298476,-0.645231,-0.569351,-0.411453,-0.25194,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1
4,-0.747384,-0.447091,-1.298476,-0.23362,-0.569351,-0.411453,-0.25194,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1


In [41]:
# Write the transformed features plus target to disk, without the row index.
transformedBankMarketingData.to_csv(path_or_buf='transformed_BANKMARKETING.csv', index=False)