In [27]:
# 1. Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [28]:
# Load official UCI dataset
df = pd.read_csv("bank-full.csv", sep=';')

print("Shape:", df.shape)
print(df.head())
print(df.info())


Shape: (45211, 17)
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (to

In [29]:
# Check duplicates
print("Duplicate rows:", df.duplicated().sum())

# Remove duplicates if any
df = df.drop_duplicates()

# Check missing values
print("\nMissing values:")
print(df.isnull().sum())


Duplicate rows: 0

Missing values:
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [30]:
# 5. Handle 'unknown' Values in Categorical Columns
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].replace('unknown', df[col].mode()[0])


In [31]:
# 6. Convert Target Variable to Binary
df['y'] = df['y'].map({'yes': 1, 'no': 0})


In [32]:
# 7. Separate Categorical and Numerical Features
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=['int64']).columns

print("Categorical:", categorical_cols)
print("Numerical:", numerical_cols)


Categorical: Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')
Numerical: Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'y'],
      dtype='object')


In [33]:
# 8. Apply One-Hot Encoding to Categorical Variables
df_encoded = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True
)

print("After Encoding Shape:", df_encoded.shape)


After Encoding Shape: (45211, 40)


In [34]:
# 9. Scale Numerical Features Using StandardScaler
scaler = StandardScaler()

df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])


In [35]:
# 10. Define Feature Matrix (X) and Target Vector (y)
X = df_encoded.drop('y', axis=1)
y = df_encoded['y']


In [36]:
# 11. Perform Stratified Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Training set:", X_train.shape)
print("Test set:", X_test.shape)


Training set: (36168, 39)
Test set: (9043, 39)
