In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
bank_df = pd.read_csv('./Data/bank-additional-full.csv', sep=';')
# binary encoding of class label
bank_df['y'] = bank_df['y'].map({'no': 0, 'yes': 1})
# categorical features
categorical = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
# Perform feature scaling using MinMaxScaler
# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()  # default= (0, 1)
numerical = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
             'euribor3m', 'nr.employed']
bank_df[numerical] = scaler.fit_transform(bank_df[numerical])
bank_df = pd.get_dummies(bank_df)  # encods categorical data
bank_df = bank_df.drop('duration', axis=1)
bank_df_y = bank_df['y'].values.reshape(-1, 1)
train_set, test_set, train_set_y, test_set_y = train_test_split(bank_df, bank_df_y, test_size=0.2, random_state=0)
# make sure negative class is distributed equally in train and test set.
print("Distribution of negative calss:\n", train_set['y'].value_counts() / len(train_set))

Distribution of negative calss:
 y
0    0.887071
1    0.112929
Name: count, dtype: float64


In [3]:
# Perform upsampling to address sample imbalance
# Separate majority and minority classes
from sklearn.utils import resample

train_negative = train_set[train_set['y'] == 0]
train_positive = train_set[train_set['y'] == 1]

# Upsample minority class
train_positive_upsample = resample(train_positive,
                                   replace=True,  # sample with replacement
                                   n_samples=29238,  # to match majority class
                                   random_state=18  # reproducible results
                                   )
# Combine majority class with upsampled minority class
train_upsample = pd.concat([train_negative, train_positive_upsample])

# Display new class counts
print("Display new class counts:\n", train_upsample['y'].value_counts())

# Create X, y for upsampled training and testing
X_train = train_upsample.drop('y', axis=1)
y_train = train_upsample['y']
# create X, y for imbalanced test set for performance validation
X_imb = test_set.drop('y', axis=1)
y_imb = test_set['y']
y_train = y_train.values.reshape(-1, 1)
X_train, X_valid, y_train, y_valid = train_test_split(X_train.values,
                                                      y_train,
                                                      test_size=0.2,
                                                      random_state=18)

Display new class counts:
 y
1    29238
0    29229
Name: count, dtype: int64
