In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

# Import data

In [2]:
filepath = "../bank-additional/bank-additional-full.csv"

df_raw = pd.read_csv(filepath, sep=";")
df_raw = df_raw.rename(columns={"emp.var.rate": "emp_var_rate", 
                                "cons.price.idx": "cons_price_idx", 
                                "cons.conf.idx": "cons_conf_idx",
                                "nr.employed": "nr_employed"
                               }
                      )

df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp_var_rate    41188 non-null  float64
 16  cons_price_idx  41188 non-null  float64
 17  cons_conf_idx   41188 non-null 

# Graphing and Utility Functions

## Utility

In [4]:
'''
Function for identifying indices for elements of input 1-dimensional array @arr that are equal to input @val.

Returns list of indices that can be later passed on to filter down other arrays (e.g. if @arr is class labels y, use to filter down y and corresponding values for given label in X).
'''
def ind_where_true(arr, val):
            
    ind_lst = [i for i in range(len(arr)) if arr[i] == val]
            
    return ind_lst

## Shuffle and Split

In [5]:
###-------------------------FUNCTIONS-------------------------###
'''
Shuffle input data matrix @dat, splits into training set of proportion @train_size_prop and remainder into validation set.

Assumes all but last column are continuous variables, and last column is class labels.
'''

def shuffle_split(dat, train_size_prop):
    from math import ceil
    
    np.random.seed(seed=0)
    dat_shuffled = dat.copy()
    np.random.shuffle(dat_shuffled)
    
    # split
    train_size_last_index = ceil(train_size_prop * dat_shuffled.shape[0])
    X_shuffled_train = dat_shuffled.copy()[:train_size_last_index, :-1]
    y_shuffled_train = dat_shuffled.copy()[:train_size_last_index, -1]
    
    X_shuffled_test = dat_shuffled.copy()[train_size_last_index:, :-1]
    y_shuffled_test = dat_shuffled.copy()[train_size_last_index:, -1]
    
    return X_shuffled_train, X_shuffled_test, y_shuffled_train, y_shuffled_test

'''
Shuffle input data matrix @X and @y, splitting based on proportion to go to training data @train_size_prop.

Keeps proportion of class labels consistent across training set and validation set (to ensure validation set will contain desired proportion of class labels to verify against).
'''

def shuffle_by_class_split(X, y, train_size_prop):
    from math import ceil
    np.random.seed(seed=0)
    uniq_classes = np.unique(y)
    
    train_i = []
    val_i = []
    
    # Build list of indices where samples have given class label. Adds first 2/3 of shuffled indices to train_i, and last 1/3 to val_i.
    for label in uniq_classes:

        filt_ind = np.array(ind_where_true(y, label))

        shuffled_ind = filt_ind.copy()
        np.random.shuffle(shuffled_ind)


        train_size_last_index = ceil(train_size_prop * (len(shuffled_ind)-1))


        train_i += list(shuffled_ind[:train_size_last_index])

        val_i += list(shuffled_ind[train_size_last_index:])
    
    # Training and validation sets filtered down based on lists indices built earlier
    X_train, X_val = X[train_i], X[val_i]
    y_train, y_val = y[train_i], y[val_i]
    
    return X_train, X_val, y_train, y_val

## Resample

In [6]:
'''
Given input feature matrix @X_inp and corresponding class labels @y_inp with random seed @seed,
return randomly re-sampled (with replacement) class labels and corresponding feature matrix so that all class labels have the same proportion.
'''
def resample_imbalance_auto(X_inp, y_inp, seed):
    np.random.seed(seed=seed)
    # Identify most common and imbalanced labels
    y_uniq_labels, y_uniq_counts = np.unique(y_inp, return_counts=True)
    # print(f"y_unique labels are: {y_uniq_labels}")
    # print(f"y_uniq_counts are: {y_uniq_counts}")
    
    mode_label_ind = np.argmax(y_uniq_counts)
    # print(f"mode label ind is {mode_label_ind}")
    mode_label = y_uniq_labels[mode_label_ind]
    # print(f"mode label is is {mode_label}")
    count_mode_label = y_uniq_counts[mode_label_ind]
    # print(f"count mode label is is {count_mode_label}")
    
    imbal_labels = list(set(y_uniq_labels) - set([mode_label]))
    # print(f"imbal labels are {imbal_labels}")
    
    # Make arrays for each imbalanced label
    ind_mode_label = ind_where_true(y_inp, mode_label)
    X_bal_mat = X_inp.copy()
    y_bal_mat = y_inp.copy()
    
    for label in imbal_labels:
        ind_imbal_label = ind_where_true(y_inp, label)
        # print(f"len array for imbal label {label} is {len(ind_imbal_label)}")
        
        y_imbal_label = y_inp[ind_imbal_label]
        
        count_imbal_label = len(y_imbal_label)
        # print(f"count_imbal_label is {count_imbal_label}")
        imbal_diff = count_mode_label - count_imbal_label
        # print(f"imbal diff is {imbal_diff}")
        
        rand_sample_ind = np.random.choice(ind_imbal_label, size=imbal_diff, replace=True)
        
        X_rand_resample_label = X_inp[rand_sample_ind]
        y_rand_resample_label = y_inp[rand_sample_ind]
        
        X_bal_mat = np.concatenate((X_bal_mat, X_rand_resample_label))
        y_bal_mat = np.concatenate((y_bal_mat, y_rand_resample_label))
        
    
        
    return X_bal_mat, y_bal_mat
        

## Make categorical columns

In [7]:
'''
For input continuous single-dimensional array @arr, turn data into binary categorical data (as integer) based on mean of array. If value is < array mean, make 0 else make 1.

Intended for training data pre-processing.

Returns 1-dimensional array to enable concatenation.
'''
def binarize_tr(arr):
    arr_mean = np.mean(arr)
    
    binary_arr = np.where(arr < arr_mean, 0, 1).reshape(-1,1)
    
    return binary_arr

'''
For input continuous single-dimensional array @arr_val, turn data into binary categorical data (as integer) based on mean of array from training set @tr_mean. If value is < training array mean, make 0 else make 1.

Intended for validation data pre-processing.

Returns 1-dimensional array to enable concatenation.
'''
def binarize_val(arr_val, tr_mean):
    binary_arr_val = np.where(arr_val < tr_mean, 0, 1).reshape(-1, 1)
    
    return binary_arr_val

'''
'''
def bin_labels(bins):
    labels = ["-".join((str(bins[i]), str(bins[i+1]))) for i in range(len(bins)-1)]
    
    return labels

'''
'''
def make_cat_col(df, col, bins):
    # df_col = df.loc[:, [col, "y"]]
    df_col = df.loc[:, [col]]
    
    bins = bins
    labels = bin_labels(bins)
    
    df_col[f"{col}_cat"] = pd.cut(df_col[col], bins=bins, labels=labels)
    
    return pd.cut(df_col[col], bins=bins, labels=labels, include_lowest=True)

## Categorize Data

In [8]:
def categorize_data(df_inp, df_tr_num_means):
    df = df_inp.copy()
    # Age
    age_bins = [17, 30, 40, 60, 80, 100]
    age_labels = bin_labels(age_bins)
    df["age_cat"] = make_cat_col(df, "age", age_bins)
    
    # Duration
    dur_lst = []

    for val in df["duration"] / 60:
        if val >=0 and val <=1:
            dur_lst.append("0-1")
        elif val >1 and val <= 2:
            dur_lst.append("1-2")
        elif val >2 and val <= 5:
            dur_lst.append("2-5")
        elif val > 5:
            dur_lst.append(">5")
        
    # dur_min_cat_labels = ["0-1", "1-2", "2-5", ">5"]
    df["duration_min_cat"] = pd.Series(dur_lst)
    
    # Campaign
    camp_lst = []

    for val in df["campaign"]:
        if val == 0:
            camp_lst.append("0")
        elif val == 1:
            camp_lst.append("1")
        elif val == 2:
            camp_lst.append("2")
        elif val >=3 and val <= 5:
            camp_lst.append("3-5")
        elif val > 5:
            camp_lst.append(">5")
    
    df["campaign_cat"] = pd.Series(camp_lst)
    
    # pdays
    df["pdays_cat"] = make_cat_col(df, "pdays", [-1, 0, 180, 999])
    
    # Previous
    prev_lst = []

    for val in df["previous"]:
        if val == 0:
            prev_lst.append("0")
        elif val == 1:
            prev_lst.append("1")
        elif val == 2:
            prev_lst.append("2")
        elif val >=3 and val <= 5:
            prev_lst.append("3-5")
        elif val > 5:
            prev_lst.append(">5")

    df["previous_cat"] = prev_lst
    
    # Employee variable rate
    emp_var_rate_bins = [-4, -2, 0, 2]
    emp_var_rate_labels = bin_labels(emp_var_rate_bins)
    df["emp_var_rate_cat"] = make_cat_col(df, "emp_var_rate", emp_var_rate_bins)

    # Consumer Price Index
    df["cons_price_idx_cat"] = binarize_val(df["cons_price_idx"].to_numpy(), df_tr_num_means["cons_price_idx"])

    # Consumer Confidence Index
    df["cons_conf_idx_cat"] = binarize_val(df["cons_conf_idx"].to_numpy(), df_tr_num_means["cons_price_idx"])

    # Euribor 3-Month
    df["euribor3m_cat"] = binarize_val(df["euribor3m"].to_numpy(), df_tr_num_means["cons_price_idx"])
    # Number Employed
    df["nr_employed_cat"] = binarize_val(df["nr_employed"].to_numpy(), df_tr_num_means["cons_price_idx"])
    
    return df

## Encoding

In [9]:
'''
Take a categorical feature array (i.e. column of a data matrix) and get encoded array of integers corresponding to each category.
Then return a dictionary of binary-encoded arrays for each category.

Example: for category column with unique values ["north", "south", "east", "west"], will return dictionary with boolean-like array of integers corresponding to columns "is_east", "is_north", "is_south", "is_west". Unique values are sorted, and so corresponding boolean->integer arrays are similarly ordered.
'''
def one_hot_encode(dat, dat_subset, col): 
    unique_cats = dat[col].unique()# dat_subset might not contain all values, so needs dat for reference
    # np.unique() returns sorted values
    
    feature_dict_onehot = {}
    
    for cat in unique_cats:
        onehot_arr = []
        for elem in dat_subset[col]:
            if elem == cat:
                onehot_arr.append(1)
            else:
                onehot_arr.append(0)
        feature_dict_onehot[f"{col} | {cat}"] = np.array(onehot_arr)
        
    return feature_dict_onehot

In [20]:
def one_hot_encode_df(df_inp_full, df_inp, feat_lst):
    
    one_hot_df_lst = []
    for feat in feat_lst:
        # one_hot_dict = one_hot_encode(df_inp_full.to_numpy(), df_inp.to_numpy(), list(df_inp_full.columns).index(feat))
        one_hot_dict = one_hot_encode(df_inp_full, df_inp, feat)
        one_hot_df_lst.append(pd.DataFrame(one_hot_dict))
        
    df_onehot_combined = pd.concat(one_hot_df_lst, axis=1)
    
    return df_onehot_combined

# Shuffle, Split, Resample

## Shuffle and Split by class label

In [10]:
# Shuffle and Split data, ensuring same proportion of class labels in train and validation sets
X_train, X_val, y_train, y_val = shuffle_by_class_split(X = df_raw.iloc[:, :-1].to_numpy(), 
                                                        y = df_raw["y"].to_numpy(),
                                                        train_size_prop = 2/3)

In [11]:
X_train_res, y_train_res = resample_imbalance_auto(X_train, y_train, seed=0)

In [12]:
df = 0

In [13]:
df_tr = pd.DataFrame(X_train_res.copy())

df_tr.columns = list(df_raw.iloc[:, :-1].columns)
df_tr

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed
0,33,technician,divorced,professional.course,no,no,no,telephone,jun,wed,128,1,999,0,nonexistent,1.4,94.465,-41.8,4.864,5228.1
1,38,admin.,divorced,basic.9y,no,no,no,cellular,nov,fri,71,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8
2,34,blue-collar,single,basic.9y,no,yes,no,cellular,may,tue,483,1,999,1,failure,-1.8,92.893,-46.2,1.291,5099.1
3,32,admin.,married,university.degree,no,no,no,cellular,aug,fri,133,2,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1
4,48,blue-collar,married,high.school,unknown,yes,no,telephone,jun,thu,234,1,999,0,nonexistent,1.4,94.465,-41.8,4.961,5228.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48725,53,admin.,divorced,university.degree,no,no,no,cellular,aug,tue,506,1,999,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1
48726,29,admin.,single,university.degree,no,yes,no,cellular,oct,mon,526,1,999,0,nonexistent,-1.1,94.601,-49.5,1.0,4963.6
48727,34,technician,single,unknown,no,yes,no,cellular,aug,thu,320,2,3,1,success,-2.9,92.201,-31.4,0.851,5076.2
48728,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,137,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2


## Categorize Data

In [14]:
num_to_binary_cols = ["cons_price_idx", "cons_conf_idx", "euribor3m", "nr_employed"]
df_tr_to_binary_means = df_tr.loc[:, num_to_binary_cols].mean()
df_tr_to_binary_means

cons_price_idx      93.476952
cons_conf_idx      -40.185149
euribor3m            2.967258
nr_employed       5135.663017
dtype: float64

### `categorize_data()`

In [75]:
def categorize_data(df_inp, df_tr_num_means):
    df = df_inp.copy()
    # Age
    age_bins = [17, 30, 40, 60, 80, 100]
    age_labels = bin_labels(age_bins)
    df["age_cat"] = make_cat_col(df, "age", age_bins)
    
    # Duration
    dur_lst = []

    for val in df["duration"] / 60:
        if val >=0 and val <=1:
            dur_lst.append("0-1")
        elif val >1 and val <= 2:
            dur_lst.append("1-2")
        elif val >2 and val <= 5:
            dur_lst.append("2-5")
        elif val > 5:
            dur_lst.append(">5")
        
    # dur_min_cat_labels = ["0-1", "1-2", "2-5", ">5"]
    df["duration_min_cat"] = pd.Series(dur_lst)
    
    # Campaign
    camp_lst = []

    for val in df["campaign"]:
        if val == 0:
            camp_lst.append("0")
        elif val == 1:
            camp_lst.append("1")
        elif val == 2:
            camp_lst.append("2")
        elif val >=3 and val <= 5:
            camp_lst.append("3-5")
        elif val > 5:
            camp_lst.append(">5")
    
    df["campaign_cat"] = pd.Series(camp_lst)
    
    # pdays
    df["pdays_cat"] = make_cat_col(df, "pdays", [-1, 0, 180, 999])
    
    # Previous
    prev_lst = []

    for val in df["previous"]:
        if val == 0:
            prev_lst.append("0")
        elif val == 1:
            prev_lst.append("1")
        elif val == 2:
            prev_lst.append("2")
        elif val >=3 and val <= 5:
            prev_lst.append("3-5")
        elif val > 5:
            prev_lst.append(">5")

    df["previous_cat"] = prev_lst
    
    # Employee variable rate
    emp_var_rate_bins = [-4, -2, 0, 2]
    emp_var_rate_labels = bin_labels(emp_var_rate_bins)
    df["emp_var_rate_cat"] = make_cat_col(df, "emp_var_rate", emp_var_rate_bins)

    # Consumer Price Index
    df["cons_price_idx_cat"] = binarize_val(df["cons_price_idx"].to_numpy(), df_tr_num_means["cons_price_idx"])

    # Consumer Confidence Index
    df["cons_conf_idx_cat"] = binarize_val(df["cons_conf_idx"].to_numpy(), df_tr_num_means["cons_price_idx"])

    # Euribor 3-Month
    df["euribor3m_cat"] = binarize_val(df["euribor3m"].to_numpy(), df_tr_num_means["cons_price_idx"])
    # Number Employed
    df["nr_employed_cat"] = binarize_val(df["nr_employed"].to_numpy(), df_tr_num_means["cons_price_idx"])
    
    return df

In [15]:
feat_cat = [
    # 'age',
    'age_cat',
    'job', 'marital', 'education', 'default', 'housing', 
    'loan', 'contact', 'month', 'day_of_week',
    # 'duration',
    'duration_min_cat',
    # 'campaign',
    'campaign_cat',
    # 'pdays',
    'pdays_cat',
    # 'previous',
    'previous_cat',
    'poutcome',
    # 'emp_var_rate',
    'emp_var_rate_cat',
    # 'cons_price_idx',
    'cons_price_idx_cat',
    # 'cons_conf_idx',
    'cons_conf_idx_cat',
    # 'euribor3m',
    'euribor3m_cat',
    # 'nr_employed',
    'nr_employed_cat',
    # 'y',
]

df_tr_cat = categorize_data(df_tr, df_tr_to_binary_means).loc[:, feat_cat]
df_tr_cat

Unnamed: 0,age_cat,job,marital,education,default,housing,loan,contact,month,day_of_week,duration_min_cat,campaign_cat,pdays_cat,previous_cat,poutcome,emp_var_rate_cat,cons_price_idx_cat,cons_conf_idx_cat,euribor3m_cat,nr_employed_cat
0,30-40,technician,divorced,professional.course,no,no,no,telephone,jun,wed,2-5,1,180-999,0,nonexistent,0-2,1,0,0,1
1,30-40,admin.,divorced,basic.9y,no,no,no,cellular,nov,fri,1-2,1,180-999,0,nonexistent,-2-0,0,0,0,1
2,30-40,blue-collar,single,basic.9y,no,yes,no,cellular,may,tue,>5,1,180-999,1,failure,-2-0,0,0,0,1
3,30-40,admin.,married,university.degree,no,no,no,cellular,aug,fri,2-5,2,180-999,0,nonexistent,0-2,0,0,0,1
4,40-60,blue-collar,married,high.school,unknown,yes,no,telephone,jun,thu,2-5,1,180-999,0,nonexistent,0-2,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48725,40-60,admin.,divorced,university.degree,no,no,no,cellular,aug,tue,>5,1,180-999,0,nonexistent,0-2,0,0,0,1
48726,17-30,admin.,single,university.degree,no,yes,no,cellular,oct,mon,>5,1,180-999,0,nonexistent,-2-0,1,0,0,1
48727,30-40,technician,single,unknown,no,yes,no,cellular,aug,thu,>5,2,0-180,1,success,-4--2,0,0,0,1
48728,40-60,retired,married,basic.4y,no,yes,no,cellular,aug,fri,2-5,1,0-180,1,success,-4--2,0,0,0,1


## Encode

In [16]:
'''
Take a categorical feature array (i.e. column of a data matrix) and get encoded array of integers corresponding to each category.
Then return a dictionary of binary-encoded arrays for each category.

Example: for category column with unique values ["north", "south", "east", "west"], will return dictionary with boolean-like array of integers corresponding to columns "is_east", "is_north", "is_south", "is_west". Unique values are sorted, and so corresponding boolean->integer arrays are similarly ordered.
'''
def one_hot_encode(dat, dat_subset, col): 
    unique_cats = dat[col].unique()# dat_subset might not contain all values, so needs dat for reference
    # np.unique() returns sorted values
    
    feature_dict_onehot = {}
    
    for cat in unique_cats:
        onehot_arr = []
        for elem in dat_subset[col]:
            if elem == cat:
                onehot_arr.append(1)
            else:
                onehot_arr.append(0)
        feature_dict_onehot[f"{col} | {cat}"] = np.array(onehot_arr)
        
    return feature_dict_onehot

### Dataframe categorized - has all possible categorical values

In [18]:
df_raw_cat_comparison = categorize_data(df_raw, df_tr_to_binary_means).loc[:, feat_cat]

In [19]:
one_hot_feat_lst = [
    "age_cat",
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "duration_min_cat",
    "campaign_cat",
    "pdays_cat",
    "previous_cat",
    "poutcome",
    "emp_var_rate_cat",
    # "cons_price_idx_cat",
    # "cons_conf_idx_cat",
    # "euribor3m_cat",
    # "nr_employed_cat",
]

In [21]:
def one_hot_encode_df(df_inp_full, df_inp, feat_lst):
    
    one_hot_df_lst = []
    for feat in feat_lst:
        # one_hot_dict = one_hot_encode(df_inp_full.to_numpy(), df_inp.to_numpy(), list(df_inp_full.columns).index(feat))
        one_hot_dict = one_hot_encode(df_inp_full, df_inp, feat)
        one_hot_df_lst.append(pd.DataFrame(one_hot_dict))
        
    df_onehot_combined = pd.concat(one_hot_df_lst, axis=1)
    
    return df_onehot_combined

In [22]:
df_tr_one_hot_cat = one_hot_encode_df(df_raw_cat_comparison, df_tr_cat, one_hot_feat_lst)
df_tr_one_hot_cat

Unnamed: 0,age_cat | 40-60,age_cat | 30-40,age_cat | 17-30,age_cat | 60-80,age_cat | 80-100,job | housemaid,job | services,job | admin.,job | blue-collar,job | technician,...,previous_cat | 1,previous_cat | 2,previous_cat | 3-5,previous_cat | >5,poutcome | nonexistent,poutcome | failure,poutcome | success,emp_var_rate_cat | 0-2,emp_var_rate_cat | -2-0,emp_var_rate_cat | -4--2
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,1,0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,1,0
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48725,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
48726,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
48727,0,1,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,1
48728,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1


In [23]:
df_tr_one_hot_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48730 entries, 0 to 48729
Data columns (total 77 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   age_cat | 40-60                  48730 non-null  int64
 1   age_cat | 30-40                  48730 non-null  int64
 2   age_cat | 17-30                  48730 non-null  int64
 3   age_cat | 60-80                  48730 non-null  int64
 4   age_cat | 80-100                 48730 non-null  int64
 5   job | housemaid                  48730 non-null  int64
 6   job | services                   48730 non-null  int64
 7   job | admin.                     48730 non-null  int64
 8   job | blue-collar                48730 non-null  int64
 9   job | technician                 48730 non-null  int64
 10  job | retired                    48730 non-null  int64
 11  job | management                 48730 non-null  int64
 12  job | unemployed                 48730 non-nul

In [24]:
df_tr_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48730 entries, 0 to 48729
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   age_cat             48730 non-null  category
 1   job                 48730 non-null  object  
 2   marital             48730 non-null  object  
 3   education           48730 non-null  object  
 4   default             48730 non-null  object  
 5   housing             48730 non-null  object  
 6   loan                48730 non-null  object  
 7   contact             48730 non-null  object  
 8   month               48730 non-null  object  
 9   day_of_week         48730 non-null  object  
 10  duration_min_cat    48730 non-null  object  
 11  campaign_cat        48730 non-null  object  
 12  pdays_cat           48730 non-null  category
 13  previous_cat        48730 non-null  object  
 14  poutcome            48730 non-null  object  
 15  emp_var_rate_cat    48730 non-null  

In [25]:
df_tr_cat_binary = df_tr_cat.loc[:, ["cons_price_idx_cat", "cons_conf_idx_cat", "euribor3m_cat", "nr_employed_cat"]]

df_tr_cat_binary

Unnamed: 0,cons_price_idx_cat,cons_conf_idx_cat,euribor3m_cat,nr_employed_cat
0,1,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,1,0,0,1
...,...,...,...,...
48725,0,0,0,1
48726,1,0,0,1
48727,0,0,0,1
48728,0,0,0,1


In [26]:
df_tr_encoded_res = pd.concat([df_tr_one_hot_cat, df_tr_cat_binary], axis=1)
df_tr_encoded_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48730 entries, 0 to 48729
Data columns (total 81 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   age_cat | 40-60                  48730 non-null  int64
 1   age_cat | 30-40                  48730 non-null  int64
 2   age_cat | 17-30                  48730 non-null  int64
 3   age_cat | 60-80                  48730 non-null  int64
 4   age_cat | 80-100                 48730 non-null  int64
 5   job | housemaid                  48730 non-null  int64
 6   job | services                   48730 non-null  int64
 7   job | admin.                     48730 non-null  int64
 8   job | blue-collar                48730 non-null  int64
 9   job | technician                 48730 non-null  int64
 10  job | retired                    48730 non-null  int64
 11  job | management                 48730 non-null  int64
 12  job | unemployed                 48730 non-nul

In [134]:
df_tr_encoded_res

Unnamed: 0,age_cat | 40-60,age_cat | 30-40,age_cat | 17-30,age_cat | 60-80,age_cat | 80-100,job | housemaid,job | services,job | admin.,job | blue-collar,job | technician,...,poutcome | nonexistent,poutcome | failure,poutcome | success,emp_var_rate_cat | 0-2,emp_var_rate_cat | -2-0,emp_var_rate_cat | -4--2,cons_price_idx_cat,cons_conf_idx_cat,euribor3m_cat,nr_employed_cat
0,0,1,0,0,0,0,0,0,0,1,...,1,0,0,1,0,0,1,0,0,1
1,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
2,0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48725,1,0,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1
48726,0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,1,0,1,0,0,1
48727,0,1,0,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,1
48728,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1


# Validation Set

## Categorize

In [27]:
df_val = pd.DataFrame(X_val)
df_val.columns = list(df_raw.iloc[:, :-1].columns)
df_val

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed
0,55,management,married,basic.9y,no,yes,no,cellular,jul,wed,421,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1
1,34,technician,married,high.school,no,yes,yes,cellular,nov,thu,116,1,999,0,nonexistent,-0.1,93.2,-42.0,4.076,5195.8
2,58,blue-collar,married,basic.6y,no,yes,no,cellular,aug,tue,140,3,999,0,nonexistent,1.4,93.444,-36.1,4.965,5228.1
3,33,unemployed,married,high.school,unknown,yes,no,cellular,jul,wed,590,9,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1
4,41,blue-collar,married,basic.4y,no,yes,no,cellular,jul,fri,138,3,999,0,nonexistent,1.4,93.918,-42.7,4.957,5228.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13725,68,retired,divorced,high.school,no,yes,yes,cellular,oct,mon,567,1,3,1,success,-1.1,94.601,-49.5,1.0,4963.6
13726,31,blue-collar,married,basic.9y,unknown,yes,no,telephone,jun,tue,1051,3,999,0,nonexistent,1.4,94.465,-41.8,4.961,5228.1
13727,34,admin.,single,high.school,no,no,no,cellular,jun,fri,548,2,999,0,nonexistent,-2.9,92.963,-40.8,1.268,5076.2
13728,32,admin.,single,university.degree,no,yes,no,cellular,jun,thu,89,2,999,0,nonexistent,-2.9,92.963,-40.8,1.26,5076.2


In [28]:
df_val_cat = categorize_data(df_val, df_tr_to_binary_means).loc[:, feat_cat]
df_val_cat

Unnamed: 0,age_cat,job,marital,education,default,housing,loan,contact,month,day_of_week,duration_min_cat,campaign_cat,pdays_cat,previous_cat,poutcome,emp_var_rate_cat,cons_price_idx_cat,cons_conf_idx_cat,euribor3m_cat,nr_employed_cat
0,40-60,management,married,basic.9y,no,yes,no,cellular,jul,wed,>5,2,180-999,0,nonexistent,0-2,1,0,0,1
1,30-40,technician,married,high.school,no,yes,yes,cellular,nov,thu,1-2,1,180-999,0,nonexistent,-2-0,0,0,0,1
2,40-60,blue-collar,married,basic.6y,no,yes,no,cellular,aug,tue,2-5,3-5,180-999,0,nonexistent,0-2,0,0,0,1
3,30-40,unemployed,married,high.school,unknown,yes,no,cellular,jul,wed,>5,>5,180-999,0,nonexistent,0-2,1,0,0,1
4,40-60,blue-collar,married,basic.4y,no,yes,no,cellular,jul,fri,2-5,3-5,180-999,0,nonexistent,0-2,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13725,60-80,retired,divorced,high.school,no,yes,yes,cellular,oct,mon,>5,1,0-180,1,success,-2-0,1,0,0,1
13726,30-40,blue-collar,married,basic.9y,unknown,yes,no,telephone,jun,tue,>5,3-5,180-999,0,nonexistent,0-2,1,0,0,1
13727,30-40,admin.,single,high.school,no,no,no,cellular,jun,fri,>5,2,180-999,0,nonexistent,-4--2,0,0,0,1
13728,30-40,admin.,single,university.degree,no,yes,no,cellular,jun,thu,1-2,2,180-999,0,nonexistent,-4--2,0,0,0,1


## Encode

In [29]:
df_val_one_hot_cat = one_hot_encode_df(df_raw_cat_comparison, df_val_cat, one_hot_feat_lst)
df_val_one_hot_cat

Unnamed: 0,age_cat | 40-60,age_cat | 30-40,age_cat | 17-30,age_cat | 60-80,age_cat | 80-100,job | housemaid,job | services,job | admin.,job | blue-collar,job | technician,...,previous_cat | 1,previous_cat | 2,previous_cat | 3-5,previous_cat | >5,poutcome | nonexistent,poutcome | failure,poutcome | success,emp_var_rate_cat | 0-2,emp_var_rate_cat | -2-0,emp_var_rate_cat | -4--2
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13725,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
13726,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
13727,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
13728,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1


In [30]:
df_val_one_hot_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13730 entries, 0 to 13729
Data columns (total 77 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   age_cat | 40-60                  13730 non-null  int64
 1   age_cat | 30-40                  13730 non-null  int64
 2   age_cat | 17-30                  13730 non-null  int64
 3   age_cat | 60-80                  13730 non-null  int64
 4   age_cat | 80-100                 13730 non-null  int64
 5   job | housemaid                  13730 non-null  int64
 6   job | services                   13730 non-null  int64
 7   job | admin.                     13730 non-null  int64
 8   job | blue-collar                13730 non-null  int64
 9   job | technician                 13730 non-null  int64
 10  job | retired                    13730 non-null  int64
 11  job | management                 13730 non-null  int64
 12  job | unemployed                 13730 non-nul

In [31]:
df_val_cat_binary = df_val_cat.loc[:, ["cons_price_idx_cat", "cons_conf_idx_cat", "euribor3m_cat", "nr_employed_cat"]]

df_val_cat_binary

Unnamed: 0,cons_price_idx_cat,cons_conf_idx_cat,euribor3m_cat,nr_employed_cat
0,1,0,0,1
1,0,0,0,1
2,0,0,0,1
3,1,0,0,1
4,1,0,0,1
...,...,...,...,...
13725,1,0,0,1
13726,1,0,0,1
13727,0,0,0,1
13728,0,0,0,1


In [32]:
df_val_encoded = pd.concat([df_val_one_hot_cat, df_val_cat_binary], axis=1)
df_val_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13730 entries, 0 to 13729
Data columns (total 81 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   age_cat | 40-60                  13730 non-null  int64
 1   age_cat | 30-40                  13730 non-null  int64
 2   age_cat | 17-30                  13730 non-null  int64
 3   age_cat | 60-80                  13730 non-null  int64
 4   age_cat | 80-100                 13730 non-null  int64
 5   job | housemaid                  13730 non-null  int64
 6   job | services                   13730 non-null  int64
 7   job | admin.                     13730 non-null  int64
 8   job | blue-collar                13730 non-null  int64
 9   job | technician                 13730 non-null  int64
 10  job | retired                    13730 non-null  int64
 11  job | management                 13730 non-null  int64
 12  job | unemployed                 13730 non-nul

# Write Datasets

## Combine X and y for train, val

In [33]:
df_tr_encoded_res

Unnamed: 0,age_cat | 40-60,age_cat | 30-40,age_cat | 17-30,age_cat | 60-80,age_cat | 80-100,job | housemaid,job | services,job | admin.,job | blue-collar,job | technician,...,poutcome | nonexistent,poutcome | failure,poutcome | success,emp_var_rate_cat | 0-2,emp_var_rate_cat | -2-0,emp_var_rate_cat | -4--2,cons_price_idx_cat,cons_conf_idx_cat,euribor3m_cat,nr_employed_cat
0,0,1,0,0,0,0,0,0,0,1,...,1,0,0,1,0,0,1,0,0,1
1,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
2,0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48725,1,0,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1
48726,0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,1,0,1,0,0,1
48727,0,1,0,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,1
48728,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1


In [48]:
df_train_encoded_resampled_output = df_tr_encoded_res.copy()
df_train_encoded_resampled_output["y"] = np.where(y_train_res == "yes", 1, 0)

df_val_encoded_output = df_val_encoded.copy()
df_val_encoded_output["y"] = np.where(y_val == "yes", 1, 0)

In [49]:
df_train_encoded_resampled_output

Unnamed: 0,age_cat | 40-60,age_cat | 30-40,age_cat | 17-30,age_cat | 60-80,age_cat | 80-100,job | housemaid,job | services,job | admin.,job | blue-collar,job | technician,...,poutcome | failure,poutcome | success,emp_var_rate_cat | 0-2,emp_var_rate_cat | -2-0,emp_var_rate_cat | -4--2,cons_price_idx_cat,cons_conf_idx_cat,euribor3m_cat,nr_employed_cat,y
0,0,1,0,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,0,1,0
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,0
3,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48725,1,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,1
48726,0,0,1,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,1
48727,0,1,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,0,1,1
48728,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,1


In [50]:
df_val_encoded_output

Unnamed: 0,age_cat | 40-60,age_cat | 30-40,age_cat | 17-30,age_cat | 60-80,age_cat | 80-100,job | housemaid,job | services,job | admin.,job | blue-collar,job | technician,...,poutcome | failure,poutcome | success,emp_var_rate_cat | 0-2,emp_var_rate_cat | -2-0,emp_var_rate_cat | -4--2,cons_price_idx_cat,cons_conf_idx_cat,euribor3m_cat,nr_employed_cat,y
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
1,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
4,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13725,0,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,1,0,0,1,1
13726,0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,1,1
13727,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,1
13728,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,1


## write to csv

In [51]:
df_train_encoded_resampled_output.to_csv("./output/training_data_resampled_encoded.csv", index=False)

df_val_encoded_output.to_csv("./output/validation_data_encoded.csv", index=False)