# Bank Project - Resampling

In [18]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
import preprocessing as prep
from sklearn.utils import resample

## Fetch Data

In [19]:
df = pd.read_csv('./data/bank-full.csv', sep=';')

In [49]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [("target",prep.TargetTransformer(),'y')])

y_trans = pd.DataFrame(data = ct.fit_transform(df),columns=['y_trans'] ,index = df.index)

In [35]:
df_t = pd.concat([y_trans,df],axis=1).drop(['y'],axis=1)

In [47]:
# Upsample minority classPython

# Separate majority and minority classes
df_majority = df_t[df_t.y_trans==0]
df_minority = df_t[df_t.y_trans==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=5289,    # to match majority class
                                 random_state=101) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled

Unnamed: 0,y_trans,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,0,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown
2,0,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown
3,0,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown
4,0,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44786,1,68,retired,divorced,primary,no,2027,no,no,cellular,14,sep,404,2,91,4,success
44173,1,86,retired,married,primary,no,1255,no,no,telephone,14,jul,247,1,180,3,success
26018,1,43,self-employed,divorced,secondary,no,347,no,no,cellular,19,nov,1468,3,-1,0,unknown
18015,1,55,management,married,tertiary,no,-375,no,no,cellular,30,jul,814,2,-1,0,unknown


In [48]:
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,     # sample with replacement
                                 n_samples=len(df_minority),    # to match majority class
                                 random_state=101) # reproducible results
 
# Combine majority class with upsampled minority class
df_downsampled = pd.concat([df_minority, df_majority_downsampled])
 
# Display new class counts
df_downsampled

Unnamed: 0,y_trans,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
83,1,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown
86,1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown
87,1,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown
129,1,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown
168,1,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25304,0,40,technician,married,secondary,no,7313,yes,no,cellular,18,nov,241,2,182,1,failure
30294,0,27,unemployed,single,secondary,no,11,no,no,cellular,5,feb,257,1,-1,0,unknown
39379,0,38,self-employed,married,secondary,no,8094,no,no,cellular,22,may,94,7,-1,0,unknown
8794,0,24,blue-collar,single,secondary,no,476,yes,no,unknown,4,jun,161,1,-1,0,unknown
