In [1]:
%matplotlib inline
from collections import Counter
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline
from pathlib import Path
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys
sys.path.append("../../..")

In [4]:
from utils.distMix import distmix
from utils.RSMOTENC import RSMOTENC
from utils.SMOTEENC import SMOTEENC
from utils.auxSamplingStudy import *

In [44]:
from analysis.data.bank.config import DATA, MODELS, REPORTS, idbin, idcat, idnum

# Read the file

In [7]:
df= pd.read_csv(DATA / 'bank.csv')

In [8]:
df= remove_outlier(df, 'age', 2.5)
df= remove_outlier(df, 'campaign', 2.5)
df= remove_outlier(df, 'emp.var.rate', 2.5)
df= remove_outlier(df, 'cons.price.idx', 2.5)
df= remove_outlier(df, 'cons.conf.idx', 2.5)
df= remove_outlier(df, 'euribor3m', 2.5)
df= remove_outlier(df, 'nr.employed', 2.5)

In [9]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [10]:
is_pdays_999 = df['pdays'] == 999
df.loc[is_pdays_999, 'pdays_c'] = "never contacted"
df.loc[~is_pdays_999, 'pdays_c'] = pd.qcut(df.loc[~is_pdays_999, 'pdays'], 4, labels=["very recently contacted","recently contacted", "moderately recently contacted", "contacted long ago"])
df[['pdays_c', 'pdays']]

df.drop('pdays', 1, inplace= True)

In [11]:
is_previous_0 = df['previous'] == 0
df.loc[is_previous_0, 'previous_c'] = "never contacted"
df.loc[~is_previous_0, 'previous_c'] = pd.cut(df.previous,bins=[0, 1, 4, 7],labels=["contacted once", "rarely contacted", "frequently contacted"])
df[['previous_c', 'previous']]

df.drop('previous', 1, inplace= True)

In [12]:
#### In order to feed the data to any machine learning method, 
### it's convenient to change strings to numeric values. So, we are going to change 'no' to 0 and 'yes' to 1
is_purchased = df['y'] == 'yes'
df.loc[is_purchased, 'target'] = 1
df.loc[~is_purchased, 'target'] = 0
df[['target', 'y']]
df.drop('y', 1, inplace= True)

In [13]:
df_num= df[['age','duration','campaign','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','target']]
corr_num = df_num.corr()
corr_num

Unnamed: 0,age,duration,campaign,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,target
age,1.0,-0.006561,0.006875,0.081826,0.054775,0.109962,0.092023,0.077782,-0.02421
duration,-0.006561,1.0,-0.037328,-0.014408,0.00574,-0.012044,-0.017155,-0.024651,0.423297
campaign,0.006875,-0.037328,1.0,0.10192,0.087973,-0.007239,0.081965,0.09554,-0.046072
emp.var.rate,0.081826,-0.014408,0.10192,1.0,0.845136,0.328424,0.978759,0.95469,-0.257349
cons.price.idx,0.054775,0.00574,0.087973,0.845136,1.0,0.190927,0.805977,0.735563,-0.185322
cons.conf.idx,0.109962,-0.012044,-0.007239,0.328424,0.190927,1.0,0.397912,0.217505,0.027898
euribor3m,0.092023,-0.017155,0.081965,0.978759,0.805977,0.397912,1.0,0.9626,-0.248442
nr.employed,0.077782,-0.024651,0.09554,0.95469,0.735563,0.217505,0.9626,1.0,-0.274426
target,-0.02421,0.423297,-0.046072,-0.257349,-0.185322,0.027898,-0.248442,-0.274426,1.0


In [14]:
# emp.var.rate and nr.employed are highly correlated with euribor3m. 
# That's why former two are removed to get rid of multi-collinearity.
# Duration column is substantially correlated to target 
# and could be a good predictor of target outcome. 
# However, one can not know call duration before making the call. 
# That's why this column is removed so that the model can generalise on unseen data

In [15]:
df.drop('duration',1,inplace=True)
df.drop('emp.var.rate',1,inplace=True)
df.drop('nr.employed',1,inplace=True)

In [16]:
df.drop_duplicates(inplace=True)

In [17]:
df_feature= df.drop('target',1)
df_target= df[['target']]

In [18]:
print(Counter(df['target']))

Counter({0.0: 32599, 1.0: 3519})


In [19]:
# Num of minority instances is 3519 and majority instances is 32599

# Create helper functions

In [21]:
## Split the data to be 5-fold cross-validated
kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
#randomforest model - hyperparameter tuning using grid search
param_grid = {
'max_depth': [10], 'max_features': [5, 10],
'min_samples_leaf': [3, 5], 'min_samples_split': [2, 4], 'n_estimators': [500]
}# Create a base model
param_grid = {'randomforestclassifier__' + key: param_grid[key] for key in param_grid}

In [22]:
encoded_df = MultiColumnLabelEncoder(columns = [ 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week',
                                               'poutcome', 'pdays_c', 'previous_c']).fit_transform(df)


In [23]:
encoded_df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'campaign', 'poutcome',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'pdays_c', 'previous_c',
       'target'],
      dtype='object')

In [24]:
anovap_value = list()
for cols in ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week',
                                               'poutcome', 'pdays_c', 'previous_c']:
    statistic, p = stats.f_oneway(encoded_df[cols], encoded_df['target'])
    anovap_value.append(tuple([cols, p]))
    
print(anovap_value)

[('job', 0.0), ('marital', 0.0), ('education', 0.0), ('default', 0.0), ('housing', 0.0), ('loan', 0.0), ('contact', 0.0), ('month', 0.0), ('day_of_week', 0.0), ('poutcome', 0.0), ('pdays_c', 0.0), ('previous_c', 0.0)]


In [25]:
from scipy.stats import chi2_contingency
chi2p_value = list()
for cols in ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week',
                                               'poutcome', 'pdays_c', 'previous_c']:
    obs = pd.crosstab(encoded_df[cols], encoded_df['target'])
    g, p, dof, expctd = chi2_contingency(obs)
    chi2p_value.append(tuple([cols, p]))
    
print(chi2p_value)

[('job', 8.274959557723803e-85), ('marital', 1.0490021676190478e-27), ('education', 4.722556979141996e-30), ('default', 2.396736751936363e-56), ('housing', 0.019146278089660962), ('loan', 0.13725167724069698), ('contact', 3.456262374450821e-129), ('month', 0.0), ('day_of_week', 0.0019573310174748654), ('poutcome', 0.0), ('pdays_c', 0.0), ('previous_c', 7.705708237314738e-209)]


In [26]:
encoded_df= encoded_df.drop('target',1)

In [27]:
1 - 2000 / encoded_df.shape[0]

0.9446259482806357

In [28]:
np.sum(df_target)/df_target.shape[0]

target    0.097431
dtype: float64

In [29]:
#Sampling: Added by me
encoded_df, aux1, df_target, aux2 = train_test_split(encoded_df, df_target, test_size=0.95, random_state=12)

In [30]:
df_target.shape

(1805, 1)

In [31]:
np.sum(df_target)

target    153.0
dtype: float64

In [32]:
np.sum(df_target)/df_target.shape[0]

target    0.084765
dtype: float64

In [33]:
1/12

0.08333333333333333

In [34]:
# Split the raw data into train and test set. Split ratio = 75:25
X_train, X_test, y_train, y_test = train_test_split(encoded_df, df_target, test_size=0.25, random_state=12)

In [35]:
col_list = list(X_train.columns)
X_train.index = pd.RangeIndex(len(X_train.index))
y_train.index = pd.RangeIndex(len(y_train.index))
X_test.index = pd.RangeIndex(len(X_test.index))
y_test.index = pd.RangeIndex(len(y_test.index))

In [36]:
X_train.shape

(1353, 17)

In [37]:
## Apply standard scaler on the features , so that euclidean distance calculation in SMOTE is not biased

columns = [0,10,12,13,14]
name_columns = X_train.columns[columns]

sc = StandardScaler()
X_train[name_columns] = sc.fit_transform(X_train[name_columns])
X_test[name_columns] = sc.transform(X_test[name_columns])

In [38]:
X_train = pd.DataFrame(X_train, index=range(X_train.shape[0]),
                          columns=col_list)
X_test = pd.DataFrame(X_test, index=range(X_test.shape[0]),
                          columns=col_list)

In [39]:
X_train.shape

(1353, 17)

In [40]:
X_train.to_csv(DATA / "bank_X_train.csv", index = False)
y_train.to_csv(DATA / "bank_y_train.csv", index = False)
X_test.to_csv(DATA / "bank_X_test.csv", index = False)
y_test.to_csv(DATA / "bank_y_test.csv", index = False)