# Combination

In [1]:
import imblearn
from collections import Counter
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import SMOTE, RandomOverSampler

In [2]:
df = pd.read_csv("bank-additional-full.csv", delimiter=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
cols = df.columns
var_numerical = list(df._get_numeric_data().columns)
var_categorical = list(set(cols) - set(var_numerical) - set('y'))

In [4]:
# Default variable and Day of week has no impact on the client subscribing for term deposit.
df.drop(["default"], axis = 1, inplace=True)
df.drop(["day_of_week"], axis = 1, inplace=True)


In [5]:
df.drop(['emp.var.rate', 'nr.employed'], axis = 1, inplace=True)
var_numerical = list(set(var_numerical) - set(['emp.var.rate', 'nr.employed']))

In [6]:
# Get the dummy variables for contact, poutcome , job, month, marital,  education
# Let's drop the first column from  using 'drop_first = True' as first column can be derived using other columns 

df['job'] = df['job'].apply(lambda x: -1 if x=='unknown' or x=='unemployed' else (15 if x=='entrepreneur' else (8 if x == 'blue-collar' else ( 6 if x=='technician' or x=='services' or  x=='admin.' or x=='management' else (4 if x== 'self-employed' or x=='student' else (2 if x=='housemaid' or x=='retired' else None) )))))
df['housing'] = df['housing'].apply(lambda x: 0 if x=='no' else (1 if x=='yes' else -1))
df['loan'] = df['loan'].apply(lambda x: 0 if x=='no' else (1 if x=='yes' else -1))
df['y'] = df['y'].apply(lambda x: 0 if x=='no' else (1 if x=='yes' else -1))
df['poutcome'] = df['poutcome'].apply(lambda x: 0 if x=='failure' else (2 if x=='failure' else - 1))
df['pdays'] = df['pdays'].apply(lambda x: 0 if x==999 else(20 if x<=10 else(6 if x<=20 else 3)))

df  = pd.get_dummies(df, drop_first = True)

In [7]:
y = df.pop('y')

In [8]:
df.head()

Unnamed: 0,age,job,housing,loan,duration,campaign,pdays,previous,poutcome,cons.price.idx,...,contact_telephone,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,56,2,0,0,261,1,0,0,-1,93.994,...,1,0,0,0,0,0,1,0,0,0
1,57,6,0,0,149,1,0,0,-1,93.994,...,1,0,0,0,0,0,1,0,0,0
2,37,6,1,0,226,1,0,0,-1,93.994,...,1,0,0,0,0,0,1,0,0,0
3,40,6,0,0,151,1,0,0,-1,93.994,...,1,0,0,0,0,0,1,0,0,0
4,56,6,0,1,307,1,0,0,-1,93.994,...,1,0,0,0,0,0,1,0,0,0


In [9]:
model_dt = DecisionTreeClassifier()
model_rf = RandomForestClassifier(max_depth = 10, max_features = 25, n_estimators = 100)
model_nb = GaussianNB()

In [10]:
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

## SMOTE and ENN


In [11]:
resample = SMOTEENN()

In [12]:
pipeline_dt = Pipeline(steps=[('r', resample), ('m', model_dt)])

In [13]:
%%time
scores_dt = cross_val_score(pipeline_dt, df, y, scoring = 'accuracy', cv = cv, n_jobs = -1)

CPU times: total: 1.77 s
Wall time: 3min 54s


In [14]:
np.mean(scores_dt)

0.8733045565307919

In [15]:
pipeline_rf = Pipeline(steps=[('r', resample), ('m', model_rf)])

In [16]:
%%time
scores_rf = cross_val_score(pipeline_rf, df, y, scoring = 'accuracy', cv = cv, n_jobs = -1)

CPU times: total: 1.83 s
Wall time: 8min 7s


In [17]:
np.mean(scores_rf)

0.8718882746153638

In [18]:
pipeline_nb = Pipeline(steps=[('r', resample), ('m', model_nb)])

In [19]:
%%time
scores_nb = cross_val_score(pipeline_nb, df, y, scoring = 'accuracy', cv = cv, n_jobs = -1)

CPU times: total: 1.94 s
Wall time: 5min 46s


In [20]:
np.mean(scores_nb)

0.8481677854588499

## Smote & Tomek Links

In [21]:
resample_st = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))

In [22]:
pipeline_nb_st = Pipeline(steps=[('r', resample_st), ('m', model_nb)])

In [23]:
%%time
scores_nb_st = cross_val_score(pipeline_nb_st, df, y, scoring='accuracy', cv=cv, n_jobs=-1)

CPU times: total: 1.62 s
Wall time: 6min 45s


In [24]:
np.mean(scores_nb_st)

0.83869905128954

In [25]:
pipeline_dt_st = Pipeline(steps=[('r', resample_st), ('m', model_dt)])

In [26]:
%%time
scores_dt_st = cross_val_score(pipeline_dt_st, df, y, scoring='accuracy', cv=cv, n_jobs=-1)

CPU times: total: 1.83 s
Wall time: 6min 17s


In [27]:
np.mean(scores_dt_st)

0.885670638397586

In [28]:
pipeline_rf_st = Pipeline(steps=[('r', resample_st), ('m', model_dt)])

In [29]:
%%time
scores_rf_st = cross_val_score(pipeline_rf_st, df, y, scoring='accuracy', cv=cv, n_jobs=-1)

CPU times: total: 1.98 s
Wall time: 6min 16s


In [30]:
np.mean(scores_rf_st)

0.8857839246791942

## Smote and Random Undersampling

In [31]:
over = SMOTE()
under = RandomUnderSampler()

In [32]:
pipeline_dt_sr = Pipeline(steps = [('o', over), ('u', under), ('m', model_dt)])

In [36]:
%%time
scores_dt_sr = cross_val_score(pipeline_dt_sr, df, y, scoring='accuracy', cv=cv, n_jobs=-1)

CPU times: total: 1.28 s
Wall time: 10.6 s


In [37]:
np.mean(scores_dt_sr)

0.8853387502911895

In [38]:
pipeline_rf_sr = Pipeline(steps = [('o', over), ('u', under), ('m', model_rf)])

In [40]:
%%time
scores_rf_sr = cross_val_score(pipeline_rf_sr, df, y, scoring='accuracy', cv=cv, n_jobs=-1)

CPU times: total: 1.47 s
Wall time: 2min 36s


In [41]:
np.mean(scores_rf_sr)

0.9013303960297546

In [42]:
pipeline_nb_sr = Pipeline(steps = [('o', over), ('u', under), ('m', model_nb)])

In [43]:
%%time
scores_nb_sr = cross_val_score(pipeline_nb_sr, df, y, scoring='accuracy', cv=cv, n_jobs=-1)

CPU times: total: 1.67 s
Wall time: 5.3 s


In [44]:
np.mean(scores_nb_sr)

0.838286308138293

## Random Over+Under Sampling

In [45]:
over_1 = RandomOverSampler()
under_1 = RandomUnderSampler()

In [46]:
pipeline_dt_ou = Pipeline(steps = [('o', over_1), ('u', under_1), ('m', model_dt)])

In [47]:
%%time
scores_dt_ou = cross_val_score(pipeline_dt_ou, df, y, scoring='accuracy', cv=cv, n_jobs=-1)

CPU times: total: 1.92 s
Wall time: 8.43 s


In [48]:
np.mean(scores_dt_ou)

0.8905588313010898

In [49]:
pipeline_rf_ou = Pipeline(steps = [('o', over_1), ('u', under_1), ('m', model_rf)])

In [50]:
%%time
scores_rf_ou = cross_val_score(pipeline_rf_ou, df, y, scoring='accuracy', cv=cv, n_jobs=-1)

CPU times: total: 1.95 s
Wall time: 3min 9s


In [51]:
np.mean(scores_rf_ou)

0.8749554898323366

In [52]:
pipeline_nb_ou = Pipeline(steps = [('o', over_1), ('u', under_1), ('m', model_nb)])

In [53]:
%%time
scores_nb_ou = cross_val_score(pipeline_nb_ou, df, y, scoring='accuracy', cv=cv, n_jobs=-1)

CPU times: total: 1.81 s
Wall time: 4.84 s


In [54]:
np.mean(scores_rf_ou)

0.8749554898323366