In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("bank-full.csv")
df.shape

(41188, 21)

In [3]:
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [4]:
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
subscribed         object
dtype: object

In [5]:
df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
subscribed        0
dtype: int64

In [6]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'subscribed'],
      dtype='object')

In [7]:
df['subscribed'].unique()

array(['no', 'yes'], dtype=object)

In [8]:
subscription = {'no':0, 'yes':1}
df = df.replace({'subscribed': subscription})

In [9]:
df['job'].unique()

array(['housemaid', 'services', 'admin.', 'blue-collar', 'technician',
       'retired', 'management', 'unemployed', 'self-employed', 'unknown',
       'entrepreneur', 'student'], dtype=object)

In [10]:
df['education'].unique()

array(['basic.4y', 'high.school', 'basic.6y', 'basic.9y',
       'professional.course', 'unknown', 'university.degree',
       'illiterate'], dtype=object)

In [11]:
df['default'].unique()

array(['no', 'unknown', 'yes'], dtype=object)

In [12]:
df['marital'].unique()

array(['married', 'single', 'divorced', 'unknown'], dtype=object)

In [13]:
df['housing'].unique()

array(['no', 'yes', 'unknown'], dtype=object)

In [14]:
df['loan'].unique()

array(['no', 'yes', 'unknown'], dtype=object)

In [15]:
df['contact'].unique()

array(['telephone', 'cellular'], dtype=object)

In [16]:
df['month'].unique()

array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'mar', 'apr',
       'sep'], dtype=object)

In [17]:
df['day_of_week'].unique()

array(['mon', 'tue', 'wed', 'thu', 'fri'], dtype=object)

In [18]:
X = df.drop('subscribed', axis=1).copy()
y = df['subscribed'].copy()

In [19]:
# we need to apply one Hot encoding on these categorical columns as no relation so can't apply label encoding
X_dummies = pd.get_dummies(X, columns=['job', 'marital', 'education', 'default', 'housing', 'loan',
                                        'contact', 'month', 'day_of_week','poutcome'])

In [20]:
X_dummies

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,334,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,0,1,0,0,0,0,0,1,0
41184,46,383,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,0,1,0,0,0,0,0,1,0
41185,56,189,2,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,0,1,0,0,0,0,0,1,0
41186,44,442,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,0,1,0,0,0,0,0,1,0


In [21]:
X.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

In [22]:
X_dummies.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oc

In [23]:
X_dummies.dtypes

age                     int64
duration                int64
campaign                int64
pdays                   int64
previous                int64
                        ...  
day_of_week_tue         uint8
day_of_week_wed         uint8
poutcome_failure        uint8
poutcome_nonexistent    uint8
poutcome_success        uint8
Length: 63, dtype: object

In [24]:
# data is not balanced as only 11% subscibed so we will use stratify in split to make the balance
sum(y)/len(y)

0.11265417111780131

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=48, stratify=y)

In [26]:
sum(y_train)/len(y_train)

0.11265417111780131

In [27]:
sum(y_test)/len(y_test)

0.11265417111780131

**1. Decision Trees**

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import plot_tree

In [29]:
dt_model = DecisionTreeClassifier(max_depth=6)

In [30]:
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6)

In [31]:
y_pred = dt_model.predict(X_test)

In [32]:
accuracy_score(y_test, y_pred)

0.9149266776731086

In [33]:
confusion_matrix(y_test, y_pred)

array([[8745,  392],
       [ 484,  676]])

**2. Random Forest**

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
rf_model = RandomForestClassifier(n_estimators=800, max_depth=8)
rf_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, n_estimators=800)

In [36]:
y_rf_pred = rf_model.predict(X_test)

In [37]:
accuracy_score(y_test, y_rf_pred)

0.907740118481111

In [38]:
rf_model.feature_importances_

array([2.19079339e-02, 2.92150918e-01, 6.66680894e-03, 7.55322775e-02,
       1.42468965e-02, 5.42878105e-02, 4.16644396e-02, 5.18064630e-02,
       1.09457057e-01, 1.38660067e-01, 1.61828002e-03, 2.63725088e-03,
       6.94447270e-04, 6.96019255e-04, 1.09007230e-03, 2.71934236e-03,
       8.73324114e-04, 1.10660333e-03, 2.39498188e-03, 1.73972476e-03,
       8.03281026e-04, 7.25214695e-04, 1.24977522e-03, 1.83659858e-03,
       2.16858224e-03, 2.03313558e-04, 1.55165415e-03, 1.06667362e-03,
       1.60555888e-03, 1.67048367e-03, 1.33268766e-04, 1.63180628e-03,
       2.53322483e-03, 1.17524490e-03, 2.99616291e-03, 2.75656941e-03,
       0.00000000e+00, 1.87683484e-03, 7.44080241e-04, 1.82176584e-03,
       1.76488933e-03, 6.82607551e-04, 1.78561049e-03, 8.97636514e-03,
       8.99827870e-03, 5.32771536e-03, 1.59483085e-03, 9.42472778e-04,
       1.46684111e-03, 3.82316017e-03, 1.04022609e-02, 6.93108623e-03,
       1.42903874e-03, 1.10366850e-02, 2.67113332e-03, 1.78868823e-03,
      

In [39]:
params = {
    'n_estimators': [100,200,400,600],
    'max_depth': [4,6,8]
}

from sklearn.model_selection import GridSearchCV
grid_cv = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=4, scoring='accuracy')
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [4, 6, 8],
                         'n_estimators': [100, 200, 400, 600]},
             scoring='accuracy')

In [40]:
grid_cv.best_params_

{'max_depth': 8, 'n_estimators': 200}

In [41]:
y_rf_pred = grid_cv.predict(X_test)

In [42]:
accuracy_score(y_test, y_rf_pred)

0.9075458871515976

**3. AdaBoost**

In [44]:
from sklearn.ensemble import AdaBoostClassifier

In [45]:
adb_model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                               n_estimators=100,
                               learning_rate=0.8)

In [46]:
adb_model.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.8, n_estimators=100)

In [47]:
y_adb_pred = adb_model.predict(X_test)

In [48]:
accuracy_score(y_test, y_adb_pred)

0.9102651257647859

**4. Gradient Boost**

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

In [51]:
gbc_model = GradientBoostingClassifier(
                               n_estimators=100,
                               learning_rate=0.8)

In [52]:
gbc_model.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.8)

In [58]:
y_gbc_pred = gbc_model.predict(X_test)

In [60]:
accuracy_score(y_test, y_gbc_pred)

0.9170632222977566

**5. XGBoost**

In [63]:
import xgboost as xgb

In [64]:
xgb_model = xgb.XGBClassifier()

In [65]:
xgb_model.fit(X_train, y_train)

XGBClassifier()

In [66]:
y_xgb_model = xgb_model.predict(X_test)

In [67]:
accuracy_score(y_test, y_xgb_model)

0.9174516849567835

In [69]:
params_xgb = {
    'max_depth':[4,6,8],
    'learning_rate':[0.2,0.4,0.8]
}

In [70]:
grid_cv = GridSearchCV(xgb.XGBClassifier(), param_grid=params_xgb, cv=4, scoring='accuracy')
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=XGBClassifier(),
             param_grid={'learning_rate': [0.2, 0.4, 0.8],
                         'max_depth': [4, 6, 8]},
             scoring='accuracy')

In [71]:
grid_cv.best_params_

{'learning_rate': 0.2, 'max_depth': 4}

In [72]:
y_xgb_pred = grid_cv.predict(X_test)

In [73]:
accuracy_score(y_test, y_xgb_pred)

0.9195882295814315