In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [15]:
df = pd.read_csv("bank-full.csv")
df.shape

(41188, 21)

In [17]:
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [18]:
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
subscribed         object
dtype: object

In [19]:
df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
subscribed        0
dtype: int64

In [21]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'subscribed'],
      dtype='object')

In [22]:
df['subscribed'].unique()

array(['no', 'yes'], dtype=object)

In [27]:
subscription = {'no':0, 'yes':1}
df = df.replace({'subscribed': subscription})

In [32]:
df['job'].unique()

array(['housemaid', 'services', 'admin.', 'blue-collar', 'technician',
       'retired', 'management', 'unemployed', 'self-employed', 'unknown',
       'entrepreneur', 'student'], dtype=object)

In [29]:
df['education'].unique()

array(['basic.4y', 'high.school', 'basic.6y', 'basic.9y',
       'professional.course', 'unknown', 'university.degree',
       'illiterate'], dtype=object)

In [30]:
df['default'].unique()

array(['no', 'unknown', 'yes'], dtype=object)

In [31]:
df['marital'].unique()

array(['married', 'single', 'divorced', 'unknown'], dtype=object)

In [33]:
df['housing'].unique()

array(['no', 'yes', 'unknown'], dtype=object)

In [35]:
df['loan'].unique()

array(['no', 'yes', 'unknown'], dtype=object)

In [36]:
df['contact'].unique()

array(['telephone', 'cellular'], dtype=object)

In [37]:
df['month'].unique()

array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'mar', 'apr',
       'sep'], dtype=object)

In [38]:
df['day_of_week'].unique()

array(['mon', 'tue', 'wed', 'thu', 'fri'], dtype=object)

In [39]:
X = df.drop('subscribed', axis=1).copy()
y = df['subscribed'].copy()

In [42]:
# we need to apply one Hot encoding on these categorical columns as no relation so can't apply label encoding
X_dummies = pd.get_dummies(X, columns=['job', 'marital', 'education', 'default', 'housing', 'loan',
                                        'contact', 'month', 'day_of_week','poutcome'])

In [43]:
X_dummies

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,334,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,0,1,0,0,0,0,0,1,0
41184,46,383,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,0,1,0,0,0,0,0,1,0
41185,56,189,2,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,0,1,0,0,0,0,0,1,0
41186,44,442,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,0,1,0,0,0,0,0,1,0


In [44]:
X.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

In [45]:
X_dummies.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oc

In [46]:
X_dummies.dtypes

age                     int64
duration                int64
campaign                int64
pdays                   int64
previous                int64
                        ...  
day_of_week_tue         uint8
day_of_week_wed         uint8
poutcome_failure        uint8
poutcome_nonexistent    uint8
poutcome_success        uint8
Length: 63, dtype: object

In [47]:
# data is not balanced as only 11% subscibed so we will use stratify in split to make the balance
sum(y)/len(y)

0.11265417111780131

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=48, stratify=y)

In [51]:
sum(y_train)/len(y_train)

0.11265417111780131

In [52]:
sum(y_test)/len(y_test)

0.11265417111780131

**1. Decision Trees**

In [54]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import plot_tree

In [55]:
dt_model = DecisionTreeClassifier(max_depth=6)

In [57]:
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6)

In [59]:
y_pred = dt_model.predict(X_test)

In [60]:
accuracy_score(y_test, y_pred)

0.9149266776731086

In [61]:
confusion_matrix(y_test, y_pred)

array([[8745,  392],
       [ 484,  676]])

**2. Random Forest**

In [62]:
from sklearn.ensemble import RandomForestClassifier

In [70]:
rf_model = RandomForestClassifier(n_estimators=800, max_depth=8)
rf_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, n_estimators=800)

In [71]:
y_rf_pred = rf_model.predict(X_test)

In [72]:
accuracy_score(y_test, y_rf_pred)

0.9076430028163542

In [73]:
rf_model.feature_importances_

array([2.26306268e-02, 2.98710259e-01, 6.29781544e-03, 7.10597277e-02,
       1.69849390e-02, 5.45542779e-02, 4.13361318e-02, 5.25246309e-02,
       1.19799852e-01, 1.18885664e-01, 1.60645928e-03, 2.25445721e-03,
       7.70898980e-04, 6.73170376e-04, 1.12285684e-03, 2.66643002e-03,
       8.99680040e-04, 1.05579665e-03, 2.35006341e-03, 1.62174631e-03,
       8.10141477e-04, 7.23246748e-04, 1.23347307e-03, 1.79889614e-03,
       1.99489967e-03, 1.95673283e-04, 1.48089475e-03, 1.13920916e-03,
       1.55716169e-03, 1.55341285e-03, 1.61297121e-04, 1.64013430e-03,
       2.67800511e-03, 1.15221901e-03, 2.88913516e-03, 2.75060057e-03,
       0.00000000e+00, 1.85322380e-03, 7.62374740e-04, 1.82901481e-03,
       1.60824290e-03, 6.84623581e-04, 1.65562333e-03, 8.41281012e-03,
       8.39809186e-03, 4.96738844e-03, 1.63824743e-03, 1.28043336e-03,
       1.53059359e-03, 4.03617239e-03, 9.97501213e-03, 8.30234407e-03,
       1.41069473e-03, 1.23323175e-02, 2.96191119e-03, 1.86939575e-03,
      

In [75]:
params = {
    'n_estimators': [100,200,300,400,500,600],
    'max_depth': [4,5,6,7,8],
}

from sklearn.model_selection import GridSearchCV
grid_cv = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=4, scoring='accuracy')
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [4, 5, 6, 7, 8],
                         'n_estimators': [100, 200, 300, 400, 500, 600]},
             scoring='accuracy')

In [76]:
grid_cv.best_params_

{'max_depth': 8, 'n_estimators': 300}

In [78]:
y_rf_pred = grid_cv.predict(X_test)

In [79]:
accuracy_score(y_test, y_rf_pred)

0.9074487714868408