In [1]:
# Important Statements
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [2]:
dataset = pd.read_csv("bank-additional-full.csv")
dataset = dataset.drop(['emp.var.rate','nr.employed', 'duration'], axis=1)

In [3]:
dataset.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
       'poutcome', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'y'],
      dtype='object')

In [4]:
dataset.shape

(41188, 18)

In [5]:
dataset.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,cons.price.idx,cons.conf.idx,euribor3m,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,93.994,-36.4,4.857,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,93.994,-36.4,4.857,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,93.994,-36.4,4.857,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,93.994,-36.4,4.857,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,93.994,-36.4,4.857,no


In [6]:
#Initial data prep
#check for null values
#dataset.isnull().values.any()
#remove rows with >= 4 unkown values
#dataset = dataset.replace(to_replace='unknown', value=np.nan).dropna(thresh=17)
#dataset = dataset.replace(to_replace='unknown', value=np.nan).dropna()
#dataset = dataset.replace(to_replace=np.nan, value='unknown')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 18 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
y                 41188 non-null object
dtypes: float64(3), int64(4), object(11)
memory usage: 5.7+ MB


In [7]:
#One Hot Encoding all Catergorical Variables without Order
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['job','marital','education','default','housing','loan','contact','poutcome','month','day_of_week'])
#encoder = ce.BinaryEncoder(cols=['job','marital','default','housing','loan','contact','poutcome','month','day_of_week'])
dataset = encoder.fit_transform(dataset)

In [8]:
# Encoding Outcome Row to Binary
#dataset['outcome'] = dataset['outcome'].map({'yes': 1, 'no': 0})
dataset['y'] = dataset['y'].map({'yes': 1, 'no': 0})

In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 43 columns):
age               41188 non-null int64
job_0             41188 non-null int64
job_1             41188 non-null int64
job_2             41188 non-null int64
job_3             41188 non-null int64
job_4             41188 non-null int64
marital_0         41188 non-null int64
marital_1         41188 non-null int64
marital_2         41188 non-null int64
education_0       41188 non-null int64
education_1       41188 non-null int64
education_2       41188 non-null int64
education_3       41188 non-null int64
default_0         41188 non-null int64
default_1         41188 non-null int64
default_2         41188 non-null int64
housing_0         41188 non-null int64
housing_1         41188 non-null int64
housing_2         41188 non-null int64
loan_0            41188 non-null int64
loan_1            41188 non-null int64
loan_2            41188 non-null int64
contact_0         41188 non-null 

In [10]:
dataset.shape

(41188, 43)

In [11]:
# Defining Dependent and Independent values
X = dataset.iloc[:,:-1].values 
y = dataset.iloc[:,-1].values 

In [12]:
#Splitting the data into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [13]:
# Scaling
from sklearn.preprocessing import StandardScaler 
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [15]:
# note you need to install lightGBM as not part of sklearn
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import lightgbm as lgb
import numpy as np

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(bootstrap=True, criterion='gini', n_estimators=50)
clf3 = GaussianNB()
clf4 = lgb.LGBMClassifier(learning_rate = 0.2, metric = 'l1', n_estimators = 20)
lr = LogisticRegression(random_state=5)

sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4], 
                          meta_classifier=lr)

print('3-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, clf4, lr, sclf], 
                      ['KNN', 
                       'Random Forest', 
                       'Naive Bayes',
                       'lightGBM',
                        'Logistic Regression',
                        'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, X, y, 
                                              cv=10
                                             , scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

3-fold cross validation:

Accuracy: 0.67 (+/- 0.15) [KNN]
Accuracy: 0.50 (+/- 0.30) [Random Forest]
Accuracy: 0.78 (+/- 0.20) [Naive Bayes]
Accuracy: 0.43 (+/- 0.29) [lightGBM]




Accuracy: 0.80 (+/- 0.19) [Logistic Regression]




Accuracy: 0.61 (+/- 0.27) [StackingClassifier]
