In [86]:
#Import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [87]:
df = pd.read_csv('bank-marketing.csv')

In [88]:
df.head()

Unnamed: 0,age,job,salary,marital,education,targeted,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,response
0,58,management,100000,married,tertiary,yes,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,60000,single,secondary,yes,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,120000,married,secondary,yes,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,20000,married,unknown,no,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,0,single,unknown,no,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [89]:
Itemlist = []
for col in df.columns:
    Itemlist.append([col, df[col].dtype, df[col].isnull().sum(),
                     round(df[col].isnull().sum()/len(df[col])*100,2), 
                     df[col].nunique(), 
                     list(df[col].sample(5).drop_duplicates().values)])

dfDesc = pd.DataFrame(columns=['dataFeatures', 'dataType', 'null', 'nullPct', 'unique', 'uniqueSample'], data=Itemlist)
dfDesc

Unnamed: 0,dataFeatures,dataType,null,nullPct,unique,uniqueSample
0,age,int64,0,0.0,77,"[42, 32, 35, 36, 45]"
1,job,object,0,0.0,12,"[blue-collar, services, student, technician]"
2,salary,int64,0,0.0,11,"[100000, 0, 55000, 70000]"
3,marital,object,0,0.0,3,"[married, single]"
4,education,object,0,0.0,4,"[secondary, unknown]"
5,targeted,object,0,0.0,2,"[yes, no]"
6,default,object,0,0.0,2,[no]
7,balance,int64,0,0.0,7168,"[831, 1208, 2152, 13763, 1956]"
8,housing,object,0,0.0,2,"[yes, no]"
9,loan,object,0,0.0,2,[no]


In [90]:
df['poutcome'].unique()

array(['unknown', 'failure', 'other', 'success'], dtype=object)

### Data Preprocessing

In [91]:
# Map pdays=-1 into a large value (10000 is used) to indicate that it is so far in the past that it has no effect
df.loc[df['pdays']==-1, 'pdays'] = 10000

In [92]:
# Create a new column: recent_pdays 
df['recent_pdays'] = np.where(df['pdays'], 1/df.pdays, 1/df.pdays)

# Drop 'pdays'
df.drop('pdays', axis=1, inplace = True)

In [93]:
# Label Encoding
from sklearn import preprocessing
  
category_col =df.select_dtypes('object').columns.tolist()
labelEncoder = preprocessing.LabelEncoder() 
  
mapping_dict ={} 
for col in category_col: 
    df[col] = labelEncoder.fit_transform(df[col]) 
  
    le_name_mapping = dict(zip(labelEncoder.classes_, 
                        labelEncoder.transform(labelEncoder.classes_))) 
  
    mapping_dict[col]= le_name_mapping 
print(mapping_dict) 

{'job': {'admin.': 0, 'blue-collar': 1, 'entrepreneur': 2, 'housemaid': 3, 'management': 4, 'retired': 5, 'self-employed': 6, 'services': 7, 'student': 8, 'technician': 9, 'unemployed': 10, 'unknown': 11}, 'marital': {'divorced': 0, 'married': 1, 'single': 2}, 'education': {'primary': 0, 'secondary': 1, 'tertiary': 2, 'unknown': 3}, 'targeted': {'no': 0, 'yes': 1}, 'default': {'no': 0, 'yes': 1}, 'housing': {'no': 0, 'yes': 1}, 'loan': {'no': 0, 'yes': 1}, 'contact': {'cellular': 0, 'telephone': 1, 'unknown': 2}, 'month': {'apr': 0, 'aug': 1, 'dec': 2, 'feb': 3, 'jan': 4, 'jul': 5, 'jun': 6, 'mar': 7, 'may': 8, 'nov': 9, 'oct': 10, 'sep': 11}, 'poutcome': {'failure': 0, 'other': 1, 'success': 2, 'unknown': 3}, 'response': {'no': 0, 'yes': 1}}


In [94]:
df.head()

Unnamed: 0,age,job,salary,marital,education,targeted,default,balance,housing,loan,contact,day,month,duration,campaign,previous,poutcome,response,recent_pdays
0,58,4,100000,1,2,1,0,2143,1,0,2,5,8,261,1,0,3,0,0.0001
1,44,9,60000,2,1,1,0,29,1,0,2,5,8,151,1,0,3,0,0.0001
2,33,2,120000,1,1,1,0,2,1,1,2,5,8,76,1,0,3,0,0.0001
3,47,1,20000,1,3,0,0,1506,1,0,2,5,8,92,1,0,3,0,0.0001
4,33,11,0,2,3,0,0,1,0,0,2,5,8,198,1,0,3,0,0.0001


### Model Selection

In [95]:
X = df.drop('response', 1)
y = df['response']

# split into train and test set using sklearn model_selection
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [96]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE

models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

In [97]:
models

[('LR', LogisticRegression()),
 ('LDA', LinearDiscriminantAnalysis()),
 ('KNN', KNeighborsClassifier()),
 ('CART', DecisionTreeClassifier()),
 ('RF', RandomForestClassifier()),
 ('NB', GaussianNB()),
 ('SVM', SVC())]

In [98]:
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10, random_state='seed')
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold,scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print("{} {}({})".format(name, cv_results.mean(), cv_results.std()))

LR 0.8875249023783951(0.006185429145849432)
LDA 0.8916448124148257(0.007083474880935084)
KNN 0.8746959035014106(0.004541707716298627)
CART 0.8730367643820601(0.006487623590722167)
RF 0.9042247645704528(0.00387625289884734)
NB 0.8797279959923762(0.0066897498734920175)
SVM 0.88393046540305(0.005735973324191322)


### Feature Selection

In [111]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rfe = RFE(rf_model, 8)
rfe = rfe.fit(X_train, y_train)

In [112]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

[('age', True, 1),
 ('job', True, 1),
 ('salary', False, 3),
 ('marital', False, 7),
 ('education', False, 4),
 ('targeted', False, 10),
 ('default', False, 11),
 ('balance', True, 1),
 ('housing', False, 5),
 ('loan', False, 9),
 ('contact', False, 8),
 ('day', True, 1),
 ('month', True, 1),
 ('duration', True, 1),
 ('campaign', False, 2),
 ('previous', False, 6),
 ('poutcome', True, 1),
 ('recent_pdays', True, 1)]

In [113]:
X_train.columns[rfe.support_]

Index(['age', 'job', 'balance', 'day', 'month', 'duration', 'poutcome',
       'recent_pdays'],
      dtype='object')

In [114]:
X_train_rfe = X_train[X_train.columns[rfe.support_]]
X_test_rfe = X_test[X_test.columns[rfe.support_]]

### Fitting Model

In [115]:
model = RandomForestClassifier()

model.fit(X_train_rfe, y_train)

RandomForestClassifier()

In [116]:
y_pred = model.predict(X_test_rfe)

In [117]:
accuracy_score(y_test, y_pred)

0.9031294924250802

In [121]:
arr = [[23, 2345, 2343, 34, 4, 46, 2, 23]]
model.predict(arr)[0]

0

### Pickling Model

In [118]:
import pickle 

pickle.dump(model, open('model.pkl', 'wb'))