In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
pd.options.display.max_columns = None
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [4]:
df = pd.read_excel('bank data.xlsx')

In [28]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,210,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,138,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,339,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,185,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,137,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


In [29]:
df.shape

(41188, 21)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp_var_rate    41188 non-null  float64
 16  cons_price_idx  41188 non-null  float64
 17  cons_conf_idx   41188 non-null 

In [31]:
df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp_var_rate      0
cons_price_idx    0
cons_conf_idx     0
euribor3m         0
nr_employed       0
y                 0
dtype: int64

In [32]:
X_df = df.drop('y', axis=1) #data 
y = df['y'] #label

X_df.shape,y.shape

((41188, 20), (41188,))

In [33]:
objList = X_df.select_dtypes(include = "object").columns #
print (objList)

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')


In [34]:
#Label Encoding for object to numeric conversion
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for feat in objList:
    X_df[feat] = le.fit_transform(X_df[feat].astype(str))

print (X_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  int32  
 2   marital         41188 non-null  int32  
 3   education       41188 non-null  int32  
 4   default         41188 non-null  int32  
 5   housing         41188 non-null  int32  
 6   loan            41188 non-null  int32  
 7   contact         41188 non-null  int32  
 8   month           41188 non-null  int32  
 9   day_of_week     41188 non-null  int32  
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  int32  
 15  emp_var_rate    41188 non-null  float64
 16  cons_price_idx  41188 non-null  float64
 17  cons_conf_idx   41188 non-null 

In [35]:
# Split feature and label sets to train and data sets.

X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size = 0.2, random_state = 10)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((32950, 20), (8238, 20), (32950,), (8238,))

In [36]:
scaler = MinMaxScaler(feature_range = (0,1))

scaler.fit(X_train)
scaler.fit(X_test)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [37]:
#Python scikit-learn provides a Pipeline utility to help automate machine learning workflows.
#Pipelines work by allowing for a linear sequence of data transforms to be chained together 
#culminating in a modeling process that can be evaluated.

pipe = Pipeline([('classifier' , LogisticRegression())])

# Create param grid.

param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.linspace(1, 10, 20),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [GradientBoostingClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
     'classifier__learning_rate' : np.linspace(0.1, 10, 50)},
     {'classifier' : [GaussianNB()],'classifier__var_smoothing' :np.linspace(0.0000001, 0.001,4)},
     {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
     'classifier__max_depth' : list(range(5,20,5))},
    {'classifier' : [DecisionTreeClassifier()],
     'classifier__max_depth' : list(range(5,20,5))},
    {'classifier' : [KNeighborsClassifier()],
     'classifier__n_neighbors' : list(range(2,10,1))},
    

   
]

# Create grid search object

clf = GridSearchCV(pipe, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

# Fit on data

best_clf = clf.fit(X_train, y_train)


Fitting 3 folds for each of 585 candidates, totalling 1755 fits


 0.91004547 0.91053107 0.91001512 0.91050072 0.91001513 0.91053106
 0.91007583 0.91050072 0.91007582 0.91047036 0.91007582 0.91034896
 0.91010618 0.91028826 0.91010618 0.91022756 0.91001513 0.91019721
 0.91004548 0.91019721 0.90998478 0.91016686 0.91013653 0.91013652
 0.90998478 0.91013652 0.90998478 0.91013652 0.90995443 0.91007582
 0.91004548 0.91007582 0.90995443 0.91004547 0.90248859 0.91122915
 0.9135053  0.91465853 0.91496201 0.91468886 0.91465851 0.9150834
 0.91532619 0.91550828 0.91441571 0.91386942 0.91450677 0.91453709
 0.91459781 0.91465853 0.91453712 0.91471922 0.91499235 0.91526551
 0.9140212  0.91447641 0.91499231 0.91426395 0.91347483 0.91335349
 0.91386941 0.91484058 0.91417293 0.91426395 0.91298927 0.91265539
 0.91241262 0.91177531 0.91168427 0.91171459 0.9111076  0.91086482
 0.91028817 0.91095585 0.91174493 0.91244297 0.91314104 0.91262512
 0.91201813 0.91235198 0.91201813 0.91186637 0.9110773  0.9110166
 0.89933248 0.89905934 0.89890759 0.89893794 0.89912004 0.899150

In [38]:
best_clf.best_estimator_.get_params()['classifier']

GradientBoostingClassifier()

In [39]:
grid_predictions = best_clf.predict(X_test) 

In [40]:
print('Model accuracy is',best_clf.score(X_test, y_test))

Model accuracy is 0.9197620781743141


In [41]:
tableclf= pd.DataFrame(best_clf.cv_results_)
tableclf.to_excel("tableclf.xlsx")