In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
%matplotlib inline

# Bank Marketing Data - A Decision Tree Approach

## Aim:
The aim of this attempt is to predict if the client will subscribe (yes/no) to a term deposit, by building a classification model using Decision Tree.
### Step 1: Load the data
- Load the `bank.csv` data
- Check the first five observations
- Check if there are any null values

In [16]:
# Read the raw bank-marketing records. The CSV's first column has no header,
# so pandas loads it as `Unnamed: 0` (visible in the preview below).
bank = pd.read_csv("bank.csv")
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


## Summary of data

### Categorical Variables :
**[1] job      :** admin., technician, services, management, retired, blue-collar, unemployed, entrepreneur,
               housemaid, unknown, self-employed, student
<br>**[2] marital  :** married, single, divorced
<br>**[3] education:** secondary, tertiary, primary, unknown
<br>**[4] default  :** yes, no
<br>**[5] housing  :** yes, no
<br>**[6] loan     :** yes, no
<br>**[7] deposit  :** yes, no **(Dependent Variable)**
<br>**[8] contact  :** unknown, cellular, telephone
<br>**[9] month    :** jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec
<br>**[10] poutcome:** unknown, other, failure, success

### Numerical Variables:
**[1] age**
<br>**[2] balance**
<br>**[3] day**
<br>**[4] duration**
<br>**[5] campaign**
<br>**[6] pdays**
<br>**[7] previous**

In [17]:
# Number of missing values in each column (all zeros means the data is complete).
# Counting over the subset of rows that contain a null would instead report the
# *non-null* cells in those rows, so a column that is actually missing could
# misleadingly show a small number or 0.
bank.isnull().sum()

Unnamed: 0,0
age,0
job,0
marital,0
education,0
default,0
balance,0
housing,0
loan,0
contact,0
day,0


### Step 2: Transformer
- Create a transformer pipeline for numeric and categorical features. Numerical features will be imputed and scaled; categorical features will be imputed and encoded.
- Create a Column transformer

In [18]:
# Work on a copy so that later edits to `b_data` do not alter the raw `bank` frame.
b_data = bank.copy()

# Numeric columns: fill gaps with the column median, then standardise.
# 'campaign' is listed here so that every numeric variable reaches the model:
# ColumnTransformer's default remainder='drop' silently discards any column
# that is not named in one of the groups (e.g. `Unnamed: 0`).
numeric_features = ['age', 'balance', 'day', 'duration', 'campaign',
                    'pdays', 'previous']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' keeps transform from failing on categories
# that were not present when the encoder was fitted.
categorical_features = ['job', 'marital', 'education', 'default',
                        'housing', 'loan', 'contact', 'month', 'poutcome']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

### Step 3: Classifier
- Create a pipeline for the decision tree classifier as well as the transformer
- Encode the target variable using `LabelEncoder`

In [19]:
# Full model: the column preprocessor followed by a shallow decision tree
# (fixed seed, depth capped at 2).
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=1, max_depth=2)),
])

# Encode the yes/no target as integers. LabelEncoder orders its classes, so
# 'no' becomes 0 and 'yes' becomes 1.
label_encoder = LabelEncoder()

# Rebuild `b_data` from the raw frame instead of mutating it in place: the cell
# can then be re-run without failing on the already-removed 'deposit' column.
b_data = bank.drop(columns='deposit')
b_data['deposit_cat'] = label_encoder.fit_transform(bank['deposit'])

### Step 4: Model
- Split the data into training (80%) and test (20%) sets
- Fit the pipeline on the training set and report the accuracy on both sets

In [25]:
# Separate the predictors from the encoded target.
y = b_data['deposit_cat']
X = b_data.drop(columns='deposit_cat')

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50
)

# Fit preprocessing + tree on the training rows only, then score both partitions.
clf.fit(X_train, y_train)
score_train = clf.score(X_train, y_train)
score_test = clf.score(X_test, y_test)
print('Training score:', score_train)
print('Testing score:', score_test)

Training score: 0.7285250307985217
Testing score: 0.7268248992386923
