In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
%matplotlib inline

# Bank Marketing Data - A Decision Tree Approach

## Aim:
The aim of this notebook is to predict whether a client will subscribe (yes/no) to a term deposit by building a decision tree classification model.
### Step 1: Load the data
- Load the `bank.csv` data
- Check the first five observations
- Check if there are any null values

In [2]:
# Read the bank marketing data set into a DataFrame
bank = pd.read_csv('bank.csv')
# Preview the first five observations
bank.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


## Summary of data

### Categorical Variables :
**[1] job      :** admin., technician, services, management, retired, blue-collar, unemployed, entrepreneur,
               housemaid, unknown, self-employed, student
<br>**[2] marital  :** married, single, divorced
<br>**[3] education:** secondary, tertiary, primary, unknown
<br>**[4] default  :** yes, no
<br>**[5] housing  :** yes, no
<br>**[6] loan     :** yes, no 
<br>**[7] deposit  :** yes, no **(Dependent Variable)**
<br>**[8] contact  :** unknown, cellular, telephone
<br>**[9] month    :** jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec
<br>**[10] poutcome:** unknown, other, failure, success

### Numerical Variables:
**[1] age**
<br>**[2] balance**
<br>**[3] day**
<br>**[4] duration**
<br>**[5] campaign**
<br>**[6] pdays**
<br>**[7] previous**

In [4]:
# Check if the data set contains any null values - Nothing found!
# Count the missing values per column directly. Filtering to rows that contain
# a null and calling .count() tallies the *non-null* entries of those rows, so
# a column that is null in all of them would misleadingly show 0.
bank.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
bank.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
mean,41.231948,1528.538524,15.658036,371.993818,2.508421,51.330407,0.832557
std,11.913369,3225.413326,8.42074,347.128386,2.722077,108.758282,2.292007
min,18.0,-6847.0,1.0,2.0,1.0,-1.0,0.0
25%,32.0,122.0,8.0,138.0,1.0,-1.0,0.0
50%,39.0,550.0,15.0,255.0,2.0,-1.0,0.0
75%,49.0,1708.0,22.0,496.0,3.0,20.75,1.0
max,95.0,81204.0,31.0,3881.0,63.0,854.0,58.0


### Step 2: Transformer
- Create a transformer pipeline for the numeric and the categorical features. Numeric features are imputed and scaled; categorical features are imputed and one-hot encoded
- Create a Column transformer

In [52]:
# Work on a copy so later steps do not modify the raw `bank` frame
b_data = bank.copy()

# Numeric columns: fill missing values with the median, then standardise.
# 'campaign' is listed explicitly because ColumnTransformer drops every column
# that is not named in one of its transformers (remainder='drop' by default).
numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill missing values with the constant 'missing', then
# one-hot encode. handle_unknown='ignore' keeps transform() from failing on a
# category that was not present when the encoder was fitted.
categorical_features = ['job', 'marital', 'education', 'default', 'housing',
                        'loan', 'contact', 'month', 'poutcome']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])



Training score:  0.7285250307985217
Testing score:  0.7268248992386923


### Step 3: Classifier
- Create a pipeline for the decision tree classifier as well as the transformer
- Encode the target variable using `LabelEncoder`

In [None]:
# Full model: preprocessing followed by a shallow (depth-2) decision tree
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier(random_state=1, max_depth=2))])

# Encode the target "deposit" (yes/no) as integers. LabelEncoder numbers the
# sorted class labels, so 'no' -> 0 and 'yes' -> 1.
# b_data is rebuilt from `bank` rather than mutated in place, so this cell can
# be re-run without raising a KeyError on the already-dropped 'deposit' column.
label_encoder = LabelEncoder()
b_data = bank.drop(columns='deposit')
b_data['deposit_cat'] = label_encoder.fit_transform(bank['deposit'])


### Step 4: Model
- Split the data into a training set (80%) and a test set (20%)
- Fit the pipeline on the training set and report its score on both the training and the test set

In [None]:
# Train-test split: hold out 20% of the rows as test data
# `columns=` is used because pandas 2.x no longer accepts `axis` positionally.
X = b_data.drop(columns='deposit_cat')
y = b_data['deposit_cat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# Fit preprocessing + tree on the training rows only, then score both splits
clf.fit(X_train, y_train)
score_train = clf.score(X_train, y_train)
print("Training score: ", score_train)
score_test = clf.score(X_test, y_test)
print("Testing score: ", score_test)

In [3]:
# A slightly different solution: select the numeric / categorical columns by
# dtype instead of listing them by name.
# (SimpleImputer, OneHotEncoder, StandardScaler, LabelEncoder, Pipeline and
# ColumnTransformer are already imported in the first cell.)
b_data = bank.copy()

# Encode the target "deposit" (yes/no) as integers and drop the text column
label_encoder = LabelEncoder()
b_data['deposit_cat'] = label_encoder.fit_transform(b_data['deposit'])
b_data = b_data.drop(columns='deposit')

# Features / target. `columns=` is used because pandas 2.x no longer accepts
# `axis` positionally.
X = b_data.drop(columns='deposit_cat')
y = b_data['deposit_cat']

# Boolean mask over X's columns: True where the column has object dtype
categorical = X.dtypes == object
print(categorical)
print(~categorical)

# Numeric columns: median imputation followed by standardisation
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: constant imputation followed by one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# The masks route every non-object column to 'num' and every object column to 'cat'
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, ~categorical),
        ('cat', categorical_transformer, categorical)])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier(random_state=1, max_depth=2))])

# Train-test split: hold out 20% of the rows as test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
clf.fit(X_train, y_train)
score_train = clf.score(X_train, y_train)
print("Training score: ", score_train)
score_test = clf.score(X_test, y_test)
print("Testing score: ", score_test)

age          False
job           True
marital       True
education     True
default       True
balance      False
housing       True
loan          True
contact       True
day          False
month         True
duration     False
campaign     False
pdays        False
previous     False
poutcome      True
dtype: bool
age           True
job          False
marital      False
education    False
default      False
balance       True
housing      False
loan         False
contact      False
day           True
month        False
duration      True
campaign      True
pdays         True
previous      True
poutcome     False
dtype: bool
Training score:  0.7285250307985217
Testing score:  0.7268248992386923
