In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import datasets
from io import StringIO
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
%matplotlib inline

In [24]:
# Read bank.csv from the working directory into a DataFrame,
# then preview five randomly chosen rows
bank = pd.read_csv(filepath_or_buffer='./bank.csv')
bank.sample(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
4315,62,retired,married,secondary,no,569,no,no,cellular,4,feb,94,5,184,1,success,yes
4159,57,management,married,secondary,no,2155,no,yes,cellular,10,nov,144,3,91,2,failure,yes
10368,58,blue-collar,married,primary,no,96,no,no,unknown,16,jun,76,3,-1,0,unknown,no
489,33,self-employed,divorced,secondary,no,1904,yes,no,cellular,17,jul,1584,2,-1,0,unknown,yes
6748,40,management,married,tertiary,no,964,yes,no,telephone,13,may,95,3,-1,0,unknown,no


In [25]:
# Number of missing (null) values in each column.
# isnull() marks every missing cell as True; summing per column counts them.
bank.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [26]:
# Work on a deep copy so the frame loaded from disk is left untouched,
# then tally how many rows fall under each value of the 'job' column
bank_data = bank.copy(deep=True)
bank_data['job'].value_counts()

management       2566
blue-collar      1944
technician       1823
admin.           1334
services          923
retired           778
self-employed     405
student           360
unemployed        357
entrepreneur      328
housemaid         274
unknown            70
Name: job, dtype: int64

In [27]:
# Collapse related job values into broader groups so that one-hot encoding
# the 'job' column later yields fewer indicator columns.
# Values not listed here (e.g. 'blue-collar', 'technician') are kept as they are.
job_groups = {
    'management': 'white-collar',
    'admin.': 'white-collar',
    'services': 'pink-collar',
    'housemaid': 'pink-collar',
    'retired': 'other',
    'student': 'other',
    'unemployed': 'other',
    'unknown': 'other',
}
bank_data['job'] = bank_data['job'].replace(job_groups)


In [28]:
# Row count per job group after the grouping step
bank_data['job'].value_counts()

white-collar     3900
blue-collar      1944
technician       1823
other            1565
pink-collar      1197
self-employed     405
entrepreneur      328
Name: job, dtype: int64

In [29]:
# Row count per value of the 'poutcome' column
bank_data['poutcome'].value_counts()

unknown    8326
failure    1228
success    1071
other       537
Name: poutcome, dtype: int64

In [30]:
# 'other' is treated the same as 'unknown' here, so fold it into 'unknown'
# and re-check the tallies
bank_data['poutcome'] = bank_data['poutcome'].replace('other', 'unknown')
bank_data['poutcome'].value_counts()


unknown    8863
failure    1228
success    1071
Name: poutcome, dtype: int64

In [31]:
# Remove the 'contact' column; it is not used in the rest of the analysis
bank_data = bank_data.drop(columns='contact')

In [32]:
# Encode "default" (yes/no) as 1/0 in a new 'default_cat' column.
# pop() removes the original text column and hands it to map() in one step.
bank_data['default_cat'] = bank_data.pop('default').map({'yes': 1, 'no': 0})

In [33]:
# Encode "housing" (yes/no) as 1/0 in a new 'housing_cat' column.
# pop() removes the original text column and hands it to map() in one step.
bank_data['housing_cat'] = bank_data.pop('housing').map({'yes': 1, 'no': 0})

In [34]:
# Encode "loan" (yes/no) as 1/0 in a new 'loan_cat' column.
# pop() removes the original text column and hands it to map() in one step.
bank_data['loan_cat'] = bank_data.pop('loan').map({'yes': 1, 'no': 0})

In [35]:
# 'day' is the last contact day of the month and 'month' the last contact
# month of the year. Both are treated as uninformative for this analysis and
# are removed together in a single call.
bank_data = bank_data.drop(columns=['month', 'day'])

In [36]:
# Encode "deposit" (yes/no) as 1/0 in a new 'deposit_cat' column.
# pop() removes the original text column and hands it to map() in one step.
bank_data['deposit_cat'] = bank_data.pop('deposit').map({'yes': 1, 'no': 0})

In [37]:
# Report how many rows have pdays equal to -1 (reported below as customers
# not contacted before) and the largest pdays value in the data
never_contacted_count = int((bank_data['pdays'] == -1).sum())
largest_pdays = bank_data['pdays'].max()
print("Customers that have not been contacted before:", never_contacted_count)
print("Maximum values on padys    :", largest_pdays)

Customers that have not been contacted before: 8324
Maximum values on padys    : 854


In [38]:
# Swap the pdays value -1 for a large number (10000) so those rows read as
# "so far in the past that it has no effect"
bank_data['pdays'] = bank_data['pdays'].replace(-1, 10000)

In [39]:
# Create a new column: recent_pdays, the reciprocal of pdays, so that a larger
# pdays value gives a smaller recent_pdays value.
# NOTE: a pdays value of 0 would produce inf here.
bank_data['recent_pdays'] = 1 / bank_data['pdays']

# Drop 'pdays' now that its information is carried by 'recent_pdays'
bank_data.drop('pdays', axis=1, inplace=True)

In [40]:
# Show the last five rows to check the transformed columns
bank_data.tail(n=5)

Unnamed: 0,age,job,marital,education,balance,duration,campaign,previous,poutcome,default_cat,housing_cat,loan_cat,deposit_cat,recent_pdays
11157,33,blue-collar,single,primary,1,257,1,0,unknown,0,1,0,0,0.0001
11158,39,pink-collar,married,secondary,733,83,4,0,unknown,0,0,0,0,0.0001
11159,32,technician,single,secondary,29,156,2,0,unknown,0,0,0,0,0.0001
11160,43,technician,married,secondary,0,9,2,5,failure,0,0,1,0,0.005814
11161,34,technician,married,secondary,0,628,1,0,unknown,0,0,0,0,0.0001


In [41]:
# Convert the remaining categorical columns to one-hot indicator columns.
# Each indicator is prefixed with its source column name so that equal values
# from different columns (e.g. 'unknown') do not collide.
# The dtype is pinned because the default indicator dtype differs between
# pandas versions (uint8 before 2.0, bool from 2.0 on); keeping integers means
# the indicators stay numeric 0/1 for describe() and the model.
categorical_cols = ['job', 'marital', 'education', 'poutcome']
bank_with_dummies = pd.get_dummies(
    data=bank_data,
    columns=categorical_cols,
    prefix=categorical_cols,
    dtype=np.uint8,
)
bank_with_dummies.head()

Unnamed: 0,age,balance,duration,campaign,previous,default_cat,housing_cat,loan_cat,deposit_cat,recent_pdays,...,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,poutcome_failure,poutcome_success,poutcome_unknown
0,59,2343,1042,1,0,0,1,0,1,0.0001,...,0,1,0,0,1,0,0,0,0,1
1,56,45,1467,1,0,0,0,0,1,0.0001,...,0,1,0,0,1,0,0,0,0,1
2,41,1270,1389,1,0,0,1,0,1,0.0001,...,0,1,0,0,1,0,0,0,0,1
3,55,2476,579,1,0,0,1,0,1,0.0001,...,0,1,0,0,1,0,0,0,0,1
4,54,184,673,2,0,0,0,0,1,0.0001,...,0,1,0,0,0,1,0,0,0,1


In [42]:
# (rows, columns) of the one-hot encoded frame
bank_with_dummies.shape

(11162, 27)

In [43]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
bank_with_dummies.describe()

Unnamed: 0,age,balance,duration,campaign,previous,default_cat,housing_cat,loan_cat,deposit_cat,recent_pdays,...,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,poutcome_failure,poutcome_success,poutcome_unknown
count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,...,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
mean,41.231948,1528.538524,371.993818,2.508421,0.832557,0.015051,0.473123,0.130801,0.47384,0.003124,...,0.115839,0.568984,0.315176,0.134385,0.490593,0.330496,0.044526,0.110016,0.095951,0.794033
std,11.913369,3225.413326,347.128386,2.722077,2.292007,0.121761,0.499299,0.337198,0.499338,0.030686,...,0.320047,0.495241,0.464607,0.34108,0.499934,0.470413,0.20627,0.312924,0.294537,0.404424
min,18.0,-6847.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,122.0,138.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,39.0,550.0,255.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0001,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,49.0,1708.0,496.0,3.0,1.0,0.0,1.0,0.0,1.0,0.001919,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
max,95.0,81204.0,3881.0,63.0,58.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
# Alias used by the modelling cells below. This binds a second name to the
# same DataFrame object (no copy is made).
bankcl = bank_with_dummies

In [47]:
# Train-Test split: 20% test data
# Features are every column except the target; the target is 'deposit_cat'.
# The column is dropped by keyword because pandas 2.0 no longer accepts the
# axis as a second positional argument to drop().
data_drop_deposite = bankcl.drop(columns='deposit_cat')
label = bankcl['deposit_cat']
# random_state fixes the shuffle so the same rows land in train/test on every run
data_train, data_test, label_train, label_test = train_test_split(
    data_drop_deposite, label, test_size=0.2, random_state=50
)

In [48]:
# Decision tree with depth = 2
# random_state is fixed (same seed as the train/test split) so that repeated
# runs build the same tree; the estimator permutes features at each split, so
# ties between equally good splits could otherwise be resolved differently.
dt2 = tree.DecisionTreeClassifier(max_depth=2, random_state=50)
dt2.fit(data_train, label_train)

# Mean accuracy on the data the tree was fitted on, and on the held-out data
dt2_score_train = dt2.score(data_train, label_train)
print("Training score: ",dt2_score_train)
dt2_score_test = dt2.score(data_test, label_test)
print("Testing score: ",dt2_score_test)

Training score:  0.7285250307985217
Testing score:  0.7268248992386923


In [49]:
# Decision tree: To the full depth (no max_depth limit)
# random_state is fixed (same seed as the train/test split) so that repeated
# runs build the same tree; the estimator permutes features at each split, so
# ties between equally good splits could otherwise be resolved differently.
dt1 = tree.DecisionTreeClassifier(random_state=50)
dt1.fit(data_train, label_train)

# Mean accuracy on the data the tree was fitted on, and on the held-out data.
# Compare the two: a large gap means the unrestricted tree is overfitting.
dt1_score_train = dt1.score(data_train, label_train)
print("Training score: ", dt1_score_train)
dt1_score_test = dt1.score(data_test, label_test)
print("Testing score: ", dt1_score_test)

Training score:  1.0
Testing score:  0.7384684281236006
