In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import datasets
from io import StringIO
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
%matplotlib inline

In [2]:
# Load data file
bank=pd.read_csv('./bank.csv')
bank.sample(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
8679,28,services,single,secondary,no,1636,yes,no,cellular,18,may,97,1,355,2,failure,no
2770,73,retired,married,unknown,no,519,no,no,telephone,16,oct,434,1,57,1,failure,yes
4470,46,admin.,married,secondary,no,659,yes,no,telephone,29,jul,1127,11,-1,0,unknown,yes
4613,35,management,divorced,tertiary,no,1723,yes,no,cellular,25,jul,862,1,-1,0,unknown,yes
2767,22,services,single,secondary,no,129,no,no,cellular,16,oct,258,1,-1,0,unknown,yes


In [4]:
bank_data = bank.copy()
bank_data.job.value_counts()

management       2566
blue-collar      1944
technician       1823
admin.           1334
services          923
retired           778
self-employed     405
student           360
unemployed        357
entrepreneur      328
housemaid         274
unknown            70
Name: job, dtype: int64

In [5]:
bank_data['job'] = bank_data['job'].replace(['management', 'admin.'], 'white-collar')
bank_data['job'] = bank_data['job'].replace(['services','housemaid'], 'pink-collar')
bank_data['job'] = bank_data['job'].replace(['retired', 'student', 'unemployed', 'unknown'], 'other')

In [6]:
bank_data.job.value_counts()

white-collar     3900
blue-collar      1944
technician       1823
other            1565
pink-collar      1197
self-employed     405
entrepreneur      328
Name: job, dtype: int64

In [7]:
bank_data.poutcome.value_counts()

unknown    8326
failure    1228
success    1071
other       537
Name: poutcome, dtype: int64

In [8]:
bank_data['poutcome'] = bank_data['poutcome'].replace(['other'] , 'unknown')
bank_data.poutcome.value_counts()

unknown    8863
failure    1228
success    1071
Name: poutcome, dtype: int64

In [9]:
# Drop 'contact', as every participant has been contacted. 
bank_data.drop('contact', axis=1, inplace=True)

In [10]:
# values for "default" : yes/no
bank_data['default_cat'] = bank_data['default'].map( {'yes':1, 'no':0} )
bank_data.drop('default', axis=1,inplace = True)

In [11]:
# values for "housing" : yes/no
bank_data["housing_cat"]=bank_data['housing'].map({'yes':1, 'no':0})
bank_data.drop('housing', axis=1,inplace = True)

In [12]:
# values for "loan" : yes/no
bank_data["loan_cat"] = bank_data['loan'].map({'yes':1, 'no':0})
bank_data.drop('loan', axis=1, inplace=True)

In [14]:
# day  : last contact day of the month
# month: last contact month of year
# Drop 'month' and 'day' as they don't have any intrinsic meaning
bank_data.drop('month', axis=1, inplace=True)
bank_data.drop('day', axis=1, inplace=True)

ValueError: labels ['month'] not contained in axis

In [15]:
# values for "deposit" : yes/no
bank_data["deposit_cat"] = bank_data['deposit'].map({'yes':1, 'no':0})
bank_data.drop('deposit', axis=1, inplace=True)

In [16]:
print("Customers that have not been contacted before:", len(bank_data[bank_data.pdays==-1]))
print("Maximum values on padys    :", bank_data['pdays'].max())

Customers that have not been contacted before: 8324
Maximum values on padys    : 854


In [None]:
bank_data.loc[bank_data['pdays'] == -1, 'pdays'] = 10000

In [17]:
# Create a new column: recent_pdays 
#numpy.where(condition[, x, y]) Where True, yield x, otherwise yield y.
bank_data['recent_pdays'] = np.where(bank_data['pdays'], 1/bank_data.pdays, 1/bank_data.pdays)

# Drop 'pdays'
bank_data.drop('pdays', axis=1, inplace = True)

In [18]:
bank_data.tail()

Unnamed: 0,age,job,marital,education,balance,duration,campaign,previous,poutcome,default_cat,housing_cat,loan_cat,deposit_cat,recent_pdays
11157,33,blue-collar,single,primary,1,257,1,0,unknown,0,1,0,0,-1.0
11158,39,pink-collar,married,secondary,733,83,4,0,unknown,0,0,0,0,-1.0
11159,32,technician,single,secondary,29,156,2,0,unknown,0,0,0,0,-1.0
11160,43,technician,married,secondary,0,9,2,5,failure,0,0,1,0,0.005814
11161,34,technician,married,secondary,0,628,1,0,unknown,0,0,0,0,-1.0


In [19]:
# Convert categorical variables to dummies
bank_with_dummies = pd.get_dummies(data=bank_data, columns = ['job', 'marital', 'education', 'poutcome'], \
                                   prefix = ['job', 'marital', 'education', 'poutcome'])
bank_with_dummies.head()

Unnamed: 0,age,balance,duration,campaign,previous,default_cat,housing_cat,loan_cat,deposit_cat,recent_pdays,...,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,poutcome_failure,poutcome_success,poutcome_unknown
0,59,2343,1042,1,0,0,1,0,1,-1.0,...,0,1,0,0,1,0,0,0,0,1
1,56,45,1467,1,0,0,0,0,1,-1.0,...,0,1,0,0,1,0,0,0,0,1
2,41,1270,1389,1,0,0,1,0,1,-1.0,...,0,1,0,0,1,0,0,0,0,1
3,55,2476,579,1,0,0,1,0,1,-1.0,...,0,1,0,0,1,0,0,0,0,1
4,54,184,673,2,0,0,0,0,1,-1.0,...,0,1,0,0,0,1,0,0,0,1


In [20]:
bank_with_dummies.shape

(11162, 27)

In [22]:
bankcl = bank_with_dummies

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [24]:
data_drop_deposite = bankcl.drop('deposit_cat', 1)
label = bankcl.deposit_cat
data_train, data_test, label_train, label_test = train_test_split(data_drop_deposite, label, test_size = 0.2, random_state = 50)

In [42]:
clf=RandomForestClassifier(random_state=0)
clf.fit(data_train,label_train)
dt1_score_train = clf.score(data_train, label_train)
print('training score:',dt1_score_train)
dt6_score_test = clf.score(data_test, label_test)
print('test score:',dt6_score_test)

training score: 0.9875685967073581
test score: 0.793999104343932


In [43]:
print(clf.predict(data_test))

[1 0 0 ... 0 0 0]
