 # Input variables:
   ## Bank client data:
   1 - age (numeric)  
   2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student",
                                       "blue-collar","self-employed","retired","technician","services") 
   3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)  
   4 - education (categorical: "unknown","secondary","primary","tertiary")  
   5 - default: has credit in default? (binary: "yes","no")  
   6 - balance: average yearly balance, in euros (numeric)   
   7 - housing: has housing loan? (binary: "yes","no")  
   8 - loan: has personal loan? (binary: "yes","no")  
   ## Related with the last contact of the current campaign:
   9 - contact: contact communication type (categorical: "unknown","telephone","cellular")   
  10 - day: last contact day of the month (numeric)  
  11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")  
  12 - duration: last contact duration, in seconds (numeric)  
   ## Other attributes:
  13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)  
  14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)  
  15 - previous: number of contacts performed before this campaign and for this client (numeric)  
  16 - poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")  

  ## Output variable:
  17 - y - has the client subscribed a term deposit? (binary: "yes","no")  
  
  ### Data Set:
  https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
import pydotplus
import graphviz
from IPython.display import Image
import time

In [2]:
file = 'C:\\Users\\Abe\\Data Science Bootcamp\\Unit 3\\If A Tree Falls\\Data\\bank-full.csv'
df = pd.read_csv(file,delimiter=';')

In [18]:
# Decision tree needs balance
df_yes = df[df.ix[:,16] == 'yes']
df_no = df[~(df.ix[:,16] == 'yes')]

# Pick a random sample of no's from df equal to the amount of yes's in the data
yes_count = df.ix[:,16].value_counts()[1]
df_ss = df_no.sample(n=yes_count,random_state=1337).append(df_yes)

(10578, 17)

In [4]:
# Check how many of the columns are categorical and how many unique attributes each have

categorical = df_ss.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    print(column.nunique())

job
12
marital
3
education
4
default
2
housing
2
loan
2
contact
3
month
12
poutcome
4
y
2


In [19]:
# Check out how much of the data is made up of unkowns

for col in categorical:
    print(col)
    print('{:.2%}'.format(((df_ss[col] == 'unknown').sum()/df_ss.shape[0])))

job
0.64%
marital
0.00%
education
4.26%
default
0.00%
housing
0.00%
loan
0.00%
contact
20.71%
month
0.00%
poutcome
73.96%
y
0.00%


In [10]:
Y = df_ss.y
X = df_ss.ix[:,df_ss.columns != 'y']
X = pd.get_dummies(X)

In [13]:
%%timeit

dtree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=50,
    max_depth=5,
    random_state = 1337
)

scre = cross_val_score(dtree, X, Y, cv=10)
print(scre)
print(scre.mean())

[ 0.48109641  0.88468809  0.6805293   0.66257089  0.83175803  0.75519849
  0.74385633  0.77410208  0.81568998  0.81155303]
0.744104263333
[ 0.48109641  0.88468809  0.6805293   0.66257089  0.83175803  0.75519849
  0.74385633  0.77410208  0.81568998  0.81155303]
0.744104263333
[ 0.48109641  0.88468809  0.6805293   0.66257089  0.83175803  0.75519849
  0.74385633  0.77410208  0.81568998  0.81155303]
0.744104263333
[ 0.48109641  0.88468809  0.6805293   0.66257089  0.83175803  0.75519849
  0.74385633  0.77410208  0.81568998  0.81155303]
0.744104263333
1 loop, best of 3: 811 ms per loop


In [15]:
%%timeit

rfc = ensemble.RandomForestClassifier()
scre = cross_val_score(rfc, X, Y, cv=10)
print(scre)
print(scre.mean())

[ 0.47258979  0.56427221  0.55671078  0.67296786  0.67485822  0.73913043
  0.76181474  0.74102079  0.8563327   0.85700758]
0.689670511829
[ 0.46975425  0.59451796  0.5463138   0.66918715  0.61909263  0.70793951
  0.76275992  0.75708885  0.84877127  0.86174242]
0.683716775506
[ 0.47731569  0.63988658  0.5557656   0.68336484  0.6389414   0.72589792
  0.77882798  0.70699433  0.84782609  0.86268939]
0.691750980982
[ 0.46597353  0.63232514  0.56427221  0.65500945  0.64744802  0.72589792
  0.77977316  0.7173913   0.86483932  0.86742424]
0.692035429914
1 loop, best of 3: 2.05 s per loop


In [16]:
tree = dtree.fit(X,Y)

tree_imp = pd.DataFrame()
tree_imp['features'] = X.columns
tree_imp['importance'] = tree.feature_importances_
tree_imp = tree_imp.sort_values('importance',axis=0,ascending=False).reset_index()
tree_imp

Unnamed: 0,index,features,importance
0,3,duration,0.559953
1,49,poutcome_success,0.171841
2,34,contact_unknown,0.135816
3,28,housing_no,0.042563
4,29,housing_yes,0.034835
5,43,month_may,0.029515
6,41,month_jun,0.008371
7,2,day,0.004669
8,1,balance,0.003816
9,38,month_feb,0.002982


In [17]:
forest = rfc.fit(X,Y)

for_imp = pd.DataFrame()
for_imp['features'] = X.columns
for_imp['importance'] = forest.feature_importances_
for_imp = tree_imp.sort_values('importance',axis=0,ascending=False).reset_index(drop=True)

# Check if features and their importance are the same for the tree and the forest
for_imp['shared'] = (for_imp['importance'] == tree_imp['importance']) & (for_imp['features'] == tree_imp['features'])
for_imp

Unnamed: 0,index,features,importance,shared
0,3,duration,0.559953,True
1,49,poutcome_success,0.171841,True
2,34,contact_unknown,0.135816,True
3,28,housing_no,0.042563,True
4,29,housing_yes,0.034835,True
5,43,month_may,0.029515,True
6,41,month_jun,0.008371,True
7,2,day,0.004669,True
8,1,balance,0.003816,True
9,38,month_feb,0.002982,True
