In [1]:
import pandas as pd
import numpy as np
import copy

In [2]:
def Purity(probs, type_of_purity = "Entropy"):
    if type_of_purity == "Entropy":
        return(-1 * probs * np.log2(probs)).sum()
    elif type_of_purity == "Gini_Index":
        return(1- (probs**2).sum())
    elif type_of_purity == "Majority_Error": 
        return(1- max(probs))

In [3]:
def gain_info(data, attribute_column_name, label_column_name, type_of_gain = "Entropy"):
    weighted_purity = {} 
    Expected_Purity = 0
    Purity_1 =Purity(data[label_column_name].value_counts()/ len(data[label_column_name]), type_of_gain)
                                                    # calculates Entropy(S) by componentwise division
    group_names = data[attribute_column_name].unique()
    grouped_data = data[[attribute_column_name, label_column_name]].groupby(attribute_column_name)
    for name in group_names:
        #print ('group %s in attribute %s' %(name, attribute_column))
        X = grouped_data.get_group(name)[label_column_name]
        probs = X.value_counts()/ len(X)
        weight = len(X) / len(data[label_column_name])
        a = Purity(probs, type_of_gain)
        weighted_purity[name] = [weight, a]
        Expected_Purity += weight * a 
    return(weighted_purity,  Expected_Purity, Purity_1 - Expected_Purity)

In [4]:
def build_tree(data, label_column_name, attributes = None, 
               values_of_attributes= None, depth= None, type_tree = "Entropy"):    
    
        #check if the list of attributes is given!  
    
    if attributes == None: 
        
            # create a dictionary containing attributes (names of columns in dataframe) 
            # and their corresponding values 
    
        list_of_attributes = list(data.keys())  # keys of dictionary
                # output is: ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'label']
        list_of_attributes.remove(label_column_name) # remove the target column name
                # output is: ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
        values = [list(data[key].unique()) for key in list_of_attributes] # gives values of 
                     # each attribute: ouptput is: 
                        # [['low', 'vhigh', 'high', 'med'],
                        # ['vhigh', 'high', 'med', 'low'],
                        # ['4', '5more', '2', '3'],
                        # ['4', '2', 'more'],
                        # ['big', 'med', 'small'],
                        # ['med', 'high', 'low']]
        values_of_attributes = dict(zip(list_of_attributes, values)) 
        # makes a dictionary with key = attributes and other= values  
        # output is: 
        # {'buying': ['low', 'vhigh', 'high', 'med'],
        #'maint': ['vhigh', 'high', 'med', 'low'],
        #'doors': ['4', '5more', '2', '3'],
        #'persons': ['4', '2', 'more'],
        #'lug_boot': ['big', 'med', 'small'],
        #'safety': ['med', 'high', 'low']}
        
    else:
        list_of_attributes = attributes.copy()
        
     # -----------------------------------------------------
    
    if len(list_of_attributes) == 0 or len(data[label_column_name].unique()) == 1 or depth == 0:  
        
        # return np.argmax(data[label_column_name].value_counts()) idxmax(axis = 0) 
        return data[label_column_name].value_counts().idxmax(axis = 0) 
                    # gives a label with majority (maximum number)
    
    else:
            # computing the Entropy of target column
    
        probs = data[label_column_name].value_counts() / len(data[label_column_name])
                # gives probabilities of labels
                # output for original label column is:
                # unacc    0.698
                # acc      0.222
                # good     0.045
                # vgood    0.035
        
        Target_Purity = Purity(probs, type_tree) # is a number: Entropy or Ginin index or Majority error
    
            # find the maximum gain information
    
        list_of_gains = [] # list of gains correspoinding to each attributes
    
        for attribute in list_of_attributes:
            list_of_gains.append(gain_info(data, attribute, label_column_name, type_tree)[2])
                # computes gain (according to type_tree) of each attribute (heads of columns)
                # so, output is a list of size 6 like:
                    # [0.10152470712485662, 0.07741985577459642, 0.006726514230977809,
                    #     0.22441128678577127, 0.03688725199484155, 0.25822501448993573]
    
        attribute_for_split = list_of_attributes[np.argmax(list_of_gains)]
        #attribute_for_split = list_of_attributes[list_of_gains.argmax()]
                # gives an attribute with maximum gain (e.g. for first time gives "safety")
        
        #print('Best attribute for split is', attribute_for_split)
    
               # constructing the tree for the current branch
        #print("stage 1: attribute_for_split = ", attribute_for_split)
    
        tree = {attribute_for_split:{}} 
                # "attribute_for_split" like "safety" is a node of tree
    
        #for value, group in data.groupby(attribute_for_split):
        grouped_data = data.groupby(attribute_for_split)
        list_of_attributes.remove(attribute_for_split)
        
        
        values_in_attribute_for_split = values_of_attributes[attribute_for_split]
        #values_of_attributes.pop(attribute_for_split, None)
        #print("stage 2: values_in_attribute_for_split =", values_in_attribute_for_split)
        
        
        for value in values_in_attribute_for_split: 
            
            #print(values_of_attributes)
            #print('Current value =', value)
        
            #splited_data.drop(columns = [attributes_for_split], inplace = True)
            #new_data = group.drop(columns = [attribute_for_split])
                 
            #print('this is values_in_attribute_for_split', values_in_attribute_for_split)
            # this line (above) removes attribute_for_split form values_of_attributes and 
            # returns its value. When attribute_for_split is not a key, it returns None. 
            
            #if value in values_in_attribute_for_split:
            #print('values in', attribute_for_split, '=', list(data[attribute_for_split].unique()))
            
            if value in list(data[attribute_for_split].unique()):
                #print("values_of_attributes=", values_in_attribute_for_split)
                new_data = grouped_data.get_group(value).drop(columns = [attribute_for_split])
                
                #print(new_data.head())
                       
            #print('new data is \n', new_data) 
            
                if depth!= None:
                    left_depth = depth -1
                    tt = type_tree 
                    subtree = build_tree(new_data, label_column_name, 
                            list_of_attributes, values_of_attributes, depth = left_depth, type_tree = tt) 
                else:
                    tt = type_tree
                    subtree = build_tree(new_data, label_column_name, 
                                     list_of_attributes, values_of_attributes, type_tree = tt)
        
                tree[attribute_for_split][value] = subtree
            else:
                tree[attribute_for_split][value] = data[label_column_name].value_counts().idxmax(axis = 0)
        
    return (tree)  

In [5]:
def prediction(instance, trained_tree):
    #print(trained_tree.keys())
    #root = list(trained_tree.keys())[0]
    root = next(iter(trained_tree))
    #print('root is', root)
    if isinstance(trained_tree[root], dict):
        branch = instance[root]
        F = trained_tree[root][branch]
        if isinstance(F, dict):
            #return predict(trained_tree[root][branch], instance.drop(columns = [root])
            return prediction(instance, F)
        else:
            return trained_tree[root][branch]

In [6]:
# stay

In [7]:
A = open('data-desc.txt', 'r')
#for line in A.readlines():
    #print(line)
B = A.read()
print(B)

1. Title: Bank Marketing

2. Relevant Information:

   The data is related with direct marketing campaigns of a Portuguese banking institution. 
   The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, 
   in order to access if the product (bank term deposit) would be (or not) subscribed. 

   The classification goal is to predict if the client will subscribe a term deposit (variable y).

3. Number of Attributes: 16 + output attribute.

4. Attribute information:

   Input variables:
   # bank client data:
   1 - age (numeric)
   2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student",
                                       "blue-collar","self-employed","retired","technician","services") 
   3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
   4 - education (categorical: "unknown","secondary","primary

In [8]:
Columns_names = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 
 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
type_of_Attributes = ['numeric', 'categorical', 'categorical', 'categorical', 'binary', 'numeric', 
                      'binary', 'binary', 'categorical', 'numeric', 'categorical', 'numeric', 
                      'numeric', 'numeric', 'numeric', 'categorical', 'binary']
dic= dict(zip(Columns_names, type_of_Attributes))

In [9]:
df_train = pd.read_csv('bank-train.csv', names = Columns_names)
df_test = pd.read_csv('bank-test.csv', names = Columns_names)
#df.head(5)

In [10]:
median_dict = {}
df_train_new =pd.DataFrame()
df_test_new =pd.DataFrame()
for name in Columns_names:
    if dic[name] == 'numeric':
        M = df_train[name].median()
        median_dict[name] = M
        df_train_new[name+ '>' + str(M)] = np.where(df_train[name]  > M, "yes", 'no')
        df_test_new[name+ '>' + str(M)] = np.where(df_test[name]  > M, "yes", 'no')
    else:
        df_train_new[name] = df_train[name]
        df_test_new[name] = df_test[name]

In [11]:
df_train_new.head()

Unnamed: 0,age>38.0,job,marital,education,default,balance>452.5,housing,loan,contact,day>16.0,month,duration>180.0,campaign>2.0,pdays>-1.0,previous>0.0,poutcome,y
0,yes,services,married,secondary,no,no,yes,no,unknown,no,may,no,no,no,no,unknown,no
1,yes,blue-collar,single,secondary,no,no,yes,yes,cellular,no,feb,yes,no,no,no,unknown,no
2,yes,technician,married,secondary,no,yes,no,yes,cellular,yes,aug,yes,no,yes,yes,success,yes
3,yes,admin.,married,tertiary,no,no,yes,no,cellular,no,jul,yes,no,no,no,unknown,no
4,no,management,single,tertiary,no,yes,no,no,cellular,no,apr,no,no,no,no,unknown,yes


In [12]:
df_test_new.head()

Unnamed: 0,age>38.0,job,marital,education,default,balance>452.5,housing,loan,contact,day>16.0,month,duration>180.0,campaign>2.0,pdays>-1.0,previous>0.0,poutcome,y
0,yes,management,single,secondary,no,yes,no,no,cellular,no,jun,yes,no,no,no,unknown,no
1,yes,blue-collar,married,secondary,no,no,yes,no,cellular,no,may,yes,no,yes,yes,failure,no
2,yes,retired,married,primary,no,no,no,no,telephone,yes,jul,no,yes,no,no,unknown,no
3,no,entrepreneur,single,tertiary,no,no,yes,yes,unknown,no,jun,yes,no,no,no,unknown,no
4,no,student,single,unknown,no,yes,no,no,telephone,yes,jan,no,yes,no,no,unknown,no


In [13]:
trained_tree = build_tree(df_train_new, 'y', depth= None, type_tree = "Entropy")

In [14]:
from pprint import pprint
pprint(trained_tree) 

{'duration>180.0': {'no': {'month': {'apr': {'job': {'admin.': {'education': {'primary': 'no',
                                                                              'secondary': 'no',
                                                                              'tertiary': {'housing': {'no': 'yes',
                                                                                                       'yes': 'no'}},
                                                                              'unknown': 'no'}},
                                                     'blue-collar': {'poutcome': {'failure': 'no',
                                                                                  'other': 'no',
                                                                                  'success': 'yes',
                                                                                  'unknown': 'no'}},
                                                     'entrepreneur': 'no',
     

                                                                                                                                                                                                                                                                                                                                                            'success': 'yes',
                                                                                                                                                                                                                                                                                                                                                            'unknown': 'yes'}},
                                                                                                                                                                                                                                                                          

                                                                                                       'yes': 'yes'}},
                                                                               'blue-collar': {'marital': {'divorced': 'no',
                                                                                                           'married': 'no',
                                                                                                           'single': {'loan': {'no': {'age>38.0': {'no': {'education': {'primary': 'yes',
                                                                                                                                                                        'secondary': {'default': {'no': {'balance>452.5': {'no': 'yes',
                                                                                                                                                                                                                      

                                                                                                                                                     'management': {'age>38.0': {'no': {'default': {'no': {'balance>452.5': {'no': 'yes',
                                                                                                                                                                                                                             'yes': {'loan': {'no': {'contact': {'cellular': {'day>16.0': {'no': 'yes',
                                                                                                                                                                                                                                                                                           'yes': {'campaign>2.0': {'no': {'pdays>-1.0': {'no': 'yes',
                                                                                                                               

                                                                                                                                                                                                  'yes': {'marital': {'divorced': 'yes',
                                                                                                                                                                                                                      'married': 'no',
                                                                                                                                                                                                                      'single': 'yes'}}}},
                                                                                                                                                                          'yes': {'balance>452.5': {'no': 'yes',
                                                                                            

                                                                                                                                                          'technician': {'age>38.0': {'no': {'balance>452.5': {'no': 'no',
                                                                                                                                                                                                               'yes': {'day>16.0': {'no': 'no',
                                                                                                                                                                                                                                    'yes': {'campaign>2.0': {'no': {'default': {'no': {'housing': {'no': {'loan': {'no': {'pdays>-1.0': {'no': {'previous>0.0': {'no': 'yes',
                                                                                                                                                                               

                                                                               'blue-collar': {'contact': {'cellular': {'education': {'primary': {'loan': {'no': {'age>38.0': {'no': 'no',
                                                                                                                                                                               'yes': {'marital': {'divorced': {'housing': {'no': 'no',
                                                                                                                                                                                                                            'yes': 'yes'}},
                                                                                                                                                                                                   'married': {'balance>452.5': {'no': 'no',
                                                                                                            

                                                               'jun': {'contact': {'cellular': {'job': {'admin.': 'yes',
                                                                                                        'blue-collar': 'yes',
                                                                                                        'entrepreneur': 'yes',
                                                                                                        'housemaid': 'yes',
                                                                                                        'management': {'day>16.0': {'no': {'marital': {'divorced': 'yes',
                                                                                                                                                       'married': {'age>38.0': {'no': 'no',
                                                                                                                                                

                                                                                                                                                         'yes': 'no'}}}},
                                                                                                        'student': {'education': {'primary': 'no',
                                                                                                                                  'secondary': 'no',
                                                                                                                                  'tertiary': 'yes',
                                                                                                                                  'unknown': 'no'}},
                                                                                                        'technician': {'marital': {'divorced': {'education': {'primary': 'yes',
                                                            

                                                                                                                                                                                                                         'secondary': 'yes',
                                                                                                                                                                                                                         'tertiary': {'campaign>2.0': {'no': {'day>16.0': {'no': 'yes',
                                                                                                                                                                                                                                                                           'yes': {'default': {'no': {'housing': {'no': 'no',
                                                                                                                                                                     

In [15]:
Our_prediction = df_train_new.apply(prediction, axis=1, args = [trained_tree])
np.where(df_train_new['y'] == Our_prediction,1,0).mean()

0.9868

In [16]:
Our_prediction = df_test_new.apply(prediction, axis=1, args = [trained_tree])
np.where(df_test_new['y'] == Our_prediction,1,0).mean()

0.8394

In [33]:
# dealing with missing values

df_train_correction = pd.DataFrame()
df_test_correction = pd.DataFrame()
for name in list(df_train_new.keys()):
    m = df_train_new[name].value_counts().idxmax(axis = 0)
    print('Majority in column %s at trauning is ' %name, m)
    df_train_correction[name] = np.where(df_train_new[name] == 'unknown', m, df_train_new[name])
    n = df_test_new[name].value_counts().idxmax(axis = 0)
    print('Majority in column %s at test data is ' %name, n)
    df_test_correction[name] = np.where(df_test_new[name] == 'unknown', n, df_test_new[name])

Majority in column age>38.0 at trauning is  no
Majority in column age>38.0 at test data is  yes
Majority in column job at trauning is  blue-collar
Majority in column job at test data is  blue-collar
Majority in column marital at trauning is  married
Majority in column marital at test data is  married
Majority in column education at trauning is  secondary
Majority in column education at test data is  secondary
Majority in column default at trauning is  no
Majority in column default at test data is  no
Majority in column balance>452.5 at trauning is  yes
Majority in column balance>452.5 at test data is  yes
Majority in column housing at trauning is  yes
Majority in column housing at test data is  yes
Majority in column loan at trauning is  no
Majority in column loan at test data is  no
Majority in column contact at trauning is  cellular
Majority in column contact at test data is  cellular
Majority in column day>16.0 at trauning is  no
Majority in column day>16.0 at test data is  no
Major

In [34]:
df_train_correction.loc[[1806]]

Unnamed: 0,age>38.0,job,marital,education,default,balance>452.5,housing,loan,contact,day>16.0,month,duration>180.0,campaign>2.0,pdays>-1.0,previous>0.0,poutcome,y
1806,no,blue-collar,single,secondary,no,yes,no,no,cellular,yes,jan,no,no,no,no,unknown,no


In [35]:
df_test_correction.loc[[1806]]

Unnamed: 0,age>38.0,job,marital,education,default,balance>452.5,housing,loan,contact,day>16.0,month,duration>180.0,campaign>2.0,pdays>-1.0,previous>0.0,poutcome,y
1806,no,blue-collar,married,secondary,no,yes,yes,no,cellular,yes,may,yes,no,no,no,unknown,no


In [36]:
df_train_correction.head()

Unnamed: 0,age>38.0,job,marital,education,default,balance>452.5,housing,loan,contact,day>16.0,month,duration>180.0,campaign>2.0,pdays>-1.0,previous>0.0,poutcome,y
0,yes,services,married,secondary,no,no,yes,no,cellular,no,may,no,no,no,no,unknown,no
1,yes,blue-collar,single,secondary,no,no,yes,yes,cellular,no,feb,yes,no,no,no,unknown,no
2,yes,technician,married,secondary,no,yes,no,yes,cellular,yes,aug,yes,no,yes,yes,success,yes
3,yes,admin.,married,tertiary,no,no,yes,no,cellular,no,jul,yes,no,no,no,unknown,no
4,no,management,single,tertiary,no,yes,no,no,cellular,no,apr,no,no,no,no,unknown,yes


In [48]:
tabel_train_data = {}
tabel_test_data = {}
for ptype in ["Entropy","Gini_Index","Majority_Error"]:
    tabel_train_data[ptype]= np.zeros(16)
    tabel_test_data[ptype]= np.zeros(16)
    for i in range(16):
        trained_tree = build_tree(df_train_new, 'y', depth = i+1, type_tree = ptype)

        Our_prediction_train = df_train_new.apply(prediction, axis=1, args = [trained_tree])
        tabel_train_data[ptype][i]= 1- np.where(df_train_new['y'] == Our_prediction_train,1,0).mean()
        
        Our_prediction_test = df_test_new.apply(prediction, axis=1, args = [trained_tree])
        tabel_test_data[ptype][i]= 1- np.where(df_test_new['y'] == Our_prediction_test,1,0).mean()
        print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


In [49]:
tabel_df = pd.DataFrame.from_dict(tabel_train_data)
tabel_df

Unnamed: 0,Entropy,Gini_Index,Majority_Error
0,0.1192,0.1088,0.1088
1,0.106,0.1042,0.1042
2,0.1006,0.0936,0.0966
3,0.08,0.0754,0.084
4,0.0624,0.0604,0.0686
5,0.048,0.0484,0.0626
6,0.0372,0.0364,0.0584
7,0.0292,0.0268,0.0528
8,0.0222,0.0216,0.0488
9,0.0182,0.0174,0.0438


In [50]:
tabel_df_test = pd.DataFrame.from_dict(tabel_test_data)
tabel_df_test

Unnamed: 0,Entropy,Gini_Index,Majority_Error
0,0.1248,0.1166,0.1166
1,0.1114,0.1088,0.1088
2,0.1074,0.1154,0.1146
3,0.1198,0.1222,0.1176
4,0.1282,0.1338,0.121
5,0.1346,0.1462,0.1268
6,0.1412,0.152,0.1282
7,0.1476,0.158,0.134
8,0.153,0.1626,0.1344
9,0.157,0.165,0.1392


In [45]:
tabel_train_data_correction = {}
tabel_test_data_correction = {}
for ptype in ["Entropy","Gini_Index","Majority_Error"]:
    tabel_train_data_correction[ptype]= np.zeros(16)
    tabel_test_data_correction[ptype]= np.zeros(16)
    for i in range(16):
        trained_tree = build_tree(df_train_correction, 'y', depth = i+1, type_tree = ptype)

        Our_prediction_train_correction = df_train_correction.apply(prediction, axis=1, args = [trained_tree])
        tabel_train_data_correction[ptype][i]= 1- np.where(df_train_correction['y'] == Our_prediction_train_correction,1,0).mean()
        
        Our_prediction_test_correction = df_test_correction.apply(prediction, axis=1, args = [trained_tree])
        tabel_test_data_correction[ptype][i]= 1- np.where(df_train_correction['y'] == Our_prediction_test_correction,1,0).mean()
        print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


In [46]:
tabel_df_train_correction = pd.DataFrame.from_dict(tabel_train_data_correction)
tabel_df_train_correction

Unnamed: 0,Entropy,Gini_Index,Majority_Error
0,0.1192,0.1088,0.1088
1,0.106,0.1042,0.1042
2,0.1008,0.0936,0.0966
3,0.0814,0.077,0.0838
4,0.0646,0.0632,0.0686
5,0.051,0.0522,0.0618
6,0.0406,0.0404,0.0564
7,0.033,0.0318,0.0542
8,0.0274,0.0268,0.0506
9,0.0218,0.0214,0.048


In [47]:
tabel_df_test_correction = pd.DataFrame.from_dict(tabel_test_data_correction)
tabel_df_test_correction

Unnamed: 0,Entropy,Gini_Index,Majority_Error
0,0.1192,0.1442,0.1442
1,0.139,0.1472,0.1472
2,0.1582,0.1622,0.157
3,0.168,0.1676,0.1618
4,0.1796,0.1788,0.1702
5,0.1912,0.1876,0.1722
6,0.1962,0.1972,0.176
7,0.2086,0.2062,0.1778
8,0.2126,0.2126,0.1814
9,0.2152,0.2146,0.1856
