# Machine Learning 

### Homework 1

In [3]:
import numpy as np
import pandas as pd
import matplotlib as mpl

In [4]:
import pprint

**Decision Tree:**

Input data:

In [4]:
test_data = pd.read_csv('car/test.csv', header=None)
train_data = pd.read_csv('car/train.csv', header=None)

In [5]:
train_data.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'label']
test_data.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'label']

In [6]:
attrs = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
label = 'label'

In [None]:
# Possible values of every attribute in the car data.
# NOTE(review): the lists follow the UCI Car Evaluation attribute description;
# 'more' for persons does not appear in the rows shown in this notebook, so
# confirm it against train.csv.
attr_values = {'buying': ['vhigh', 'high', 'med', 'low'],
               'maint': ['vhigh', 'high', 'med', 'low'],
               'doors': ['2', '3', '4', '5more'],
               'persons': ['2', '4', 'more'],
               'lug_boot': ['small', 'med', 'big'],
               'safety': ['low', 'med', 'high']}

In [7]:
test_data.shape, train_data.shape

((728, 7), (1000, 7))

In [8]:
train_data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,low,vhigh,4,4,big,med,acc
1,low,high,5more,4,med,high,vgood
2,vhigh,med,2,2,big,high,unacc
3,high,high,2,2,small,high,unacc
4,vhigh,low,3,2,big,low,unacc


**Entropy:**

In [47]:
def entropy(labels):
    """Shannon entropy, in bits, of the label frequencies found in `labels`.

    Args:
        labels: sequence of class labels (not probabilities); each distinct
            value is counted with np.unique.

    Returns:
        Negative sum of p * log2(p) over the relative frequency p of every
        distinct label.
    """
    _, counts = np.unique(labels, return_counts=True)
    p = counts / len(labels)
    return -np.dot(p, np.log2(p))

**Majority Error:**

In [48]:
def maj_error(labels):
    """Majority error of `labels`.

    Args:
        labels: sequence of class labels.

    Returns:
        One minus the relative frequency of the most common label, i.e. the
        share of examples a majority vote would get wrong.
    """
    _, counts = np.unique(labels, return_counts=True)
    shares = counts / len(labels)
    return 1 - np.max(shares)

**Gini Index:**

In [49]:
def gini(labels):
    """Gini index of `labels`.

    Args:
        labels: sequence of class labels.

    Returns:
        One minus the sum of squared relative label frequencies.
    """
    _, counts = np.unique(labels, return_counts=True)
    p = counts / len(labels)
    return 1 - np.dot(p, p)

Info Gain:

In [50]:
def info_gain(data,split_attr, target_attr, gain_method):
    """Gain obtained by splitting `data` on `split_attr`.

    Args:
        data (pandas dataframe): examples to split
        split_attr (str): column whose distinct values define the partitions
        target_attr (str): column holding the target labels
        gain_method (function): impurity measure applied to a collection of
            labels, e.g. entropy, maj_error or gini

    Returns:
        Impurity of all labels minus the size-weighted impurity of the labels
        inside each partition.
    """
    total_impurity = gain_method(data[target_attr])

    # Distinct values of the split attribute and how many rows carry each one
    vals, freqs = np.unique(data[split_attr], return_counts=True)
    n_rows = np.sum(freqs)

    weighted_impurity = 0
    for val, freq in zip(vals, freqs):
        # Select the labels with a boolean mask. The previous
        # where(...).dropna() also discarded matching rows that had a NaN in
        # any other column, so the subset no longer matched its weight `freq`.
        subset_labels = data.loc[data[split_attr] == val, target_attr]
        weighted_impurity += (freq / n_rows) * gain_method(subset_labels)

    return total_impurity - weighted_impurity

In [87]:
# Use train_data directly: `data` is only assigned further down in the
# TESTING section, so this cell failed on a fresh top-to-bottom run.
info_gain(train_data, 'safety', 'label', maj_error)

0.043523438979818585

---
H(x1 = 0)

In [51]:
class Node(object):
    """Simple tree node that knows its parent and its children.

    Attributes are set directly from the constructor arguments:
        name: free-form identifier of the node
        value: payload shown when the node is printed
        parent: parent Node, or None for a root
        children: list of child Nodes (a fresh list when none is supplied)
    """

    def __init__(self, name='',value=None, children=None, parent=None):
        self.name = name
        self.value = value
        self.parent = parent
        # `or []` gives every node its own list instead of sharing a default
        self.children = children or []

    def add_child(self, value, name=''):
        """Create a child holding `value`, attach it to this node and return it.

        `value` used to be passed positionally and therefore landed in the
        child's `name`, leaving `child.value` as None. `name` is optional.
        """
        new_child = Node(name=name, value=value, parent=self)
        self.children.append(new_child)
        return new_child
    
    def is_root(self):
        """True when the node has no parent."""
        return self.parent is None

    def is_leaf(self):
        """True when the node has no children."""
        return not self.children

    def __str__(self):
        """Render as 'value' for a leaf, else 'value [child, child, ...]'."""
        if self.is_leaf():
            return str(self.value)
        return '{value} [{children}]'.format(value=self.value, children=', '.join(map(str, self.children)))

In [52]:
def id3(data, original_data, attrs, target_attr, gain_method, parent_label, current_depth=0, max_depth=6):
    """ ID3 Algorithm

    Args:
        data (pandas dataframe): examples that reach the current node
        original_data (pandas dataframe): full, untouched training data; it
            supplies the branch values of every split and the fallback label
            for empty partitions
        attrs (list): names of the attributes still available for splitting
            (all but the target attribute)
        target_attr (str): name of the column holding the target labels
        gain_method (function): impurity measure handed to info_gain, either
            entropy, maj_error, or gini
        parent_label: majority label of the parent node. Kept so existing
            calls keep working; it is no longer consulted because a node that
            runs out of attributes is now labelled from its own examples.
        current_depth (int): number of splits made above this node
        max_depth (int): maximum number of splits on any root-to-leaf path

    Returns:
        A bare label when the node is a leaf, otherwise a nested dict of the
        form {attribute: {attribute value: subtree or label}}.
    """

    # if the data is empty, return the label that occurs the most in the original data
    if len(data) == 0:
        vals, freqs = np.unique(original_data[target_attr], return_counts=True)
        return vals[np.argmax(freqs)]

    vals, freqs = np.unique(data[target_attr], return_counts=True)

    # if all target labels are the same, stop and return this value
    if len(vals) == 1:
        return vals[0]

    # most common label among the examples at this node
    majority_label = vals[np.argmax(freqs)]

    # if there are no more attributes, label the leaf with the majority of its
    # own examples (the parent's majority can be wrong for this partition)
    if len(attrs) == 0:
        return majority_label

    # if max depth is reached, return label that occurs the most.
    # `>` also stops a call that starts beyond the limit, where `==` would
    # never trigger.
    current_depth += 1
    if current_depth > max_depth:
        return majority_label

    # Find best attribute to split data on (first one wins on ties)
    info_gains = [info_gain(data, attr, target_attr, gain_method) for attr in attrs]
    best_attr = attrs[info_gains.index(max(info_gains))]

    # remove best attribute from attribute list
    remaining_attrs = [attr for attr in attrs if attr != best_attr]

    # grow one branch per value seen in the original data, so that values
    # absent from this partition still get a leaf
    branches = dict()
    for val in np.unique(original_data[best_attr]):
        # A boolean mask keeps every matching row; where(...).dropna() also
        # dropped matching rows that had a NaN in an unrelated column.
        subset = data[data[best_attr] == val]
        branches[val] = id3(subset, original_data, remaining_attrs, target_attr,
                            gain_method, majority_label, current_depth, max_depth)

    return {best_attr: branches}

---
**TESTING:**

In [16]:
attrs = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
label = 'label'

In [13]:
train_data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,low,vhigh,4,4,big,med,acc
1,low,high,5more,4,med,high,vgood
2,vhigh,med,2,2,big,high,unacc
3,high,high,2,2,small,high,unacc
4,vhigh,low,3,2,big,low,unacc


In [14]:
data = train_data
original_data = data

In [31]:
split_attr = 'maint'
target_attr = 'label'

In [35]:
a = data['maint']=='vhigh'
a.head()

0     True
1    False
2    False
3    False
4    False
Name: maint, dtype: bool

In [42]:
b = data.where(a).dropna()['label']

In [43]:
entropy(b)

0.6717587540417598

In [43]:
vals, freqs = np.unique(data['safety'],return_counts=True)
np.unique(data['safety'])[np.argmax(freqs)]

'med'

In [44]:
vals, freqs = np.unique(data['safety'],return_counts=True)
vals[np.argmax(freqs)]

'med'

In [46]:
# info_gain requires the impurity measure as its fourth argument.
# NOTE(review): entropy is presumably what produced the recorded output - confirm.
info_gain(data, split_attr, target_attr, entropy)

0.07741985577459642

In [104]:
tree = id3(data, original_data, attrs, label,gain_method=entropy, parent_label=None, max_depth=4)

---
## Evaluation

str

In [146]:
test_data.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,low,low,5more,2,small,med,unacc
1,low,vhigh,4,2,med,low,unacc
2,high,vhigh,3,4,med,med,unacc
3,vhigh,low,4,4,med,low,unacc
4,high,vhigh,5more,4,med,low,unacc
5,med,med,3,4,big,med,acc
6,vhigh,med,4,2,med,med,unacc
7,med,high,5more,2,med,med,unacc
8,vhigh,low,2,2,big,high,unacc
9,med,vhigh,2,2,med,high,unacc


In [63]:
def predict_label(ex, tree, label): 
    """
    Returns True if the actual label matches the label predicted by the
    trained decision tree.

    Args:
        ex: one example, indexable by attribute name and by `label`
        tree: nested dict {attribute: {attribute value: subtree or label}},
            or a bare label when the whole tree is a single leaf
        label: name of the entry in `ex` that holds the true label

    Raises:
        KeyError: if the example has an attribute value with no branch in the tree
    """
    node = tree
    # Walk down until a leaf (anything that is not a dict) is reached. A bare
    # label passed as `tree` skips the loop instead of failing on .items().
    while isinstance(node, dict):
        if not node:
            # an empty subtree cannot classify anything
            return False
        attr, branches = next(iter(node.items()))
        node = branches[ex[attr]]
    return ex[label] == node
        

In [61]:
def evaluate(data, tree, label='label'):
    """
        Loops over each data example and returns the error rate of the learned
        tree, i.e. 1 - accuracy (0.0 means every example is classified correctly).

        Args:
            data (pandas dataframe): examples to score, one per row
            tree: decision tree as accepted by predict_label
            label (str): name of the column holding the true labels; defaults
                to 'label' so the two-argument calls in this notebook work

        Raises:
            ZeroDivisionError: if `data` has no rows
    """
    n_rows = data.shape[0]

    # predict_label yields a truthy value per correctly classified row
    n_correct = 0
    for row in range(n_rows):
        n_correct += predict_label(data.iloc[row], tree, label)

    return 1 - (n_correct / float(n_rows))

---
**Entropy:**

In [128]:
# depth=1
tree1 = id3(data, original_data, attrs, label,gain_method=entropy, parent_label=None, max_depth=1)
#pprint.pprint(tree1)

In [129]:
# depth=2
tree2 = id3(data, original_data, attrs, label,gain_method=entropy, parent_label=None, max_depth=2)
#pprint.pprint(tree2)

In [127]:
# depth=3
tree3 = id3(data, original_data, attrs, label,gain_method=entropy, parent_label=None, max_depth=3)
#pprint.pprint(tree3)

In [113]:
# depth=4
tree4 = id3(data, original_data, attrs, label,gain_method=entropy, parent_label=None, max_depth=4)
#pprint.pprint(tree4)

In [114]:
# depth=5
tree5 = id3(data, original_data, attrs, label,gain_method=entropy, parent_label=None, max_depth=5)
#pprint.pprint(tree5)

In [115]:
# depth=6
tree6 = id3(data, original_data, attrs, label,gain_method=entropy, parent_label=None, max_depth=6)
#pprint.pprint(tree6)

In [130]:
res1_train = evaluate(train_data, tree1)
res1_test = evaluate(test_data, tree1)
res1_train, res1_test

(0.30200000000000005, 0.29670329670329665)

In [131]:
res2_train = evaluate(train_data, tree2)
res2_test = evaluate(test_data, tree2)
res2_train, res2_test

(0.22199999999999998, 0.22252747252747251)

In [132]:
res3_train = evaluate(train_data, tree3)
res3_test = evaluate(test_data, tree3)
res3_train, res3_test

(0.18100000000000005, 0.1964285714285714)

In [133]:
res4_train = evaluate(train_data, tree4)
res4_test = evaluate(test_data, tree4)
res4_train, res4_test

(0.08199999999999996, 0.15109890109890112)

In [134]:
res5_train = evaluate(train_data, tree5)
res5_test = evaluate(test_data, tree5)
res5_train, res5_test

(0.027000000000000024, 0.09890109890109888)

In [135]:
res6_train = evaluate(train_data, tree6)
res6_test = evaluate(test_data, tree6)
res6_train, res6_test

(0.0, 0.12225274725274726)

---
**Majority Error:**

In [157]:
# depth=1
tree1 = id3(data, original_data, attrs, label,gain_method=maj_error, parent_label=None, max_depth=1)
pprint.pprint(tree1)

{'buying': {'high': 'unacc', 'low': 'unacc', 'med': 'unacc', 'vhigh': 'unacc'}}


In [158]:
# depth=2
tree2 = id3(data, original_data, attrs, label,gain_method=maj_error, parent_label=None, max_depth=2)
pprint.pprint(tree2)

{'buying': {'high': {'maint': {'high': 'unacc',
                               'low': 'unacc',
                               'med': 'unacc',
                               'vhigh': 'unacc'}},
            'low': {'safety': {'high': 'unacc', 'low': 'unacc', 'med': 'acc'}},
            'med': {'safety': {'high': 'acc', 'low': 'unacc', 'med': 'unacc'}},
            'vhigh': {'doors': {'2': 'unacc',
                                '3': 'unacc',
                                '4': 'unacc',
                                '5more': 'unacc'}}}}


In [160]:
# depth=3
tree3 = id3(data, original_data, attrs, label,gain_method=maj_error, parent_label=None, max_depth=3)
#pprint.pprint(tree3)

In [162]:
# depth=4
tree4 = id3(data, original_data, attrs, label,gain_method=maj_error, parent_label=None, max_depth=4)
#pprint.pprint(tree4)

In [163]:
# depth=5
tree5 = id3(data, original_data, attrs, label,gain_method=maj_error, parent_label=None, max_depth=5)
#pprint.pprint(tree5)

In [164]:
# depth=6
tree6 = id3(data, original_data, attrs, label,gain_method=maj_error, parent_label=None, max_depth=6)
#pprint.pprint(tree6)

In [165]:
res1_train = evaluate(train_data, tree1)
res1_test = evaluate(test_data, tree1)
res1_train, res1_test

(0.30200000000000005, 0.29670329670329665)

In [166]:
res2_train = evaluate(train_data, tree2)
res2_test = evaluate(test_data, tree2)
res2_train, res2_test

(0.29200000000000004, 0.3131868131868132)

In [167]:
res3_train = evaluate(train_data, tree3)
res3_test = evaluate(test_data, tree3)
res3_train, res3_test

(0.19299999999999995, 0.21153846153846156)

In [168]:
res4_train = evaluate(train_data, tree4)
res4_test = evaluate(test_data, tree4)
res4_train, res4_test

(0.11099999999999999, 0.1785714285714286)

In [169]:
res5_train = evaluate(train_data, tree5)
res5_test = evaluate(test_data, tree5)
res5_train, res5_test

(0.03600000000000003, 0.12362637362637363)

In [170]:
res6_train = evaluate(train_data, tree6)
res6_test = evaluate(test_data, tree6)
res6_train, res6_test

(0.0, 0.14835164835164838)

---
**Gini Index:**

In [171]:
# depth=1
tree1 = id3(data, original_data, attrs, label,gain_method=gini, parent_label=None, max_depth=1)
#pprint.pprint(tree1)

In [172]:
# depth=2
tree2 = id3(data, original_data, attrs, label,gain_method=gini, parent_label=None, max_depth=2)
#pprint.pprint(tree2)

In [173]:
# depth=3
tree3 = id3(data, original_data, attrs, label,gain_method=gini, parent_label=None, max_depth=3)
#pprint.pprint(tree3)

In [174]:
# depth=4
tree4 = id3(data, original_data, attrs, label,gain_method=gini, parent_label=None, max_depth=4)
#pprint.pprint(tree4)

In [175]:
# depth=5
tree5 = id3(data, original_data, attrs, label,gain_method=gini, parent_label=None, max_depth=5)
#pprint.pprint(tree5)

In [176]:
# depth=6
tree6 = id3(data, original_data, attrs, label,gain_method=gini, parent_label=None, max_depth=6)
#pprint.pprint(tree6)

In [177]:
res1_train = evaluate(train_data, tree1)
res1_test = evaluate(test_data, tree1)
res1_train, res1_test

(0.30200000000000005, 0.29670329670329665)

In [178]:
res2_train = evaluate(train_data, tree2)
res2_test = evaluate(test_data, tree2)
res2_train, res2_test

(0.22199999999999998, 0.22252747252747251)

In [179]:
res3_train = evaluate(train_data, tree3)
res3_test = evaluate(test_data, tree3)
res3_train, res3_test

(0.17600000000000005, 0.18406593406593408)

In [180]:
res4_train = evaluate(train_data, tree4)
res4_test = evaluate(test_data, tree4)
res4_train, res4_test

(0.08899999999999997, 0.13736263736263732)

In [181]:
res5_train = evaluate(train_data, tree5)
res5_test = evaluate(test_data, tree5)
res5_train, res5_test

(0.027000000000000024, 0.09890109890109888)

In [182]:
res6_train = evaluate(train_data, tree6)
res6_test = evaluate(test_data, tree6)
res6_train, res6_test

(0.0, 0.12225274725274726)

---
## Numerical Attributes

In [5]:
test_b = pd.read_csv('bank/test.csv', header=None)
train_b = pd.read_csv('bank/train.csv', header=None)

In [6]:
test_b.columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
train_b.columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']

In [7]:
train_b.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,41,services,married,secondary,no,0,yes,no,unknown,5,may,114,2,-1,0,unknown,no
1,48,blue-collar,single,secondary,no,312,yes,yes,cellular,3,feb,369,2,-1,0,unknown,no
2,55,technician,married,secondary,no,1938,no,yes,cellular,18,aug,193,1,386,3,success,yes
3,54,admin.,married,tertiary,no,59,yes,no,cellular,10,jul,268,1,-1,0,unknown,no
4,34,management,single,tertiary,no,2646,no,no,cellular,14,apr,142,1,-1,0,unknown,yes


Convert each numerical feature to a binary one: True when the value is greater than or equal to the column's median, False otherwise.

In [9]:
attrs = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']

In [35]:
def num_2_binary(data, columns=None, thresholds=None):
    """Binarize the numeric columns of `data` in place.

    Each converted column becomes True where the value is greater than or
    equal to its threshold and False elsewhere. Boolean and non-numeric
    columns are left untouched, so running the function twice is harmless.

    Args:
        data (pandas dataframe): frame to modify in place
        columns (list): column names to inspect. When omitted, the
            notebook-level `attrs` list is used, as before.
        thresholds (dict): optional {column: threshold}. Columns found here
            use the given threshold instead of their own median, which lets
            a test set be binarized with the medians of the training set.

    Returns:
        dict mapping every converted column to the threshold that was used.
    """
    if columns is None:
        # original behaviour: walk the notebook-level attribute list
        columns = attrs
    if thresholds is None:
        thresholds = {}

    applied = {}
    for col in columns:
        series = data[col]
        # pandas counts bool as numeric, so exclude it explicitly
        is_bool = pd.api.types.is_bool_dtype(series)
        if is_bool or not pd.api.types.is_numeric_dtype(series):
            continue
        cut = thresholds[col] if col in thresholds else series.median()
        data[col] = series >= cut
        applied[col] = cut
    return applied


In [40]:
num_2_binary(test_b)

In [37]:
num_2_binary(train_b)

In [43]:
train_b.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,True,services,married,secondary,no,False,yes,no,unknown,False,may,False,True,True,True,unknown,no
1,True,blue-collar,single,secondary,no,False,yes,yes,cellular,False,feb,True,True,True,True,unknown,no
2,True,technician,married,secondary,no,True,no,yes,cellular,True,aug,True,False,True,True,success,yes
3,True,admin.,married,tertiary,no,False,yes,no,cellular,False,jul,True,False,True,True,unknown,no
4,False,management,single,tertiary,no,True,no,no,cellular,False,apr,False,False,True,True,unknown,yes


In [58]:
attrs = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
label = 'y'

In [67]:
# Train one entropy-based tree per depth limit, from 1 up to the number of
# attributes, and record the train / test error of each.
trees = []
train_results = []
test_results =[]

data = train_b

# len(attrs) replaces the literal 16, and `label` replaces the literal 'y',
# so the sweep follows the attribute list and the target passed to id3.
for i in range(len(attrs)):
    tree = id3(data, data, attrs, label,gain_method=entropy, parent_label=None, max_depth=i+1)
    trees.append(tree)
    train_results.append(evaluate(train_b, tree, label))
    test_results.append(evaluate(test_b, tree, label))

In [68]:
train_results

[0.11919999999999997,
 0.10599999999999998,
 0.10060000000000002,
 0.07920000000000005,
 0.06120000000000003,
 0.04720000000000002,
 0.03480000000000005,
 0.02859999999999996,
 0.02300000000000002,
 0.017000000000000015,
 0.014399999999999968,
 0.013599999999999945,
 0.013599999999999945,
 0.013599999999999945,
 0.013599999999999945,
 0.013599999999999945]

In [69]:
test_results

[0.12480000000000002,
 0.11140000000000005,
 0.10699999999999998,
 0.11499999999999999,
 0.122,
 0.13239999999999996,
 0.1422,
 0.14639999999999997,
 0.15080000000000005,
 0.15539999999999998,
 0.1562,
 0.15559999999999996,
 0.15559999999999996,
 0.15539999999999998,
 0.15539999999999998,
 0.15539999999999998]

**Majority Error (ME):**

In [70]:
# Train one majority-error-based tree per depth limit, from 1 up to the number
# of attributes, and record the train / test error of each.
trees = []
train_results = []
test_results =[]

data = train_b

# len(attrs) replaces the literal 16, and `label` replaces the literal 'y',
# so the sweep follows the attribute list and the target passed to id3.
for i in range(len(attrs)):
    tree = id3(data, data, attrs, label,gain_method=maj_error, parent_label=None, max_depth=i+1)
    trees.append(tree)
    train_results.append(evaluate(train_b, tree, label))
    test_results.append(evaluate(test_b, tree, label))

In [71]:
train_results

[0.10880000000000001,
 0.10419999999999996,
 0.09599999999999997,
 0.08179999999999998,
 0.07179999999999997,
 0.06740000000000002,
 0.06440000000000001,
 0.05920000000000003,
 0.050799999999999956,
 0.04279999999999995,
 0.03839999999999999,
 0.032399999999999984,
 0.027599999999999958,
 0.020199999999999996,
 0.01539999999999997,
 0.013599999999999945]

In [72]:
test_results

[0.11660000000000004,
 0.10880000000000001,
 0.11219999999999997,
 0.11819999999999997,
 0.11680000000000001,
 0.11939999999999995,
 0.12039999999999995,
 0.12319999999999998,
 0.12839999999999996,
 0.138,
 0.14200000000000002,
 0.14739999999999998,
 0.15100000000000002,
 0.15580000000000005,
 0.15759999999999996,
 0.15759999999999996]

Unknown as a value:

Unknown as missing:

In [74]:
a = [.5, .5]
b = [0, 1]
c = [3/5, 2/5]

In [75]:
entropy(a)

-0.0

In [76]:
entropy(b)

1.0

In [77]:
np.log2(.5)

-1.0

In [None]:
def entropy(labels):
    """Entropy, in bits, of the label frequencies found in `labels`.

    `labels` is treated as a collection of class labels, not probabilities:
    the distinct values are counted and turned into relative frequencies
    before -sum(p * log2(p)) is computed.
    """
    # NOTE: repeats the definition given in the Entropy section of this notebook.
    counts = np.unique(labels, return_counts=True)[1]
    fractions = counts / len(labels)
    log_fractions = np.log2(fractions)
    return -fractions.dot(log_fractions)

In [78]:
vals, freqs = np.unique(a, return_counts=True)
probs = freqs / len(a)

vals, freqs, probs

(array([0.5]), array([2], dtype=int64), array([1.]))

In [86]:
entropy([1,1,1,1,1,1,0,1,0])

0.7642045065086203