forked from insightbook/data-science-from-scratch
/
ch17_decision_trees.py
155 lines (118 loc) · 5.73 KB
/
ch17_decision_trees.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from __future__ import division
from collections import Counter, defaultdict
from functools import partial
import math, random
def entropy(class_probabilities):
"""given a list of class probabilities, compute the entropy"""
return sum(-p * math.log(p, 2) for p in class_probabilities if p)
def class_probabilities(labels):
total_count = len(labels)
return [count / total_count
for count in Counter(labels).values()]
def data_entropy(labeled_data):
labels = [label for _, label in labeled_data]
probabilities = class_probabilities(labels)
return entropy(probabilities)
def partition_entropy(subsets):
"""find the entropy from this partition of data into subsets"""
total_count = sum(len(subset) for subset in subsets)
return sum( data_entropy(subset) * len(subset) / total_count
for subset in subsets )
def group_by(items, key_fn):
"""returns a defaultdict(list), where each input item
is in the list whose key is key_fn(item)"""
groups = defaultdict(list)
for item in items:
key = key_fn(item)
groups[key].append(item)
return groups
def partition_by(inputs, attribute):
"""returns a dict of inputs partitioned by the attribute
each input is a pair (attribute_dict, label)"""
return group_by(inputs, lambda x: x[0][attribute])
def partition_entropy_by(inputs,attribute):
"""computes the entropy corresponding to the given partition"""
partitions = partition_by(inputs, attribute)
return partition_entropy(partitions.values())
def classify(tree, input):
"""classify the input using the given decision tree"""
# if this is a leaf node, return its value
if tree in [True, False]:
return tree
# otherwise find the correct subtree
attribute, subtree_dict = tree
subtree_key = input.get(attribute) # None if input is missing attribute
if subtree_key not in subtree_dict: # if no subtree for key,
subtree_key = None # we'll use the None subtree
subtree = subtree_dict[subtree_key] # choose the appropriate subtree
return classify(subtree, input) # and use it to classify the input
def build_tree_id3(inputs, split_candidates=None):
# if this is our first pass,
# all keys of the first input are split candidates
if split_candidates is None:
split_candidates = inputs[0][0].keys()
# count Trues and Falses in the inputs
num_inputs = len(inputs)
num_trues = len([label for item, label in inputs if label])
num_falses = num_inputs - num_trues
if num_trues == 0: # if only Falses are left
return False # return a "False" leaf
if num_falses == 0: # if only Trues are left
return True # return a "True" leaf
if not split_candidates: # if no split candidates left
return num_trues >= num_falses # return the majority leaf
# otherwise, split on the best attribute
best_attribute = min(split_candidates,
key=partial(partition_entropy_by, inputs))
partitions = partition_by(inputs, best_attribute)
new_candidates = [a for a in split_candidates
if a != best_attribute]
# recursively build the subtrees
subtrees = { attribute : build_tree_id3(subset, new_candidates)
for attribute, subset in partitions.iteritems() }
subtrees[None] = num_trues > num_falses # default case
return (best_attribute, subtrees)
def forest_classify(trees, input):
votes = [classify(tree, input) for tree in trees]
vote_counts = Counter(votes)
return vote_counts.most_common(1)[0][0]
if __name__ == "__main__":
inputs = [
({'level':'Senior','lang':'Java','tweets':'no','phd':'no'}, False),
({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'}, False),
({'level':'Mid','lang':'Python','tweets':'no','phd':'no'}, True),
({'level':'Junior','lang':'Python','tweets':'no','phd':'no'}, True),
({'level':'Junior','lang':'R','tweets':'yes','phd':'no'}, True),
({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'}, False),
({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'}, True),
({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
({'level':'Senior','lang':'R','tweets':'yes','phd':'no'}, True),
({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'}, True),
({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'}, True),
({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)
]
for key in ['level','lang','tweets','phd']:
print key, partition_entropy_by(inputs, key)
print
senior_inputs = [(input, label)
for input, label in inputs if input["level"] == "Senior"]
for key in ['lang', 'tweets', 'phd']:
print key, partition_entropy_by(senior_inputs, key)
print
print "building the tree"
tree = build_tree_id3(inputs)
print tree
print "Junior / Java / tweets / no phd", classify(tree,
{ "level" : "Junior",
"lang" : "Java",
"tweets" : "yes",
"phd" : "no"} )
print "Junior / Java / tweets / phd", classify(tree,
{ "level" : "Junior",
"lang" : "Java",
"tweets" : "yes",
"phd" : "yes"} )
print "Intern", classify(tree, { "level" : "Intern" } )
print "Senior", classify(tree, { "level" : "Senior" } )