#!/usr/local/bin/python3
# Brodderick Rodriguez
# Auburn University - CSSE
# 28 Feb. 2019
import Node
import json
import os.path
import collections
import math
import AttributeLookUpTable as alt
import eval_accuracy as evalacc
#
# sample of how we are creating decision tree nodes
#
# r = Node.Node(attribute='root')
# self.set_root_node(r)
# c0 = r.add_child(attribute='c0', attribute_value=0)
# c1 = r.add_child(attribute='c1', attribute_value=1)
# c00 = c0.add_child(attribute='c00', attribute_value=0)
# c11 = c1.add_child(attribute='c11', attribute_value=1)
#
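# a minimal end-to-end usage sketch (assumes a FeatureMatrix object with
# .training/.testing splits, as used throughout this file; the file path
# is hypothetical):
#
# fm = FeatureMatrix(...)            # load/split your data elsewhere
# tree = DecisionTree(fm)
# tree.train()                       # build the tree via ID3
# tree.test()                        # route testing data to leaf nodes
# tree.pack('./models/model.json')   # save the trained model
#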
class DecisionTree:
    def __init__(self, feature_matrix):
        self.root_node = None
        self.feature_matrix = feature_matrix

    # sets the root node
    def set_root_node(self, node):
        if self.has_root_node():
            raise RuntimeError('root node is already set')
        self.root_node = node

    # checks if there is a root node
    def has_root_node(self):
        return self.root_node is not None
    # walks the tree for each amino acid
    def test(self):
        # check if there is a feature matrix
        if self.feature_matrix is None:
            raise RuntimeError('no feature matrix supplied')
        # create a local variable to test on
        testing = self.feature_matrix.testing
        # check if there is testing data
        if len(testing) == 0:
            raise RuntimeError('no testing data supplied')
        # check if there is a root node
        if self.root_node is None:
            raise RuntimeError('no root node. Did you train?')
        # walk the tree for each amino acid
        for sequence in testing:
            for amino_acid in sequence:
                self.walk_tree(self.root_node, amino_acid)
        print('finished testing')
    # used to test. Walks the tree for an amino acid
    # and places the amino acid in the first leaf node it encounters
    def walk_tree(self, node, amino_acid):
        # if we have reached a leaf node, add this amino acid to it
        if node.is_leaf():
            node.add_amino_acid(amino_acid)
            return
        # get node variables
        node_attribute = node.attribute_name
        node_children = node.children
        # get the amino acid attribute value for this node
        amino_acid_attribute_value = amino_acid[1][node_attribute]
        # find the next node to walk to
        next_node = node_children[str(amino_acid_attribute_value)]
        # recurse on that node
        self.walk_tree(next_node, amino_acid)
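    # note on the assumed data shape (inferred from the indexing above, not
    # confirmed elsewhere in this file): each amino_acid looks like
    # (label, {attribute_name: value, ...}), e.g.
    # ('B', {'hydrophobic': 1, 'polar': 0})  # hypothetical attribute names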
    # trains (or builds) a decision tree based on the data
    def train(self):
        # local variable for our feature matrix training data set
        training = self.feature_matrix.training
        # initialize a list of remaining attributes
        remaining_attributes = alt.get_attributes()
        # get the attribute for the initial split
        attribute = self.get_next_attribute(remaining_attributes, training)
        # remove it from the list of remaining attributes
        remaining_attributes.remove(attribute)
        # add our root node to this tree
        root = Node.Node(attribute=attribute)
        self.set_root_node(root)
        # subset the data given the attribute is 0 or 1
        d0 = self.feature_matrix.subset(training, attribute, 0)
        d1 = self.feature_matrix.subset(training, attribute, 1)
        # recurse and add nodes to the tree
        c0 = self.add_nodes(parent_node=root, data=d0, remaining_attributes=remaining_attributes)
        c1 = self.add_nodes(parent_node=root, data=d1, remaining_attributes=remaining_attributes)
        # add the root node's children
        root.add_child_n(c0, attribute_value=0)
        root.add_child_n(c1, attribute_value=1)
        print('built tree')
    # recursively adds nodes to our tree until we have no
    # more attributes or our entropy is zero
    def add_nodes(self, parent_node, data, remaining_attributes):
        # if we have no more attributes to split on
        if len(remaining_attributes) == 0:
            return Node.LeafNode(data)
        # calculate the remaining entropy of our label
        label_prob = self.probability_of_labels(data)
        label_entropy = self.entropy(label_prob)
        # if our entropy is zero, return a leaf node
        if label_entropy == 0:
            return Node.LeafNode(data)
        # choose the next attribute
        next_attribute = self.get_next_attribute(remaining_attributes, data)
        # copy rather than mutate (remaining_attributes.remove(...)) so that
        # sibling branches keep their own lists of remaining attributes
        remaining_attributes = [a for a in remaining_attributes if a != next_attribute]
        # the node we are adding to the tree
        node = Node.Node(attribute=next_attribute)
        # the subsets of data given our next attribute is 0 or 1
        d0 = self.feature_matrix.subset(data, next_attribute, 0)
        d1 = self.feature_matrix.subset(data, next_attribute, 1)
        # recursively add the child nodes to the tree
        c0 = self.add_nodes(parent_node=node, data=d0, remaining_attributes=remaining_attributes)
        c1 = self.add_nodes(parent_node=node, data=d1, remaining_attributes=remaining_attributes)
        # add this node's children, then hand it back to the parent
        node.add_child_n(c0, attribute_value=0)
        node.add_child_n(c1, attribute_value=1)
        return node
    # determines the next attribute to split the tree on
    def get_next_attribute(self, remaining_attributes, data):
        # builds a dictionary: {<attribute>: <info_gain(attribute)>}
        info_gains = {a: self.information_gain(a, data) for a in remaining_attributes}
        # get the attribute with the maximum information gain
        # (max() over a dict alone compares keys, not values)
        return max(info_gains, key=info_gains.get)
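    # e.g. (hypothetical attribute names and gains):
    # info_gains = {'hydrophobic': 0.28, 'polar': 0.11} -> 'hydrophobic'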
    # computes information gain given a parameter attribute and data
    def information_gain(self, attribute, data):
        # the probability proportions of the labels. i.e. 'B'=0.5, 'E'=0.5
        label_probs = self.probability_of_labels(data)
        # the computed entropy of the label probs in range [0, 1]
        label_entropy = self.entropy(label_probs)
        # the conditional entropy of this attribute on this data
        ce = self.conditional_entropy(attribute, data)
        # compute information gain per the ID3 algorithm
        ig = label_entropy - ce
        return ig
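    # per ID3, this is IG(D, A) = H(D) - H(D | A): the drop in label
    # entropy we would get by splitting the data D on attribute A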
    # computes the conditional entropy of an attribute given a subset dataset
    def conditional_entropy(self, attribute, data):
        # get the probability of each value of this attribute, e.g. {0: 0.4, 1: 0.6}
        probs = self.get_probability_of_attribute(attribute, data)
        # initialize entropy
        entropy = 0
        # for each attribute value, compute the partial entropy
        for val in probs:
            # generate a subset given attribute = val. i.e. hydrophobic=1
            d = self.feature_matrix.subset(data, attribute, val)
            # get the probability of the amino acid labels given our subset dataset
            p = self.probability_of_labels(d)
            # compute the entropy of those probabilities
            e = self.entropy(p)
            # accumulate the value-weighted entropy
            entropy += (e * probs[val])
        return entropy
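    # i.e. H(D | A) = sum over values v of A of P(A = v) * H(D where A = v),
    # the expected label entropy remaining once the value of A is known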
    # computes entropy of a dictionary {<value>: <probability>}
    # where each probability represents how often that value occurs
    def entropy(self, probabilities):
        result = 0
        # for each probability
        for _, prob in probabilities.items():
            # if the probability is zero, add nothing to result
            # (by convention 0 * log2(0) = 0, and log2(0) is undefined)
            if prob == 0:
                continue
            # otherwise, accumulate -p * log2(p) per the Shannon entropy equation
            result += (-prob) * math.log2(prob)
        return result
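    # worked example (two equally likely labels carry one full bit):
    #   entropy({'B': 0.5, 'E': 0.5}) -> 1.0
    #   entropy({'B': 1.0, 'E': 0.0}) -> 0.0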
    # computes the probability of each value of an attribute within a subset dataset
    # returns a dictionary: {<value>: <probability>}
    def get_probability_of_attribute(self, attribute, data):
        # initialize a list to contain all the attribute values
        attribute_values = []
        # for each amino acid
        for sequence in data:
            for amino_acid in sequence:
                # append this attribute's value to the attribute_values list
                attribute_values.append(amino_acid[1][attribute])
        # get the number of values
        num_items = len(attribute_values)
        # count the occurrences of each attribute value
        value_occurrences = collections.Counter(attribute_values)
        # build a dictionary: {<value>: <probability>}
        probabilities = {v: value_occurrences[v] / num_items for v in value_occurrences}
        return probabilities
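    # e.g. for a binary attribute set in 3 of 5 amino acids this returns
    # {1: 0.6, 0: 0.4} (hypothetical counts)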
    # computes the probability of the amino acid labels given a subset dataset
    def probability_of_labels(self, data):
        # grab the full label set for the entire feature matrix
        labels = self.feature_matrix.labels
        # get the labels that appear in this subset dataset
        all_labels = self.feature_matrix.get_all_labels(data)
        # compute the number of items in all_labels (guard against dividing by zero)
        num_items = len(all_labels) if len(all_labels) > 0 else 1
        # count the occurrences of the labels
        label_occurrences = collections.Counter(all_labels)
        # build a probability dictionary: {<label>: <probability>}
        probabilities = {l: label_occurrences[l] / num_items for l in labels}
        return probabilities
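    # unlike get_probability_of_attribute, this iterates over the full label
    # set, so labels absent from the subset still appear with probability 0,
    # e.g. {'B': 1.0, 'E': 0.0} for a pure subset (hypothetical values)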
    # creates a saved model file for this Decision Tree
    def pack(self, file_path='./models/model.json'):
        if self.root_node is None:
            return ''
        result, queue = {}, [self.root_node]
        # while we have an element in the queue (breadth-first walk)
        while queue:
            node = queue.pop(0)
            # add this node to the result
            result[node.node_id] = node.pack()
            # for each child, add it to the queue
            queue += [child for child in node.children.values()]
        with open(file_path, 'w') as file:
            file.write(json.dumps(result))
        return result
    # creates a Decision Tree from a saved file
    def unpack(self, file_path):
        # check if the model exists
        if not os.path.isfile(file_path):
            raise RuntimeError('file does not exist')
        # check if this decision tree is empty
        if self.has_root_node():
            raise RuntimeError('decision tree not empty')

        # recursively adds children to each node in a DFS manner
        def recursive_add_children(parent_node, whole_dict, child_dict):
            # for each child node
            for child_attribute_value, child_id in child_dict.items():
                # get the next child's attribute and children
                child_attribute = whole_dict[str(child_id)]['attribute']
                child_children = whole_dict[str(child_id)]['children']
                is_leaf = whole_dict[str(child_id)]['leaf']
                # create a node and add it to the parent
                if is_leaf == 1:
                    child_node = Node.LeafNode()
                else:
                    child_node = Node.Node(child_attribute)
                parent_node.add_child_n(child_node, child_attribute_value)
                # recursively add its children
                if is_leaf == 0:
                    recursive_add_children(parent_node=child_node, whole_dict=whole_dict, child_dict=child_children)

        # get the model file contents
        with open(file_path, 'r') as file:
            data = json.load(file)
        # get the root entry in the dict
        root_entry = data['0']
        # build the root node
        root = Node.Node(root_entry['attribute'])
        root_children = root_entry['children']
        # add each child's children recursively
        recursive_add_children(parent_node=root, whole_dict=data, child_dict=root_children)
        # set the root node and hand it back
        self.set_root_node(root)
        return self.root_node
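    # a minimal save/load round-trip sketch (the file path is hypothetical):
    #
    # tree.pack('./models/model.json')
    # restored = DecisionTree(feature_matrix)
    # restored.unpack('./models/model.json')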
    # evaluates the trained tree on the testing data
    def test_performance(self):
        # visit every leaf; calling get_label() here appears to resolve each
        # leaf's label before accuracy is evaluated
        def find_leafs(node):
            c0 = node.children['0']
            c1 = node.children['1']
            if c0.is_leaf():
                c0.get_label()
            else:
                find_leafs(c0)
            if c1.is_leaf():
                c1.get_label()
            else:
                find_leafs(c1)

        find_leafs(self.root_node)
        acc = evalacc.eval_accuracy()
        acc.test_accuracy(self.feature_matrix.testing)