Skip to content

Commit

Permalink
Merge pull request #66 from sjanssen2/seedset
Browse files Browse the repository at this point in the history
support for 'seedsets' in destructive_maxdist heuristic:
  • Loading branch information
qiyunzhu committed Apr 19, 2017
2 parents c8e4c52 + 2414a0b commit 9a6bc55
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 2 deletions.
17 changes: 15 additions & 2 deletions phylogeny/prototypeSelection.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,8 @@ def prototype_selection_constructive_protoclass(dm: DistanceMatrix,
return [dm.ids[idx] for idx in prototypes]


def prototype_selection_destructive_maxdist(dm, num_prototypes):
def prototype_selection_destructive_maxdist(dm, num_prototypes,
seedset=set([])):
'''Heuristically select k prototypes for given distance matrix.
Prototype selection is NP-hard. This is an implementation of a greedy
Expand All @@ -469,6 +470,12 @@ def prototype_selection_destructive_maxdist(dm, num_prototypes):
Must be >= 2, since a single prototype is useless.
Must be smaller than the number of elements in the distance matrix,
otherwise no reduction is necessary.
seedset: set(ids)
A set of element IDs that are preferably selected for the resulting
set. All of them get selected first if num_prototypes >= len(seedset);
otherwise a random sub-selection of seedset is returned.
Warning: It will most likely violate the global objective function to
pre-select elements.
Returns
-------
Expand All @@ -485,7 +492,7 @@ def prototype_selection_destructive_maxdist(dm, num_prototypes):
Notes
-----
Timing: %timeit -n 100 prototype_selection_constructive_maxdist(dm, 100)
Timing: %timeit -n 100 prototype_selection_destructive_maxdist(dm, 100)
100 loops, best of 3: 2.1 s per loop
where the dm holds 27,398 elements
function signature with type annotation for future use with python >= 3.5:
Expand All @@ -508,6 +515,12 @@ def prototype_selection_constructive_maxdist(dm: DistanceMatrix,
# distances from each element to all others
currDists = dm.data.sum(axis=1)

# a dirty hack to ensure that all elements of the seedset will be removed
# last and thus make it into the resulting set
maxVal = currDists.max()
for e in seedset:
currDists[dm.index(e)] = maxVal*2

# the element to remove first is the one that has smallest distance to all
# other. "Removing" works by tagging its distance-sum as infinity. Plus, we
# decrease the number of available elements by one.
Expand Down
35 changes: 35 additions & 0 deletions phylogeny/tests/test_prototypeSelection.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,41 @@ def test_prototype_selection_constructive_pMedian(self):
res)
self.assertAlmostEqual(100.32727028, distance_sum(res, self.dm100))

def test_seedset(self):
    """Exercise the seedset argument of the destructive maxdist heuristic.

    Covers three seeded selections (two on the 20-element matrix, one on
    the 100-element matrix) and the error raised for an unknown seed ID.
    """
    # Seeds that are supposed to be selected anyway: the outcome should be
    # identical to the expected selection.
    seeds = {'A', 'P'}
    selected = prototype_selection_destructive_maxdist(self.dm20, 5, seeds)
    self.assertCountEqual(('A', 'P', 'T', 'C', 'O'), selected)
    self.assertAlmostEqual(5.4494, distance_sum(selected, self.dm20))

    # Different seeds: the selection changes and the score (sum of
    # distances) drops slightly.
    seeds = ['G', 'I']
    selected = prototype_selection_destructive_maxdist(self.dm20, 5, seeds)
    self.assertCountEqual(('A', 'G', 'I', 'K', 'T'), selected)
    self.assertAlmostEqual(5.3082, distance_sum(selected, self.dm20))

    # Same check on the larger matrix with three seeded sample IDs.
    seeds = [
        '550.L1S18.s.1.sequence',
        '550.L1S142.s.1.sequence',
        '550.L1S176.s.1.sequence',
    ]
    expected = (
        '550.L1S1.s.1.sequence', '550.L1S15.s.1.sequence',
        '550.L1S18.s.1.sequence', '550.L1S129.s.1.sequence',
        '550.L1S132.s.1.sequence', '550.L1S136.s.1.sequence',
        '550.L1S142.s.1.sequence', '550.L1S147.s.1.sequence',
        '550.L1S176.s.1.sequence', '550.L1S189.s.1.sequence',
    )
    selected = prototype_selection_destructive_maxdist(
        self.dm100, 10, seeds)
    self.assertCountEqual(expected, selected)
    self.assertAlmostEqual(
        26.7457727563, distance_sum(selected, self.dm100))

    # A seed that is not part of the matrix must be rejected.
    with self.assertRaisesRegex(
            MissingIDError,
            "The ID 'X' is not in the dissimilarity matrix."):
        prototype_selection_destructive_maxdist(
            self.dm20, 5, {'A', 'X'})


# Invoke main() only when this file is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    main()

0 comments on commit 9a6bc55

Please sign in to comment.