From 05b381bcee54b025427f6a28f321753f94520722 Mon Sep 17 00:00:00 2001
From: Hayden Metsky <hmetsky@gmail.com>
Date: Mon, 7 Jan 2019 10:57:01 -0500
Subject: [PATCH 1/2] Expand test for IntervalSet initialization

Add check that creating a new IntervalSet from a list of
2 intervals is equivalent to taking the union of those
2 intervals.
---
 catch/utils/tests/test_interval.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/catch/utils/tests/test_interval.py b/catch/utils/tests/test_interval.py
index b90522bab..a3722f315 100644
--- a/catch/utils/tests/test_interval.py
+++ b/catch/utils/tests/test_interval.py
@@ -62,10 +62,16 @@ def compare_union(self, input_a, input_b, desired_output):
         a = IntervalSet(input_a)
         b = IntervalSet(input_b)
         o = IntervalSet(desired_output)
+
         # Union is commutative, so check both orders
         self.assertEqual(a.union(b), o)
         self.assertEqual(b.union(a), o)
 
+        # Making a new IntervalSet should merge overlapping
+        # intervals, effectively taking the union
+        ab = IntervalSet(input_a + input_b)
+        self.assertEqual(ab, o)
+
     def test_union(self):
         self.compare_union([], [], [])
         self.compare_union([], [(1, 3)], [(1, 3)])
@@ -77,6 +83,7 @@ def test_union(self):
         self.compare_union([(1, 10)], [(3, 7)], [(1, 10)])
         self.compare_union([(2, 100)], [(0, 50)], [(0, 100)])
         self.compare_union([(0, 7)], [(4, 10)], [(0, 10)])
+        self.compare_union([(4, 10)], [(0, 7)], [(0, 10)])
         self.compare_union([(1, 5), (10, 15)], [(1, 5), (15, 20)], [(1, 5),
                                                                     (10, 20)])
         self.compare_union([(1, 5), (10, 15)], [(3, 12)], [(1, 15)])

From 8c98893155f213adbf2d5c6755eaccda3c774fb9 Mon Sep 17 00:00:00 2001
From: Hayden Metsky <hmetsky@gmail.com>
Date: Mon, 7 Jan 2019 11:05:49 -0500
Subject: [PATCH 2/2] Speed up construction of universe from IntervalSets

set_cover.approx_multiuniverse() constructs the universes by taking
the union over all input sets. When use_intervalsets is True,
this was previously slow. (use_intervalsets is True for most runs because
set_cover_filter sets it to True.) It worked by creating an empty
IntervalSet, and then repeatedly taking the union of the universe's
IntervalSet with another input set (also represented as an IntervalSet).
In the worst-case, taking the union takes O(n) time where n is the
current size of the universe. As a result, constructing the universe
took, in the worst-case, O(n^2) time. For particularly long genomes,
where n is large, this could take hours in practice.

This fix instead initializes each universe by simply constructing
a list of the input intervals for it, and then initializing an IntervalSet
from the list. Initializing the IntervalSet already merges overlapping
intervals (effectively taking the union of all of them) in O(N log N)
time, where N is the number of input intervals. Since the input sets
are constructed from the universe, N is O(n), where n is defined above.
So, now, constructing the universe takes, in the worst-case, O(n log n)
time. In practice, this seems to provide a noticeable speed up as well.
---
 catch/utils/set_cover.py | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/catch/utils/set_cover.py b/catch/utils/set_cover.py
index 0e11b9424..7f937b24b 100644
--- a/catch/utils/set_cover.py
+++ b/catch/utils/set_cover.py
@@ -299,23 +299,34 @@ def approx_multiuniverse(sets,
     # Create the universes from given sets
     if use_intervalsets:
         # Store the elements of each universe in an IntervalSet
-        universes = defaultdict(lambda: interval.IntervalSet([]))
+        # First, collect a list of intervals for each universe
+        universes_unmerged = defaultdict(list)
+        for sets_by_universe in sets.values():
+            for universe_id, s in sets_by_universe.items():
+                if isinstance(s, tuple):
+                    # s is a single interval
+                    universes_unmerged[universe_id].append(s)
+                else:
+                    # s is an IntervalSet
+                    for i in s.intervals:
+                        universes_unmerged[universe_id].append(i)
+        # Now, for each universe, create one IntervalSet from its list
+        # of intervals; doing so will merge overlapping intervals
+        # (effectively taking the union of all the intervals)
+        universes = {}
+        for universe_id, intervals in universes_unmerged.items():
+            universes[universe_id] = interval.IntervalSet(intervals)
     else:
         # Store the elements of each universe in a set
         universes = defaultdict(set)
-    for sets_by_universe in sets.values():
-        for universe_id, s in sets_by_universe.items():
-            if use_intervalsets:
-                if isinstance(s, tuple):
-                    # s is a single interval
-                    s = interval.IntervalSet([s])
-                universes[universe_id] = universes[universe_id].union(s)
-            elif use_arrays:
-                for v in s:
-                    universes[universe_id].add(v)
-            else:
-                universes[universe_id].update(s)
-    universes = dict(universes)
+        for sets_by_universe in sets.values():
+            for universe_id, s in sets_by_universe.items():
+                if use_arrays:
+                    for v in s:
+                        universes[universe_id].add(v)
+                else:
+                    universes[universe_id].update(s)
+        universes = dict(universes)
 
     if universe_p is None:
         # Give each universe a coverage fraction of 1.0 (i.e., cover