MAFINDEX_VERSION = 2 (inclusive end coordinates).

The use of inclusive coodinates for the sqlite MAX index is an attempt to fix issues discussed in the following pull requests: biopython#504 biopython#1086 A test has been added in `test_MafIO_index.py` to check that the number of MAF alignment blocks returned when querying for a single position will be 1 at the boundary between blocks (it could be 2 with the previous MAF index, which is a bug). The indices built with end-exclusive coordinates will not be compatible with the present version.
blaiseli · Mar 27, 2017 · aaf6d9d · aaf6d9d
1 parent 48ae539
commit aaf6d9d
Show file tree

Hide file tree

Showing 2 changed files with 95 additions and 57 deletions.
diff --git a/Bio/AlignIO/MafIO.py b/Bio/AlignIO/MafIO.py
@@ -85,7 +85,7 @@
 from Bio.Align import MultipleSeqAlignment
 from .Interfaces import SequentialAlignmentWriter
 
-MAFINDEX_VERSION = 1
+MAFINDEX_VERSION = 2
 
 
 class MafWriter(SequentialAlignmentWriter):
@@ -349,7 +349,7 @@ def __make_new_index(self):
         """Read MAF file and generate SQLite index"""
         # make the tables
         self._con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
-        self._con.execute("INSERT INTO meta_data (key, value) VALUES ('version', 1);")
+        self._con.execute("INSERT INTO meta_data (key, value) VALUES ('version', %s);" % MAFINDEX_VERSION)
         self._con.execute("INSERT INTO meta_data (key, value) VALUES ('record_count', -1);")
         self._con.execute("INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');" %
                           (self._target_seqname,))
@@ -388,6 +388,7 @@ def __make_new_index(self):
                 break
 
             # batch is made from self.__maf_indexer(),
+            # which yields zero-based "inclusive" start and end coordinates
             self._con.executemany(
                 "INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);", batch)
             self._con.commit()
@@ -433,13 +434,18 @@ def __maf_indexer(self):
 
                         if line_split[1] == self._target_seqname:
                             start = int(line_split[2])
-                            end = int(line_split[2]) + int(line_split[3])
+                            size = int(line_split[3])
+                            if size != len(line_split[6].replace("-", "")):
+                                raise ValueError(
+                                    "Invalid length for target coordinates "
+                                    "(expected %s, found %s)" % (
+                                        size, len(line_split[6].replace("-", ""))))
 
-                            if end - start != len(line_split[6].replace("-", "")):
-                                raise ValueError("Invalid length for target coordinates (expected %s, found %s)" %
-                                                 (end - start, len(line_split[6].replace("-", ""))))
+                            # "inclusive" end position is start + length - 1
+                            end = start + size - 1
 
-                            yield (self._ucscbin(start, end), start, end, offset)
+                            # _ucscbin takes end-exclusive coordinates
+                            yield (self._ucscbin(start, end + 1), start, end, offset)
 
                             break
 
@@ -508,9 +514,12 @@ def search(self, starts, ends):
         if len(starts) != len(ends):
             raise ValueError("Every position in starts must have a match in ends")
 
+        # Could it be safer to sort the (exonstart, exonend) pairs?
         for exonstart, exonend in zip(starts, ends):
-            if exonstart >= exonend:
-                raise ValueError("Exon coordinates invalid (%s >= %s)" % (exonstart, exonend))
+            exonlen = exonend - exonstart
+            if exonlen < 1:
+                raise ValueError("Exon coordinates (%d, %d) invalid: exon length (%d) < 1" % (
+                    exonstart, exonend, exonlen))
         con = self._con
 
         # Keep track of what blocks have already been yielded
@@ -537,14 +546,25 @@ def search(self, starts, ends):
             # right.
             # -----
 
-            result = con.execute("SELECT DISTINCT start, end, offset FROM "
-                                 "offset_data WHERE bin IN (%s) AND (end BETWEEN %s AND %s "
-                                 "OR %s BETWEEN start AND end) ORDER BY start, end, "
-                                 "offset ASC;"
-                                 % (possible_bins, exonstart, exonend, exonend))
+            # We are testing overlap between the query segment and records in
+            # the index, using non-strict coordinates comparisons.
+            # The query segment end must be passed as end-inclusive
+            # The index should also have been build with end-inclusive
+            # end coordinates.
+            # See https://github.com/biopython/biopython/pull/1086#issuecomment-285069073
+
+            result = con.execute(
+                "SELECT DISTINCT start, end, offset FROM offset_data "
+                "WHERE bin IN (%s) "
+                "AND (end BETWEEN %s AND %s OR %s BETWEEN start AND end) "
+                "ORDER BY start, end, offset ASC;" %
+                (possible_bins, exonstart, exonend - 1, exonend - 1))
 
             rows = result.fetchall()
 
+            # rows come from the sqlite index,
+            # which should have been written using __make_new_index,
+            # so rec_start and rec_end should be zero-based "inclusive" coordinates
             for rec_start, rec_end, offset in rows:
                 # Avoid yielding multiple time the same block
                 if (rec_start, rec_end) in yielded_rec_coords:
@@ -560,7 +580,8 @@ def search(self, starts, ends):
                     if record.id == self._target_seqname:
                         # start and size come from the maf lines
                         start = record.annotations["start"]
-                        end = record.annotations["start"] + record.annotations["size"]
+                        # "inclusive" end is start + length - 1
+                        end = start + record.annotations["size"] - 1
 
                         if not (start == rec_start and end == rec_end):
                             raise ValueError("Expected %s-%s @ offset %s, found %s-%s" %
@@ -641,16 +662,20 @@ def get_spliced(self, starts, ends, strand=1):
                     # length including gaps (i.e. alignment length)
                     rec_length = len(seqrec)
                     rec_start = seqrec.annotations["start"]
-                    rec_end = seqrec.annotations["start"] + seqrec.annotations["size"]
-
-                    total_rec_length += rec_end - rec_start
+                    ungapped_length = seqrec.annotations["size"]
+                    # inclusive end in zero-based coordinates of the reference
+                    rec_end = rec_start + ungapped_length - 1
+                    # This is length in terms of actual letters in the reference
+                    total_rec_length += ungapped_length
 
                     # blank out these positions for every seqname
                     for seqrec in multiseq:
-                        for pos in range(rec_start, rec_end):
+                        for pos in range(rec_start, rec_end + 1):
                             split_by_position[seqrec.id][pos] = ""
 
                     break
+            # http://psung.blogspot.fr/2007/12/for-else-in-python.html
+            # https://docs.python.org/2/tutorial/controlflow.html#break-and-continue-statements-and-else-clauses-on-loops
             else:
                 raise ValueError("Did not find %s in alignment bundle" % (self._target_seqname,))
 
@@ -671,7 +696,7 @@ def get_spliced(self, starts, ends, strand=1):
 
                 # increment the real_pos counter only when non-gaps are found in
                 # the target_seqname, and we haven't reached the end of the record
-                if track_val != "-" and real_pos < rec_end - 1:
+                if track_val != "-" and real_pos < rec_end:
                     real_pos += 1
 
         # make sure the number of bp entries equals the sum of the record lengths
@@ -682,9 +707,9 @@ def get_spliced(self, starts, ends, strand=1):
                               total_rec_length))
 
         # translates a position in the target_seqname sequence to its gapped length
-        realpos_to_len = dict([(x, len(y))
-                               for x, y in split_by_position[self._target_seqname].items()
-                               if len(y) > 1])
+        realpos_to_len = dict([(pos, len(gapped_fragment))
+                               for pos, gapped_fragment in split_by_position[self._target_seqname].items()
+                               if len(gapped_fragment) > 1])
 
         # splice together the exons
         subseq = {}
@@ -700,6 +725,7 @@ def get_spliced(self, starts, ends, strand=1):
             append = seq_splice.append
 
             for exonstart, exonend in zip(starts, ends):
+                # exonend is exclusive
                 for real_pos in range(exonstart, exonend):
                     # if this seqname has this position, add it
                     if real_pos in seq_split:

diff --git a/Tests/test_MafIO_index.py b/Tests/test_MafIO_index.py
@@ -1,8 +1,8 @@
 # Copyright 2012 by Andrew Sczesnak.  All rights reserved.
+# Revisions Copyright 2017 by Blaise Li.  All rights reserved.
 # This code is part of the Biopython distribution and governed by its
 # license.  Please see the LICENSE file that should have been included
 # as part of this package.
-
 """Unit tests for Bio.AlignIO.MafIO.MafIndex()"""
 
 try:
@@ -289,50 +289,48 @@ def test_correct_retrieval_1(self):
             search = self.idx.search((3014742, 3018161), (3015028, 3018644))
             results = [x for x in search]
 
-            self.assertEqual(len(results), 12)
+            self.assertEqual(len(results), 4 + 4)
 
             self.assertEqual(set([len(x) for x in results]),
-                             set([5, 10, 7, 6, 3, 1, 1, 1, 2, 4, 4, 9]))
-
-            self.assertEqual(set([x.annotations["start"] for y in results
-                                  for x in y]),
-                             set([3018359, 16390338, 15871771, 184712,
-                                  16169512, 16169976, 3014842, 1371, 7842,
-                                  171548, 16389874, 15871306, 6404, 184317,
-                                  14750994, 3015028, 1616, 8040, 171763,
-                                  16169731, 6627, 184539, 3014689, 15870832,
-                                  16389401, 6228, 184148, 1201, 3018230,
-                                  15871676, 16390243, 3014778, 3018482, 3017743,
-                                  3018644, 78070420, 3014742, 6283, 184202,
-                                  1257, 3018161, 16390178, 15871611, 16169818,
-                                  3014795, 184257, 6365, 15871286, 16389854,
-                                  16169492, 171521, 7816, 1309]))
+                             set([4, 1, 9, 10, 4, 3, 5, 1]))
+
+            # Code formatting note:
+            # Expected start coordinates are grouped by alignment blocks
+            self.assertEqual(
+                set([x.annotations["start"] for y in results for x in y]),
+                set([
+                    3014742, 6283, 184202, 1257,
+                    3014778,
+                    3014795, 184257, 6365, 15871286, 16389854, 16169492, 171521, 7816, 1309,
+                    3014842, 1371, 7842, 171548, 16169512, 16389874, 15871306, 6404, 184317, 14750994,
+                    3018161, 16390178, 15871611, 16169818,
+                    3018230, 15871676, 16390243,
+                    3018359, 16390338, 15871771, 184712, 16169976, 3018482]))
 
         def test_correct_retrieval_2(self):
             search = self.idx.search((3009319, 3021421), (3012566, 3021536))
             results = [x for x in search]
 
-            self.assertEqual(len(results), 8)
+            self.assertEqual(len(results), 6)
 
             self.assertEqual(set([len(x) for x in results]),
-                             set([14, 5, 2, 6, 7, 15, 6, 4]))
-
-            self.assertEqual(set([x.annotations["start"] for y in results
-                                  for x in y]),
-                             set([3021421, 9910, 996, 16173434, 16393782,
-                                  15875216, 11047, 175213, 3552, 677, 78072203,
-                                  3590, 95587, 14757054, 3012441, 15860899,
-                                  16379447, 16160646, 180525, 3009319, 11087,
-                                  3012566, 15861013, 16379561, 16160760, 180626,
-                                  310, 3021465, 9957, 16173483, 16393831,
-                                  15875265, 78072243, 14757099, 3021275, 9741,
-                                  838, 16173265, 16393613, 15875047, 10878,
-                                  175057, 3382, 521, 78072035, 73556, 3422,
-                                  95418, 14756885, 3021494, 16173516, 16393864,
-                                  15875298, 78072287, 14757144, 3012076,
-                                  16160203, 16379004, 15860456]))
+                             set([2, 4, 5, 14, 7, 6]))
+
+            # Code formatting note:
+            # Expected start coordinates are grouped by alignment blocks
+            self.assertEqual(
+                set([x.annotations["start"] for y in results for x in y]),
+                set([
+                    3009319, 11087,
+                    3012076, 16160203, 16379004, 15860456,
+                    3012441, 15860899, 16379447, 16160646, 180525,
+                    3021421, 9910, 996, 16173434, 16393782, 15875216, 11047, 175213, 3552, 677, 78072203, 3590, 95587, 14757054,
+                    3021465, 9957, 16173483, 16393831, 15875265, 78072243, 14757099,
+                    3021494, 16173516, 16393864, 15875298, 78072287, 14757144]))
 
         def test_correct_retrieval_3(self):
+            """References:
+            https://github.com/biopython/biopython/issues/1083"""
             search = self.idx.search((3012076, 3012076 + 300), (3012076 + 100, 3012076 + 400))
             results = [x for x in search]
 
@@ -349,6 +347,20 @@ def test_correct_retrieval_3(self):
                     3012076, 16160203, 16379004, 15860456,
                     3012441, 15860899, 16379447, 16160646, 180525]))
 
+        def test_correct_block_boundary(self):
+            """References:
+            https://github.com/biopython/biopython/pull/504
+            https://github.com/biopython/biopython/pull/1086#issuecomment-285080702
+            """
+            search = self.idx.search([3014688], [3014689])
+            self.assertEqual(len(list(search)), 1)
+
+            search = self.idx.search([3014689], [3014690])
+            self.assertEqual(len(list(search)), 1)
+
+            search = self.idx.search([3014688], [3014690])
+            self.assertEqual(len(list(search)), 2)
+
     class TestSearchBadMAF(unittest.TestCase):
         """Test index searching on an incorrectly-formatted MAF"""