From aaf6d9d7274a15a9b948fc75382f7ecda578e307 Mon Sep 17 00:00:00 2001 From: Blaise Li Date: Thu, 9 Mar 2017 14:29:27 +0100 Subject: [PATCH] MAFINDEX_VERSION = 2 (inclusive end coordinates). The use of inclusive coodinates for the sqlite MAX index is an attempt to fix issues discussed in the following pull requests: https://github.com/biopython/biopython/pull/504 https://github.com/biopython/biopython/pull/1086 A test has been added in `test_MafIO_index.py` to check that the number of MAF alignment blocks returned when querying for a single position will be 1 at the boundary between blocks (it could be 2 with the previous MAF index, which is a bug). The indices built with end-exclusive coordinates will not be compatible with the present version. --- Bio/AlignIO/MafIO.py | 72 ++++++++++++++++++++++++----------- Tests/test_MafIO_index.py | 80 ++++++++++++++++++++++----------------- 2 files changed, 95 insertions(+), 57 deletions(-) diff --git a/Bio/AlignIO/MafIO.py b/Bio/AlignIO/MafIO.py index eea01c9ebf6..df98736eecf 100644 --- a/Bio/AlignIO/MafIO.py +++ b/Bio/AlignIO/MafIO.py @@ -85,7 +85,7 @@ from Bio.Align import MultipleSeqAlignment from .Interfaces import SequentialAlignmentWriter -MAFINDEX_VERSION = 1 +MAFINDEX_VERSION = 2 class MafWriter(SequentialAlignmentWriter): @@ -349,7 +349,7 @@ def __make_new_index(self): """Read MAF file and generate SQLite index""" # make the tables self._con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);") - self._con.execute("INSERT INTO meta_data (key, value) VALUES ('version', 1);") + self._con.execute("INSERT INTO meta_data (key, value) VALUES ('version', %s);" % MAFINDEX_VERSION) self._con.execute("INSERT INTO meta_data (key, value) VALUES ('record_count', -1);") self._con.execute("INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');" % (self._target_seqname,)) @@ -388,6 +388,7 @@ def __make_new_index(self): break # batch is made from self.__maf_indexer(), + # which yields zero-based "inclusive" start and end coordinates self._con.executemany( "INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);", batch) self._con.commit() @@ -433,13 +434,18 @@ def __maf_indexer(self): if line_split[1] == self._target_seqname: start = int(line_split[2]) - end = int(line_split[2]) + int(line_split[3]) + size = int(line_split[3]) + if size != len(line_split[6].replace("-", "")): + raise ValueError( + "Invalid length for target coordinates " + "(expected %s, found %s)" % ( + size, len(line_split[6].replace("-", "")))) - if end - start != len(line_split[6].replace("-", "")): - raise ValueError("Invalid length for target coordinates (expected %s, found %s)" % - (end - start, len(line_split[6].replace("-", "")))) + # "inclusive" end position is start + length - 1 + end = start + size - 1 - yield (self._ucscbin(start, end), start, end, offset) + # _ucscbin takes end-exclusive coordinates + yield (self._ucscbin(start, end + 1), start, end, offset) break @@ -508,9 +514,12 @@ def search(self, starts, ends): if len(starts) != len(ends): raise ValueError("Every position in starts must have a match in ends") + # Could it be safer to sort the (exonstart, exonend) pairs? for exonstart, exonend in zip(starts, ends): - if exonstart >= exonend: - raise ValueError("Exon coordinates invalid (%s >= %s)" % (exonstart, exonend)) + exonlen = exonend - exonstart + if exonlen < 1: + raise ValueError("Exon coordinates (%d, %d) invalid: exon length (%d) < 1" % ( + exonstart, exonend, exonlen)) con = self._con # Keep track of what blocks have already been yielded @@ -537,14 +546,25 @@ def search(self, starts, ends): # right. # ----- - result = con.execute("SELECT DISTINCT start, end, offset FROM " - "offset_data WHERE bin IN (%s) AND (end BETWEEN %s AND %s " - "OR %s BETWEEN start AND end) ORDER BY start, end, " - "offset ASC;" - % (possible_bins, exonstart, exonend, exonend)) + # We are testing overlap between the query segment and records in + # the index, using non-strict coordinates comparisons. + # The query segment end must be passed as end-inclusive + # The index should also have been build with end-inclusive + # end coordinates. + # See https://github.com/biopython/biopython/pull/1086#issuecomment-285069073 + + result = con.execute( + "SELECT DISTINCT start, end, offset FROM offset_data " + "WHERE bin IN (%s) " + "AND (end BETWEEN %s AND %s OR %s BETWEEN start AND end) " + "ORDER BY start, end, offset ASC;" % + (possible_bins, exonstart, exonend - 1, exonend - 1)) rows = result.fetchall() + # rows come from the sqlite index, + # which should have been written using __make_new_index, + # so rec_start and rec_end should be zero-based "inclusive" coordinates for rec_start, rec_end, offset in rows: # Avoid yielding multiple time the same block if (rec_start, rec_end) in yielded_rec_coords: @@ -560,7 +580,8 @@ def search(self, starts, ends): if record.id == self._target_seqname: # start and size come from the maf lines start = record.annotations["start"] - end = record.annotations["start"] + record.annotations["size"] + # "inclusive" end is start + length - 1 + end = start + record.annotations["size"] - 1 if not (start == rec_start and end == rec_end): raise ValueError("Expected %s-%s @ offset %s, found %s-%s" % @@ -641,16 +662,20 @@ def get_spliced(self, starts, ends, strand=1): # length including gaps (i.e. alignment length) rec_length = len(seqrec) rec_start = seqrec.annotations["start"] - rec_end = seqrec.annotations["start"] + seqrec.annotations["size"] - - total_rec_length += rec_end - rec_start + ungapped_length = seqrec.annotations["size"] + # inclusive end in zero-based coordinates of the reference + rec_end = rec_start + ungapped_length - 1 + # This is length in terms of actual letters in the reference + total_rec_length += ungapped_length # blank out these positions for every seqname for seqrec in multiseq: - for pos in range(rec_start, rec_end): + for pos in range(rec_start, rec_end + 1): split_by_position[seqrec.id][pos] = "" break + # http://psung.blogspot.fr/2007/12/for-else-in-python.html + # https://docs.python.org/2/tutorial/controlflow.html#break-and-continue-statements-and-else-clauses-on-loops else: raise ValueError("Did not find %s in alignment bundle" % (self._target_seqname,)) @@ -671,7 +696,7 @@ def get_spliced(self, starts, ends, strand=1): # increment the real_pos counter only when non-gaps are found in # the target_seqname, and we haven't reached the end of the record - if track_val != "-" and real_pos < rec_end - 1: + if track_val != "-" and real_pos < rec_end: real_pos += 1 # make sure the number of bp entries equals the sum of the record lengths @@ -682,9 +707,9 @@ def get_spliced(self, starts, ends, strand=1): total_rec_length)) # translates a position in the target_seqname sequence to its gapped length - realpos_to_len = dict([(x, len(y)) - for x, y in split_by_position[self._target_seqname].items() - if len(y) > 1]) + realpos_to_len = dict([(pos, len(gapped_fragment)) + for pos, gapped_fragment in split_by_position[self._target_seqname].items() + if len(gapped_fragment) > 1]) # splice together the exons subseq = {} @@ -700,6 +725,7 @@ def get_spliced(self, starts, ends, strand=1): append = seq_splice.append for exonstart, exonend in zip(starts, ends): + # exonend is exclusive for real_pos in range(exonstart, exonend): # if this seqname has this position, add it if real_pos in seq_split: diff --git a/Tests/test_MafIO_index.py b/Tests/test_MafIO_index.py index e44143a8f2f..8e56477a33e 100644 --- a/Tests/test_MafIO_index.py +++ b/Tests/test_MafIO_index.py @@ -1,8 +1,8 @@ # Copyright 2012 by Andrew Sczesnak. All rights reserved. +# Revisions Copyright 2017 by Blaise Li. All rights reserved. # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. - """Unit tests for Bio.AlignIO.MafIO.MafIndex()""" try: @@ -289,50 +289,48 @@ def test_correct_retrieval_1(self): search = self.idx.search((3014742, 3018161), (3015028, 3018644)) results = [x for x in search] - self.assertEqual(len(results), 12) + self.assertEqual(len(results), 4 + 4) self.assertEqual(set([len(x) for x in results]), - set([5, 10, 7, 6, 3, 1, 1, 1, 2, 4, 4, 9])) - - self.assertEqual(set([x.annotations["start"] for y in results - for x in y]), - set([3018359, 16390338, 15871771, 184712, - 16169512, 16169976, 3014842, 1371, 7842, - 171548, 16389874, 15871306, 6404, 184317, - 14750994, 3015028, 1616, 8040, 171763, - 16169731, 6627, 184539, 3014689, 15870832, - 16389401, 6228, 184148, 1201, 3018230, - 15871676, 16390243, 3014778, 3018482, 3017743, - 3018644, 78070420, 3014742, 6283, 184202, - 1257, 3018161, 16390178, 15871611, 16169818, - 3014795, 184257, 6365, 15871286, 16389854, - 16169492, 171521, 7816, 1309])) + set([4, 1, 9, 10, 4, 3, 5, 1])) + + # Code formatting note: + # Expected start coordinates are grouped by alignment blocks + self.assertEqual( + set([x.annotations["start"] for y in results for x in y]), + set([ + 3014742, 6283, 184202, 1257, + 3014778, + 3014795, 184257, 6365, 15871286, 16389854, 16169492, 171521, 7816, 1309, + 3014842, 1371, 7842, 171548, 16169512, 16389874, 15871306, 6404, 184317, 14750994, + 3018161, 16390178, 15871611, 16169818, + 3018230, 15871676, 16390243, + 3018359, 16390338, 15871771, 184712, 16169976, 3018482])) def test_correct_retrieval_2(self): search = self.idx.search((3009319, 3021421), (3012566, 3021536)) results = [x for x in search] - self.assertEqual(len(results), 8) + self.assertEqual(len(results), 6) self.assertEqual(set([len(x) for x in results]), - set([14, 5, 2, 6, 7, 15, 6, 4])) - - self.assertEqual(set([x.annotations["start"] for y in results - for x in y]), - set([3021421, 9910, 996, 16173434, 16393782, - 15875216, 11047, 175213, 3552, 677, 78072203, - 3590, 95587, 14757054, 3012441, 15860899, - 16379447, 16160646, 180525, 3009319, 11087, - 3012566, 15861013, 16379561, 16160760, 180626, - 310, 3021465, 9957, 16173483, 16393831, - 15875265, 78072243, 14757099, 3021275, 9741, - 838, 16173265, 16393613, 15875047, 10878, - 175057, 3382, 521, 78072035, 73556, 3422, - 95418, 14756885, 3021494, 16173516, 16393864, - 15875298, 78072287, 14757144, 3012076, - 16160203, 16379004, 15860456])) + set([2, 4, 5, 14, 7, 6])) + + # Code formatting note: + # Expected start coordinates are grouped by alignment blocks + self.assertEqual( + set([x.annotations["start"] for y in results for x in y]), + set([ + 3009319, 11087, + 3012076, 16160203, 16379004, 15860456, + 3012441, 15860899, 16379447, 16160646, 180525, + 3021421, 9910, 996, 16173434, 16393782, 15875216, 11047, 175213, 3552, 677, 78072203, 3590, 95587, 14757054, + 3021465, 9957, 16173483, 16393831, 15875265, 78072243, 14757099, + 3021494, 16173516, 16393864, 15875298, 78072287, 14757144])) def test_correct_retrieval_3(self): + """References: + https://github.com/biopython/biopython/issues/1083""" search = self.idx.search((3012076, 3012076 + 300), (3012076 + 100, 3012076 + 400)) results = [x for x in search] @@ -349,6 +347,20 @@ def test_correct_retrieval_3(self): 3012076, 16160203, 16379004, 15860456, 3012441, 15860899, 16379447, 16160646, 180525])) + def test_correct_block_boundary(self): + """References: + https://github.com/biopython/biopython/pull/504 + https://github.com/biopython/biopython/pull/1086#issuecomment-285080702 + """ + search = self.idx.search([3014688], [3014689]) + self.assertEqual(len(list(search)), 1) + + search = self.idx.search([3014689], [3014690]) + self.assertEqual(len(list(search)), 1) + + search = self.idx.search([3014688], [3014690]) + self.assertEqual(len(list(search)), 2) + class TestSearchBadMAF(unittest.TestCase): """Test index searching on an incorrectly-formatted MAF"""