Skip to content

Commit

Permalink
MAFINDEX_VERSION = 2 (inclusive end coordinates).
Browse files Browse the repository at this point in the history
The use of inclusive coodinates for the sqlite MAX index
is an attempt to fix issues discussed in the following pull requests:
biopython#504
biopython#1086

A test has been added in `test_MafIO_index.py` to check that the number
of
MAF alignment blocks returned when querying for a single position will
be 1 at the boundary between blocks (it could be 2 with the previous MAF
index, which is a bug).

The indices built with end-exclusive coordinates will not be compatible
with the present version.
  • Loading branch information
blaiseli committed Mar 27, 2017
1 parent 48ae539 commit aaf6d9d
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 57 deletions.
72 changes: 49 additions & 23 deletions Bio/AlignIO/MafIO.py
Expand Up @@ -85,7 +85,7 @@
from Bio.Align import MultipleSeqAlignment
from .Interfaces import SequentialAlignmentWriter

MAFINDEX_VERSION = 1
MAFINDEX_VERSION = 2


class MafWriter(SequentialAlignmentWriter):
Expand Down Expand Up @@ -349,7 +349,7 @@ def __make_new_index(self):
"""Read MAF file and generate SQLite index"""
# make the tables
self._con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
self._con.execute("INSERT INTO meta_data (key, value) VALUES ('version', 1);")
self._con.execute("INSERT INTO meta_data (key, value) VALUES ('version', %s);" % MAFINDEX_VERSION)
self._con.execute("INSERT INTO meta_data (key, value) VALUES ('record_count', -1);")
self._con.execute("INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');" %
(self._target_seqname,))
Expand Down Expand Up @@ -388,6 +388,7 @@ def __make_new_index(self):
break

# batch is made from self.__maf_indexer(),
# which yields zero-based "inclusive" start and end coordinates
self._con.executemany(
"INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);", batch)
self._con.commit()
Expand Down Expand Up @@ -433,13 +434,18 @@ def __maf_indexer(self):

if line_split[1] == self._target_seqname:
start = int(line_split[2])
end = int(line_split[2]) + int(line_split[3])
size = int(line_split[3])
if size != len(line_split[6].replace("-", "")):
raise ValueError(
"Invalid length for target coordinates "
"(expected %s, found %s)" % (
size, len(line_split[6].replace("-", ""))))

if end - start != len(line_split[6].replace("-", "")):
raise ValueError("Invalid length for target coordinates (expected %s, found %s)" %
(end - start, len(line_split[6].replace("-", ""))))
# "inclusive" end position is start + length - 1
end = start + size - 1

yield (self._ucscbin(start, end), start, end, offset)
# _ucscbin takes end-exclusive coordinates
yield (self._ucscbin(start, end + 1), start, end, offset)

break

Expand Down Expand Up @@ -508,9 +514,12 @@ def search(self, starts, ends):
if len(starts) != len(ends):
raise ValueError("Every position in starts must have a match in ends")

# Could it be safer to sort the (exonstart, exonend) pairs?
for exonstart, exonend in zip(starts, ends):
if exonstart >= exonend:
raise ValueError("Exon coordinates invalid (%s >= %s)" % (exonstart, exonend))
exonlen = exonend - exonstart
if exonlen < 1:
raise ValueError("Exon coordinates (%d, %d) invalid: exon length (%d) < 1" % (
exonstart, exonend, exonlen))
con = self._con

# Keep track of what blocks have already been yielded
Expand All @@ -537,14 +546,25 @@ def search(self, starts, ends):
# right.
# -----

result = con.execute("SELECT DISTINCT start, end, offset FROM "
"offset_data WHERE bin IN (%s) AND (end BETWEEN %s AND %s "
"OR %s BETWEEN start AND end) ORDER BY start, end, "
"offset ASC;"
% (possible_bins, exonstart, exonend, exonend))
# We are testing overlap between the query segment and records in
# the index, using non-strict coordinates comparisons.
# The query segment end must be passed as end-inclusive
# The index should also have been build with end-inclusive
# end coordinates.
# See https://github.com/biopython/biopython/pull/1086#issuecomment-285069073

result = con.execute(
"SELECT DISTINCT start, end, offset FROM offset_data "
"WHERE bin IN (%s) "
"AND (end BETWEEN %s AND %s OR %s BETWEEN start AND end) "
"ORDER BY start, end, offset ASC;" %
(possible_bins, exonstart, exonend - 1, exonend - 1))

rows = result.fetchall()

# rows come from the sqlite index,
# which should have been written using __make_new_index,
# so rec_start and rec_end should be zero-based "inclusive" coordinates
for rec_start, rec_end, offset in rows:
# Avoid yielding multiple time the same block
if (rec_start, rec_end) in yielded_rec_coords:
Expand All @@ -560,7 +580,8 @@ def search(self, starts, ends):
if record.id == self._target_seqname:
# start and size come from the maf lines
start = record.annotations["start"]
end = record.annotations["start"] + record.annotations["size"]
# "inclusive" end is start + length - 1
end = start + record.annotations["size"] - 1

if not (start == rec_start and end == rec_end):
raise ValueError("Expected %s-%s @ offset %s, found %s-%s" %
Expand Down Expand Up @@ -641,16 +662,20 @@ def get_spliced(self, starts, ends, strand=1):
# length including gaps (i.e. alignment length)
rec_length = len(seqrec)
rec_start = seqrec.annotations["start"]
rec_end = seqrec.annotations["start"] + seqrec.annotations["size"]

total_rec_length += rec_end - rec_start
ungapped_length = seqrec.annotations["size"]
# inclusive end in zero-based coordinates of the reference
rec_end = rec_start + ungapped_length - 1
# This is length in terms of actual letters in the reference
total_rec_length += ungapped_length

# blank out these positions for every seqname
for seqrec in multiseq:
for pos in range(rec_start, rec_end):
for pos in range(rec_start, rec_end + 1):
split_by_position[seqrec.id][pos] = ""

break
# http://psung.blogspot.fr/2007/12/for-else-in-python.html
# https://docs.python.org/2/tutorial/controlflow.html#break-and-continue-statements-and-else-clauses-on-loops
else:
raise ValueError("Did not find %s in alignment bundle" % (self._target_seqname,))

Expand All @@ -671,7 +696,7 @@ def get_spliced(self, starts, ends, strand=1):

# increment the real_pos counter only when non-gaps are found in
# the target_seqname, and we haven't reached the end of the record
if track_val != "-" and real_pos < rec_end - 1:
if track_val != "-" and real_pos < rec_end:
real_pos += 1

# make sure the number of bp entries equals the sum of the record lengths
Expand All @@ -682,9 +707,9 @@ def get_spliced(self, starts, ends, strand=1):
total_rec_length))

# translates a position in the target_seqname sequence to its gapped length
realpos_to_len = dict([(x, len(y))
for x, y in split_by_position[self._target_seqname].items()
if len(y) > 1])
realpos_to_len = dict([(pos, len(gapped_fragment))
for pos, gapped_fragment in split_by_position[self._target_seqname].items()
if len(gapped_fragment) > 1])

# splice together the exons
subseq = {}
Expand All @@ -700,6 +725,7 @@ def get_spliced(self, starts, ends, strand=1):
append = seq_splice.append

for exonstart, exonend in zip(starts, ends):
# exonend is exclusive
for real_pos in range(exonstart, exonend):
# if this seqname has this position, add it
if real_pos in seq_split:
Expand Down
80 changes: 46 additions & 34 deletions Tests/test_MafIO_index.py
@@ -1,8 +1,8 @@
# Copyright 2012 by Andrew Sczesnak. All rights reserved.
# Revisions Copyright 2017 by Blaise Li. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.

"""Unit tests for Bio.AlignIO.MafIO.MafIndex()"""

try:
Expand Down Expand Up @@ -289,50 +289,48 @@ def test_correct_retrieval_1(self):
search = self.idx.search((3014742, 3018161), (3015028, 3018644))
results = [x for x in search]

self.assertEqual(len(results), 12)
self.assertEqual(len(results), 4 + 4)

self.assertEqual(set([len(x) for x in results]),
set([5, 10, 7, 6, 3, 1, 1, 1, 2, 4, 4, 9]))

self.assertEqual(set([x.annotations["start"] for y in results
for x in y]),
set([3018359, 16390338, 15871771, 184712,
16169512, 16169976, 3014842, 1371, 7842,
171548, 16389874, 15871306, 6404, 184317,
14750994, 3015028, 1616, 8040, 171763,
16169731, 6627, 184539, 3014689, 15870832,
16389401, 6228, 184148, 1201, 3018230,
15871676, 16390243, 3014778, 3018482, 3017743,
3018644, 78070420, 3014742, 6283, 184202,
1257, 3018161, 16390178, 15871611, 16169818,
3014795, 184257, 6365, 15871286, 16389854,
16169492, 171521, 7816, 1309]))
set([4, 1, 9, 10, 4, 3, 5, 1]))

# Code formatting note:
# Expected start coordinates are grouped by alignment blocks
self.assertEqual(
set([x.annotations["start"] for y in results for x in y]),
set([
3014742, 6283, 184202, 1257,
3014778,
3014795, 184257, 6365, 15871286, 16389854, 16169492, 171521, 7816, 1309,
3014842, 1371, 7842, 171548, 16169512, 16389874, 15871306, 6404, 184317, 14750994,
3018161, 16390178, 15871611, 16169818,
3018230, 15871676, 16390243,
3018359, 16390338, 15871771, 184712, 16169976, 3018482]))

def test_correct_retrieval_2(self):
search = self.idx.search((3009319, 3021421), (3012566, 3021536))
results = [x for x in search]

self.assertEqual(len(results), 8)
self.assertEqual(len(results), 6)

self.assertEqual(set([len(x) for x in results]),
set([14, 5, 2, 6, 7, 15, 6, 4]))

self.assertEqual(set([x.annotations["start"] for y in results
for x in y]),
set([3021421, 9910, 996, 16173434, 16393782,
15875216, 11047, 175213, 3552, 677, 78072203,
3590, 95587, 14757054, 3012441, 15860899,
16379447, 16160646, 180525, 3009319, 11087,
3012566, 15861013, 16379561, 16160760, 180626,
310, 3021465, 9957, 16173483, 16393831,
15875265, 78072243, 14757099, 3021275, 9741,
838, 16173265, 16393613, 15875047, 10878,
175057, 3382, 521, 78072035, 73556, 3422,
95418, 14756885, 3021494, 16173516, 16393864,
15875298, 78072287, 14757144, 3012076,
16160203, 16379004, 15860456]))
set([2, 4, 5, 14, 7, 6]))

# Code formatting note:
# Expected start coordinates are grouped by alignment blocks
self.assertEqual(
set([x.annotations["start"] for y in results for x in y]),
set([
3009319, 11087,
3012076, 16160203, 16379004, 15860456,
3012441, 15860899, 16379447, 16160646, 180525,
3021421, 9910, 996, 16173434, 16393782, 15875216, 11047, 175213, 3552, 677, 78072203, 3590, 95587, 14757054,
3021465, 9957, 16173483, 16393831, 15875265, 78072243, 14757099,
3021494, 16173516, 16393864, 15875298, 78072287, 14757144]))

def test_correct_retrieval_3(self):
"""References:
https://github.com/biopython/biopython/issues/1083"""
search = self.idx.search((3012076, 3012076 + 300), (3012076 + 100, 3012076 + 400))
results = [x for x in search]

Expand All @@ -349,6 +347,20 @@ def test_correct_retrieval_3(self):
3012076, 16160203, 16379004, 15860456,
3012441, 15860899, 16379447, 16160646, 180525]))

def test_correct_block_boundary(self):
"""References:
https://github.com/biopython/biopython/pull/504
https://github.com/biopython/biopython/pull/1086#issuecomment-285080702
"""
search = self.idx.search([3014688], [3014689])
self.assertEqual(len(list(search)), 1)

search = self.idx.search([3014689], [3014690])
self.assertEqual(len(list(search)), 1)

search = self.idx.search([3014688], [3014690])
self.assertEqual(len(list(search)), 2)

class TestSearchBadMAF(unittest.TestCase):
"""Test index searching on an incorrectly-formatted MAF"""

Expand Down

0 comments on commit aaf6d9d

Please sign in to comment.