Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More consistent coordinate specification for Maf alignment access in MafIO.py #504

Closed
wants to merge 24 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
a6c0bb4
Added MafIO.py genome alignment format parser.
May 12, 2011
e2f0987
Added more details in search and get_spliced doc.
blaiseli Jan 11, 2017
4455004
Some pep8 and pylint style cleaning.
blaiseli Jan 11, 2017
2fc9b04
Merge branch 'master' into alignio-maf
blaiseli Jan 11, 2017
9ab25d2
Reverted to MafIterator, removed Generic.Alignment
blaiseli Jan 11, 2017
650abdb
Style checking in some tests.
blaiseli Jan 11, 2017
c6bdbb6
Regenerated the output of test_SeqIO
blaiseli Jan 12, 2017
e2f2744
Style checking in some tests.
blaiseli Jan 12, 2017
7807540
Switched back to 0-based, half-open coordinates.
blaiseli Jan 12, 2017
d5397c0
Merge branch 'master' into alignio-maf
blaiseli Feb 22, 2017
be7194a
Merge branch 'master' into alignio-maf
blaiseli Feb 24, 2017
37c218a
Fixed error introduced while resolving conflicts.
blaiseli Feb 24, 2017
9b3e234
Removed internal end coordinates inconsistency.
blaiseli Feb 24, 2017
055ee12
Re-establishing lost-during-merge edits:
blaiseli Feb 24, 2017
c09e6a7
Removed some useless code.
blaiseli Feb 24, 2017
f7253b9
Avoid duplicate results when blocks span 2 exons.
blaiseli Mar 8, 2017
7c3c0da
Set MAFINDEX_VERSION to 2.
blaiseli Mar 8, 2017
f729d65
Updated MAF indices.
blaiseli Mar 8, 2017
31123f7
Updating mafindex for ucsc_mm9_chr10_bad
blaiseli Mar 8, 2017
2391ca6
Added new test for block boundaries checking.
blaiseli Mar 8, 2017
a622f21
Added new test for block boundaries checking.
blaiseli Mar 8, 2017
9fe3c5a
Merge branch 'alignio-maf' of https://github.com/blaiseli/biopython i…
blaiseli Mar 8, 2017
02dfd2f
Merge branch 'alignio-maf' of https://github.com/blaiseli/biopython i…
blaiseli Mar 8, 2017
ef73b3a
Merge branch 'alignio-maf' of https://github.com/blaiseli/biopython i…
blaiseli Mar 8, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
411 changes: 300 additions & 111 deletions Bio/AlignIO/MafIO.py

Large diffs are not rendered by default.

Binary file modified Tests/MAF/ucsc_mm9_chr10.mafindex
Binary file not shown.
Binary file modified Tests/MAF/ucsc_mm9_chr10_bad.mafindex
Binary file not shown.
Binary file modified Tests/MAF/ucsc_mm9_chr10_big.mafindex
Binary file not shown.
9 changes: 6 additions & 3 deletions Tests/run_tests.py
Expand Up @@ -355,9 +355,12 @@ def runTest(self):
# otherwise make sure the two lines are the same
elif expected_line != output_line:
expected.close()
raise ValueError("\nOutput : %s\nExpected: %s\n%s line %s"
% (repr(output_line), repr(expected_line),
outputfile, line_number))
raise ValueError(
"\nOutput : %s\nExpected: %s\n%s line %s" % (
repr(output_line),
repr(expected_line),
outputfile,
line_number))

expected.close()

Expand Down
21 changes: 15 additions & 6 deletions Tests/test_AlignIO.py
Expand Up @@ -115,7 +115,7 @@ def check_simple_write_read(alignments, indent=" "):
records_per_alignment = None
# Can we expect this format to work?
if not records_per_alignment \
and format not in test_write_read_alignment_formats:
and format not in test_write_read_alignment_formats:
continue

print(indent + "Checking can write/read as '%s' format" % format)
Expand Down Expand Up @@ -204,6 +204,14 @@ def simple_alignment_comparison(alignments, alignments2, format):
else:
assert r1.id == r2.id, \
"'%s' vs '%s'" % (r1.id, r2.id)

# Check the sequence
if format == "stockholm":
# We map dot to dash in the stockholm parser, since
# both are gaps (but technically different kinds in HMM)
assert str(r1.seq).replace(".", "-") == str(r2.seq), \
"Seq does not match %s vs %s (%s vs %s)" \
% (r1.seq, r2.seq, r1.id, r2.id)
return True


Expand All @@ -226,8 +234,8 @@ def check_phylip_reject_duplicate():
# Expected - check the error
assert "Repeated name 'longsequen'" in str(err)

check_phylip_reject_duplicate()

check_phylip_reject_duplicate()

# Check parsers can cope with an empty file
for t_format in AlignIO._FormatToIterator:
Expand Down Expand Up @@ -265,8 +273,9 @@ def check_phylip_reject_duplicate():
# Try as an iterator using handle
with open(t_filename, "r") as handle:
alignments = list(AlignIO.parse(handle, format=t_format))
assert len(alignments) == t_count, \
"Found %i alignments but expected %i" % (len(alignments), t_count)
msg = "Found %i alignments but expected %i" % (
len(alignments), t_count)
assert len(alignments) == t_count, msg
if t_per is not None:
for alignment in alignments:
assert len(alignment) == t_per, \
Expand Down Expand Up @@ -355,7 +364,6 @@ def check_phylip_reject_duplicate():
except ValueError as err:
if str(err) != "Error in alphabet: not Nucleotide or Protein, supply expected frequencies":
raise err
pass

if t_count == 1 and t_format not in ["nexus", "emboss", "fasta-m10"]:
# print(" Trying to read a triple concatenation of the input file")
Expand All @@ -364,7 +372,8 @@ def check_phylip_reject_duplicate():
handle = StringIO()
handle.write(data + "\n\n" + data + "\n\n" + data)
handle.seek(0)
assert len(list(AlignIO.parse(handle=handle, format=t_format, seq_count=t_per))) == 3
assert len(list(AlignIO.parse(
handle=handle, format=t_format, seq_count=t_per))) == 3
handle.close()

# Some alignment file formats have magic characters which mean
Expand Down
186 changes: 116 additions & 70 deletions Tests/test_MafIO_index.py
Expand Up @@ -63,7 +63,6 @@ def test_ucscbin(self):
for x, y, z in data:
self.assertRaises(TypeError, MafIndex._ucscbin, str(x), str(y))


if sqlite3:
class PreBuiltIndexTest(unittest.TestCase):
"""Test loading of prebuilt indices"""
Expand Down Expand Up @@ -163,36 +162,42 @@ class TestGetRecord(unittest.TestCase):
"""Make sure we can seek and fetch records properly"""

def setUp(self):
self.idx = MafIndex("MAF/ucsc_mm9_chr10.mafindex",
"MAF/ucsc_mm9_chr10.maf", "mm9.chr10")
self.idx = MafIndex(
"MAF/ucsc_mm9_chr10.mafindex",
"MAF/ucsc_mm9_chr10.maf",
"mm9.chr10")
self.assertEqual(len(self.idx), 48)

def test_records_begin(self):
recs = {}

recs[0] = SeqRecord(Seq("TCATAGGTATTTATTTTTAAATATGGTTTGCTTTATGGCTAGAA"
"CACACCGATTACTTAAAATAGGATTAACC--CCCATACACTTTA"
"AAAATGATTAAACAACATTTCTGCTGCTCGCTCACATTCTTCAT"
"AGAAGATGACATAATGTATTTTCCTTTTGGTT"),
id="mm9.chr10",
name="mm9.chr10",
description="",
annotations={"start": 3009319,
"srcSize": 129993255,
"strand": 1,
"size": 162})

recs[1] = SeqRecord(Seq("TCACAGATATTTACTATTAAATATGGTTTGTTATATGGTTACGG"
"TTCATAGGTTACTTGGAATTGGATTAACCTTCTTATTCATTGCA"
"GAATTGGTTACACTGTGTTCTTGACCTTTGCTTGTTTTCTCCAT"
"GGAAACTGATGTCAAATACTTTCCCTTTGGTT"),
id="oryCun1.scaffold_133159",
name="oryCun1.scaffold_133159",
description="",
annotations={"start": 11087,
"srcSize": 13221,
"strand": 1,
"size": 164})
recs[0] = SeqRecord(
Seq("TCATAGGTATTTATTTTTAAATATGGTTTGCTTTATGGCTAGAA"
"CACACCGATTACTTAAAATAGGATTAACC--CCCATACACTTTA"
"AAAATGATTAAACAACATTTCTGCTGCTCGCTCACATTCTTCAT"
"AGAAGATGACATAATGTATTTTCCTTTTGGTT"),
id="mm9.chr10",
name="mm9.chr10",
description="",
annotations={
"start": 3009319,
"srcSize": 129993255,
"strand": 1,
"size": 162})

recs[1] = SeqRecord(
Seq("TCACAGATATTTACTATTAAATATGGTTTGTTATATGGTTACGG"
"TTCATAGGTTACTTGGAATTGGATTAACCTTCTTATTCATTGCA"
"GAATTGGTTACACTGTGTTCTTGACCTTTGCTTGTTTTCTCCAT"
"GGAAACTGATGTCAAATACTTTCCCTTTGGTT"),
id="oryCun1.scaffold_133159",
name="oryCun1.scaffold_133159",
description="",
annotations={
"start": 11087,
"srcSize": 13221,
"strand": 1,
"size": 164})

fetched_recs = self.idx._get_record(34)

Expand Down Expand Up @@ -265,8 +270,10 @@ class TestSearchGoodMAF(unittest.TestCase):
"""Test index searching on a properly-formatted MAF"""

def setUp(self):
self.idx = MafIndex("MAF/ucsc_mm9_chr10.mafindex",
"MAF/ucsc_mm9_chr10.maf", "mm9.chr10")
self.idx = MafIndex(
"MAF/ucsc_mm9_chr10.mafindex",
"MAF/ucsc_mm9_chr10.maf",
"mm9.chr10")
self.assertEqual(len(self.idx), 48)

def test_invalid_type_1(self):
Expand All @@ -289,55 +296,80 @@ def test_correct_retrieval_1(self):
search = self.idx.search((3014742, 3018161), (3015028, 3018644))
results = [x for x in search]

self.assertEqual(len(results), 12)
self.assertEqual(len(results), 4 + 4)

self.assertEqual(set([len(x) for x in results]),
set([5, 10, 7, 6, 3, 1, 1, 1, 2, 4, 4, 9]))

self.assertEqual(set([x.annotations["start"] for y in results
for x in y]),
set([3018359, 16390338, 15871771, 184712,
16169512, 16169976, 3014842, 1371, 7842,
171548, 16389874, 15871306, 6404, 184317,
14750994, 3015028, 1616, 8040, 171763,
16169731, 6627, 184539, 3014689, 15870832,
16389401, 6228, 184148, 1201, 3018230,
15871676, 16390243, 3014778, 3018482, 3017743,
3018644, 78070420, 3014742, 6283, 184202,
1257, 3018161, 16390178, 15871611, 16169818,
3014795, 184257, 6365, 15871286, 16389854,
16169492, 171521, 7816, 1309]))
set([4, 1, 9, 10, 4, 3, 5, 1]))

# Code formatting note:
# Expected start coordinates are grouped by alignment blocks
self.assertEqual(
set([x.annotations["start"] for y in results for x in y]),
set([
3014742, 6283, 184202, 1257,
3014778,
3014795, 184257, 6365, 15871286, 16389854, 16169492, 171521, 7816, 1309,
3014842, 1371, 7842, 171548, 16169512, 16389874, 15871306, 6404, 184317, 14750994,
3018161, 16390178, 15871611, 16169818,
3018230, 15871676, 16390243,
3018359, 16390338, 15871771, 184712, 16169976, 3018482]))

def test_correct_retrieval_2(self):
search = self.idx.search((3009319, 3021421), (3012566, 3021536))
results = [x for x in search]

self.assertEqual(len(results), 8)
self.assertEqual(len(results), 6)

self.assertEqual(set([len(x) for x in results]),
set([2, 4, 5, 14, 7, 6]))

# Code formatting note:
# Expected start coordinates are grouped by alignment blocks
self.assertEqual(
set([x.annotations["start"] for y in results for x in y]),
set([
3009319, 11087,
3012076, 16160203, 16379004, 15860456,
3012441, 15860899, 16379447, 16160646, 180525,
3021421, 9910, 996, 16173434, 16393782, 15875216, 11047, 175213, 3552, 677, 78072203, 3590, 95587, 14757054,
3021465, 9957, 16173483, 16393831, 15875265, 78072243, 14757099,
3021494, 16173516, 16393864, 15875298, 78072287, 14757144]))

def test_correct_retrieval_3(self):
search = self.idx.search((3012076, 3012076 + 300), (3012076 + 100, 3012076 + 400))
results = [x for x in search]

self.assertEqual(len(results), 2)

self.assertEqual(set([len(x) for x in results]),
set([14, 5, 2, 6, 7, 15, 6, 4]))

self.assertEqual(set([x.annotations["start"] for y in results
for x in y]),
set([3021421, 9910, 996, 16173434, 16393782,
15875216, 11047, 175213, 3552, 677, 78072203,
3590, 95587, 14757054, 3012441, 15860899,
16379447, 16160646, 180525, 3009319, 11087,
3012566, 15861013, 16379561, 16160760, 180626,
310, 3021465, 9957, 16173483, 16393831,
15875265, 78072243, 14757099, 3021275, 9741,
838, 16173265, 16393613, 15875047, 10878,
175057, 3382, 521, 78072035, 73556, 3422,
95418, 14756885, 3021494, 16173516, 16393864,
15875298, 78072287, 14757144, 3012076,
16160203, 16379004, 15860456]))
set([4, 5]))

# Code formatting note:
# Expected start coordinates are grouped by alignment blocks
self.assertEqual(
set([x.annotations["start"] for y in results for x in y]),
set([
3012076, 16160203, 16379004, 15860456,
3012441, 15860899, 16379447, 16160646, 180525]))

def test_correct_block_boundary(self):
search = self.idx.search([3014688], [3014689])
self.assertEqual(len(list(search)), 1)

search = self.idx.search([3014689], [3014690])
self.assertEqual(len(list(search)), 1)

search = self.idx.search([3014688], [3014690])
self.assertEqual(len(list(search)), 2)

class TestSearchBadMAF(unittest.TestCase):
"""Test index searching on an incorrectly-formatted MAF"""

def setUp(self):
self.idx = MafIndex("MAF/ucsc_mm9_chr10_bad.mafindex",
"MAF/ucsc_mm9_chr10_bad.maf", "mm9.chr10")
self.idx = MafIndex(
"MAF/ucsc_mm9_chr10_bad.mafindex",
"MAF/ucsc_mm9_chr10_bad.maf",
"mm9.chr10")
self.assertEqual(len(self.idx), 48)

def test_incorrect_bundle_coords(self):
Expand All @@ -348,8 +380,10 @@ class TestSpliceGoodMAF(unittest.TestCase):
"""Test in silico splicing on a correctly-formatted MAF"""

def setUp(self):
self.idx = MafIndex("MAF/ucsc_mm9_chr10_big.mafindex",
"MAF/ucsc_mm9_chr10_big.maf", "mm9.chr10")
self.idx = MafIndex(
"MAF/ucsc_mm9_chr10_big.mafindex",
"MAF/ucsc_mm9_chr10_big.maf",
"mm9.chr10")
self.assertEqual(len(self.idx), 983)

def test_invalid_strand(self):
Expand All @@ -369,6 +403,17 @@ def test_correct_retrieval_1(self):
This is the real thing. We're pulling the spliced alignment of
an actual gene (Cnksr3) in mouse. It should perfectly match the
spliced transcript pulled independently from UCSC.

Getting coordinates:

wget http://hgdownload.soe.ucsc.edu/goldenPath/mm9/database/refGene.txt.gz
gunzip refGene.txt.gz
grep Cnksr3 refGene.txt

Output:

9 NM_172546 chr10 + 3134303 3227479 3134857 3226253 13 3134303,3185733,3192055,3193589,3203538,3206102,3208126,3211424,3211872,3217393,3219697,3220356,3225954, 3134909,3185897,3192258,3193677,3203580,3206222,3208186,3211493,3212019,3217518,3219906,3220446,3227479, 0 Cnksr3 cmpl cmpl 0,1,0,2,0,0,0,0,0,0,2,1,1,

"""

result = self.idx.get_spliced((3134303, 3185733, 3192055, 3193589,
Expand All @@ -381,17 +426,18 @@ def test_correct_retrieval_1(self):
3227479), 1)

cnksr3 = str(SeqIO.read("MAF/cnksr3.fa", "fasta").seq).upper()
mm9_seq = "".join([str(x.seq) for x in result
if x.id.startswith("mm9")]).replace("-", "")

mm9_seq = "".join([
str(x.seq) for x in result if x.id.startswith("mm9")]).replace("-", "")
self.assertEqual(mm9_seq, cnksr3)

class TestSpliceBadMAF(unittest.TestCase):
"""Test in silico splicing on an incorrectly-formatted MAF"""

def setUp(self):
self.idx = MafIndex("MAF/ucsc_mm9_chr10_bad.mafindex",
"MAF/ucsc_mm9_chr10_bad.maf", "mm9.chr10")
self.idx = MafIndex(
"MAF/ucsc_mm9_chr10_bad.mafindex",
"MAF/ucsc_mm9_chr10_bad.maf",
"mm9.chr10")
self.assertEqual(len(self.idx), 48)

def test_inconsistent_strand(self):
Expand Down