Skip to content

Commit

Permalink
Moving NexusIO from SeqIO to AlignIO. See also Bug 2531 and duplicate…
Browse files Browse the repository at this point in the history
… names in Nexus files.
  • Loading branch information
peterc committed Jul 3, 2008
1 parent d297466 commit 6c11c7b
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 17 deletions.
107 changes: 107 additions & 0 deletions Bio/AlignIO/NexusIO.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Copyright 2008 by Peter Cock. All rights reserved.
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Bio.AlignIO support for the "nexus" file format.
You are expected to use this module via the Bio.AlignIO functions
(or the Bio.SeqIO functions).
See also the Bio.Nexus module (which this code calls internally),
as this offers more than just accessing the alignment or its
sequences as SeqRecord objects.
"""

from Bio.Nexus import Nexus
from Bio.Align.Generic import Alignment
from Bio.SeqRecord import SeqRecord

#You can get a couple of example files here:
#http://www.molecularevolution.org/resources/fileformats/

#This is a generator function!
def NexusIterator(handle, seq_count=None) :
"""Returns SeqRecord objects from a Nexus file.
Thus uses the Bio.Nexus module to do the hard work.
NOTE - We only expect ONE alignment matrix per Nexus file,
meaning this iterator will only yield one Alignment."""
n = Nexus.Nexus(handle)
alignment = Alignment(n.alphabet)

#Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
#The original names and the modified names are kept in these two lists:
assert len(n.unaltered_taxlabels) == len(n.taxlabels)

if seq_count :
assert seq_count == len(n.unaltered_taxlabels)

for old_name, new_name in zip (n.unaltered_taxlabels, n.taxlabels) :
assert new_name.startswith(old_name)
seq = n.matrix[new_name] #already a Seq object with the alphabet set
#ToDo - Can we extract any annotation too?
#ToDo - Avoid abusing the private _records list
alignment._records.append(SeqRecord(seq,
id=new_name,
name=old_name,
description=""))
#All done
yield alignment

if __name__ == "__main__" :
from StringIO import StringIO
print "Quick self test"
print
print "Repeated names without a TAXA block"
handle = StringIO("""#NEXUS
[TITLE: NoName]
begin data;
dimensions ntax=4 nchar=50;
format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG";
matrix
CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ----
ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG
CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK----
CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X
;
end;
""")
for a in NexusIterator(handle) :
print a
for r in a :
print repr(r.seq), r.name, r.id
print "Done"

print
print "Repeated names with a TAXA block"
handle = StringIO("""#NEXUS
[TITLE: NoName]
begin taxa
CYS1_DICDI
ALEU_HORVU
CATH_HUMAN
CYS1_DICDI;
end;
begin data;
dimensions ntax=4 nchar=50;
format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG";
matrix
CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ----
ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG
CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK----
CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X
;
end;
""")
for a in NexusIterator(handle) :
print a
for r in a :
print repr(r.seq), r.name, r.id
print "Done"
10 changes: 4 additions & 6 deletions Bio/AlignIO/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,6 @@
#
# - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf)
# http://www.bioperl.org/wiki/MSF_multiple_alignment_format
#
# - Writing NEXUS multiple alignment format (*.nxs)
# http://www.bioperl.org/wiki/NEXUS_multiple_alignment_format
# Can be simply offload to Bio.Nexus for this?

import os
#from cStringIO import StringIO
Expand All @@ -128,17 +124,19 @@

import StockholmIO
import ClustalIO
import NexusIO
import PhylipIO
import EmbossIO
import FastaIO

#Convention for format names is "mainname-subtype" in lower case.
#Please use the same names as BioPerl and EMBOSS where possible.

_FormatToIterator ={#"fasta" and "nexus" are done via Bio.SeqIO
_FormatToIterator ={#"fasta" is done via Bio.SeqIO
"clustal" : ClustalIO.ClustalIterator,
"emboss" : EmbossIO.EmbossIterator,
"fasta-m10" : FastaIO.FastaM10Iterator,
"nexus" : NexusIO.NexusIterator,
"phylip" : PhylipIO.PhylipIterator,
"stockholm" : StockholmIO.StockholmIterator,
}
Expand Down Expand Up @@ -216,7 +214,7 @@ def _SeqIO_to_alignment_iterator(handle, format, seq_count=None) :
yield SeqIO.to_alignment(records)
records = []
if len(records) > 0 :
raise ValueError("Check count argument, not enough sequences?")
raise ValueError("Check seq_count argument, not enough sequences?")
else :
#Must assume that there is a single alignment using all
#the SeqRecord objects:
Expand Down
13 changes: 9 additions & 4 deletions Bio/SeqIO/NexusIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,17 @@
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.

"""Bio.SeqIO support for the "nexus" file format.
You are expected to use this module via the Bio.SeqIO functions.
See also the Bio.Nexus module which offers more than just accessing
the sequences in a Nexus alignments as SeqRecord objects."""
You were expected to use this module via the Bio.SeqIO functions.
This module has now been replaced by Bio.AlignIO.NexusIO, and is
deprecated."""

import warnings
warnings.warn("Bio.SeqIO.NexusIO is deprecated. You can continue to read" \
+ " 'nexus' files with Bio.SeqIO, but this is now" \
+ " handled via Bio.AlignIO internally.",
DeprecationWarning)

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
Expand Down
8 changes: 1 addition & 7 deletions Bio/SeqIO/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,6 @@
#
# - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf)
# http://www.bioperl.org/wiki/MSF_multiple_alignment_format
#
# - Writing NEXUS multiple alignment format (*.nxs)
# http://www.bioperl.org/wiki/NEXUS_multiple_alignment_format
# Can be simply offload to Bio.Nexus for this?

"""
FAO BioPython Developers
Expand Down Expand Up @@ -175,7 +171,6 @@
import AceIO
import FastaIO
import InsdcIO #EMBL and GenBank
import NexusIO
import PhdIO
import SwissIO

Expand All @@ -193,7 +188,6 @@
"genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
"embl" : InsdcIO.EmblIterator,
"embl-cds" : InsdcIO.EmblCdsFeatureIterator,
"nexus" : NexusIO.NexusIterator,
"swiss" : SwissIO.SwissIterator,
"phd" : PhdIO.PhdIterator,
"ace" : AceIO.AceIterator,
Expand Down Expand Up @@ -1552,7 +1546,7 @@ def to_alignment(sequences, alphabet=None, strict=True) :
print "#########################################################"
print

general_output_formats = ["fasta"]
general_output_formats = _FormatToWriter.keys()
alignment_formats = ["phylip","stockholm","clustal"]
for (in_data, in_format, rec_count, last_id, last_seq, unique_ids) in tests:
if unique_ids :
Expand Down

0 comments on commit 6c11c7b

Please sign in to comment.