Moving NexusIO from SeqIO to AlignIO. See also Bug 2531 and duplicate…

… names in Nexus files.
biopython · Jul 3, 2008 · 6c11c7b · 6c11c7b
1 parent d297466
commit 6c11c7b
Show file tree

Hide file tree

Showing 4 changed files with 121 additions and 17 deletions.
diff --git a/Bio/AlignIO/NexusIO.py b/Bio/AlignIO/NexusIO.py
@@ -0,0 +1,107 @@
+# Copyright 2008 by Peter Cock.  All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+"""Bio.AlignIO support for the "nexus" file format.
+
+You are expected to use this module via the Bio.AlignIO functions
+(or the Bio.SeqIO functions).
+
+See also the Bio.Nexus module (which this code calls internally),
+as this offers more than just accessing the alignment or its
+sequences as SeqRecord objects.
+"""
+
+from Bio.Nexus import Nexus
+from Bio.Align.Generic import Alignment
+from Bio.SeqRecord import SeqRecord
+
+#You can get a couple of example files here:
+#http://www.molecularevolution.org/resources/fileformats/
+
+#This is a generator function!
+def NexusIterator(handle, seq_count=None) :
+    """Returns SeqRecord objects from a Nexus file.
+
+    Thus uses the Bio.Nexus module to do the hard work.
+
+    NOTE - We only expect ONE alignment matrix per Nexus file,
+    meaning this iterator will only yield one Alignment."""
+    n = Nexus.Nexus(handle)
+    alignment = Alignment(n.alphabet)
+
+    #Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
+    #The original names and the modified names are kept in these two lists:
+    assert len(n.unaltered_taxlabels) == len(n.taxlabels)
+
+    if seq_count :
+        assert seq_count == len(n.unaltered_taxlabels)
+
+    for old_name, new_name in zip (n.unaltered_taxlabels, n.taxlabels) :
+        assert new_name.startswith(old_name)
+        seq = n.matrix[new_name] #already a Seq object with the alphabet set
+        #ToDo - Can we extract any annotation too?
+        #ToDo - Avoid abusing the private _records list
+        alignment._records.append(SeqRecord(seq,
+                                            id=new_name,
+                                            name=old_name,
+                                            description=""))
+    #All done
+    yield alignment
+
+if __name__ == "__main__" :
+    from StringIO import StringIO
+    print "Quick self test"
+    print
+    print "Repeated names without a TAXA block"
+    handle = StringIO("""#NEXUS
+    [TITLE: NoName]
+
+    begin data;
+    dimensions ntax=4 nchar=50;
+    format interleave datatype=protein   gap=- symbols="FSTNKEYVQMCLAWPHDRIG";
+
+    matrix
+    CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 
+    ALEU_HORVU          MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 
+    CATH_HUMAN          ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK----
+    CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X
+    ;
+    end; 
+    """)
+    for a in NexusIterator(handle) :
+        print a
+        for r in a :
+            print repr(r.seq), r.name, r.id
+    print "Done"
+
+    print
+    print "Repeated names with a TAXA block"
+    handle = StringIO("""#NEXUS
+    [TITLE: NoName]
+
+    begin taxa
+    CYS1_DICDI
+    ALEU_HORVU
+    CATH_HUMAN
+    CYS1_DICDI;
+    end;
+
+    begin data;
+    dimensions ntax=4 nchar=50;
+    format interleave datatype=protein   gap=- symbols="FSTNKEYVQMCLAWPHDRIG";
+
+    matrix
+    CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 
+    ALEU_HORVU          MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 
+    CATH_HUMAN          ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK----
+    CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X
+    ;
+    end; 
+    """)
+    for a in NexusIterator(handle) :
+        print a
+        for r in a :
+            print repr(r.seq), r.name, r.id
+    print "Done"
diff --git a/Bio/AlignIO/__init__.py b/Bio/AlignIO/__init__.py
@@ -113,10 +113,6 @@
 #
 # - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf)
 #   http://www.bioperl.org/wiki/MSF_multiple_alignment_format 
-#
-# - Writing NEXUS multiple alignment format (*.nxs)
-#   http://www.bioperl.org/wiki/NEXUS_multiple_alignment_format
-#   Can be simply offload to Bio.Nexus for this?
 
 import os
 #from cStringIO import StringIO
@@ -128,17 +124,19 @@
 
 import StockholmIO
 import ClustalIO
+import NexusIO
 import PhylipIO
 import EmbossIO
 import FastaIO
 
 #Convention for format names is "mainname-subtype" in lower case.
 #Please use the same names as BioPerl and EMBOSS where possible.
 
-_FormatToIterator ={#"fasta" and "nexus" are done via Bio.SeqIO
+_FormatToIterator ={#"fasta" is done via Bio.SeqIO
                     "clustal" : ClustalIO.ClustalIterator,
                     "emboss" : EmbossIO.EmbossIterator,
                     "fasta-m10" : FastaIO.FastaM10Iterator,
+                    "nexus" : NexusIO.NexusIterator,
                     "phylip" : PhylipIO.PhylipIterator,
                     "stockholm" : StockholmIO.StockholmIterator,
                     }
@@ -216,7 +214,7 @@ def _SeqIO_to_alignment_iterator(handle, format, seq_count=None) :
                 yield SeqIO.to_alignment(records)
                 records = []
         if len(records) > 0 :
-            raise ValueError("Check count argument, not enough sequences?")
+            raise ValueError("Check seq_count argument, not enough sequences?")
     else :
         #Must assume that there is a single alignment using all
         #the SeqRecord objects:

diff --git a/Bio/SeqIO/NexusIO.py b/Bio/SeqIO/NexusIO.py
@@ -3,12 +3,17 @@
 # This code is part of the Biopython distribution and governed by its
 # license.  Please see the LICENSE file that should have been included
 # as part of this package.
-
 """Bio.SeqIO support for the "nexus" file format.
 
-You are expected to use this module via the Bio.SeqIO functions.
-See also the Bio.Nexus module which offers more than just accessing
-the sequences in a Nexus alignments as SeqRecord objects."""
+You were expected to use this module via the Bio.SeqIO functions.
+This module has now been replaced by Bio.AlignIO.NexusIO, and is
+deprecated."""
+
+import warnings
+warnings.warn("Bio.SeqIO.NexusIO is deprecated.  You can continue to read" \
+              + " 'nexus' files with Bio.SeqIO, but this is now" \
+              + " handled via Bio.AlignIO internally.",
+              DeprecationWarning)
 
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord

diff --git a/Bio/SeqIO/__init__.py b/Bio/SeqIO/__init__.py
@@ -132,10 +132,6 @@
 #
 # - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf)
 #   http://www.bioperl.org/wiki/MSF_multiple_alignment_format 
-#
-# - Writing NEXUS multiple alignment format (*.nxs)
-#   http://www.bioperl.org/wiki/NEXUS_multiple_alignment_format
-#   Can be simply offload to Bio.Nexus for this?
 
 """
 FAO BioPython Developers
@@ -175,7 +171,6 @@
 import AceIO
 import FastaIO
 import InsdcIO #EMBL and GenBank
-import NexusIO
 import PhdIO
 import SwissIO
 
@@ -193,7 +188,6 @@
                     "genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
                     "embl" : InsdcIO.EmblIterator,
                     "embl-cds" : InsdcIO.EmblCdsFeatureIterator,
-                    "nexus" : NexusIO.NexusIterator,
                     "swiss" : SwissIO.SwissIterator,
                     "phd" : PhdIO.PhdIterator,
                     "ace" : AceIO.AceIterator,
@@ -1552,7 +1546,7 @@ def to_alignment(sequences, alphabet=None, strict=True) :
     print "#########################################################"
     print
 
-    general_output_formats = ["fasta"]
+    general_output_formats = _FormatToWriter.keys()
     alignment_formats = ["phylip","stockholm","clustal"]
     for (in_data, in_format, rec_count, last_id, last_seq, unique_ids) in tests:
         if unique_ids :