Adding Intelligenetics/MASE format to Bio.SeqIO (see also Bio.Intelli…

…Genetics).
biopython · Jul 4, 2008 · bf58af2 · bf58af2
1 parent 50d9872
commit bf58af2
Show file tree

Hide file tree

Showing 6 changed files with 166 additions and 1 deletion.
diff --git a/Bio/SeqIO/IgIO.py b/Bio/SeqIO/IgIO.py
@@ -0,0 +1,87 @@
+# Copyright 2008 by Peter Cock.  All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+#
+# This module is for reading and writing IntelliGenetics format files as
+# SeqRecord objects.  This file format appears to be the same as the MASE
+# multiple sequence alignment format.
+
+"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.
+
+You are expected to use this module via the Bio.SeqIO functions."""
+
+from Bio.Alphabet import single_letter_alphabet
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+#This is a generator function!
+def IgIterator(handle, alphabet = single_letter_alphabet) :
+    """Iterate over IntelliGenetics records (as SeqRecord objects).
+
+    handle - input file
+    alphabet - optional alphabet
+
+    The optional free format file header lines (which start with two
+    semi-colons) are ignored.
+
+    The free format commentary lines at the start of each record (which
+    start with a semi-colon) are recorded as a single string with embedded
+    new line characters in the SeqRecord's annotations dictionary under the
+    key 'comment'.
+    """
+    #Skip any file header text before the first record (;; lines)
+    while True :
+        line = handle.readline()
+        if not line : break #Premature end of file, or just empty?
+        if not line.startswith(";;") : break
+
+    while line :
+        #Now iterate over the records
+        if line[0]<>";" :
+            raise ValueError, \
+                  "Records should start with ';' and not:\n%s" % repr(line)
+
+        #Try and agree with SeqRecord convention from the GenBank parser,
+        #(and followed in the SwissProt parser) which stores the comments
+        #as a long string with newlines under annotations key 'comment'.
+
+        #Note some examples use "; ..." and others ";..."
+        comment_lines = []
+        while line.startswith(";") :
+            #TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
+            comment_lines.append(line[1:].strip())
+            line = handle.readline()
+        title = line.rstrip()
+
+        seq_lines = []
+        while True:
+            line = handle.readline()
+            if not line : break
+            if line[0] == ";": break
+            #Remove trailing whitespace, and any internal spaces
+            seq_lines.append(line.rstrip().replace(" ",""))
+
+        #Return the record and then continue...
+        record= SeqRecord(Seq("".join(seq_lines), alphabet),
+                          id = title, name = title)
+        record.annotations['comment'] = "\n".join(comment_lines)
+        yield record
+
+    #We should be at the end of the file now
+    assert not line
+
+if __name__ == "__main__" :
+    print "Running quick self test"
+
+    import os
+    for filename in os.listdir("../../Tests/Intelligenetics/") :
+        if os.path.splitext(filename)[-1] == ".txt" :
+            print
+            print filename
+            print "-"*len(filename)
+            handle = open(os.path.join("../../Tests/Intelligenetics/", filename))
+            for record in IgIterator(handle) :
+                print record.id, len(record)
+            handle.close()
+    print "Done"
diff --git a/Bio/SeqIO/__init__.py b/Bio/SeqIO/__init__.py
@@ -170,6 +170,7 @@
 
 import AceIO
 import FastaIO
+import IgIO #IntelliGenetics or MASE format
 import InsdcIO #EMBL and GenBank
 import PhdIO
 import SwissIO
@@ -188,6 +189,7 @@
                     "genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
                     "embl" : InsdcIO.EmblIterator,
                     "embl-cds" : InsdcIO.EmblCdsFeatureIterator,
+                    "ig" : IgIO.IgIterator,
                     "swiss" : SwissIO.SwissIterator,
                     "phd" : PhdIO.PhdIterator,
                     "ace" : AceIO.AceIterator,

diff --git a/Tests/output/test_AlignIO b/Tests/output/test_AlignIO
@@ -213,4 +213,16 @@ Testing reading fasta-m10 format file Fasta/output003.m10 with 3 alignments
  Checking can write/read as 'phylip' format
  Checking can write/read as 'clustal' format
  Checking can write/read as 'stockholm' format
+Testing reading ig format file IntelliGenetics/VIF_mase-pro.txt with 1 alignments
+ Alignment 0, with 16 sequences of length 298
+  MMMMMMMMMMMMMMMM alignment column 0
+  EEEEEEETEEEENEEE alignment column 1
+  NNNNNNNAEEEEQRKK alignment column 2
+  --------DEEEEE-- alignment column 3
+  --------KKKKKK-- alignment column 4
+  |||||||||||||||| ...
+  HHHHHHH-AAAAL-R- alignment column 297
+ Checking can write/read as 'phylip' format
+ Checking can write/read as 'clustal' format
+ Checking can write/read as 'stockholm' format
 Finished tested reading files
diff --git a/Tests/output/test_SeqIO b/Tests/output/test_SeqIO
@@ -785,6 +785,62 @@ Testing reading ace format file Ace/seq.cap.ace
  Checking can write/read as 'clustal' format
  Checking can write/read as 'phylip' format
  Checking can write/read as 'stockholm' format
+Testing reading ig format file IntelliGenetics/TAT_mase_nuc.txt
+ ID and Name='A_U455',
+ Seq='ATGGAGCCAGTAGATCCTAACCTAGAGCCCTGGAAACACC...ATTCGCT', length=303
+ ID and Name='B_HXB2R',
+ Seq='ATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATC...CGATTAG', length=306
+ ID and Name='C_UG268A',
+ Seq='%CAGGAAGTCAGCCTAAAACTCCTTGTACTAAGTGTTTTG...AGATTAA', length=267
+ ...
+ ID and Name='SYK_SYK',
+ Seq='ATGTCCTCAACGGACCAGATATGCCAGACACAGAGGGTAC...GAATCTT', length=330
+ Checking can write/read as 'fasta' format
+ Checking can write/read as 'clustal' format
+ Failed: Sequences must all be the same length
+ Checking can write/read as 'phylip' format
+ Failed: Sequences must all be the same length
+ Checking can write/read as 'stockholm' format
+ Failed: Sequences must all be the same length
+Testing reading ig format file IntelliGenetics/VIF_mase-pro.txt
+ ID and Name='most-likely',
+ Seq='MEN--RW-QVMIVWQVDRMRIRTWKSLVKHHMYRSKKA-K...-----GH', length=298
+ ID and Name='U455',
+ Seq='MEN--RW-QVMIVWQVDRMKIRTWNSLVKHHMYVSKKA-Q...-----RH', length=298
+ ID and Name='HXB2R',
+ Seq='MEN--RW-QVMIVWQVDRMRIRTWKSLVKHHMYVSGKA-R...-----GH', length=298
+ ...
+ ID and Name='SYK',
+ Seq='MEK--EW-IVVPTWRMTPRQIDRLQHIIKTHKYKSKELEK...-------', length=298
+Testing reading ig format file IntelliGenetics/VIF_mase-pro.txt as an alignment
+ MMMMMMMMMMMMMMMM alignment column 0
+ EEEEEEETEEEENEEE alignment column 1
+ NNNNNNNAEEEEQRKK alignment column 2
+ --------DEEEEE-- alignment column 3
+ --------KKKKKK-- alignment column 4
+ |||||||||||||||| ...
+ HHHHHHH-AAAAL-R- alignment column 297
+ Checking can write/read as 'fasta' format
+ Checking can write/read as 'clustal' format
+ Checking can write/read as 'phylip' format
+ Checking can write/read as 'stockholm' format
+Testing reading ig format file IntelliGenetics/vpu_nucaligned.txt
+ ID and Name='VPU_CONSENSUS',
+ Seq='ATGc?tcattt?ga??t?ttagcaaTaa?agcattaatag...?gA?ctg', length=294
+ ID and Name='A_U455',
+ Seq='ATGACACCTTTGGAAATCTGGGCAATAACAGGGCTGATAG...-AATTTG', length=294
+ ID and Name='B_SF2',
+ Seq='ATGCAATCTTTACAAATATTAGCAATAGTATCATTAGTAG...-GATCTG', length=294
+ ...
+ ID and Name='CPZANT',
+ Seq='ATGACTAATATATTTGAGTATGCTTTT-------------...-GACGAA', length=294
+ Checking can write/read as 'fasta' format
+ Checking can write/read as 'clustal' format
+ Failed: Sequences must all be the same length
+ Checking can write/read as 'phylip' format
+ Failed: Sequences must all be the same length
+ Checking can write/read as 'stockholm' format
+ Failed: Sequences must all be the same length
 Finished tested reading files
 
 Starting testing writing records

diff --git a/Tests/test_AlignIO.py b/Tests/test_AlignIO.py
@@ -42,6 +42,7 @@
     ("fasta-m10", 2, 4, 'Fasta/output001.m10'),
     ("fasta-m10", 2, 6, 'Fasta/output002.m10'),
     ("fasta-m10", 2, 3, 'Fasta/output003.m10'),
+    ("ig", 16, 1, 'IntelliGenetics/VIF_mase-pro.txt'),
     ]
 
 def str_summary(text, max_len=40) :

diff --git a/Tests/test_SeqIO.py b/Tests/test_SeqIO.py
@@ -106,7 +106,7 @@
     ("embl",      False, 'EMBL/AAA03323.embl', 1), # 2008, PA line but no AC
     ("stockholm", True,  'Stockholm/simple.sth', 2),
     ("stockholm", True,  'Stockholm/funny.sth', 5),
-#Following PHYLIP files are currently only used here (test_SeqIO)
+#Following PHYLIP files are currently only used here and in test_AlignIO.py,
 #and are mostly from Joseph Felsenstein's PHYLIP v3.6 documentation:
     ("phylip",    True,  'Phylip/reference_dna.phy', 6),
     ("phylip",    True,  'Phylip/reference_dna2.phy', 6),
@@ -126,6 +126,13 @@
     ("ace",       False, 'Ace/contig1.ace', 2),
     ("ace",       False, 'Ace/consed_sample.ace', 1),
     ("ace",       False, 'Ace/seq.cap.ace', 1),
+#Following IntelliGenetics / MASE files are also used in test_intelligenetics.py
+    ("ig",        False, 'IntelliGenetics/TAT_mase_nuc.txt', 17),
+    ("ig",        True,  'IntelliGenetics/VIF_mase-pro.txt', 16),
+    #This next file is a MASE alignment but sequence O_ANT70 is shorter than
+    #the others (so the to_alignment() call will fail).  Perhaps MASE doesn't
+    #write trailing gaps?
+    ("ig",        False,  'IntelliGenetics/vpu_nucaligned.txt', 9),
     ]
 
 # This is a list of two-tuples.  Each tuple contains a