Skip to content

Commit

Permalink
Adding Intelligenetics/MASE format to Bio.SeqIO (see also Bio.Intelli…
Browse files Browse the repository at this point in the history
…Genetics).
  • Loading branch information
peterc committed Jul 4, 2008
1 parent 50d9872 commit bf58af2
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 1 deletion.
87 changes: 87 additions & 0 deletions Bio/SeqIO/IgIO.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Copyright 2008 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
#
# This module is for reading and writing IntelliGenetics format files as
# SeqRecord objects. This file format appears to be the same as the MASE
# multiple sequence alignment format.

"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.
You are expected to use this module via the Bio.SeqIO functions."""

from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

#This is a generator function!
def IgIterator(handle, alphabet = single_letter_alphabet) :
"""Iterate over IntelliGenetics records (as SeqRecord objects).
handle - input file
alphabet - optional alphabet
The optional free format file header lines (which start with two
semi-colons) are ignored.
The free format commentary lines at the start of each record (which
start with a semi-colon) are recorded as a single string with embedded
new line characters in the SeqRecord's annotations dictionary under the
key 'comment'.
"""
#Skip any file header text before the first record (;; lines)
while True :
line = handle.readline()
if not line : break #Premature end of file, or just empty?
if not line.startswith(";;") : break

while line :
#Now iterate over the records
if line[0]<>";" :
raise ValueError, \
"Records should start with ';' and not:\n%s" % repr(line)

#Try and agree with SeqRecord convention from the GenBank parser,
#(and followed in the SwissProt parser) which stores the comments
#as a long string with newlines under annotations key 'comment'.

#Note some examples use "; ..." and others ";..."
comment_lines = []
while line.startswith(";") :
#TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
comment_lines.append(line[1:].strip())
line = handle.readline()
title = line.rstrip()

seq_lines = []
while True:
line = handle.readline()
if not line : break
if line[0] == ";": break
#Remove trailing whitespace, and any internal spaces
seq_lines.append(line.rstrip().replace(" ",""))

#Return the record and then continue...
record= SeqRecord(Seq("".join(seq_lines), alphabet),
id = title, name = title)
record.annotations['comment'] = "\n".join(comment_lines)
yield record

#We should be at the end of the file now
assert not line

if __name__ == "__main__" :
print "Running quick self test"

import os
for filename in os.listdir("../../Tests/Intelligenetics/") :
if os.path.splitext(filename)[-1] == ".txt" :
print
print filename
print "-"*len(filename)
handle = open(os.path.join("../../Tests/Intelligenetics/", filename))
for record in IgIterator(handle) :
print record.id, len(record)
handle.close()
print "Done"
2 changes: 2 additions & 0 deletions Bio/SeqIO/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@

import AceIO
import FastaIO
import IgIO #IntelliGenetics or MASE format
import InsdcIO #EMBL and GenBank
import PhdIO
import SwissIO
Expand All @@ -188,6 +189,7 @@
"genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
"embl" : InsdcIO.EmblIterator,
"embl-cds" : InsdcIO.EmblCdsFeatureIterator,
"ig" : IgIO.IgIterator,
"swiss" : SwissIO.SwissIterator,
"phd" : PhdIO.PhdIterator,
"ace" : AceIO.AceIterator,
Expand Down
12 changes: 12 additions & 0 deletions Tests/output/test_AlignIO
Original file line number Diff line number Diff line change
Expand Up @@ -213,4 +213,16 @@ Testing reading fasta-m10 format file Fasta/output003.m10 with 3 alignments
Checking can write/read as 'phylip' format
Checking can write/read as 'clustal' format
Checking can write/read as 'stockholm' format
Testing reading ig format file IntelliGenetics/VIF_mase-pro.txt with 1 alignments
Alignment 0, with 16 sequences of length 298
MMMMMMMMMMMMMMMM alignment column 0
EEEEEEETEEEENEEE alignment column 1
NNNNNNNAEEEEQRKK alignment column 2
--------DEEEEE-- alignment column 3
--------KKKKKK-- alignment column 4
|||||||||||||||| ...
HHHHHHH-AAAAL-R- alignment column 297
Checking can write/read as 'phylip' format
Checking can write/read as 'clustal' format
Checking can write/read as 'stockholm' format
Finished tested reading files
56 changes: 56 additions & 0 deletions Tests/output/test_SeqIO
Original file line number Diff line number Diff line change
Expand Up @@ -785,6 +785,62 @@ Testing reading ace format file Ace/seq.cap.ace
Checking can write/read as 'clustal' format
Checking can write/read as 'phylip' format
Checking can write/read as 'stockholm' format
Testing reading ig format file IntelliGenetics/TAT_mase_nuc.txt
ID and Name='A_U455',
Seq='ATGGAGCCAGTAGATCCTAACCTAGAGCCCTGGAAACACC...ATTCGCT', length=303
ID and Name='B_HXB2R',
Seq='ATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATC...CGATTAG', length=306
ID and Name='C_UG268A',
Seq='%CAGGAAGTCAGCCTAAAACTCCTTGTACTAAGTGTTTTG...AGATTAA', length=267
...
ID and Name='SYK_SYK',
Seq='ATGTCCTCAACGGACCAGATATGCCAGACACAGAGGGTAC...GAATCTT', length=330
Checking can write/read as 'fasta' format
Checking can write/read as 'clustal' format
Failed: Sequences must all be the same length
Checking can write/read as 'phylip' format
Failed: Sequences must all be the same length
Checking can write/read as 'stockholm' format
Failed: Sequences must all be the same length
Testing reading ig format file IntelliGenetics/VIF_mase-pro.txt
ID and Name='most-likely',
Seq='MEN--RW-QVMIVWQVDRMRIRTWKSLVKHHMYRSKKA-K...-----GH', length=298
ID and Name='U455',
Seq='MEN--RW-QVMIVWQVDRMKIRTWNSLVKHHMYVSKKA-Q...-----RH', length=298
ID and Name='HXB2R',
Seq='MEN--RW-QVMIVWQVDRMRIRTWKSLVKHHMYVSGKA-R...-----GH', length=298
...
ID and Name='SYK',
Seq='MEK--EW-IVVPTWRMTPRQIDRLQHIIKTHKYKSKELEK...-------', length=298
Testing reading ig format file IntelliGenetics/VIF_mase-pro.txt as an alignment
MMMMMMMMMMMMMMMM alignment column 0
EEEEEEETEEEENEEE alignment column 1
NNNNNNNAEEEEQRKK alignment column 2
--------DEEEEE-- alignment column 3
--------KKKKKK-- alignment column 4
|||||||||||||||| ...
HHHHHHH-AAAAL-R- alignment column 297
Checking can write/read as 'fasta' format
Checking can write/read as 'clustal' format
Checking can write/read as 'phylip' format
Checking can write/read as 'stockholm' format
Testing reading ig format file IntelliGenetics/vpu_nucaligned.txt
ID and Name='VPU_CONSENSUS',
Seq='ATGc?tcattt?ga??t?ttagcaaTaa?agcattaatag...?gA?ctg', length=294
ID and Name='A_U455',
Seq='ATGACACCTTTGGAAATCTGGGCAATAACAGGGCTGATAG...-AATTTG', length=294
ID and Name='B_SF2',
Seq='ATGCAATCTTTACAAATATTAGCAATAGTATCATTAGTAG...-GATCTG', length=294
...
ID and Name='CPZANT',
Seq='ATGACTAATATATTTGAGTATGCTTTT-------------...-GACGAA', length=294
Checking can write/read as 'fasta' format
Checking can write/read as 'clustal' format
Failed: Sequences must all be the same length
Checking can write/read as 'phylip' format
Failed: Sequences must all be the same length
Checking can write/read as 'stockholm' format
Failed: Sequences must all be the same length
Finished tested reading files

Starting testing writing records
Expand Down
1 change: 1 addition & 0 deletions Tests/test_AlignIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
("fasta-m10", 2, 4, 'Fasta/output001.m10'),
("fasta-m10", 2, 6, 'Fasta/output002.m10'),
("fasta-m10", 2, 3, 'Fasta/output003.m10'),
("ig", 16, 1, 'IntelliGenetics/VIF_mase-pro.txt'),
]

def str_summary(text, max_len=40) :
Expand Down
9 changes: 8 additions & 1 deletion Tests/test_SeqIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@
("embl", False, 'EMBL/AAA03323.embl', 1), # 2008, PA line but no AC
("stockholm", True, 'Stockholm/simple.sth', 2),
("stockholm", True, 'Stockholm/funny.sth', 5),
#Following PHYLIP files are currently only used here (test_SeqIO)
#Following PHYLIP files are currently only used here and in test_AlignIO.py,
#and are mostly from Joseph Felsenstein's PHYLIP v3.6 documentation:
("phylip", True, 'Phylip/reference_dna.phy', 6),
("phylip", True, 'Phylip/reference_dna2.phy', 6),
Expand All @@ -126,6 +126,13 @@
("ace", False, 'Ace/contig1.ace', 2),
("ace", False, 'Ace/consed_sample.ace', 1),
("ace", False, 'Ace/seq.cap.ace', 1),
#Following IntelliGenetics / MASE files are also used in test_intelligenetics.py
("ig", False, 'IntelliGenetics/TAT_mase_nuc.txt', 17),
("ig", True, 'IntelliGenetics/VIF_mase-pro.txt', 16),
#This next file is a MASE alignment but sequence O_ANT70 is shorter than
#the others (so the to_alignment() call will fail). Perhaps MASE doesn't
#write trailing gaps?
("ig", False, 'IntelliGenetics/vpu_nucaligned.txt', 9),
]

# This is a list of two-tuples. Each tuple contains a
Expand Down

0 comments on commit bf58af2

Please sign in to comment.