-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding Intelligenetics/MASE format to Bio.SeqIO (see also Bio.Intelli…
…Genetics).
- Loading branch information
peterc
committed
Jul 4, 2008
1 parent
50d9872
commit bf58af2
Showing
6 changed files
with
166 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# Copyright 2008 by Peter Cock. All rights reserved. | ||
# This code is part of the Biopython distribution and governed by its | ||
# license. Please see the LICENSE file that should have been included | ||
# as part of this package. | ||
# | ||
# This module is for reading and writing IntelliGenetics format files as | ||
# SeqRecord objects. This file format appears to be the same as the MASE | ||
# multiple sequence alignment format. | ||
|
||
"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format. | ||
You are expected to use this module via the Bio.SeqIO functions.""" | ||
|
||
from Bio.Alphabet import single_letter_alphabet | ||
from Bio.Seq import Seq | ||
from Bio.SeqRecord import SeqRecord | ||
|
||
#This is a generator function! | ||
def IgIterator(handle, alphabet = single_letter_alphabet) : | ||
"""Iterate over IntelliGenetics records (as SeqRecord objects). | ||
handle - input file | ||
alphabet - optional alphabet | ||
The optional free format file header lines (which start with two | ||
semi-colons) are ignored. | ||
The free format commentary lines at the start of each record (which | ||
start with a semi-colon) are recorded as a single string with embedded | ||
new line characters in the SeqRecord's annotations dictionary under the | ||
key 'comment'. | ||
""" | ||
#Skip any file header text before the first record (;; lines) | ||
while True : | ||
line = handle.readline() | ||
if not line : break #Premature end of file, or just empty? | ||
if not line.startswith(";;") : break | ||
|
||
while line : | ||
#Now iterate over the records | ||
if line[0]<>";" : | ||
raise ValueError, \ | ||
"Records should start with ';' and not:\n%s" % repr(line) | ||
|
||
#Try and agree with SeqRecord convention from the GenBank parser, | ||
#(and followed in the SwissProt parser) which stores the comments | ||
#as a long string with newlines under annotations key 'comment'. | ||
|
||
#Note some examples use "; ..." and others ";..." | ||
comment_lines = [] | ||
while line.startswith(";") : | ||
#TODO - Extract identifier from lines like "LOCUS\tB_SF2"? | ||
comment_lines.append(line[1:].strip()) | ||
line = handle.readline() | ||
title = line.rstrip() | ||
|
||
seq_lines = [] | ||
while True: | ||
line = handle.readline() | ||
if not line : break | ||
if line[0] == ";": break | ||
#Remove trailing whitespace, and any internal spaces | ||
seq_lines.append(line.rstrip().replace(" ","")) | ||
|
||
#Return the record and then continue... | ||
record= SeqRecord(Seq("".join(seq_lines), alphabet), | ||
id = title, name = title) | ||
record.annotations['comment'] = "\n".join(comment_lines) | ||
yield record | ||
|
||
#We should be at the end of the file now | ||
assert not line | ||
|
||
if __name__ == "__main__" : | ||
print "Running quick self test" | ||
|
||
import os | ||
for filename in os.listdir("../../Tests/Intelligenetics/") : | ||
if os.path.splitext(filename)[-1] == ".txt" : | ||
print filename | ||
print "-"*len(filename) | ||
handle = open(os.path.join("../../Tests/Intelligenetics/", filename)) | ||
for record in IgIterator(handle) : | ||
print record.id, len(record) | ||
handle.close() | ||
print "Done" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters