Fetching contributors…
Cannot retrieve contributors at this time
243 lines (194 sloc) 9.24 KB
#!/usr/bin/env python
# Copyright 2001-2004 by Brad Chapman. All rights reserved.
# Revisions copyright 2007-2016 by Peter Cock. All rights reserved.
# Revisions copyright 2013 by Kai Blin. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Test the GenBank parser and make sure everything is working smoothly.
# standard library
from __future__ import print_function
import os
from Bio._py3k import StringIO
import warnings
from Bio import BiopythonParserWarning
# GenBank stuff to test
from Bio import GenBank
from Bio.GenBank import utils
from Bio.Alphabet import _get_base_alphabet, ProteinAlphabet
gb_file_dir = os.path.join(os.getcwd(), 'GenBank')
test_files = ['', '', '', '', '',
'', '', '',
'', '', '',
'', 'gbvrl1_start.seq', '',
'', '',
'', '',
'', '']
# We only test writing on a subset of the examples:
write_format_files = ['', '', '', '', '',
'', '', '']
# don't test writing on protein_refseq, since it is horribly nasty
# don't test writing on the CONTIG refseq, because the wrapping of
# locations won't work exactly
# don't test writing on blank_seq because it lacks a sequence type
# don't test dbsource_wrap because it is a junky RefSeq file
files_to_parse = []
for file in test_files:
files_to_parse.append(os.path.join(gb_file_dir, file))
# parse the bioperl test files
# comment this out for now -- there are a bunch of junky records in here
# that no longer exist in GenBank -- do we really need to support those?
# files_to_parse = [os.path.join(os.getcwd(), 'GenBank', '')]
# parse the biojava test files
# files_to_parse += [os.path.join(os.getcwd(), 'GenBank', '')]
# test the parsers
feature_parser = GenBank.FeatureParser(debug_level=0)
record_parser = GenBank.RecordParser(debug_level=0)
all_parsers = [feature_parser, record_parser]
print("Testing parsers...")
for parser in all_parsers:
for filename in files_to_parse:
if not os.path.isfile(filename):
print("Missing test input file: %s" % filename)
handle = open(filename, 'r')
iterator = GenBank.Iterator(handle, parser)
while True:
with warnings.catch_warnings():
warnings.simplefilter("ignore", BiopythonParserWarning)
# e.g. BiopythonParserWarning: Premature end of file in sequence data
cur_record = next(iterator)
if cur_record is None:
if isinstance(parser, GenBank.FeatureParser):
print("***Record from %s with the FeatureParser"
% filename.split(os.path.sep)[-1])
print("Seq: %r" % cur_record.seq)
print("Id: %s" %
print("Name: %s" %
print("Description: %s" % cur_record.description)
ann_keys = sorted(cur_record.annotations)
for ann_key in ann_keys:
if ann_key != 'references':
print("Key: %s" % ann_key)
print("Value: %s" %
for reference in cur_record.annotations[ann_key]:
for feature in cur_record.features:
if isinstance(_get_base_alphabet(cur_record.seq.alphabet),
assert feature.strand is None
# Assuming no mixed strand examples...
assert feature.strand is not None
print("DB cross refs %s" % cur_record.dbxrefs)
elif isinstance(parser, GenBank.RecordParser):
print("***Record from %s with the RecordParser"
% filename.split(os.path.sep)[-1])
print("sequence length: %i" % len(cur_record.sequence))
print("locus: %s" %
print("definition: %s" % cur_record.definition)
print("accession: %s" % cur_record.accession)
for reference in cur_record.references:
print("reference title: %s" % reference.title)
for feature in cur_record.features:
print("feature key: %s" % feature.key)
print("location: %s" % feature.location)
print("num qualifiers: %i" % len(feature.qualifiers))
for qualifier in feature.qualifiers:
print("key: %s value: %s" % (qualifier.key, qualifier.value))
# test writing GenBank format
print("Testing writing GenBank format...")
def do_comparison(good_record, test_record):
"""Compare two records to see if they are the same.
Ths compares the two GenBank record, and will raise an AssertionError
if two lines do not match, showing the non-matching lines.
good_handle = StringIO(good_record)
test_handle = StringIO(test_record)
while True:
good_line = good_handle.readline()
test_line = test_handle.readline()
if not(good_line) and not(test_line):
if not(good_line):
raise AssertionError("Extra info in Test: %r" % test_line)
if not(test_line):
raise AssertionError("Extra info in Expected: %r" % good_line)
test_normalized = ' '.join(x for x in test_line.split() if x)
good_normalized = ' '.join(x for x in good_line.split() if x)
assert test_normalized == good_normalized, \
"Expected does not match Test.\nExpect: %r\nTest: %r\n" % \
(good_line, test_line)
def t_write_format():
record_parser = GenBank.RecordParser(debug_level=0)
for file in write_format_files:
print("Testing GenBank writing for %s..." % os.path.basename(file))
cur_handle = open(os.path.join("GenBank", file), "r")
compare_handle = open(os.path.join("GenBank", file), "r")
iterator = GenBank.Iterator(cur_handle, record_parser)
compare_iterator = GenBank.Iterator(compare_handle)
while True:
cur_record = next(iterator)
compare_record = next(compare_iterator)
if cur_record is None or compare_record is None:
print("\tTesting for %s" % cur_record.version)
output_record = str(cur_record) + "\n"
do_comparison(compare_record, output_record)
def t_cleaning_features():
"""Test the ability to clean up feature values.
parser = GenBank.FeatureParser(feature_cleaner=utils.FeatureValueCleaner())
handle = open(os.path.join("GenBank", ""))
iterator = GenBank.Iterator(handle, parser)
first_record = next(iterator)
# test for cleaning of translation
translation_feature = first_record.features[1]
test_trans = translation_feature.qualifiers["translation"][0]
assert ' ' not in test_trans, \
"Did not clean spaces out of the translation"
assert '\012' not in test_trans, \
"Did not clean newlines out of the translation"
print("Testing feature cleaning...")
def t_ensembl_locus():
line = "LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011\n"
s = GenBank.Scanner.GenBankScanner()
c = GenBank._FeatureConsumer(True)
s._feed_first_line(c, line)
assert == "HG531_PATCH",
assert c._expected_size == 1000000, c._expected_size
line = "LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011\n"
s = GenBank.Scanner.GenBankScanner()
c = GenBank._FeatureConsumer(True)
s._feed_first_line(c, line)
assert == "HG531_PATCH",
assert c._expected_size == 759984, c._expected_size
line = "LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011\n"
s = GenBank.Scanner.GenBankScanner()
c = GenBank._FeatureConsumer(True)
s._feed_first_line(c, line)
assert == "HG506_HG1000_1_PATCH",
assert c._expected_size == 814959, c._expected_size
line = "LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011\n"
s = GenBank.Scanner.GenBankScanner()
c = GenBank._FeatureConsumer(True)
s._feed_first_line(c, line)
assert == "HG506_HG1000_1_PATCH",
assert c._expected_size == 1219964, c._expected_size
print("Testing EnsEMBL LOCUS lines...")