Permalink
Browse files

Added test case from Kai Blin; fixed some warnings.

This test GenBank file with an invalid LOCUS line lets
us test the strand behaviour fixed in the previous
commit.

Some warnings issued by the GenBank parser used the generic
warning category when they should have been
BiopythonParserWarning, which the unit tests silence.

Also fixed line lengths for some other GenBank warnings.
  • Loading branch information...
1 parent 1839931 commit 6bff33978b948acb49d7331b2695c97b5c62eec8 @peterjc peterjc committed Jun 17, 2013
Showing with 191 additions and 9 deletions.
  1. +15 −8 Bio/GenBank/Scanner.py
  2. +125 −0 Tests/GenBank/invalid_locus_line_spacing.gb
  3. +40 −0 Tests/output/test_GenBank
  4. +11 −1 Tests/test_GenBank.py
View
@@ -185,7 +185,8 @@ def parse_features(self, skip=False):
#over indenting the location and qualifiers.
feature_key, line = line[2:].strip().split(None, 1)
feature_lines = [line]
- warnings.warn("Overindented %s feature?" % feature_key)
+ warnings.warn("Overindented %s feature?" % feature_key,
+ BiopythonParserWarning)
else:
feature_key = line[2:self.FEATURE_QUALIFIER_INDENT].strip()
feature_lines = [line[self.FEATURE_QUALIFIER_INDENT:]]
@@ -893,7 +894,7 @@ def parse_features(self, skip=False):
#and since it is so common I don't want to issue a warning
#warnings.warn("Feature location %s is invalid, "
# "moving greater than sign before position"
- # % location)
+ # % location, BiopythonParserWarning)
location = bad_position_re.sub(r'>\1', location)
features.append((feature_key, location, qualifiers))
self.line = line
@@ -935,12 +936,14 @@ def parse_footer(self):
line = self.line
while True:
if not line:
- warnings.warn("Premature end of file in sequence data", BiopythonParserWarning)
+ warnings.warn("Premature end of file in sequence data",
+ BiopythonParserWarning)
line = '//'
break
line = line.rstrip()
if not line:
- warnings.warn("Blank line in sequence data", BiopythonParserWarning)
+ warnings.warn("Blank line in sequence data",
+ BiopythonParserWarning)
line = self.handle.readline()
continue
if line == '//':
@@ -950,7 +953,8 @@ def parse_footer(self):
if len(line) > 9 and line[9:10] != ' ':
# Some broken programs indent the sequence by one space too many
# so try to get rid of that and test again.
- warnings.warn("Invalid indentation for sequence line", BiopythonParserWarning)
+ warnings.warn("Invalid indentation for sequence line",
+ BiopythonParserWarning)
line = line[1:]
if len(line) > 9 and line[9:10] != ' ':
raise ValueError("Sequence line mal-formed, '%s'" % line)
@@ -1135,7 +1139,8 @@ def _feed_first_line(self, consumer, line):
else:
#Must have just "LOCUS ", is this even legitimate?
#We should be able to continue parsing... we need real world testcases!
- warnings.warn("Minimal LOCUS line found - is this correct?\n:%r" % line)
+ warnings.warn("Minimal LOCUS line found - is this "
+ "correct?\n:%r" % line, BiopythonParserWarning)
elif len(line.split()) == 7 and line.split()[3] in ["aa", "bp"]:
#Cope with EnsEMBL genbank files which use space separation rather
#than the expected column based layout. e.g.
@@ -1154,14 +1159,16 @@ def _feed_first_line(self, consumer, line):
elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]:
#Cope with EMBOSS seqret output where it seems the locus id can cause
#the other fields to overflow. We just IGNORE the other fields!
- warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line)
+ warnings.warn("Malformed LOCUS line found - is this "
+ "correct?\n:%r" % line, BiopythonParserWarning)
consumer.locus(line.split()[1])
consumer.size(line.split()[2])
elif len(line.split()) >= 4 and line.split()[-1] in ["aa", "bp"]:
#Cope with pseudo-GenBank files like this:
# "LOCUS RNA5 complete 1718 bp"
#Treat everything between LOCUS and the size as the identifier.
- warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line)
+ warnings.warn("Malformed LOCUS line found - is this "
+ "correct?\n:%r" % line, BiopythonParserWarning)
consumer.locus(line[5:].rsplit(None, 2)[0].strip())
consumer.size(line.split()[-2])
else:
@@ -0,0 +1,125 @@
+LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001
+DEFINITION Streptomyces avermitilis melanin biosynthetic gene cluster.
+ACCESSION AB070938
+VERSION AB070938.1 GI:15823953
+KEYWORDS .
+SOURCE Streptomyces avermitilis
+ ORGANISM Streptomyces avermitilis
+ Bacteria; Actinobacteria; Actinobacteridae; Actinomycetales;
+ Streptomycineae; Streptomycetaceae; Streptomyces.
+FEATURES Location/Qualifiers
+ source 1..6497
+ /organism="Streptomyces avermitilis"
+ /mol_type="genomic DNA"
+ /db_xref="taxon:33903"
+ORIGIN
+ 1 ctagcagccc gcatcgccct cgacgttggc gatcatcgtg cgcagcacct tgagcgcggt
+ 61 cacgtactcc tcgtcgctga tgccctcgtg gaccaccgcg cgcagctcgg tcaccagctc
+ 121 acgcagccgc ttcctggcag cctccccggt gtcggtgaga cgcaggcgct gtccggcgtc
+ 181 gatccgaagc cagccccggt gaagcagctg gtcgacgacc cgcgcgatct cgtgcggccc
+ 241 gtccgcgagg ggcgtcagct gggtgaccac ctcctcccgg cccggcgccg cgggcccgcc
+ 301 gtgcacgcgg ttgagcaccc agtactgcgg ctgtgtgacg tcgatcctgg ccatggcgtc
+ 361 ccgcagctgc cgggtgaccg ccgtgtgggc cagaccgctc cagtagccga tgggctgggt
+ 421 ggccaacacg tcgtcggtgg cggccggatc ggccggtgcc tggtcggtgg tgctgccggt
+ 481 caacggtttc atgatcgtga cgctaggtcc ccgtagcgtg cgtgaacacc gtcgaaccag
+ 541 gcaaggtctg gccgaaacct ccgcccctcc aggtggacca ccccgtgcgg cgcgaccttc
+ 601 gtccacgtcc cgacgcacgg tgatcgtgct gggggccagc ccctcctgga gggcgtcggc
+ 661 ggcgtcggac ccgggcccgg agcgcccggc ggagcgtggc atgatcgggg gcatgtctga
+ 721 acgtgtggtg gccgcctgtg acggggcgtc gaagggaaac cccggaccgg ccggatgggc
+ 781 ctgggtcgtc gccgacggcg aggagacccc gacccgctgg gcggccgggg cgctcggcac
+ 841 ggccacgaac aacgtggccg aactcaccgc gctggagcgc ttgttgagcg cgatggatcc
+ 901 ggacgtcccg ctggagatcc ggatggactc ccagtacgcg atgaaggccg tcacgacctg
+ 961 gctgcccggc tggaagcgca agggctggaa gacggccgcg gggaagccgg tcgccaacca
+ 1021 ggaactggtc gcccgcatcg acgaactgct cgacggacgc tccgtcgagt tccgttacgt
+ 1081 ccccgcgcac caggtcgacg gcgaccggct caacgacttc gccgaccgtg ccgccagcca
+ 1141 ggcggcgatc gtccaggaac cggcgggcag cgagtacggc tccccggagc cgccgaagtc
+ 1201 gcccgacacc gtcgcggccg gctccgcggg tcgcggcgct cccgccaaga agcgtgcctc
+ 1261 cgcgcgcacc gccaagacga gcacgcgcac gatcaaggcg aagttccccg gccgctgtgt
+ 1321 ctgcggccgc ccctacgcgg cgggcgagcc catcgccaag aacgcgcagg gctggggcca
+ 1381 cccggagtgc cgtaccgccg acgacgtcta ggacctcccc ggcggagcat gcccaaggac
+ 1441 gcgggggctg acaggccgtg cggcttttcc cgcccgcctg atccgccggc cctggatcac
+ 1501 gaccccggcg gcctcccacg agtggccgcc gggacctcga gcgcctcggc cgtcagacgg
+ 1561 tgtcgaacgt gtagtgcgcg gtgtggtcca gcaggtccgc ggggcgtacg tcgttccacg
+ 1621 gcttcatggt ctcgttcagg tcgacgacgt tcggcgtgcc gcccgtcggc acataaccgg
+ 1681 agcccgggtg gcggctctgc cactgggccc agagcctgtc gatgtaggcg tggtggagcc
+ 1741 agaagaccgg gtcgttgggg gagaccccgg tggccatctg gccgccgacc cagacgtgga
+ 1801 cgcggttgtg caggttcaca ccgcgccagc cctcgagatg gttgcggaac ccgtccgacg
+ 1861 cgctgttcca cggggccatg tcgtacgtgg acatcgcgag cacggagtcg acctccgccc
+ 1921 gggtcggcag ctcgcgcccg ccgccgccga gcgtgcgccg cagatacgta cggctgtcga
+ 1981 cccgcacgtt gaccggccag ttgccggtgg acgccgcgaa cggcccgtcc atcacccggc
+ 2041 cgtccaggct gcgtccgctg ccgccgagga agtcgggcgc ccacagggag gcacgcgccg
+ 2101 tgcggtcggt gctccagtcc cagtacggca gcgcgaccga cgggtcgacc gcctggagcg
+ 2161 cctgctcgaa ctcgatcaaa aatctgcggt gccagggcag gaaggaaggc gaacgatggc
+ 2221 ccgtgcgttc gccgctgtcg gtgtcgccca tgatgaaggc gttgtgggtc gtgacgaact
+ 2281 cgtcgtagcg gccactgcgc ttgagcgcca cgagcgcgtc gacgaagcgc cgcttctcgt
+ 2341 cggccgtcag ggtcgcctgg ttcttgcgta cggtcatgtg cgggtgactc cagaactcta
+ 2401 cgtgcgggac ggtcagttga aggggacgag cggcgcgccc tgaagctcca cgaccgcggc
+ 2461 gcgggcggcg gcgcgcgggg tggccacggg gtcgtagtgg ctgacgacgc tgatccagct
+ 2521 gccgtcgacg ttctgcatca cgtgcagttc catcccgtcg atgaacacgc cgtacccgga
+ 2581 gccgtggtga tgaccgcccc cggtcgcgcg gccctctatt cggcgcccct gatagacctc
+ 2641 gtcgaacggc tggggacccc cgtggtgccc ggccgcggac gcggacggag cggcaagggc
+ 2701 ctgagtgccg gccacggccg ccagggcggc ggcggccccg agggcatgac ggcgggtgag
+ 2761 ttcgggcatg cgaagtcctt ctgagtcgag gtgtgttgac gactcggcat gcctatccgc
+ 2821 ccggtcggga gccggagaaa tcgacgaaaa ccggttggct acgatccgga caattaccta
+ 2881 catgtcatac aggattgaac gaagatgatc ttgccgcccc gggtggccca cccggcggcg
+ 2941 gagggggaag ttccaccccg gatcggcgcc atcgcggtga tcttttgccg tcgatgcggg
+ 3001 gcgagtggtg cggctcacgt gcgcagactg gcgcgaactg gcgcctgccc tcaccgctcc
+ 3061 agggggttcc cgcagcgatt gcagtagcgg gcgtcgctct ccgtggccat acgtccgcac
+ 3121 tcggcgcaga cctggtgcag caggcatgcg ggctctccgg tcgcccccgt cgtcggcagg
+ 3181 gccgggggag cggccggacg gcgcgggcgt acggtcacgc agtgcagccc cgcctccccc
+ 3241 gcgtgcagcc ggtatccagc ggcggggggc agccacacgg cggccggggg tgccaactcc
+ 3301 agccatccgc ccggtgtccg gagccggccg cccccgtcga gggcgatcac gaggacgtcc
+ 3361 cgcgagcggt caccgggcgg ctcggccgcc gcacccggcg ggatgtgcat cgcctcggcg
+ 3421 tcgagaccgg cgcccggccg gtccagtcgc cagtagcggc ccccggtcgc ccgggagacg
+ 3481 agtgcggcca ggaccgggcc ggcgggcgga tgcgtcacag gggctcctgt cgtctgcggc
+ 3541 gggacgggcc gcgatcgtac gaccgccccc gcccgccgcg cggcggaaga ggccgggacg
+ 3601 gtcggtctgc acgatggcgc tgctaccctt cgtggtcaat tgaccgcttt gcgtaacata
+ 3661 ggggagtgcg cgtgaagatc gcgtgcgtcg gcggcggacc cgcaagcctg tacttctcga
+ 3721 tcctgatgaa gcgccaggac ccgtcccacg acatcaccgt ccacgagcgg aaccccgccg
+ 3781 gatcgaccta cggctggggc gtgacctact ggagcggcct gctcgacaaa ctccgcggga
+ 3841 gtgaccccga gtcggcgctc gccgtcagcg agaactccgt ccgctggagc gacggagtcg
+ 3901 cccacgtccg gaaccgcacc acggtccacc acggcgacga gggcttcggc atcggccgcc
+ 3961 gcagattcct cgacgtactg gccgaccggg cccggtccct gggcgtccgc atcgagtacg
+ 4021 agcatgagat cggcgccgac gacccactgc ccgaggccga tctggtcgtc gccggcgacg
+ 4081 gggtcaacag cgtgctgcgc ggccgctacg ccgaccactt cggcagcgag accgtgctcg
+ 4141 gccgcaaccg ctacatctgg ctcggcacca ccaaggtctt cgactcgttc accttcgcct
+ 4201 ttgtggagag cgaacacggc tggatctggt gctacggcta tggattcagc gacggccaca
+ 4261 gcacctgcgt catcgagtgc tccccggaaa cctggaccgg gctcggcctc gaccgggcca
+ 4321 gcgaggccga cggtctcgcc ctgctggaga agctcttcgc cgacgtcctc gacgggcacg
+ 4381 agctgatcgg ccgggcgcag agcgacggtg ccgcccagtg gctgaacttc cgcaccctca
+ 4441 ccaaccgcac ctggcatcgc gacaacctcg tcctgatcgg cgacgccgcc cacaccaccc
+ 4501 actactccat cggcgcgggc accaccctcg ccctggagga cgccatcgcc ctcgccgaag
+ 4561 ccctgagcgc gcaccgcgac ctgccgggcg cgctcgccgc ctacgagcgg gaacgcaagt
+ 4621 ccgcgctcct gcacatccag agcgcggccc ggctcagcgc ccagtggtac gagaacctcc
+ 4681 cgcgctacat ccgccttccg cccccgcaga tgttcgccct gctcggccag cgccattccc
+ 4741 cgctgctgcc gtacgtgcct ccgcagctct actaccggat cgaccgggcg gccggacaac
+ 4801 tggaggcgct gcgcaggctc aagcgctggc tggggccgcg actggcgcgt accgtccagg
+ 4861 cgcgcacggg ccggtaggcc ggccgccggc ggccgcgtcc gacggagaat tctgggtgaa
+ 4921 tgaccattca cccggctaag gtgaattcct attcacctcc cttcttcacg tcggctgccg
+ 4981 cccctggagt gaccatggtc ccgatatcca ccccgtccga ccggtccgcg acccccgacg
+ 5041 gaccggccgg acggccgggt gtccgcgacc ggctgacggt ccccgtcctg gcgttcggcg
+ 5101 gaatcctcat ggccgtcatg cagacggtcg tggtgccgct gctgcccgac ctgccgcgcc
+ 5161 tgaccggcgc ttccgcgggc gccgtctcct ggatggtcac cgccaccctg ctctccggcg
+ 5221 cggtgctgac cccggtgctc ggccgggccg gcgacatgta cggcaagcgg cgggttctgc
+ 5281 tcgccgccct cgcgctgatg accctgggct cgctgctgtg cgccgtcacc tccgacatcc
+ 5341 gcgtgctcat cgccgcgcgg gccctccagg gcgcggcggc cgccgtcgta ccgctgtcga
+ 5401 tcagcatcct gcgcgacgaa ctcccgcccg agcgcacggg ttccgcggtg gccctgatga
+ 5461 gttccaccgt gggcatcggc gccgcgctcg gtctgccgat cgccgcgatg atcgtgcagt
+ 5521 acgccgactg gcacgtcatg ttctgggcga ccaccgggct cggcgccggc ggactggcac
+ 5581 tggcgtggtg ggcggtgcgc gagtcgcccg tccggcagcc gggccgcttc gacacgctgg
+ 5641 gtgcgctggg gctggccgcg ggcctggtct gcctgctcct cggtgtgtcg cagggcgggc
+ 5701 agtggggctg gaccagtccg cggatcgtcg gcctgctcgt ggcctgcgta ctcgtactga
+ 5761 cgctgtggtg gttccagcag tggcgggccc cgcggcccct ggtggacctg aagctggcct
+ 5821 cccgcccccg ggtcgccctg ccgcacgtgg ccgcgctgct gaccggattc gccttctacg
+ 5881 gcaactcgct ggtcacggcg cagctggtgc aggcgcccaa ggccaccggc tacggactcg
+ 5941 ggctgtccat cgtgcagacc ggtctgtgcc tgctgcccgg cggcgtcatc atgctgctgt
+ 6001 tctcgccggt ctcggcgcgc atctcggccg cccgcggccc gcgcgtgacg ctggcactcg
+ 6061 gggccgcggt catcgccgtc ggctacgccg tgcgcatcgc ggacagccgc gacctgtgga
+ 6121 tgatcatcgt gggcgccacg gtcatcgcgg tcggcacgac cctcgcctac tcggccctgc
+ 6181 ccaccctgat cctgcgtgcc gtgcccgccg gacagaccgc ctccgccaac ggcgtcaacg
+ 6241 tcctgatgcg caccatcggc caagccgtgt gcagcgcggc ggtcgccgcc gtcctggtcc
+ 6301 accacaccag cctggtggga ggcgccccgg tacccaccct gcacggctat ctgctggcgt
+ 6361 tcgcgatggc gggtacggtc gcagtgatgg cctgcgccgc cgccctcgtc atccccgggg
+ 6421 accccgactc ccacggcacg cgacgggccc gcggccgtac ccggccgtcc cacgacgagg
+ 6481 cgctggaagg agcatga
+//
View
@@ -3366,6 +3366,35 @@ qualifiers:
Key: organism, Value: ['Streptomyces avermitilis']
DB cross refs []
+***Record from invalid_locus_line_spacing.gb with the FeatureParser
+Seq: Seq('CTAGCAGCCCGCATCGCCCTCGACGTTGGCGATCATCGTGCGCAGCACCTTGAG...TGA', Alphabet())
+Id: AB070938.1
+Name: AB070938
+Description Streptomyces avermitilis melanin biosynthetic gene cluster.
+Annotations***
+Key: accessions
+Value: ['AB070938']
+Key: gi
+Value: 15823953
+Key: keywords
+Value: ['']
+Key: organism
+Value: Streptomyces avermitilis
+Key: sequence_version
+Value: 1
+Key: source
+Value: Streptomyces avermitilis
+Key: taxonomy
+Value: ['Bacteria', 'Actinobacteria', 'Actinobacteridae', 'Actinomycetales', 'Streptomycineae', 'Streptomycetaceae', 'Streptomyces']
+Feaures
+type: source
+location: [0:6497](+)
+qualifiers:
+ Key: db_xref, Value: ['taxon:33903']
+ Key: mol_type, Value: ['genomic DNA']
+ Key: organism, Value: ['Streptomyces avermitilis']
+
+DB cross refs []
***Record from noref.gb with the RecordParser
sequence length: 1622
locus: NM_006141
@@ -4691,6 +4720,17 @@ num qualifiers: 3
key: /organism= value: "Streptomyces avermitilis"
key: /mol_type= value: "genomic DNA"
key: /db_xref= value: "taxon:33903"
+***Record from invalid_locus_line_spacing.gb with the RecordParser
+sequence length: 6497
+locus: AB070938
+definition: Streptomyces avermitilis melanin biosynthetic gene cluster.
+accession: ['AB070938']
+feature key: source
+location: 1..6497
+num qualifiers: 3
+key: /organism= value: "Streptomyces avermitilis"
+key: /mol_type= value: "genomic DNA"
+key: /db_xref= value: "taxon:33903"
Testing writing GenBank format...
Testing GenBank writing for noref.gb...
Testing for NM_006141.1
View
@@ -12,6 +12,8 @@
from Bio import GenBank
from Bio.GenBank import utils
+from Bio.Alphabet import _get_base_alphabet, ProteinAlphabet
+
#TODO - Test we get the warnings we expect on the bad input files
warnings.simplefilter('ignore', BiopythonParserWarning)
@@ -21,7 +23,9 @@
'protein_refseq.gb', 'extra_keywords.gb', 'one_of.gb',
'NT_019265.gb', 'origin_line.gb', 'blank_seq.gb',
'dbsource_wrap.gb', 'gbvrl1_start.seq', 'NC_005816.gb',
- 'no_end_marker.gb', 'wrong_sequence_indent.gb']
+ 'no_end_marker.gb', 'wrong_sequence_indent.gb',
+ 'invalid_locus_line_spacing.gb',
+ ]
# We only test writing on a subset of the examples:
write_format_files = ['noref.gb', 'cor6_6.gb', 'iro.gb', 'pri1.gb', 'arab1.gb',
@@ -87,6 +91,12 @@
print "Feaures"
for feature in cur_record.features:
print feature
+ if isinstance(_get_base_alphabet(cur_record.seq.alphabet),
+ ProteinAlphabet):
+ assert feature.strand is None
+ else:
+ #Assuming no mixed strand examples...
+ assert feature.strand is not None
print "DB cross refs", cur_record.dbxrefs
elif isinstance(parser, GenBank.RecordParser):
print "***Record from %s with the RecordParser" \

0 comments on commit 6bff339

Please sign in to comment.