Skip to content

Commit

Permalink
Use OrderedDict for SeqFeature qualifiers.
Browse files Browse the repository at this point in the history
Part of this behaviour will happen by default as of Python 3.6, where the
Python dict preserves order, but the goal here is to help round-trip EMBL/GenBank
files by preserving the feature qualifier ordering.

This means we can stop explicitly sorting the qualifiers on output (which
produced stable but arbitrary ordering, rather than attempting to match the
original order).
  • Loading branch information
peterjc committed Nov 10, 2016
1 parent 1fb9129 commit c1f93f3
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 7 deletions.
12 changes: 7 additions & 5 deletions Bio/SeqFeature.py
@@ -1,6 +1,6 @@
# Copyright 2000-2003 Jeff Chang.
# Copyright 2001-2008 Brad Chapman.
# Copyright 2005-2015 by Peter Cock.
# Copyright 2005-2016 by Peter Cock.
# Copyright 2006-2009 Michiel de Hoon.
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
Expand Down Expand Up @@ -54,6 +54,8 @@

from __future__ import print_function

from collections import OrderedDict

from Bio._py3k import _is_int_or_long

from Bio.Seq import MutableSeq, reverse_complement
Expand Down Expand Up @@ -85,7 +87,7 @@ class SeqFeature(object):
- qualifiers - A dictionary of qualifiers on the feature. These are
analogous to the qualifiers from a GenBank feature table. The keys of
the dictionary are qualifier names, the values are the qualifier
values.
values. As of Biopython 1.69 this is an ordered dictionary.
"""

def __init__(self, location=None, type='', location_operator='',
Expand Down Expand Up @@ -150,7 +152,7 @@ def __init__(self, location=None, type='', location_operator='',
self.strand = strand
self.id = id
if qualifiers is None:
qualifiers = {}
qualifiers = OrderedDict()
self.qualifiers = qualifiers
if sub_features is not None:
raise TypeError("Rather than sub_features, use a CompoundFeatureLocation")
Expand Down Expand Up @@ -271,7 +273,7 @@ def _shift(self, offset):
type=self.type,
location_operator=self.location_operator,
id=self.id,
qualifiers=dict(self.qualifiers.items()))
qualifiers=OrderedDict(self.qualifiers.items()))

def _flip(self, length):
"""Returns a copy of the feature with its location flipped (PRIVATE).
Expand All @@ -287,7 +289,7 @@ def _flip(self, length):
type=self.type,
location_operator=self.location_operator,
id=self.id,
qualifiers=dict(self.qualifiers.items()))
qualifiers=OrderedDict(self.qualifiers.items()))

def extract(self, parent_sequence):
"""Extract feature sequence from the supplied parent sequence.
Expand Down
4 changes: 2 additions & 2 deletions Bio/SeqIO/InsdcIO.py
Expand Up @@ -357,8 +357,8 @@ def _write_feature(self, feature, record_length):
+ self._wrap_location(location) + "\n"
self.handle.write(line)
# Now the qualifiers...
for key in sorted(feature.qualifiers.keys()):
values = feature.qualifiers[key]
# Note as of Biopython 1.69, this is an ordered-dict, don't sort it:
for key, values in feature.qualifiers.items():
if isinstance(values, (list, tuple)):
for value in values:
self._write_feature_qualifier(key, value)
Expand Down
4 changes: 4 additions & 0 deletions NEWS
Expand Up @@ -23,6 +23,10 @@ now also allowed in identifiers.
For consistency the Bio.Seq module now offers a complement function (already
available as a method on the Seq and MutableSeq objects).

The SeqFeature object's qualifiers attribute is now an explicitly ordered
dictionary (note that as of Python 3.6 the Python dict is ordered by default
anyway). This helps reproduce GenBank/EMBL files on input/output.

Additionally, a number of small bugs have been fixed with further additions
to the test suite, and there has been further work to follow the Python PEP8,
PEP257 and best practice standard coding style.
Expand Down
7 changes: 7 additions & 0 deletions Tests/test_GenBank_unittest.py
Expand Up @@ -175,6 +175,13 @@ def test_locus_line_topogoly(self):
orig_first_line = fh.readline().strip()
self.assertEqual(first_line, orig_first_line)

def test_qualifier_order(self):
"""Check the qualifier order is preserved."""
record = SeqIO.read("GenBank/DS830848.gb", "gb")
f = record.features[0]
self.assertEqual(list(f.qualifiers),
['organism', 'mol_type', 'strain', 'db_xref', 'dev_stage'])

def test_long_names(self):
"""Various GenBank names which push the column based LOCUS line."""
original = SeqIO.read("GenBank/iro.gb", "gb")
Expand Down

0 comments on commit c1f93f3

Please sign in to comment.