Skip to content

Commit

Permalink
Merge pull request #797 from anderspitman/move_find_features
Browse files Browse the repository at this point in the history
Move find_features method into NucleotideSequence
  • Loading branch information
jairideout committed Dec 27, 2014
2 parents a077741 + ff37483 commit 7b60cc2
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 48 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
### Features
* Modified ``skbio.stats.distance.pwmantel`` to accept a list of filepaths. This reduces memory consumption because only two distance matrices are loaded at a time, rather than requiring all of them to be held in memory at once.

### Backward-incompatible changes
* Removed ``feature_types`` attribute from ``BiologicalSequence`` and all subclasses ([#797](https://github.com/biocore/scikit-bio/pull/797)).
* Removed ``find_features`` method from ``BiologicalSequence`` and ``ProteinSequence`` ([#797](https://github.com/biocore/scikit-bio/pull/797)).

## Version 0.2.2 (2014-12-04)

### Features
Expand Down
100 changes: 56 additions & 44 deletions skbio/sequence/_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,6 @@ class BiologicalSequence(Sequence, SkbioObject):
"""
default_write_format = 'fasta'

feature_types = set([])

@classmethod
def alphabet(cls):
"""Return the set of characters allowed in a `BiologicalSequence`.
Expand Down Expand Up @@ -1503,46 +1501,6 @@ def regex_iter(self, regex, retrieve_group_0=False):
for g in range(start, len(match.groups())+1):
yield (match.start(g), match.end(g), match.group(g))

def find_features(self, feature_type, min_length=1, allow_gaps=False):
    """Search the sequence for features.

    Parameters
    ----------
    feature_type : str
        The type of feature to find. Must be one of ``self.feature_types``.
    min_length : int, optional
        Defaults to 1. Only features at least as long as this will be
        returned
    allow_gaps : bool, optional
        Defaults to ``False``. If ``True``, then gaps will not be
        considered to disrupt a feature. Only ``'-'`` is treated as a gap
        character here.

    Returns
    -------
    generator
        Yields tuples of the start of the feature, the end of the feature,
        and the subsequence that composes the feature

    Raises
    ------
    ValueError
        If `feature_type` is not in ``self.feature_types``, or if no search
        pattern is defined for it on this kind of sequence. Because this is
        a generator, the error is raised when iteration starts.

    """
    if feature_type not in self.feature_types:
        raise ValueError("Unknown feature type: %s" % feature_type)

    acceptable = '-' if allow_gaps else ''

    # Characters making up each kind of run. Patterns are only defined for
    # nucleotide sequences.
    run_chars = {}
    if isinstance(self, NucleotideSequence):
        run_chars = {'purine_run': 'AGag', 'pyrimidine_run': 'CTUctu'}

    # A feature type may be registered in ``feature_types`` without having a
    # pattern here; report that as a ValueError rather than failing later on
    # an unbound pattern string.
    if feature_type not in run_chars:
        raise ValueError("Unknown feature type: %s" % feature_type)

    pat = re.compile(
        '([%s%s]{%d,})' % (run_chars[feature_type], acceptable, min_length))

    for hits in self.regex_iter(pat):
        # With gaps allowed the regex counts gap characters towards the
        # length, so re-check the length with the gaps removed.
        if not allow_gaps or len(hits[2].replace('-', '')) >= min_length:
            yield hits


class NucleotideSequence(BiologicalSequence):
"""Base class for nucleotide sequences.
Expand All @@ -1561,8 +1519,6 @@ class NucleotideSequence(BiologicalSequence):
"""

feature_types = set(['purine_run', 'pyrimidine_run'])

@classmethod
def complement_map(cls):
"""Return the mapping of characters to their complements.
Expand Down Expand Up @@ -1743,6 +1699,62 @@ def reverse_complement(self):
return self._complement(reverse=True)
rc = reverse_complement

def find_features(self, feature_type, min_length=1, allow_gaps=False):
    """Search the sequence for features.

    Parameters
    ----------
    feature_type : {'purine_run', 'pyrimidine_run'}
        The type of feature to find
    min_length : int, optional
        Defaults to 1. Only features at least as long as this will be
        returned
    allow_gaps : bool, optional
        Defaults to ``False``. If ``True``, then gaps will not be
        considered to disrupt a feature

    Returns
    -------
    generator
        Yields tuples of the start of the feature, the end of the feature,
        and the subsequence that composes the feature

    Raises
    ------
    ValueError
        If `feature_type` is not one of the supported feature types.
        Because this is a generator, the error is raised when iteration
        starts, not when the method is called.

    Examples
    --------
    >>> from skbio.sequence import NucleotideSequence
    >>> s = NucleotideSequence('G-AT.T')
    >>> list(s.find_features('purine_run'))
    [(0, 1, 'G'), (2, 3, 'A')]
    >>> list(s.find_features('purine_run', 2))
    []
    >>> list(s.find_features('purine_run', 2, allow_gaps=True))
    [(0, 3, 'G-A')]
    >>> list(s.find_features('pyrimidine_run', 2, allow_gaps=True))
    [(3, 6, 'T.T')]

    """
    # Fetch the gap characters once; they are needed both to build the
    # pattern and to degap every hit.
    gap_chars = frozenset(self.gap_alphabet())

    if feature_type == 'purine_run':
        run_chars = 'AGag'
    elif feature_type == 'pyrimidine_run':
        run_chars = 'CTUctu'
    else:
        raise ValueError("Unknown feature type: %s" % feature_type)

    # Escaping keeps characters such as '-' literal wherever they land
    # inside the character class (the join order is arbitrary).
    acceptable = re.escape(''.join(gap_chars)) if allow_gaps else ''
    pat = re.compile('([%s%s]{%d,})' % (run_chars, acceptable, min_length))

    for hits in self.regex_iter(pat):
        if allow_gaps:
            # The regex counts gap characters towards the run length, so
            # re-check the length using only the non-gap characters.
            degapped_length = sum(1 for char in hits[2]
                                  if char not in gap_chars)
            if degapped_length >= min_length:
                yield hits
        else:
            yield hits


class DNASequence(NucleotideSequence):
"""Base class for DNA sequences.
Expand Down
14 changes: 10 additions & 4 deletions skbio/sequence/tests/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,10 +779,6 @@ def test_regex_iter(self):
exp = [(2, 7, 'TTACA'), (2, 5, 'TTA'), (5, 7, 'CA')]
self.assertEqual(obs, exp)

def test_find_features_nonexistent_feature_type(self):
    """Requesting a feature type the sequence does not support fails."""
    # find_features is a generator, so the ValueError only surfaces once the
    # generator is consumed.
    features = self.b1.find_features('purine_run')
    with self.assertRaises(ValueError):
        list(features)


class NucelotideSequenceTests(TestCase):

Expand All @@ -792,6 +788,7 @@ def setUp(self):
self.b2 = NucleotideSequence(
'ACCGGUACC', id="test-seq-2",
description="A test sequence")
self.b3 = NucleotideSequence('G-AT-TG.AT.T')

def test_alphabet(self):
exp = {
Expand Down Expand Up @@ -951,6 +948,15 @@ def test_find_features_no_feature_type(self):
with self.assertRaises(ValueError):
list(self.b1.find_features('nonexistent_feature_type'))

def test_find_features_allow_gaps(self):
    """Runs of length >= 2 may span '-' and '.' when gaps are allowed."""
    expected_by_type = (
        ('purine_run', [(0, 3, 'G-A'), (6, 9, 'G.A')]),
        ('pyrimidine_run', [(3, 6, 'T-T'), (9, 12, 'T.T')]),
    )
    for feature_type, exp in expected_by_type:
        obs = list(self.b3.find_features(feature_type, 2, True))
        self.assertEqual(obs, exp)

def test_nondegenerates_propagate_optional_properties(self):
seq = NucleotideSequence('RS', id='foo', description='bar',
quality=[42, 999])
Expand Down

0 comments on commit 7b60cc2

Please sign in to comment.