Skip to content
This repository has been archived by the owner on Jun 16, 2018. It is now read-only.

Commit

Permalink
Merge pull request #18 from mruffalo/master
Browse files Browse the repository at this point in the history
Fix #17 in both Python 2 and 3
  • Loading branch information
brentp committed Oct 30, 2013
2 parents 307b030 + 6bc19fc commit cb3d16d
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 29 deletions.
10 changes: 5 additions & 5 deletions pyfasta/__init__.py
Expand Up @@ -43,9 +43,9 @@ def info(args):
<BLANKLINE>
tests/data/three_chrs.fasta
===========================
>chr3 length:3600
>chr2 length:80
>chr1 length:80
>chr3 length:3600
>chr2 length:80
>chr1 length:80
<BLANKLINE>
3760 basepairs in 3 sequences
"""
Expand All @@ -70,7 +70,7 @@ def info(args):
total_len = sum(l for k, l in info)
nseqs = len(f)
if options.nseqs > -1:
info = sorted(info, key=operator.itemgetter(1), reverse=True)
info = sorted(info, key=operator.itemgetter(1, 0), reverse=True)
info = info[:options.nseqs]
else:
info.sort()
Expand All @@ -85,7 +85,7 @@ def info(args):
c = seq.count('C')
gc = 100.0 * (g + c) / float(l)
gc = "gc:%.2f%%" % gc
print((">%s length:%i " % (k, l)) + gc)
print((">%s length:%i" % (k, l)) + gc)

if total_len > 1000000:
total_len = "%.3fM" % (total_len / 1000000.)
Expand Down
64 changes: 40 additions & 24 deletions pyfasta/fasta.py
@@ -1,6 +1,8 @@
from __future__ import print_function
import string
import os.path
from collections import Mapping
import sys
import numpy as np

from records import NpyFastaRecord
Expand All @@ -14,8 +16,22 @@
# 3.x
maketrans = str.maketrans

_complement = maketrans(u'ATCGatcgNnXx', u'TAGCtagcNnXx').decode('latin-1')
complement = lambda s: s.translate(_complement)
_complement = maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx')
# Python 2: string.maketrans returns a bytes object of length 256,
# that is used as a lookup table to translate bytes to other bytes.
# Python 3: str.maketrans returns a dict mapping Unicode code points
# to other Unicode code points. Can't use a fully-allocated lookup
# table since it would have to be of size `sys.maxunicode`, which
# is equal to 1114111 on wide builds of <= 3.2 and all builds of
# Python >= 3.3.
# In Python 2, it's safe to use a unicode object as the translation
# table; this causes str.translate to return a unicode object instead
# of a str. This is safe as long as the string that you're translating
# can be decoded as ASCII, and will fail with a UnicodeDecodeError
# otherwise.
if sys.version_info[0] < 3:
_complement = _complement.decode('latin-1')
complement = lambda s: s.translate(_complement)

class FastaNotFound(Exception): pass

Expand All @@ -39,12 +55,12 @@ def __init__(self, fasta_name, record_class=NpyFastaRecord,
FastaRecord('tests/data/three_chrs.fasta.flat', 0..80)
extract sequence with normal python syntax
>>> f['chr1'][:10]
'ACTGACTGAC'
>>> print(f['chr1'][:10])
ACTGACTGAC
take the first basepair in each codon...
>>> f['chr1'][0:10:3]
'AGTC'
>>> print(f['chr1'][0:10:3])
AGTC
"""
if not os.path.exists(fasta_name):
Expand Down Expand Up @@ -128,29 +144,29 @@ def sequence(self, f, asstring=True, auto_rc=True
>>> from pyfasta import Fasta
>>> f = Fasta('tests/data/three_chrs.fasta')
>>> f.sequence({'start':1, 'stop':2, 'strand':1, 'chr': 'chr1'})
u'AC'
>>> print(f.sequence({'start':1, 'stop':2, 'strand':1, 'chr': 'chr1'}))
AC
>>> f.sequence({'start':1, 'stop':2, 'strand': -1, 'chr': 'chr1'})
u'GT'
>>> print(f.sequence({'start':1, 'stop':2, 'strand': -1, 'chr': 'chr1'}))
GT
>>> sorted(f.index.items())
[('chr1', (0, 80)), ('chr2', (80, 160)), ('chr3', (160, 3760))]
NOTE: these 2 are reverse complements of each other because of strand
#>>> f.sequence({'start':10, 'stop':12, 'strand': -1, 'chr': 'chr1'})
'CAG'
>>> f.sequence({'start':10, 'stop':12, 'strand': 1, 'chr': 'chr1'})
u'CTG'
>>> print(f.sequence({'start':10, 'stop':12, 'strand': 1, 'chr': 'chr1'}))
CTG
>>> f.sequence({'start':10, 'stop':12, 'strand': -1, 'chr': 'chr3'})
u'TGC'
>>> f.sequence({'start':10, 'stop':12, 'strand': 1, 'chr': 'chr3'})
u'GCA'
>>> print(f.sequence({'start':10, 'stop':12, 'strand': -1, 'chr': 'chr3'}))
TGC
>>> print(f.sequence({'start':10, 'stop':12, 'strand': 1, 'chr': 'chr3'}))
GCA
>>> f['chr3'][:][-10:]
u'CGCACGCTAC'
>>> print(f['chr3'][:][-10:])
CGCACGCTAC
a feature can have exons:
Expand All @@ -159,14 +175,14 @@ def sequence(self, f, asstring=True, auto_rc=True
by default, it just returns the full sequence between start
and stop.
>>> f.sequence(feat)
u'ACTGACTGACT'
>>> print(f.sequence(feat))
ACTGACTGACT
but if exon_keys is set to an iterable, it will search for
those keys and will use the first to create a sequence and
return the concatenated result.
>>> f.sequence(feat, exon_keys=('rnas', 'exons'))
u'ACTACTACT'
>>> print(f.sequence(feat, exon_keys=('rnas', 'exons')))
ACTACTACT
Note that sequence is 2 characters shorter than the entire
feature, to account for the introns at base-pairs 12 and 16.
Expand All @@ -175,8 +191,8 @@ def sequence(self, f, asstring=True, auto_rc=True
find one, so it continued on to 'exons'. If it doesn't find
any of the exon keys, it will fall back on the start, stop of
the feature:
>>> f.sequence(feat, exon_keys=('fake', 'also_fake'))
u'ACTGACTGACT'
>>> print(f.sequence(feat, exon_keys=('fake', 'also_fake')))
ACTGACTGACT
"""
assert 'chr' in f and f['chr'] in self, (f, f['chr'], self.keys())
fasta = self[f['chr']]
Expand Down

0 comments on commit cb3d16d

Please sign in to comment.