In [39]:
from test_data import gapped_sequences, sequences, motif_1
from benchmark import Benchmark
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC, generic_dna, _verify_alphabet
from Bio.SeqUtils import GC
from Bio import SeqIO, motifs
from qiime_default_reference import get_reference_sequences

biopy_benchmark = Benchmark("biopython")

biopy_seqs = [Seq(seq, generic_dna) for ident, seq in sequences]
gapped_biopy_seqs = [Seq(seq, generic_dna) for ident, seq in gapped_sequences]

In [40]:
@biopy_benchmark
def object_creation():
    for ident, seq in sequences:
        Seq(seq, generic_dna)

1000 loops, best of 3: 290 µs per loop


In [41]:
@biopy_benchmark
def object_creation_validate():
    for ident, seq in sequences:
        _verify_alphabet(Seq(seq, IUPAC.IUPACAmbiguousDNA))

1 loops, best of 3: 245 ms per loop


In [42]:
@biopy_benchmark
def read_fasta_file():
    list(SeqIO.parse(open(get_reference_sequences()),'fasta'))

1 loops, best of 3: 1.41 s per loop


In [43]:
@biopy_benchmark
def reverse_complement():
    for seq in biopy_seqs:
        seq.reverse_complement()

100 loops, best of 3: 3.58 ms per loop


In [44]:
instances = [Seq(motif_1)]
m = motifs.create(instances)
@biopy_benchmark
def search_for_motif():
    for seq in biopy_seqs:
        for pos, motif in m.instances.search(seq):
            pass
    

1 loops, best of 3: 1.29 s per loop


In [45]:
@biopy_benchmark
def translate():
    for seq in biopy_seqs:
        seq.translate()


10 loops, best of 3: 162 ms per loop


In [46]:
# @biopy_benchmark
# def reverse_translate():
#     pass

In [47]:
# from Bio.Tool import Translate
# standard_translator = Translate.unambiguous_dna_by_id[1] 
# seq = biopy_seqs[0]
# prot = seq.translate()
# standard_translator.back_translate(prot)

In [48]:
seqs = biopy_seqs + gapped_biopy_seqs
@biopy_benchmark
def filter_fasta_to_no_gaps():
    [s for s in seqs if '-' in s or '.' in s]

100 loops, best of 3: 3.12 ms per loop


In [49]:
@biopy_benchmark
def degap_all():
    for seq in gapped_biopy_seqs:
        seq.ungap('-').ungap('.')

10 loops, best of 3: 56.4 ms per loop


In [50]:

# seq = biopy_seqs[0]
# seq.
# @biopy_benchmark
# def kmer_count_5():
#     pass

In [51]:
# @biopy_benchmark
# def kmer_count_25():
#     pass

In [52]:
# @biopy_benchmark
# def validate_chars():


In [53]:
# @biopy_benchmark
# def filter_invalid_seqs():
#     pass

seq = Seq('TCXTGA', IUPAC.IUPACAmbiguousDNA)
seq

Seq('TCXTGA', <class 'Bio.Alphabet.IUPAC.IUPACAmbiguousDNA'>)

In [54]:
# @biopy_benchmark
# def expand_degenerates():
#     pass

In [55]:
@biopy_benchmark
def gc_content():
    for seq in biopy_seqs:
        GC(seq)

100 loops, best of 3: 14.5 ms per loop


In [56]:
@biopy_benchmark
def find_motif_in_gapped():
    pass

The slowest run took 12.50 times longer than the fastest. This could mean that an intermediate result is being cached 
10000000 loops, best of 3: 95.4 ns per loop


In [57]:
seq = Seq('AT-CGT')
instances = [Seq('TC')]
m = motifs.create(instances)
for pos, motif in m.instances.search(seq.ungap('.').ungap('-')):
    print pos, motif

1 TC


In [58]:
biopy_benchmark.record("./biopy.csv")