In [23]:
from test_data import gapped_sequences, sequences, motif_1
from benchmark import Benchmark
from skbio.sequence import DNASequence,  genetic_code
import skbio.io
skbio_023_benchmark = Benchmark("scikit-bio 0.2.3")
from qiime_default_reference import get_reference_sequences

In [24]:
from skbio import DNA
skbio_seqs = [DNA(seq, id=str(id)) for id_, seq in sequences]
skbio_gapped_seqs = [DNA(seq, id=str(id)) for id_, seq in gapped_sequences]

In [25]:
@skbio_023_benchmark
def object_creation():
    for id_, seq in sequences:
        DNA(seq, id=id_)

1000 loops, best of 3: 695 µs per loop


In [26]:
@skbio_023_benchmark
def object_creation_validate():
    for id_, seq in sequences:
        DNA(seq, id=id_, validate=True)

10 loops, best of 3: 56.2 ms per loop


In [27]:
@skbio_023_benchmark
def read_fasta_file():
    list(skbio.io.read(get_reference_sequences(), format='fasta'))

1 loops, best of 3: 1.23 s per loop


In [28]:
@skbio_023_benchmark
def reverse_complement():
    for s in skbio_seqs:
        s.reverse_complement()

1 loops, best of 3: 173 ms per loop


In [29]:
@skbio_023_benchmark
def search_for_motif():
    for s in skbio_seqs:
        s.regex_iter("("+motif_1+")")

1000 loops, best of 3: 405 µs per loop


In [30]:
@skbio_023_benchmark
def translate():
    gen_code = genetic_code(1)
    for s in skbio_seqs:
        gen_code.translate(s)
        

1 loops, best of 3: 2.21 s per loop


In [31]:
# @skbio_023_benchmark
# def reverse_translate():
#     gen_code = genetic_code(1)
#     for s in skbio_seqs:
#         gen_code.translate(''.join(reversed(s)))

In [32]:
@skbio_023_benchmark
def filter_fasta_to_no_gaps():
    [s for s in skbio_gapped_seqs if not s.is_gapped()]

1000 loops, best of 3: 780 µs per loop


In [33]:
@skbio_023_benchmark
def degap_all():
    for s in skbio_gapped_seqs:
        s.degap()

1 loops, best of 3: 648 ms per loop


In [34]:
@skbio_023_benchmark
def kmer_count_5():
    for s in skbio_seqs:
        s.k_word_counts(5, overlapping=True)

1 loops, best of 3: 5.73 s per loop


In [35]:
@skbio_023_benchmark
def kmer_count_25():
    for s in skbio_seqs:
        s.k_word_counts(25, overlapping=True)

1 loops, best of 3: 5.76 s per loop


In [36]:
@skbio_023_benchmark
def validate_chars():
    for s in skbio_seqs:
        s.is_valid()

10 loops, best of 3: 58.5 ms per loop


In [37]:
@skbio_023_benchmark
def filter_invalid_seqs():
    [s for s in skbio_seqs if s.is_valid()]

10 loops, best of 3: 55.9 ms per loop


In [38]:
@skbio_023_benchmark
def expand_degenerates():
    for s in skbio_seqs:
        list(s.nondegenerates())

1 loops, best of 3: 416 ms per loop


In [39]:
@skbio_023_benchmark
def gc_content():
    for s in skbio_seqs:
        gc = 0
        for i in s:
            if i in ['G', 'C']:
                gc += 1
        gc * 1.0 / len(s)

10 loops, best of 3: 63.8 ms per loop


In [40]:
@skbio_023_benchmark
def find_motif_in_gapped():
    m_len = len(motif_1)
    for s in skbio_gapped_seqs:
        s_ungapped = str(s.degap())
        m = s.gap_maps()
        found = s_ungapped.find(motif_1)
        if found > 0:
            start = m[0][found]
            end   = start + m_len
            s[start:end]

1 loops, best of 3: 5.6 s per loop


In [41]:
@skbio_023_benchmark
def slice_at_midpoint():
    for e in skbio_seqs:
        e[:len(e)]

100 loops, best of 3: 2.82 ms per loop


In [42]:
skbio_023_benchmark.record("skbio023.csv")