In [1]:
from test_data import gapped_sequences, sequences, motif_1
from benchmark import Benchmark
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC, generic_dna, _verify_alphabet
from Bio.SeqUtils import GC
from Bio import SeqIO, motifs
from qiime_default_reference import get_reference_sequences

# Benchmark instance; it is applied as a decorator to each timed function below.
biopy_benchmark = Benchmark("biopython")

# Seq objects built once up front and reused by the benchmarks below.
biopy_seqs = [Seq(raw, generic_dna) for seq_id, raw in sequences]
gapped_biopy_seqs = [
    Seq(raw, generic_dna) for seq_id, raw in gapped_sequences
]

In [2]:
@biopy_benchmark
def object_creation():
    """Build a generic-DNA ``Seq`` from every raw test sequence.

    Each object is discarded immediately; only the construction is exercised.
    """
    for _seq_id, raw in sequences:
        Seq(raw, generic_dna)

1000 loops, best of 3: 425 µs per loop


In [3]:
@biopy_benchmark
def object_creation_validate():
    """Build an ambiguous-DNA ``Seq`` from every raw sequence and verify it.

    Verification goes through the private helper ``_verify_alphabet``; its
    return value is discarded.
    """
    for _seq_id, raw in sequences:
        # ``IUPAC.ambiguous_dna`` is the alphabet *instance* that ``Seq``
        # expects; ``IUPAC.IUPACAmbiguousDNA`` is the class itself.
        _verify_alphabet(Seq(raw, IUPAC.ambiguous_dna))

1 loops, best of 3: 360 ms per loop


In [4]:
@biopy_benchmark
def read_fasta_file():
    """Parse every record of the reference FASTA file into a list.

    The list is discarded; only the parsing is exercised.
    """
    # The context manager closes the handle after each run, so repeated
    # timing loops do not accumulate open file descriptors.
    with open(get_reference_sequences()) as fasta_handle:
        list(SeqIO.parse(fasta_handle, 'fasta'))

1 loops, best of 3: 2.55 s per loop


In [5]:
@biopy_benchmark
def reverse_complement():
    """Reverse-complement every pre-built sequence, discarding the results."""
    for dna in biopy_seqs:
        dna.reverse_complement()

100 loops, best of 3: 4.56 ms per loop


In [6]:
# Motif built from the single test pattern, created once outside the
# benchmarked function.
instances = [Seq(motif_1)]
m = motifs.create(instances)

@biopy_benchmark
def search_for_motif():
    """Iterate over every hit ``m.instances.search`` yields per sequence."""
    for target in biopy_seqs:
        # The hits themselves are not used; the loop only exhausts the search.
        for _offset, _hit in m.instances.search(target):
            pass
    

1 loops, best of 3: 2.15 s per loop


In [7]:
@biopy_benchmark
def translate():
    """Translate every pre-built sequence, discarding the results."""
    for dna in biopy_seqs:
        dna.translate()


1 loops, best of 3: 298 ms per loop




In [8]:
# Mixed pool of ungapped and gapped sequences for the filter to work on.
seqs = biopy_seqs + gapped_biopy_seqs
@biopy_benchmark
def filter_fasta_to_no_gaps():
    """Keep only the sequences that contain neither '-' nor '.'.

    The filtered list is discarded; only the filtering is exercised.
    """
    # The original predicate was inverted: it retained the sequences that
    # *do* contain a gap character, the opposite of what the name promises.
    [s for s in seqs if '-' not in s and '.' not in s]

100 loops, best of 3: 5.63 ms per loop


In [9]:
@biopy_benchmark
def degap_all():
    """Remove '-' and then '.' from every gapped sequence, discarding the results."""
    for gapped in gapped_biopy_seqs:
        without_dashes = gapped.ungap('-')
        without_dashes.ungap('.')

10 loops, best of 3: 69.3 ms per loop


In [10]:
from collections import defaultdict

@biopy_benchmark
def kmer_count_5():
    """Tally every overlapping 5-letter window of each pre-built sequence.

    A fresh tally is created per sequence and discarded; the keys are the
    sliced ``Seq`` objects themselves.
    """
    k = 5
    for dna in biopy_seqs:
        tally = defaultdict(int)
        window_total = len(dna) - k + 1
        for start in range(window_total):
            tally[dna[start:start + k]] += 1

1 loops, best of 3: 9.39 s per loop




In [11]:
@biopy_benchmark
def kmer_count_25():
    """Tally every overlapping 25-letter window of each pre-built sequence.

    A fresh tally is created per sequence and discarded; the keys are the
    sliced ``Seq`` objects themselves.
    """
    k = 25
    for dna in biopy_seqs:
        tally = defaultdict(int)
        window_total = len(dna) - k + 1
        for start in range(window_total):
            tally[dna[start:start + k]] += 1

1 loops, best of 3: 6.99 s per loop


In [12]:
# We were unable to do this with a public method, so the private
# ``_verify_alphabet`` helper is used.
# ``IUPAC.ambiguous_dna`` is the alphabet *instance* that ``Seq`` expects;
# ``IUPAC.IUPACAmbiguousDNA`` is the class itself.
verify_seqs = [Seq(seq, IUPAC.ambiguous_dna) for ident, seq in sequences]
@biopy_benchmark
def validate_chars():
    """Run ``_verify_alphabet`` on every pre-built ambiguous-DNA sequence.

    The return values are discarded; only the check itself is exercised.
    """
    for seq in verify_seqs:
        _verify_alphabet(seq)


1 loops, best of 3: 356 ms per loop


In [13]:
# We were unable to do this with a public method.
@biopy_benchmark
def filter_invalid_seqs():
    """Collect the sequences that ``_verify_alphabet`` accepts, dropping the rest.

    The resulting list is discarded; only the filtering is exercised.
    """
    accepted = [
        candidate for candidate in verify_seqs if _verify_alphabet(candidate)
    ]
    list(accepted)

1 loops, best of 3: 372 ms per loop


In [14]:
# Not benchmarked: we were unable to perform this operation in Biopython.
# @biopy_benchmark
# def expand_degenerates():
#     pass

In [15]:
@biopy_benchmark
def gc_content():
    """Call ``GC`` on every pre-built sequence, discarding the results."""
    for dna in biopy_seqs:
        GC(dna)

100 loops, best of 3: 18 ms per loop


In [16]:
@biopy_benchmark
def slice_at_midpoint():
    """Slice off the first half of every pre-built sequence.

    The slices are discarded; only the slicing is exercised.
    """
    for e in biopy_seqs:
        # The original used ``e[:len(e)]``, which slices the entire sequence
        # instead of stopping at the midpoint. Floor division keeps the
        # index an integer on both Python 2 and Python 3.
        e[:len(e) // 2]

1000 loops, best of 3: 1.09 ms per loop


In [17]:
# Not benchmarked: we were unable to perform this operation in Biopython.
# @biopy_benchmark
# def find_motif_in_gapped():
#     pass

In [18]:
# Pass the output path ./biopy.csv to the benchmark object's record() method.
# NOTE(review): presumably this writes the collected timings as CSV — confirm
# against the Benchmark class in the local `benchmark` module.
biopy_benchmark.record("./biopy.csv")