In [3]:
from qiime_default_reference import get_reference_sequences

from test_data import gapped_sequences, sequences, motif_1
from cogent.core.sequence import NucleicAcidSequence, DnaSequence
from cogent.core.alphabet import AlphabetError
from cogent.core.genetic_code import DEFAULT as standard_code
from benchmark import Benchmark
cogent_benchmark = Benchmark("pycogent")

cogent_sequences = [DnaSequence(seq) for _, seq in sequences]
cogent_gapped_sequences = [DnaSequence(seq) for _, seq in gapped_sequences]

In [4]:
@cogent_benchmark
def object_creation():
    for id_, seq in sequences:
        DnaSequence(seq, check=False)

10 loops, best of 3: 34 ms per loop


In [5]:
@cogent_benchmark
def object_creation_validate():
    for id_, seq in sequences:
        DnaSequence(seq)

10 loops, best of 3: 56.9 ms per loop


In [6]:
@cogent_benchmark
def reverse_complement():
    for e in cogent_sequences:
        DnaSequence(e).rc()

1 loops, best of 3: 187 ms per loop


In [7]:
@cogent_benchmark
def degap_all():
    for e in cogent_gapped_sequences:
        DnaSequence(e).degap()

1 loops, best of 3: 291 ms per loop


In [8]:
@cogent_benchmark
def translate():
    for e in cogent_sequences:
        try:
            e.getTranslation()
        except AlphabetError:
            pass

10 loops, best of 3: 123 ms per loop


In [9]:
@cogent_benchmark
def search_for_motif():
    for e in cogent_sequences:
        str(e).find(motif_1)

100 loops, best of 3: 7.24 ms per loop


In [10]:
from collections import defaultdict

@cogent_benchmark
def kmer_count_5():
    for e in cogent_sequences:
        d = defaultdict(int)
        for i in range(len(e)-5+1):
            d[e[i:i+5]] += 1

1 loops, best of 3: 35.7 s per loop


In [11]:
@cogent_benchmark
def kmer_count_25():
    for e in cogent_sequences:
        d = defaultdict(int)
        for i in range(len(e)-25+1):
            d[e[i:i+25]] += 1

1 loops, best of 3: 33.3 s per loop


In [12]:
x = [DnaSequence(seq, check=False) for _, seq in sequences]

@cogent_benchmark
def validate_chars():
    for e in x:
        e.isValid()

10 loops, best of 3: 72.2 ms per loop


In [13]:
@cogent_benchmark
def filter_invalid_seqs():
    l = []
    for id_, seq in sequences:
        try:
            l.append(DnaSequence(seq, check=True))
        except AlphabetError:
            pass

10 loops, best of 3: 48.2 ms per loop


In [14]:
from cogent.seqsim.sequence_generators import SequenceGenerator, IUPAC_DNA

def expand_degeneracies(raw_primer):
    """Cut from PrimerProspector: 
    http://sourceforge.net/p/pprospector/code/HEAD/tree/tags/1.0.1-release/primerprospector/check_primer_barcode_dimers.py
    """
    
    primers = SequenceGenerator(template=raw_primer, alphabet=IUPAC_DNA)
    expanded_primers = []
    for primer in primers:
        expanded_primers.append(primer)
        
    return expanded_primers

@cogent_benchmark
def expand_degenerates():
    for e in cogent_sequences:
        expand_degeneracies(e)
        

1 loops, best of 3: 1.03 s per loop


In [15]:
@cogent_benchmark
def gc_content():
    for e in cogent_sequences:
        (e.count('G') + e.count('C')) / len(e)

100 loops, best of 3: 6.99 ms per loop


In [16]:
@cogent_benchmark
def find_motif_in_gapped():
    for e in cogent_sequences:
        gapped, ungapped = e.gapMaps()
        degapped_e = e.degap()
        try:
            start = str(degapped_e).index(motif_1)
        except ValueError:
            pass
        

1 loops, best of 3: 550 ms per loop


In [17]:
from cogent.parse.fasta import MinimalFastaParser
@cogent_benchmark
def read_fasta_file():
    for e in MinimalFastaParser(open(get_reference_sequences())):
        DnaSequence(e[1], Name=e[0])

1 loops, best of 3: 8.35 s per loop


In [18]:
@cogent_benchmark
def slice_at_midpoint():
    for e in cogent_sequences:
        e[:len(e)]

10 loops, best of 3: 38.2 ms per loop


In [19]:
cogent_benchmark.record("cogent.csv")