# Compilation of aligner execution times table

Compiles the execution times reported in the aligner output files into the table in Chapter 2.

In [1]:
import os

from datetime import datetime
from collections import defaultdict

# Resolve the directory holding the timing files and make it the working
# directory, so the bare filenames opened in later cells resolve.
# os.chdir() mutates process state: re-running this cell used to build
# '<cwd>/exec_times/exec_times/' and fail. When the kernel is already inside
# 'exec_times' (and no nested directory of that name exists) we stay put.
_cwd = os.getcwd()
_already_inside = (
    os.path.basename(_cwd) == 'exec_times'
    and not os.path.isdir(_cwd + '/exec_times/')
)
# root_path always ends with '/' because later cells concatenate onto it.
root_path = _cwd + '/' if _already_inside else _cwd + '/exec_times/'
os.chdir(root_path)

## *i)* Other aligner timings

In [2]:
# Maps each third-party aligner's short name to the file holding its timing
# output; every file follows the same '<aligner>_ecoli_sim.txt' pattern.
aligners_filenames = {
    aligner_name: f'{aligner_name}_ecoli_sim.txt'
    for aligner_name in ('bt2', 'bwa', 'mosaik', 'segemehl')
}

In [3]:
# Per-aligner wall-clock timings: timings[aligner][read set name] -> whole seconds.
# The inner defaultdict has no default factory, so it behaves like a plain dict.
timings = defaultdict(defaultdict)

for aligner, fn in aligners_filenames.items():
    # Reset per file so a `real` line can never be attributed to a read set
    # name left over from the previously parsed aligner log.
    reads_name = None
    with open(fn, 'r') as timings_fh:
        for line in timings_fh:
            if line.startswith('sim_E_coli_'):
                reads_name = line.strip()
            elif line.startswith('real'):
                if reads_name is None:
                    print(f'{fn}: ignoring timing with no preceding read set name: {line.strip()}')
                    continue
                # bash `time` output, e.g. `real<TAB>6m14.123s`. Split on the
                # unit markers instead of slicing off a fixed five characters,
                # so any number of decimal places parses.
                timing = line.partition('real')[2].strip()
                mins, sep, secs = timing.rstrip('s').partition('m')
                if not sep:
                    # No minutes component (plain seconds, e.g. `real 12.34`).
                    mins, secs = '0', mins
                # Fractional seconds are truncated (not rounded); ',' covers
                # locales that print a decimal comma.
                whole_secs = secs.replace(',', '.').partition('.')[0]
                timings[aligner][reads_name] = int(mins)*60 + int(whole_secs)

timings

defaultdict(collections.defaultdict,
            {'bt2': defaultdict(None,
                         {'sim_E_coli_1000000x400_ind0_sub0': 374,
                          'sim_E_coli_1000000x400_ind0_sub1': 4314,
                          'sim_E_coli_1000000x400_ind0_sub2': 4346,
                          'sim_E_coli_1000000x400_ind0_sub3': 4144,
                          'sim_E_coli_1000000x400_ind0_sub4': 3938,
                          'sim_E_coli_1000000x400_ind0_sub5': 3751,
                          'sim_E_coli_1000000x400_ind1_sub0': 4233,
                          'sim_E_coli_1000000x400_ind1_sub1': 4271,
                          'sim_E_coli_1000000x400_ind2_sub0': 4249,
                          'sim_E_coli_1000000x400_ind2_sub2': 4004,
                          'sim_E_coli_1000000x400_ind3_sub0': 3977,
                          'sim_E_coli_1000000x400_ind3_sub3': 3559,
                          'sim_E_coli_1000000x400_ind4_sub0': 3708,
                          'sim_E_coli_1000

In [4]:
# Total wall-clock seconds per aligner, summed over all of its read sets.
overall_timings = {
    aligner: sum(per_read_set.values())
    for aligner, per_read_set in timings.items()
}

# Report one "<aligner> <total>s" line per aligner.
for aligner, aligner_total in overall_timings.items():
    print(f'{aligner} {aligner_total}s')

bt2 58660s
bwa 13925s
mosaik 25129s
segemehl 196682s


## *ii)* Time series aligner timings

The directory is named `last_resort_ecoli` because of an *unfounded* fear that the timings would have to be recovered from file modification times; thankfully, that proved unnecessary.

In [6]:
# Collect the overall alignment time (in seconds) reported by each
# time-series aligner summary file, keyed by the summary's filename.
ts_path = root_path + 'last_resort_ecoli/'

approaches = ('dft', 'dwt', 'paa')
time_prefix = '> Overall alignment time '

timings_ts = {}

for approach in approaches:
    approach_dir = f'{ts_path}ecoli_{approach}/'
    for fn in os.listdir(approach_dir):
        # Only read-set summaries are of interest; skip anything else.
        if not fn.startswith('sim_E_coli_'):
            continue
        with open(f'{approach_dir}{fn}', 'r') as summary_fh:
            for line in summary_fh:
                if not line.startswith(time_prefix):
                    continue
                # The value sits between the fixed prefix and ' in seconds'.
                value_text, _, _ = line[len(time_prefix):].partition(' in seconds')
                timings_ts[fn] = int(value_text)

timings_ts

{'sim_E_coli_1000000x400_ind0_sub0.fastq_Voss_indicators_100kmer_DFT_008complvl_non_exhaustive_settings_non_shuffle_reads_KNN_5.txt': 2548,
 'sim_E_coli_1000000x400_ind0_sub0.fastq_Voss_indicators_100kmer_DWT_008complvl_non_exhaustive_settings_non_shuffle_reads_KNN_5.txt': 1906,
 'sim_E_coli_1000000x400_ind0_sub0.fastq_Voss_indicators_100kmer_PAA_008complvl_non_exhaustive_settings_non_shuffle_reads_KNN_5.txt': 1941,
 'sim_E_coli_1000000x400_ind0_sub0.fastq_Voss_indicators_150kmer_DFT_008complvl_non_exhaustive_settings_non_shuffle_reads_KNN_5.txt': 2521,
 'sim_E_coli_1000000x400_ind0_sub0.fastq_Voss_indicators_150kmer_DWT_008complvl_non_exhaustive_settings_non_shuffle_reads_KNN_5.txt': 1773,
 'sim_E_coli_1000000x400_ind0_sub0.fastq_Voss_indicators_150kmer_PAA_008complvl_non_exhaustive_settings_non_shuffle_reads_KNN_5.txt': 1923,
 'sim_E_coli_1000000x400_ind0_sub0.fastq_Voss_indicators_200kmer_DFT_008complvl_non_exhaustive_settings_non_shuffle_reads_KNN_5.txt': 2538,
 'sim_E_coli_1000000

In [7]:
# Split the time-series timings by transform, using the tag embedded in
# each summary filename.
timings_ts_dft, timings_ts_dwt, timings_ts_paa = (
    {fn: secs for fn, secs in timings_ts.items() if tag in fn}
    for tag in ('DFT', 'DWT', 'PAA')
)

**Since the results comprise runs for six *k*-mer sizes, we take the mean over them:**

In [10]:
# Average each time-series approach's summed time over the k-mer settings,
# truncating to whole seconds.
# NOTE(review): the result counts shown in this notebook are 80/79/79 files,
# and the throughput cell assumes 16 read sets -- confirm that 6 is the
# right divisor and that the missing DWT/PAA results are acceptable.
n_kmer_settings = 6

for label, per_file in (
    ('ts-dft', timings_ts_dft),
    ('ts-dwt', timings_ts_dwt),
    ('ts-paa', timings_ts_paa),
):
    overall_timings[label] = int(sum(per_file.values()) / n_kmer_settings)

In [11]:
# Display the combined totals (in seconds) for every aligner and approach.
overall_timings

{'bt2': 58660,
 'bwa': 13925,
 'mosaik': 25129,
 'segemehl': 196682,
 'ts-dft': 59319,
 'ts-dwt': 44716,
 'ts-paa': 47747}

**One result is missing for two of the implementations (DWT and PAA):**

In [8]:
# Number of result files found per approach, in DFT, DWT, PAA order.
for per_file in (timings_ts_dft, timings_ts_dwt, timings_ts_paa):
    print(len(per_file))

80
79
79


**Throughput calculation**

```
16*(400*1000000)/<time in seconds>
/1000 for kbp

```

In [17]:
# Throughput per aligner: total bases processed divided by total seconds,
# rounded to the nearest integer.
# Presumably 16 read sets of 1,000,000 reads x 400 bp each, going by the
# `1000000x400` in the read set names -- TODO confirm.
total_bases = 16*(400*1000000)

overall_throughput = {
    aligner: int(round(total_bases/timing, 0))
    for aligner, timing in overall_timings.items()
}

overall_throughput

{'bt2': 109103,
 'bwa': 459605,
 'mosaik': 254686,
 'segemehl': 32540,
 'ts-dft': 107891,
 'ts-dwt': 143126,
 'ts-paa': 134040}