# Dependency code

In [None]:
# Execute the shared benchmarking helper script so that the names it defines
# become available in this notebook.
# NOTE(review): presumably this is where multi_cmdbench,
# save_multibench_results, read_multibench_results and plot_resources (used in
# later cells) come from -- confirm against multibench.py.
%run ../lib/cmdbench/bioinformatics/multibench.py

COBS must be built directly from source (<https://github.com/bingmann/cobs>). The version used here is commit `7c030bbb4804b142f4336a1808af1095d9398f4a`.

# Benchmarking

In [None]:
import sys
import os
import shutil
import numpy as np
import glob
import cmdbench

def clean_if_exists(path):
    """Remove whatever currently lives at ``path``.

    A regular file is deleted. Anything else that exists is treated as a
    directory tree: it is removed recursively and then recreated empty.
    A path that does not exist is left untouched.
    """
    if not os.path.exists(path):
        return
    if os.path.isfile(path):
        os.remove(path)
        return
    # Wipe the contents but leave an empty directory behind.
    shutil.rmtree(path)
    os.mkdir(path)
            
def reset_func():
    """Clear the staged samples and the index files written under cobs-data."""
    stale_paths = (
        "cobs-data/samples/",
        "cobs-data/example.cobs_compact",
        "cobs-data/example.cobs_compact.tmp",
    )
    for stale_path in stale_paths:
        clean_if_exists(stale_path)
            
def get_last_n_lines(string, n):
    """Return the last ``n`` newline-separated lines of ``string``.

    Args:
        string: Text whose lines are separated by ``"\\n"``.
        n: Number of trailing lines to keep.

    Returns:
        The last ``n`` lines joined with ``"\\n"``. The whole string is
        returned when it has fewer than ``n`` lines, and an empty string
        when ``n`` is zero or negative.
    """
    if n <= 0:
        # A slice of [-0:] selects the whole list, so "no lines" needs its
        # own branch instead of falling through to the slice below.
        return ""
    return "\n".join(string.split("\n")[-n:])
    
def benchmark_list_to_results(benchmark_firsts_list):
    """Collapse a list of benchmark results into a single summary dict.

    Memory and the two disk counters are reported as the maximum seen across
    the results; runtime is the sum of the individual execution times.
    """
    peak_memory = max(res.memory.max for res in benchmark_firsts_list)
    peak_read = max(res.disk.read_chars for res in benchmark_firsts_list)
    peak_write = max(res.disk.write_chars for res in benchmark_firsts_list)
    total_runtime = sum(res.process.execution_time for res in benchmark_firsts_list)
    return {
        "memory": peak_memory,
        "disk_read": peak_read,
        "disk_write": peak_write,
        "runtime": total_runtime,
    }

def create_folder_if_doesnt_exist(path):
    """Create ``path`` (including missing parents) unless something is already there."""
    if os.path.exists(path):
        return
    os.makedirs(path)

# Summarize numpy array if it has more than 10 elements
np.set_printoptions(threshold=10)

input_dir = 'input'

# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# that input_samples[:n] selects the same samples on every run and machine.
input_files_1 = sorted(os.path.basename(f) for f in glob.glob(f'{input_dir}/*_1.fastq.gz'))
# A sample's name is its "_1" read file name with the '_1.fastq.gz' part removed.
input_samples = [f.replace('_1.fastq.gz','') for f in input_files_1]
print(input_samples)

In [None]:
# Make sure the working directories exist before benchmarking:
# cobs-data/samples is where sampling_func places its symlinks.
create_folder_if_doesnt_exist("cobs-data")
create_folder_if_doesnt_exist("cobs-data/samples")

In [None]:
# Values substituted into the cobs command lines defined below.
nproc = 4    # passed as --threads when building and as -j when querying
kmer = 31    # passed as -k
hashes = 3   # passed as --num-hashes

# Number of samples staged for each benchmark round.
sample_sizes = [1, 10, 20, 30, 40, 50, 60, 70, 80, 85]

def sampling_func(sample_size):
    """Stage the first ``sample_size`` samples for indexing.

    Symlinks both read files (``_1`` and ``_2``) of each selected sample from
    ``input_dir`` into ``cobs-data/samples/``.

    Args:
        sample_size: Number of leading entries of ``input_samples`` to stage.

    Returns:
        A one-element list holding the directory the links were created in.
    """
    samples_dir = "cobs-data/samples/"
    for sample in input_samples[:sample_size]:
        for mate in ("1", "2"):
            file_name = f"{sample}_{mate}.fastq.gz"
            # Absolute target, so the link does not depend on the directory
            # it is resolved from.
            target = os.path.abspath(f"{input_dir}/{file_name}")
            link = f"{samples_dir}{file_name}"
            # os.symlink raises FileExistsError if the path is taken; replace
            # a leftover entry so staging can be repeated safely.
            if os.path.lexists(link):
                os.remove(link)
            os.symlink(target, link)
    return [samples_dir]

# Indexing step: one cobs compact-construct run over everything staged in
# cobs-data/samples/, writing the index to cobs-data/example.cobs_compact.
build_command = {
    "command": f"cobs compact-construct -k {kmer} --threads {nproc} --num-hashes {hashes} cobs-data/samples/ cobs-data/example.cobs_compact",
}

# Query step: the same sequence is submitted query_repeat_count times.
# NOTE(review): how "%" and the parallel_* keys are expanded is decided by
# multi_cmdbench, which is not defined in this notebook -- confirm there.
query_sequence = "GAAGAAGATGGTGTACGCGGTGCGCGCCGCTATCTCGACCACCTTAAAATGGAATATGCCTTCTGGATGGACG"
query_repeat_count = 2

query_command = {
    "command": "cobs query -i cobs-data/example.cobs_compact %",
    "parallel_argfiles": [query_sequence for _ in range(query_repeat_count)],
    "parallel_args": f"-j {nproc} -I%",
}

multibench_results, debug_str = multi_cmdbench(
    {
        "index": [build_command],
        "query": [query_command],
    },
    reset_func=reset_func,
    iterations=1,
    sampling_func=sampling_func,
    sample_sizes=sample_sizes,
    benchmark_list_to_results=benchmark_list_to_results,
    active_output_print=True,
)


# Save and reload results

In [None]:
save_path = "cobs-results.txt"

# For every benchmarked sample size, record which samples it covered: the
# leading slice of input_samples of that length.
samples_per_sample_size = [input_samples[:size] for size in sample_sizes]

save_multibench_results(multibench_results, samples_per_sample_size, save_path)

In [None]:
# Load the results back from save_path, replacing the in-memory values, so the
# plotting cells below can be run from the saved file.
multibench_results, samples_per_sample_size = read_multibench_results(save_path)
print(samples_per_sample_size)

# Plot

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np
from pylab import rcParams
rcParams['figure.figsize'] = 15, 3

In [None]:
# Indexing Plots
# Plot the results stored under the "index" key (the cobs compact-construct
# step) against the benchmarked sample sizes.
# NOTE(review): which metrics are drawn is decided by plot_resources, which is
# not defined in this notebook -- confirm there.
plot_resources(multibench_results, sample_sizes, "index")

In [None]:
# Querying Plots
# Plot the results stored under the "query" key (the cobs query step) against
# the benchmarked sample sizes.
# NOTE(review): which metrics are drawn is decided by plot_resources, which is
# not defined in this notebook -- confirm there.
plot_resources(multibench_results, sample_sizes, "query")