# Dependency code

In [None]:
import imp
import os
import sys
import numpy as np
import glob
import cmdbench

# Load the project-local ``benchmark`` helper from ../lib by path.
fp, pathname, description = imp.find_module('benchmark', ['../lib'])
try:
    benchmark = imp.load_module('benchmark', fp, pathname, description)
finally:
    # find_module hands back an open file object for plain modules (None for
    # packages); closing it is the caller's job, even when loading fails.
    if fp is not None:
        fp.close()

# Summarize numpy array if it has more than 10 elements
np.set_printoptions(threshold=10)

# Software versions

In [None]:
# Print the version of snippy found in the "snippy" conda environment
!conda run --name snippy snippy --version

# Benchmarking

## Input data and constants

In [None]:
# Sample names are the basenames of the *_1.fastq.gz files with that suffix removed.
input_dir = '../data/input-files/reads'
input_files_1 = [
    os.path.basename(path)
    for path in glob.glob(f'{input_dir}/*_1.fastq.gz')
]
input_samples = sorted(name.replace('_1.fastq.gz', '') for name in input_files_1)
print(input_samples)

# Working directory for snippy output and the job count used by the benchmark commands.
snippy_out = 'snippy-output'
nproc = 32

reference_genome = "../data/input-files/2011C-3609.fasta"

# Number of samples included in each benchmark round.
sample_sizes = [1, 10, 20, 30, 40, 50, 60, 70, 80]
#sample_sizes = [3]

# Make sure the output directory exists, then clean it before benchmarking.
benchmark.create_folder_if_doesnt_exist(snippy_out)
benchmark.clean_if_exists(snippy_out)

## Benchmark functions

In [None]:
def reset_func():
    """Clean the snippy output directory (``snippy_out``).

    Handed to ``multi_cmdbench`` as its ``reset_func`` argument.
    """
    # NOTE(review): what "clean" means is defined by benchmark.clean_if_exists
    # (presumably it empties the directory) — confirm in ../lib/benchmark.
    benchmark.clean_if_exists(snippy_out)
    
def sampling_func(sample_size):
    """Prepare the snippy input files for the first ``sample_size`` samples.

    Writes ``samples.tab`` into ``snippy_out`` (one tab-separated line per
    sample: name, ``_1`` reads path, ``_2`` reads path), asks ``snippy-multi``
    to generate ``snippy-commands.sh`` from it, and splits that script into
    ``snippy-align-commands.sh`` (every line not mentioning ``snippy-core``)
    and ``snippy-core-commands.sh`` (the lines that do).

    Args:
        sample_size: Number of names to take from the front of ``input_samples``.

    Returns:
        The list of sample names selected for this round.
    """
    samples = input_samples[:sample_size]

    # Absolute paths keep the generated commands valid when they are executed
    # from a different working directory.
    input_dir_abs = os.path.abspath(input_dir)
    reference_genome_abs = os.path.abspath(reference_genome)

    samples_tab_file_lines = [
        f"{sample}\t{input_dir_abs}/{sample}_1.fastq.gz\t{input_dir_abs}/{sample}_2.fastq.gz"
        for sample in samples
    ]

    # The context manager closes the handle even if the write fails.
    with open(f"{snippy_out}/samples.tab", "w") as samples_tab_file:
        samples_tab_file.write("\n".join(samples_tab_file_lines) + "\n")

    # NOTE: exit statuses of the shell commands below are intentionally not
    # checked (grep exits non-zero when nothing matches).
    os.system(f"conda run --name snippy snippy-multi {snippy_out}/samples.tab --ref {reference_genome_abs} --cpus 1 > {snippy_out}/snippy-commands.sh")

    # Split commands so I can run alignments in parallel
    os.system(f'grep -v "snippy-core" {snippy_out}/snippy-commands.sh > {snippy_out}/snippy-align-commands.sh')
    os.system(f'grep "snippy-core" {snippy_out}/snippy-commands.sh > {snippy_out}/snippy-core-commands.sh')

    return samples

## Benchmark

In [None]:
# Command descriptions handed to multi_cmdbench: first the per-sample
# alignment commands (run through `parallel`), then the snippy-core commands.
snippy_align_command = dict(
    use_parallel=False,
    command=f"conda run --name snippy cd {snippy_out}; parallel -j {nproc} -a snippy-align-commands.sh",
)

snippy_core_command = dict(
    use_parallel=False,
    command=f"conda run --name snippy cd {snippy_out}; bash snippy-core-commands.sh",
)

# Both commands are grouped under the single "build_tree" key.
multibench_results, debug_str = benchmark.multibench.multi_cmdbench(
    {"build_tree": [snippy_align_command, snippy_core_command]},
    reset_func=reset_func,
    iterations=1,
    sampling_func=sampling_func,
    sample_sizes=sample_sizes,
    benchmark_list_to_results=benchmark.benchmark_list_to_results,
    active_output_print=False,
)

print('Done')

# Save and reload results

In [None]:
save_path = "snippy_results.txt"

# One list of sample names per benchmark round, matching what sampling_func selects.
samples_per_sample_size = [input_samples[:size] for size in sample_sizes]

benchmark.multibench.save_multibench_results(multibench_results, samples_per_sample_size, save_path)

# Read the results back from disk so the following cells work from the saved file.
multibench_results, samples_per_sample_size = benchmark.multibench.read_multibench_results(save_path)
print(samples_per_sample_size)

# Plot

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np
from pylab import rcParams
rcParams['figure.figsize'] = 15, 3

In [None]:
# Plot the benchmark results stored under the "build_tree" key for each sample size
benchmark.multibench.plot_resources(multibench_results, sample_sizes, "build_tree")