# 1. Parameters

In [1]:
# Defaults
# NOTE(review): 'simulations/unset' looks like a placeholder that is meant to be
# overridden per run (e.g. by a parameter-injection tool) — confirm.
simulation_dir = 'simulations/unset'  # root of one simulation case; every input/output path below is derived from it
reference_file = 'simulations/reference/reference.fa.gz'  # handed to IndexBenchmarker as reference_file
iterations = 3  # number of benchmark repetitions, handed to benchmarker.benchmark()
mincov = 10  # handed to IndexBenchmarker as mincov
ncores = 32  # handed to IndexBenchmarker as ncores

In [2]:
import importlib.machinery
import importlib.util
import sys
from pathlib import Path

# Load the local `gdi_benchmark` helper from ../../lib without requiring it to be installed.
# This uses importlib instead of the deprecated `imp` module (removed in Python 3.12), and
# no longer leaves open the file handle that `imp.find_module` returned.
_gdi_benchmark_spec = importlib.machinery.PathFinder.find_spec('gdi_benchmark', ['../../lib'])
if _gdi_benchmark_spec is None:
    # PathFinder returns None where imp.find_module raised; keep the ImportError behaviour.
    raise ImportError("No module named 'gdi_benchmark' (searched in '../../lib')")
gdi_benchmark = importlib.util.module_from_spec(_gdi_benchmark_spec)
# Register before executing, mirroring what imp.load_module did.
sys.modules['gdi_benchmark'] = gdi_benchmark
_gdi_benchmark_spec.loader.exec_module(gdi_benchmark)

# Everything below is derived from the `simulation_dir` parameter.
simulation_dir_path = Path(simulation_dir)

# The last path component is used as the label of the benchmark results.
case_name = str(simulation_dir_path.name)

# Simulated inputs (reads and assemblies) of this case.
reads_dir = simulation_dir_path / 'simulated_data' / 'reads'
assemblies_dir = simulation_dir_path / 'simulated_data' / 'assemblies'

# Locations of the two indexes that get benchmarked.
index_reads_path = simulation_dir_path / 'index-reads'
index_assemblies_path = simulation_dir_path / 'index-assemblies'

# NOTE(review): not referenced in the visible cells — confirm it is still needed.
output_data_dir = 'output'

# 2. Index genomes

In [3]:
# Record the installed gdi version alongside the benchmark results.
!gdi --version

gdi, version 0.3.0.dev9


## 2.1. Index reads

In [4]:
# Listing of the simulated read files; later handed to IndexBenchmarker as input_files_file.
input_genomes_file = simulation_dir_path / 'input-reads.tsv'

# Run `gdi input` over every *.fq.gz in reads_dir and redirect its stdout into the listing file.
# NOTE(review): --absolute presumably makes the listed paths absolute — confirm against the gdi docs.
!gdi input --absolute {reads_dir}/*.fq.gz > {input_genomes_file}

In [5]:
# Results for this run are collected under the label "<case_name> reads".
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} reads')

# Benchmarker for the reads index; build_tree=True also requests the tree-building step.
benchmarker = gdi_benchmark.IndexBenchmarker(
    benchmark_results_handler=results_handler,
    index_path=index_reads_path,
    input_files_file=input_genomes_file,
    reference_file=reference_file,
    mincov=mincov,
    ncores=ncores,
    build_tree=True,
)

# Run the benchmark `iterations` times; the result is displayed and saved in the following cells.
benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630518475.2789054']
Removing any existing indexes simulations/unset/index-reads
Creating new index: [gdi init simulations/unset/index-reads]
Creating a new index took 2.84 seconds
Analysis running: [gdi --project-dir simulations/unset/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-reads.tsv]
Analysis took 1.68 minutes
Index running: [gdi --project-dir simulations/unset/index-reads --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630518673.2465343/gdi-input.fofn]
Indexing took 0.10 minutes
Building tree: [gdi --project-dir simu

A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630518673.2465343']
Removing any existing indexes simulations/unset/index-reads
Creating new index: [gdi init simulations/unset/index-reads]
Creating a new index took 3.05 seconds
Analysis running: [gdi --project-dir simulations/unset/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-reads.tsv]
Analysis took 1.69 minutes
Index running: [gdi --project-dir simulations/unset/index-reads --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630518787.9305496/gdi-input.fofn]
Indexing took 0.10 minutes
Building tree: [gdi --project-dir simu

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f9b6cdb8a30> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630518787.9305496']
Removing any existing indexes simulations/unset/index-reads
Creating new index: [gdi init simulations/unset/index-reads]
Creating a new index took 2.82 seconds
Analysis running: [gdi --project-dir simulations/unset/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-reads.tsv]
Analysis took 1.73 minutes
Index running: [gdi --project-dir simulations/unset/index-reads --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630518903.2066467/gdi-input.fofn]
Indexing took 0.10 minutes
Building tree: [gdi --project-dir simu

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f9b6c891c40> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [6]:
# Display the per-iteration benchmark results for the reads index.
benchmark_df

Unnamed: 0,Name,Iteration,Number samples,Number cores,Reference length,Analysis runtime,Analysis memory (max),Analysis memory (max/process),Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,unset reads,1,59,32,10834,100.82,2778575000.0,386371584.0,3270447000.0,5.81,5790167000.0,258936832.0,1429504.0,2.83,218873856.0,218873856.0,109.46,5790167000.0
0,unset reads,2,59,32,10834,101.36,2812486000.0,391815168.0,3270447000.0,5.63,5785166000.0,258347008.0,1433600.0,3.05,218939392.0,218939392.0,110.04,5785166000.0
0,unset reads,3,59,32,10834,103.63,2619900000.0,383447040.0,3270447000.0,5.6,5793993000.0,258801664.0,1421312.0,2.98,218488832.0,218488832.0,112.21,5793993000.0


In [7]:
# Save the reads benchmark table as a tab-separated file inside the simulation directory.
index_reads_runtime = simulation_dir_path / 'reads-index-info.tsv'
benchmark_df.to_csv(index_reads_runtime, sep='\t', index=False)  # index=False: the row index is not written

## 2.2. Index assemblies

In [8]:
# Listing of the simulated assembly files; later handed to IndexBenchmarker as input_files_file.
# Reuses (overwrites) the `input_genomes_file` name from the reads section.
input_genomes_file = simulation_dir_path / 'input-assemblies.tsv'

# Run `gdi input` over every *.fa.gz in assemblies_dir and redirect its stdout into the listing file.
# NOTE(review): --absolute presumably makes the listed paths absolute — confirm against the gdi docs.
!gdi input --absolute {assemblies_dir}/*.fa.gz > {input_genomes_file}

In [9]:
# Results for this run are collected under the label "<case_name> assemblies".
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} assemblies')

# Benchmarker for the assemblies index; build_tree=True also requests the tree-building step.
benchmarker = gdi_benchmark.IndexBenchmarker(
    benchmark_results_handler=results_handler,
    index_path=index_assemblies_path,
    input_files_file=input_genomes_file,
    reference_file=reference_file,
    mincov=mincov,
    ncores=ncores,
    build_tree=True,
)

# Run the benchmark `iterations` times; the result is displayed and saved in the following cells.
benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630518903.2066467']
Removing any existing indexes simulations/unset/index-assemblies
Creating new index: [gdi init simulations/unset/index-assemblies]
Creating a new index took 2.87 seconds
Analysis running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-assemblies.tsv]
Analysis took 1.56 minutes
Index running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630519023.3730545/gdi-input.fofn]
Indexing took 0.09 minutes
Building tree

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f9b98366700> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630519023.3730545']
Removing any existing indexes simulations/unset/index-assemblies
Creating new index: [gdi init simulations/unset/index-assemblies]
Creating a new index took 2.96 seconds
Analysis running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-assemblies.tsv]
Analysis took 1.55 minutes
Index running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630519129.5817757/gdi-input.fofn]
Indexing took 0.09 minutes
Building tree

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f9b9836c370> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630519129.5817757']
Removing any existing indexes simulations/unset/index-assemblies
Creating new index: [gdi init simulations/unset/index-assemblies]
Creating a new index took 2.83 seconds
Analysis running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-assemblies.tsv]
Analysis took 1.53 minutes
Index running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630519234.895392/gdi-input.fofn]
Indexing took 0.08 minutes
Building tree:

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f9b37b5e190> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [10]:
# Display the per-iteration benchmark results for the assemblies index.
benchmark_df

Unnamed: 0,Name,Iteration,Number samples,Number cores,Reference length,Analysis runtime,Analysis memory (max),Analysis memory (max/process),Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,unset assemblies,1,59,32,10834,93.19,1213432000.0,373661696.0,2082591000.0,5.3,5780230000.0,237645824.0,1064960.0,2.78,218537984.0,218537984.0,101.27,5780230000.0
0,unset assemblies,2,59,32,10834,92.77,1278243000.0,369811456.0,2082599000.0,5.29,5780476000.0,238088192.0,1069056.0,2.76,219455488.0,219455488.0,100.82,5780476000.0
0,unset assemblies,3,59,32,10834,91.7,1157829000.0,369258496.0,2082583000.0,4.95,5770764000.0,237805568.0,1060864.0,2.68,218931200.0,218931200.0,99.33,5770764000.0


In [11]:
# Save the assemblies benchmark table as a tab-separated file inside the simulation directory.
index_assemblies_runtime = simulation_dir_path / 'assemblies-index-info.tsv'
benchmark_df.to_csv(index_assemblies_runtime, sep='\t', index=False)  # index=False: the row index is not written