# 1. Parameters

In [1]:
# Defaults
# NOTE(review): this looks like a per-run parameters cell whose values may be
# overridden by an external notebook runner — confirm before restructuring it.

# Root directory of one simulation case; inputs are read from its
# 'simulated_data' sub-directory and indexes/result files are written below it.
simulation_dir = 'simulations/unset'
# Reference genome handed to the benchmarker; its file name (up to the first '.')
# is also used as the reference name when exporting trees.
reference_file = 'simulations/reference/reference.fa.gz'
# Number of times each benchmark (reads, assemblies) is repeated.
iterations = 3
# Passed to IndexBenchmarker as `mincov` (shows up as --reads-mincov in the logged command).
mincov = 10
# Passed to IndexBenchmarker as `ncores` (shows up as --ncores in the logged command).
ncores = 32

In [2]:
import importlib.machinery
import importlib.util
import sys
from pathlib import Path

# Load the project-local `gdi_benchmark` helper from ../../lib (relative to the
# notebook's working directory) without modifying sys.path.
# importlib replaces the deprecated `imp` module, which was removed in Python 3.12
# and whose find_module() could return an open file handle that was never closed here.
_gdi_benchmark_spec = importlib.machinery.PathFinder.find_spec('gdi_benchmark', ['../../lib'])
if _gdi_benchmark_spec is None:
    # Same exception type that imp.find_module() raised for a missing module.
    raise ImportError("Could not find module 'gdi_benchmark' in '../../lib'")
gdi_benchmark = importlib.util.module_from_spec(_gdi_benchmark_spec)
# Register before executing so imports performed inside the module can resolve it.
sys.modules['gdi_benchmark'] = gdi_benchmark
_gdi_benchmark_spec.loader.exec_module(gdi_benchmark)

# Derive every path used by the later cells from the configured simulation directory.
simulation_dir_path = Path(simulation_dir)

# The final directory component doubles as the label for this benchmark case.
case_name = simulation_dir_path.name

# Simulated inputs consumed by the `gdi input` cells.
reads_dir = simulation_dir_path.joinpath('simulated_data', 'reads')
assemblies_dir = simulation_dir_path.joinpath('simulated_data', 'assemblies')

# Index directories; each exported tree file lives inside its own index directory.
index_reads_path = simulation_dir_path.joinpath('index-reads')
output_reads_tree = index_reads_path.joinpath('reads.tre')
index_assemblies_path = simulation_dir_path.joinpath('index-assemblies')
output_assemblies_tree = index_assemblies_path.joinpath('assemblies.tre')

# Reference name = file name up to its first '.', e.g. 'reference.fa.gz' -> 'reference'.
reference_name = Path(reference_file).name.partition('.')[0]

# 2. Index genomes

In [3]:
# Print the version of the `gdi` command-line tool used for this run.
!gdi --version

gdi, version 0.3.0.dev9


## 2.1. Index reads

In [4]:
# File that receives the list of simulated read files; it is later passed to the
# benchmarker as `input_files_file`.
input_genomes_file = simulation_dir_path / 'input-reads.tsv'

# Run `gdi input` on every *.fq.gz file in reads_dir (expanded by the shell) and
# redirect its output into input_genomes_file. `{name}` is IPython interpolation
# of the Python variable `name` into the shell command.
!gdi input --absolute {reads_dir}/*.fq.gz > {input_genomes_file}

In [5]:
# Benchmark building the reads index; results are labelled '<case_name> reads'.
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} reads')

benchmarker = gdi_benchmark.IndexBenchmarker(
    benchmark_results_handler=results_handler,
    index_path=index_reads_path,
    input_files_file=input_genomes_file,
    reference_file=reference_file,
    mincov=mincov,
    ncores=ncores,
    build_tree=True,
)

# Repeat the benchmark `iterations` times and keep the returned results table.
benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630519234.895392']
Removing any existing indexes simulations/unset/index-reads
Creating new index: [gdi init simulations/unset/index-reads]
Creating a new index took 2.93 seconds
Analysis running: [gdi --project-dir simulations/unset/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-reads.tsv]
Analysis took 1.97 minutes
Index running: [gdi --project-dir simulations/unset/index-reads --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630526712.647205/gdi-input.fofn]
Indexing took 0.11 minutes
Building tree: [gdi --project-dir simula

A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630526712.647205']
Removing any existing indexes simulations/unset/index-reads
Creating new index: [gdi init simulations/unset/index-reads]
Creating a new index took 2.87 seconds
Analysis running: [gdi --project-dir simulations/unset/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-reads.tsv]
Analysis took 1.75 minutes
Index running: [gdi --project-dir simulations/unset/index-reads --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630526938.8724523/gdi-input.fofn]
Indexing took 0.10 minutes
Building tree: [gdi --project-dir simul

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f8a2f088e50> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630526938.8724523']
Removing any existing indexes simulations/unset/index-reads
Creating new index: [gdi init simulations/unset/index-reads]
Creating a new index took 2.85 seconds
Analysis running: [gdi --project-dir simulations/unset/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-reads.tsv]
Analysis took 1.69 minutes
Index running: [gdi --project-dir simulations/unset/index-reads --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630527139.785993/gdi-input.fofn]
Indexing took 0.10 minutes
Building tree: [gdi --project-dir simul

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f8a2f088e80> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [6]:
# Display the per-iteration benchmark results for the reads index.
benchmark_df

Unnamed: 0,Name,Iteration,Number samples,Number cores,Reference length,Analysis runtime,Analysis memory (max),Analysis memory (max/process),Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,unset reads,1,59,32,10834,118.0,2777940000.0,400138240.0,3270439000.0,6.48,5782909000.0,258179072.0,1425408.0,96.73,250138624.0,223268864.0,221.21,5782909000.0
0,unset reads,2,59,32,10834,104.74,2644685000.0,392278016.0,3270447000.0,6.07,5804933000.0,258678784.0,1433600.0,84.93,249937920.0,223731712.0,195.74,5804933000.0
0,unset reads,3,59,32,10834,101.57,2660401000.0,394743808.0,3270439000.0,5.79,5791068000.0,259006464.0,1433600.0,106.51,249761792.0,223404032.0,213.87,5791068000.0


In [7]:
# Persist the reads benchmark results as a tab-separated file (row index omitted).
index_reads_runtime = simulation_dir_path.joinpath('reads-index-info.tsv')
benchmark_df.to_csv(index_reads_runtime, index=False, sep='\t')

## 2.2. Index assemblies

In [8]:
# File that receives the list of simulated assembly files.
# NOTE: this rebinds `input_genomes_file`, which pointed at the reads list in the
# reads section; from here on it refers to the assemblies list.
input_genomes_file = simulation_dir_path / 'input-assemblies.tsv'

# Run `gdi input` on every *.fa.gz file in assemblies_dir (expanded by the shell)
# and redirect its output into input_genomes_file. `{name}` is IPython
# interpolation of the Python variable `name` into the shell command.
!gdi input --absolute {assemblies_dir}/*.fa.gz > {input_genomes_file}

In [9]:
# Benchmark building the assemblies index; results are labelled '<case_name> assemblies'.
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} assemblies')

benchmarker = gdi_benchmark.IndexBenchmarker(
    benchmark_results_handler=results_handler,
    index_path=index_assemblies_path,
    input_files_file=input_genomes_file,
    reference_file=reference_file,
    mincov=mincov,
    ncores=ncores,
    build_tree=True,
)

# Repeat the benchmark `iterations` times and keep the returned results table.
benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630527139.785993']
Removing any existing indexes simulations/unset/index-assemblies
Creating new index: [gdi init simulations/unset/index-assemblies]
Creating a new index took 2.77 seconds
Analysis running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-assemblies.tsv]
Analysis took 1.56 minutes
Index running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630527361.2770522/gdi-input.fofn]
Indexing took 0.09 minutes
Building tree:

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f8a1e18c1f0> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630527361.2770522']
Removing any existing indexes simulations/unset/index-assemblies
Creating new index: [gdi init simulations/unset/index-assemblies]
Creating a new index took 2.91 seconds
Analysis running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-assemblies.tsv]
Analysis took 1.52 minutes
Index running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630527547.7826889/gdi-input.fofn]
Indexing took 0.09 minutes
Building tree

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f8a740d0be0> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1630527547.7826889']
Removing any existing indexes simulations/unset/index-assemblies
Creating new index: [gdi init simulations/unset/index-assemblies]
Creating a new index took 2.85 seconds
Analysis running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-assemblies.tsv]
Analysis took 1.56 minutes
Index running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 load vcf --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1630527734.9501612/gdi-input.fofn]
Indexing took 0.09 minutes
Building tree

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f8a2684adc0> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [10]:
# Display the per-iteration benchmark results for the assemblies index.
benchmark_df

Unnamed: 0,Name,Iteration,Number samples,Number cores,Reference length,Analysis runtime,Analysis memory (max),Analysis memory (max/process),Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,unset assemblies,1,59,32,10834,93.76,1211167000.0,370278400.0,2082595000.0,5.17,5779100000.0,238125056.0,1064960.0,82.85,240742400.0,223617024.0,181.78,5779100000.0
0,unset assemblies,2,59,32,10834,90.87,1155748000.0,362246144.0,2082591000.0,5.22,5772460000.0,237326336.0,1069056.0,86.31,240992256.0,223879168.0,182.4,5772460000.0
0,unset assemblies,3,59,32,10834,93.33,1210978000.0,381284352.0,2082595000.0,5.0,5766971000.0,237223936.0,1060864.0,88.57,241487872.0,223850496.0,186.9,5766971000.0


In [11]:
# Persist the assemblies benchmark results as a tab-separated file (row index omitted).
index_assemblies_runtime = simulation_dir_path.joinpath('assemblies-index-info.tsv')
benchmark_df.to_csv(index_assemblies_runtime, index=False, sep='\t')

# 3. Export trees

In [12]:
!gdi --project-dir {index_assemblies_path} export tree {reference_name} > {output_assemblies_tree}
print(f'Wrote assemblies tree to {output_assemblies_tree}')

!gdi --project-dir {index_reads_path} export tree {reference_name} > {output_reads_tree}
print(f'Wrote assemblies tree to {output_reads_tree}')

Wrote assemblies tree to simulations/unset/index-assemblies/assemblies.tre
Wrote assemblies tree to simulations/unset/index-reads/reads.tre
