# 1. Parameters

In [1]:
# Defaults for the notebook's tunable parameters.
# NOTE(review): 'simulations/unset' looks like a placeholder that is meant to be
# overridden per run (e.g. by a notebook parameterization tool) — confirm.
simulation_dir = 'simulations/unset'  # root directory of one simulation case; the other paths are derived from it
reference_file = 'simulations/reference/reference.fa.gz'  # reference genome handed to the index benchmarker
iterations = 3  # number of benchmark iterations to run for each index
mincov = 10  # minimum coverage handed to the index benchmarker
ncores = 32  # number of cores handed to the index benchmarker

In [2]:
import importlib.machinery
import importlib.util
import sys
from pathlib import Path

# Load the project-local `gdi_benchmark` helper from ../../lib without putting that
# directory on `sys.path`. This uses `importlib` because the `imp` module is
# deprecated and no longer available on recent Python versions; it also avoids
# leaving the file handle returned by `imp.find_module` open.
_gdi_benchmark_spec = importlib.machinery.PathFinder.find_spec('gdi_benchmark', ['../../lib'])
if _gdi_benchmark_spec is None:
    # Same exception type `imp.find_module` raised when the module was missing.
    raise ImportError("No module named 'gdi_benchmark' found in '../../lib'")
gdi_benchmark = importlib.util.module_from_spec(_gdi_benchmark_spec)
# Register before executing so the module can be found while it is being initialised.
sys.modules['gdi_benchmark'] = gdi_benchmark
_gdi_benchmark_spec.loader.exec_module(gdi_benchmark)

simulation_dir_path = Path(simulation_dir)

# The case name is the last component of the simulation directory.
case_name = str(simulation_dir_path.name)

# Inputs: simulated reads and assemblies.
reads_dir = simulation_dir_path / 'simulated_data' / 'reads'
assemblies_dir = simulation_dir_path / 'simulated_data' / 'assemblies'

# Outputs: one index directory per input type, each with its exported tree file.
index_reads_path = simulation_dir_path / 'index-reads'
index_assemblies_path = simulation_dir_path / 'index-assemblies'
output_reads_tree = index_reads_path / 'reads.tre'
output_assemblies_tree = index_assemblies_path / 'assemblies.tre'

# Reference name = file name up to the first '.', e.g. 'reference.fa.gz' -> 'reference'.
reference_name = Path(reference_file).name.split('.')[0]

# 2. Index genomes

In [3]:
# Record the version of the `gdi` command-line tool used for this run.
!gdi --version

gdi, version 0.3.0.dev12


## 2.1. Index reads

In [4]:
input_genomes_file = simulation_dir_path / 'input-reads.tsv'

!gdi input --absolute {reads_dir}/*.fq.gz > {input_genomes_file}

In [5]:
# Benchmark indexing of the simulated reads: each iteration recreates the index,
# runs the analysis, loads the results and builds a tree.
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} reads')

benchmarker = gdi_benchmark.IndexBenchmarker(
    benchmark_results_handler=results_handler,
    index_path=index_reads_path,
    input_files_file=input_genomes_file,
    reference_file=reference_file,
    mincov=mincov,
    build_tree=True,
    ncores=ncores,
)

# One row of runtime/memory/size measurements per iteration.
benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: []
Removing any existing indexes simulations/unset/index-reads
Creating new index: [gdi init simulations/unset/index-reads]
Creating a new index took 2.63 seconds
Analysis running: [gdi --project-dir simulations/unset/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-reads.tsv]
Analysis took 2.23 minutes
Index running: [gdi --project-dir simulations/unset/index-reads --ncores 32 load vcf-kmer --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1631305122.0575578/gdi-input.fofn]
Indexing took 0.12 minutes
Building tree: [gdi --project-dir simulations/unset/index-reads --ncores 3

A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1631305122.0575578']
Removing any existing indexes simulations/unset/index-reads
Creating new index: [gdi init simulations/unset/index-reads]
Creating a new index took 2.79 seconds
Analysis running: [gdi --project-dir simulations/unset/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-reads.tsv]
Analysis took 1.66 minutes
Index running: [gdi --project-dir simulations/unset/index-reads --ncores 32 load vcf-kmer --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1631305374.6891673/gdi-input.fofn]
Indexing took 0.12 minutes
Building tree: [gdi --project-dir

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7fb741af8fa0> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1631305374.6891673']
Removing any existing indexes simulations/unset/index-reads
Creating new index: [gdi init simulations/unset/index-reads]
Creating a new index took 2.87 seconds
Analysis running: [gdi --project-dir simulations/unset/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-reads.tsv]
Analysis took 1.75 minutes
Index running: [gdi --project-dir simulations/unset/index-reads --ncores 32 load vcf-kmer --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1631305578.969097/gdi-input.fofn]
Indexing took 0.11 minutes
Building tree: [gdi --project-dir 

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7fb741079f70> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [6]:
# Display the benchmark results for the reads index (one row per iteration).
benchmark_df

Unnamed: 0,Name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),Analysis memory (max/process),Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,unset reads,1,59,1915,748,32,10834,133.62,2695356000.0,397889536.0,3268301000.0,7.24,5756752000.0,254717952.0,1671168.0,106.92,246706176.0,220422144.0,247.78,5756752000.0
0,unset reads,2,59,1915,748,32,10834,99.29,2902262000.0,395087872.0,3268305000.0,6.81,5772964000.0,256073728.0,1671168.0,92.98,247300096.0,221020160.0,199.08,5772964000.0
0,unset reads,3,59,1915,748,32,10834,104.74,2775405000.0,397524992.0,3268301000.0,6.62,5773795000.0,256442368.0,1667072.0,108.42,248168448.0,221270016.0,219.78,5773795000.0


In [7]:
# Persist the reads benchmark table as a tab-separated file in the simulation directory.
index_reads_runtime = simulation_dir_path.joinpath('reads-index-info.tsv')
benchmark_df.to_csv(index_reads_runtime, index=False, sep='\t')

## 2.2. Index assemblies

In [8]:
input_genomes_file = simulation_dir_path / 'input-assemblies.tsv'

!gdi input --absolute {assemblies_dir}/*.fa.gz > {input_genomes_file}

In [9]:
# Benchmark indexing of the simulated assemblies: each iteration recreates the index,
# runs the analysis, loads the results and builds a tree.
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} assemblies')

benchmarker = gdi_benchmark.IndexBenchmarker(
    benchmark_results_handler=results_handler,
    index_path=index_assemblies_path,
    input_files_file=input_genomes_file,
    reference_file=reference_file,
    mincov=mincov,
    build_tree=True,
    ncores=ncores,
)

# One row of runtime/memory/size measurements per iteration.
benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1631305578.969097']
Removing any existing indexes simulations/unset/index-assemblies
Creating new index: [gdi init simulations/unset/index-assemblies]
Creating a new index took 2.60 seconds
Analysis running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-assemblies.tsv]
Analysis took 1.53 minutes
Index running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 load vcf-kmer --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1631305806.4988441/gdi-input.fofn]
Indexing took 0.10 minutes
Building 

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7fb76827a550> but it is already set
A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1631305806.4988441']
Removing any existing indexes simulations/unset/index-assemblies
Creating new index: [gdi init simulations/unset/index-assemblies]
Creating a new index took 2.74 seconds
Analysis running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/unset/input-assemblies.tsv]
Analysis took 1.51 minutes
Index running: [gdi --project-dir simulations/unset/index-assemblies --ncores 32 load vcf-kmer --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1631305998.7694087/gdi-input.fofn]
Indexing took 0.11 minutes
Building

Process Process-59:
Process Process-60:
Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-sc2/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-sc2/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-sc2/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-sc2/lib/python3.8/site-packages/cmdbench/core.py", line 138, in collect_fixed_data
    while not shared_process_dict["skip_benchmarking"]:
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-sc2/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "<string>", line 2, in __getitem__
  File "/home/CSCScience.ca/apetkau/m

KeyboardInterrupt: 

In [None]:
# Display the benchmark results for the assemblies index.
# NOTE(review): `benchmark_df` is the same name used for the reads results in
# section 2.1; if the assemblies benchmark cell did not run to completion, this
# still shows the reads table — re-run that cell before trusting this output.
benchmark_df

In [None]:
# Persist the assemblies benchmark table as a tab-separated file in the simulation directory.
index_assemblies_runtime = simulation_dir_path.joinpath('assemblies-index-info.tsv')
benchmark_df.to_csv(index_assemblies_runtime, index=False, sep='\t')

# 3. Export trees

In [None]:
!gdi --project-dir {index_assemblies_path} export tree {reference_name} > {output_assemblies_tree}
print(f'Wrote assemblies tree to {output_assemblies_tree}')

!gdi --project-dir {index_reads_path} export tree {reference_name} > {output_reads_tree}
print(f'Wrote assemblies tree to {output_reads_tree}')