# 1. Parameters

In [1]:
# Defaults
# Fallback values for the notebook parameters. The parameters cell that
# follows re-assigns some of them for a specific simulation case.
ncores = 32          # forwarded to the benchmarker as `ncores`
mincov = 10          # forwarded to the benchmarker as `mincov`
iterations = 3       # number of benchmark iterations to run
reference_file = 'simulations/reference/reference.fa.gz'
simulation_dir = 'simulations/unset'  # placeholder until a case directory is supplied

In [2]:
# Parameters
# Case-specific values for this run; they take precedence over the defaults
# assigned earlier in the notebook.
simulation_dir = "simulations/alpha-1.0-cov-30"
iterations = 3
mincov = 10
# The two values below are not referenced by any other cell visible in this
# notebook; they describe the simulation case being benchmarked.
read_coverage = 30
sub_alpha = 1.0


In [3]:
import importlib.machinery
import importlib.util
import sys
from pathlib import Path

# Load the project-local `gdi_benchmark` helper from ../../lib.
# The `imp` module is deprecated and was removed in Python 3.12, and
# `imp.find_module` hands back an open file object that must be closed by the
# caller. The importlib equivalents below avoid both problems and work for a
# single-file module as well as a package directory.
_gdi_benchmark_spec = importlib.machinery.PathFinder.find_spec('gdi_benchmark', ['../../lib'])
if _gdi_benchmark_spec is None:
    # Same exception type that imp.find_module raised for a missing module.
    raise ImportError("No module named 'gdi_benchmark' found in '../../lib'")
gdi_benchmark = importlib.util.module_from_spec(_gdi_benchmark_spec)
# Register before executing, as imp.load_module did, so imports inside the
# module that refer back to 'gdi_benchmark' resolve to this instance.
sys.modules['gdi_benchmark'] = gdi_benchmark
_gdi_benchmark_spec.loader.exec_module(gdi_benchmark)

simulation_dir_path = Path(simulation_dir)

# The last path component of the simulation directory names this case in the
# benchmark results (e.g. 'alpha-1.0-cov-30').
case_name = str(simulation_dir_path.name)

# Inputs: simulated reads and assemblies live under <simulation_dir>/simulated_data.
reads_dir = simulation_dir_path / 'simulated_data' / 'reads'
assemblies_dir = simulation_dir_path / 'simulated_data' / 'assemblies'

# Outputs: one index directory per input type, each with its exported tree file.
index_reads_path = simulation_dir_path / 'index-reads'
index_assemblies_path = simulation_dir_path / 'index-assemblies'
output_reads_tree = index_reads_path / 'reads.tre'
output_assemblies_tree = index_assemblies_path / 'assemblies.tre'

# Reference name is the file name up to the first dot,
# e.g. 'reference.fa.gz' -> 'reference'.
reference_name = Path(reference_file).name.split('.')[0]

# 2. Index genomes

In [4]:
# Print the installed gdi version so the benchmark run can be traced to it.
!gdi --version

gdi, version 0.6.0.dev2


## 2.1. Index reads

In [5]:
# File that will list the simulated read files; it is later handed to the
# benchmarker as `input_files_file`.
input_genomes_file = simulation_dir_path / 'input-reads.tsv'

# Let `gdi input` describe every *.fq.gz file in the reads directory and
# redirect its stdout into the file above.
!gdi input --absolute {reads_dir}/*.fq.gz > {input_genomes_file}



In [6]:
# Benchmark analysis, indexing and tree building for the simulated reads.
# Results are labelled '<case_name> reads'.
results_handler = gdi_benchmark.BenchmarkResultsHandler(
    name=f'{case_name} reads',
)
benchmarker = gdi_benchmark.IndexBenchmarker(
    benchmark_results_handler=results_handler,
    index_path=index_reads_path,
    input_files_file=input_genomes_file,
    reference_file=reference_file,
    mincov=mincov,
    build_tree=True,
    ncores=ncores,
)

# Run the whole benchmark `iterations` times and keep the resulting table.
benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1646771955.4134176']


Creating new index: [gdi init simulations/alpha-1.0-cov-30/index-reads]


Creating a new index took 3.53 seconds
Analysis running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/alpha-1.0-cov-30/input-reads.tsv]


Analysis took 2.01 minutes
Index running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-reads --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1646772213.4816306/gdi-input.fofn]


Indexing took 0.17 minutes
Building tree: [gdi --project-dir simulations/alpha-1.0-cov-30/index-reads --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


Building tree took 1.89 minutes


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1646772213.4816306']


Removing any existing indexes simulations/alpha-1.0-cov-30/index-reads
Creating new index: [gdi init simulations/alpha-1.0-cov-30/index-reads]


Creating a new index took 3.34 seconds
Analysis running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/alpha-1.0-cov-30/input-reads.tsv]


Analysis took 1.99 minutes
Index running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-reads --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1646772463.1542895/gdi-input.fofn]


Indexing took 0.17 minutes
Building tree: [gdi --project-dir simulations/alpha-1.0-cov-30/index-reads --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


Building tree took 1.92 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f5d5f215c70> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1646772463.1542895']


Removing any existing indexes simulations/alpha-1.0-cov-30/index-reads
Creating new index: [gdi init simulations/alpha-1.0-cov-30/index-reads]


Creating a new index took 3.32 seconds
Analysis running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/alpha-1.0-cov-30/input-reads.tsv]


Analysis took 2.07 minutes
Index running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-reads --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1646772712.9821587/gdi-input.fofn]


Indexing took 0.17 minutes
Building tree: [gdi --project-dir simulations/alpha-1.0-cov-30/index-reads --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


Building tree took 2.10 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f5d1e69ca00> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [7]:
# Display the benchmark table produced for the reads index.
benchmark_df

Unnamed: 0,Name,Reference name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),...,Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,alpha-1.0-cov-30 reads,reference,1,59,5753,3388,32,19699,120.62,2679648000.0,...,2986447000.0,10.13,6035612000.0,318558208.0,3137536.0,113.53,283095040.0,234217472.0,244.28,6035612000.0
0,alpha-1.0-cov-30 reads,reference,2,59,5753,3388,32,19699,119.37,2685141000.0,...,2986451000.0,9.86,6050075000.0,319131648.0,3137536.0,115.02,283504640.0,234741760.0,244.25,6050075000.0
0,alpha-1.0-cov-30 reads,reference,3,59,5753,3388,32,19699,124.37,2610356000.0,...,2986447000.0,9.89,6052069000.0,318713856.0,3141632.0,126.03,282886144.0,234442752.0,260.29,6052069000.0


In [8]:
# Persist the reads benchmark table as a tab-separated file inside the
# simulation directory.
index_reads_runtime = simulation_dir_path / 'reads-index-info.tsv'
# index=False: the DataFrame index is not written to the file.
benchmark_df.to_csv(index_reads_runtime, sep='\t', index=False)

## 2.2. Index assemblies

In [9]:
# NOTE: this rebinds `input_genomes_file`, which an earlier cell pointed at the
# reads input file; from here on it refers to the assemblies input file.
input_genomes_file = simulation_dir_path / 'input-assemblies.tsv'

# Let `gdi input` describe every *.fa.gz file in the assemblies directory and
# redirect its stdout into the file above.
!gdi input --absolute {assemblies_dir}/*.fa.gz > {input_genomes_file}



In [10]:
# Benchmark analysis, indexing and tree building for the simulated assemblies.
# Results are labelled '<case_name> assemblies'.
results_handler = gdi_benchmark.BenchmarkResultsHandler(
    name=f'{case_name} assemblies',
)
benchmarker = gdi_benchmark.IndexBenchmarker(
    benchmark_results_handler=results_handler,
    index_path=index_assemblies_path,
    input_files_file=input_genomes_file,
    reference_file=reference_file,
    mincov=mincov,
    build_tree=True,
    ncores=ncores,
)

# Run the whole benchmark `iterations` times and keep the resulting table.
benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1646772712.9821587']


Creating new index: [gdi init simulations/alpha-1.0-cov-30/index-assemblies]


Creating a new index took 3.36 seconds
Analysis running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/alpha-1.0-cov-30/input-assemblies.tsv]


Analysis took 1.94 minutes
Index running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-assemblies --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1646772982.1663227/gdi-input.fofn]


Indexing took 0.24 minutes
Building tree: [gdi --project-dir simulations/alpha-1.0-cov-30/index-assemblies --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


Building tree took 0.13 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f5d1e6ae2e0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1646772982.1663227']


Removing any existing indexes simulations/alpha-1.0-cov-30/index-assemblies
Creating new index: [gdi init simulations/alpha-1.0-cov-30/index-assemblies]


Creating a new index took 3.38 seconds
Analysis running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/alpha-1.0-cov-30/input-assemblies.tsv]


Analysis took 1.92 minutes
Index running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-assemblies --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1646773125.956094/gdi-input.fofn]


Indexing took 0.23 minutes
Building tree: [gdi --project-dir simulations/alpha-1.0-cov-30/index-assemblies --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


Building tree took 0.13 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f5d1e610f70> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 59 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1646773125.956094']


Removing any existing indexes simulations/alpha-1.0-cov-30/index-assemblies
Creating new index: [gdi init simulations/alpha-1.0-cov-30/index-assemblies]


Creating a new index took 3.08 seconds
Analysis running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/alpha-1.0-cov-30/input-assemblies.tsv]


Analysis took 1.87 minutes
Index running: [gdi --project-dir simulations/alpha-1.0-cov-30/index-assemblies --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/simulation/snakemake-assemblies.1646773267.843922/gdi-input.fofn]


Indexing took 0.24 minutes
Building tree: [gdi --project-dir simulations/alpha-1.0-cov-30/index-assemblies --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


Building tree took 0.13 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f5d1e69cd90> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [11]:
# Display the benchmark table produced for the assemblies index.
benchmark_df

Unnamed: 0,Name,Reference name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),...,Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,alpha-1.0-cov-30 assemblies,reference,1,59,22855,3156,32,19699,116.1,1314152000.0,...,2203156000.0,14.36,6050181000.0,373972992.0,3842048.0,7.69,245387264.0,234569728.0,138.15,6050181000.0
0,alpha-1.0-cov-30 assemblies,reference,2,59,22855,3156,32,19699,115.23,1303347000.0,...,2203144000.0,13.84,6076887000.0,370630656.0,3829760.0,7.79,245727232.0,234909696.0,136.86,6076887000.0
0,alpha-1.0-cov-30 assemblies,reference,3,59,22855,3156,32,19699,112.19,1267831000.0,...,2203144000.0,13.99,6059651000.0,370233344.0,3842048.0,7.5,245219328.0,234455040.0,133.68,6059651000.0


In [12]:
# Persist the assemblies benchmark table as a tab-separated file inside the
# simulation directory.
index_assemblies_runtime = simulation_dir_path / 'assemblies-index-info.tsv'
# index=False: the DataFrame index is not written to the file.
benchmark_df.to_csv(index_assemblies_runtime, sep='\t', index=False)

# 3. Export trees

In [13]:
!gdi --project-dir {index_assemblies_path} export tree {reference_name} > {output_assemblies_tree}
print(f'Wrote assemblies tree to {output_assemblies_tree}')

!gdi --project-dir {index_reads_path} export tree {reference_name} > {output_reads_tree}
print(f'Wrote assemblies tree to {output_reads_tree}')

[32m2022-03-08 15:03:22[0m [1;30mERROR:[0m [31mreference_genome=[<Reference(id=1, name=reference, length=19699)>] does not have a tree[0m


Wrote assemblies tree to simulations/alpha-1.0-cov-30/index-assemblies/assemblies.tre


Wrote assemblies tree to simulations/alpha-1.0-cov-30/index-reads/reads.tre
