# 1. Parameters

In [1]:
# Defaults
simulation_dir = 'simulations/unset'
reference_file = 'simulations/reference/reference.fa.gz'
iterations = 1
mincov = 10
ncores = 32

In [2]:
# Parameters
read_coverage = 30
mincov = 10
simulation_dir = "simulations/cov-30"
iterations = 1
sub_alpha = 0.2


In [3]:
from pathlib import Path
import imp
fp, pathname, description = imp.find_module('gdi_benchmark', ['../../lib'])
gdi_benchmark = imp.load_module('gdi_benchmark', fp, pathname, description)

simulation_dir_path = Path(simulation_dir)

case_name = str(simulation_dir_path.name)
reads_dir = simulation_dir_path / 'simulated_data' / 'reads'
assemblies_dir = simulation_dir_path / 'simulated_data' / 'assemblies'
assemblies_reads_dir = simulation_dir_path / 'simulated_data' / 'assembled_reads'
index_reads_path = simulation_dir_path / 'index-reads'
index_assemblies_path = simulation_dir_path / 'index-assemblies'
index_assemblies_reads_path = simulation_dir_path / 'index-assemblies-reads'
output_reads_tree = index_reads_path / 'reads.tre'
output_assemblies_tree = index_assemblies_path / 'assemblies.tre'
output_assemblies_reads_tree = index_assemblies_reads_path / 'assemblies-reads.tre'
reference_name = Path(reference_file).name.split('.')[0]

  import imp


# 2. Index genomes

In [4]:
!gdi --version

gdi, version 0.8.1


## 2.1. Index reads

In [5]:
input_genomes_file = simulation_dir_path / 'input-reads.tsv'

!gdi input --absolute {reads_dir}/*.fq.gz > {input_genomes_file}



In [6]:
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} reads')
benchmarker = gdi_benchmark.IndexBenchmarker(benchmark_results_handler=results_handler,
                                             index_path=index_reads_path, input_files_file=input_genomes_file,
                                             reference_file=reference_file, mincov=mincov, build_tree=True,
                                             ncores=ncores)

benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 60 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1666140855.5153775']


Creating new index: [gdi init simulations/cov-30/index-reads]


  self.pid = os.fork()


  self.pid = os.fork()


Creating a new index took 3.36 seconds
Analysis running: [gdi --project-dir simulations/cov-30/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/cov-30/input-reads.tsv]


  self.pid = os.fork()


  self.pid = os.fork()


Analysis took 1.92 minutes
Index running: [gdi --project-dir simulations/cov-30/index-reads --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation/snakemake-assemblies.1666141004.0454142/gdi-input.fofn]


  self.pid = os.fork()


  self.pid = os.fork()


Indexing took 0.14 minutes
Building tree: [gdi --project-dir simulations/cov-30/index-reads --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


  self.pid = os.fork()


  self.pid = os.fork()


Building tree took 0.18 minutes


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [7]:
benchmark_df

Unnamed: 0,Name,Reference name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),...,Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,cov-30 reads,reference,1,60,3488,1385,32,19699,114.95,3383751000.0,...,3248796000.0,8.35,6204375000.0,307191808.0,2162688.0,10.41,286846976.0,251371520.0,133.71,6204375000.0


In [8]:
index_reads_runtime = simulation_dir_path / 'reads-index-info.tsv'
benchmark_df.to_csv(index_reads_runtime, sep='\t', index=False)

## 2.2. Index assemblies

In [9]:
input_genomes_file = simulation_dir_path / 'input-assemblies.tsv'

!gdi input --absolute {assemblies_dir}/*.fa.gz > {input_genomes_file}



In [10]:
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} assemblies')
benchmarker = gdi_benchmark.IndexBenchmarker(benchmark_results_handler=results_handler,
                                             index_path=index_assemblies_path, input_files_file=input_genomes_file,
                                             reference_file=reference_file, mincov=mincov, build_tree=True,
                                             ncores=ncores)

benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 60 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1666141004.0454142']


Creating new index: [gdi init simulations/cov-30/index-assemblies]


  self.pid = os.fork()


  self.pid = os.fork()


Creating a new index took 3.06 seconds
Analysis running: [gdi --project-dir simulations/cov-30/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/cov-30/input-assemblies.tsv]


  self.pid = os.fork()


  self.pid = os.fork()


Analysis took 1.89 minutes
Index running: [gdi --project-dir simulations/cov-30/index-assemblies --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation/snakemake-assemblies.1666141146.8857586/gdi-input.fofn]


  self.pid = os.fork()


  self.pid = os.fork()


Indexing took 0.13 minutes
Building tree: [gdi --project-dir simulations/cov-30/index-assemblies --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


  self.pid = os.fork()


  self.pid = os.fork()


Building tree took 0.15 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c1dee05e0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [11]:
benchmark_df

Unnamed: 0,Name,Reference name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),...,Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,cov-30 assemblies,reference,1,60,1498,1421,32,19699,113.08,2888667000.0,...,2232443000.0,7.46,6789632000.0,270413824.0,1388544.0,9.03,273051648.0,251838464.0,129.57,6789632000.0


In [12]:
index_assemblies_runtime = simulation_dir_path / 'assemblies-index-info.tsv'
benchmark_df.to_csv(index_assemblies_runtime, sep='\t', index=False)

## 2.3. Index assembled reads

In [13]:
input_genomes_file = simulation_dir_path / 'input-assemblies-reads.tsv'

!gdi input --absolute {assemblies_reads_dir}/*.fasta > {input_genomes_file}



In [14]:
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} assemblies')
benchmarker = gdi_benchmark.IndexBenchmarker(benchmark_results_handler=results_handler,
                                             index_path=index_assemblies_reads_path, input_files_file=input_genomes_file,
                                             reference_file=reference_file, mincov=mincov, build_tree=True,
                                             ncores=ncores)

benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 60 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1666141146.8857586']


Creating new index: [gdi init simulations/cov-30/index-assemblies-reads]


  self.pid = os.fork()


  self.pid = os.fork()


Creating a new index took 3.27 seconds
Analysis running: [gdi --project-dir simulations/cov-30/index-assemblies-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/cov-30/input-assemblies-reads.tsv]


  self.pid = os.fork()


  self.pid = os.fork()


Analysis took 1.77 minutes
Index running: [gdi --project-dir simulations/cov-30/index-assemblies-reads --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation/snakemake-assemblies.1666141285.1668816/gdi-input.fofn]


  self.pid = os.fork()


  self.pid = os.fork()
Exception during reset or similar
Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/multiprocessing/managers.py", line 243, in serve_client
    request = recv()
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/multiprocessing/connection.py", line 383, in _recv
    raise EOFError
EOFError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/CSCScien

  self.pid = os.fork()


Exception closing connection <sqlite3.Connection object at 0x7f1c1df01120>
Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/multiprocessing/managers.py", line 243, in serve_client
    request = recv()
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/multiprocessing/connection.py", line 383, in _recv
    raise EOFError
EOFError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  Fi

Indexing took 0.13 minutes
Building tree: [gdi --project-dir simulations/cov-30/index-assemblies-reads --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


  self.pid = os.fork()
Exception during reset or similar
Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/pool/base.py", line 753, in _finalize_fairy
    fairy._reset(pool)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/pool/base.py", line 1004, in _reset
    pool._dialect.do_rollback(self)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/engine/default.py", line 683, in do_rollback
    dbapi_connection.rollback()
sqlite3.ProgrammingError: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 139760914429760 and this is thread id 139760805738240.


Exception closing connection <sqlite3.Connection object at 0x7f1c1df01120>
Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/pool/base.py", line 753, in _finalize_fairy
    fairy._reset(pool)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/pool/base.py", line 1004, in _reset
    pool._dialect.do_rollback(self)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/engine/default.py", line 683, in do_rollback
    dbapi_connection.rollback()
sqlite3.ProgrammingError: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 139760914429760 and this is thread id 139760805738240.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manus

  self.pid = os.fork()


  self.pid = os.fork()


Building tree took 0.17 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c1d8805e0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [15]:
benchmark_df

Unnamed: 0,Name,Reference name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),...,Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,cov-30 assemblies,reference,1,60,2329,1408,32,19699,106.07,2547433000.0,...,2232214000.0,7.5,6298698000.0,290177024.0,1486848.0,10.04,280662016.0,251682816.0,123.61,6298698000.0


In [16]:
index_assemblies_reads_runtime = simulation_dir_path / 'assemblies-reads-index-info.tsv'
benchmark_df.to_csv(index_assemblies_reads_runtime, sep='\t', index=False)

# 3. Export trees

In [17]:
!gdi --project-dir {index_assemblies_path} export tree {reference_name} > {output_assemblies_tree}
print(f'Wrote assemblies tree to {output_assemblies_tree}')

!gdi --project-dir {index_reads_path} export tree {reference_name} > {output_reads_tree}
print(f'Wrote assemblies tree to {output_reads_tree}')

!gdi --project-dir {index_assemblies_reads_path} export tree {reference_name} > {output_assemblies_reads_tree}
print(f'Wrote assemblies tree to {output_assemblies_reads_tree}')

Wrote assemblies tree to simulations/cov-30/index-assemblies/assemblies.tre


Wrote assemblies tree to simulations/cov-30/index-reads/reads.tre


Wrote assemblies tree to simulations/cov-30/index-assemblies-reads/assemblies-reads.tre
