# 1. Parameters

In [1]:
# Defaults
simulation_dir = 'simulations/unset'
reference_file = 'simulations/reference/reference.fa.gz'
iterations = 1
mincov = 10
ncores = 32

In [2]:
# Parameters
read_coverage = 40
mincov = 10
simulation_dir = "simulations/cov-40"
iterations = 1
sub_alpha = 0.2


In [3]:
from pathlib import Path
import imp
fp, pathname, description = imp.find_module('gdi_benchmark', ['../../lib'])
gdi_benchmark = imp.load_module('gdi_benchmark', fp, pathname, description)

simulation_dir_path = Path(simulation_dir)

case_name = str(simulation_dir_path.name)
reads_dir = simulation_dir_path / 'simulated_data' / 'reads'
assemblies_dir = simulation_dir_path / 'simulated_data' / 'assemblies'
assemblies_reads_dir = simulation_dir_path / 'simulated_data' / 'assembled_reads'
index_reads_path = simulation_dir_path / 'index-reads'
index_assemblies_path = simulation_dir_path / 'index-assemblies'
index_assemblies_reads_path = simulation_dir_path / 'index-assemblies-reads'
output_reads_tree = index_reads_path / 'reads.tre'
output_assemblies_tree = index_assemblies_path / 'assemblies.tre'
output_assemblies_reads_tree = index_assemblies_reads_path / 'assemblies-reads.tre'
reference_name = Path(reference_file).name.split('.')[0]

  import imp


# 2. Index genomes

In [4]:
!gdi --version

gdi, version 0.8.1


## 2.1. Index reads

In [5]:
input_genomes_file = simulation_dir_path / 'input-reads.tsv'

!gdi input --absolute {reads_dir}/*.fq.gz > {input_genomes_file}



In [6]:
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} reads')
benchmarker = gdi_benchmark.IndexBenchmarker(benchmark_results_handler=results_handler,
                                             index_path=index_reads_path, input_files_file=input_genomes_file,
                                             reference_file=reference_file, mincov=mincov, build_tree=True,
                                             ncores=ncores)

benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 60 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1666141285.1668816']


Creating new index: [gdi init simulations/cov-40/index-reads]


  self.pid = os.fork()


  self.pid = os.fork()


Creating a new index took 3.11 seconds
Analysis running: [gdi --project-dir simulations/cov-40/index-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/cov-40/input-reads.tsv]


  self.pid = os.fork()


  self.pid = os.fork()


Analysis took 1.99 minutes
Index running: [gdi --project-dir simulations/cov-40/index-reads --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation/snakemake-assemblies.1666141436.9208624/gdi-input.fofn]


  self.pid = os.fork()


  self.pid = os.fork()


Indexing took 0.13 minutes
Building tree: [gdi --project-dir simulations/cov-40/index-reads --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


  self.pid = os.fork()


  self.pid = os.fork()


Building tree took 0.18 minutes


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [7]:
benchmark_df

Unnamed: 0,Name,Reference name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),...,Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,cov-40 reads,reference,1,60,2940,1393,32,19699,119.18,3117457000.0,...,3257885000.0,7.68,6204473000.0,298397696.0,2105344.0,10.96,284803072.0,251408384.0,137.82,6204473000.0


In [8]:
index_reads_runtime = simulation_dir_path / 'reads-index-info.tsv'
benchmark_df.to_csv(index_reads_runtime, sep='\t', index=False)

## 2.2. Index assemblies

In [9]:
input_genomes_file = simulation_dir_path / 'input-assemblies.tsv'

!gdi input --absolute {assemblies_dir}/*.fa.gz > {input_genomes_file}



In [10]:
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} assemblies')
benchmarker = gdi_benchmark.IndexBenchmarker(benchmark_results_handler=results_handler,
                                             index_path=index_assemblies_path, input_files_file=input_genomes_file,
                                             reference_file=reference_file, mincov=mincov, build_tree=True,
                                             ncores=ncores)

benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 60 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1666141436.9208624']


Creating new index: [gdi init simulations/cov-40/index-assemblies]


  self.pid = os.fork()


  self.pid = os.fork()


Creating a new index took 3.25 seconds
Analysis running: [gdi --project-dir simulations/cov-40/index-assemblies --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/cov-40/input-assemblies.tsv]


  self.pid = os.fork()


  self.pid = os.fork()


Analysis took 1.73 minutes
Index running: [gdi --project-dir simulations/cov-40/index-assemblies --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation/snakemake-assemblies.1666141583.815372/gdi-input.fofn]


  self.pid = os.fork()


  self.pid = os.fork()


Indexing took 0.12 minutes
Building tree: [gdi --project-dir simulations/cov-40/index-assemblies --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


  self.pid = os.fork()


  self.pid = os.fork()


Building tree took 0.14 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7feafd604e50> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [11]:
benchmark_df

Unnamed: 0,Name,Reference name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),...,Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,cov-40 assemblies,reference,1,60,1498,1421,32,19699,103.83,2660344000.0,...,2232439000.0,6.8,6180565000.0,270778368.0,1388544.0,8.18,273027072.0,251703296.0,118.81,6180565000.0


In [12]:
index_assemblies_runtime = simulation_dir_path / 'assemblies-index-info.tsv'
benchmark_df.to_csv(index_assemblies_runtime, sep='\t', index=False)

## 2.3. Index assembled reads

In [13]:
input_genomes_file = simulation_dir_path / 'input-assemblies-reads.tsv'

!gdi input --absolute {assemblies_reads_dir}/*.fasta > {input_genomes_file}



In [14]:
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name} assemblies')
benchmarker = gdi_benchmark.IndexBenchmarker(benchmark_results_handler=results_handler,
                                             index_path=index_assemblies_reads_path, input_files_file=input_genomes_file,
                                             reference_file=reference_file, mincov=mincov, build_tree=True,
                                             ncores=ncores)

benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 60 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1666141583.815372']


Creating new index: [gdi init simulations/cov-40/index-assemblies-reads]


  self.pid = os.fork()


  self.pid = os.fork()


Creating a new index took 3.28 seconds
Analysis running: [gdi --project-dir simulations/cov-40/index-assemblies-reads --ncores 32 analysis --use-conda --no-load-data --reference-file simulations/reference/reference.fa.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file simulations/cov-40/input-assemblies-reads.tsv]


  self.pid = os.fork()


  self.pid = os.fork()


Analysis took 1.73 minutes
Index running: [gdi --project-dir simulations/cov-40/index-assemblies-reads --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file simulations/reference/reference.fa.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation-manuscript/evaluations/simulation/snakemake-assemblies.1666141711.4751303/gdi-input.fofn]


  self.pid = os.fork()


  self.pid = os.fork()


Indexing took 0.13 minutes
Building tree: [gdi --project-dir simulations/cov-40/index-assemblies-reads --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' reference]


  self.pid = os.fork()
Exception during reset or similar
Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/pool/base.py", line 753, in _finalize_fairy
    fairy._reset(pool)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/pool/base.py", line 1004, in _reset
    pool._dialect.do_rollback(self)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/engine/default.py", line 683, in do_rollback
    dbapi_connection.rollback()
sqlite3.ProgrammingError: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 140649005532992 and this is thread id 140648694933248.


Exception closing connection <sqlite3.Connection object at 0x7feb05e18120>
Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/pool/base.py", line 753, in _finalize_fairy
    fairy._reset(pool)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/pool/base.py", line 1004, in _reset
    pool._dialect.do_rollback(self)
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manuscript/lib/python3.8/site-packages/sqlalchemy/engine/default.py", line 683, in do_rollback
    dbapi_connection.rollback()
sqlite3.ProgrammingError: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 140649005532992 and this is thread id 140648694933248.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/CSCScience.ca/apetkau/miniconda3/envs/gdi-manus

  self.pid = os.fork()


  self.pid = os.fork()


Building tree took 0.17 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7feb05dfb610> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [15]:
benchmark_df

Unnamed: 0,Name,Reference name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),...,Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,cov-40 assemblies,reference,1,60,2001,1408,32,19699,103.91,2606768000.0,...,2232226000.0,7.42,6495486000.0,286674944.0,1449984.0,9.76,279793664.0,252243968.0,121.09,6495486000.0


In [16]:
index_assemblies_reads_runtime = simulation_dir_path / 'assemblies-reads-index-info.tsv'
benchmark_df.to_csv(index_assemblies_reads_runtime, sep='\t', index=False)

# 3. Export trees

In [17]:
!gdi --project-dir {index_assemblies_path} export tree {reference_name} > {output_assemblies_tree}
print(f'Wrote assemblies tree to {output_assemblies_tree}')

!gdi --project-dir {index_reads_path} export tree {reference_name} > {output_reads_tree}
print(f'Wrote assemblies tree to {output_reads_tree}')

!gdi --project-dir {index_assemblies_reads_path} export tree {reference_name} > {output_assemblies_reads_tree}
print(f'Wrote assemblies tree to {output_assemblies_reads_tree}')

Wrote assemblies tree to simulations/cov-40/index-assemblies/assemblies.tre


Wrote assemblies tree to simulations/cov-40/index-reads/reads.tre


Wrote assemblies tree to simulations/cov-40/index-assemblies-reads/assemblies-reads.tre
