# 1. Parameters

In [1]:
# Defaults
# Baseline values for every tunable of this notebook. The next cell re-assigns
# some of them for the concrete case being run.

# Output directory for this case; the setup cell deletes and recreates it.
cases_dir = 'cases/unset'
# Reference genome file; the part of its file name before the first '.' is
# used as the reference name.
reference_file = 'references/NC_045512.gbk.gz'
# Tab-separated table listing all available input samples.
input_files_all = 'input/input-files.tsv'
# Number of benchmark repetitions (passed to benchmarker.benchmark()).
iterations = 3
# Passed to IndexBenchmarker as `mincov`.
mincov = 10
# Passed to IndexBenchmarker as `ncores`.
ncores = 32
# How many rows from the top of `input_files_all` are kept for this case.
number_samples = 10
# Whether a tree is built during benchmarking and exported at the end.
build_tree = False
# Passed to IndexBenchmarker as `sample_batch_size`.
sample_batch_size=2000

In [2]:
# Parameters
# Run-specific values; these assignments replace the defaults set in the
# previous cell (here: the 5000-sample case, with tree building enabled).
# NOTE(review): presumably injected by a notebook parameterization tool such as
# papermill rather than edited by hand -- confirm before changing manually.
cases_dir = "cases/case-5000"
iterations = 3
number_samples = 5000
build_tree = True


In [3]:
from pathlib import Path
from shutil import rmtree
from os import makedirs
import importlib.machinery
import importlib.util
import sys

# Load the project-local `gdi_benchmark` helper from ../../lib.
# This replaces the `imp` module (deprecated, removed in Python 3.12), whose
# find_module() also returned an open file handle that was never closed.
_gdi_benchmark_spec = importlib.machinery.PathFinder.find_spec('gdi_benchmark', ['../../lib'])
if _gdi_benchmark_spec is None or _gdi_benchmark_spec.loader is None:
    raise ImportError("No module named 'gdi_benchmark' found in ['../../lib']")
gdi_benchmark = importlib.util.module_from_spec(_gdi_benchmark_spec)
# Register before executing so imports inside the module can resolve it.
sys.modules['gdi_benchmark'] = gdi_benchmark
try:
    _gdi_benchmark_spec.loader.exec_module(gdi_benchmark)
except BaseException:
    # Do not leave a half-initialised module behind if its import fails.
    sys.modules.pop('gdi_benchmark', None)
    raise

cases_dir_path = Path(cases_dir)

# Always start from an empty case directory: drop any previous run's results,
# then recreate the directory (including missing parents).
if cases_dir_path.exists():
    rmtree(cases_dir_path)
makedirs(cases_dir_path)

# From here on the two input parameters are Path objects instead of strings.
input_files_all = Path(input_files_all)
reference_file = Path(reference_file)

# The case is named after the last component of the case directory; the
# reference name is the reference file name up to its first '.'.
case_name = cases_dir_path.name
reference_name = reference_file.name.split('.')[0]

# Files and directories produced by the later cells, all inside the case directory.
cases_input = cases_dir_path / 'input-files-case.tsv'
index_path = cases_dir_path / 'index'
benchmark_path = cases_dir_path / 'index-info.tsv'
output_tree = cases_dir_path / 'tree.tre'

  import imp


# 2. Create subset input

In [4]:
import pandas as pd

# Read the complete sample table and keep only its first `number_samples` rows.
all_input_df = pd.read_csv(input_files_all, sep='\t')
subset_input_df = all_input_df.iloc[:number_samples]

# Row counts of the full table and of the subset, reported below.
all_input_total = all_input_df.shape[0]
subset_input_total = subset_input_df.shape[0]

# Save the subset as the input table for this case.
subset_input_df.to_csv(cases_input, sep='\t', index=False)
print(f'Wrote {subset_input_total}/{all_input_total} samples to {cases_input}')

Wrote 5000/100000 samples to cases/case-5000/input-files-case.tsv


# 2. Index genomes

In [5]:
# Print the installed gdi version so the saved output shows which release was benchmarked.
!gdi --version

gdi, version 0.8.1


## 2.1. Index reads

In [6]:
# Collector for the benchmark results of this case.
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=case_name)

# Configure the benchmarker with the case paths and the notebook parameters.
benchmarker = gdi_benchmark.IndexBenchmarker(
    benchmark_results_handler=results_handler,
    index_path=index_path,
    input_files_file=cases_input,
    reference_file=reference_file,
    mincov=mincov,
    build_tree=build_tree,
    ncores=ncores,
    sample_batch_size=sample_batch_size,
)

# Run all iterations and keep the returned results table.
benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 5000 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1665334680.1428204']


Creating new index: [gdi init cases/case-5000/index]


  self.pid = os.fork()


  self.pid = os.fork()


Creating a new index took 3.31 seconds
Analysis running: [gdi --project-dir cases/case-5000/index --ncores 32 analysis --use-conda --no-load-data --reference-file references/NC_045512.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-5000/input-files-case.tsv]


  self.pid = os.fork()


  self.pid = os.fork()


Analysis took 31.16 minutes
Index running: [gdi --project-dir cases/case-5000/index --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file references/NC_045512.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation-manuscript/evaluations/sars-cov-2/snakemake-assemblies.1665335144.4719963/gdi-input.fofn]


  self.pid = os.fork()


  self.pid = os.fork()


Indexing took 5.35 minutes
Building tree: [gdi --project-dir cases/case-5000/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' NC_045512]


  self.pid = os.fork()


  self.pid = os.fork()


Building tree took 105.25 minutes


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 5000 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1665335144.4719963']


Removing any existing indexes cases/case-5000/index


Creating new index: [gdi init cases/case-5000/index]


  self.pid = os.fork()


  self.pid = os.fork()


Creating a new index took 3.56 seconds
Analysis running: [gdi --project-dir cases/case-5000/index --ncores 32 analysis --use-conda --no-load-data --reference-file references/NC_045512.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-5000/input-files-case.tsv]


  self.pid = os.fork()


  self.pid = os.fork()


Analysis took 30.83 minutes
Index running: [gdi --project-dir cases/case-5000/index --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file references/NC_045512.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation-manuscript/evaluations/sars-cov-2/snakemake-assemblies.1665343659.1091423/gdi-input.fofn]


  self.pid = os.fork()


  self.pid = os.fork()


Indexing took 5.24 minutes
Building tree: [gdi --project-dir cases/case-5000/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' NC_045512]


  self.pid = os.fork()


  self.pid = os.fork()


Building tree took 150.14 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f6cb43ade20> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 5000 samples with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1665343659.1091423']


Removing any existing indexes cases/case-5000/index


Creating new index: [gdi init cases/case-5000/index]


  self.pid = os.fork()


  self.pid = os.fork()


Creating a new index took 3.43 seconds
Analysis running: [gdi --project-dir cases/case-5000/index --ncores 32 analysis --use-conda --no-load-data --reference-file references/NC_045512.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-5000/input-files-case.tsv]


  self.pid = os.fork()


  self.pid = os.fork()


Analysis took 31.06 minutes
Index running: [gdi --project-dir cases/case-5000/index --ncores 32 load vcf-kmer --sample-batch-size 2000 --reference-file references/NC_045512.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation-manuscript/evaluations/sars-cov-2/snakemake-assemblies.1665354840.7630746/gdi-input.fofn]


  self.pid = os.fork()


  self.pid = os.fork()


Indexing took 5.39 minutes
Building tree: [gdi --project-dir cases/case-5000/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' NC_045512]


  self.pid = os.fork()


  self.pid = os.fork()


Building tree took 90.74 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f6c26455fa0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [7]:
# Display the results table returned by benchmarker.benchmark()
# (the saved output shows one row per iteration).
benchmark_df

Unnamed: 0,Name,Reference name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),...,Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,case-5000,NC_045512,1,5000,40166,10263,32,29903,1869.21,2203341000.0,...,4705559000.0,320.61,26930380000.0,1600315000.0,107741184.0,6301.0,19157520000.0,18735670000.0,8490.82,26930380000.0
0,case-5000,NC_045512,2,5000,40166,10263,32,29903,1849.4,2379756000.0,...,4705599000.0,313.98,22178780000.0,1454494000.0,107728896.0,9000.0,19157520000.0,18735840000.0,11163.38,22178780000.0
0,case-5000,NC_045512,3,5000,40166,10263,32,29903,1863.52,2662666000.0,...,4705837000.0,323.02,26874730000.0,1593364000.0,107773952.0,5404.0,19157190000.0,18735040000.0,7590.54,26874730000.0


In [8]:
# Save the benchmark results as a tab-separated file in the case directory,
# without writing the DataFrame index as a column.
benchmark_df.to_csv(
    benchmark_path,
    sep='\t',
    index=False,
)

# 3. Export trees

In [9]:
import subprocess

if build_tree:
    # Run `gdi export tree` without a shell: arguments are passed as a list, so
    # paths containing spaces or shell metacharacters are handled safely.
    # check=True raises CalledProcessError on a non-zero exit status, so the
    # success message below is only printed when the export actually worked.
    with open(output_tree, 'w') as tree_file:
        subprocess.run(
            ['gdi', '--project-dir', str(index_path), 'export', 'tree', reference_name],
            stdout=tree_file,
            check=True,
        )
    print(f'Wrote tree to {output_tree}')
else:
    print(f'build_tree={build_tree} so no tree to export')

Wrote tree to cases/case-5000/tree.tre
