# 1. Parameters

In [1]:
# Defaults
# Baseline configuration for a benchmark case. The "# Parameters" cell that
# follows re-assigns some of these names for the specific case being run.

# --- Inputs / outputs ---
metadata_file = 'metadata.tsv'   # tab-separated sample metadata table
cases_dir = 'cases/unset'        # output directory of the case (deleted and recreated by the setup cell)

# --- Dataset selection ---
full_dataset = False             # True: keep every sample of each dataset
max_samples_per_organism = 4     # per-dataset sample cap, applied only when full_dataset is False

# --- Benchmark settings (forwarded to the gdi_benchmark benchmarkers) ---
single_index = False             # True: one shared index for all datasets; False: one index per dataset
build_tree = True                # also gates the tree export at the end of the notebook
ncores = 4
mincov = 10
sample_batch_size = 10
iterations = 3                   # passed to benchmarker.benchmark(iterations=...)

In [2]:
# Parameters
# Case-specific values that override the defaults assigned in the previous cell
# (presumably injected by a notebook parameterisation tool -- TODO confirm).
cases_dir = "cases/case-full-si"
iterations = 3
# NOTE(review): this is the string "None", not the None object. It is harmless
# for this run because full_dataset is True: the setup cell resets the value to
# None and the per-dataset sample cap is never applied.
max_samples_per_organism = "None"
full_dataset = True
single_index = True
build_tree = True


In [3]:
import sys
from importlib.machinery import PathFinder
from importlib.util import module_from_spec
from os import makedirs
from pathlib import Path
from shutil import rmtree


def _load_module_from_dir(module_name, search_dir):
    """Import ``module_name`` from ``search_dir`` without touching ``sys.path``.

    Replaces the former ``imp.find_module`` / ``imp.load_module`` pair: ``imp``
    is deprecated and no longer exists in Python 3.12+, and ``find_module``
    returned an open file handle that was never closed here.

    Args:
        module_name: Name of the module or package to import.
        search_dir: Directory to search, resolved against the current working directory.

    Returns:
        The loaded module, also registered in ``sys.modules``.

    Raises:
        ImportError: If no loadable module of that name is found in ``search_dir``.
    """
    spec = PathFinder.find_spec(module_name, [str(Path(search_dir).resolve())])
    if spec is None or spec.loader is None:
        raise ImportError(f'Could not find module {module_name!r} in {search_dir!r}')
    module = module_from_spec(spec)
    # Register before executing so imports inside the module can resolve it.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(module_name, None)
        raise
    return module


gdi_benchmark = _load_module_from_dir('gdi_benchmark', '../../lib')

# The sample cap is irrelevant for a full run. A cap given as the string 'None'
# is also treated as "no cap", so that it never reaches DataFrame.head() as a
# string when full_dataset is False.
if full_dataset or max_samples_per_organism == 'None':
    max_samples_per_organism = None

# Reference genome (GenBank) used for each dataset, keyed by the dataset name.
reference_files = {
    '0810PADBR-1': Path('data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz'),
    '1405WAEXK-1': Path('data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz'),
    '1408MLGX6-3WGS': Path('data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz'),
    '1203NYJAP-1 - Tuna Scrape Outbreak': Path('data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz'),
}

cases_dir_path = Path(cases_dir)
index_path = cases_dir_path / 'index'

# Start every run from an empty case directory.
# WARNING: this deletes everything already stored under cases_dir.
if cases_dir_path.exists():
    rmtree(cases_dir_path)
makedirs(cases_dir_path)

case_name = str(cases_dir_path.name)

# Results table written by the single-index benchmark.
benchmark_path = cases_dir_path / 'index-info.tsv'

# One exported tree file per dataset; spaces in dataset names become underscores.
output_trees = {x: cases_dir_path / f'{x}_tree.tre'.replace(' ', '_') for x in reference_files}
output_trees

{'0810PADBR-1': PosixPath('cases/case-full-si/0810PADBR-1_tree.tre'),
 '1405WAEXK-1': PosixPath('cases/case-full-si/1405WAEXK-1_tree.tre'),
 '1408MLGX6-3WGS': PosixPath('cases/case-full-si/1408MLGX6-3WGS_tree.tre'),
 '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/case-full-si/1203NYJAP-1_-_Tuna_Scrape_Outbreak_tree.tre')}

# 2. Create subset inputs

In [4]:
import pandas as pd
from pathlib import Path
from typing import Dict

# Sample metadata table (tab-separated). Later code filters it on its
# 'dataSetName' column and takes sample names from its 'strain' column.
metadata_df = pd.read_csv(filepath_or_buffer=metadata_file, sep='\t')

def write_subset_input(metadata_df: pd.DataFrame, dataset_name: str) -> Path:
    """Write the input-files table for one dataset and return its path.

    Rows of ``metadata_df`` whose ``dataSetName`` equals ``dataset_name`` are
    selected. Unless the notebook-level ``full_dataset`` flag is set, only the
    first ``max_samples_per_organism`` of them are kept. The table is written
    tab-separated into ``cases_dir_path`` with the columns ``Sample`` (taken
    from ``strain``), ``Assemblies`` (always missing), ``Reads1`` and ``Reads2``
    (absolute paths of the paired fastq files under ``data/fastq``).

    Args:
        metadata_df: Metadata table with at least ``dataSetName`` and ``strain`` columns.
        dataset_name: Dataset to extract; spaces become underscores in the file name.

    Returns:
        Path of the written tab-separated file.
    """
    def fastq_path(sample, suffix):
        # Absolute path (relative to the current working directory) of one reads file.
        return str((Path('data') / 'fastq' / (sample + suffix)).absolute())

    all_input_total = len(metadata_df)
    cases_input = cases_dir_path / f'{dataset_name}_input-files-case.tsv'.replace(' ', '_')

    # Work on a private copy of the matching rows so the caller's frame is never modified.
    in_dataset = metadata_df['dataSetName'] == dataset_name
    input_df = metadata_df.loc[in_dataset].copy()

    # Subset samples
    if not full_dataset:
        input_df = input_df.head(max_samples_per_organism)

    input_df['Sample'] = input_df['strain']
    input_df['Assemblies'] = pd.NA
    for reads_column, suffix in (('Reads1', '_1.fastq.gz'), ('Reads2', '_2.fastq.gz')):
        input_df[reads_column] = input_df['Sample'].apply(lambda x: fastq_path(x, suffix))
    input_df = input_df[['Sample', 'Assemblies', 'Reads1', 'Reads2']]

    input_df.to_csv(cases_input, sep='\t', index=False)

    subset_input_total = len(input_df)
    print(f'Wrote dataset={dataset_name} consisting of {subset_input_total}/{all_input_total} samples to {cases_input}')

    return cases_input

# Write one input-files table per dataset (same keys as reference_files).
cases_inputs = dict(
    (x, write_subset_input(metadata_df, dataset_name=x))
    for x in reference_files
)

# Per-dataset index directory and results file, used when every dataset gets
# its own index; spaces in dataset names become underscores.
cases_index = dict(
    (x, cases_dir_path / f'{x}-index'.replace(' ', '_'))
    for x in cases_inputs
)
benchmark_outs = dict(
    (x, cases_dir_path / f'{x}-results.tsv'.replace(' ', '_'))
    for x in cases_inputs
)

print(benchmark_outs)
print(cases_index)
cases_inputs

Wrote dataset=0810PADBR-1 consisting of 22/85 samples to cases/case-full-si/0810PADBR-1_input-files-case.tsv
Wrote dataset=1405WAEXK-1 consisting of 9/85 samples to cases/case-full-si/1405WAEXK-1_input-files-case.tsv
Wrote dataset=1408MLGX6-3WGS consisting of 31/85 samples to cases/case-full-si/1408MLGX6-3WGS_input-files-case.tsv
Wrote dataset=1203NYJAP-1 - Tuna Scrape Outbreak consisting of 23/85 samples to cases/case-full-si/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv
{'0810PADBR-1': PosixPath('cases/case-full-si/0810PADBR-1-results.tsv'), '1405WAEXK-1': PosixPath('cases/case-full-si/1405WAEXK-1-results.tsv'), '1408MLGX6-3WGS': PosixPath('cases/case-full-si/1408MLGX6-3WGS-results.tsv'), '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/case-full-si/1203NYJAP-1_-_Tuna_Scrape_Outbreak-results.tsv')}
{'0810PADBR-1': PosixPath('cases/case-full-si/0810PADBR-1-index'), '1405WAEXK-1': PosixPath('cases/case-full-si/1405WAEXK-1-index'), '1408MLGX6-3WGS': PosixPath('cases/case

{'0810PADBR-1': PosixPath('cases/case-full-si/0810PADBR-1_input-files-case.tsv'),
 '1405WAEXK-1': PosixPath('cases/case-full-si/1405WAEXK-1_input-files-case.tsv'),
 '1408MLGX6-3WGS': PosixPath('cases/case-full-si/1408MLGX6-3WGS_input-files-case.tsv'),
 '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/case-full-si/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv')}

# 2. Index genomes

In [5]:
# Print the installed gdi version so it is recorded next to the benchmark output.
!gdi --version

gdi, version 0.6.0.dev2


## 2.1. Index reads

In [6]:
import time

# Measure the total duration with a monotonic clock: the benchmark runs for
# hours, and time.time() can jump if the system clock is adjusted meanwhile,
# which would corrupt the reported figure.
start = time.monotonic()
if single_index:
    # One shared index at index_path for all datasets; the benchmarker receives
    # the per-dataset input tables and reference files as dicts keyed by dataset name.
    benchmarker = gdi_benchmark.IndexBenchmarkerMultiple(index_path=index_path, input_files_files=cases_inputs,
                                                 reference_files=reference_files, mincov=mincov,
                                                 build_tree=build_tree,
                                                 ncores=ncores,
                                                 sample_batch_size=sample_batch_size)
    benchmark_df = benchmarker.benchmark(iterations=iterations)
    benchmark_df.to_csv(benchmark_path, sep='\t', index=False)
else:
    # One separate index, and one results file, per dataset.
    for dataset_name in cases_index:
        index_path_organism = cases_index[dataset_name]
        reference_file_organism = reference_files[dataset_name]
        cases_input_organism = cases_inputs[dataset_name]
        out_file = benchmark_outs[dataset_name]

        print(f'\n\nHandling {dataset_name} as separate index located at {index_path_organism} '
              f'and reference file {reference_file_organism} and input file {cases_input_organism}\n')
        results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name}:{dataset_name}')
        benchmarker = gdi_benchmark.IndexBenchmarker(benchmark_results_handler=results_handler,
                                                     index_path=index_path_organism,
                                                     input_files_file=cases_input_organism,
                                                     reference_file=reference_file_organism,
                                                     mincov=mincov,
                                                     build_tree=build_tree,
                                                     ncores=ncores,
                                                     sample_batch_size=sample_batch_size
                                                    )
        benchmark_organism_df = benchmarker.benchmark(iterations=iterations)
        print(f'Finished analysis for {dataset_name}: writing results to {out_file}\n')
        # NOTE(review): unlike the single-index branch this also writes the row
        # index (no index=False). Left unchanged in case downstream readers
        # rely on that extra column -- confirm and align the two branches.
        benchmark_organism_df.to_csv(out_file, sep='\t')
end = time.monotonic()
print(f'***Finished benchmarking, took {(end - start)/60:0.2f} minutes***')

Creating new index: [gdi init cases/case-full-si/index]




Creating a new index took 3.09 seconds



Iteration 1 of index/analysis of 22 samples for reference=data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647615454.5888474']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/0810PADBR-1_input-files-case.tsv]


Analysis took 53.05 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647617331.333969/gdi-input.fofn]


Indexing took 0.85 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 0.38 minutes


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 1 of index/analysis of 9 samples for reference=data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647617331.333969']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/1405WAEXK-1_input-files-case.tsv]


Analysis took 16.72 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647620592.7710931/gdi-input.fofn]


Indexing took 3.57 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 0.51 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f192f3f84c0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 1 of index/analysis of 31 samples for reference=data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647620592.7710931']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 39.83 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647621845.6933353/gdi-input.fofn]


Indexing took 1.96 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 0.62 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f19587eeac0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 1 of index/analysis of 23 samples for reference=data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647621845.6933353']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 25.34 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647624395.2156107/gdi-input.fofn]


Indexing took 5.21 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 0.61 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f193fa86850> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Removing any existing indexes cases/case-full-si/index
Creating new index: [gdi init cases/case-full-si/index]


Creating a new index took 3.30 seconds



Iteration 2 of index/analysis of 22 samples for reference=data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647624395.2156107']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/0810PADBR-1_input-files-case.tsv]


Analysis took 52.79 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647626272.7751667/gdi-input.fofn]


Indexing took 0.91 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 0.35 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f195874a8e0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 9 samples for reference=data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647626272.7751667']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/1405WAEXK-1_input-files-case.tsv]


Analysis took 16.76 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647629521.6229932/gdi-input.fofn]


Indexing took 3.61 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 0.47 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f192f418850> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 31 samples for reference=data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647629521.6229932']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 39.93 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647630777.0668023/gdi-input.fofn]


Indexing took 2.07 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 0.60 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f193fced550> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 23 samples for reference=data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647630777.0668023']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 25.58 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647633338.9394805/gdi-input.fofn]


Indexing took 5.03 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 0.63 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f19261a4b50> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Removing any existing indexes cases/case-full-si/index
Creating new index: [gdi init cases/case-full-si/index]


Creating a new index took 3.28 seconds



Iteration 3 of index/analysis of 22 samples for reference=data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647633338.9394805']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/0810PADBR-1_input-files-case.tsv]


Analysis took 52.88 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647635221.8456268/gdi-input.fofn]


Indexing took 0.91 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 0.40 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f193f8662e0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 9 samples for reference=data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647635221.8456268']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/1405WAEXK-1_input-files-case.tsv]


Analysis took 16.84 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647638478.024706/gdi-input.fofn]


Indexing took 3.76 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 0.50 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f192f496e50> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 31 samples for reference=data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647638478.024706']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 39.91 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647639748.7230027/gdi-input.fofn]


Indexing took 1.94 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 0.59 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f198031c8e0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 23 samples for reference=data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647639748.7230027']


Analysis running: [gdi --project-dir cases/case-full-si/index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full-si/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 25.19 minutes
Index running: [gdi --project-dir cases/case-full-si/index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647642300.7274983/gdi-input.fofn]


Indexing took 5.40 minutes
Building tree: [gdi --project-dir cases/case-full-si/index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 0.66 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f192f496b20> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


***Finished benchmarking, took 447.54 minutes***


# 3. Export trees

In [7]:
import subprocess
import sys

if build_tree and single_index:
    # Export one tree per dataset from the shared index into the paths already
    # computed in output_trees. subprocess is used instead of `!` shell lines so
    # that a failing export stops the notebook instead of silently leaving an
    # empty or partial .tre file behind.
    for dataset_name, tree_path in output_trees.items():
        # Genome name = reference file name without '.gz' and without the
        # remaining extension. This matches the names in the 'rebuild tree'
        # commands logged by the benchmark cell for the '.gbk.gz' references
        # used here -- confirm before using other reference formats.
        reference_file_name = reference_files[dataset_name].name
        if reference_file_name.endswith('.gz'):
            reference_file_name = reference_file_name[:-len('.gz')]
        reference_name = Path(reference_file_name).stem

        export_command = ['gdi', '--project-dir', str(index_path), 'export', 'tree', reference_name]
        with open(tree_path, 'w') as tree_file:
            completed = subprocess.run(export_command, stdout=tree_file,
                                       stderr=subprocess.PIPE, text=True)
        # Show gdi's messages in the cell output, as the `!` commands did.
        if completed.stderr:
            print(completed.stderr, file=sys.stderr, end='')
        if completed.returncode != 0:
            raise RuntimeError(f'Exporting tree for {dataset_name} failed with exit code '
                               f'{completed.returncode}: {export_command}')
else:
    print(f'build_tree={build_tree} and single_index={single_index} so no trees to export')