# 1. Parameters

In [1]:
# Defaults
# Default values for every tunable setting of this notebook; later cells read these names directly.
cases_dir = 'cases/unset'  # output directory for this case; it is deleted and recreated by the setup cell
metadata_file = 'metadata.tsv'  # tab-separated sample metadata table read with pandas
iterations = 3  # passed to the benchmarker's benchmark(iterations=...) call
mincov = 10  # passed to the benchmarkers as `mincov`
ncores = 4  # passed to the benchmarkers as `ncores`
max_samples_per_organism = 4  # rows kept per dataset (via DataFrame.head) when full_dataset is False
full_dataset = False  # True disables the per-dataset subsetting
single_index = False  # True: one shared index for all datasets; False: a separate index per dataset

build_tree = True  # passed to the benchmarkers; the tree export cell also requires it
sample_batch_size=10  # passed to the benchmarkers as `sample_batch_size`

In [2]:
# Parameters
# Run-specific overrides of the defaults defined in the previous cell.
# NOTE(review): this looks like a cell injected by a notebook-parameterisation tool
# (e.g. papermill) -- confirm before editing it by hand.
cases_dir = "cases/case-full"
iterations = 3
# NOTE(review): this is the string "None", not the None object; a later cell resets it
# to None when full_dataset is True.
max_samples_per_organism = "None"
full_dataset = True
single_index = False
build_tree = True


In [3]:
from pathlib import Path
from shutil import rmtree
from os import makedirs
import importlib.machinery
import importlib.util
import sys

# Load the project-local `gdi_benchmark` helper from ../../lib without installing it.
# `importlib` replaces the `imp` module (deprecated, removed in Python 3.12) and, unlike
# `imp.find_module`, does not hand back an open file object that has to be closed.
gdi_benchmark_lib_dir = Path('../../lib').resolve()
gdi_benchmark_spec = importlib.machinery.PathFinder.find_spec('gdi_benchmark', [str(gdi_benchmark_lib_dir)])
if gdi_benchmark_spec is None:
    raise ImportError(f"No module named 'gdi_benchmark' found in {gdi_benchmark_lib_dir}")
gdi_benchmark = importlib.util.module_from_spec(gdi_benchmark_spec)
sys.modules['gdi_benchmark'] = gdi_benchmark
gdi_benchmark_spec.loader.exec_module(gdi_benchmark)

# No per-dataset sample limit applies to a full run. The parameters cell can also deliver
# the string "None" instead of the None object, so normalise that spelling as well.
if full_dataset or max_samples_per_organism == 'None':
    max_samples_per_organism = None

# Reference genome (GenBank) used for each dataset, keyed by the dataset name.
reference_files = {
    '0810PADBR-1': Path('data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz'),
    '1405WAEXK-1': Path('data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz'),
    '1408MLGX6-3WGS': Path('data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz'),
    '1203NYJAP-1 - Tuna Scrape Outbreak': Path('data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz'),
}

cases_dir_path = Path(cases_dir)
index_path = cases_dir_path / 'index'

# Always start from an empty case directory: drop any previous results, then recreate it.
if cases_dir_path.exists():
    rmtree(cases_dir_path)
makedirs(cases_dir_path)

case_name = str(cases_dir_path.name)

benchmark_path = cases_dir_path / 'index-info.tsv'

# One tree file per dataset; spaces in dataset names become underscores in the file name.
output_trees = {x: cases_dir_path / f'{x}_tree.tre'.replace(' ', '_') for x in reference_files}
output_trees

{'0810PADBR-1': PosixPath('cases/case-full/0810PADBR-1_tree.tre'),
 '1405WAEXK-1': PosixPath('cases/case-full/1405WAEXK-1_tree.tre'),
 '1408MLGX6-3WGS': PosixPath('cases/case-full/1408MLGX6-3WGS_tree.tre'),
 '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_tree.tre')}

# 2. Create subset inputs

In [4]:
import pandas as pd
from pathlib import Path
from typing import Dict

# Sample metadata for all datasets; the file is tab-separated, which is read_table's default.
metadata_df = pd.read_table(metadata_file)

def write_subset_input(metadata_df: pd.DataFrame, dataset_name: str) -> Path:
    """Write the input-files table for one dataset and return its path.

    Rows of ``metadata_df`` whose ``dataSetName`` equals ``dataset_name`` are selected.
    Unless the notebook-level ``full_dataset`` flag is set, only the first
    ``max_samples_per_organism`` of those rows are kept. The result is written as a
    tab-separated file with the columns ``Sample``, ``Assemblies``, ``Reads1`` and
    ``Reads2`` into the notebook-level ``cases_dir_path`` directory, and a one-line
    summary is printed.

    Args:
        metadata_df: Metadata table; must contain the columns ``dataSetName`` and
            ``strain``. It is not modified.
        dataset_name: Dataset to select; spaces are replaced by underscores in the
            output file name.

    Returns:
        Path of the written tab-separated file.
    """
    all_input_total = len(metadata_df)
    cases_input = cases_dir_path / f'{dataset_name}_input-files-case.tsv'.replace(' ', '_')

    subset_df = metadata_df.loc[metadata_df['dataSetName'] == dataset_name]

    # Subset samples
    if not full_dataset:
        subset_df = subset_df.head(max_samples_per_organism)

    # Copy only the selected rows, and only after all slicing is done, so the column
    # assignments below work on an independent frame instead of a slice of the caller's data.
    input_df = subset_df.copy()

    # Read files are expected at data/fastq/<strain>_{1,2}.fastq.gz; paths are made absolute.
    fastq_dir = (Path('data') / 'fastq').absolute()

    input_df['Sample'] = input_df['strain']
    input_df['Assemblies'] = pd.NA
    input_df['Reads1'] = input_df['Sample'].apply(lambda x: str(fastq_dir / (x + '_1.fastq.gz')))
    input_df['Reads2'] = input_df['Sample'].apply(lambda x: str(fastq_dir / (x + '_2.fastq.gz')))
    input_df = input_df[['Sample', 'Assemblies', 'Reads1', 'Reads2']]

    input_df.to_csv(cases_input, sep='\t', index=False)

    subset_input_total = len(input_df)

    print(f'Wrote dataset={dataset_name} consisting of {subset_input_total}/{all_input_total} samples to {cases_input}')

    return cases_input

# Per-dataset artefacts, all keyed by dataset name. Spaces in a dataset name are turned
# into underscores in the index directory and results file names.
cases_inputs = dict(
    (name, write_subset_input(metadata_df, dataset_name=name))
    for name in reference_files
)
cases_index = {
    name: cases_dir_path / (name.replace(' ', '_') + '-index')
    for name in cases_inputs
}
benchmark_outs = {
    name: cases_dir_path / (name.replace(' ', '_') + '-results.tsv')
    for name in cases_inputs
}
print(benchmark_outs)
print(cases_index)
cases_inputs

Wrote dataset=0810PADBR-1 consisting of 22/85 samples to cases/case-full/0810PADBR-1_input-files-case.tsv
Wrote dataset=1405WAEXK-1 consisting of 9/85 samples to cases/case-full/1405WAEXK-1_input-files-case.tsv
Wrote dataset=1408MLGX6-3WGS consisting of 31/85 samples to cases/case-full/1408MLGX6-3WGS_input-files-case.tsv
Wrote dataset=1203NYJAP-1 - Tuna Scrape Outbreak consisting of 23/85 samples to cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv
{'0810PADBR-1': PosixPath('cases/case-full/0810PADBR-1-results.tsv'), '1405WAEXK-1': PosixPath('cases/case-full/1405WAEXK-1-results.tsv'), '1408MLGX6-3WGS': PosixPath('cases/case-full/1408MLGX6-3WGS-results.tsv'), '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-results.tsv')}
{'0810PADBR-1': PosixPath('cases/case-full/0810PADBR-1-index'), '1405WAEXK-1': PosixPath('cases/case-full/1405WAEXK-1-index'), '1408MLGX6-3WGS': PosixPath('cases/case-full/1408MLGX6-3WGS-index'), 

{'0810PADBR-1': PosixPath('cases/case-full/0810PADBR-1_input-files-case.tsv'),
 '1405WAEXK-1': PosixPath('cases/case-full/1405WAEXK-1_input-files-case.tsv'),
 '1408MLGX6-3WGS': PosixPath('cases/case-full/1408MLGX6-3WGS_input-files-case.tsv'),
 '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv')}

# 2. Index genomes

In [5]:
# Print the version of the `gdi` command-line tool so the run records which one was used.
!gdi --version

gdi, version 0.6.0.dev2


## 2.1. Index reads

In [6]:
import time

start = time.time()

# Settings handed unchanged to whichever benchmarker is used below.
shared_options = dict(mincov=mincov, build_tree=build_tree, ncores=ncores,
                      sample_batch_size=sample_batch_size)

if not single_index:
    # Separate-index mode: every dataset gets its own index directory, reference genome,
    # input table and results file.
    for dataset_name, index_path_organism in cases_index.items():
        reference_file_organism = reference_files[dataset_name]
        cases_input_organism = cases_inputs[dataset_name]
        out_file = benchmark_outs[dataset_name]

        print(f'\n\nHandling {dataset_name} as separate index located at {index_path_organism} '
              f'and reference file {reference_file_organism} and input file {cases_input_organism}\n')

        results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name}:{dataset_name}')
        benchmarker = gdi_benchmark.IndexBenchmarker(
            benchmark_results_handler=results_handler,
            index_path=index_path_organism,
            input_files_file=cases_input_organism,
            reference_file=reference_file_organism,
            **shared_options
        )
        benchmark_organism_df = benchmarker.benchmark(iterations=iterations)

        print(f'Finished analysis for {dataset_name}: writing results to {out_file}\n')
        # NOTE(review): unlike the single-index branch this does not pass index=False, so the
        # DataFrame index is written as well -- confirm that is intended.
        benchmark_organism_df.to_csv(out_file, sep='\t')
else:
    # Single-index mode: all datasets are benchmarked together in one shared index.
    benchmarker = gdi_benchmark.IndexBenchmarkerMultiple(
        index_path=index_path,
        input_files_files=cases_inputs,
        reference_files=reference_files,
        **shared_options
    )
    benchmark_df = benchmarker.benchmark(iterations=iterations)
    benchmark_df.to_csv(benchmark_path, sep='\t', index=False)

end = time.time()
print(f'***Finished benchmarking, took {(end - start)/60:0.2f} minutes***')



Handling 0810PADBR-1 as separate index located at cases/case-full/0810PADBR-1-index and reference file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz and input file cases/case-full/0810PADBR-1_input-files-case.tsv




Iteration 1 of index/analysis of 22 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647589122.6117032']


Creating new index: [gdi init cases/case-full/0810PADBR-1-index]


Creating a new index took 3.35 seconds
Analysis running: [gdi --project-dir cases/case-full/0810PADBR-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/0810PADBR-1_input-files-case.tsv]


Analysis took 52.86 minutes
Index running: [gdi --project-dir cases/case-full/0810PADBR-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647590535.9341507/gdi-input.fofn]


Indexing took 0.88 minutes
Building tree: [gdi --project-dir cases/case-full/0810PADBR-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 0.40 minutes


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 22 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647590535.9341507']


Removing any existing indexes cases/case-full/0810PADBR-1-index
Creating new index: [gdi init cases/case-full/0810PADBR-1-index]


Creating a new index took 3.09 seconds
Analysis running: [gdi --project-dir cases/case-full/0810PADBR-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/0810PADBR-1_input-files-case.tsv]


Analysis took 52.99 minutes
Index running: [gdi --project-dir cases/case-full/0810PADBR-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647593791.1410506/gdi-input.fofn]


Indexing took 0.97 minutes
Building tree: [gdi --project-dir cases/case-full/0810PADBR-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 0.38 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3099dd12b0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 22 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647593791.1410506']


Removing any existing indexes cases/case-full/0810PADBR-1-index
Creating new index: [gdi init cases/case-full/0810PADBR-1-index]


Creating a new index took 3.34 seconds
Analysis running: [gdi --project-dir cases/case-full/0810PADBR-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/0810PADBR-1_input-files-case.tsv]


Analysis took 53.05 minutes
Index running: [gdi --project-dir cases/case-full/0810PADBR-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647597058.7507536/gdi-input.fofn]


Indexing took 0.88 minutes
Building tree: [gdi --project-dir cases/case-full/0810PADBR-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 0.37 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3099dc1af0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Finished analysis for 0810PADBR-1: writing results to cases/case-full/0810PADBR-1-results.tsv



Handling 1405WAEXK-1 as separate index located at cases/case-full/1405WAEXK-1-index and reference file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz and input file cases/case-full/1405WAEXK-1_input-files-case.tsv




Iteration 1 of index/analysis of 9 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647597058.7507536']


Creating new index: [gdi init cases/case-full/1405WAEXK-1-index]


Creating a new index took 3.06 seconds
Analysis running: [gdi --project-dir cases/case-full/1405WAEXK-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/1405WAEXK-1_input-files-case.tsv]


Analysis took 16.84 minutes
Index running: [gdi --project-dir cases/case-full/1405WAEXK-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647600324.0154092/gdi-input.fofn]


Indexing took 3.75 minutes
Building tree: [gdi --project-dir cases/case-full/1405WAEXK-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 0.50 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f30a2402850> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 9 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647600324.0154092']


Removing any existing indexes cases/case-full/1405WAEXK-1-index
Creating new index: [gdi init cases/case-full/1405WAEXK-1-index]


Creating a new index took 3.44 seconds
Analysis running: [gdi --project-dir cases/case-full/1405WAEXK-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/1405WAEXK-1_input-files-case.tsv]


Analysis took 16.24 minutes
Index running: [gdi --project-dir cases/case-full/1405WAEXK-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647601596.0829206/gdi-input.fofn]


Indexing took 3.49 minutes
Building tree: [gdi --project-dir cases/case-full/1405WAEXK-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 0.49 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f30a20e7160> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 9 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647601596.0829206']


Removing any existing indexes cases/case-full/1405WAEXK-1-index
Creating new index: [gdi init cases/case-full/1405WAEXK-1-index]


Creating a new index took 3.30 seconds
Analysis running: [gdi --project-dir cases/case-full/1405WAEXK-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/1405WAEXK-1_input-files-case.tsv]


Analysis took 16.47 minutes
Index running: [gdi --project-dir cases/case-full/1405WAEXK-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647602816.3492427/gdi-input.fofn]


Indexing took 3.96 minutes
Building tree: [gdi --project-dir cases/case-full/1405WAEXK-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 0.51 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f30a267dee0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Finished analysis for 1405WAEXK-1: writing results to cases/case-full/1405WAEXK-1-results.tsv



Handling 1408MLGX6-3WGS as separate index located at cases/case-full/1408MLGX6-3WGS-index and reference file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz and input file cases/case-full/1408MLGX6-3WGS_input-files-case.tsv




Iteration 1 of index/analysis of 31 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647602816.3492427']


Creating new index: [gdi init cases/case-full/1408MLGX6-3WGS-index]


Creating a new index took 3.31 seconds
Analysis running: [gdi --project-dir cases/case-full/1408MLGX6-3WGS-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 40.00 minutes
Index running: [gdi --project-dir cases/case-full/1408MLGX6-3WGS-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647604080.9889867/gdi-input.fofn]


Indexing took 2.04 minutes
Building tree: [gdi --project-dir cases/case-full/1408MLGX6-3WGS-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 0.57 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3099a37f10> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 31 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647604080.9889867']


Removing any existing indexes cases/case-full/1408MLGX6-3WGS-index
Creating new index: [gdi init cases/case-full/1408MLGX6-3WGS-index]


Creating a new index took 3.14 seconds
Analysis running: [gdi --project-dir cases/case-full/1408MLGX6-3WGS-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 39.89 minutes
Index running: [gdi --project-dir cases/case-full/1408MLGX6-3WGS-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647606644.5436273/gdi-input.fofn]


Indexing took 1.90 minutes
Building tree: [gdi --project-dir cases/case-full/1408MLGX6-3WGS-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 0.57 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f30a254da60> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 31 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647606644.5436273']


Removing any existing indexes cases/case-full/1408MLGX6-3WGS-index
Creating new index: [gdi init cases/case-full/1408MLGX6-3WGS-index]


Creating a new index took 3.06 seconds
Analysis running: [gdi --project-dir cases/case-full/1408MLGX6-3WGS-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 40.17 minutes
Index running: [gdi --project-dir cases/case-full/1408MLGX6-3WGS-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647609193.0921483/gdi-input.fofn]


Indexing took 1.87 minutes
Building tree: [gdi --project-dir cases/case-full/1408MLGX6-3WGS-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 0.61 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3099a37ee0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Finished analysis for 1408MLGX6-3WGS: writing results to cases/case-full/1408MLGX6-3WGS-results.tsv



Handling 1203NYJAP-1 - Tuna Scrape Outbreak as separate index located at cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index and reference file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz and input file cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv




Iteration 1 of index/analysis of 23 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647609193.0921483']


Creating new index: [gdi init cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index]


Creating a new index took 3.43 seconds
Analysis running: [gdi --project-dir cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 25.06 minutes
Index running: [gdi --project-dir cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647611759.7403781/gdi-input.fofn]


Indexing took 4.60 minutes
Building tree: [gdi --project-dir cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 0.61 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f30a2512250> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 23 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647611759.7403781']


Removing any existing indexes cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index
Creating new index: [gdi init cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index]


Creating a new index took 3.37 seconds
Analysis running: [gdi --project-dir cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 25.19 minutes
Index running: [gdi --project-dir cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647613583.2253702/gdi-input.fofn]


Indexing took 5.19 minutes
Building tree: [gdi --project-dir cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 0.68 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f30a23d6e50> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 23 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647613583.2253702']


Removing any existing indexes cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index
Creating new index: [gdi init cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index]


Creating a new index took 3.37 seconds
Analysis running: [gdi --project-dir cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 25.34 minutes
Index running: [gdi --project-dir cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647615454.5888474/gdi-input.fofn]


Indexing took 5.04 minutes
Building tree: [gdi --project-dir cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 0.61 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f30a2228610> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Finished analysis for 1203NYJAP-1 - Tuna Scrape Outbreak: writing results to cases/case-full/1203NYJAP-1_-_Tuna_Scrape_Outbreak-results.tsv

***Finished benchmarking, took 446.43 minutes***


# 3. Export trees

In [7]:
if build_tree and single_index:
    # Using variables with shell commands in Jupyter isn't working for me so I have to avoid using them
    !gdi --project-dir {index_path} export tree GCA_001879185.2_ASM187918v2_genomic > {cases_dir}/0810PADBR-1_tree.tre
    !gdi --project-dir {index_path} export tree GCF_000703365.1_Ec2011C-3609_genomic > {cases_dir}/1405WAEXK-1_tree.tre
    !gdi --project-dir {index_path} export tree GCF_001047715.2_ASM104771v2_genomic > {cases_dir}/1408MLGX6-3WGS_tree.tre
    !gdi --project-dir {index_path} export tree GCF_000439415.1_ASM43941v1_genomic > {cases_dir}/1203NYJAP-1_-_Tuna_Scrape_Outbreak_tree.tre
else:
    print(f'build_tree={build_tree} and single_index={single_index} so no trees to export')

build_tree=True and single_index=False so no trees to export
