# 1. Parameters

In [1]:
# Defaults
# Input/output locations.
cases_dir: str = 'cases/unset'
metadata_file: str = 'metadata.tsv'

# Benchmark settings: repeat count, minimum read coverage, cores and
# the number of samples loaded per indexing batch.
iterations: int = 3
mincov: int = 10
ncores: int = 4
sample_batch_size: int = 10

# Dataset size: cap on samples per organism (ignored when full_dataset is True).
max_samples_per_organism: int = 4
full_dataset: bool = False

# Index layout: one shared index (True) or one index per dataset (False),
# and whether a tree is built for each index.
single_index: bool = False
build_tree: bool = True

In [2]:
# Parameters
# Per-run values that override the defaults assigned in the cell above.
# NOTE(review): this looks like a cell injected by a notebook runner
# (e.g. papermill) -- confirm before editing it by hand.
cases_dir = "cases/case-08"
iterations = 3
max_samples_per_organism = 8
full_dataset = False
single_index = False
build_tree = True


In [3]:
import importlib.machinery
import importlib.util
import sys
from os import makedirs
from pathlib import Path
from shutil import rmtree

# Load the project-local `gdi_benchmark` helper from ../../lib without relying
# on the `imp` module (deprecated, and removed in Python 3.12). PathFinder
# searches only the given directory, so sys.path is left untouched and, unlike
# imp.find_module(), no file handle is left open.
_gdi_benchmark_spec = importlib.machinery.PathFinder.find_spec('gdi_benchmark', ['../../lib'])
if _gdi_benchmark_spec is None:
    # Same exception type imp.find_module() raised for a missing module.
    raise ImportError("No module named 'gdi_benchmark' found in '../../lib'")
gdi_benchmark = importlib.util.module_from_spec(_gdi_benchmark_spec)
# Register before executing so imports inside the module can resolve it.
sys.modules['gdi_benchmark'] = gdi_benchmark
_gdi_benchmark_spec.loader.exec_module(gdi_benchmark)

# A full-dataset run removes the per-organism sample cap.
if full_dataset:
    max_samples_per_organism = None

# Reference genome file for each dataset, keyed by dataset name.
reference_files = {
    '0810PADBR-1': Path('data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz'),
    '1405WAEXK-1': Path('data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz'),
    '1408MLGX6-3WGS': Path('data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz'),
    '1203NYJAP-1 - Tuna Scrape Outbreak': Path('data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz'),
}

# Everything produced by this run lives under the case directory.
cases_dir_path = Path(cases_dir)
case_name = str(cases_dir_path.name)
index_path = cases_dir_path / 'index'
benchmark_path = cases_dir_path / 'index-info.tsv'

# Start from an empty case directory: discard any previous run, then recreate it.
if cases_dir_path.exists():
    rmtree(cases_dir_path)
makedirs(cases_dir_path)

# One tree file per dataset; spaces in dataset names become underscores.
output_trees = {
    dataset: cases_dir_path / (dataset.replace(' ', '_') + '_tree.tre')
    for dataset in reference_files
}
output_trees

{'0810PADBR-1': PosixPath('cases/case-08/0810PADBR-1_tree.tre'),
 '1405WAEXK-1': PosixPath('cases/case-08/1405WAEXK-1_tree.tre'),
 '1408MLGX6-3WGS': PosixPath('cases/case-08/1408MLGX6-3WGS_tree.tre'),
 '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak_tree.tre')}

# 2. Create subset inputs

In [4]:
import pandas as pd
from pathlib import Path
from typing import Dict

# Load the tab-separated sample metadata table named by `metadata_file`.
metadata_df = pd.read_csv(metadata_file, sep='\t')

def write_subset_input(metadata_df: pd.DataFrame, dataset_name: str) -> Path:
    """Write the input-files table for one dataset and return its path.

    Rows of ``metadata_df`` whose ``dataSetName`` equals ``dataset_name`` are
    selected. Unless the notebook-level ``full_dataset`` flag is set, only the
    first ``max_samples_per_organism`` of them are kept. The table is written
    tab-separated under ``cases_dir_path`` with the columns ``Sample``,
    ``Assemblies``, ``Reads1`` and ``Reads2``.

    Args:
        metadata_df: Metadata table with at least the ``dataSetName`` and
            ``strain`` columns.
        dataset_name: Dataset to select; also used (spaces replaced by
            underscores) in the output file name.

    Returns:
        Path of the TSV file that was written.
    """
    fastq_dir = Path('data') / 'fastq'
    out_path = cases_dir_path / f'{dataset_name}_input-files-case.tsv'.replace(' ', '_')

    def reads_path(sample, mate_suffix):
        # Absolute path (as a string) of one mate file for the given sample.
        return str((fastq_dir / (sample + mate_suffix)).absolute())

    selected = metadata_df.copy().loc[metadata_df['dataSetName'] == dataset_name]
    if not full_dataset:
        # Subset samples
        selected = selected.head(max_samples_per_organism)

    # The sample name is the strain; no assemblies are supplied, only paired reads.
    subset_df = selected.assign(
        Sample=selected['strain'],
        Assemblies=pd.NA,
        Reads1=lambda df: df['Sample'].apply(lambda s: reads_path(s, '_1.fastq.gz')),
        Reads2=lambda df: df['Sample'].apply(lambda s: reads_path(s, '_2.fastq.gz')),
    )[['Sample', 'Assemblies', 'Reads1', 'Reads2']]

    subset_df.to_csv(out_path, sep='\t', index=False)

    # The denominator is the size of the whole metadata table, not of this dataset.
    n_written, n_total = len(subset_df), len(metadata_df)
    print(f'Wrote dataset={dataset_name} consisting of {n_written}/{n_total} samples to {out_path}')

    return out_path

# Write one input table per dataset, in the order the references are declared.
cases_inputs = dict(
    (name, write_subset_input(metadata_df, dataset_name=name))
    for name in reference_files
)

# Per-dataset index directory and benchmark results file; spaces in dataset
# names are replaced by underscores.
cases_index = dict(
    (name, cases_dir_path / (name.replace(' ', '_') + '-index'))
    for name in cases_inputs
)
benchmark_outs = dict(
    (name, cases_dir_path / (name.replace(' ', '_') + '-results.tsv'))
    for name in cases_inputs
)

print(benchmark_outs)
print(cases_index)
cases_inputs

Wrote dataset=0810PADBR-1 consisting of 8/85 samples to cases/case-08/0810PADBR-1_input-files-case.tsv
Wrote dataset=1405WAEXK-1 consisting of 8/85 samples to cases/case-08/1405WAEXK-1_input-files-case.tsv
Wrote dataset=1408MLGX6-3WGS consisting of 8/85 samples to cases/case-08/1408MLGX6-3WGS_input-files-case.tsv
Wrote dataset=1203NYJAP-1 - Tuna Scrape Outbreak consisting of 8/85 samples to cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv
{'0810PADBR-1': PosixPath('cases/case-08/0810PADBR-1-results.tsv'), '1405WAEXK-1': PosixPath('cases/case-08/1405WAEXK-1-results.tsv'), '1408MLGX6-3WGS': PosixPath('cases/case-08/1408MLGX6-3WGS-results.tsv'), '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-results.tsv')}
{'0810PADBR-1': PosixPath('cases/case-08/0810PADBR-1-index'), '1405WAEXK-1': PosixPath('cases/case-08/1405WAEXK-1-index'), '1408MLGX6-3WGS': PosixPath('cases/case-08/1408MLGX6-3WGS-index'), '1203NYJAP-1 - Tuna Scrap

{'0810PADBR-1': PosixPath('cases/case-08/0810PADBR-1_input-files-case.tsv'),
 '1405WAEXK-1': PosixPath('cases/case-08/1405WAEXK-1_input-files-case.tsv'),
 '1408MLGX6-3WGS': PosixPath('cases/case-08/1408MLGX6-3WGS_input-files-case.tsv'),
 '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv')}

# 2. Index genomes

In [5]:
!gdi --version

gdi, version 0.6.0.dev2


## 2.1. Index reads

In [6]:
import time

start = time.time()

# Keyword arguments passed unchanged to whichever benchmarker is used.
shared_benchmark_options = dict(
    mincov=mincov,
    build_tree=build_tree,
    ncores=ncores,
    sample_batch_size=sample_batch_size,
)

if not single_index:
    # One separate index, benchmark run and results file per dataset.
    for dataset_name, index_path_organism in cases_index.items():
        reference_file_organism = reference_files[dataset_name]
        cases_input_organism = cases_inputs[dataset_name]
        out_file = benchmark_outs[dataset_name]

        print(f'\n\nHandling {dataset_name} as separate index located at {index_path_organism} '
              f'and reference file {reference_file_organism} and input file {cases_input_organism}\n')
        results_handler = gdi_benchmark.BenchmarkResultsHandler(name=f'{case_name}:{dataset_name}')
        benchmarker = gdi_benchmark.IndexBenchmarker(
            benchmark_results_handler=results_handler,
            index_path=index_path_organism,
            input_files_file=cases_input_organism,
            reference_file=reference_file_organism,
            **shared_benchmark_options
        )
        benchmark_organism_df = benchmarker.benchmark(iterations=iterations)
        print(f'Finished analysis for {dataset_name}: writing results to {out_file}\n')
        # Note: unlike the single-index branch, the DataFrame index is written here.
        benchmark_organism_df.to_csv(out_file, sep='\t')
else:
    # All datasets share a single index and a single results file.
    benchmarker = gdi_benchmark.IndexBenchmarkerMultiple(
        index_path=index_path,
        input_files_files=cases_inputs,
        reference_files=reference_files,
        **shared_benchmark_options
    )
    benchmark_df = benchmarker.benchmark(iterations=iterations)
    benchmark_df.to_csv(benchmark_path, sep='\t', index=False)

end = time.time()
elapsed_minutes = (end - start) / 60
print(f'***Finished benchmarking, took {elapsed_minutes:0.2f} minutes***')



Handling 0810PADBR-1 as separate index located at cases/case-08/0810PADBR-1-index and reference file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz and input file cases/case-08/0810PADBR-1_input-files-case.tsv




Iteration 1 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647558270.0628698']


Creating new index: [gdi init cases/case-08/0810PADBR-1-index]


Creating a new index took 3.15 seconds
Analysis running: [gdi --project-dir cases/case-08/0810PADBR-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/0810PADBR-1_input-files-case.tsv]


Analysis took 21.76 minutes
Index running: [gdi --project-dir cases/case-08/0810PADBR-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647558802.9777098/gdi-input.fofn]


Indexing took 0.35 minutes
Building tree: [gdi --project-dir cases/case-08/0810PADBR-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 0.35 minutes


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647558802.9777098']


Removing any existing indexes cases/case-08/0810PADBR-1-index
Creating new index: [gdi init cases/case-08/0810PADBR-1-index]


Creating a new index took 3.25 seconds
Analysis running: [gdi --project-dir cases/case-08/0810PADBR-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/0810PADBR-1_input-files-case.tsv]


Analysis took 21.93 minutes
Index running: [gdi --project-dir cases/case-08/0810PADBR-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647560157.121017/gdi-input.fofn]


Indexing took 0.35 minutes
Building tree: [gdi --project-dir cases/case-08/0810PADBR-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 0.31 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c074722e0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647560157.121017']


Removing any existing indexes cases/case-08/0810PADBR-1-index
Creating new index: [gdi init cases/case-08/0810PADBR-1-index]


Creating a new index took 3.27 seconds
Analysis running: [gdi --project-dir cases/case-08/0810PADBR-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/0810PADBR-1_input-files-case.tsv]


Analysis took 21.78 minutes
Index running: [gdi --project-dir cases/case-08/0810PADBR-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647561518.4858708/gdi-input.fofn]


Indexing took 0.34 minutes
Building tree: [gdi --project-dir cases/case-08/0810PADBR-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 0.31 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c07472370> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Finished analysis for 0810PADBR-1: writing results to cases/case-08/0810PADBR-1-results.tsv



Handling 1405WAEXK-1 as separate index located at cases/case-08/1405WAEXK-1-index and reference file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz and input file cases/case-08/1405WAEXK-1_input-files-case.tsv




Iteration 1 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647561518.4858708']


Creating new index: [gdi init cases/case-08/1405WAEXK-1-index]


Creating a new index took 3.25 seconds
Analysis running: [gdi --project-dir cases/case-08/1405WAEXK-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/1405WAEXK-1_input-files-case.tsv]


Analysis took 12.80 minutes
Index running: [gdi --project-dir cases/case-08/1405WAEXK-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647562871.447983/gdi-input.fofn]


Indexing took 3.76 minutes
Building tree: [gdi --project-dir cases/case-08/1405WAEXK-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 0.48 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c1ff7fd90> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647562871.447983']


Removing any existing indexes cases/case-08/1405WAEXK-1-index
Creating new index: [gdi init cases/case-08/1405WAEXK-1-index]


Creating a new index took 3.27 seconds
Analysis running: [gdi --project-dir cases/case-08/1405WAEXK-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/1405WAEXK-1_input-files-case.tsv]


Analysis took 12.79 minutes
Index running: [gdi --project-dir cases/case-08/1405WAEXK-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647563901.1439133/gdi-input.fofn]


Indexing took 3.37 minutes
Building tree: [gdi --project-dir cases/case-08/1405WAEXK-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 0.49 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1a54983e80> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647563901.1439133']


Removing any existing indexes cases/case-08/1405WAEXK-1-index
Creating new index: [gdi init cases/case-08/1405WAEXK-1-index]


Creating a new index took 3.29 seconds
Analysis running: [gdi --project-dir cases/case-08/1405WAEXK-1-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/1405WAEXK-1_input-files-case.tsv]


Analysis took 13.09 minutes
Index running: [gdi --project-dir cases/case-08/1405WAEXK-1-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647564907.6986184/gdi-input.fofn]


Indexing took 3.41 minutes
Building tree: [gdi --project-dir cases/case-08/1405WAEXK-1-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 0.51 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c1ff97c70> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Finished analysis for 1405WAEXK-1: writing results to cases/case-08/1405WAEXK-1-results.tsv



Handling 1408MLGX6-3WGS as separate index located at cases/case-08/1408MLGX6-3WGS-index and reference file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz and input file cases/case-08/1408MLGX6-3WGS_input-files-case.tsv




Iteration 1 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647564907.6986184']


Creating new index: [gdi init cases/case-08/1408MLGX6-3WGS-index]


Creating a new index took 3.12 seconds
Analysis running: [gdi --project-dir cases/case-08/1408MLGX6-3WGS-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 12.38 minutes
Index running: [gdi --project-dir cases/case-08/1408MLGX6-3WGS-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647565936.1444206/gdi-input.fofn]


Indexing took 0.59 minutes
Building tree: [gdi --project-dir cases/case-08/1408MLGX6-3WGS-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 0.36 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c07b41850> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647565936.1444206']


Removing any existing indexes cases/case-08/1408MLGX6-3WGS-index
Creating new index: [gdi init cases/case-08/1408MLGX6-3WGS-index]


Creating a new index took 3.36 seconds
Analysis running: [gdi --project-dir cases/case-08/1408MLGX6-3WGS-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 12.35 minutes
Index running: [gdi --project-dir cases/case-08/1408MLGX6-3WGS-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647566742.6296582/gdi-input.fofn]


Indexing took 0.60 minutes
Building tree: [gdi --project-dir cases/case-08/1408MLGX6-3WGS-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 0.33 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c1ff1e250> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647566742.6296582']


Removing any existing indexes cases/case-08/1408MLGX6-3WGS-index
Creating new index: [gdi init cases/case-08/1408MLGX6-3WGS-index]


Creating a new index took 3.31 seconds
Analysis running: [gdi --project-dir cases/case-08/1408MLGX6-3WGS-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 12.46 minutes
Index running: [gdi --project-dir cases/case-08/1408MLGX6-3WGS-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647567546.069963/gdi-input.fofn]


Indexing took 0.61 minutes
Building tree: [gdi --project-dir cases/case-08/1408MLGX6-3WGS-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 0.37 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c078c97f0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Finished analysis for 1408MLGX6-3WGS: writing results to cases/case-08/1408MLGX6-3WGS-results.tsv



Handling 1203NYJAP-1 - Tuna Scrape Outbreak as separate index located at cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index and reference file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz and input file cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv




Iteration 1 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647567546.069963']


Creating new index: [gdi init cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index]


Creating a new index took 3.36 seconds
Analysis running: [gdi --project-dir cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 10.76 minutes
Index running: [gdi --project-dir cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647568359.201032/gdi-input.fofn]


Indexing took 1.32 minutes
Building tree: [gdi --project-dir cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 0.41 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c07c6b2b0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647568359.201032']


Removing any existing indexes cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index
Creating new index: [gdi init cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index]


Creating a new index took 3.27 seconds
Analysis running: [gdi --project-dir cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 10.73 minutes
Index running: [gdi --project-dir cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647569114.739662/gdi-input.fofn]


Indexing took 1.32 minutes
Building tree: [gdi --project-dir cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 0.45 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c2876f070> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 8 samples with 4 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1647569114.739662']


Removing any existing indexes cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index
Creating new index: [gdi init cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index]


Creating a new index took 3.06 seconds
Analysis running: [gdi --project-dir cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 11.10 minutes
Index running: [gdi --project-dir cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1647569871.4047613/gdi-input.fofn]


Indexing took 1.35 minutes
Building tree: [gdi --project-dir cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-index --ncores 4 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 0.47 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f1c417ce8e0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Finished analysis for 1203NYJAP-1 - Tuna Scrape Outbreak: writing results to cases/case-08/1203NYJAP-1_-_Tuna_Scrape_Outbreak-results.tsv

***Finished benchmarking, took 197.50 minutes***


# 3. Export trees

In [7]:
if build_tree and single_index:
    import subprocess

    # Export one tree per reference genome from the shared index. The output
    # paths come from `output_trees` rather than being repeated by hand.
    for dataset_name, reference_file in reference_files.items():
        # The genome name is the reference file name without its '.gz' and
        # format suffix (e.g. 'X_genomic.gbk.gz' -> 'X_genomic').
        # NOTE(review): this mirrors the names seen in the `rebuild tree`
        # commands logged during indexing -- confirm it is how gdi names genomes.
        genome_name = reference_file.name
        if genome_name.endswith('.gz'):
            genome_name = genome_name[:-len('.gz')]
        genome_name = genome_name.rsplit('.', 1)[0]

        tree_file = output_trees[dataset_name]
        export_command = ['gdi', '--project-dir', str(index_path), 'export', 'tree', genome_name]

        # Arguments are passed as a list (no shell), so paths with spaces or
        # shell metacharacters are safe.
        with open(tree_file, 'w') as tree_handle:
            result = subprocess.run(export_command, stdout=tree_handle,
                                    stderr=subprocess.PIPE, text=True)

        # Keep any diagnostics from gdi visible in the notebook output.
        if result.stderr:
            print(result.stderr, end='')
        # Fail loudly instead of leaving an empty or partial tree file behind.
        if result.returncode != 0:
            raise RuntimeError(f'Exporting tree for {dataset_name} failed with exit code '
                               f'{result.returncode}: {" ".join(export_command)}')
else:
    print(f'build_tree={build_tree} and single_index={single_index} so no trees to export')

build_tree=True and single_index=False so no trees to export
