# 1. Parameters

In [1]:
from pathlib import Path

# Defaults: tunable parameters for this benchmark run.
cases_dir = 'cases/full'        # working directory for all files written by this notebook
metadata_file = 'metadata.tsv'  # tab-separated sample metadata table
iterations = 3                  # number of benchmark iterations
mincov = 10                     # minimum read coverage handed to the benchmarker
ncores = 32                     # number of cores handed to the benchmarker
sample_batch_size = 10          # sample batch size handed to the benchmarker
build_tree = True               # whether trees are built and later exported

# Reference genome (GenBank, gzipped) for each dataset name, keyed by dataset name.
_reference_dir = Path('data') / 'reference'
reference_files = {
    '0810PADBR-1':
        _reference_dir / 'campylobacter' / 'GCA_001879185.2_ASM187918v2_genomic.gbk.gz',
    '1405WAEXK-1':
        _reference_dir / 'ecoli' / 'GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz',
    '1408MLGX6-3WGS':
        _reference_dir / 'listeria' / 'GCF_001047715.2_ASM104771v2_genomic.gbk.gz',
    '1203NYJAP-1 - Tuna Scrape Outbreak':
        _reference_dir / 'salmonella' / 'GCF_000439415.1_ASM43941v1_genomic.gbk.gz',
}

In [2]:
from pathlib import Path
from shutil import rmtree
from os import makedirs
import importlib.machinery
import importlib.util
import sys

# Load the project-local `gdi_benchmark` helper from ../../lib without touching sys.path.
# The `imp` module used previously is deprecated and was removed in Python 3.12; it also
# handed back an open file object that was never closed. importlib avoids both problems.
_gdi_benchmark_spec = importlib.machinery.PathFinder.find_spec('gdi_benchmark', ['../../lib'])
if _gdi_benchmark_spec is None:
    # Same exception type imp.find_module raised when the module was missing.
    raise ImportError("No module named 'gdi_benchmark' found in '../../lib'")
gdi_benchmark = importlib.util.module_from_spec(_gdi_benchmark_spec)
# Register before executing so imports of `gdi_benchmark` made during loading resolve.
sys.modules['gdi_benchmark'] = gdi_benchmark
_gdi_benchmark_spec.loader.exec_module(gdi_benchmark)

# Locations used throughout the notebook, all rooted in the cases directory.
cases_dir_path = Path(cases_dir)
index_path = cases_dir_path.joinpath('index')
benchmark_path = cases_dir_path.joinpath('index-info.tsv')
case_name = cases_dir_path.name

# Always start from an empty cases directory: remove a previous run, then recreate it.
if cases_dir_path.exists():
    rmtree(cases_dir_path)
makedirs(cases_dir_path)

# One tree file per dataset; spaces in the dataset name become underscores in the file name.
output_trees = {
    dataset: cases_dir_path / (dataset + '_tree.tre').replace(' ', '_')
    for dataset in reference_files
}
output_trees

{'0810PADBR-1': PosixPath('cases/full/0810PADBR-1_tree.tre'),
 '1405WAEXK-1': PosixPath('cases/full/1405WAEXK-1_tree.tre'),
 '1408MLGX6-3WGS': PosixPath('cases/full/1408MLGX6-3WGS_tree.tre'),
 '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_tree.tre')}

# 2. Create subset inputs

In [3]:
import pandas as pd
from pathlib import Path
from typing import Dict

# Load the tab-separated sample metadata table; the code below reads its
# `dataSetName` and `strain` columns to build one input file per dataset.
metadata_df = pd.read_csv(metadata_file, sep='\t')

def write_subset_input(metadata_df: pd.DataFrame, dataset_name: str) -> Path:
    """Write the input-files table for a single dataset and return its path.

    Rows of ``metadata_df`` whose ``dataSetName`` equals ``dataset_name`` are
    selected, and a tab-separated file with the columns ``Sample``,
    ``Assemblies``, ``Reads1`` and ``Reads2`` is written into the notebook-level
    ``cases_dir_path`` directory.

    Args:
        metadata_df: Metadata table with at least the columns ``dataSetName``
            and ``strain``. It is not modified.
        dataset_name: Dataset to select. Spaces are replaced by underscores in
            the output file name.

    Returns:
        Path of the written ``<dataset_name>_input-files-case.tsv`` file.
    """
    output_path = cases_dir_path / f'{dataset_name}_input-files-case.tsv'.replace(' ', '_')
    fastq_dir = Path('data') / 'fastq'

    def reads_path(sample: str, mate: int) -> str:
        # Absolute path to the gzipped FASTQ for the given mate (1 or 2) of a sample.
        return str((fastq_dir / (sample + f'_{mate}.fastq.gz')).absolute())

    in_dataset = metadata_df['dataSetName'] == dataset_name
    samples = metadata_df.loc[in_dataset, 'strain']

    # Build the output table column by column, in the order it is written out.
    subset_df = pd.DataFrame({'Sample': samples})
    subset_df['Assemblies'] = pd.NA  # reads-only input: no assemblies are provided
    subset_df['Reads1'] = samples.apply(lambda sample: reads_path(sample, 1))
    subset_df['Reads2'] = samples.apply(lambda sample: reads_path(sample, 2))

    subset_df.to_csv(output_path, sep='\t', index=False)

    print(f'Wrote dataset={dataset_name} consisting of {len(subset_df)}/{len(metadata_df)} samples to {output_path}')

    return output_path

# Write one input-files table per dataset and remember where each one was written.
cases_inputs = dict(
    (dataset, write_subset_input(metadata_df, dataset_name=dataset))
    for dataset in reference_files
)
cases_inputs

Wrote dataset=0810PADBR-1 consisting of 22/85 samples to cases/full/0810PADBR-1_input-files-case.tsv
Wrote dataset=1405WAEXK-1 consisting of 9/85 samples to cases/full/1405WAEXK-1_input-files-case.tsv
Wrote dataset=1408MLGX6-3WGS consisting of 31/85 samples to cases/full/1408MLGX6-3WGS_input-files-case.tsv
Wrote dataset=1203NYJAP-1 - Tuna Scrape Outbreak consisting of 23/85 samples to cases/full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv


{'0810PADBR-1': PosixPath('cases/full/0810PADBR-1_input-files-case.tsv'),
 '1405WAEXK-1': PosixPath('cases/full/1405WAEXK-1_input-files-case.tsv'),
 '1408MLGX6-3WGS': PosixPath('cases/full/1408MLGX6-3WGS_input-files-case.tsv'),
 '1203NYJAP-1 - Tuna Scrape Outbreak': PosixPath('cases/full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv')}

# 2. Index genomes

In [4]:
# Print the version of the installed `gdi` command-line tool (shown in the cell output).
!gdi --version

gdi, version 0.5.0


## 2.1. Index reads

In [5]:
import time

# Time the whole benchmark. perf_counter() is monotonic, so the elapsed time cannot be
# distorted by system clock adjustments during this multi-hour run (time.time() can be).
start = time.perf_counter()

# Benchmark analysis, indexing and (optionally) tree building for every dataset,
# each against its own reference genome and all within the same index directory.
benchmarker = gdi_benchmark.IndexBenchmarkerMultiple(
    index_path=index_path,
    input_files_files=cases_inputs,
    reference_files=reference_files,
    mincov=mincov,
    build_tree=build_tree,
    ncores=ncores,
    sample_batch_size=sample_batch_size,
)
benchmark_df = benchmarker.benchmark(iterations=iterations)

end = time.perf_counter()
print(f'***Finished benchmarking, took {(end - start)/60:0.2f} minutes***')

Creating new index: [gdi init cases/full/index]


Creating a new index took 3.32 seconds



Iteration 1 of index/analysis of 22 samples for reference=data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: []
Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/0810PADBR-1_input-files-case.tsv]


Analysis took 16.15 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644355456.9301004/gdi-input.fofn]


Indexing took 0.88 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 1.75 minutes


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 1 of index/analysis of 9 samples for reference=data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644355456.9301004']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/1405WAEXK-1_input-files-case.tsv]


Analysis took 12.26 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644356589.9788718/gdi-input.fofn]


Indexing took 3.61 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 1.61 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af2231df0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 1 of index/analysis of 31 samples for reference=data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644356589.9788718']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 12.39 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644357647.0070922/gdi-input.fofn]


Indexing took 1.95 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 2.11 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af1df11f0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 1 of index/analysis of 23 samples for reference=data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644357647.0070922']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 9.05 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644358638.9349027/gdi-input.fofn]


Indexing took 4.91 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 2.04 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af2120a00> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Removing any existing indexes cases/full/index
Creating new index: [gdi init cases/full/index]


Creating a new index took 3.40 seconds



Iteration 2 of index/analysis of 22 samples for reference=data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644358638.9349027']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/0810PADBR-1_input-files-case.tsv]


Analysis took 15.32 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644359606.104131/gdi-input.fofn]


Indexing took 0.86 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 1.78 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af24eb130> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 9 samples for reference=data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644359606.104131']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/1405WAEXK-1_input-files-case.tsv]


Analysis took 11.98 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644360688.4343693/gdi-input.fofn]


Indexing took 3.55 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 1.95 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af24fbcd0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 31 samples for reference=data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644360688.4343693']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 12.49 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644361741.3132155/gdi-input.fofn]


Indexing took 1.98 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 2.09 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af24e1130> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 2 of index/analysis of 23 samples for reference=data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644361741.3132155']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 9.37 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644362740.5378377/gdi-input.fofn]


Indexing took 4.95 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 2.16 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af1df12b0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


Removing any existing indexes cases/full/index
Creating new index: [gdi init cases/full/index]


Creating a new index took 3.42 seconds



Iteration 3 of index/analysis of 22 samples for reference=data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644362740.5378377']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/0810PADBR-1_input-files-case.tsv]


Analysis took 15.33 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644363736.9374013/gdi-input.fofn]


Indexing took 0.92 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCA_001879185.2_ASM187918v2_genomic]


Building tree took 1.34 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af277da60> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 9 samples for reference=data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644363736.9374013']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/1405WAEXK-1_input-files-case.tsv]


Analysis took 12.19 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/ecoli/GCF_000703365.1_Ec2011C-3609_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644364797.6535573/gdi-input.fofn]


Indexing took 3.53 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000703365.1_Ec2011C-3609_genomic]


Building tree took 1.86 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af2120220> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 31 samples for reference=data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644364797.6535573']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/1408MLGX6-3WGS_input-files-case.tsv]


Analysis took 12.19 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/listeria/GCF_001047715.2_ASM104771v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644365856.8611856/gdi-input.fofn]


Indexing took 1.93 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_001047715.2_ASM104771v2_genomic]


Building tree took 2.42 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af2397eb0> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)



Iteration 3 of index/analysis of 23 samples for reference=data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz with 32 cores
Removing any extra snakemake directories: ['snakemake-assemblies.1644365856.8611856']


Analysis running: [gdi --project-dir cases/full/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/full/1203NYJAP-1_-_Tuna_Scrape_Outbreak_input-files-case.tsv]


Analysis took 9.43 minutes
Index running: [gdi --project-dir cases/full/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/salmonella/GCF_000439415.1_ASM43941v1_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1644366855.186532/gdi-input.fofn]


Indexing took 4.93 minutes
Building tree: [gdi --project-dir cases/full/index --ncores 32 rebuild tree --align-type full --extra-params '--fast -m GTR+F+R4' GCF_000439415.1_ASM43941v1_genomic]


Building tree took 1.76 minutes


Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f3af2397c40> but it is already set


A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


***Finished benchmarking, took 206.18 minutes***


In [6]:
# Display the benchmark results table returned by benchmarker.benchmark().
benchmark_df

Unnamed: 0,Name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),Analysis memory (max/process),Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,0810PADBR-1,1,22,45117,1102,32,1634890,969.08,31638370000.0,3428127000.0,12011500000.0,52.91,12353740000.0,740229100.0,46665728.0,105.17,544391200.0,275349504.0,1127.16,31638370000.0
0,1405WAEXK-1,1,9,45117,1102,32,5412686,735.64,9107980000.0,1683624000.0,6563504000.0,216.23,9673462000.0,3800670000.0,134737920.0,96.33,817635300.0,459063296.0,1048.2,9673462000.0
0,1408MLGX6-3WGS,1,31,45117,1102,32,2939733,743.46,28593830000.0,1928933000.0,10815540000.0,116.57,16805350000.0,986611700.0,174526464.0,126.51,993718300.0,660197376.0,986.54,28593830000.0
0,1203NYJAP-1 - Tuna Scrape Outbreak,1,23,45117,1102,32,4808805,542.71,20647380000.0,1370657000.0,9024676000.0,294.53,46270370000.0,3002397000.0,252043264.0,122.26,1333158000.0,869711872.0,959.5,46270370000.0
0,0810PADBR-1,2,22,45117,1102,32,1634890,918.73,30784110000.0,3428184000.0,12011490000.0,51.36,13775990000.0,744910800.0,46403584.0,106.84,544092200.0,275890176.0,1076.93,30784110000.0
0,1405WAEXK-1,2,9,45117,1102,32,5412686,718.66,8670171000.0,1683505000.0,6563508000.0,212.72,8914026000.0,3803709000.0,134504448.0,116.89,816930800.0,459284480.0,1048.27,8914026000.0
0,1408MLGX6-3WGS,2,31,45117,1102,32,2939733,749.41,28070240000.0,1928933000.0,10815500000.0,118.72,16806610000.0,1037353000.0,173465600.0,125.41,992878600.0,660254720.0,993.54,28070240000.0
0,1203NYJAP-1 - Tuna Scrape Outbreak,2,23,45117,1102,32,4808805,562.04,19194090000.0,1370464000.0,9024688000.0,296.85,37711820000.0,2726552000.0,249270272.0,129.44,1334055000.0,869912576.0,988.33,37711820000.0
0,0810PADBR-1,3,22,45117,1102,32,1634890,919.8,30768410000.0,3428151000.0,12011480000.0,55.0,15078370000.0,740212700.0,46637056.0,80.24,543809500.0,275492864.0,1055.04,30768410000.0
0,1405WAEXK-1,3,9,45117,1102,32,5412686,731.42,8699195000.0,1683472000.0,6563488000.0,211.65,9736053000.0,3804434000.0,134230016.0,111.76,818577400.0,459481088.0,1054.83,9736053000.0


In [7]:
# Save the benchmark results as a tab-separated file (without the row index) at benchmark_path.
benchmark_df.to_csv(benchmark_path, sep='\t', index=False)

# 3. Export trees

In [8]:
if build_tree:
    import subprocess

    # Export one tree per dataset, driven by the notebook parameters rather than
    # hard-coded shell lines, so that changing `cases_dir` or `reference_files`
    # is picked up here as well. subprocess is used instead of `!` shell magics
    # so Python variables can be passed safely and a failed export raises.
    for dataset_name, reference_file in reference_files.items():
        # NOTE(review): assumes the index names each reference after its file name
        # without the two trailing extensions (e.g. `.gbk.gz`), which matches the
        # names used by the previous hard-coded commands - confirm for new references.
        reference_name = Path(Path(reference_file).stem).stem
        with open(output_trees[dataset_name], 'w') as tree_file:
            subprocess.run(
                ['gdi', '--project-dir', str(index_path), 'export', 'tree', reference_name],
                stdout=tree_file,
                check=True,
            )
else:
    print(f'build_tree={build_tree} so no trees to export')