# 1. Parameters

In [1]:
# Default parameter values for this benchmark case.

# --- Input / output locations ---
# Directory that receives every output of this case; it is deleted and
# recreated by the setup cell, so do not point it at anything precious.
cases_dir = 'cases/unset'
# Tab-separated sample metadata table (filtered on its `dataSetName` column).
metadata_file = 'metadata.tsv'

# --- Reference genome ---
reference_file = 'data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz'
# Name handed to `gdi export tree` when the tree is exported.
reference_name = 'GCA_001879185.2_ASM187918v2_genomic'

# --- Sample selection ---
# Only metadata rows whose `dataSetName` equals this value are benchmarked.
dataset_name = '0810PADBR-1'

# --- Benchmark settings (all forwarded to the index benchmarker) ---
iterations = 1
mincov = 10
ncores = 32
sample_batch_size = 10
build_tree = True

In [2]:
import importlib.machinery
import importlib.util
import sys
from os import makedirs
from pathlib import Path
from shutil import rmtree

# Load the project-local `gdi_benchmark` helper from ../../lib without adding
# that directory to sys.path. The `imp` module previously used here is
# deprecated and was removed in Python 3.12; the importlib spec API is its
# supported replacement and does not leave an open file handle behind.
gdi_benchmark_lib_dir = str(Path('../../lib').resolve())
gdi_benchmark_spec = importlib.machinery.PathFinder.find_spec('gdi_benchmark', [gdi_benchmark_lib_dir])
if gdi_benchmark_spec is None or gdi_benchmark_spec.loader is None:
    raise ImportError(f"Could not find module 'gdi_benchmark' in {gdi_benchmark_lib_dir}")
gdi_benchmark = importlib.util.module_from_spec(gdi_benchmark_spec)
# Register before executing so imports inside the module can resolve it.
sys.modules['gdi_benchmark'] = gdi_benchmark
gdi_benchmark_spec.loader.exec_module(gdi_benchmark)

cases_dir_path = Path(cases_dir)

# Always start from an empty case directory: remove results of a previous
# run (if any), then recreate the directory.
if cases_dir_path.exists():
    rmtree(cases_dir_path)
makedirs(cases_dir_path, exist_ok=True)

reference_file = Path(reference_file)

# The case is named after the last component of the case directory.
case_name = str(cases_dir_path.name)
# reference_name = reference_file.name.split('.')[0]

# Locations of everything this notebook writes inside the case directory.
cases_input = cases_dir_path / 'input-files-case.tsv'
index_path = cases_dir_path / 'index'
benchmark_path = cases_dir_path / 'index-info.tsv'
output_tree = cases_dir_path / 'tree.tre'

# 2. Create subset input

In [3]:
import pandas as pd
from pathlib import Path

# Load the full (tab-separated) sample metadata table.
metadata_df = pd.read_csv(metadata_file, sep='\t')
all_input_total = len(metadata_df)

# Select the rows of the requested dataset first and copy the selection
# afterwards: the new columns below are then added to an independent frame
# (no SettingWithCopyWarning) and the full table is not duplicated needlessly.
input_df = metadata_df.loc[metadata_df['dataSetName'] == dataset_name].copy()
input_df['Sample'] = input_df['strain']
# No assemblies are supplied for these samples; only paired-end reads.
input_df['Assemblies'] = pd.NA

# Absolute directory holding the FASTQ files, computed once rather than per sample.
fastq_dir = (Path('data') / 'fastq').absolute()
input_df['Reads1'] = input_df['Sample'].apply(lambda x: str(fastq_dir / (x + '_1.fastq.gz')))
input_df['Reads2'] = input_df['Sample'].apply(lambda x: str(fastq_dir / (x + '_2.fastq.gz')))
input_df = input_df[['Sample', 'Assemblies', 'Reads1', 'Reads2']]

# Write the structured input-files table to the case directory.
input_df.to_csv(cases_input, sep='\t', index=False)

subset_input_total = len(input_df)

print(f'Wrote dataset={dataset_name} consisting of {subset_input_total}/{all_input_total} samples to {cases_input}')

Wrote dataset=0810PADBR-1 consisting of 22/85 samples to cases/unset/input-files-case.tsv


# 2. Index genomes

In [4]:
# Print the version of the `gdi` command-line tool used for this run.
!gdi --version

gdi, version 0.4.0.dev1


## 2.1. Index reads

In [5]:
# Results handler for this case, labelled with the case name.
results_handler = gdi_benchmark.BenchmarkResultsHandler(name=case_name)

# Configure the benchmarker with the case's index location, input table,
# reference genome and the tunable parameters from the parameters cell.
benchmarker = gdi_benchmark.IndexBenchmarker(
    benchmark_results_handler=results_handler,
    index_path=index_path,
    input_files_file=cases_input,
    reference_file=reference_file,
    mincov=mincov,
    build_tree=build_tree,
    ncores=ncores,
    sample_batch_size=sample_batch_size,
)

# Run the requested number of benchmark iterations and keep the results
# for display and export in the following cells.
benchmark_df = benchmarker.benchmark(iterations=iterations)


Iteration 1 of index/analysis of 22 samples with 32 cores
Removing any extra snakemake directories: []
Creating new index: [gdi init cases/unset/index]
Creating a new index took 2.71 seconds
Analysis running: [gdi --project-dir cases/unset/index --ncores 32 analysis --use-conda --no-load-data --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz --kmer-size 31 --kmer-size 51 --kmer-size 71 --include-kmer --reads-mincov 10 --input-structured-genomes-file cases/unset/input-files-case.tsv]
Analysis took 15.70 minutes
Index running: [gdi --project-dir cases/unset/index --ncores 32 load vcf-kmer --sample-batch-size 10 --reference-file data/reference/campylobacter/GCA_001879185.2_ASM187918v2_genomic.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/reads/snakemake-assemblies.1635881888.3302698/gdi-input.fofn]
Indexing took 0.80 minutes
Building tree: [gdi --project-dir cases/unset/index --ncores 32 rebuild tree --align

A reminder to myself to look for a Python solution for directory sizes (instead of running `du`)


In [6]:
# Display the benchmark results table.
benchmark_df

Unnamed: 0,Name,Iteration,Number samples,Number features (all),Number features (no unknown),Number cores,Reference length,Analysis runtime,Analysis memory (max),Analysis memory (max/process),Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Tree runtime,Tree memory (max),Tree memory (max/process),Total runtime,Max memory
0,unset,1,22,45117,1102,32,1634890,942.0,31709460000.0,3427553000.0,12237030000.0,47.75,13877420000.0,692555776.0,46649344.0,87.71,457441280.0,266956800.0,1077.46,31709460000.0


In [7]:
# Save the benchmark results to benchmark_path as a tab-separated file, omitting the index.
benchmark_df.to_csv(benchmark_path, sep='\t', index=False)

# 3. Export trees

In [8]:
# Export the tree built for the reference genome to `output_tree`
# (only when a tree was actually built during indexing).
if build_tree:
    import subprocess

    # Run gdi with an argument list instead of an interpolated shell line so
    # paths containing spaces or shell metacharacters are passed through
    # intact, and use check=True so a failing export raises
    # CalledProcessError instead of silently reporting success.
    with open(output_tree, 'w') as tree_handle:
        subprocess.run(
            ['gdi', '--project-dir', str(index_path), 'export', 'tree', str(reference_name)],
            stdout=tree_handle,
            check=True,
        )
    print(f'Wrote tree to {output_tree}')
else:
    print(f'build_tree={build_tree} so no tree to export')

Wrote tree to cases/unset/tree.tre
