# Initiate index

In [1]:
from pathlib import Path
from os import mkdir
from shutil import rmtree

# Directory that holds the genomics data index project used by the rest of the notebook
index_path = Path('index')

# Remove any existing index directory so that `gdi init` below starts from scratch
if index_path.exists():
    rmtree(index_path)
    
# IPython shell escape: {index_path} is interpolated from the Python variable above
!gdi init {index_path}

Initializing empty project in [index]


# Index batches of genomes

In [2]:
import glob
import pandas as pd
import cmdbench
import time
from shutil import rmtree
import gzip
from Bio import SeqIO

# --- Benchmark settings ---
ncores = 32
sample_batch_size = 10000

# --- Input locations ---
input_file = Path('input/input-files.tsv')
reference_file = Path("references/NC_045512.gbk.gz")

# The number of rows in the tab-separated input table is used as the sample count
input_df = pd.read_csv(input_file, sep='\t')
number_samples = len(input_df)

# Parse every record of the gzipped GenBank reference; the length of the first
# record is what gets reported as the reference length
with gzip.open(reference_file, mode='rt') as f:
    sequences = list(SeqIO.parse(f, 'genbank'))
reference_length = len(sequences[0])

def get_and_validate_index_input(expected_number_samples):
    """Locate the VCF file-of-filenames produced by the analysis step and check its size.

    Searches the current working directory for exactly one directory whose name
    starts with ``snakemake``, reads ``gdi-input.fofn`` from it as a tab-separated
    table (first line treated as the header, per the ``pandas.read_csv`` default)
    and compares the number of data rows with the expected number of samples.

    Args:
        expected_number_samples: Number of data rows the table must contain.

    Returns:
        Absolute ``Path`` to the ``gdi-input.fofn`` file.

    Raises:
        Exception: If zero or more than one ``snakemake*`` directory is found, or
            if ``gdi-input.fofn`` does not exist inside it.
        AssertionError: If the number of rows differs from ``expected_number_samples``.
    """
    # The glob pattern also matches plain files (e.g. a stray "snakemake.log");
    # only directories can contain the analysis output, so ignore everything else.
    snakemake_dirs = [d for d in glob.glob('snakemake*') if Path(d).is_dir()]
    if len(snakemake_dirs) != 1:
        raise Exception(f'Invalid number of snakemake directories: {snakemake_dirs}')

    vcf_input_file = (Path(snakemake_dirs[0]) / 'gdi-input.fofn').absolute()

    if not vcf_input_file.exists():
        raise Exception(f'VCF input file {vcf_input_file} does not exist')

    vcf_df = pd.read_csv(vcf_input_file, sep='\t')
    actual_number_samples = len(vcf_df)

    # Raise explicitly rather than using `assert`, which is removed under `python -O`.
    # AssertionError is kept so the exception type seen by callers does not change.
    if expected_number_samples != actual_number_samples:
        raise AssertionError(f'expected={expected_number_samples} != actual={actual_number_samples}')

    return vcf_input_file

## Run analysis and indexing

In [3]:
import cmdbench
import time
from shutil import rmtree

def _run_benchmark(command, step_name, plot_file):
    """Benchmark a shell command once, report its wall-clock time and save its resource plot.

    Args:
        command: Full command line handed to ``cmdbench.benchmark_command``.
        step_name: Label used in the printed "<step_name> took ... minutes" message.
        plot_file: File name the resource-usage figure is saved to.

    Returns:
        The benchmark results object returned by ``cmdbench.benchmark_command``.
    """
    print(f"Running: [{command}]")
    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments during a long run.
    start = time.perf_counter()
    results = cmdbench.benchmark_command(command, iterations_num=1)
    elapsed_seconds = time.perf_counter() - start
    print(f'{step_name} took {elapsed_seconds/60:0.2f} minutes')
    fig = results.get_resources_plot(width=15, height=10)
    fig.savefig(plot_file)
    return results


# Remove snakemake directories left by earlier runs; get_and_validate_index_input
# below requires exactly one to exist after the analysis step has run.
snakemake_dirs = glob.glob('snakemake*')
print(f'Removing any extra snakemake directories: {snakemake_dirs}')
for d in snakemake_dirs:
    rmtree(d)

# Step 1: run the analysis without loading its results into the index (--no-load-data)
print(f'\n***ANALYSIS of {number_samples} samples with {ncores} cores***')
analysis_cmd = (
    f"gdi --project-dir {index_path} --ncores {ncores} analysis"
    f" --use-conda --no-load-data --reference-file {reference_file}"
    f" --sample-batch-size {sample_batch_size} --input-structured-genomes-file {input_file}"
)
benchmark_analysis = _run_benchmark(analysis_cmd, step_name='Analysis',
                                    plot_file='analysis-plot.png')

# Step 2: load the VCF files listed by the analysis step into the index
print(f'\n***INDEX of {number_samples} samples with {ncores} cores***')
index_vcf_file = get_and_validate_index_input(expected_number_samples=number_samples)
index_cmd = (
    f"gdi --project-dir {index_path} --ncores {ncores} load vcf"
    f" --reference-file {reference_file} {index_vcf_file}"
)
benchmark_index = _run_benchmark(index_cmd, step_name='Indexing',
                                 plot_file='index-plot.png')

Removing any extra snakemake directories: ['snakemake-assemblies.1628891403.471974']

***ANALYSIS of 20 samples with 32 cores***
Running: [gdi --project-dir index --ncores 32 analysis --use-conda --no-load-data --reference-file references/NC_045512.gbk.gz --sample-batch-size 10000 --input-structured-genomes-file input/input-files.tsv]
Analysis took 0.69 minutes

***INDEX of 20 samples with 32 cores***
Running: [gdi --project-dir index --ncores 32 load vcf --reference-file references/NC_045512.gbk.gz /home/CSCScience.ca/apetkau/workspace/genomics-data-index-evaluation/evaluations/sars-cov-2/snakemake-assemblies.1628916352.0267313/gdi-input.fofn]
Indexing took 0.11 minutes


# Write benchmark results

In [4]:
import subprocess
import genomics_data_index.api as gdi

def benchmark_to_df(number_samples, benchmark_analysis, benchmark_index,
                    index_path: Path, analysis_path: Path,
                    ncores: int, reference_length: int) -> pd.DataFrame:
    """Summarise the analysis and indexing benchmarks as a single-row DataFrame.

    Args:
        number_samples: Number of samples that were processed.
        benchmark_analysis: Benchmark results of the analysis command; only its
            first iteration is read.
        benchmark_index: Benchmark results of the indexing command; only its
            first iteration is read.
        index_path: Project directory of the genomics data index to connect to.
        analysis_path: Directory whose disk usage is measured with ``du``.
        ncores: Number of cores the commands were run with.
        reference_length: Length of the reference genome.

    Returns:
        A one-row DataFrame holding runtime, memory and size figures for both
        steps plus the derived 'Total runtime' and 'Max memory' columns.
    """
    first_analysis_run = benchmark_analysis.get_first_iteration()
    first_index_run = benchmark_index.get_first_iteration()

    # `du -s --block-size=1` prints "<size>\t<path>"; the first field is the size
    du_output = subprocess.check_output(
        ['du', '-s', '--block-size=1', str(analysis_path)])
    analysis_size = du_output.split()[0].decode('utf-8')

    # Total data size of the index as reported by the index itself
    index_db = gdi.GenomicsDataIndex.connect(index_path)
    size_table = index_db.db_size(unit='B').set_index('Type')
    index_size = size_table.loc['Total']['Data Size (B)']

    # One value per column; insertion order determines the column order
    record = {
        'Number samples': int(number_samples),
        'Number cores': ncores,
        'Reference length': reference_length,
        'Analysis runtime': float(first_analysis_run['process']['execution_time']),
        'Analysis memory (max)': float(first_analysis_run['memory']['max']),
        'Analysis memory (max/process)': float(first_analysis_run['memory']['max_perprocess']),
        'Analysis disk uage': float(analysis_size),
        'Index runtime': float(first_index_run['process']['execution_time']),
        'Index memory (max)': float(first_index_run['memory']['max']),
        'Index memory (max/process)': float(first_index_run['memory']['max_perprocess']),
        'Index size': index_size,
    }
    df = pd.DataFrame(data={column: [value] for column, value in record.items()})

    # Derived columns: combined runtime and the larger of the two peak memory values
    df['Total runtime'] = df['Analysis runtime'] + df['Index runtime']
    peak_memory_columns = ['Analysis memory (max)', 'Index memory (max)']
    df['Max memory'] = df[peak_memory_columns].max(axis='columns')

    return df

# Gather the results of both benchmarked steps into one summary row.
# The analysis output directory is the one containing the VCF file-of-filenames.
benchmark_df = benchmark_to_df(
    number_samples=number_samples,
    benchmark_analysis=benchmark_analysis,
    benchmark_index=benchmark_index,
    index_path=index_path,
    analysis_path=index_vcf_file.parent,
    ncores=ncores,
    reference_length=reference_length,
)

# Save the unscaled values as a tab-separated file without the row index
benchmark_df.to_csv('index-benchmark.tsv', sep='\t', index=False)

def convert_sizes(df: pd.DataFrame, size_factor: float = 1024**3,
                  time_factor: float = 1) -> pd.DataFrame:
    """Return a copy of a benchmark table with size and time columns rescaled.

    Args:
        df: Benchmark table containing the memory, disk, index-size and runtime
            columns listed in this function. It is not modified.
        size_factor: Divisor applied to every memory/disk/size column. The default
            of ``1024**3`` converts bytes to GiB (assuming the inputs are in bytes).
        time_factor: Divisor applied to every runtime column. The default of ``1``
            leaves runtimes in their original unit; pass ``60`` to turn seconds
            into minutes.

    Returns:
        A new DataFrame with the listed columns divided by their factor and all
        other columns unchanged.

    Raises:
        KeyError: If any of the expected columns is missing from ``df``.
    """
    size_cols = ['Analysis memory (max)', 'Analysis memory (max/process)',
                 'Analysis disk uage', 'Index memory (max)', 'Index memory (max/process)',
                 'Index size', 'Max memory']
    time_cols = ['Analysis runtime', 'Index runtime', 'Total runtime']

    new_df = df.copy()
    # Each group of columns shares one divisor; always read from the untouched input
    for columns, factor in ((size_cols, size_factor), (time_cols, time_factor)):
        for col in columns:
            new_df[col] = df[col] / factor

    return new_df

# Display a rescaled copy of the benchmark table as the cell output;
# benchmark_df itself and the TSV written from it keep the unscaled values.
convert_sizes(benchmark_df)

**TODO:** Find a pure-Python way to compute directory sizes instead of shelling out to `du`.


Unnamed: 0,Number samples,Number cores,Reference length,Analysis runtime,Analysis memory (max),Analysis memory (max/process),Analysis disk uage,Index runtime,Index memory (max),Index memory (max/process),Index size,Total runtime,Max memory
0,20,32,29903,41.33,1.832592,0.264275,0.615688,6.61,5.15641,0.254543,0.001846,47.94,5.15641
