# 1. Parameters

In [1]:
simulation_dir = 'simulations/unset'
metadata_file = 'input/metadata.tsv.gz'

In [2]:
from pathlib import Path
import imp
fp, pathname, description = imp.find_module('gdi_benchmark', ['../../lib'])
gdi_benchmark = imp.load_module('gdi_benchmark', fp, pathname, description)

simulation_dir_path = Path(simulation_dir)

case_name = str(simulation_dir_path.name)
index_reads_path = simulation_dir_path / 'index-reads'
index_assemblies_path = simulation_dir_path / 'index-assemblies'

output_api_reads_path = simulation_dir_path / 'query-reads-api.tsv'
output_api_assemblies_path = simulation_dir_path / 'query-assemblies-api.tsv'
output_cli_reads_path = simulation_dir_path / 'query-reads-cli.tsv'
output_cli_assemblies_path = simulation_dir_path / 'query-assemblies-cli.tsv'

# 2. Benchmark command-line

In [3]:
import pandas as pd
import genomics_data_index.api as gdi

def benchmark_cli_index(name: str, index_path: Path) -> pd.DataFrame:
    benchmark_commands = {
        'query':         f'gdi --project-dir {index_path} --ncores 1 query mutation "chrom0:8983:A:C"',
        'query summary': f'gdi --project-dir {index_path} --ncores 1 query mutation "chrom0:8983:A:C" --summarize',
        'list samples':  f'gdi --project-dir {index_path} --ncores 1 list samples',
    }
    
    db = gdi.GenomicsDataIndex.connect(index_path)
    number_samples = db.count_samples()
    number_features = db.count_mutations(reference_genome='reference', include_unknown=True)
    iterations = 10
    
    benchmarker = gdi_benchmark.QueryBenchmarkHandler()
    return benchmarker.benchmark_cli(name=name, kind_commands=benchmark_commands, number_samples=number_samples,
                                    number_features=number_features, iterations=iterations)

## 2.1. Benchmark reads

In [4]:
reads_cli_df = benchmark_cli_index(name=f'{case_name} (reads)', index_path=index_reads_path)
reads_cli_df.head(3)

Unnamed: 0,Name,Kind,Iteration,Number samples,Number features,Runtime,Memory (max),Mmemory (max/process)
0,unset (reads),query,1,59,1915,2.74,219983872.0,219983872.0
0,unset (reads),query,2,59,1915,2.75,219795456.0,219795456.0
0,unset (reads),query,3,59,1915,2.73,219656192.0,219656192.0


In [5]:
reads_cli_df.to_csv(output_cli_reads_path, sep='\t', index=False)

## 2.1. Benchmark assemblies

In [6]:
assemblies_cli_df = benchmark_cli_index(name=f'{case_name} (reads)', index_path=index_assemblies_path)
assemblies_cli_df.head(3)

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7ff25921d130> but it is already set


Unnamed: 0,Name,Kind,Iteration,Number samples,Number features,Runtime,Memory (max),Mmemory (max/process)
0,unset (reads),query,1,59,816,2.64,219615232.0,219615232.0
0,unset (reads),query,2,59,816,2.79,220221440.0,220221440.0
0,unset (reads),query,3,59,816,2.67,219774976.0,219774976.0


In [7]:
assemblies_cli_df.to_csv(output_cli_assemblies_path, sep='\t', index=False)

# 3. Test query API

## 3.1. Load (example) metadata

The simulated data is based off of real sample names and a real tree. So I can load up real metadata and attach it to a query (though the mutations and reference genome are all simulated).

In [8]:
import pandas as pd

metadata_df = pd.read_csv(metadata_file, sep='\t').rename({'Sample Name': 'Sample Name Orig'}, axis='columns')
metadata_df.head(2)

Unnamed: 0,Sample Name Orig,Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,BioSampleModel,Bytes,Center Name,...,PFGE_SecondaryEnzyme_pattern,Phagetype,Platform,ReleaseDate,Serovar,SRA Study,STRAIN,sub_species,Host_disease,Host
0,SH08-001,SRR3028792,WGS,429,354123684,PRJNA305824,SAMN04334683,Pathogen.cl,197484364,MCGILL UNIVERSITY,...,SHBNI.0001,19,ILLUMINA,2015-12-19T00:00:00Z,Heidelberg,SRP067504,SH08-001,enterica,Salmonella gastroenteritis,Homo sapiens
1,SH09-29,SRR3028793,WGS,422,519366460,PRJNA305824,SAMN04334684,Pathogen.cl,288691068,MCGILL UNIVERSITY,...,SHBNI.0001,26,ILLUMINA,2015-12-19T00:00:00Z,Heidelberg,SRP067504,SH09-29,enterica,Salmonella gastroenteritis,Homo sapiens


## 3.2. Define benchmark cases

In [9]:
from typing import List
import genomics_data_index.api as gdi

def benchmark_api_index(name: str, index_path: Path) -> pd.DataFrame:
    db = gdi.GenomicsDataIndex.connect(index_path)
    q_no_join = db.samples_query(reference_name='reference', universe='mutations')
    q_join = db.samples_query(reference_name='reference', universe='mutations').join(metadata_df, sample_names_column='Sample Name Orig')

    q = q_join.hasa('chrom0:1211:C:T')
    r = q_join.hasa("chrom0:8983:A:C")
    
    number_samples = db.count_samples()
    number_features = db.count_mutations(reference_genome='reference', include_unknown=True)
    repeat = 10

    benchmark_cases = {
        'db.samples_query':      lambda: db.samples_query(reference_name='reference', universe='mutations'),
        'q.join':                lambda: q_no_join.join(metadata_df, sample_names_column='Sample Name Orig'),
        'q.features_summary':    lambda: q_join.features_summary(),
        'q.features_comparison': lambda: q_join.features_comparison(sample_categories='outbreak_number', categories_kind='dataframe', kind='mutations', unit='proportion'),
        'q.hasa':                lambda: q_join.hasa("chrom0:8983:A:C"),
        'q.isa':                 lambda: q_join.isa("SH11-001"),
        'q.isin (distance)':     lambda: q_join.isin("SH11-001", kind='distance', distance=20, units='substitutions'),
        'q.isin (mrca)':         lambda: q_join.isin(["SH11-001", "SH10-001"], kind='mrca'),
        'q AND r':               lambda: q & r,
        'q.toframe':             lambda: q_join.toframe(),
        'q.summary':             lambda: q_join.summary(),
    }

    benchmarker = gdi_benchmark.QueryBenchmarkHandler()
    return benchmarker.benchmark_api(name=name, kind_functions=benchmark_cases,
                                       number_samples=number_samples, number_features=number_features,
                                       repeat=repeat)

## 3.3. Benchmark reads index

In [10]:
reads_df = benchmark_api_index(name=f'{case_name} (reads)', index_path=index_reads_path)
reads_df.head(5)

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7ff2591b1490> but it is already set


Unnamed: 0,Name,Kind,Number samples,Number features,Number executions,Iteration,Time
0,unset (reads),db.samples_query,59,1915,200,1,0.001661
1,unset (reads),db.samples_query,59,1915,200,2,0.001647
2,unset (reads),db.samples_query,59,1915,200,3,0.001657
3,unset (reads),db.samples_query,59,1915,200,4,0.001691
4,unset (reads),db.samples_query,59,1915,200,5,0.001639


In [11]:
reads_df.to_csv(output_api_reads_path, sep='\t', index=False)

## 3.4. Benchmark assemblies index

In [12]:
assemblies_df = benchmark_api_index(name=f'{case_name} (assemblies)', index_path=index_assemblies_path)
assemblies_df.head(5)

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7ff25921d130> but it is already set


Unnamed: 0,Name,Kind,Number samples,Number features,Number executions,Iteration,Time
0,unset (assemblies),db.samples_query,59,816,200,1,0.001626
1,unset (assemblies),db.samples_query,59,816,200,2,0.001631
2,unset (assemblies),db.samples_query,59,816,200,3,0.001631
3,unset (assemblies),db.samples_query,59,816,200,4,0.001628
4,unset (assemblies),db.samples_query,59,816,200,5,0.001628


In [13]:
assemblies_df.to_csv(output_api_assemblies_path, sep='\t', index=False)