# 1. Parameters

In [1]:
# Default notebook parameters. The following cell assigns cases_dir and build_tree again,
# so the values here only apply when that cell is absent.
# Directory of the benchmark case: the index is read from <cases_dir>/index and the
# result TSV files are written into it.
cases_dir = 'cases/unset'
# Tab-separated sample metadata that is joined to the API queries on its 'strain' column.
metadata_file = 'input/metadata-subsample-pangolin.tsv'
# When True, the tree-based "isin" queries are added to the CLI and API benchmarks.
build_tree = False

In [2]:
# Parameters
# These assignments override the defaults from the first cell
# (presumably injected by a notebook runner such as papermill — TODO confirm).
# NOTE(review): `iterations` and `number_samples` are not passed to the
# benchmark_cli_index / benchmark_api_index calls in this notebook, so they do not
# affect the benchmark runs shown here.
cases_dir = "cases/case-20000"
iterations = 3
number_samples = 20000
build_tree = False


In [3]:
from pathlib import Path
import importlib.machinery
import importlib.util
import sys

# Load the project-local `gdi_benchmark` helper from ../../lib (resolved relative to
# the current working directory). The `imp` module used previously is deprecated and
# was removed in Python 3.12, and imp.find_module() handed back an open file object
# that was never closed; importlib manages the file itself.
_gdi_benchmark_spec = importlib.machinery.PathFinder.find_spec('gdi_benchmark', ['../../lib'])
if _gdi_benchmark_spec is None or _gdi_benchmark_spec.loader is None:
    raise ImportError("Could not find module 'gdi_benchmark' in '../../lib'")
gdi_benchmark = importlib.util.module_from_spec(_gdi_benchmark_spec)
# Register before executing so imports inside the module can resolve it by name,
# matching what imp.load_module() did.
sys.modules['gdi_benchmark'] = gdi_benchmark
_gdi_benchmark_spec.loader.exec_module(gdi_benchmark)

cases_dir_path = Path(cases_dir)

# The last path component of the case directory doubles as the benchmark name.
case_name = str(cases_dir_path.name)
index_path = cases_dir_path / 'index'

# Result files for the API and command-line benchmarks.
output_api_path = cases_dir_path / 'query-api.tsv'
output_cli_path = cases_dir_path / 'query-cli.tsv'

# 2. Benchmark command-line

In [4]:
import pandas as pd
import genomics_data_index.api as gdi

def benchmark_cli_index(name: str, index_path: Path, build_tree: bool, iterations: int = 10) -> pd.DataFrame:
    """Benchmark a fixed set of `gdi` command-line queries against one index.

    Args:
        name: Label forwarded to the benchmark handler as `name`.
        index_path: Project directory of the index. Used for `--project-dir` in
            every command and to connect to the index for sample/mutation counts.
        build_tree: When True, the tree-based `isin` query is benchmarked as well.
        iterations: Forwarded to `benchmark_cli` as `iterations`. Defaults to 10,
            the value that used to be hard-coded here.

    Returns:
        Whatever `QueryBenchmarkHandler.benchmark_cli` returns for these commands.

    Raises:
        ValueError: If `iterations` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError(f'iterations must be >= 1, got {iterations}')

    # Shared prefix of every benchmarked command.
    # NOTE(review): index_path is interpolated unquoted; a path containing whitespace
    # would presumably break the command — confirm how benchmark_cli executes it.
    gdi_cmd = f'gdi --project-dir {index_path} --ncores 1'

    benchmark_commands = {
        'query hasa':               f'{gdi_cmd} query "hasa:hgvs_gn:NC_045512.2:S:p.D614G"',
        'query isa':                f'{gdi_cmd} query "isa:Switzerland/100108/2020"',
        'query --summary':          f'{gdi_cmd} query "hasa:hgvs_gn:NC_045512.2:S:p.D614G" --summary',
        'query --features-summary': f'{gdi_cmd} query --features-summary mutations',
        'list samples':             f'{gdi_cmd} list samples',
    }

    if build_tree:
        benchmark_commands['query isin'] = (
            f'{gdi_cmd} query --reference-name NC_045512 "isin_5_substitutions:Switzerland/100108/2020"'
        )

    # Index size figures are passed to the handler alongside the commands.
    db = gdi.GenomicsDataIndex.connect(index_path)
    number_samples = db.count_samples()
    number_features_no_unknown = db.count_mutations(reference_genome='NC_045512', include_unknown=False)
    number_features_all = db.count_mutations(reference_genome='NC_045512', include_unknown=True)

    benchmarker = gdi_benchmark.QueryBenchmarkHandler()
    return benchmarker.benchmark_cli(
        name=name,
        kind_commands=benchmark_commands,
        number_samples=number_samples,
        number_features_no_unknown=number_features_no_unknown,
        number_features_all=number_features_all,
        iterations=iterations,
    )

In [5]:
# Run the command-line benchmark for this case and preview the first three result rows.
cli_df = benchmark_cli_index(name=case_name, index_path=index_path, build_tree=build_tree)
cli_df.head(3)

Unnamed: 0,Name,Kind,Iteration,Number samples,Number features (no unknown),Number features (all),Runtime,Memory (max),Mmemory (max/process)
0,case-20000,query hasa,1,20000,15764,45667,3.63,258801664.0,258801664.0
0,case-20000,query hasa,2,20000,15764,45667,3.55,259846144.0,259846144.0
0,case-20000,query hasa,3,20000,15764,45667,3.56,259112960.0,259112960.0


In [6]:
# Write the command-line benchmark results as a tab-separated file without the index column.
cli_df.to_csv(output_cli_path, sep='\t', index=False)

# 3. Test query API

## 3.1. Load (example) metadata

The simulated data is based on real sample names and a real tree, so I can load real metadata and attach it to a query (though the mutations and reference genome are all simulated).

In [7]:
import pandas as pd

# Read the tab-separated metadata table; benchmark_api_index joins it to queries
# on the 'strain' column. Preview the first two rows.
metadata_df = pd.read_csv(metadata_file, sep='\t')
metadata_df.head(2)

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,sra_accession,date,region,country,division,location,...,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,version,pangolin_version,pangoLEARN_version,pango_version,status,note
0,OU420663,ncov,?,OU420663,ERR5939958,2020,Europe,United Kingdom,England,,...,1.0,Alpha (B.1.1.7-like),1.0,0.0,PLEARN-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,scorpio call: Alt alleles 23; Ref alleles 0; A...
1,USA/NY-CUIMC-NP-3606/2020,ncov,?,MZ702266,,2020-12-02,North America,USA,New York,New York City,...,0.95097,,,,PLEARN-v1.2.66,3.1.11,2021-08-24,v1.2.66,passed_qc,


## 3.2. Define benchmark cases

In [8]:
from typing import List
import genomics_data_index.api as gdi

def benchmark_api_index(name: str, index_path: Path, build_tree: bool,
                        repeat: int = 10, metadata: pd.DataFrame = None) -> pd.DataFrame:
    """Benchmark a fixed set of Python API queries against one index.

    Args:
        name: Label forwarded to the benchmark handler as `name`.
        index_path: Location of the index to connect to.
        build_tree: When True, the tree-based `isin` queries are benchmarked as well.
        repeat: Forwarded to `benchmark_api` as `repeat`. Defaults to 10, the value
            that used to be hard-coded here.
        metadata: Data frame joined to the samples query on its 'strain' column.
            When None, the notebook-level `metadata_df` is used, as before.

    Returns:
        Whatever `QueryBenchmarkHandler.benchmark_api` returns for these cases.

    Raises:
        ValueError: If `repeat` is smaller than 1.
    """
    if repeat < 1:
        raise ValueError(f'repeat must be >= 1, got {repeat}')

    # Fall back to the notebook global only when no frame is passed explicitly,
    # so existing calls keep working while the dependency becomes visible.
    join_df = metadata_df if metadata is None else metadata

    db = gdi.GenomicsDataIndex.connect(index_path)
    q_no_join = db.samples_query(reference_name='NC_045512', universe='mutations')
    q_join = db.samples_query(reference_name='NC_045512', universe='mutations').join(join_df, sample_names_column='strain')

    # Pre-built operands so that the 'q AND r' case only times the `&` operation.
    q = q_join.hasa('hgvs_gn:NC_045512.2:S:p.D614G')
    r = q_join.hasa('hgvs_gn:NC_045512.2:N:p.R203K')

    # Index size figures are passed to the handler alongside the cases.
    number_samples = db.count_samples()
    number_features_no_unknown = db.count_mutations(reference_genome='NC_045512', include_unknown=False)
    number_features_all = db.count_mutations(reference_genome='NC_045512', include_unknown=True)

    benchmark_cases = {
        'db.samples_query':      lambda: db.samples_query(reference_name='NC_045512', universe='mutations'),
        'q.join':                lambda: q_no_join.join(join_df, sample_names_column='strain'),
        'q.features_summary':    lambda: q_join.features_summary(),
        'q.features_comparison': lambda: q_join.features_comparison(sample_categories='lineage', categories_kind='dataframe', kind='mutations', unit='proportion'),
        'q.hasa':                lambda: q_join.hasa("hgvs_gn:NC_045512.2:N:p.R203K"),
        'q.isa':                 lambda: q_join.isa("Switzerland/100112/2020"),
        'q AND r':               lambda: q & r,
        'q.toframe':             lambda: q_join.toframe(),
        'q.summary':             lambda: q_join.summary(),
    }

    if build_tree:
        benchmark_cases['q.isin (distance)'] = lambda: q_join.isin("Switzerland/100108/2020", kind='distance', distance=5, units='substitutions')
        benchmark_cases['q.isin (mrca)'] = lambda: q_join.isin(["Switzerland/100108/2020", "FR993751"], kind='mrca')

    benchmarker = gdi_benchmark.QueryBenchmarkHandler()
    return benchmarker.benchmark_api(
        name=name,
        kind_functions=benchmark_cases,
        number_samples=number_samples,
        number_features_no_unknown=number_features_no_unknown,
        number_features_all=number_features_all,
        repeat=repeat,
    )

## 3.3. Benchmark reads index

In [9]:
# Run the API benchmark for this case and preview the first five result rows.
api_df = benchmark_api_index(name=case_name, index_path=index_path, build_tree=build_tree)
api_df.head(5)

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7fdff31ddaf0> but it is already set


Unnamed: 0,Name,Kind,Number samples,Number features (no unknown),Number features (all),Number executions,Iteration,Time
0,case-20000,db.samples_query,20000,15764,45667,10,1,0.023996
1,case-20000,db.samples_query,20000,15764,45667,10,2,0.02374
2,case-20000,db.samples_query,20000,15764,45667,10,3,0.023454
3,case-20000,db.samples_query,20000,15764,45667,10,4,0.023452
4,case-20000,db.samples_query,20000,15764,45667,10,5,0.02345


In [10]:
# Write the API benchmark results as a tab-separated file without the index column.
api_df.to_csv(output_api_path, sep='\t', index=False)