# 1. Parameters

In [1]:
# Placeholder default; the parameters cell that follows reassigns `simulation_dir`.
simulation_dir = "simulations/unset"

In [2]:
# Parameters
# Run-specific values for this execution.
# `simulation_dir` selects the directory that all index/output paths derive from.
simulation_dir = "simulations/cov-50"
# The remaining values are not read by the cells visible in this notebook.
iterations = 3
mincov = 10
read_coverage = 50


In [3]:
from pathlib import Path

# Every input and output location is resolved relative to the simulation directory.
simulation_dir_path = Path(simulation_dir)

# The final path component is used to label this case in the timing tables.
case_name = simulation_dir_path.name

# Locations of the two indexes that get queried.
index_reads_path = simulation_dir_path.joinpath('index-reads')
index_assemblies_path = simulation_dir_path.joinpath('index-assemblies')

# Tab-separated files that receive the API timing results.
output_api_reads_path = simulation_dir_path.joinpath('query-reads-api.tsv')
output_api_assemblies_path = simulation_dir_path.joinpath('query-assemblies-api.tsv')

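# 2. Test query command-line

*No command-line queries are timed in this notebook; only the Python API is benchmarked (section 3).*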

# 3. Test query API

In [4]:
from typing import List
import pandas as pd
import genomics_data_index.api as gdi

def create_timing_df_single_case(timings: List[float], name: str, kind: str,
                                number_features: int) -> pd.DataFrame:
    """Build a long-format table of timings for a single benchmark case.

    Args:
        timings: Measured times, one entry per run.
        name: Label stored in the 'Name' column of every row.
        kind: Label stored in the 'Kind' column of every row.
        number_features: Value stored in the 'Number features' column of every row.

    Returns:
        A DataFrame with one row per entry of ``timings`` and the columns
        'Name', 'Kind', 'Number features', 'Iteration' (1-based) and 'Time'.
    """
    run_count = len(timings)

    def repeated(value):
        # Constant column: the same value on every row.
        return [value] * run_count

    columns = {
        'Name': repeated(name),
        'Kind': repeated(kind),
        'Number features': repeated(number_features),
        'Iteration': range(1, run_count + 1),
        'Time': timings,
    }
    return pd.DataFrame(columns)

def create_timing_df(name: str, index_path: Path) -> pd.DataFrame:
    """Time a fixed set of query operations against one index.

    Connects to the index at ``index_path``, builds a samples query with
    ``reference_name='reference'`` and ``universe='mutations'``, and runs
    IPython's ``%timeit`` (``-r 10``: 10 repeats; ``-o``: return the result
    object) on five operations: ``features_summary``, ``hasa``, ``isa``,
    ``isin`` with ``kind='distance'`` and ``isin`` with ``kind='mrca'``.

    Must be run under IPython/Jupyter: the ``%timeit`` line magics used here
    are not plain Python syntax.

    Args:
        name: Label written to the 'Name' column of every row.
        index_path: Location passed to ``gdi.GenomicsDataIndex.connect``.

    Returns:
        The five per-operation timing tables concatenated in the order listed
        above. Each table keeps its own row index (no ``ignore_index``), so
        index values repeat across operations.
    """
    db = gdi.GenomicsDataIndex.connect(index_path)
    q = db.samples_query(reference_name='reference', universe='mutations')
    # Size of the features summary; recorded on every timing row.
    # This call happens once before any timing starts.
    number_features = len(q.features_summary())
    
    # Time q.features_summary().
    # NOTE(review): the repeat count is hard-coded (`-r 10`) in every %timeit
    # below; the notebook-level `iterations` parameter is not used here.
    result = %timeit -r 10 -o q.features_summary()
    fs_df = create_timing_df_single_case(result.timings, name=name, kind='features_summary', number_features=number_features)
    
    # Time q.hasa() for one hard-coded feature identifier.
    # NOTE(review): presumably this feature exists in every index being
    # benchmarked — verify against the simulated data.
    result = %timeit -r 10 -o q.hasa("chrom0:8983:A:C")
    hasa_df = create_timing_df_single_case(result.timings, name=name, kind='hasa', number_features=number_features)
    
    # Time q.isa() for one hard-coded sample name.
    # NOTE(review): presumably "SH11-001" is a sample present in the index — TODO confirm.
    result = %timeit -r 10 -o q.isa("SH11-001")
    isa_df = create_timing_df_single_case(result.timings, name=name, kind='isa', number_features=number_features)
    
    # Time q.isin() with kind='distance' (distance=20, units='substitutions')
    # around the same hard-coded sample.
    result = %timeit -r 10 -o q.isin("SH11-001", kind='distance', distance=20, units='substitutions')
    isin_distance_df = create_timing_df_single_case(result.timings, name=name, kind='isin distance', number_features=number_features)
    
    # Time q.isin() with kind='mrca' for two hard-coded sample names.
    result = %timeit -r 10 -o q.isin(["SH11-001", "SH10-001"], kind='mrca')
    isin_mrca_df = create_timing_df_single_case(result.timings, name=name, kind='isin mrca', number_features=number_features)

    # Stack the per-operation tables into one long table.
    return pd.concat([fs_df, hasa_df, isa_df, isin_distance_df, isin_mrca_df])

## 3.1. Test reads

In [5]:
# Time the queries against the index at `index_reads_path`, then preview the first rows.
reads_df = create_timing_df(
    index_path=index_reads_path,
    name=f'{case_name} (reads)',
)
reads_df.head(n=5)

52 ms ± 223 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)


974 µs ± 2.97 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


385 µs ± 1.2 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


2 ms ± 6.65 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


679 µs ± 10.4 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


Unnamed: 0,Name,Kind,Number features,Iteration,Time
0,cov-50 (reads),features_summary,750,1,0.051866
1,cov-50 (reads),features_summary,750,2,0.05197
2,cov-50 (reads),features_summary,750,3,0.051855
3,cov-50 (reads),features_summary,750,4,0.051882
4,cov-50 (reads),features_summary,750,5,0.052103


In [6]:
# Save the reads timings as tab-separated text, leaving out the row index.
reads_df.to_csv(output_api_reads_path, index=False, sep='\t')

## 3.2. Test assemblies

In [7]:
# Time the queries against the index at `index_assemblies_path`, then preview the first rows.
assemblies_df = create_timing_df(
    index_path=index_assemblies_path,
    name=f'{case_name} (assemblies)',
)
assemblies_df.head(n=5)

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f4438737610> but it is already set


53.4 ms ± 1.01 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


918 µs ± 1.83 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


405 µs ± 6.56 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


2.31 ms ± 32.3 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


671 µs ± 1.23 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


Unnamed: 0,Name,Kind,Number features,Iteration,Time
0,cov-50 (assemblies),features_summary,765,1,0.053362
1,cov-50 (assemblies),features_summary,765,2,0.05326
2,cov-50 (assemblies),features_summary,765,3,0.053371
3,cov-50 (assemblies),features_summary,765,4,0.055017
4,cov-50 (assemblies),features_summary,765,5,0.052532


In [8]:
# Save the assemblies timings as tab-separated text, leaving out the row index.
assemblies_df.to_csv(output_api_assemblies_path, index=False, sep='\t')