# 1. Parameters

In [1]:
# Placeholder default; the "# Parameters" cell that follows reassigns simulation_dir.
simulation_dir = 'simulations/unset'

In [2]:
# Parameters
# NOTE(review): this cell overrides the placeholder simulation_dir set in the previous cell;
# it looks like an injected (e.g. papermill) parameters cell — confirm before editing by hand.
# NOTE(review): read_coverage, mincov and the module-level iterations are not read by any
# later cell visible in this notebook.
read_coverage = 40
mincov = 10
simulation_dir = "simulations/cov-40"
iterations = 3


In [3]:
from pathlib import Path

# Root location of the simulation case selected by the parameter cells.
simulation_dir_path = Path(simulation_dir)

# The final path component is used as the case label in the timing tables.
case_name = simulation_dir_path.name

# Index locations later handed to create_timing_df.
index_reads_path = simulation_dir_path.joinpath('index-reads')
index_assemblies_path = simulation_dir_path.joinpath('index-assemblies')

# Destination files for the timing tables written at the end of each test section.
output_api_reads_path = simulation_dir_path.joinpath('query-reads-api.tsv')
output_api_assemblies_path = simulation_dir_path.joinpath('query-assemblies-api.tsv')

# 2. Test query command-line

# 3. Test query API

In [4]:
from typing import List
import pandas as pd
import genomics_data_index.api as gdi

def create_timing_df_single_case(timings: List[float], name: str, kind: str,
                                number_features: int) -> pd.DataFrame:
    """Build a long-format timing table for one benchmarked operation.

    Each entry of ``timings`` becomes one row. The constant ``name``, ``kind``
    and ``number_features`` values are repeated on every row, and rows are
    numbered starting at 1 in the 'Iteration' column.

    Args:
        timings: Measured times, one per repeat.
        name: Label stored in the 'Name' column.
        kind: Label of the timed operation, stored in the 'Kind' column.
        number_features: Value stored in the 'Number features' column.

    Returns:
        A DataFrame with columns 'Name', 'Kind', 'Number features',
        'Iteration' and 'Time', holding ``len(timings)`` rows.
    """
    n_rows = len(timings)

    def repeated(value):
        # Constant column: the same value on every row.
        return [value] * n_rows

    columns = {
        'Name': repeated(name),
        'Kind': repeated(kind),
        'Number features': repeated(number_features),
        'Iteration': range(1, n_rows + 1),
        'Time': timings,
    }
    return pd.DataFrame(columns)

def create_timing_df(name: str, index_path: Path) -> pd.DataFrame:
    """Benchmark a fixed set of query operations against one index.

    Connects to the index at ``index_path``, builds a samples query
    (``reference_name='reference'``, ``universe='mutations'``) and times five
    operations on it with ``%timeit -r 10 -o``: ``features_summary``, ``hasa``,
    ``isa``, ``isin`` with ``kind='distance'`` and ``isin`` with ``kind='mrca'``.

    NOTE: the body uses the IPython ``%timeit`` line magic, so this function
    can only be defined and run inside an IPython/Jupyter session. Do not put
    trailing comments on the ``%timeit`` lines; they would become part of the
    timed statement.

    Args:
        name: Label written to the 'Name' column of every row.
        index_path: Location passed to ``gdi.GenomicsDataIndex.connect``.

    Returns:
        The per-operation tables built by ``create_timing_df_single_case``,
        concatenated in the order listed above. ``pd.concat`` is called without
        ``ignore_index``, so each operation's rows keep their own 0-based index.
    """
    db = gdi.GenomicsDataIndex.connect(index_path)
    q = db.samples_query(reference_name='reference', universe='mutations')
    # Length of the features summary; recorded on every timing row.
    number_features = len(q.features_summary())
    
    # Time features_summary(); -o returns the result object whose .timings are kept.
    result = %timeit -r 10 -o q.features_summary()
    fs_df = create_timing_df_single_case(result.timings, name=name, kind='features_summary', number_features=number_features)
    
    # Time hasa() for one hard-coded feature id.
    # NOTE(review): presumably this id exists in the simulated data — confirm when reusing.
    result = %timeit -r 10 -o q.hasa("chrom0:8983:A:C")
    hasa_df = create_timing_df_single_case(result.timings, name=name, kind='hasa', number_features=number_features)
    
    # Time isa() for one hard-coded sample name.
    result = %timeit -r 10 -o q.isa("SH11-001")
    isa_df = create_timing_df_single_case(result.timings, name=name, kind='isa', number_features=number_features)
    
    # Time isin() with kind='distance' (distance=20, units='substitutions') around one sample.
    result = %timeit -r 10 -o q.isin("SH11-001", kind='distance', distance=20, units='substitutions')
    isin_distance_df = create_timing_df_single_case(result.timings, name=name, kind='isin distance', number_features=number_features)
    
    # Time isin() with kind='mrca' for a pair of hard-coded sample names.
    result = %timeit -r 10 -o q.isin(["SH11-001", "SH10-001"], kind='mrca')
    isin_mrca_df = create_timing_df_single_case(result.timings, name=name, kind='isin mrca', number_features=number_features)

    return pd.concat([fs_df, hasa_df, isa_df, isin_distance_df, isin_mrca_df])

## 3.1. Test reads

In [5]:
# Time the query API against the reads index, then preview the first five rows.
reads_df = create_timing_df(name=f'{case_name} (reads)', index_path=index_reads_path)
reads_df.head(5)

72.7 ms ± 1.45 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


1.06 ms ± 9.55 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


397 µs ± 2.06 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


2.12 ms ± 21.8 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


673 µs ± 2.75 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


Unnamed: 0,Name,Kind,Number features,Iteration,Time
0,cov-40 (reads),features_summary,751,1,0.072091
1,cov-40 (reads),features_summary,751,2,0.071351
2,cov-40 (reads),features_summary,751,3,0.071599
3,cov-40 (reads),features_summary,751,4,0.072342
4,cov-40 (reads),features_summary,751,5,0.072885


In [6]:
# Save the reads timings as a tab-separated file, without the DataFrame index column.
reads_df.to_csv(output_api_reads_path, sep='\t', index=False)

## 3.2. Test assemblies

In [7]:
# Time the query API against the assemblies index, then preview the first five rows.
assemblies_df = create_timing_df(name=f'{case_name} (assemblies)', index_path=index_assemblies_path)
assemblies_df.head(5)

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f45be871730> but it is already set


72.7 ms ± 526 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)


945 µs ± 9.94 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


398 µs ± 573 ns per loop (mean ± std. dev. of 10 runs, 1000 loops each)


2.35 ms ± 13.3 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


635 µs ± 2.85 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


Unnamed: 0,Name,Kind,Number features,Iteration,Time
0,cov-40 (assemblies),features_summary,765,1,0.07292
1,cov-40 (assemblies),features_summary,765,2,0.074043
2,cov-40 (assemblies),features_summary,765,3,0.072308
3,cov-40 (assemblies),features_summary,765,4,0.07224
4,cov-40 (assemblies),features_summary,765,5,0.072368


In [8]:
# Save the assemblies timings as a tab-separated file, without the DataFrame index column.
assemblies_df.to_csv(output_api_assemblies_path, sep='\t', index=False)