# 1. Parameters

In [1]:
simulation_dir = 'simulations/unset'
metadata_file = 'input/metadata.tsv.gz'

In [2]:
# Parameters
read_coverage = 5
mincov = 10
simulation_dir = "simulations/cov-5"
iterations = 3
sub_alpha = 0.2


In [3]:
from pathlib import Path
import imp
fp, pathname, description = imp.find_module('gdi_benchmark', ['../../lib'])
gdi_benchmark = imp.load_module('gdi_benchmark', fp, pathname, description)

simulation_dir_path = Path(simulation_dir)

case_name = str(simulation_dir_path.name)
index_reads_path = simulation_dir_path / 'index-reads'
index_assemblies_path = simulation_dir_path / 'index-assemblies'

output_api_reads_path = simulation_dir_path / 'query-reads-api.tsv'
output_api_assemblies_path = simulation_dir_path / 'query-assemblies-api.tsv'
output_cli_reads_path = simulation_dir_path / 'query-reads-cli.tsv'
output_cli_assemblies_path = simulation_dir_path / 'query-assemblies-cli.tsv'

# 2. Benchmark command-line

In [4]:
import pandas as pd
import genomics_data_index.api as gdi

def benchmark_cli_index(name: str, index_path: Path) -> pd.DataFrame:
    db = gdi.GenomicsDataIndex.connect(index_path)
    mutations_df = db.mutations_summary(reference_name='reference').sort_values('Count', ascending=False)
    top_mutation = mutations_df.iloc[0].name
    
    if 'chrom' not in top_mutation:
        raise Exception(f'Does not exist a single mutation for index {index_path}')
    else:
        print(f'top_mutation={top_mutation}')
        
    benchmark_commands = {
        'query hasa':               f'gdi --project-dir {index_path} --ncores 1 query "hasa:{top_mutation}"',
        'query isa':                f'gdi --project-dir {index_path} --ncores 1 query "isa:SH13-007"',
        'query --summary':          f'gdi --project-dir {index_path} --ncores 1 query --summary',
        'query --features-summary': f'gdi --project-dir {index_path} --ncores 1 query --features-summary mutations',
        'query isin':               f'gdi --project-dir {index_path} --ncores 1 query --reference-name reference "isin_100_substitutions:SH13-007"',
        'list samples':             f'gdi --project-dir {index_path} --ncores 1 list samples',
    }
        
    number_samples = db.count_samples()
    number_features_no_unknown = db.count_mutations(reference_genome='reference', include_unknown=False)
    number_features_all = db.count_mutations(reference_genome='reference', include_unknown=True)
    iterations = 10
    
    benchmarker = gdi_benchmark.QueryBenchmarkHandler()
    return benchmarker.benchmark_cli(name=name, kind_commands=benchmark_commands,
                                     number_samples=number_samples,
                                     number_features_no_unknown=number_features_no_unknown,
                                     number_features_all=number_features_all,
                                     iterations=iterations)

## 2.1. Benchmark reads

In [5]:
reads_cli_df = benchmark_cli_index(name=f'{case_name} (reads)', index_path=index_reads_path)
reads_cli_df.head(3)

top_mutation=chrom0:9206:C:T


Unnamed: 0,Name,Kind,Iteration,Number samples,Number features (no unknown),Number features (all),Runtime,Memory (max),Mmemory (max/process)
0,cov-5 (reads),query hasa,1,59,287,19986,2.68,221753344.0,221753344.0
0,cov-5 (reads),query hasa,2,59,287,19986,2.75,222740480.0,222740480.0
0,cov-5 (reads),query hasa,3,59,287,19986,2.77,221843456.0,221843456.0


In [6]:
reads_cli_df.to_csv(output_cli_reads_path, sep='\t', index=False)

## 2.1. Benchmark assemblies

In [7]:
assemblies_cli_df = benchmark_cli_index(name=f'{case_name} (reads)', index_path=index_assemblies_path)
assemblies_cli_df.head(3)

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f653e2f5ca0> but it is already set


top_mutation=chrom0:738:G:A


Unnamed: 0,Name,Kind,Iteration,Number samples,Number features (no unknown),Number features (all),Runtime,Memory (max),Mmemory (max/process)
0,cov-5 (reads),query hasa,1,59,1350,1350,2.75,220192768.0,220192768.0
0,cov-5 (reads),query hasa,2,59,1350,1350,2.73,221011968.0,221011968.0
0,cov-5 (reads),query hasa,3,59,1350,1350,2.68,220786688.0,220786688.0


In [8]:
assemblies_cli_df.to_csv(output_cli_assemblies_path, sep='\t', index=False)

# 3. Test query API

## 3.1. Load (example) metadata

The simulated data is based off of real sample names and a real tree. So I can load up real metadata and attach it to a query (though the mutations and reference genome are all simulated).

In [9]:
import pandas as pd

metadata_df = pd.read_csv(metadata_file, sep='\t').rename({'Sample Name': 'Sample Name Orig'}, axis='columns')
metadata_df.head(2)

Unnamed: 0,Sample Name Orig,Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,BioSampleModel,Bytes,Center Name,...,PFGE_SecondaryEnzyme_pattern,Phagetype,Platform,ReleaseDate,Serovar,SRA Study,STRAIN,sub_species,Host_disease,Host
0,SH08-001,SRR3028792,WGS,429,354123684,PRJNA305824,SAMN04334683,Pathogen.cl,197484364,MCGILL UNIVERSITY,...,SHBNI.0001,19,ILLUMINA,2015-12-19T00:00:00Z,Heidelberg,SRP067504,SH08-001,enterica,Salmonella gastroenteritis,Homo sapiens
1,SH09-29,SRR3028793,WGS,422,519366460,PRJNA305824,SAMN04334684,Pathogen.cl,288691068,MCGILL UNIVERSITY,...,SHBNI.0001,26,ILLUMINA,2015-12-19T00:00:00Z,Heidelberg,SRP067504,SH09-29,enterica,Salmonella gastroenteritis,Homo sapiens


## 3.2. Define benchmark cases

In [10]:
from typing import List
import genomics_data_index.api as gdi

def benchmark_api_index(name: str, index_path: Path) -> pd.DataFrame:
    db = gdi.GenomicsDataIndex.connect(index_path)
    q_no_join = db.samples_query(reference_name='reference', universe='mutations')
    q_join = db.samples_query(reference_name='reference', universe='mutations').join(metadata_df, sample_names_column='Sample Name Orig')
    
    mutations_df = db.mutations_summary(reference_name='reference').sort_values('Count', ascending=False)
    top_mutations = mutations_df.iloc[[0,1]].index.tolist()
    
    if len(top_mutations) != 2:
        raise Exception(f'Does not exist two mutations for index {index_path}')
    else:
        mutation1 = top_mutations[0]
        mutation2 = top_mutations[1]
        print(f'mutation1={mutation1}, mutation2={mutation2}')

    q = q_join.hasa(mutation1)
    r = q_join.hasa(mutation2)
    
    number_samples = db.count_samples()
    number_features_no_unknown = db.count_mutations(reference_genome='reference', include_unknown=False)
    number_features_all = db.count_mutations(reference_genome='reference', include_unknown=True)
    repeat = 10
    
    benchmark_cases = {
        'db.samples_query':      lambda: db.samples_query(reference_name='reference', universe='mutations'),
        'q.join':                lambda: q_no_join.join(metadata_df, sample_names_column='Sample Name Orig'),
        'q.features_summary':    lambda: q_join.features_summary(),
        'q.features_comparison': lambda: q_join.features_comparison(sample_categories='outbreak_number', categories_kind='dataframe', kind='mutations', unit='proportion'),
        'q.hasa':                lambda: q_join.hasa(mutation1),
        'q.isa':                 lambda: q_join.isa("SH13-007"),
        'q AND r':               lambda: q & r,
        'q.toframe':             lambda: q_join.toframe(),
        'q.summary':             lambda: q_join.summary(),
        'q.isin (distance)':     lambda: q_join.isin("SH13-007", kind='distance', distance=100, units='substitutions'),
        'q.isin (mrca)':         lambda: q_join.isin(["SH13-007", "SH12-001"], kind='mrca'),
    }

    benchmarker = gdi_benchmark.QueryBenchmarkHandler()
    return benchmarker.benchmark_api(name=name, kind_functions=benchmark_cases,
                                     number_samples=number_samples,
                                     number_features_no_unknown=number_features_no_unknown,
                                     number_features_all=number_features_all,
                                     repeat=repeat)

## 3.3. Benchmark reads index

In [11]:
reads_df = benchmark_api_index(name=f'{case_name} (reads)', index_path=index_reads_path)
reads_df.head(5)

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f6535f77ca0> but it is already set


mutation1=chrom0:9206:C:T, mutation2=chrom0:667:G:A


Unnamed: 0,Name,Kind,Number samples,Number features (no unknown),Number features (all),Number executions,Iteration,Time
0,cov-5 (reads),db.samples_query,59,287,19986,200,1,0.001774
1,cov-5 (reads),db.samples_query,59,287,19986,200,2,0.001773
2,cov-5 (reads),db.samples_query,59,287,19986,200,3,0.001774
3,cov-5 (reads),db.samples_query,59,287,19986,200,4,0.001768
4,cov-5 (reads),db.samples_query,59,287,19986,200,5,0.001765


In [12]:
reads_df.to_csv(output_api_reads_path, sep='\t', index=False)

## 3.4. Benchmark assemblies index

In [13]:
assemblies_df = benchmark_api_index(name=f'{case_name} (assemblies)', index_path=index_assemblies_path)
assemblies_df.head(5)

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f6535edb670> but it is already set


mutation1=chrom0:738:G:A, mutation2=chrom1:6298:T:G


Unnamed: 0,Name,Kind,Number samples,Number features (no unknown),Number features (all),Number executions,Iteration,Time
0,cov-5 (assemblies),db.samples_query,59,1350,1350,200,1,0.001687
1,cov-5 (assemblies),db.samples_query,59,1350,1350,200,2,0.001708
2,cov-5 (assemblies),db.samples_query,59,1350,1350,200,3,0.001652
3,cov-5 (assemblies),db.samples_query,59,1350,1350,200,4,0.001655
4,cov-5 (assemblies),db.samples_query,59,1350,1350,200,5,0.001647


In [14]:
assemblies_df.to_csv(output_api_assemblies_path, sep='\t', index=False)