# Measure performance for different methods of querying variant data

First, let's print the exact git commit of the thesis-index code I am using.

In [66]:
# Print the commit hash currently checked out in the thesis-index repository.
!git -C ../../../thesis-index rev-parse HEAD

41a96d85dbbb3b5910206a5fff4455dcb1cb1785


In [6]:
from storage.variant.io.SnippyVariantsReader import SnippyVariantsReader
from pathlib import Path
from os import listdir, path

# Directory that is scanned for per-sample sub-directories.
snippy_dir = Path('..') / 'phylogeny'

# Keep only the entries that are directories; anything else is skipped.
sample_dirs = list(filter(path.isdir, (snippy_dir / name for name in listdir(snippy_dir))))

# Reader built over every sample directory that was found.
variants_reader = SnippyVariantsReader(sample_dirs)

A helper function that wraps another function and measures its runtime.

In [54]:
import time

def get_runtime(func, **args):
    """Call ``func(**args)``, print how long it took, and return the result with the timing.

    Args:
        func: Callable to measure.
        **args: Keyword arguments forwarded unchanged to ``func``.

    Returns:
        A ``(value, runtime)`` tuple: whatever ``func`` returned and the
        elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    value = func(**args)
    runtime = time.perf_counter() - start
    print(f'Runtime: {runtime:0.2f} seconds')
    return value, runtime

# Test variants union

Let's test getting the set of all variants among a list of samples (the union).

In [64]:
# Case 1: a single sample.
case1_samples = ['SH12-001']

# Case 2: ten samples.
case2_samples = [
    'SH14-004',
    'SH13-001',
    'SH14-011',
    'SH14-016',
    'SH09-29',
    'SH12-008',
    'SH14-010',
    'SH14-028',
    'SH10-30',
    'SH12-007',
]

## 1. From VCF files

In [65]:
from typing import List, Set

def get_union_from_vcf(samples: List[str]) -> Set[str]:
    """Return the union of variants of the given samples, read through ``SnippyVariantsReader``.

    Each sample name is resolved to the directory ``snippy_dir / <name>``; the
    reader's variants table is then reduced to ``CHROM:POS:REF:ALT`` strings.

    Args:
        samples: Names of the samples whose variants should be combined.

    Returns:
        The set of distinct ``CHROM:POS:REF:ALT`` strings found in any of the
        requested samples.
    """
    sample_dirs = [snippy_dir / s for s in samples]
    var_df = SnippyVariantsReader(sample_dirs).get_variants_table()

    # One vectorised membership filter instead of re-scanning the table once per sample.
    selected = var_df[var_df['SAMPLE'].isin(samples)]

    # Build the identifiers as a standalone Series so the table returned by the
    # reader is not modified.
    spdi = (
        selected['CHROM'] + ':' + selected['POS'].astype(str)
        + ':' + selected['REF'] + ':' + selected['ALT']
    )
    return set(spdi.tolist())

# Run the VCF benchmark on both sample sets; the second header starts with a blank line.
benchmark_cases = [
    (f'Case 1: Single ({len(case1_samples)}) sample', case1_samples),
    (f'\nCase 2: {len(case2_samples)} samples', case2_samples),
]
for header, case_samples in benchmark_cases:
    print(header)
    union, runtime = get_runtime(get_union_from_vcf, samples=case_samples)
    print(f'Length of union: {len(union)}')

Case 1: Single (1) sample
Runtime: 1.43 seconds
Length of union: 17193

Case 2: 10 samples
Runtime: 13.18 seconds
Length of union: 36920


## 2. From relational database

To test this out, I first have to load all the VCF files into the database, which takes a while. Since I'm only testing a specific query (finding the union of all variants in some samples), I'm not counting this loading time in the query timings below; the load is timed separately, just for reference.

In [69]:
# Time the one-off load of the ../phylogeny Snippy results into the database.
# NOTE(review): the connection URL below embeds the test/test credentials in plain text.
start = time.time()
!variants --database-connection 'mysql+pymysql://test:test@localhost/thesis?charset=utf8mb4' \
    --seqrepo-dir seq_repo --verbose \
    load-snippy --reference-file ../input/S_HeidelbergSL476.fasta.gz ../phylogeny
end = time.time()
# Report the elapsed time in minutes.
print(f'Took {(end-start)/60:0.1f} minutes')

[32m2021-02-23 15:32:07[0m [1;30mINFO[0m [34mstorage.main,53:[0m Connecting to database mysql+pymysql://test:test@localhost/thesis?charset=utf8mb4
[32m2021-02-23 15:32:07[0m [1;30mINFO[0m [34mstorage.main,56:[0m Use seqrepo directory seq_repo
Loading ../phylogeny
Loaded variants from [../phylogeny] into database
Took 4.9 minutes


In [102]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

def create_session():
    """Create an engine for the local ``thesis`` MySQL database and return a new session bound to it.

    A new engine is built on every call, so every caller gets an independent
    session.

    Returns:
        A new SQLAlchemy session bound to the freshly created engine.
    """
    db_engine = create_engine(
        'mysql+pymysql://test:test@localhost/thesis?charset=utf8mb4',
        echo=False,
    )
    return sessionmaker(bind=db_engine)()

### 2.1. No checking for proper reference/sequence name

In [103]:
from storage.variant.model import Sample

def get_union_from_relational_db(samples: List[str]) -> Set[str]:
    """Return the union of variant ids of the given samples, loaded via the ORM.

    No restriction on the reference genome or sequence is applied: every
    variant attached to a matching sample is included.

    Args:
        samples: Names of the samples to look up in the database.

    Returns:
        The set of ``id`` values of all variants linked to any of the samples.
    """
    session = create_session()
    try:
        sample_objs = session.query(Sample).filter(Sample.name.in_(samples)).all()

        vars_union = set()
        for sample in sample_objs:
            # update() grows the set in place instead of copying it for every sample.
            vars_union.update(v.id for v in sample.variants)

        return vars_union
    finally:
        # Always release the session (and its connection), even if the query fails.
        session.close()

# Run the ORM benchmark on both sample sets; the second header starts with a blank line.
benchmark_cases = [
    (f'Case 1: Single ({len(case1_samples)}) sample', case1_samples),
    (f'\nCase 2: {len(case2_samples)} samples', case2_samples),
]
for header, case_samples in benchmark_cases:
    print(header)
    union, runtime = get_runtime(get_union_from_relational_db, samples=case_samples)
    print(f'Length of union: {len(union)}')

Case 1: Single (1) sample
Runtime: 0.63 seconds
Length of union: 17193

Case 2: 10 samples
Runtime: 3.57 seconds
Length of union: 36920


### 2.2. Checking for proper reference/sequence name from database

In [105]:
from storage.variant.model import VariationAllele, ReferenceSequence, Reference

def get_union_from_relational_db_via_query(
    samples: List[str],
    reference_name: str = 'S_HeidelbergSL476',
) -> Set[str]:
    """Return the union of variant ids of the given samples, filtered by reference in SQL.

    A single query joins samples to their variants, reference sequences and
    reference, so the database does both the sample and the reference filtering.

    Args:
        samples: Names of the samples to look up in the database.
        reference_name: Name of the reference the variants must belong to.
            Defaults to the reference used throughout this notebook.

    Returns:
        The set of ``id`` values of all matching variants.
    """
    session = create_session()
    try:
        variants = session.query(VariationAllele) \
            .select_from(Sample) \
            .join(Sample.variants) \
            .join(ReferenceSequence) \
            .join(Reference) \
            .filter(Reference.name == reference_name) \
            .filter(Sample.name.in_(samples)) \
            .all()

        # The set removes variants that are shared by several samples.
        return {v.id for v in variants}
    finally:
        # Always release the session (and its connection), even if the query fails.
        session.close()

# Run the SQL-join benchmark on both sample sets; the second header starts with a blank line.
benchmark_cases = [
    (f'Case 1: Single ({len(case1_samples)}) sample', case1_samples),
    (f'\nCase 2: {len(case2_samples)} samples', case2_samples),
]
for header, case_samples in benchmark_cases:
    print(header)
    union, runtime = get_runtime(get_union_from_relational_db_via_query, samples=case_samples)
    print(f'Length of union: {len(union)}')

Case 1: Single (1) sample
Runtime: 2.34 seconds
Length of union: 17193

Case 2: 10 samples
Runtime: 3.32 seconds
Length of union: 36920


### 2.3. Checking for proper reference/sequence name in code

In [118]:
from storage.variant.model import Sample, Reference, ReferenceSequence

def get_union_from_relational_db_via_code(
    samples: List[str],
    reference_name: str = 'S_HeidelbergSL476',
) -> Set[str]:
    """Return the union of variant ids of the given samples, filtered by reference in Python.

    Samples and the reference's sequences are fetched with two separate
    queries; variants are then kept only if their ``sequence_id`` belongs to
    one of those sequences.

    Args:
        samples: Names of the samples to look up in the database.
        reference_name: Name of the reference the variants must belong to.
            Defaults to the reference used throughout this notebook.

    Returns:
        The set of ``id`` values of all matching variants.
    """
    session = create_session()
    try:
        sample_objs = session.query(Sample).filter(Sample.name.in_(samples)).all()
        ref_sequences = session.query(ReferenceSequence) \
            .join(Reference) \
            .filter(Reference.name == reference_name) \
            .all()

        # Set membership keeps the per-variant check cheap.
        ref_sequence_ids = {r.id for r in ref_sequences}

        vars_union = set()
        for sample in sample_objs:
            # update() grows the set in place instead of copying it for every sample.
            vars_union.update(
                v.id for v in sample.variants if v.sequence_id in ref_sequence_ids
            )

        return vars_union
    finally:
        # Always release the session (and its connection), even if a query fails.
        session.close()

# Run the filter-in-Python benchmark on both sample sets; the second header starts with a blank line.
benchmark_cases = [
    (f'Case 1: Single ({len(case1_samples)}) sample', case1_samples),
    (f'\nCase 2: {len(case2_samples)} samples', case2_samples),
]
for header, case_samples in benchmark_cases:
    print(header)
    union, runtime = get_runtime(get_union_from_relational_db_via_code, samples=case_samples)
    print(f'Length of union: {len(union)}')

Case 1: Single (1) sample
Runtime: 0.52 seconds
Length of union: 17193

Case 2: 10 samples
Runtime: 3.31 seconds
Length of union: 36920
