# 1. Load simulated variants (VCF)

In [2]:
import vcf
import pandas as pd
import time

simulated_variants_file = 'simulated_data/haplotypes.vcf.gz'

# Only these record attributes are needed for the comparison further down
columns_of_interest = ['CHROM', 'POS', 'REF', 'samples']

# Turn every VCF record into a dict of its attributes (one dataframe row per record),
# then trim the dataframe down to the columns listed above
reader = vcf.Reader(filename=simulated_variants_file)
simulated_df = pd.DataFrame([vars(record) for record in reader])[columns_of_interest]
simulated_df.head(5)

Unnamed: 0,CHROM,POS,REF,samples
0,NC_011083,45,T,"[Call(sample=reference, CallData(GT=0, GQ=4414..."
1,NC_011083,82,T,"[Call(sample=reference, CallData(GT=0, GQ=4414..."
2,NC_011083,96,C,"[Call(sample=reference, CallData(GT=0, GQ=4414..."
3,NC_011083,104,A,"[Call(sample=reference, CallData(GT=0, GQ=4414..."
4,NC_011083,123,G,"[Call(sample=reference, CallData(GT=0, GQ=4414..."


## 1.1. Construct sample:variant identifiers

In [3]:
before = time.time()

# Explode the dataframe so that there is one row per (variant record, sample call)
simulated_df_exploded = simulated_df.explode('samples')
simulated_df_exploded['SAMPLE'] = simulated_df_exploded['samples'].apply(lambda x: x.sample)

# Extract ALT (the called bases) for each sample
simulated_df_exploded['ALT'] = simulated_df_exploded['samples'].apply(lambda x: x.gt_bases)

# Only keep mutations, i.e. calls where REF and ALT are different.
# PyVCF returns None from gt_bases for an uncalled genotype; `REF != None` is True,
# so without the notna() guard such rows would survive the filter and end up as
# bogus "...:REF:None" identifiers in the expected set.
is_mutation = simulated_df_exploded['ALT'].notna() & (
    simulated_df_exploded['REF'] != simulated_df_exploded['ALT'])
# .copy() makes the filtered frame independent, so adding the ID column below
# unambiguously writes to this frame and not to a view of the unfiltered one.
simulated_df_exploded = simulated_df_exploded[is_mutation].copy()

# Create SPDI-like identifier with Sample name for comparison (SAMPLE:CHROM:POS:REF:ALT)
# Add '.1' to CHROM to match actual results (the version appended onto the sequence identifier)
simulated_df_exploded['ID'] = simulated_df_exploded.apply(
    lambda x: f"{x['SAMPLE']}:{x['CHROM']}.1:{x['POS']}:{x['REF']}:{x['ALT']}", axis='columns')

simulated_variants = simulated_df_exploded['ID'].reset_index(drop=True)
expected_sample_variants = set(simulated_variants)

after = time.time()

print(f'There are {len(expected_sample_variants)} expected sample/variant pairs')
print(f'Took {(after - before)/60:0.1f} minutes')

There are 1039018 expected sample/variant pairs
Took 0.4 minutes


## 1.2. Load reference genome for use with positive/negative calculations

In [4]:
import gzip
from Bio import SeqIO

reference_file = 'input/S_HeidelbergSL476.fasta.gz'

# Parse the gzipped FASTA in text mode; the length of the first record is used
# as the reference length
with gzip.open(reference_file, mode='rt') as fasta_handle:
    sequences = list(SeqIO.parse(fasta_handle, 'fasta'))
reference_length = len(sequences[0])

# Distinct sample names present in simulated_df_exploded, leaving out the
# 'reference' entry
sample_names = set(simulated_df_exploded.groupby('SAMPLE').size().index)
sample_names.discard('reference')
number_samples = len(sample_names)

print(f'Reference length: {reference_length}')
print(f'Number samples: {number_samples}')

Reference length: 4888768
Number samples: 59


# 2. Load detected variants

## 2.1. Load from reads index

In [5]:
from typing import Dict, Optional, Set, Tuple

import genomics_data_index.api as gdi

def get_sample_variant_idenifiers(index_dir: str) -> Set[str]:
    """Collect "sample:mutation" identifiers for every sample in an index.

    Connects to the genomics data index at ``index_dir``, walks over each sample
    returned by the samples query, and prefixes every value of the ``Mutation``
    column of that sample's features summary with the sample name.

    Args:
        index_dir: Directory of the index to connect to.

    Returns:
        Set of strings of the form ``"<sample>:<mutation>"``.
    """
    samples_query = gdi.GenomicsDataIndex.connect(index_dir).samples_query()

    identifiers = set()
    for sample_name in samples_query.tolist():
        # reset_index() exposes 'Mutation' as a regular column so it can be iterated
        summary = samples_query.isa(sample_name).features_summary().reset_index()
        identifiers.update(f"{sample_name}:{mutation}" for mutation in summary['Mutation'])
    return identifiers

# Load every sample/variant identifier from the reads index ('index-reads').
# The before/after timestamps bracket only the load so the reported duration
# reflects the index query time.
before = time.time()
actual_sample_variants_reads = get_sample_variant_idenifiers('index-reads')
after = time.time()

# Report how many identifiers were found and the elapsed wall-clock time in minutes
print(f'There are {len(actual_sample_variants_reads)} actual sample/variant pairs (reads)')
print(f'Took {(after - before)/60:0.1f} minutes')

There are 1009208 actual sample/variant pairs (reads)
Took 3.4 minutes


## 2.2. Load from assemblies index

In [13]:
# Load every sample/variant identifier from the assemblies index ('index-assemblies').
# The before/after timestamps bracket only the load so the reported duration
# reflects the index query time.
before = time.time()
actual_sample_variants_assemblies = get_sample_variant_idenifiers('index-assemblies')
after = time.time()

# Report how many identifiers were found and the elapsed wall-clock time in minutes
print(f'There are {len(actual_sample_variants_assemblies)} actual sample/variant pairs (assemblies)')
print(f'Took {(after - before)/60:0.1f} minutes')

Attempting to set global database_path_translator=<genomics_data_index.storage.model.db.DatabasePathTranslator.DatabasePathTranslator object at 0x7f9618722fd0> but it is already set


There are 1031274 actual sample/variant pairs (assemblies)
Took 3.3 minutes


# 3. Compare expected/actual variants

## 3.1. Compare with reads index

In [15]:
def _safe_ratio(numerator: int, denominator: int) -> float:
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def compare_expected_actual(
        actual_sample_variants: Set[str],
        expected_variants: Optional[Set[str]] = None,
) -> Tuple[pd.DataFrame, Dict[str, Set[str]]]:
    """Compare detected sample/variant identifiers against the expected ones.

    Prints sensitivity, specificity (not calculated) and precision, and builds a
    confusion-matrix style table.

    Args:
        actual_sample_variants: Detected identifiers (SAMPLE:CHROM:POS:REF:ALT).
        expected_variants: Expected identifiers in the same format. When omitted,
            the notebook-level ``expected_sample_variants`` set is used.

    Returns:
        A tuple ``(comparison_df, data)`` where ``comparison_df`` holds the counts of
        true positives, false negatives and false positives (true negatives are
        ``pd.NA``), and ``data`` maps ``'tp'``, ``'fp'`` and ``'fn'`` to the
        corresponding sets of identifiers.
    """
    if expected_variants is None:
        # Backward-compatible default: fall back to the set built earlier in the notebook
        expected_variants = expected_sample_variants

    true_positives = actual_sample_variants & expected_variants
    false_negatives = expected_variants - actual_sample_variants
    false_positives = actual_sample_variants - expected_variants
    # I cannot get true negatives since I would need to know the total number of negatives (i.e., all possible
    #  variants with respect to the reference genome that were not simulated). This would be a finite, but
    #  very very large number (and I haven't worked out how to calculate it).
    # For example, one negative is Sample:1:A:T, another negative is Sample:1:AG:TT, and so on for the entire
    # length of the genome.

    # Guard the divisions so that empty expected/actual sets give NaN instead of
    # raising ZeroDivisionError.
    sensitivity = _safe_ratio(len(true_positives), len(true_positives) + len(false_negatives))
    precision = _safe_ratio(len(true_positives), len(true_positives) + len(false_positives))

    # Since true_negatives are a very large number, then for all intents and purposes
    # specificity will be very very close to 1. So instead of trying to calculate it
    # I just call it ~1, but it's also not very useful because of this.
    specificity = '~1 (not calculated)'

    print(f'Sensitivity: {sensitivity:0.4f}')
    print(f'Specificity: {specificity}')
    print(f'Precision: {precision:0.4f}')

    comparison_df = pd.DataFrame([
        ['Actual Positive', len(true_positives), len(false_negatives)],
        ['Actual Negative', len(false_positives), pd.NA]
    ], columns=['Actual', 'Detected Positive', 'Detected Negative'])

    data = {
        'tp': true_positives,
        'fp': false_positives,
        'fn': false_negatives
    }

    return comparison_df, data
    
# Score the variants detected in the reads index against the simulated ones;
# the trailing bare expression renders the comparison table as the cell output
comparison_reads_df, data_reads = compare_expected_actual(
    actual_sample_variants=actual_sample_variants_reads)
comparison_reads_df

Sensitivity: 0.9541
Specificity: ~1 (not calculated)
Precision: 0.9823


Unnamed: 0,Actual,Detected Positive,Detected Negative
0,Actual Positive,991322,47696.0
1,Actual Negative,17886,


## 3.2. Compare with assemblies index

In [16]:
# Score the variants detected in the assemblies index against the simulated ones;
# the trailing bare expression renders the comparison table as the cell output
comparison_assemblies_df, data_assemblies = compare_expected_actual(
    actual_sample_variants=actual_sample_variants_assemblies)
comparison_assemblies_df

Sensitivity: 0.9901
Specificity: ~1 (not calculated)
Precision: 0.9976


Unnamed: 0,Actual,Detected Positive,Detected Negative
0,Actual Positive,1028782,10236.0
1,Actual Negative,2492,
