# 1. Load simulated variants (VCF)

In [2]:
import vcf
import pandas as pd
import time

# Location of the VCF file containing the simulated variants.
simulated_variants_file = 'simulated_data/haplotypes.vcf.gz'

# Each VCF record becomes one row (built from the record's attribute dict);
# only the columns used later in the notebook are kept.
columns_of_interest = ['CHROM', 'POS', 'REF', 'samples']
reader = vcf.Reader(filename=simulated_variants_file)
simulated_df = pd.DataFrame(list(map(vars, reader)))
simulated_df = simulated_df[columns_of_interest]
simulated_df.head(5)

Unnamed: 0,CHROM,POS,REF,samples
0,NC_011083,45,T,"[Call(sample=reference, CallData(GT=0, GQ=4414..."
1,NC_011083,82,T,"[Call(sample=reference, CallData(GT=0, GQ=4414..."
2,NC_011083,96,C,"[Call(sample=reference, CallData(GT=0, GQ=4414..."
3,NC_011083,104,A,"[Call(sample=reference, CallData(GT=0, GQ=4414..."
4,NC_011083,123,G,"[Call(sample=reference, CallData(GT=0, GQ=4414..."


## 1.1. Construct sample:variant identifiers

In [3]:
before = time.time()

# Explode the dataframe so that there is one row per (record, sample call) pair.
simulated_df_exploded = simulated_df.explode('samples')
simulated_df_exploded['SAMPLE'] = simulated_df_exploded['samples'].apply(lambda x: x.sample)

# Extract ALT for each sample (the bases of that sample's called genotype).
simulated_df_exploded['ALT'] = simulated_df_exploded['samples'].apply(lambda x: x.gt_bases)

# Only keep mutations, i.e. rows where REF and ALT are different.
# `gt_bases` is None for a genotype that was not called; such rows would pass the
# REF != ALT test and produce a bogus '...:None' identifier, so drop them as well.
# `.copy()` makes the filtered frame independent so that adding the ID column
# below does not write into a slice of the unfiltered frame.
is_called = simulated_df_exploded['ALT'].notna()
is_mutation = simulated_df_exploded['REF'] != simulated_df_exploded['ALT']
simulated_df_exploded = simulated_df_exploded[is_called & is_mutation].copy()

# Create SPDI-like identifier with Sample name for comparison (SAMPLE:CHROM:POS:REF:ALT)
# Add '.1' to CHROM to match actual results (the version on the sequence identifier).
# Built with vectorised string concatenation rather than a row-wise apply; this
# also works when the filtered frame is empty.
simulated_df_exploded['ID'] = (
    simulated_df_exploded['SAMPLE'].astype(str) + ':'
    + simulated_df_exploded['CHROM'].astype(str) + '.1:'
    + simulated_df_exploded['POS'].astype(str) + ':'
    + simulated_df_exploded['REF'].astype(str) + ':'
    + simulated_df_exploded['ALT'].astype(str)
)

simulated_variants = simulated_df_exploded['ID'].reset_index(drop=True)
expected_sample_variants = set(simulated_variants)

after = time.time()

print(f'There are {len(expected_sample_variants)} expected sample/variant pairs')
print(f'Took {(after - before)/60:0.1f} minutes')

There are 1039018 expected sample/variant pairs
Took 0.4 minutes


## 1.2. Load reference genome for use with positive/negative calculations

In [4]:
import gzip
from Bio import SeqIO

reference_file = 'input/S_HeidelbergSL476.fasta.gz'

# Parse every record of the gzipped FASTA; the reference length is taken from
# the first record only.
with gzip.open(reference_file, mode='rt') as f:
    sequences = [record for record in SeqIO.parse(f, 'fasta')]
reference_length = len(sequences[0])

# Distinct sample names found in the exploded variant table, excluding the
# entry named 'reference'.
distinct_samples = simulated_df_exploded['SAMPLE'].dropna().unique()
sample_names = set(distinct_samples).difference({'reference'})
number_samples = len(sample_names)

print(f'Reference length: {reference_length}')
print(f'Number samples: {number_samples}')

Reference length: 4888768
Number samples: 59


# 2. Load detected variants

## 2.1. Load from reads index

In [5]:
from typing import Set
import genomics_data_index.api as gdi

def get_sample_variant_idenifiers(index_dir: str) -> Set[str]:
    """Collect '<sample>:<mutation>' identifiers for every sample in an index.

    Connects to the genomics data index at ``index_dir``, lists the samples of
    its samples query and, for each sample, reads the 'Mutation' column of the
    features summary of ``isa(sample)`` (after resetting the index).

    :param index_dir: Directory of the index to connect to.
    :return: Set of strings of the form ``f"{sample}:{mutation}"``.
    """
    samples_query = gdi.GenomicsDataIndex.connect(index_dir).samples_query()

    identifiers = set()
    for sample_name in samples_query.tolist():
        summary = samples_query.isa(sample_name).features_summary().reset_index()
        # Prefix each mutation with the sample it was found in.
        identifiers |= {f"{sample_name}:{mutation}" for mutation in summary['Mutation']}
    return identifiers

# Collect the sample/variant identifiers from the 'index-reads' index and record
# the wall-clock time the query takes.
before = time.time()
actual_sample_variants_reads = get_sample_variant_idenifiers('index-reads')
after = time.time()

# Report how many identifiers were found and the elapsed time in minutes.
print(f'There are {len(actual_sample_variants_reads)} actual sample/variant pairs (reads)')
print(f'Took {(after - before)/60:0.1f} minutes')

There are 1009208 actual sample/variant pairs (reads)
Took 3.4 minutes


## 2.2. Load from assemblies index

In [None]:
# Collect the sample/variant identifiers from the 'index-assemblies' index and
# record the wall-clock time the query takes.
before = time.time()
actual_sample_variants_assemblies = get_sample_variant_idenifiers('index-assemblies')
after = time.time()

# Report how many identifiers were found and the elapsed time in minutes.
print(f'There are {len(actual_sample_variants_assemblies)} actual sample/variant pairs (assemblies)')
print(f'Took {(after - before)/60:0.1f} minutes')

# 3. Compare expected/actual variants

## 3.1. Compare with reads index

In [7]:
from typing import Optional


def compare_expected_actual(actual_sample_variants: Set[str],
                            expected: Optional[Set[str]] = None) -> pd.DataFrame:
    """Compare detected sample/variant identifiers against the expected ones.

    Prints sensitivity, specificity (as a fixed placeholder) and precision, and
    returns the confusion-matrix counts as a table.

    :param actual_sample_variants: Set of detected 'SAMPLE:...' identifiers.
    :param expected: Set of expected identifiers. When omitted, the notebook-level
                     ``expected_sample_variants`` set is used.
    :return: DataFrame with columns 'Actual', 'Detected Positive' and
             'Detected Negative'. The true-negative cell is ``pd.NA`` because
             true negatives are not calculated.
    """
    if expected is None:
        expected = expected_sample_variants

    true_positives = actual_sample_variants & expected
    false_negatives = expected - actual_sample_variants
    false_positives = actual_sample_variants - expected
    # I cannot get true negatives since I would need to know the total number of negatives (i.e., all possible
    #  variants with respect to the reference genome that were not simulated). This would be a finite, but
    #  very very large number (and I haven't worked out how to calculate it).
    # For example, one negative is Sample:1:A:T, another negative is Sample:1:AG:TT, and so on for the entire
    # length of the genome.

    def _ratio(numerator: int, denominator: int) -> float:
        # An empty expected or actual set gives a zero denominator; report NaN
        # instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else float('nan')

    sensitivity = _ratio(len(true_positives), len(true_positives) + len(false_negatives))
    precision = _ratio(len(true_positives), len(true_positives) + len(false_positives))

    # Since true_negatives are a very large number, then for all intents and purposes
    # specificity will be very very close to 1. So instead of trying to calculate it
    # I just call it ~1, but it's also not very useful because of this.
    specificity = '~1 (not calculated)'

    print(f'Sensitivity: {sensitivity:0.3f}')
    print(f'Specificity: {specificity}')
    print(f'Precision: {precision:0.3f}')

    comparison_df = pd.DataFrame([
        ['Actual Positive', len(true_positives), len(false_negatives)],
        ['Actual Negative', len(false_positives), pd.NA]
    ], columns=['Actual', 'Detected Positive', 'Detected Negative'])

    return comparison_df
    
# Compare the identifiers detected from the reads index against the expected set.
# The call prints the summary metrics; the trailing expression displays the table.
comparison_reads_df = compare_expected_actual(actual_sample_variants_reads)
comparison_reads_df

Sensitivity: 0.954
Specificity: ~1 (not calculated)
Precision: 0.982


Unnamed: 0,Actual,Detected Positive,Detected Negative
0,Actual Positive,991322,47696.0
1,Actual Negative,17886,


## 3.2. Compare with assemblies index

In [None]:
# Compare the identifiers detected from the assemblies index against the expected set.
# The call prints the summary metrics; the trailing expression displays the table.
comparison_assemblies_df = compare_expected_actual(actual_sample_variants_assemblies)
comparison_assemblies_df

# 4. Other: spot-check false positives against the simulated variants

In [43]:
# Simulated variants of sample SH12-008 at positions 2145770..2145790 (both ends inclusive).
simulated_df_exploded.loc[
    lambda df: df['SAMPLE'].eq('SH12-008') & df['POS'].between(2145770, 2145790)
]

Unnamed: 0,CHROM,POS,REF,samples,SAMPLE,ALT,ID
35299,NC_011083,2145779,A,"Call(sample=SH12-008, CallData(GT=1, GQ=441453))",SH12-008,G,SH12-008:NC_011083.1:2145779:A:G
35300,NC_011083,2145780,T,"Call(sample=SH12-008, CallData(GT=1, GQ=441453))",SH12-008,A,SH12-008:NC_011083.1:2145780:T:A


In [44]:
# Simulated variants of sample SH14-001 at positions 4606340..4606360 (both ends inclusive).
simulated_df_exploded.loc[
    lambda df: df['SAMPLE'].eq('SH14-001') & df['POS'].between(4606340, 4606360)
]

Unnamed: 0,CHROM,POS,REF,samples,SAMPLE,ALT,ID
75880,NC_011083,4606350,A,"Call(sample=SH14-001, CallData(GT=1, GQ=441453))",SH14-001,AAGGT,SH14-001:NC_011083.1:4606350:A:AAGGT
75881,NC_011083,4606351,A,"Call(sample=SH14-001, CallData(GT=1, GQ=441453))",SH14-001,G,SH14-001:NC_011083.1:4606351:A:G


In [39]:
# `false_positives` is only a local variable inside compare_expected_actual(), so
# referencing it here raises NameError on a fresh kernel. Recompute it at notebook
# scope with the same set difference the function uses.
# NOTE(review): presumably these are the reads-index false positives (the
# assemblies cells have no execution count) - confirm.
false_positives = actual_sample_variants_reads - expected_sample_variants
false_positives

{'SH12-008:NC_011083.1:2145779:AT:GA',
 'SH14-001:NC_011083.1:4606351:A:AGGTG',
 'SH13-007:NC_011083.1:4840056:GGC:TGG',
 'SH13-005:NC_011083.1:3263005:AA:CC',
 'SH12-008:NC_011083.1:4794547:TAAA:T',
 'SH14-012:NC_011083.1:4230753:TT:AA',
 'SH14-020:NC_011083.1:3061801:CT:GC',
 'SH12-007:NC_011083.1:904942:AC:GT',
 'SH14-013:NC_011083.1:3481404:TCTGG:GCTGA',
 'SH10-014:NC_011083.1:4307830:ATG:TTT',
 'SH14-024:NC_011083.1:2798753:GTA:TTC',
 'SH13-001:NC_011083.1:751405:GT:CA',
 'SH14-019:NC_011083.1:379838:GCCAG:ACCAC',
 'SH12-005:NC_011083.1:4230281:ACGT:TCGC',
 'SH12-012:NC_011083.1:1438748:GG:AT',
 'SH14-006:NC_011083.1:1478776:CG:AT',
 'SH13-007:NC_011083.1:1172199:ATTG:GTTT',
 'SH14-022:NC_011083.1:4495528:AT:TA',
 'SH14-003:NC_011083.1:4848311:CGT:GGC',
 'SH14-024:NC_011083.1:1830673:AT:CA',
 'SH12-009:NC_011083.1:748982:GCGG:TCGC',
 'SH14-014:NC_011083.1:2682359:CTGG:GTGT',
 'SH14-009:NC_011083.1:3766644:T:TGCG',
 'SH10-001:NC_011083.1:4230281:ACGT:TCGC',
 'SH13-004:NC_011083.1:3