In [1]:
import numpy
import datetime
import re
import collections
from collections import OrderedDict
import math
import allel
import pandas

In [2]:
# Paths of the temporary "new" and "reference" VCF files.
# NOTE(review): neither name is referenced by the other cells visible here.
temp_new_vcf_str, temp_ref_vcf_str = (
    'temp_metrics_files/temp_new.vcf',
    'temp_metrics_files/temp_ref.vcf',
)


In [3]:
def vcf_to_df(vcf_file):
    """Load selected fields of a VCF file into one pandas DataFrame.

    The CHROM, POS, REF, ALT and GT fields are read with ``allel.read_vcf``
    and laid out side by side, one row per record.

    Parameters
    ----------
    vcf_file : path passed straight to ``allel.read_vcf``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CHROM``, ``POS`` (cast to int64), ``REF``, ``ALT_0``,
        ``ALT_1``, ``GT_0`` and ``GT_1``. Only the first two ALT columns are
        renamed; any further ALT column keeps its integer position as its
        name.

    Notes
    -----
    The genotype is taken at index 0 of the second axis of ``calldata/GT``;
    presumably that is the first sample (scikit-allel lays GT out as
    variants x samples x ploidy) -- TODO confirm for multi-sample files.
    """
    wanted_fields = [
        'variants/CHROM',
        'variants/POS',
        'variants/REF',
        'variants/ALT',
        'calldata/GT',
    ]
    callset = allel.read_vcf(vcf_file, fields=wanted_fields)

    def as_frame(values, names, dtype=None):
        # Wrap one array in a frame and give its positional columns real names.
        return pandas.DataFrame(values, dtype=dtype).rename(columns=names)

    pieces = [
        as_frame(callset['variants/CHROM'], {0: 'CHROM'}),
        as_frame(callset['variants/POS'], {0: 'POS'}, dtype='int64'),
        as_frame(callset['variants/REF'], {0: 'REF'}),
        as_frame(callset['variants/ALT'], {0: 'ALT_0', 1: 'ALT_1'}),
        as_frame(callset['calldata/GT'][:, 0], {0: 'GT_0', 1: 'GT_1'}),
    ]
    return pandas.concat(pieces, axis=1)

In [4]:
def declare_variants_only_with_changes(gt0, gt1, ref, alt0, alt1, pos):
    """Build a text label for every record whose genotype carries an ALT allele.

    All arguments are parallel, integer-indexable sequences; ``gt0``/``gt1``
    are the two genotype allele indices of each record.

    Label format, by genotype (gt0/gt1):
      * 0/1 -> ``"<pos>-<ref><alt0>"``
      * 1/1 -> ``"<pos>-<alt0>"``
      * 1/2 -> ``"<pos>-<alt0><alt1>"``

    Records with any other genotype (0/0, 1/0, 0/2, 2/2, negative values, ...)
    produce no label at all.

    Returns
    -------
    list of str, in record order.
    """

    def label_for(i):
        first, second = gt0[i], gt1[i]
        if first == 0 and second == 1:
            return str(pos[i]) + '-' + str(ref[i]) + str(alt0[i])
        if first == 1 and second == 1:
            return str(pos[i]) + '-' + alt0[i]
        if first == 1 and second == 2:
            return str(pos[i]) + '-' + str(alt0[i]) + str(alt1[i])
        return None

    candidates = (label_for(i) for i in range(len(gt0)))
    return [label for label in candidates if label is not None]

def declare_variants_without_changes(gt0, gt1, ref, alt0, alt1, pos):
    """Build a ``"<pos>-<ref>"`` label for every record with genotype 0/0.

    All arguments are parallel, integer-indexable sequences. ``alt0`` and
    ``alt1`` are accepted for signature parity with
    ``declare_variants_only_with_changes`` but are not used.

    Returns
    -------
    list of str, in record order.
    """
    return [
        str(pos[i]) + '-' + ref[i]
        for i in range(len(gt0))
        if gt0[i] == 0 and gt1[i] == 0
    ]
        


In [5]:
def calc_metrics(new_dataset, ref_dataset):
    """Score a new call set against a reference call set.

    Both arguments are DataFrames with the columns ``POS``, ``REF``,
    ``ALT_0``, ``ALT_1``, ``GT_0`` and ``GT_1`` (the layout produced by
    ``vcf_to_df``).

    Counting rules
    --------------
    * A position present only in ``new_dataset`` is a false positive; a
      position present only in ``ref_dataset`` is a false negative.
    * For positions present in both, each record is turned into a text label
      by ``declare_variants_only_with_changes`` (ALT-carrying genotypes) and
      ``declare_variants_without_changes`` (0/0 genotypes). A new label that
      also occurs among the reference labels of the same kind is a true
      positive, otherwise a false positive; a reference label missing from
      the new labels is a false negative.
    * Records whose genotype neither helper labels are not counted at all.

    NOTE(review): positions are matched on ``POS`` alone; ``CHROM`` is not
    consulted, so equal coordinates on different chromosomes are treated as
    the same site. Confirm the inputs are single-chromosome.

    Returns
    -------
    tuple ``(TP, FP, FN, precision, recall, f_score)`` with plain ``int``
    counts and ``float`` ratios. A ratio whose denominator is zero is
    reported as ``0.0`` (instead of raising ``ZeroDivisionError`` or
    yielding NaN).
    """

    def _ratio(numerator, denominator):
        # Undefined ratios (nothing to divide by) are reported as 0.0.
        return float(numerator) / float(denominator) if denominator else 0.0

    def _call_labels(shared):
        # Label the records of one data set, split into ALT-carrying and 0/0 calls.
        columns = [
            shared[name].values
            for name in ('GT_0', 'GT_1', 'REF', 'ALT_0', 'ALT_1', 'POS')
        ]
        # dtype=object keeps an empty label list from triggering dtype inference.
        changed = pandas.Series(declare_variants_only_with_changes(*columns), dtype=object)
        unchanged = pandas.Series(declare_variants_without_changes(*columns), dtype=object)
        return changed, unchanged

    # Membership of each record's position in the other data set.
    new_in_ref = new_dataset['POS'].isin(ref_dataset['POS'])
    ref_in_new = ref_dataset['POS'].isin(new_dataset['POS'])

    TP = 0
    FP = len(new_in_ref) - int(new_in_ref.sum())
    FN = len(ref_in_new) - int(ref_in_new.sum())

    # Select shared positions with the boolean mask itself, so the result does
    # not depend on the frames carrying a default 0..n-1 index.
    new_variants, new_non_variants = _call_labels(new_dataset.loc[new_in_ref])
    ref_variants, ref_non_variants = _call_labels(ref_dataset.loc[ref_in_new])

    label_pairs = (
        (new_variants, ref_variants),
        (new_non_variants, ref_non_variants),
    )
    for new_calls, ref_calls in label_pairs:
        new_hits = int(new_calls.isin(ref_calls).sum())
        ref_hits = int(ref_calls.isin(new_calls).sum())
        TP += new_hits
        FP += len(new_calls) - new_hits
        FN += len(ref_calls) - ref_hits

    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    f_score = _ratio(2 * precision * recall, precision + recall)

    return TP, FP, FN, precision, recall, f_score
    
    

In [6]:
def main(new_file, ref_file):
    """Load two VCF files and score the first against the second.

    ``new_file`` is read first, then ``ref_file``, both through
    ``vcf_to_df``; the resulting frames are handed to ``calc_metrics``,
    whose return value is passed back unchanged.
    """
    return calc_metrics(vcf_to_df(new_file), vcf_to_df(ref_file))


In [8]:

# Score the call set of every binomial probability threshold against the
# merged-pileup reference and write all results to a single CSV.
p = [0.5, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.98, 0.99]

# The reference file is the same for every threshold, so it is named once.
# NOTE(review): absolute, environment-specific path.
ref_vcf_str = '/sbgenomics/project-files/final_project_files/final_merged_pileup.called.vcf'

# Collect one record per threshold and build the frame once at the end,
# instead of enlarging the DataFrame row by row with a manual counter.
rows = []
for i in p:
    file = 'variant_call_results/binom_p' + str(i) + '_variant_call.vcf'
    TP, FP, FN, prec, rec, f_s = main(file, ref_vcf_str)
    rows.append([file, i, TP, FP, FN, prec, rec, f_s])
    # Progress label only; no per-threshold CSV is written.
    print('Metrics processing done for binom_p' + str(i) + '_variant_call_metrics.csv ')

sheet = pandas.DataFrame(
    rows,
    columns=['VCF name', 'Binomial_probability', 'TP', 'FP', 'FN', 'Precision', 'Recall', 'F-Score'],
)
sheet.to_csv('binom_variant_call_metrics.csv', index =False)
print('Processing done.')





Metrics processing done for binom_p0.5_variant_call_metrics.csv 
Metrics processing done for binom_p0.6_variant_call_metrics.csv 
Metrics processing done for binom_p0.65_variant_call_metrics.csv 
Metrics processing done for binom_p0.7_variant_call_metrics.csv 
Metrics processing done for binom_p0.75_variant_call_metrics.csv 
Metrics processing done for binom_p0.8_variant_call_metrics.csv 
Metrics processing done for binom_p0.85_variant_call_metrics.csv 
Metrics processing done for binom_p0.9_variant_call_metrics.csv 
Metrics processing done for binom_p0.95_variant_call_metrics.csv 
Metrics processing done for binom_p0.98_variant_call_metrics.csv 
Metrics processing done for binom_p0.99_variant_call_metrics.csv 
Processing done.


In [9]:
# Score the "short" call set of every binomial probability threshold against
# the short-pileup reference and write all results to a single CSV.
p = [0.5, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.98, 0.99]

# The reference file is the same for every threshold, so it is named once.
ref_vcf_str = 'final_short_pileup.called.vcf'

# Collect one record per threshold and build the frame once at the end,
# instead of enlarging the DataFrame row by row with a manual counter.
rows = []
for i in p:
    file = 'variant_call_short_results/binom_p' + str(i) + '_short_variant_call.vcf'
    TP, FP, FN, prec, rec, f_s = main(file, ref_vcf_str)
    rows.append([file, i, TP, FP, FN, prec, rec, f_s])
    # Progress label only; no per-threshold CSV is written.
    print('Metrics processing done for binom_p' + str(i) + '_variant_call_metrics.csv ')

sheet = pandas.DataFrame(
    rows,
    columns=['VCF name', 'Binomial_probability', 'TP', 'FP', 'FN', 'Precision', 'Recall', 'F-Score'],
)
sheet.to_csv('binom_short_variant_call_metrics.csv', index =False)
print('Processing done.')



Metrics processing done for binom_p0.5_variant_call_metrics.csv 
Metrics processing done for binom_p0.6_variant_call_metrics.csv 
Metrics processing done for binom_p0.65_variant_call_metrics.csv 
Metrics processing done for binom_p0.7_variant_call_metrics.csv 
Metrics processing done for binom_p0.75_variant_call_metrics.csv 
Metrics processing done for binom_p0.8_variant_call_metrics.csv 
Metrics processing done for binom_p0.85_variant_call_metrics.csv 
Metrics processing done for binom_p0.9_variant_call_metrics.csv 
Metrics processing done for binom_p0.95_variant_call_metrics.csv 
Metrics processing done for binom_p0.98_variant_call_metrics.csv 
Metrics processing done for binom_p0.99_variant_call_metrics.csv 
Processing done.


In [None]:
# Display the metrics table held in `sheet`, i.e. the one built by whichever
# cell assigning `sheet` was executed last.
sheet
