In [1]:
import allel
import pandas as pd
import numpy as np
import csv

def ref_vcf_to_df():
    callset1 = allel.read_vcf('/sbgenomics/project-files/mpileupvcf.mpileup.called.vcf', fields=['variants/REF', 'variants/CHROM', 'variants/POS', 'variants/ALT', 'calldata/GT'])
    # Creating ref_vcf DataFrame
    df1 = pd.DataFrame.from_dict([callset1['variants/CHROM'], callset1['variants/POS'], callset1['variants/REF']]).T
    df1 = df1.rename(columns={0:'CHROM', 1:'POS', 2:'REF'})
    df2 = pd.DataFrame.from_dict(callset1['variants/ALT'])
    df2 = df2.rename(columns={0:'ALT_1', 1:'ALT_2', 2:'ALT_3'})
    df3 = pd.DataFrame.from_dict(callset1['calldata/GT'][:,0])
    df3 = df3.rename(columns={0:'GT_1', 1:'GT_2'})
    frames = [df1, df2, df3]
    ref_vcf = pd.concat(frames, axis=1)
    return ref_vcf

def our_vcf_to_df(vcf_name):
    # Creating our_vcf
    callset2 = allel.read_vcf(vcf_name, fields=['variants/REF', 'variants/CHROM', 'variants/POS', 'variants/ALT', 'calldata/GT'])
    df1 = pd.DataFrame.from_dict([callset2['variants/CHROM'], callset2['variants/POS'], callset2['variants/REF']]).T
    df1 = df1.rename(columns={0:'CHROM', 1:'POS', 2:'REF'})
    df2 = pd.DataFrame.from_dict(callset2['variants/ALT'])
    df2 = df2.rename(columns={0:'ALT_1', 1:'ALT_2', 2:'ALT_3'})
    df3 = pd.DataFrame.from_dict(callset2['calldata/GT'][:,0])
    df3 = df3.rename(columns={0:'GT_1', 1:'GT_2'})
    frames = [df1, df2, df3]
    our_vcf = pd.concat(frames, axis=1)
    return our_vcf

# Function for creating the list of string, where string represents alleles
def allele_list(gt1, gt2, alt1, alt2, alt3, ref):
    lst = []
    for i in range(0,len(gt1)):
        if gt1[i] == 0 and gt2[i] == 0:
            lst.append(ref[i])
        elif gt1[i] == 0 and gt2[i] == 1:
            l = [ref[i],alt1[i]]
            lst.append(''.join(l))
        elif gt1[i] == 1 and gt2[i] == 0:
            l = [alt1[i],ref[i]]
            lst.append(''.join(l)) 
        elif gt1[i] == 1 and gt2[i] == 1:
            lst.append(alt1[i])
        elif gt1[i] == 0 and gt2[i] == 2:
            l = [ref[i],alt2[i]]
            lst.append(''.join(l))
        elif gt1[i] == 2 and gt2[i] == 0:
            l = [alt2[i],ref[i]]
            lst.append(''.join(l))
        elif gt1[i] == 1 and gt2[i] == 2:
            l = [alt1[i], alt2[i]]
            lst.append(''.join(l))
        elif gt1[i] == 2 and gt2[i] == 1:
            l = [alt2[i], alt1[i]]
            lst.append(''.join(l))
        elif gt1[i] == 2 and gt2[i] == 2: 
            lst.append(our_alt2[i])
        elif gt1[i] == 0 and gt2[i] == 3:
            l = [ref[i],alt3[i]]
            lst.append(''.join(l))
        elif gt1[i] == 3 and gt2[i] == 0:
            l = [alt3[i],ref[i]]
            lst.append(''.join(l)) 
        elif gt[i] == 1 and gt2[i] == 3:
            l = [alt1[i],alt3[i]]
            lst.append(''.join(l))
        elif gt[i] == 3 and gt2[i] == 1:
            l = [alt3[i],alt1[i]]
            lst.append(''.join(l)) 
        elif gt[i] == 2 and gt2[i] == 3:
            l = [alt2[i],alt3[i]]
            lst.append(''.join(l))
        elif gt[i] == 3 and gt2[i] == 2:
            l = [alt3[i],alt2[i]]
            lst.append(''.join(l))
        elif gt1[i] == 3 and gt2[i] == 3: 
            lst.append(our_alt3[i])
            
    return lst

def metrics_first(ref_vcf, our_vcf):
    # First case:
    # TP = number of same positions in our_vcf and ref_vcf with detecred variant (we don't care if variants aren't the same)
    # FP = number of those positions in our_vcf which aren't in ref_vcf
    # FN = number of those positions in ref_vcf which aren't in our_vcf

    # Series in which True means that that position is in ref_vcf and our_vcf at the same time, while False means that some position is in our_vcf but it isn't in ref_vcf
    bool_values = our_vcf['POS'].isin(ref_vcf['POS'])
    bool_count = bool_values.value_counts()
    # Number of trues values in series bool_values is our TP
    TP = bool_count[True]
    # Number of false values in series bool_values is our FP
    FP = bool_count[False]

    # # Series in which True means that that position is in ref_vcf and our_vcf at the same time, while False means that some position is in ref_vcf but it isn't in our_vcf
    bool_values2 = ref_vcf['POS'].isin(our_vcf['POS'])
    bool_count2 = bool_values2.value_counts()
    # Number of false values in series bool_values2 is our FN
    FN = bool_count2[False]

    # Calculating values of metrics
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1_score =2*precision*recall/(precision+recall)
    #print('Metrics in first case:\nTP:'+str(TP)+'\nFP:'+str(FP)+'\nFN:'+str(FN))
    #print('Precision:' + str(precision) + '\nRecall:'+ str(recall) + '\nF1-score:' + str(f1_score))
    return [TP,FP,FN,precision,recall,f1_score]

def metrics_second(ref_vcf,our_vcf):
    # Second case:
    # TP = number of identical variants
    # FP - as number of falsely declared variants = number of those positions which are in our_vcf, but aren't in ref_vcf + number of those positions which can be found in both files, but whose variants aren't identical
    # FN = number of those positions in ref_vcf which aren't in our_vcf

    # Series in which True means that that position is in ref_vcf and our_vcf at the same time, while False means that some position is in our_vcf but it isn't in ref_vcf
    bool_values = our_vcf['POS'].isin(ref_vcf['POS'])
    bool_count = bool_values.value_counts()
    TP_base = bool_count[True] 
    FP_base = bool_count[False] # number of those positions which are in our_vcf, but aren't in ref_vcf

    # List of indices of same positions (positions found in both files) in our_vcf
    our_same_pos = bool_values.index[bool_values == True].tolist()

    bool_values2 = ref_vcf['POS'].isin(our_vcf['POS'])
    bool_count2 = bool_values2.value_counts()
    FN_base = bool_count2[False]
    # List of indices of same positions (positions found in both files) in ref_vcf
    ref_same_pos = bool_values2.index[bool_values2 == True].tolist()
    
    our_gt1 = our_vcf['GT_1'].iloc[our_same_pos].values
    our_gt2 = our_vcf['GT_2'].iloc[our_same_pos].values
    our_alt1 = our_vcf['ALT_1'].iloc[our_same_pos].values
    our_alt2 = our_vcf['ALT_2'].iloc[our_same_pos].values
    our_alt3 = our_vcf['ALT_3'].iloc[our_same_pos].values
    our_ref = our_vcf['REF'].iloc[our_same_pos].values
    our_lst = allele_list(our_gt1,our_gt2, our_alt1, our_alt2, our_alt3, our_ref)

    ref_gt1 = ref_vcf['GT_1'].iloc[ref_same_pos].values
    ref_gt2 = ref_vcf['GT_2'].iloc[ref_same_pos].values
    ref_alt1 = ref_vcf['ALT_1'].iloc[ref_same_pos].values
    ref_alt2 = ref_vcf['ALT_2'].iloc[ref_same_pos].values
    ref_alt3 = ref_vcf['ALT_3'].iloc[ref_same_pos].values
    ref_ref = ref_vcf['REF'].iloc[ref_same_pos].values
    ref_lst = allele_list(ref_gt1, ref_gt2, ref_alt1, ref_alt2, ref_alt3, ref_ref)

    our_variants = pd.Series(our_lst)
    ref_variants = pd.Series(ref_lst)

    # First find identical variants (identical alleles)
    var_bool_values = our_variants.isin(ref_variants)
    var_bool_count = var_bool_values.value_counts()
    TP = var_bool_count[True]
    FP_add = var_bool_count[False]
    FP = FP_add + FP_base
    FN = FN_base
    #print('Metrics in second case:\nTP:'+str(TP)+'\nFP:'+str(FP)+'\nFN:'+str(FN))
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1_score =2*precision*recall/(precision+recall)
    #print('Precision:' + str(precision) + '\nRecall:'+ str(recall) + '\nF1-score:' + str(f1_score)+'\n')
    return [TP,FP,FN,precision,recall,f1_score]

def metrics_third(ref_vcf, our_vcf):
    bool_values = our_vcf['POS'].isin(ref_vcf['POS'])
    bool_count = bool_values.value_counts()
    TP_base = bool_count[True] 
    FP_base = bool_count[False] # number of those positions which are in our_vcf, but aren't in ref_vcf

    # List of indices of same positions (positions found in both files) in our_vcf
    our_same_pos = bool_values.index[bool_values == True].tolist()

    bool_values2 = ref_vcf['POS'].isin(our_vcf['POS'])
    bool_count2 = bool_values2.value_counts()
    FN_base = bool_count2[False]
    # List of indices of same positions (positions found in both files) in ref_vcf
    ref_same_pos = bool_values2.index[bool_values2 == True].tolist()
    
    our_gt1 = our_vcf['GT_1'].iloc[our_same_pos].values
    our_gt2 = our_vcf['GT_2'].iloc[our_same_pos].values
    our_alt1 = our_vcf['ALT_1'].iloc[our_same_pos].values
    our_alt2 = our_vcf['ALT_2'].iloc[our_same_pos].values
    our_alt3 = our_vcf['ALT_3'].iloc[our_same_pos].values
    our_ref = our_vcf['REF'].iloc[our_same_pos].values
    our_lst = allele_list(our_gt1,our_gt2, our_alt1, our_alt2, our_alt3, our_ref)

    ref_gt1 = ref_vcf['GT_1'].iloc[ref_same_pos].values
    ref_gt2 = ref_vcf['GT_2'].iloc[ref_same_pos].values
    ref_alt1 = ref_vcf['ALT_1'].iloc[ref_same_pos].values
    ref_alt2 = ref_vcf['ALT_2'].iloc[ref_same_pos].values
    ref_alt3 = ref_vcf['ALT_3'].iloc[ref_same_pos].values
    ref_ref = ref_vcf['REF'].iloc[ref_same_pos].values
    ref_lst = allele_list(ref_gt1, ref_gt2, ref_alt1, ref_alt2, ref_alt3, ref_ref)

    our_variants = pd.Series(our_lst)
    ref_variants = pd.Series(ref_lst)

    # First find identical variants (identical alleles)
    var_bool_values = our_variants.isin(ref_variants)
    var_bool_count = var_bool_values.value_counts()
    TP = var_bool_count[True]
    FN_add = var_bool_count[False]
    FN = FN_base + FN_add
    FP = FP_base
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1_score =2*precision*recall/(precision+recall)
    return [TP,FP,FN,precision,recall,f1_score]
    

def main(name):
    ref_vcf = ref_vcf_to_df()
    ref_len = ref_vcf.shape[0]
    our_vcf = our_vcf_to_df(name)
    our_len = our_vcf.shape[0]
    [TP1, FP1, FN1, precision1, recall1, f1_score1] = metrics_first(ref_vcf, our_vcf)
    [TP2, FP2, FN2, precision2, recall2, f1_score2] = metrics_second(ref_vcf, our_vcf)
    [TP3, FP3, FN3, precision3, recall3, f1_score3] = metrics_third(ref_vcf, our_vcf)
    return [TP1, FP1, FN1, precision1, recall1, f1_score1,TP2, FP2, FN2, precision2, recall2, f1_score2, TP3, FP3, FN3, precision3, recall3, f1_score3, ref_len, our_len]
    
if __name__ == "__main__":
    df = pd.DataFrame(columns=['File Name', 'Probability', 'TP1', 'FP1', 'FN1', 'Precision1', 'Recall1', 'F1-score1','TP2', 'FP2', 'FN2', 'Precision2', 'Recall2', 'F1-score2', 'TP3', 'FP3', 'FN3', 'Precision3', 'Recall3', 'F1-score3'])
    probabilities = [0.5, 0.55, 0.6, 0.65, 0.70, 0.75, 0.8, 0.85, 0.9, 0.95]
    i = 0
    for p in probabilities:
        name = 'vcf4-2('+str(p)+').txt'
        TP1, FP1, FN1, precision1, recall1, f1_score1,TP2, FP2, FN2, precision2, recall2, f1_score2, TP3, FP3, FN3, precision3, recall3, f1_score3, ref_len, our_len = main(name)
        df.loc[i] = [name] + [p, TP1, FP1, FN1, precision1, recall1, f1_score1,TP2, FP2, FN2, precision2, recall2, f1_score2, TP3, FP3, FN3, precision3, recall3, f1_score3]
        i += 1
       
    df.to_csv('metrics.csv', index=False)
        
        
    
    
    
    



In [2]:
df

Unnamed: 0,File Name,Probability,TP1,FP1,FN1,Precision1,Recall1,F1-score1,TP2,FP2,FN2,Precision2,Recall2,F1-score2,TP3,FP3,FN3,Precision3,Recall3,F1-score3
0,vcf4-2(0.5).txt,0.5,2006,457,1169,0.814454,0.631811,0.7116,1946,517,1169,0.790093,0.624719,0.697741,1946,457,1229,0.809821,0.612913,0.697741
1,vcf4-2(0.55).txt,0.55,1959,420,1216,0.823455,0.617008,0.705438,1904,475,1216,0.800336,0.610256,0.69249,1904,420,1271,0.819277,0.599685,0.69249
2,vcf4-2(0.6).txt,0.6,1959,420,1216,0.823455,0.617008,0.705438,1904,475,1216,0.800336,0.610256,0.69249,1904,420,1271,0.819277,0.599685,0.69249
3,vcf4-2(0.65).txt,0.65,1959,420,1216,0.823455,0.617008,0.705438,1904,475,1216,0.800336,0.610256,0.69249,1904,420,1271,0.819277,0.599685,0.69249
4,vcf4-2(0.7).txt,0.7,1977,554,1198,0.781114,0.622677,0.692955,1921,610,1198,0.758989,0.615903,0.68,1921,554,1254,0.776162,0.605039,0.68
5,vcf4-2(0.75).txt,0.75,2020,843,1155,0.705554,0.63622,0.669096,1965,898,1155,0.686343,0.629808,0.656861,1965,843,1210,0.699786,0.618898,0.656861
6,vcf4-2(0.8).txt,0.8,2372,888,804,0.727607,0.746851,0.737104,2325,935,804,0.71319,0.743049,0.727813,2325,888,851,0.723623,0.732053,0.727813
7,vcf4-2(0.85).txt,0.85,2932,1532,244,0.65681,0.923174,0.767539,2888,1576,244,0.646953,0.922095,0.7604,2888,1532,288,0.653394,0.90932,0.7604
8,vcf4-2(0.9).txt,0.9,3116,3027,60,0.507244,0.981108,0.668741,3076,3067,60,0.500733,0.980867,0.663002,3076,3027,100,0.504014,0.968514,0.663002
9,vcf4-2(0.95).txt,0.95,3165,6769,12,0.318603,0.996223,0.482801,3123,6811,12,0.314375,0.996172,0.477925,3123,6769,54,0.31571,0.983003,0.477925
