In [47]:
import argparse
import json
import re
import ast
import collections
import pandas as pd
import numpy as np
from Bio import SeqIO

In [169]:
all_df = pd.read_csv('../dataframes/h3n2_ha_12y_hi.csv')
egg_df = all_df[all_df['passage']=='egg']
#filter data for only paired sequences
df = all_df[all_df['pair_id']!=0]

In [127]:
#Re-organize DF to one row per pair
sub_egg = df[df['passage']=='egg'][['source', 'egg_muts']]
sub_u = df[df['passage']=='unpassaged'][['source', 'strain']].rename(columns = {'strain':'unpassaged_pair'})
sub_u['unpassaged_pair'] = True
sub_cell = df[df['passage']=='cell'][['source', 'strain']].rename(columns = {'strain':'cell_pair'})
sub_cell['cell_pair'] = True

pairs_u_df = sub_egg.merge(sub_u)
pairs_cell_df = sub_egg.merge(sub_cell)
pairs_cell_u_df = sub_u.merge(sub_cell)
pairs_df = pairs_u_df.merge(pairs_cell_df, how='outer')

In [128]:
seqs = SeqIO.to_dict(SeqIO.parse("../results/aa-seq_who_h3n2_ha_12y_concat_hi_HA1.fasta", "fasta"))

In [129]:
positions = [160, 194, 186, 225, 219, 203, 156, 138, 246]

In [130]:
#Find false positives (mutation inferred, but strain is not mutated)
num_muts_inferred = 0
num_false_pos = 0
num_muts_inferred_limitsites = 0
num_false_pos_limitsites = 0

for k,v in pairs_df.iterrows():
    for egg_mut in ast.literal_eval(v['egg_muts']):
        
        egg_mut_pos = int(re.findall('\d+', egg_mut)[0])
        egg_aa = seqs[v['source']+'-egg'][(egg_mut_pos-1)]
        
        num_muts_inferred+=1
        if egg_mut_pos in positions:
            num_muts_inferred_limitsites+=1
            
        if v['unpassaged_pair']==True:
            u_aa = seqs[v['source']][(egg_mut_pos-1)]
            if u_aa == egg_aa:
                num_false_pos+=1
                if egg_mut_pos in positions:
                    num_false_pos_limitsites+=1
                
        if v['cell_pair']==True:
            cell_aa = seqs[v['source']+'-cell'][(egg_mut_pos-1)]
            if cell_aa == egg_aa:
                num_false_pos+=1
                if egg_mut_pos in positions:
                    num_false_pos_limitsites+=1

print(num_muts_inferred)   
print(num_false_pos)
print(num_muts_inferred_limitsites)
print(num_false_pos_limitsites)

591
21
473
2


In [177]:
#Estimate number false positives in ALL egg strains

total_muts_inferred = 0
total_muts_inferred_limitsites = 0

for k,v in egg_df.iterrows():
    for egg_mut in ast.literal_eval(v['egg_muts']):
        egg_mut_pos = int(re.findall('\d+', egg_mut)[0])
        total_muts_inferred+=1
        if egg_mut_pos in positions:
            total_muts_inferred_limitsites+=1
            
est_false_pos = total_muts_inferred*(num_false_pos/num_muts_inferred)
est_false_pos_limitsites = total_muts_inferred_limitsites*(num_false_pos_limitsites/num_muts_inferred_limitsites)

print(est_false_pos)
print(est_false_pos_limitsites)

31.979695431472084
2.608879492600423


In [167]:
#Find false negatives (strain is mutated, but mutation not inferred)
num_muts_direct = 0
num_false_neg = 0
num_muts_direct_limitsites = 0
num_false_neg_limitsites = 0

for k,v in pairs_df.iterrows():
    
    egg_ha1 = seqs[v['source']+'-egg']
    
    for residue in range(len(egg_ha1)):
        egg_aa = egg_ha1[int(residue)-1]
        
        if v['unpassaged_pair']==True:
            u_ha1 = seqs[v['source']]
            u_aa = u_ha1[int(residue)-1]
            if egg_aa != u_aa:
                num_muts_direct+=1
                u_mutation = u_aa + str(residue) + egg_aa
                if u_mutation not in ast.literal_eval(v['egg_muts']):
                    num_false_neg+=1
                
                if residue in positions:
                    num_muts_direct_limitsites+=1
                    if u_mutation not in ast.literal_eval(v['egg_muts']):
                        num_false_neg_limitsites+=1
                    
                
        if v['cell_pair']==True:
            cell_ha1 = seqs[v['source']+'-cell']
            cell_aa = cell_ha1[int(residue)-1]
            
            cell_tip_muts = df[df['strain']==(v['source']+'-cell')]['tip_HA1_muts'].item()
            
            #Only want egg muts, not cell muts
            if str(residue) not in cell_tip_muts: 
                if egg_aa != cell_aa:
                    num_muts_direct+=1
                    cell_mutation = cell_aa + str(residue) + egg_aa
                    if cell_mutation not in ast.literal_eval(v['egg_muts']):
                        num_false_neg+=1

                    if residue in positions:
                        num_muts_direct_limitsites+=1
                        if cell_mutation not in ast.literal_eval(v['egg_muts']):
                            num_false_neg_limitsites+=1

        
print(num_muts_direct)
print(num_false_neg)
print(num_muts_direct_limitsites)
print(num_false_neg_limitsites)

603
46
497
39


In [182]:
#Estimate number false negatives in ALL egg strains
num_paired_egg_seqs = len(pairs_df)
num_total_egg_seqs = len(egg_df)

est_total_mutations = (num_total_egg_seqs/num_paired_egg_seqs) * num_muts_direct
est_total_mutations_limitsites = (num_total_egg_seqs/num_paired_egg_seqs) * num_muts_direct_limitsites
est_false_neg = (num_total_egg_seqs/num_paired_egg_seqs)* num_false_neg
est_false_neg_limitsites = (num_total_egg_seqs/num_paired_egg_seqs)* num_false_neg_limitsites

print(est_total_mutations)
print(est_total_mutations_limitsites)
print(est_false_neg)
print(est_false_neg_limitsites)

928.9459459459459
765.6486486486486
70.86486486486487
60.08108108108108
