In [238]:
import pandas as pd
import numpy as np
import ast
import re
import json
from ete3 import Tree
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

In [189]:
# Toy example used to sanity-check the clade-assignment logic: a small tree
# whose internal nodes are named (newick parsed with format=1) and three
# groups of "known" leaves.
known_test = dict(
    group1=['H', 'K'],
    group2=['L', 'N', 'Q'],
    group3=['E'],
)
t = Tree('((((H,K)D,(F,I)G)B,E)A,((L,(N,Q)O)J,(P,S)M)C);', format=1)
print(t)


            /-H
         /-|
        |   \-K
      /-|
     |  |   /-F
   /-|   \-|
  |  |      \-I
  |  |
  |   \-E
--|
  |      /-L
  |   /-|
  |  |  |   /-N
  |  |   \-|
   \-|      \-Q
     |
     |   /-P
      \-|
         \-S


In [203]:
# Assign leaves of the toy tree to groups: for each group, collect the leaves
# under every node that contains ALL of that group's known members and NONE
# of the other groups' known members.
subtyped_viruses = {key: set() for key in known_test.keys()}

# Loop-invariant lookups, built once instead of once per (node, subtype) pair:
# each group's own known members, and the known members of all other groups.
known_sets = {key: set(members) for key, members in known_test.items()}
other_known_sets = {
    key: {name
          for other_key, members in known_test.items() if other_key != key
          for name in members}
    for key in known_test
}

for node in t.iter_descendants("postorder"):
    # Names of every leaf below this node; a set keeps the checks below cheap
    descendents = {leaf.name for leaf in node.get_leaves()}

    for subtype in known_test.keys():
        #check if node has all known viruses of a subtype
        if known_sets[subtype] <= descendents:
            #check that node doesn't have any viruses of other known subtypes
            if other_known_sets[subtype].isdisjoint(descendents):
                subtyped_viruses[subtype].update(descendents)


print(subtyped_viruses)

{'group1': {'I', 'H', 'K', 'F'}, 'group2': {'S', 'N', 'P', 'Q', 'L'}, 'group3': {'E'}}


In [219]:
# Load the full coronavirus tree (newick parsed with format=1) and the
# matching metadata table, indexed by strain so rows can be looked up by name.
cov_tree = Tree('../results/tree_cov_full.nwk', format=1)
cov_metadata = (
    pd.read_csv('../results/metadata_cov_full.tsv', sep='\t')
    .set_index('strain')
)

In [205]:
# Reference viruses for each subtype. These identifiers are matched against
# leaf names in the tree to locate the clade belonging to each subtype.
known = {
    'oc43': [
        'KF963229', 'KF530087', 'KF530059', 'LC506876',
        'KU131570', 'LC506782', 'LC506896',
    ],
    'hku1': [
        'KR055515', 'KF430197', 'KF686339', 'KR055516',
        'MF996629', 'MH940245',
    ],
    'nl63': [
        'KM055607', 'KT359837', 'KM055597', 'KM055602',
        'JX104161', 'KY862037', 'MF996663',
    ],
    '229e': [
        'JX503060', 'LC005741', 'KM055524', 'KM055568',
        'KJ866103', 'GU068548', 'KY369908', 'KT359754',
    ],
    'mers': [
        'KJ156950', 'KT357808', 'MK129253', 'KX034094', 'KJ156890',
    ],
    'sars1': [
        'AY345986', 'GU553363', 'JN247396', 'DQ182595',
    ],
    'sars2': [
        'MT326086', 'MT159717', 'MT304486',
    ],
}


In [233]:
# Assign tree leaves to subtypes: for each subtype, collect the leaves under
# every node that contains ALL of that subtype's known viruses and NONE of
# the known viruses of any other subtype.
subtyped_viruses = {key: set() for key in known.keys()}

# Loop-invariant lookups, built once instead of once per (node, subtype) pair:
# each subtype's own known viruses, and the known viruses of all other subtypes.
known_sets = {key: set(members) for key, members in known.items()}
other_known_sets = {
    key: {name
          for other_key, members in known.items() if other_key != key
          for name in members}
    for key in known
}

for node in cov_tree.iter_descendants("postorder"):
    # Names of every leaf below this node; a set keeps the checks below cheap
    descendents = {leaf.name for leaf in node.get_leaves()}

    for subtype in known.keys():
        #check if node has all known viruses of a subtype
        if known_sets[subtype] <= descendents:
            #check that node doesn't have any viruses of other known subtypes
            if other_known_sets[subtype].isdisjoint(descendents):
                subtyped_viruses[subtype].update(descendents)


print(len(subtyped_viruses['oc43']))

869


In [83]:
#Check that known viruses were included in tree
# Leaf names are read from cov_tree, the tree the subtyping above runs on, so
# this cell works on a fresh kernel without relying on objects defined elsewhere.
all_viruses_in_tree = [leaf.name for leaf in cov_tree.get_leaves()]

# For each subtype, list the known viruses that have no matching leaf.
# A subtype with a missing known virus can never satisfy the
# "node contains all known viruses" test used for subtyping.
for subtype, known_names in known.items():
    print(str(subtype)+' viruses not found on the tree')
    print(np.setdiff1d(known_names, all_viruses_in_tree))

oc43 viruses not found on the tree
[]
hku1 viruses not found on the tree
[]
nl63 viruses not found on the tree
[]
229e viruses not found on the tree
[]
mers viruses not found on the tree
[]
sars1 viruses not found on the tree
[]
sars2 viruses not found on the tree
[]


In [227]:
# Count, per subtype, the strains placed in that subtype's clade whose
# metadata 'subtype' value differs from the clade's subtype.
newly_subtyped = dict.fromkeys(known, 0)

for subtype, strains in subtyped_viruses.items():
    for strain in strains:
        previous_label = cov_metadata.loc[strain, 'subtype']
        if previous_label != subtype:
            newly_subtyped[subtype] += 1

print(newly_subtyped)

{'oc43': 495, 'hku1': 0, 'nl63': 0, '229e': 0, 'mers': 606, 'sars1': 54, 'sars2': 1140}


In [247]:
#Edit fasta file to include new labels
#Append virus subtype to fasta fields, to ultimately create column in fauna
def add_newly_subtyped_to_virus_fastas(input_fasta):
    """Write a copy of ``input_fasta`` whose headers carry the tree-derived subtype.

    Each record header is split on '|'. When the first field is a strain found
    in the module-level ``subtyped_viruses`` dict, the third-from-last field is
    overwritten with the subtype name and the second-from-last field with the
    matching ``cov_types`` label ('alpha' or 'beta'). Every other record is
    copied with its header unchanged.

    Parameters
    ----------
    input_fasta : str
        Path of the fasta file to annotate.

    Returns
    -------
    None
        The annotated records are written to the input path with a trailing
        '.fasta' removed and '_subtyped.fasta' appended.

    Raises
    ------
    IndexError
        If a matched record's header has fewer than three '|'-separated fields.
    """
    # Strip only a trailing '.fasta'; str.replace would also remove the
    # substring from directory names elsewhere in the path.
    suffix = '.fasta'
    if input_fasta.endswith(suffix):
        stem = input_fasta[:-len(suffix)]
    else:
        stem = input_fasta
    output_fasta = stem + "_subtyped.fasta"
    cov_types = {'oc43':'beta', 'hku1':'beta', 'nl63':'alpha', '229e':'alpha', 'mers':'beta', 'sars1':'beta', 'sars2':'beta'}

    # Build the strain -> subtype lookup once instead of scanning every
    # subtype for every record. Iterating in `known` order means a strain
    # listed under several subtypes resolves to the last one.
    strain_to_subtype = {}
    for subtype in known.keys():
        for strain in subtyped_viruses[subtype]:
            strain_to_subtype[strain] = subtype

    sequences = []

    with open(input_fasta, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):

            new_record_list = record.description.split('|')

            #Annotate subtypes
            subtype = strain_to_subtype.get(new_record_list[0])
            if subtype is not None:
                new_record_list[-3] = subtype
                new_record_list[-2] = cov_types[subtype]
            new_record_description = '|'.join(new_record_list)

            # Put the whole header in `id` and leave `description` empty.
            # Passing the same text as both makes the FASTA writer emit
            # "header header" whenever the header contains whitespace.
            sequences.append(SeqRecord(record.seq, id=new_record_description, description=''))

    SeqIO.write(sequences, output_fasta, "fasta")

In [248]:
# Annotate the full human CoV fasta; the result is written alongside the
# input as '../data/human_cov_full_subtyped.fasta'.
add_newly_subtyped_to_virus_fastas('../data/human_cov_full.fasta')

In [259]:
def count_new_annotations(virus,
                          before_fasta='../data/human_cov_full.fasta',
                          after_fasta='../data/human_cov_full_subtyped.fasta'):
    """Print how many fasta records mention ``virus`` before and after subtyping.

    Parameters
    ----------
    virus : str
        Text to look for (e.g. '229e'). A record is counted when this text
        occurs anywhere in ``record.id`` (substring match).
    before_fasta : str, optional
        Path of the original fasta file.
    after_fasta : str, optional
        Path of the subtype-annotated fasta file.

    Returns
    -------
    None
        The two counts are printed as ``before after``.
    """
    def count_matching(fasta_path):
        # Biopython's record.id is the header text up to the first whitespace,
        # so header fields that follow a space are not searched.
        with open(fasta_path, "r") as handle:
            return sum(1 for record in SeqIO.parse(handle, "fasta") if virus in record.id)

    before = count_matching(before_fasta)
    after = count_matching(after_fasta)
    print(before, after)

In [263]:
# Number of records whose id contains '229e' in the original vs. the subtyped fasta
count_new_annotations('229e')

301 301
