In [146]:
import pandas as pd
import numpy as np
import ast
import re
import json
from ete3 import Tree

In [189]:
# Toy fixture for exercising the subtype-assignment logic on a small tree.
# The Newick string names its internal nodes (D, G, B, A, O, J, M, C) and is
# parsed with format=1.
t = Tree('((((H,K)D,(F,I)G)B,E)A,((L,(N,Q)O)J,(P,S)M)C);', format=1)

# Tips whose group membership is treated as known in the toy example.
known_test = dict(
    group1=['H', 'K'],
    group2=['L', 'N', 'Q'],
    group3=['E'],
)

# Show the ASCII rendering of the tree so the expected clades can be read off.
print(t)


            /-H
         /-|
        |   \-K
      /-|
     |  |   /-F
   /-|   \-|
  |  |      \-I
  |  |
  |   \-E
--|
  |      /-L
  |   /-|
  |  |  |   /-N
  |  |   \-|
   \-|      \-Q
     |
     |   /-P
      \-|
         \-S


In [203]:
# Assign tips of the toy tree to groups. A node is credited to a group when
# its leaves include ALL of that group's known tips and NONE of the known tips
# of any other group; every leaf under such a node is then added to the group.
subtyped_viruses = {key: set() for key in known_test.keys()}

# Loop-invariant lookups, built once rather than once per (node, subtype) pair.
known_sets = {subtype: set(members) for subtype, members in known_test.items()}
other_known = {
    subtype: {
        name
        for other, members in known_test.items()
        if other != subtype
        for name in members
    }
    for subtype in known_test
}

for node in t.iter_descendants("postorder"):
    # A set gives constant-time membership for the subset/disjoint checks below.
    descendents = {leaf.name for leaf in node.get_leaves()}

    for subtype, this_subtype in known_sets.items():
        #check if node has all known viruses of a subtype
        if not this_subtype <= descendents:
            continue
        #check that node doesn't have any viruses of other known subtypes
        if descendents.isdisjoint(other_known[subtype]):
            # Results are accumulated by set union, so visiting order is irrelevant.
            subtyped_viruses[subtype].update(descendents)


print(subtyped_viruses)

{'group1': {'I', 'H', 'K', 'F'}, 'group2': {'S', 'N', 'P', 'Q', 'L'}, 'group3': {'E'}}


In [204]:
# Load the full tree from a path relative to this notebook, using the same
# format=1 flag as the toy tree parsed earlier in the notebook.
# NOTE(review): presumably a Newick file (".nwk" extension) -- confirm.
cov_tree = Tree('../results/tree_cov_full.nwk', format=1)

In [205]:
# Reference tip names with a known subtype, keyed by subtype label.
# These are the anchors used to decide which clades belong to which subtype.
known = {
    'oc43': [
        'KF963229', 'KF530087', 'KF530059', 'LC506876',
        'KU131570', 'LC506782', 'LC506896',
    ],
    'hku1': [
        'KR055515', 'KF430197', 'KF686339',
        'KR055516', 'MF996629', 'MH940245',
    ],
    'nl63': [
        'KM055607', 'KT359837', 'KM055597', 'KM055602',
        'JX104161', 'KY862037', 'MF996663',
    ],
    '229e': [
        'JX503060', 'LC005741', 'KM055524', 'KM055568',
        'KJ866103', 'GU068548', 'KY369908', 'KT359754',
    ],
    'mers': ['KJ156950', 'KT357808', 'MK129253', 'KX034094', 'KJ156890'],
    'sars1': ['AY345986', 'GU553363', 'JN247396', 'DQ182595'],
    'sars2': ['MT326086', 'MT159717', 'MT304486'],
}


In [209]:
# Assign tips of `cov_tree` to subtypes. A node is credited to a subtype when
# its leaves include ALL of that subtype's known viruses and NONE of the known
# viruses of any other subtype; every leaf under such a node is then added to
# the subtype's set.
subtyped_viruses = {key: set() for key in known.keys()}

# Loop-invariant lookups, built once rather than once per (node, subtype) pair.
known_sets = {subtype: set(members) for subtype, members in known.items()}
other_known = {
    subtype: {
        name
        for other, members in known.items()
        if other != subtype
        for name in members
    }
    for subtype in known
}

for node in cov_tree.iter_descendants("postorder"):
    # A set gives constant-time membership for the subset/disjoint checks below.
    descendents = {leaf.name for leaf in node.get_leaves()}

    for subtype, this_subtype in known_sets.items():
        #check if node has all known viruses of a subtype
        if not this_subtype <= descendents:
            continue
        #check that node doesn't have any viruses of other known subtypes
        if descendents.isdisjoint(other_known[subtype]):
            # Results are accumulated by set union, so visiting order is irrelevant.
            subtyped_viruses[subtype].update(descendents)


print(subtyped_viruses)

{'oc43': {'KX538975', 'KR055606', 'LC506948', 'KF572901', 'LC506815', 'KF572724', 'LC506908', 'KF572784', 'LC506907', 'LC506918', 'KF572685', 'KR055600', 'KF572885', 'LC506787', 'KY967360', 'MF996608', 'KF572879', 'LC506848', 'KF572942', 'LC331086', 'KF572816', 'KF572895', 'LC506880', 'LC506783', 'KU745544', 'LC506904', 'KR055609', 'KF923922', 'LC331074', 'KF572947', 'JX513268', 'KR055589', 'LC506864', 'KF572836', 'LC506837', 'LC506898', 'LC331091', 'KF572849', 'KF572811', 'LC506774', 'KF923890', 'KF572687', 'KF572869', 'KF963210', 'KF572739', 'KF530092', 'LC331073', 'KF572916', 'KF572890', 'KF572843', 'KF572690', 'KF572722', 'KF963212', 'LC331084', 'KF923897', 'KF963215', 'LC506755', 'KF572664', 'KF572828', 'KF963232', 'LC331076', 'KF923915', 'KX538978', 'KR055610', 'LC506861', 'KR055581', 'LC506779', 'KF963197', 'KF572756', 'KF572788', 'LC506929', 'KF923907', 'LC506857', 'LC506884', 'KF572758', 'KF572825', 'LC506897', 'KF572677', 'LC331071', 'KF530095', 'KF572630', 'KF923896', 'KF572

In [83]:
#Check that known viruses were included in tree
# Tip names are read from `cov_tree`, the tree this notebook loads, so the
# check runs on a fresh kernel without relying on objects from other sessions.
# NOTE(review): this replaces a lookup on `bt_tree`/`bt`, which are not defined
# in the visible notebook -- confirm they referred to the same tree.
all_viruses_in_tree = [leaf.name for leaf in cov_tree.get_leaves()]

for subtype, known_names in known.items():
    print(str(subtype)+' viruses not found on the tree')
    # setdiff1d returns the known names that are absent from the tree's tips;
    # an empty array means every reference virus is present.
    print(np.setdiff1d(known_names,all_viruses_in_tree))

oc43 viruses not found on the tree
[]
hku1 viruses not found on the tree
[]
nl63 viruses not found on the tree
[]
229e viruses not found on the tree
[]
mers viruses not found on the tree
[]
sars1 viruses not found on the tree
[]
sars2 viruses not found on the tree
[]
