In [5]:
import json
from augur.utils import json_to_tree
# Path to the Auspice JSON export of the tree that the following cells load;
# it is a relative path, so it resolves against the notebook's working directory.
tree_file = '../../../../nextstrain-build/phylogenetic_only_camels/auspice/mers.json'

In [6]:
# Read in the tree.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's default locale encoding.
with open(tree_file, 'r', encoding='utf-8') as f:
    tree_json = json.load(f)

# Put the tree in Bio.Phylo format.
tree = json_to_tree(tree_json)

# Per-node mutation tables, shaped {node name: {gene: [mutation, ...]}}.
# `stop` collects mutations ending in 'X'; `no_stop` collects the rest.
stop = {}
no_stop = {}

In [7]:
# Iterate through all branches on the tree and sort their mutations into
# `stop` (mutation string ends in 'X') and `no_stop` (everything else).
#
# The accumulators are reset here so that re-running this cell starts from a
# clean slate instead of appending every mutation a second time.
stop = {}
no_stop = {}

for node in tree.find_clades():
    # 'mutations' in branch_attrs maps gene name -> list of mutation strings;
    # branches without mutations simply contribute nothing.
    muts = node.branch_attrs.get('mutations', {})
    for gene, mutations in muts.items():
        for m in mutations:
            if "-" in m:  # skip over deletions and insertions
                continue
            # NOTE(review): mutations ending in 'X' are treated as "stop" here;
            # the printed output also contains entries such as '*247X' — confirm
            # that 'X' (rather than '*') is the intended marker for this dataset.
            bucket = stop if m.endswith("X") else no_stop
            # Create the node- and gene-level containers on first use.
            bucket.setdefault(node.name, {}).setdefault(gene, []).append(m)

print(stop)
print(no_stop)

{'NODE_0000038': {'Orf4b': ['R206X', '*247X']}, 'MK967708': {'Nsp2': ['L117X'], 'ORF1a': ['L310X'], 'S1': ['G60X'], 'Spike': ['G79X', '*1354X']}, 'NODE_0000046': {'Orf4b': ['W67X', 'Y68X', 'K83X', 'Y85X']}, 'NODE_0000048': {'Orf4b': ['Q15X']}, 'NODE_0000049': {'Orf3': ['S92X', 'D97X', 'Y100X', 'N103X', '*104X']}, 'MG923473': {'Orf3': ['P86X']}, 'NODE_0000050': {'Orf3': ['H89X']}, 'MG923477': {'Orf4b': ['L125X']}, 'MG923474': {'Orf4b': ['N65X']}, 'NODE_0000061': {'Orf3': ['F96X', '*104X']}, 'NODE_0000062': {'Orf3': ['S101X']}, 'NODE_0000084': {'Nsp3': ['I186X', 'A187X'], 'ORF1a': ['I1039X', 'A1040X']}, 'NODE_0000099': {'Orf3': ['T87X', '*104X']}, 'NODE_0000100': {'Orf3': ['D85X', 'H89X']}, 'KX108941': {'Orf5': ['L7X', 'F8X', '*225X']}}
{'NODE_0000036': {'nuc': ['A2019T', 'A2040G', 'C2238T', 'A4996G', 'C5031T', 'C5828T', 'T6010C', 'G6277T', 'G6680A', 'C7276T', 'T7733C', 'A8193C', 'T8207C', 'C8210T', 'T8305C', 'T8333C', 'C8726T', 'T9779C', 'G10359T', 'C10833T', 'G11439A', 'C12809T', 'T148

In [8]:
def mut_by_tip(all_muts_by_node, phylo_tree=None,
               excluded_genes=('nuc', 'Spike', 'Orf1a', 'ORF1a', 'Orf1b', 'ORF1b')):
    """Collect, for every tip, the mutations along its path plus tip metadata.

    Parameters
    ----------
    all_muts_by_node : dict
        Mapping ``{node name: {gene: [mutation, ...]}}``. Nodes missing from
        the mapping contribute no mutations. The mapping is not modified.
    phylo_tree : optional
        Tree object providing ``find_clades(terminal=True)`` and
        ``get_path(node)``. When omitted, the notebook-level ``tree`` is used,
        which is what the original single-argument calls relied on.
    excluded_genes : iterable of str, optional
        Gene names (matched exactly, case-sensitive) whose mutations are left
        out of the result. The default reproduces the original hard-coded list.

    Returns
    -------
    dict
        ``{tip name: {'region': ..., 'host': ..., 'date': ...,
        'muts_on_path': {gene: [mutation, ...]}}}``. The metadata values are
        read from ``node_attrs[<key>]['value']``; a missing attribute or
        missing ``'value'`` yields ``{}``, as in the original implementation.
    """
    if phylo_tree is None:
        # Backward-compatible default: use the tree loaded earlier in the notebook.
        phylo_tree = tree
    skipped_genes = frozenset(excluded_genes)

    mut_and_host_info_by_tip = {}
    for node in phylo_tree.find_clades(terminal=True):
        # Accumulate mutations gene by gene, in the order get_path() returns the
        # nodes leading to this tip.
        muts_on_path = {}
        for path_node in phylo_tree.get_path(node):
            for gene, mutations in all_muts_by_node.get(path_node.name, {}).items():
                if gene in skipped_genes:
                    continue
                # extend() copies the items, so the caller's lists are never aliased.
                muts_on_path.setdefault(gene, []).extend(mutations)

        attrs = node.node_attrs
        # Key order is kept as region, host, date, muts_on_path so the JSON
        # written from this result is laid out as before.
        mut_and_host_info_by_tip[node.name] = {
            'region': attrs.get('region', {}).get('value', {}),
            'host': attrs.get('host', {}).get('value', {}),
            'date': attrs.get('num_date', {}).get('value', {}),
            'muts_on_path': muts_on_path,
        }
    return mut_and_host_info_by_tip
    
# Build the per-tip summaries for both mutation tables.
stop_by_tip = mut_by_tip(stop)
nonstop_by_tip = mut_by_tip(no_stop)

# Show both summaries before anything is written to disk.
for per_tip_summary in (stop_by_tip, nonstop_by_tip):
    print(per_tip_summary)

# Save each summary as indented JSON next to the notebook.
for out_path, per_tip_summary in (
    ('stop_mut_and_host_info_by_tip.json', stop_by_tip),
    ('no_stop_mut_and_host_info_by_tip.json', nonstop_by_tip),
):
    with open(out_path, 'w') as outfile:
        json.dump(per_tip_summary, outfile, indent=2)

{'KJ477102': {'region': 'Africa', 'host': 'Camel', 'date': 2013.6677696491379, 'muts_on_path': {}}, 'OP712624': {'region': 'Africa', 'host': 'Camel', 'date': 2016.6844262295083, 'muts_on_path': {'Orf4b': ['R206X', '*247X']}}, 'OP712625': {'region': 'Africa', 'host': 'Camel', 'date': 2016.6844262295083, 'muts_on_path': {'Orf4b': ['R206X', '*247X']}}, 'KJ477103': {'region': 'Africa', 'host': 'Camel', 'date': 2013.9986301369863, 'muts_on_path': {'Orf4b': ['R206X', '*247X']}}, 'OP906306': {'region': 'Africa', 'host': 'Camel', 'date': 2013.9986301369863, 'muts_on_path': {'Orf4b': ['R206X', '*247X']}}, 'MK967708': {'region': 'Africa', 'host': 'Camel', 'date': 2018.0013698630137, 'muts_on_path': {'Orf4b': ['R206X', '*247X'], 'Nsp2': ['L117X'], 'S1': ['G60X']}}, 'OP654178': {'region': 'Africa', 'host': 'Camel', 'date': 2015.0150684931507, 'muts_on_path': {'Orf4b': ['R206X', '*247X']}}, 'OP654179': {'region': 'Africa', 'host': 'Camel', 'date': 2015.0013698630137, 'muts_on_path': {'Orf4b': ['R20