In [5]:
import json
from augur.utils import json_to_tree
# Path to the tree JSON that is opened and parsed further down; it is a relative
# path, so it resolves against the notebook's current working directory.
tree_file = '../../../nextstrain-build/phylogenetic/auspice/mers.json'

In [6]:
# Load the tree JSON from disk
with open(tree_file, 'r') as handle:
    tree_json = json.load(handle)

# Convert the parsed JSON into a Bio.Phylo-style tree using augur's helper
tree = json_to_tree(tree_json)

# Mutations found on each branch are collected in this dict, keyed by node name
all_muts_by_node = {}


In [7]:

# Collect the mutations recorded on every branch of the tree, keyed by node
# name and then by gene: {node_name: {gene: [mutation, ...]}}.
# The dict is (re)initialised here so that re-running this cell does not append
# the same mutations a second time.
all_muts_by_node = {}
for node in tree.find_clades():
    # NOTE(review): assumes json_to_tree only sets attributes that exist on the
    # JSON node, so a node may have no 'branch_attrs' at all -- confirm.
    muts = getattr(node, 'branch_attrs', {}).get('mutations', {})
    for gene, mutations in muts.items():
        # An entry is created for every gene listed on the branch, even if all
        # of its mutations end up being filtered out below.
        kept = all_muts_by_node.setdefault(node.name, {}).setdefault(gene, [])
        # Skip any mutation string containing '-' (deletions and insertions)
        kept.extend(m for m in mutations if "-" not in m)


In [8]:
# For each tip, gather the mutations on the branches between the root and that
# tip (grouped by gene) together with the host the tip was sampled from.
# Result shape: {tip_name: {'host': <host value or None>, 'muts_on_path': {gene: [mutation, ...]}}}
#
# Entries keyed 'nuc' and 'Spike' are left out; every other gene is kept.
# NOTE(review): presumably 'Spike' is skipped because its subdomains (e.g. S1)
# are annotated as separate genes -- confirm against the build's annotation.
genes_to_skip = ('nuc', 'Spike')
mut_and_host_info_by_tip = {}
# iterate through tips only
for node in tree.find_clades(terminal=True):
    # clades on the way from the root down to this tip
    path = tree.get_path(node)

    # merge the per-branch mutation lists along the path, gene by gene
    muts_on_path = {}
    for path_node in path:
        for gene, mutations in all_muts_by_node.get(path_node.name, {}).items():
            if gene in genes_to_skip:
                continue
            muts_on_path.setdefault(gene, []).extend(mutations)

    # host is read from node_attrs['host']['value']; a tip without one gets None
    # (serialised as null) rather than an empty dict
    host = node.node_attrs.get('host', {})
    host_name = host.get('value')

    mut_and_host_info_by_tip[node.name] = {'host': host_name, 'muts_on_path': muts_on_path}
    
# Write 'mut_and_host_info_by_tip' to disk as indented JSON
with open('../../intermediate-results-nodownsampling/mut_and_host_info_by_tip.json', 'w') as json_out:
    json.dump(mut_and_host_info_by_tip, json_out, indent=2)