In [5]:
import json
from augur.utils import json_to_tree
tree_file = '../../../nextstrain-build/phylogenetic_subsampling/auspice/mers.json'

In [6]:
#read in the tree
with open(tree_file, 'r') as f:
    tree_json = json.load(f)
    

#put tree in Bio.phylo format
tree = json_to_tree(tree_json)
# store the S1 mutations on each branch here
all_muts_by_node = {}


In [7]:

# iterate through all branches on the tree to find what mutations are there
for node in tree.find_clades(): 
    print(node)
    # get the mutations on this node from the 'branch_attrs'
    ### write code here
    muts = node.branch_attrs.get('mutations', {})    
    for gene, mutations in muts.items():
        # Ensure the node and gene are initialized in the dictionary
        if node.name not in all_muts_by_node:
            all_muts_by_node[node.name] = {}
        if gene not in all_muts_by_node[node.name]:
            all_muts_by_node[node.name][gene] = []
        
        # Add mutations to the list for this node and gene
        for m in mutations:
            if "-" in m: #skip over deletions and instertions
                continue
            else:
                all_muts_by_node[node.name][gene].append(m)


NODE_0000002
NODE_0000053
NODE_0000054
KJ477102
NODE_0000055
NODE_0000056
NODE_0000057
OP712624
OP712625
NODE_0000058
NODE_0000060
KJ477103
OP906306
NODE_0000011
MK967708
NODE_0000061
OP654178
NODE_0000062
OP654179
OP712623
NODE_0000063
NODE_0000064
MG923469
NODE_0000065
MG923470
MG923471
NODE_0000066
MG923473
NODE_0000067
NODE_0000068
KU201953
NODE_0000018
MG923472
KU201959
NODE_0000070
NODE_0000071
MG923479
NODE_0000072
MG923478
MG923480
NODE_0000073
MG923477
NODE_0000074
MG923474
MG923481
NODE_0000076
MG923476
MG923475
NODE_0000077
NODE_0000078
NODE_0000079
NODE_0000080
MK564475
MK564474
NODE_0000081
OP866291
NODE_0000082
OP866292
NODE_0000083
OP866290
OP866294
NODE_0000033
OP866293
OP866289
NODE_0000086
NODE_0000087
OP866285
OP866286
NODE_0000088
NODE_0000089
OR742173
OR742169
NODE_0000038
OR742167
OR742168
NODE_0000092
NODE_0000093
OR742175
OR742174
NODE_0000094
OR742171
NODE_0000046
OR742172
OR742170
NODE_0000096
NODE_0000097
KU740200
MG923465
NODE_0000099
NODE_0000100
NODE_00001

In [8]:
# for each tip, store all S1 mutations that have happened between root and that tip 
# also store what host this tip was sampled from
mut_and_host_info_by_tip = {}
# iterate through tips only
for node in tree.find_clades(terminal=True): 
    # get the path from root to this tip
    path = tree.get_path(node)
    # store a list of all mutations along that path
    ### write code here
    ### do this by looking up the mutations for each branch in the path (using the dict you made above)
    ### this can be done with a for loop or with list comprehension
    muts_on_path = {}
    for path_node in path:
        for gene, mutations in all_muts_by_node.get(path_node.name, {}).items():
            if gene == 'nuc':
                continue
            elif gene == 'Spike':
                continue
            elif gene == 'Orf1a' or gene == 'ORF1a':
                continue
            elif gene == 'Orf1b' or gene == 'ORF1b':
                continue
            if gene not in muts_on_path:
                muts_on_path[gene] = []
            muts_on_path[gene].extend(mutations)
    # get the host this virus was sampled from
    ### write code here
    ### you can access host from 'node_attrs'
    host = node.node_attrs.get('host', {})
    host_name = host.get('value', {})
    date = node.node_attrs.get('num_date', {})
    date_value = date.get('value', {})
    region = node.node_attrs.get('region', {})
    region_value = region.get('value', {})
    
    # store the tip name as the key with the value being a dictionary that gives the host and S1 mutations  
    mut_and_host_info_by_tip[node.name] = {'region': region_value, 'host': host_name, 'date': date_value,'muts_on_path': muts_on_path}
    
# use json dump to save 'mut_and_host_info_by_tip' to a file
### write code here
with open('../../intermediate-results-downsampling/mut_and_host_info_by_tip.json', 'w') as outfile:
    json.dump(mut_and_host_info_by_tip, outfile, indent=2)