In [1]:
import json
from collections import Counter
from augur.utils import json_to_tree
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import pandas as pd

In [159]:
def get_parent_nodes(virus, num_tips):
    """
    Get all internal nodes where no HA1 muts happen on their children branches (except in eggs)
    """
    
    tree_path= f'../nextstrain_builds/egg-enriched/{num_tips}tips/auspice/{virus}_30y_egg_ha.json'
    
    #read in the tree
    with open(tree_path, 'r') as f:
        tree_json = json.load(f)
        
    #put tree in Bio.phylo format
    tree = json_to_tree(tree_json)
    

    # start by finding all branches will only egg descendents
    # may be terminal or non-terminal
    egg_only_descendents = []
    
    for node in tree.find_clades():

        descending_tips = node.get_terminals()
        # don't think we'll ever see an egg only cluster of 20 or more
        if len(descending_tips) < 25:
            # find the passage type of all the descending tips
            passage_type_descendants = set([x.node_attrs['passage_category']['value'] for x in descending_tips])
            # add egg-only clusters to list
            if passage_type_descendants == {'egg'}:
                egg_only_descendents.append(node.name)
                
    # now trace egg-only cluster path back as far as possible without encountering an HA1 mut
    parent_nodes_wo_muts = {}
    for d in egg_only_descendents:
        # walking backwards, check for HA1 muts
        found_ha1_mut = False
        parents_wo_ha1_muts = []
        for p in tree.get_path(d)[::-1]:
            # don't look on d's branch (branch leading to egg cluster)
            if p.name!=d:
                # if we haven't seen an ha1 mut yet
                if found_ha1_mut == False:
                    if 'HA1' in p.branch_attrs['mutations']:
                        found_ha1_mut = True
                    else:
                        parents_wo_ha1_muts.append(p.name)
        parent_nodes_wo_muts[d] = parents_wo_ha1_muts
                        

    # now need to find internal nodes in the lists that are shared by multiple egg clusters
    possiblities = list(set([x for xs in parent_nodes_wo_muts.values() for x in xs]))

    egg_clusters_by_possibility = {p:[] for p in possiblities}
    for clus, pars in parent_nodes_wo_muts.items():
        for p in pars:
            egg_clusters_by_possibility[p].append(clus)
            
    # some of these will be overlapping, so get rid of those (take the largest possible clade)
    
    # convert the dictionary into a list of tuples to compare
    clades = list(egg_clusters_by_possibility.items())
    cleaned_clades = {}

    for parent_node, egg_children in clades:
        is_subset = False
        for other_parent, other_children in clades:
            if parent_node != other_parent and set(egg_children).issubset(set(other_children)):
                is_subset = True
                break
        if not is_subset:
            cleaned_clades[parent_node] = egg_children
            
    return cleaned_clades

In [160]:
test_cleaned_clades = get_parent_nodes('h3n2', 10000)

In [78]:
def get_ha1_muts(virus, segment='ha'):
    """
    return the HA1 egg-passaging muts
    """
    
    curated_mut_file = f"../egg-mutation-analysis/egg-muts-by-strain/{virus}_{segment}_curated-egg-muts.json"
    
    with open(curated_mut_file) as json_handle:
        egg_mut_info = json.load(json_handle)
    
    ha1_egg_muts = {}
    
    for strain, muts in egg_mut_info.items():
        ha1_egg_muts[strain] = muts['HA1']

    
    return ha1_egg_muts

In [165]:
def get_muts_in_replicates(virus, num_tips=10000):
    """
    """
    
    egg_descendents_by_node = get_parent_nodes(virus, num_tips)
    
    ha1_egg_muts = get_ha1_muts(virus)
    
    muts_in_replicates = {}
    
    for n, es in egg_descendents_by_node.items():
        muts_per_n = []
        for e in es:
            # internal nodes won't be in the mut dict,
            # but that's okay because I already counted them when storing the egg muts
            if e in ha1_egg_muts.keys():
                e_muts = ha1_egg_muts[e]
            muts_per_n.append(e_muts)
            
        muts_in_replicates[n] = muts_per_n
        
    return muts_in_replicates

In [None]:
## maybe should exclude the divergent egg-only clade in the middle of h3n2!

In [166]:
get_muts_in_replicates('h3n2')

{'NODE_0013749': [['186N', '263R'], ['186N', '263R']],
 'NODE_0002639': [['145N', '225G']],
 'NODE_0000579': [['194I']],
 'NODE_0002988': [['194I']],
 'NODE_0011338': [['194P'], ['194P'], ['194P'], ['194P']],
 'NODE_0003600': [['186V', '225G'], ['160K', '228T']],
 'NODE_0011144': [['160K', '194P'], [], ['225G', '246S'], ['160K', '194P']],
 'NODE_0002144': [['186V', '219Y']],
 'NODE_0008968': [['194P'], ['156R', '186S']],
 'NODE_0014528': [[]],
 'NODE_0013827': [['195Y'], ['195Y', '246S', '280G']],
 'NODE_0009816': [['140K', '186V'], ['140K', '186V']],
 'NODE_0000856': [['103S'], ['103S'], [], [], []],
 'NODE_0005473': [['226I', '190V'],
  ['226I', '190V'],
  ['226I', '25L', '307I'],
  ['226I', '186S', '248I']],
 'NODE_0013318': [[]],
 'NODE_0007743': [[]],
 'NODE_0001496': [['186V', '219F']],
 'NODE_0002753': [['186V']],
 'NODE_0002900': [['186V']],
 'NODE_0005913': [['246H']],
 'NODE_0005551': [[], ['190E']],
 'NODE_0006696': [['186V', '156Q', '158N'],
  ['186V', '156Q', '158N'],
  ['

In [167]:
get_muts_in_replicates('h1n1pdm')

{'NODE_0005549': [[], [], ['116V']],
 'NODE_0000101': [['116V'], ['266V'], ['266V'], ['222N'], ['127E']],
 'NODE_0010512': [['127E']],
 'NODE_0003438': [['223R'], ['223R'], ['223R']],
 'NODE_0000647': [[],
  [],
  ['30I'],
  ['205K'],
  ['207N', '321I'],
  ['207N', '321I'],
  [],
  [],
  [],
  ['79V'],
  [],
  [],
  ['223R'],
  ['272I'],
  [],
  ['216V'],
  ['216V'],
  ['128P', '191I'],
  ['46E', '256V'],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  ['262E'],
  ['262E'],
  [],
  [],
  [],
  [],
  [],
  ['48T'],
  ['286M'],
  [],
  [],
  ['172V'],
  [],
  [],
  ['183P'],
  ['183P'],
  ['183P'],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  ['321A'],
  ['321A'],
  [],
  [],
  [],
  [],
  [],
  ['77P'],
  ['77P'],
  ['40E'],
  ['321I'],
  ['71Y', '119N', '138Y', '222G'],
  ['71Y', '119N', '138Y', '222G'],
  ['71Y', '119N', '138Y', '222G'],
  ['74R'],
  ['74R'],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
