In [127]:
from augur.utils import json_to_tree
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio import Phylo
from collections import Counter
import requests
import itertools

In [233]:
# Download the Nextstrain global ncov tree JSON
tree_url = "https://data.nextstrain.org/ncov_global.json"

# Fail fast: the timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() surfaces HTTP errors directly instead of
# an opaque JSON decode error on an error page.
tree_response = requests.get(tree_url, timeout=60)
tree_response.raise_for_status()
tree_json = tree_response.json()

# Put tree in Bio.Phylo format
tree = json_to_tree(tree_json)

In [129]:
# Save the downloaded tree to "global_tree.nexus" in NEXUS format.
# As the last expression of the cell, the return value of Phylo.write is echoed.
Phylo.write(tree, "global_tree.nexus", "nexus")

1

In [94]:
# Top-scoring mutations, written as '<gene>_<mutation>'
# (the format that find_prior_muts splits on '_').
top_hit_muts = [
    'ORF1a_G3676-',
    'S_P681H',
    'S_N501Y',
    'N_T205I',
    'S_L18F',
    'S_L452R',
    'S_E484K',
    'ORF1a_L3606F',
    'ORF1a_T365I',
    'N_M234I',
]

In [5]:
def get_parent(tree, child_clade):
    """Return the path through `tree` that leads to `child_clade`.

    Thin wrapper: the result is whatever ``tree.get_path(child_clade)``
    returns (despite the name, this is a whole path, not a single parent).
    """
    return tree.get_path(child_clade)

In [222]:
# Collect the mutations observed along the evolutionary path leading to each
# occurrence of a top-scoring ('hit') mutation.

# Genes that are always reported (in this order), even when no mutation was seen.
_REPORTED_GENES = ('E', 'M', 'N', 'ORF1a', 'ORF1b', 'ORF3a', 'ORF6', 'ORF7a',
                   'ORF7b', 'ORF8', 'ORF9b', 'S', 'nuc')


def _branch_mutations(clade):
    """Return the clade's ``branch_attrs['mutations']`` dict, or {} when absent."""
    return (getattr(clade, 'branch_attrs', None) or {}).get('mutations') or {}


def _final_genotypes(mutations):
    """Collapse ordered mutations (e.g. 'N501Y') to the final state per site (e.g. '501Y')."""
    final_genotype_at_site = {}
    for mut in mutations:
        # a later mutation at the same site overwrites the earlier one
        final_genotype_at_site[str(mut[1:-1])] = mut[-1]
    return [str(site) + str(state) for site, state in final_genotype_at_site.items()]


def find_prior_muts(hit, phylogeny=None, min_tips=10):
    """Find every qualifying occurrence of `hit` and the genotypes on the path to it.

    A branch qualifies when its mutations for the hit's gene contain the hit's
    mutation and the clade has at least `min_tips` terminal descendants.
    For each qualifying branch, the mutations of every clade returned by
    ``phylogeny.get_path(node)`` are gathered per gene and reduced to the
    final genotype at each mutated site.

    Parameters
    ----------
    hit : str
        Mutation written as '<gene>_<mutation>', e.g. 'S_N501Y'.
    phylogeny : optional
        Tree to search; must provide ``find_clades()`` and ``get_path()``, and
        its clades ``get_terminals()``. Defaults to the notebook-level `tree`.
    min_tips : int, optional
        Minimum number of terminals a clade needs for its branch to count.

    Returns
    -------
    count : int
        Number of qualifying occurrences of `hit`.
    prior_muts : list of dict
        One dict per occurrence, mapping gene -> list of '<site><state>' strings.
    combined_genotypes : dict
        Gene -> concatenation of the per-occurrence lists (empty dict when
        `hit` never occurs).
    """
    if phylogeny is None:
        phylogeny = tree  # tree built in the download cell of this notebook

    hit_parts = hit.split('_', 1)
    gene = hit_parts[0]
    mutation = hit_parts[1]

    count = 0
    prior_muts = []
    combined_genotypes = {}

    for node in phylogeny.find_clades():
        # Cheap dictionary lookup first; counting terminals walks the whole
        # subtree, so only do it for branches that actually carry the hit.
        if mutation not in _branch_mutations(node).get(gene, ()):
            continue
        if len(node.get_terminals()) < min_tips:
            continue
        count += 1

        # All mutations on the path to this occurrence, grouped by gene.
        # setdefault keeps genes outside _REPORTED_GENES instead of raising KeyError.
        path_mutations = {g: [] for g in _REPORTED_GENES}
        for ancestor in phylogeny.get_path(node):
            for g, mut_list in _branch_mutations(ancestor).items():
                path_mutations.setdefault(g, []).extend(mut_list)

        occurrence_genotypes = {g: _final_genotypes(muts)
                                for g, muts in path_mutations.items()}
        prior_muts.append(occurrence_genotypes)

        # Extend a list owned by combined_genotypes: storing the occurrence's
        # own list and later using += on it would silently grow the record
        # already saved in prior_muts.
        for g, genotypes in occurrence_genotypes.items():
            combined_genotypes.setdefault(g, []).extend(genotypes)

    return count, prior_muts, combined_genotypes
                            
                            



In [234]:
# Report how often S_N501Y occurs and, per gene, how often each genotype
# shows up across the paths leading to those occurrences.
count, prior_muts, combined_genotypes = find_prior_muts('S_N501Y')
print(count)
for k, v in combined_genotypes.items():
    print(k, Counter(v), sep='\n')

4
E
Counter({'71L': 1})
M
Counter()
N
Counter({'205I': 2, '203K': 2, '204R': 2, '80R': 1, '3L': 1, '235F': 1})
ORF1a
Counter({'2235L': 4, '3833N': 4, '3675-': 3, '3676-': 3, '3677-': 3, '1055A': 1, '1538I': 1, '3255I': 1, '3729R': 1, '265I': 1, '3353R': 1, '1188L': 1, '1795Q': 1, '2230T': 1, '1708D': 1, '1001I': 1})
ORF1b
Counter({'314L': 4, '1342S': 1})
ORF3a
Counter({'57H': 2, '106F': 1, '171L': 1, '253P': 1})
ORF6
Counter()
ORF7a
Counter()
ORF7b
Counter()
ORF8
Counter({'11K': 1, '38S': 1, '67F': 1, '73C': 1, '27*': 1})
ORF9b
Counter({'77E': 1})
S
Counter({'614G': 4, '501Y': 4, '484K': 3, '681H': 2, '95I': 1, '950N': 1, '144S': 1, '346K': 1, '80A': 1, '215G': 1, '241-': 1, '242-': 1, '243-': 1, '701V': 1, '655Y': 1, '18F': 1, '26S': 1, '138Y': 1, '190S': 1, '1027I': 1, '1176F': 1, '69-': 1, '70-': 1, '144-': 1, '716I': 1, '982A': 1, '1118H': 1})
nuc
Counter({'6968C': 4, '11764T': 4, '3037T': 4, '14408T': 4, '23403G': 4, '241T': 4, '23063T': 4, '23012A': 3, '11288-': 3, '11289-': 3, '

In [235]:
# Report how often S_E484K occurs and, per gene, how often each genotype
# shows up across the paths leading to those occurrences.
count, prior_muts, combined_genotypes = find_prior_muts('S_E484K')
print(count)
for k, v in combined_genotypes.items():
    print(k, Counter(v), sep='\n')

8
E
Counter()
M
Counter({'28L': 1})
N
Counter({'203K': 4, '204R': 4, '199L': 1, '234I': 1, '205I': 1, '187L': 1, '418H': 1})
ORF1a
Counter({'2235L': 8, '3833N': 8, '265I': 2, '3675-': 2, '3676-': 2, '3677-': 2, '3201P': 1, '265T': 1, '3930F': 1})
ORF1b
Counter({'314L': 8, '1011H': 1, '1362R': 1, '1936H': 1})
ORF3a
Counter({'57H': 3, '42L': 1, '57Q': 1})
ORF6
Counter()
ORF7a
Counter()
ORF7b
Counter()
ORF8
Counter({'11I': 1})
ORF9b
Counter()
S
Counter({'614G': 8, '484K': 8, '95I': 2, '950N': 1, '5F': 1, '253G': 1, '701V': 1, '144-': 1, '1176F': 1, '152L': 1, '769V': 1, '655Y': 1})
nuc
Counter({'6968C': 8, '11764T': 8, '3037T': 8, '14408T': 8, '23403G': 8, '241T': 8, '23012A': 8, '28881A': 4, '28882A': 4, '28883C': 4, '25563T': 3, '21993-': 3, '11288-': 3, '11289-': 3, '11290-': 3, '11291-': 3, '11292-': 3, '11293-': 3, '11294-': 3, '11295-': 3, '11296-': 3, '18877T': 2, '21846T': 2, '1059T': 2, '28271-': 2, '21994-': 2, '24410A': 1, '28272G': 1, '9867C': 1, '27925T': 1, '16500C': 1, '202

In [236]:
# Report how often S_P681H occurs and, per gene, how often each genotype
# shows up across the paths leading to those occurrences.
count, prior_muts, combined_genotypes = find_prior_muts('S_P681H')
print(count)
for k, v in combined_genotypes.items():
    print(k, Counter(v), sep='\n')

8
E
Counter()
M
Counter({'82T': 1})
N
Counter({'203K': 5, '204R': 5, '205I': 2, '194L': 1, '208-': 1, '209G': 1})
ORF1a
Counter({'2235L': 8, '3833N': 8, '3255I': 2, '1055A': 1, '1538I': 1, '3729R': 1, '265I': 1, '2196P': 1, '2511N': 1, '2936I': 1, '3209V': 1, '3278S': 1, '1246I': 1, '1013K': 1, '3143V': 1, '4175I': 1, '318L': 1, '1283V': 1, '1500R': 1, '3606F': 1, '2230T': 1, '3675-': 1, '3676-': 1, '3677-': 1})
ORF1b
Counter({'314L': 8, '1342S': 1, '2371M': 1})
ORF3a
Counter({'57H': 2, '106F': 1, '55F': 1, '110S': 1})
ORF6
Counter()
ORF7a
Counter()
ORF7b
Counter({'44X': 1})
ORF8
Counter({'11K': 1, '38S': 1, '67F': 1, '1-': 1, '2X': 1, '3X': 1, '106X': 1, '122X': 1})
ORF9b
Counter()
S
Counter({'614G': 8, '681H': 8, '95I': 2, '484K': 2, '950N': 1, '144S': 1, '346K': 1, '501Y': 1, '494P': 1, '144-': 1, '796H': 1, '452R': 1, '732A': 1, '478K': 1, '143F': 1, '69-': 1, '70-': 1})
nuc
Counter({'6968C': 8, '11764T': 8, '3037T': 8, '14408T': 8, '23403G': 8, '241T': 8, '23604A': 8, '28881A': 5,

In [237]:
# Report how often S_L18F occurs and, per gene, how often each genotype
# shows up across the paths leading to those occurrences.
count, prior_muts, combined_genotypes = find_prior_muts('S_L18F')
print(count)
for k, v in combined_genotypes.items():
    print(k, Counter(v), sep='\n')

4
E
Counter({'71L': 1})
M
Counter()
N
Counter({'202N': 1, '220V': 1, '205I': 1, '203K': 1, '204R': 1, '80R': 1})
ORF1a
Counter({'2235L': 4, '3833N': 4, '3675-': 2, '3676-': 2, '3677-': 2, '286L': 1, '265I': 1, '3353R': 1, '1655N': 1, '1188L': 1, '1795Q': 1})
ORF1b
Counter({'314L': 3, '1000L': 1})
ORF3a
Counter({'50A': 1, '57H': 1, '171L': 1, '253P': 1})
ORF6
Counter()
ORF7a
Counter()
ORF7b
Counter()
ORF8
Counter({'84S': 1})
ORF9b
Counter({'77E': 1})
S
Counter({'18F': 4, '614G': 3, '484K': 2, '501Y': 2, '222V': 1, '80A': 1, '215G': 1, '241-': 1, '242-': 1, '243-': 1, '701V': 1, '417N': 1, '655Y': 1, '26S': 1, '138Y': 1, '190S': 1, '1027I': 1, '1176F': 1})
nuc
Counter({'6968C': 4, '11764T': 4, '21614T': 4, '3037T': 3, '14408T': 3, '23403G': 3, '241T': 3, '23012A': 2, '11288-': 2, '11289-': 2, '11292-': 2, '11293-': 2, '11294-': 2, '11295-': 2, '11296-': 2, '11290-': 2, '11291-': 2, '23063T': 2, '8782T': 1, '28144C': 1, '28878A': 1, '29742A': 1, '22468T': 1, '1122T': 1, '16466T': 1, '2026

In [238]:
# Report how often S_L452R occurs and, per gene, how often each genotype
# shows up across the paths leading to those occurrences.
count, prior_muts, combined_genotypes = find_prior_muts('S_L452R')
print(count)
for k, v in combined_genotypes.items():
    print(k, Counter(v), sep='\n')

5
E
Counter()
M
Counter()
N
Counter({'203K': 2, '204R': 2, '197L': 1, '234I': 1, '383L': 1, '203M': 1, '377Y': 1, '205I': 1, '212V': 1})
ORF1a
Counter({'2235L': 5, '3833N': 5, '3278S': 2, '1246I': 2, '3071Y': 1, '4F': 1, '3255I': 1, '3580Q': 1, '265I': 1, '1013K': 1, '3143V': 1, '4175I': 1, '859V': 1, '102K': 1, '1639N': 1, '2287S': 1, '3222N': 1, '3691S': 1})
ORF1b
Counter({'314L': 4, '1000L': 1, '1183Y': 1})
ORF3a
Counter({'196V': 1, '74F': 1, '26L': 1, '57H': 1})
ORF6
Counter()
ORF7a
Counter({'82A': 1})
ORF7b
Counter()
ORF8
Counter({'84S': 1})
ORF9b
Counter()
S
Counter({'452R': 5, '614G': 5, '141-': 1, '142-': 1, '143-': 1, '142D': 1, '681R': 1, '13I': 1, '152C': 1, '677H': 1})
nuc
Counter({'6968C': 5, '11764T': 5, '22917G': 5, '23403G': 5, '3037T': 4, '14408T': 4, '241T': 4, '26681T': 2, '28272T': 2, '28881A': 2, '28882A': 2, '28883C': 2, '10097A': 2, '23731T': 2, '13536T': 2, '4002T': 2, '8782T': 1, '28144C': 1, '9477A': 1, '14805T': 1, '25979T': 1, '28657T': 1, '28863T': 1, '1281

In [231]:
# Report how often ORF1a_G3676- occurs and, per gene, how often each genotype
# shows up across the paths leading to those occurrences.
count, prior_muts, combined_genotypes = find_prior_muts('ORF1a_G3676-')
print(count)
for k, v in combined_genotypes.items():
    print(k, Counter(v), sep='\n')

7
E
Counter()
M
Counter({'82T': 2})
N
Counter({'203K': 4, '204R': 4, '205I': 1, '13L': 1, '214C': 1, '208-': 1, '209G': 1, '80R': 1, '3L': 1})
ORF1a
Counter({'2235L': 7, '3675-': 7, '3676-': 7, '3677-': 7, '265I': 2, '3201P': 2, '1246I': 1, '3278S': 1, '2287S': 1, '2387V': 1, '3255I': 1, '2511N': 1, '2936I': 1, '3209V': 1, '1188L': 1, '1795Q': 1, '1708D': 1, '1001I': 1})
ORF1b
Counter({'314L': 7, '2371M': 1, '1264D': 1})
ORF3a
Counter({'57H': 2, '42L': 1, '253P': 1})
ORF6
Counter()
ORF7a
Counter()
ORF7b
Counter()
ORF8
Counter({'11I': 1, '106*': 1, '92K': 1, '52I': 1, '27*': 1, '73C': 1})
ORF9b
Counter({'10S': 1, '77E': 1})
S
Counter({'614G': 7, '484K': 3, '80A': 1, '417N': 1, '701V': 1, '75V': 1, '76I': 1, '246-': 1, '247-': 1, '248-': 1, '249-': 1, '250-': 1, '251-': 1, '252-': 1, '253N': 1, '452Q': 1, '490S': 1, '859N': 1, '95I': 1, '144-': 1, '796H': 1, '18F': 1, '20N': 1, '26S': 1, '138Y': 1, '190S': 1, '501Y': 1, '655Y': 1, '1027I': 1, '1176F': 1, '1118H': 1, '570D': 1, '681H': 1,