In [1]:
from utils import sars2_genome_info, add_syn_mut_attribute, add_mut_at_node_attr, add_mut_accumulation_attr
from utils import get_parent, prune_tree
from augur.utils import json_to_tree
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.gridspec as gridspec
from collections import Counter
import requests
import math



## Investigate recombination as potential source of Nsp6 deletions in multiple lineages 

##### If the 8 occurrences of the Nsp6 deletion are actually due to recombination rather than convergent evolution, then the acceptor lineage should have received other mutations near Nsp6 along with the deletion during recombination.

##### For each of the 8 occurrences of the Nsp6 deletion, find what other mutations were present in the lineage. Then, determine whether these mutations are shared by other lineages with the Nsp6 deletion.

### There is no figure associated with this notebook; it is more of a sanity check. The results (no evidence of recombination) are summarized in the table at the bottom.

Import the tree and convert to Bio Phylo format. This is a time-resolved phylogeny built from 9544 SARS-CoV-2 genomes sampled between December 2019 and May 15, 2021. The tree can be viewed at https://nextstrain.org/groups/blab/ncov/adaptive-evolution/2021-05-15

In [2]:
tree_url = 'https://nextstrain-blab.s3.amazonaws.com/ncov_adaptive-evolution_2021-05-15.json'

# Download the tree JSON. Fail loudly on an HTTP error status instead of
# letting `.json()` choke on an error page, and set a timeout so that a
# stalled connection cannot hang the notebook indefinitely.
tree_response = requests.get(tree_url, timeout=120)
tree_response.raise_for_status()
tree_json = tree_response.json()

#Put tree in Bio.Phylo format
tree = json_to_tree(tree_json)

Get information about genome position and length of each gene 

In [3]:
# Unpack the three objects returned by sars2_genome_info (imported from utils).
# Judging by the names these are gene locations, gene codon information and
# gene lengths in amino acids -- TODO confirm against utils.
reference_gene_locations, reference_gene_codon, gene_lengths_aa = sars2_genome_info()

Add information about synonymous mutations as an attribute of nodes on the tree

In [4]:
# `tree` is rebound to whatever add_syn_mut_attribute returns, so every later
# cell works with that returned tree.
tree = add_syn_mut_attribute(tree)

Add the number of mutations that occur at a node as an attribute of that node, for all nodes on the tree

In [5]:
# Rebinds `tree` to the return value of add_mut_at_node_attr (imported from utils).
tree = add_mut_at_node_attr(tree)

Add an attribute to each node that gives the total number of mutations accumulated between the tree root and that node (including mutations on the node). 

In [6]:
# Rebinds `tree` to the return value of add_mut_accumulation_attr (imported from utils).
tree = add_mut_accumulation_attr(tree)

Find all occurrences of Nsp6 deletion and put them in chronological order

In [110]:
# Manually curated label of the emerging lineage that descends from each node
# carrying the deletion. Defined once, outside the loop, because it does not
# depend on the node being examined. A node that is missing from this mapping
# raises a KeyError below, which signals that the mapping needs updating.
emerging_lineage = {'NODE_0000031': 'B.1.619/B.1.620', 'NODE_0004363': 'B.1.526 (Iota)',
                    'NODE_0004416': 'B.1.351 (Beta)', 'NODE_0008062': 'P.1 (Gamma)',
                    'NODE_0005404': 'B.1.1.7 (Alpha)', 'NODE_0000152': 'B.1.525 (Eta)',
                    'NODE_0005215': 'B.1.1.318', 'NODE_0006885': 'C.37 (Lambda)'}

nsp6del_nodes = []

for node in tree.find_clades(terminal=False):

    # only consider internal branches with at least 15 descending tips
    if len(node.get_terminals()) < 15:
        continue

    # skip nodes that carry no mutation information at all
    if not (hasattr(node, "branch_attrs") and "mutations" in node.branch_attrs):
        continue
    if 'ORF1a' not in node.branch_attrs["mutations"]:
        continue

    # finds branches with the mutation; `any` records each branch at most once,
    # even if several of its ORF1a mutations contain the search string.
    # NOTE(review): this cell searches for '3676-' while the later call to
    # get_mut_history searches for '3675-'; presumably both residues belong to
    # the same deletion and mark the same branches -- confirm.
    if any('3676-' in m for m in node.branch_attrs["mutations"]['ORF1a']):
        nsp6del_nodes.append({'node': node.name, 'date': node.node_attrs["num_date"]["value"],
                              'emerging_lineage': emerging_lineage[node.name]})

# chronological order, earliest occurrence first
nsp6del_nodes = sorted(nsp6del_nodes, key = lambda i: i['date'])

For every occurrence of a convergently-evolved mutation, find all nucleotide mutations that occurred before it and what nodes they occurred at

In [62]:
def get_mut_history(gene, mutation):
    """
    Find internal branches with the convergently-evolved mutation.

    Searches the module-level `tree` for internal nodes that have at least
    15 descending tips and whose branch carries a mutation in `gene`
    containing the substring `mutation`.

    Parameters
    ----------
    gene : str
        Key into a node's `branch_attrs["mutations"]`, e.g. 'ORF1a'.
    mutation : str
        Substring looked for in each mutation string of that gene,
        e.g. '3675-'.

    Returns
    -------
    dict
        Maps the name of each matching node to a dictionary with keys
        'branch_with_mut' (the node name), 'mut_date' (the node's
        `num_date` value) and 'mut_history'. 'mut_history' maps the name
        of each node returned by `get_parent(tree, node)` to that node's
        list of nucleotide mutations; nodes without nucleotide mutations
        are omitted.
    """

    mut_history = {}

    for node in tree.find_clades(terminal=False):

        # only consider internal branches with at least 15 descending tips
        if len(node.get_terminals()) < 15:
            continue

        # skip nodes that carry no mutation information for this gene
        if not (hasattr(node, "branch_attrs") and "mutations" in node.branch_attrs):
            continue
        if gene not in node.branch_attrs["mutations"]:
            continue

        # finds branches with the mutation; one match is enough, the history
        # below does not depend on which mutation string matched
        if not any(mutation in m for m in node.branch_attrs["mutations"][gene]):
            continue

        # collect nucleotide mutations for every node returned by get_parent
        # (presumably the path from the root to this branch, including the
        # branch itself -- confirm against utils.get_parent)
        muts_in_parents = {}
        for p in get_parent(tree, node):
            if hasattr(p, "branch_attrs") and "mutations" in p.branch_attrs:
                if 'nuc' in p.branch_attrs['mutations']:
                    nuc_muts = p.branch_attrs['mutations']['nuc']
                    # leave out nodes whose nucleotide mutation list is empty
                    if len(nuc_muts) != 0:
                        muts_in_parents[p.name] = nuc_muts

        mut_history[node.name] = {'branch_with_mut': node.name,
                                  'mut_date': node.node_attrs["num_date"]["value"],
                                  'mut_history': muts_in_parents}
    return mut_history

Run this for the Nsp6 deletion

In [63]:
# NOTE(review): this searches ORF1a for '3675-', whereas the cell that builds
# `nsp6del_nodes` searches for '3676-'. Presumably both residues are part of the
# same Nsp6 deletion and mark the same branches; if they ever differ, the lookup
# `mut_history[donor['node']]` further down would raise a KeyError -- confirm.
mut_history = get_mut_history('ORF1a', '3675-')

Now consider all possible pairs of nodes with the nsp6 deletion as potential donors/acceptors of recombination

In [115]:
def find_informative_muts(donor_dict, acceptor_dict, donor_node, acceptor_node, on_deletion_branch):
    """
    Given two dictionaries of nodes with mutations, remove
    common mutations (that occurred at the same node)
    from those dictionaries. Also remove the nsp6 deletion
    itself from the list of nucleotide mutations.

    Parameters
    ----------
    donor_dict, acceptor_dict : dict
        Map node name -> list of nucleotide mutation strings. Neither
        input dictionary (nor any of its lists) is modified.
    donor_node, acceptor_node : str
        Name of the node where the deletion occurred in the donor and in
        the acceptor, respectively.
    on_deletion_branch : bool
        If true, nucleotide mutations on the same branch as the deletion
        are kept as informative (minus the deletion itself). If false,
        the deletion branch is dropped entirely, so that only mutations
        that definitely occurred before the deletion remain.

    Returns
    -------
    tuple of (dict, dict)
        The informative mutations of the donor and of the acceptor, in the
        same node -> list-of-mutations format as the inputs.

    Notes
    -----
    A deletion node that is not among the informative nodes (for example
    because it also appears in the other lineage's history) is left
    out rather than raising a KeyError.
    """
    # list of nucleotide muts that make up deletion
    nsp6del_nucs = ['T11288-', 'C11289-', 'T11290-', 'G11291-', 'G11292-',
                    'T11293-', 'T11294-', 'T11295-', 'T11296-']

    # remove shared ancestral mutations (nodes present in both histories)
    donor_informative_muts = {k:v for k,v in donor_dict.items() if k not in acceptor_dict}
    acceptor_informative_muts = {k:v for k,v in acceptor_dict.items() if k not in donor_dict}

    # Use the node names passed in as arguments, not loop variables of the
    # calling cell, so the function works regardless of the caller's globals.
    for deletion_node, informative_muts in ((donor_node, donor_informative_muts),
                                            (acceptor_node, acceptor_informative_muts)):
        if deletion_node not in informative_muts:
            continue
        if on_deletion_branch:
            # keep the branch but remove the nsp6 deletion itself
            informative_muts[deletion_node] = [x for x in informative_muts[deletion_node]
                                               if x not in nsp6del_nucs]
        else:
            # only look at nuc muts that definitely occurred before the deletion
            del informative_muts[deletion_node]

    return donor_informative_muts, acceptor_informative_muts

In [106]:
def find_closest_muts(all_muts, del_start=11288, del_end=11296, genome_end=29903):
    """
    Given a list of informative nucleotide mutations,
    find the mutation that is closest to the deletion on
    either side.

    Parameters
    ----------
    all_muts : list of str
        Nucleotide mutations written as one character, the position, and
        one character (e.g. 'C241T' or 'T11288-').
    del_start, del_end : int, optional
        First and last nucleotide position of the deletion. The defaults
        (11288 and 11296) are the coordinates of the nsp6 deletion.
        Mutations at positions within [del_start, del_end] are ignored.
    genome_end : int, optional
        Position returned when there is no mutation to the right of the
        deletion (default 29903, the end of the genome).

    Returns
    -------
    tuple of (int, int)
        Position of the closest mutation to the left and to the right of
        the deletion. The left value is 1 (start of genome) if there are
        no informative mutations to the left; the right value is
        `genome_end` if there are none to the right.
    """
    # position is everything between the first and the last character
    all_muts_pos = [int(x[1:-1]) for x in all_muts]

    # the closest mutation to the left is the largest position before the deletion
    closest_left = max((pos for pos in all_muts_pos if pos < del_start), default=1)

    # the closest mutation to the right is the smallest position after the deletion
    closest_right = min((pos for pos in all_muts_pos if pos > del_end), default=genome_end)

    return closest_left, closest_right

Make a dataframe summarizing, for each donor/acceptor pair, whether there is evidence of recombination (shared mutations) and the window of the genome within which recombination could have occurred

In [122]:
# whether nucleotide mutations on the deletion branch itself count as informative
on_deletion_branch = True

# one record per ordered donor/acceptor pair
possible_recomb_info = []

for donor in nsp6del_nodes:
    for acceptor in nsp6del_nodes:
        # a node cannot recombine with itself, and the donor must be strictly
        # older than the acceptor
        if donor == acceptor or not (donor['date'] < acceptor['date']):
            continue

        # informative mutations: those that did not occur in a common
        # ancestor of the donor and the acceptor
        donor_muts = mut_history[donor['node']]['mut_history']
        acceptor_muts = mut_history[acceptor['node']]['mut_history']
        donor_informative_muts, acceptor_informative_muts = find_informative_muts(
            donor_muts, acceptor_muts, donor['node'], acceptor['node'], on_deletion_branch)

        # flatten the per-node lists, then locate the informative mutations
        # that flank the nsp6 deletion in the donor ...
        all_donor_muts = []
        for node_muts in donor_informative_muts.values():
            all_donor_muts.extend(node_muts)
        closest_left_donor, closest_right_donor = find_closest_muts(all_donor_muts)

        # ... and in the acceptor
        all_acceptor_muts = []
        for node_muts in acceptor_informative_muts.values():
            all_acceptor_muts.extend(node_muts)
        closest_left_acceptor, closest_right_acceptor = find_closest_muts(all_acceptor_muts)

        # A flanking mutation at the same site in both lineages would support
        # recombination (its identity would still need to be checked). The
        # sentinel values 1 and 29903 mean "no informative mutation on that
        # side" and are never counted as evidence.
        support_for_recomb_left = (closest_left_donor == closest_left_acceptor
                                   and closest_left_donor != 1)
        support_for_recomb_right = (closest_right_donor == closest_right_acceptor
                                    and closest_right_donor != 29903)

        # innermost flanking mutations bound the region that could have been exchanged
        possible_window_of_recomb = [max(closest_left_donor, closest_left_acceptor),
                                     min(closest_right_donor, closest_right_acceptor)]

        possible_recomb_info.append({
            'donor': donor['node'],
            'acceptor': acceptor['node'],
            'emerging_lineage_donor': donor['emerging_lineage'],
            'emerging_lineage_acceptor': acceptor['emerging_lineage'],
            'support_for_recomb_left': support_for_recomb_left,
            'support_for_recomb_right': support_for_recomb_right,
            'possible_window_of_recomb': possible_window_of_recomb,
            'on_deletion_branch': on_deletion_branch,
        })

possible_recomb_df = pd.DataFrame(possible_recomb_info)

In [123]:
# Show the donor/acceptor summary table as the cell output
possible_recomb_df

Unnamed: 0,donor,acceptor,emerging_lineage_donor,emerging_lineage_acceptor,support_for_recomb_left,support_for_recomb_right,possible_window_of_recomb,on_deletion_branch
0,NODE_0000031,NODE_0004363,B.1.619/B.1.620,B.1.526 (Iota),False,False,"[9867, 15324]",True
1,NODE_0000031,NODE_0004416,B.1.619/B.1.620,B.1.351 (Beta),False,False,"[10323, 15324]",True
2,NODE_0000031,NODE_0008062,B.1.619/B.1.620,P.1 (Gamma),False,False,"[6613, 12778]",True
3,NODE_0000031,NODE_0005404,B.1.619/B.1.620,B.1.1.7 (Alpha),False,False,"[6954, 14676]",True
4,NODE_0000031,NODE_0000152,B.1.619/B.1.620,B.1.525 (Eta),False,False,"[8593, 14407]",True
5,NODE_0000031,NODE_0005215,B.1.619/B.1.620,B.1.1.318,False,False,"[9891, 15324]",True
6,NODE_0000031,NODE_0006885,B.1.619/B.1.620,C.37 (Lambda),False,False,"[10097, 13536]",True
7,NODE_0004363,NODE_0004416,B.1.526 (Iota),B.1.351 (Beta),False,False,"[10323, 20262]",True
8,NODE_0004363,NODE_0008062,B.1.526 (Iota),P.1 (Gamma),False,False,"[9867, 12778]",True
9,NODE_0004363,NODE_0005404,B.1.526 (Iota),B.1.1.7 (Alpha),False,False,"[9867, 14676]",True
