In [1]:
from augur.utils import json_to_tree
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import SeqIO
from collections import Counter
from sklearn import linear_model
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error
import requests


Want to find mutations that are "driving" logistic growth. Do this by multiple linear regression to see which mutations best predict logistic growth rate.

Get global ncov tree and convert to Bio Phylo format

In [2]:
# Fetch the global ncov tree JSON and convert it with augur's json_to_tree.
tree_url = "https://data.nextstrain.org/ncov_global.json"
# A timeout keeps the cell from hanging indefinitely if the server stalls.
tree_response = requests.get(tree_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
tree_response.raise_for_status()
tree_json = tree_response.json()
tree = json_to_tree(tree_json)

Download entropy manually from nextstrain.org/ncov/global. Find the 0.5% of sites with highest entropy throughout genome

In [4]:
# Load the manually downloaded per-site entropy table (tab-separated).
entropy_file = 'nextstrain_ncov_global_diversity.tsv'
entropy_df = pd.read_csv(entropy_file, delimiter='\t')

# How many sites to keep: the genome is 29902 nt long, i.e. roughly 9967 codons.
# That overcounts because there are non-coding regions, so estimate ~9950 codons;
# the top 0.5% would then be about 50 codons -> keep the 50 highest entropy values.
top_entropy_df = entropy_df.nlargest(n=50, columns='entropy')

Read in sequence file, that will be used to find the genotype of all members of a clade

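Get sequence file: aws s3 cp s3://nextstrain-ncov-private/global_subsampled_sequences.fasta.xz . (then decompress it, e.g. with `xz -d global_subsampled_sequences.fasta.xz`, because the next cell reads the uncompressed `.fasta` file)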

In [5]:
# Parse the FASTA and collect its records into a dictionary
# (used further down to look up the genotype of each sequence).
sequences_file = 'global_subsampled_sequences.fasta'
fasta_records = SeqIO.parse(sequences_file, "fasta")
record_dict = SeqIO.to_dict(fasta_records)

Make dataframe where each row is a clade and columns contain information about logistic growth rate, how many S1 mutations from root to clade, and percentage of tips in clade that are genotype X

First, need to find genotype of isolates at desired positions (of highest entropy). Do this for all tips and store in a dictionary

In [6]:
# read in reference file and find genome position for each codon in the top entropy sites

# Parse the reference once, up front: the CDS features do not depend on which
# entropic site is being looked up, and `with` guarantees the file gets closed.
reference_cds_features = []
with open("reference_seq_edited.gb", "r") as reference_handle:
    for record in SeqIO.parse(reference_handle, "genbank"):
        for feature in record.features:
            if feature.type == 'CDS':
                reference_cds_features.append(feature)

# find the start position of the codon encoding each entropic site
# maps genome start position of the codon -> "<gene>_<position>" label
genome_location_of_entropic_sites = {}

for k, v in top_entropy_df.iterrows():
    for feature in reference_cds_features:
        if feature.qualifiers['gene'][0] == v['gene']:
            # 'position' is treated as a 1-based codon index within the gene:
            # skip 3 nt for every codon that precedes it
            mut_location_start = feature.location.start + int(v['position'] - 1) * 3
            genome_location_of_entropic_sites[mut_location_start] = f"{v['gene']}_{v['position']}"

                    

In [7]:
def _codon_to_genotype(nt_codon):
    """Classify one codon slice taken from a sequence.

    Returns the translated amino acid (as a string) when every character is
    A, C, G or T; '-' when every character is a gap; otherwise None.
    """
    codon_chars = set(nt_codon)
    if codon_chars <= {'A', 'C', 'G', 'T'}:
        return str(nt_codon.translate())
    if codon_chars <= {'-'}:
        return '-'
    return None


# genotype info for every sequence: {sequence key: {"<gene>_<position>": genotype}}
tip_genotypes = {}

for tip_name, tip_record in record_dict.items():
    # genotype at each entropic site for this sequence
    tip_genotypes[tip_name] = {
        gene_codon: _codon_to_genotype(tip_record.seq[nt_start:nt_start+3])
        for nt_start, gene_codon in genome_location_of_entropic_sites.items()
    }

Want to limit clade nesting.

Try defining clades as consisting of at least 10 tips, and consisting of a small range of logistic growth values?

In [34]:
def get_parents(tree, child_clade):
    """Find the path from the root of `tree` to `child_clade`.

    Thin wrapper: returns whatever `tree.get_path(child_clade)` returns,
    unchanged.
    """
    return tree.get_path(child_clade)

    

# initiate list to store all clade info
clade_stats = []

# bases of the clades found so far; a base is recorded only once,
# even when several nodes resolve to the same base
clade_bases = []

# Visit nodes children-first; the tip-count filter below drops small clades and tips
for node in tree.find_clades(order="postorder"):

    # clade must have at least 10 tips
    if len(node.get_terminals()) < 10:
        continue

    # only care about the clade if a logistic growth value is stored on it
    if "logistic_growth" not in node.node_attrs:
        continue

    logistic_growth = node.node_attrs["logistic_growth"]["value"]
    clade_base = node

    # only look at clades with growth rate of 1.0 or greater
    if not logistic_growth >= 1.0:
        continue

    # window of "similar" growth rates: within 0.5 of this node's rate
    lowest_similar_growth = logistic_growth - float(0.5)
    highest_similar_growth = logistic_growth + float(0.5)

    # walk back through the tree from the nearest ancestor outwards (the last
    # path entry, the node itself, is dropped); every ancestor inside the window
    # overwrites clade_base, so the oldest such ancestor ends up as the base
    parents = get_parents(tree, node)
    for parent in reversed(parents[:-1]):
        if "logistic_growth" not in parent.node_attrs:
            continue
        parent_logistic_growth = parent.node_attrs["logistic_growth"]["value"]

        if lowest_similar_growth <= parent_logistic_growth <= highest_similar_growth:
            clade_base = parent

    if clade_base not in clade_bases:
        clade_bases.append(clade_base)

print(clade_bases)
                

#             #all S1 muts from root to clade already stored as json value
#             if "S1_mutations" in node.node_attrs:
#                 s1_mutations = node.node_attrs["S1_mutations"]["value"]
#             else:
#                 s1_mutations = None

#             # find all tips in this clade
#             tips_in_clade = node.get_terminals()
#             tip_names_in_clade = [tip.name for tip in tips_in_clade]

#             # tally the genotype of each tip within the clade
#             clade_genotype_tally = {}
#             for tip in tip_names_in_clade:
#                 for site, aa in tip_genotypes[tip].items():
#                     # ignore the isolates that were not sequenced
#                     if aa != None:
#                         if site in clade_genotype_tally.keys():
#                             clade_genotype_tally[site]+=[aa]
#                         else:
#                             clade_genotype_tally[site] = [aa]

#             # find proportion of tips that have given genotype
#             clade_genotype_freqs = {}
#             for k, v in clade_genotype_tally.items():
#                 counts = Counter(v)
#                 for item, count in counts.items():
#                     site_genotype = k+item
#                     clade_genotype_freqs[site_genotype] = count/len(v)


#             clade_stats.append({'clade': node.name, 'logistic_growth': logistic_growth, 
#                                 'num_s1_muts': s1_mutations, **clade_genotype_freqs})


# One row per entry of clade_stats; missing values are filled with 0.
# NOTE: the only append to clade_stats visible in this cell sits inside the
# commented-out block above, so as written this frame is built from an empty list.
clade_stats_df = pd.DataFrame(data=clade_stats).fillna(value=0)
                

    

    

    

[Clade(branch_length=0, name='NODE_0000002'), Clade(branch_length=10, name='NODE_0001634'), Clade(branch_length=20, name='NODE_0001739'), Clade(branch_length=21, name='NODE_0001646'), Clade(branch_length=4, name='NODE_0001848'), Clade(branch_length=4, name='NODE_0001701'), Clade(branch_length=5, name='NODE_0002057'), Clade(branch_length=32, name='NODE_0002101'), Clade(branch_length=4, name='NODE_0002237'), Clade(branch_length=7, name='NODE_0002886'), Clade(branch_length=7, name='NODE_0002972'), Clade(branch_length=28, name='NODE_0003185'), Clade(branch_length=28, name='NODE_0003806'), Clade(branch_length=28, name='NODE_0003741'), Clade(branch_length=28, name='NODE_0003199'), Clade(branch_length=7, name='NODE_0002227'), Clade(branch_length=7, name='NODE_0002678'), Clade(branch_length=7, name='NODE_0002064')]


In [10]:
clade_stats_df.shape

(2121, 171)