In [138]:
from augur.utils import json_to_tree
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import SeqIO
from collections import Counter
from sklearn import linear_model
import requests


ModuleNotFoundError: No module named 'sklearn'

Get global ncov tree and convert to Bio Phylo format

In [25]:
# Fetch the Nextstrain global ncov build and convert its JSON into a tree object.
tree_url = "https://data.nextstrain.org/ncov_global.json"
# The timeout keeps the notebook from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP failure here rather than as a confusing
# JSON decoding error on the next line.
tree_response = requests.get(tree_url, timeout=60)
tree_response.raise_for_status()
tree_json = tree_response.json()
tree = json_to_tree(tree_json)

Download the entropy data manually from nextstrain.org/ncov/global, then find the 0.5% of sites with the highest entropy across the genome

In [36]:
# Tab-separated per-site entropy table
entropy_file = "nextstrain_ncov_global_diversity.tsv"
entropy_df = pd.read_csv(entropy_file, sep="\t")
# The genome is 29902 nt long, i.e. roughly 9967 codons. That is a slight
# overestimate because of the non-coding regions, so assume about 9950 codons.
# The top 0.5% is then ~50 codons: keep the 50 rows with the highest entropy.
top_entropy_df = entropy_df.nlargest(n=50, columns="entropy")

Read in the sequence file, which will be used to find the genotype of all members of a clade

In [26]:
# FASTA file of sequences; record_dict maps each record id to its SeqRecord
sequences_file = "global_subsampled_sequences.fasta"
record_dict = SeqIO.to_dict(
    SeqIO.parse(sequences_file, format="fasta")
)

Make a dataframe where each row is a clade and the columns contain the logistic growth rate, the number of S1 mutations from root to clade, and the proportion of tips in the clade that have genotype X

First, need to find genotype of isolates at desired positions (of highest entropy). Do this for all tips and store in a dictionary

In [76]:
# Read the reference file and find the genome position of the codon encoding
# each of the top entropy sites.
#
# genome_location_of_entropic_sites maps the nucleotide start of the codon
# (used later as a slice start into each sequence) to a "<gene>_<codon position>"
# label.
genome_location_of_entropic_sites = {}

# Parse the reference once, up front. It does not depend on the entropy row, so
# re-reading it for every row was wasted work, and the handle was never closed.
with open("reference_seq_edited.gb", "r") as reference_handle:
    reference_cds_features = [
        feature
        for record in SeqIO.parse(reference_handle, "genbank")
        for feature in record.features
        if feature.type == 'CDS'
    ]

for k, v in top_entropy_df.iterrows():
    for feature in reference_cds_features:
        if feature.qualifiers['gene'][0] == v['gene']:
            # v['position'] is treated as a 1-based codon index within the gene:
            # step (position - 1) codons of 3 nt each from the start of the CDS.
            # NOTE(review): assumes the CDS is one contiguous span from
            # feature.location.start - confirm for the edited reference.
            mut_location_start = feature.location.start + int(v['position'] - 1) * 3
            genome_location_of_entropic_sites[mut_location_start] = f"{v['gene']}_{v['position']}"

                    

In [98]:
# tip_genotypes: sequence id -> {site label -> amino acid, or None when the codon
# contains anything other than A/C/G/T}
tip_genotypes = {}

for tip_name, tip_record in record_dict.items():
    site_to_aa = {}
    for codon_start, site_label in genome_location_of_entropic_sites.items():
        codon = tip_record.seq[codon_start:codon_start + 3]
        # only translate codons made up entirely of unambiguous bases
        is_unambiguous = all(base in ('A', 'C', 'G', 'T') for base in codon)
        site_to_aa[site_label] = str(codon.translate()) if is_unambiguous else None

    tip_genotypes[tip_name] = site_to_aa

In [132]:
# One record per internal node that carries a logistic growth value
clade_stats = []

# Internal nodes only; tips are skipped
for node in tree.find_clades(terminal=False):

    # Nodes without a stored logistic growth value are not of interest
    if "logistic_growth" not in node.node_attrs:
        continue
    logistic_growth = node.node_attrs["logistic_growth"]["value"]

    # The count of S1 mutations from root to clade is stored in the JSON when available
    s1_mutations = (
        node.node_attrs["S1_mutations"]["value"]
        if "S1_mutations" in node.node_attrs
        else None
    )

    # Collect, per site, the amino acid seen in every tip descending from this node.
    # Tips with no call at a site (None) are left out of that site's tally.
    clade_genotype_tally = {}
    for tip in node.get_terminals():
        for site, aa in tip_genotypes[tip.name].items():
            if aa is not None:
                clade_genotype_tally.setdefault(site, []).append(aa)

    # Turn each site's tally into the fraction of called tips carrying each amino acid,
    # keyed by site label + amino acid (e.g. "S_501" + "N")
    clade_genotype_freqs = {}
    for site, observed_aas in clade_genotype_tally.items():
        n_called = len(observed_aas)
        for aa, count in Counter(observed_aas).items():
            clade_genotype_freqs[site + aa] = count / n_called

    clade_record = {
        'clade': node.name,
        'logistic_growth': logistic_growth,
        'num_s1_muts': s1_mutations,
    }
    clade_record.update(clade_genotype_freqs)
    clade_stats.append(clade_record)


clade_stats_df = pd.DataFrame(clade_stats)
                

    

    

    

In [136]:
# Clades whose logistic growth value is at least 4.0
clade_stats_df.loc[clade_stats_df['logistic_growth'].ge(4.0)]


Unnamed: 0,clade,logistic_growth,num_s1_muts,N_204G,N_203R,N_203K,S_681P,S_681H,S_681R,S_501N,...,ORF8_52K,S_144F,S_570V,ORF1a_2230T,S_484G,S_417T,N_199Q,ORF1b_2613S,S_701T,S_452Q
1052,NODE_0000055,4.095786,1,0.007336,0.006183,0.992693,0.565900,0.424565,0.008974,0.598519,...,0.006726,0.003460,0.000552,0.361738,0.00057,0.023889,0.000560,0.001147,0.000562,0.011824
1053,NODE_0000056,4.095786,1,0.006776,0.006187,0.992688,0.565657,0.424804,0.008979,0.598291,...,0.006730,0.003463,0.000553,0.361942,0.00057,0.023902,0.000561,0.001148,0.000562,0.011831
1054,NODE_0001698,4.095786,2,0.076923,0.076923,0.923077,1.000000,,,1.000000,...,,,,,,,,,,
1055,NODE_0002319,4.095786,2,0.090909,0.090909,0.909091,1.000000,,,1.000000,...,,,,,,,,,,
1056,NODE_0002322,4.095786,2,,,1.000000,1.000000,,,1.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2083,NODE_0004144,4.111753,6,,,1.000000,,1.000000,,,...,,,,1.000000,,,,,,
2084,NODE_0004145,4.111753,6,,,1.000000,,1.000000,,,...,,,,1.000000,,,,,,
2085,NODE_0000645,4.111753,6,,,1.000000,,1.000000,,,...,,,,1.000000,,,,,,
2086,NODE_0004147,4.111753,6,,,1.000000,,1.000000,,,...,,,,1.000000,,,,,,


N_204G
N_204R
N_204Q
N_204L
N_204A
N_204P


In [3]:
#node.__dict__

First, make dataframe containing information about number of S1 mutations from root to clade and all mutations that occurred between root and clade. Each row is a clade

In [4]:
# Accumulates one history record (a dict) per clade
clades_history = []


def get_parent(tree, child_clade):
    """Return the path that ``tree.get_path`` reports for ``child_clade``.

    Used to walk the nodes between the root and a clade.
    """
    return tree.get_path(child_clade)


# Genes (plus 'nuc' for nucleotide changes) that always get a column, even when a
# clade has no mutations there. Defined once because it is the same for every clade.
possible_mutation_site = ['ORF1a', 'ORF1b', 'S', 'ORF3a', 'E', 'M', 'ORF6',
                          'ORF7a', 'ORF7b', 'ORF8', 'ORF9b', 'N', 'nuc']

#Only want to look at clades, don't care about tips
for node in tree.find_clades(terminal=False):

    # Fresh dict per clade: gene (or 'nuc') -> every mutation on the path to this clade
    mutations_on_path = {k: [] for k in possible_mutation_site}

    for parent in get_parent(tree, node):
        # A node may lack branch_attrs, and branch_attrs may lack 'mutations';
        # treat both as "no mutations on this branch" instead of raising KeyError.
        parent_muts = getattr(parent, 'branch_attrs', {}).get('mutations', {})
        for k, v in parent_muts.items():
            # setdefault keeps mutations in genes outside possible_mutation_site
            # instead of failing on the unknown key
            mutations_on_path.setdefault(k, []).extend(v)

    #all S1 muts from root to clade already stored as json value
    if "S1_mutations" in node.node_attrs:
        s1_mutations = node.node_attrs["S1_mutations"]["value"]
    else:
        s1_mutations = None

    #find stored logistic growth value
    if "logistic_growth" in node.node_attrs:
        logistic_growth = node.node_attrs["logistic_growth"]["value"]
    else:
        logistic_growth = None

    # Build the row as a new dict: the per-gene mutation lists plus the clade
    # name, number of S1 mutations and logistic growth
    clades_history.append({
        **mutations_on_path,
        'clade': node.name,
        'num_s1_mutations': s1_mutations,
        'logistic_growth': logistic_growth,
    })


#turn list of clade history info into a dataframe
clades_df = pd.DataFrame(clades_history)

In [5]:
# Show the clades whose root-to-clade nucleotide mutations include 'T11288-'.
# The 'nuc' lists are expanded into a (padded) frame so isin() can test each entry.
# axis is passed by keyword because pandas 2.0 no longer accepts it positionally
# in DataFrame.any.
clades_df[pd.DataFrame(clades_df.nuc.tolist()).isin(['T11288-']).any(axis=1).values]

Unnamed: 0,ORF1a,ORF1b,S,ORF3a,E,M,ORF6,ORF7a,ORF7b,ORF8,ORF9b,N,nuc,clade,num_s1_mutations,logistic_growth
292,"[S3675-, G3676-, F3677-]",[P314L],"[D614G, E484K]",[],[],[I82T],[],[],[],[],[],[T205I],"[C3037T, C14408T, C241T, A23403G, A21993-, T21...",NODE_0001779,2.0,6.511755
293,"[S3675-, G3676-, F3677-, A2123V, E2607K, M3752I]",[P314L],"[D614G, E484K, I210T, D936N, S939F, T1027I]",[],[],[I82T],[],[E22D],[],[],[P10S],"[T205I, P13L, S201I]","[C3037T, C14408T, C241T, A23403G, A21993-, T21...",NODE_0001780,3.0,6.511755
294,"[S3675-, G3676-, F3677-, A2123V, E2607K, M3752I]",[P314L],"[D614G, E484K, I210T, D936N, S939F, T1027I, N4...",[],[],[I82T],[],[E22D],[],[],[P10S],"[T205I, P13L, S201I]","[C3037T, C14408T, C241T, A23403G, A21993-, T21...",NODE_0001781,4.0,6.511755
295,"[S3675-, G3676-, F3677-, T2007I]",[P314L],"[D614G, E484K, A67V, H69-, V70-, Y144-, Q677H]",[],[L21F],[I82T],[F2-],[],[],[],[H9D],"[T205I, S2M, D3Y, A12G]","[C3037T, C14408T, C241T, A23403G, A21993-, T21...",NODE_0001782,6.0,6.511755
296,"[S3675-, G3676-, F3677-, T2007I]","[P314L, L314F]","[D614G, E484K, A67V, H69-, V70-, Y144-, Q677H,...",[],[L21F],[I82T],[F2-],[],[],[],[H9D],"[T205I, S2M, D3Y, A12G]","[C3037T, C14408T, C241T, A23403G, A21993-, T21...",NODE_0001783,7.0,6.511755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3275,"[I2230T, S3675-, G3676-, F3677-, T1001I, A1708D]","[P314L, K1383R, P1001S]","[D614G, N501Y, H69-, V70-, Y144-, S982A, A570D...",[],[],[],[],[],[],"[Q27*, R52I, Y73C, K68*, *68K]",[],"[R203K, G204R, S235F, D3L]","[C3037T, C14408T, C241T, A23403G, G28881A, G28...",NODE_0004177,6.0,5.519968
3276,"[I2230T, S3675-, G3676-, F3677-, T1001I, A1708D]","[P314L, K1383R, P1001S]","[D614G, N501Y, H69-, V70-, Y144-, S982A, A570D...",[],[],[],[],[],[],"[Q27*, R52I, Y73C, K68*, *68K]",[],"[R203K, G204R, S235F, D3L]","[C3037T, C14408T, C241T, A23403G, G28881A, G28...",NODE_0004178,6.0,5.519968
3277,"[I2230T, S3675-, G3676-, F3677-, T1001I, A1708D]","[P314L, K1383R, P1001S]","[D614G, N501Y, H69-, V70-, Y144-, S982A, A570D...",[],[],[],[],[],[],"[Q27*, R52I, Y73C, K68*, *68K]",[],"[R203K, G204R, S235F, D3L]","[C3037T, C14408T, C241T, A23403G, G28881A, G28...",NODE_0004179,6.0,
3278,"[I2230T, S3675-, G3676-, F3677-, T1001I, A1708D]","[P314L, K1383R, P1001S, I1181S]","[D614G, N501Y, H69-, V70-, Y144-, S982A, A570D...",[],[],[],[],[],[],"[Q27*, R52I, Y73C, K68*, *68K]",[],"[R203K, G204R, S235F, D3L]","[C3037T, C14408T, C241T, A23403G, G28881A, G28...",NODE_0000816,6.0,5.519968
