In [6]:
import math
import json
import random
import ast
import re
from os import path
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
from scipy import stats
from collections import Counter

### Save virus rates of adaptation and metadata to csv files for d3 to read in

In [7]:
def readin_virus_config(virus):
    """
    Load the JSON config file for one virus and return its parsed contents.

    The file is looked up at `config/adaptive_evo_config_{virus}.json`, relative to the
    current working directory. Elsewhere in this notebook the returned mapping is read for
    keys such as 'genes', 'subtype', 'polymerase' and 'receptor_binding'.
    """
    with open(f'config/adaptive_evo_config_{virus}.json') as config_file:
        return json.load(config_file)

In [8]:
def get_rate_in_muts_per_year(virus, subtype, gene, window, min_seqs):
    """
    Return the length of a gene, in codons.

    Despite its name, this function does not compute a rate and does not add any column.
    It opens the intermediates JSON written for this virus/subtype/gene/window/min_seqs
    combination and returns `len()` of its 'outgroup_codons' entry. Callers multiply a
    per-codon rate by this length to get a per-gene rate (adaptive muts per year).

    `subtype` is interpolated into the file name as-is, so passing None looks for a file
    with the literal text 'None' in that position.

    Raises whatever `open` raises if the intermediates file is missing, and KeyError if
    the JSON has no 'outgroup_codons' entry.
    """
    intermediates_file = f'adaptation_results/intermediates/input_data_{virus}_{subtype}_{gene}_{window}_{min_seqs}_None_None.json'

    with open(intermediates_file) as json_handle:
        json_dict = json.load(json_handle)

    #only the outgroup is needed to measure the gene length
    return len(json_dict['outgroup_codons'])


In [9]:
def readin_rates_updated_outgroup(virus_and_subtype, gene, window, min_seqs):
    """
    Read in the rates of adaptation, calculated with the update outgroup method.

    The bootstrapped results JSON for this virus/gene/window/min_seqs combination is read
    from `adaptation_results/results/`. The point estimate ('rate_of_adaptation') and every
    bootstrap replicate ('bootstrap_rate_of_adaptation') are multiplied by 10**3.

    Returns
    -------
    (slope_sci, lower_95ci, upper_95ci)
        The scaled rate, and the 2.5th and 97.5th percentiles of the scaled bootstrap rates.

    Raises
    ------
    FileNotFoundError
        If the results file does not exist. Previously this case printed a message and then
        failed with an UnboundLocalError on the return statement.
    """

    json_name = f'adaptation_results/results/{virus_and_subtype}_{gene}_{window}_{min_seqs}_adaptation_bootstrapped.json'

    if not path.exists(json_name):
        raise FileNotFoundError(f'{json_name} does not exist')

    with open(json_name) as json_handle:
        json_dict = json.load(json_handle)

    slope_sci = json_dict['rate_of_adaptation'] * (10**3)
    bs_slope_sci = [x * (10**3) for x in json_dict['bootstrap_rate_of_adaptation']]
    #np.percentile does not need sorted input and can compute both bounds in one call
    lower_95ci, upper_95ci = np.percentile(bs_slope_sci, [2.5, 97.5])

    return slope_sci, lower_95ci, upper_95ci
    

In [10]:
def readin_rates_constant_outgroup(virus_and_subtype, gene, window, min_seqs):
    """
    Read in the rates of adaptation, calculated with the fixed outgroup method.

    The bootstrapped results JSON is read from
    `bhatt_results/bhatt_results_nextstrain_{window}_{min_seqs}/`. The point estimate
    ('rate_of_adaptation') and every bootstrap replicate ('bootstrap_rate_of_adaptation')
    are multiplied by 10**3.

    Returns
    -------
    (slope_sci, lower_95ci, upper_95ci)
        The scaled rate, and the 2.5th and 97.5th percentiles of the scaled bootstrap rates.

    Raises
    ------
    FileNotFoundError
        If the results file does not exist. Previously this case printed a message and then
        failed with an UnboundLocalError on the return statement.
    """

    json_name = f'bhatt_results/bhatt_results_nextstrain_{window}_{min_seqs}/{virus_and_subtype}_{gene}_bhatt_analysis_bootstrapped.json'

    if not path.exists(json_name):
        raise FileNotFoundError(f'{json_name} does not exist')

    with open(json_name) as json_handle:
        json_dict = json.load(json_handle)

    slope_sci = json_dict['rate_of_adaptation'] * (10**3)
    bs_slope_sci = [x * (10**3) for x in json_dict['bootstrap_rate_of_adaptation']]
    #np.percentile does not need sorted input and can compute both bounds in one call
    lower_95ci, upper_95ci = np.percentile(bs_slope_sci, [2.5, 97.5])

    return slope_sci, lower_95ci, upper_95ci

In [11]:
#lookup tables that turn raw gene identifiers into more readable display names

#rotavirus segment identifier -> gene name
rota_segment_to_gene = {
    'A': 'NSP1',
    'C': 'VP2',
    'E': 'NSP4',
    'G': 'VP7',
    'I': 'VP6',
    'M': 'VP3',
    'N': 'NSP2',
    'P': 'VP4',
    'R': 'VP1',
    'T': 'NSP3',
}

#gene name (as it looks after str.capitalize()) -> conventional capitalization
legible_gene_names = {
    '3clpro': '3CLpro',
    'Ha1': 'HA1',
    'Ha2': 'HA2',
    'He': 'HE',
    'Hn': 'HN',
    'Na': 'NA',
    'Nc': 'NC',
    'Np': 'NP',
    'Ns1': 'NS1',
    'Ns2': 'NS2',
    'Pa': 'PA',
    'Pb1': 'PB1',
    'Pb2': 'PB2',
    'Rdrp': 'RdRp',
    'Sh': 'SH',
    'Vp1': 'VP1',
    'Vp2': 'VP2',
    'Vp3': 'VP3',
    'Vp4': 'VP4',
    '2a': '2A',
    '2b': '2B',
    '2c': '2C',
    '3a': '3A',
    '3b': '3B',
    '3c': '3C',
    '3d': '3D',
    'Hef1': 'HEF1',
    'Hef2': 'HEF2',
    'Ns': 'NS',
    'E1a': 'E1A',
    'E1b 55k': 'E1B 55K',
    'Iva2': 'IVa2',
    'Ptp': 'pTP',
    'Piiia': 'pIIIa',
    '100k': '100K',
}

In [12]:
def gather_all_data_update_outgroup(viruses, to_include, window=5, min_seqs=3):
    """
    Gather all the data for each virus and gene (with rates calculated using the update outgroup method) 
    that will be needed for the website.

    Parameters
    ----------
    viruses : iterable of str
        Virus keys. Each one must have a config file readable by `readin_virus_config`.
    to_include : container of str
        Legible names of the subtypes to keep. This is only consulted for viruses whose
        config has 'subtype' == 'True'; viruses without subtypes are always included.
    window, min_seqs :
        Identify which intermediates and results files to read. For 'h1n1pdm' and 'mumps'
        the rates are instead read from the results computed with window=3, min_seqs=3.

    Returns
    -------
    pandas.DataFrame
        One row per (virus or subtype, gene) holding the per-codon rate (×10⁻³) with its
        95% bootstrap interval, the same quantities per gene, and the virus metadata taken
        from the config file.
    """
    def _report_targets(virus, configs):
        """Lazily yield (subtype, virus_and_subtype, legible_name, color) for each entry to report."""
        #if virus has subtypes, want to get info for each subtype listed in to_include
        if configs['subtype']=='True':
            for subtype in configs['subtypes']:
                legible_name = configs['legible_name'][subtype]
                if legible_name in to_include:
                    yield subtype, f'{virus}_{subtype}', legible_name, configs['color'][subtype]
        #if no subtype, the virus name alone identifies the results files
        else:
            color = configs['color']
            yield None, virus, configs['legible_name'], color

    all_data = []

    #for each virus, find whether there are any subtypes, then get all rates and metadata for these subtypes
    for virus in viruses:
        configs = readin_virus_config(virus)

        #metadata for virus
        genes = configs["genes"]
        genome_type = configs['genome_type']
        rna_dna = [x for x in ['RNA', 'DNA'] if x in configs['genome_type']][0]
        enveloped = configs['enveloped']
        transmission = configs['transmission']
        cellular_receptor = configs['cellular_receptor']
        fusion_protein_class = configs['fusion_protein_class']
        virus_family = configs['virus_family']
        #find the names of the polymerase and receptor-binding genes
        polymerase = configs['polymerase']['virus_gene']
        receptor_binding = configs['receptor_binding']['virus_gene']
        standard_genes = {polymerase.upper():'polymerase', receptor_binding.upper():'receptor_binding'}

        #for viruses with under 12 years of data, use smaller windows
        if virus in ['h1n1pdm', 'mumps']:
            rate_window, rate_min_seqs = 3, 3
        else:
            rate_window, rate_min_seqs = window, min_seqs

        for subtype, virus_and_subtype, legible_name, color in _report_targets(virus, configs):
            #get rates
            for gene in genes:
                #'polymerase', 'receptor_binding', or False for every other gene
                standard_gene = standard_genes.get(gene.upper(), False)
                #NOTE(review): the gene length is always read from the intermediates file for the
                #default window/min_seqs, even when the rates use the smaller window -- confirm intended
                gene_len_in_codons = get_rate_in_muts_per_year(virus, subtype, gene, window, min_seqs)
                #get rate with updated outgroup
                rate_per_codon, lower_95ci, upper_95ci = readin_rates_updated_outgroup(virus_and_subtype, gene,
                                                                                       rate_window, rate_min_seqs)
                #rates are stored as ×10⁻³ per codon, so undo the scaling before multiplying by gene length
                rate_per_gene = rate_per_codon/1000*gene_len_in_codons
                lower_95ci_per_gene = lower_95ci/1000*gene_len_in_codons
                upper_95ci_per_gene = upper_95ci/1000*gene_len_in_codons

                legible_rate = str(round(rate_per_codon, 2))+'×10⁻³ muts per codon per year'
                legible_rate_per_gene = str(round(rate_per_gene,2))+' muts per year'

                #make gene name more legible
                gene_legible = gene.capitalize()
                #NOTE(review): this only applies when the virus key is exactly 'rotavirus'
                if virus == 'rotavirus':
                    gene_legible = rota_segment_to_gene[gene]
                gene_legible = legible_gene_names.get(gene_legible, gene_legible)

                #for polymerase and receptor-binding, make a label with gene name 
                if standard_gene == 'polymerase':
                    legible_gene_name = f'{gene_legible} (Polymerase)'
                elif standard_gene == 'receptor_binding':
                    legible_gene_name = f'{gene_legible} (Receptor-Binding)'
                else:
                    legible_gene_name = gene_legible

                all_data.append({'virus':virus, 'subtype':subtype, 'virus_and_subtype':virus_and_subtype, 'gene': gene_legible,
                                 'adaptive_subs_per_codon_per_year': rate_per_codon, 
                                 'lower_95ci': lower_95ci, 'upper_95ci': upper_95ci, 'ci': [lower_95ci, upper_95ci], 
                                 'len_in_codons': gene_len_in_codons, 'adaptive_muts_per_year': rate_per_gene, 
                                 'lower_95ci_mutspergene':lower_95ci_per_gene, 'upper_95ci_mutspergene':upper_95ci_per_gene, 
                                 'legible_name': legible_name, 'virus_family': virus_family.capitalize(), 'color':color, 
                                 'rna_dna': rna_dna, 'genome_type': genome_type, 'enveloped': enveloped, 'transmission':transmission,
                                 'cellular_receptor': cellular_receptor, 'fusion_protein_class': fusion_protein_class, 
                                 'legible_rate_per_codon': legible_rate, 'legible_rate_per_gene': legible_rate_per_gene, 
                                 'standard_gene':standard_gene, 'legible_gene_name':legible_gene_name})

    df = pd.DataFrame(all_data)

    return df     

In [13]:
def gather_all_data_constant_outgroup(viruses, to_include, window=5, min_seqs=3):
    """
    Gather all the data for each virus and gene (with rates calculated using the constant outgroup method) 
    that will be needed for the website.

    Parameters
    ----------
    viruses : iterable of str
        Virus keys. Each one must have a config file readable by `readin_virus_config`.
    to_include : container of str
        Legible names of the subtypes to keep. This is only consulted for viruses whose
        config has 'subtype' == 'True'; viruses without subtypes are always included.
    window, min_seqs :
        Identify which intermediates and results files to read. For 'h1n1pdm' and 'mumps'
        the rates are instead read from the results computed with window=3, min_seqs=2.

    Returns
    -------
    pandas.DataFrame
        One row per (virus or subtype, gene) holding the per-codon rate (×10⁻³) with its
        95% bootstrap interval, the same quantities per gene, and the virus metadata taken
        from the config file.
    """
    def _report_targets(virus, configs):
        """Lazily yield (subtype, virus_and_subtype, legible_name, color) for each entry to report."""
        #if virus has subtypes, want to get info for each subtype listed in to_include
        if configs['subtype']=='True':
            for subtype in configs['subtypes']:
                legible_name = configs['legible_name'][subtype]
                if legible_name in to_include:
                    yield subtype, f'{virus}_{subtype}', legible_name, configs['color'][subtype]
        #if no subtype, the virus name alone identifies the results files
        else:
            color = configs['color']
            yield None, virus, configs['legible_name'], color

    all_data = []

    #for each virus, find whether there are any subtypes, then get all rates and metadata for these subtypes
    for virus in viruses:
        configs = readin_virus_config(virus)

        #metadata for virus
        genes = configs["genes"]
        genome_type = configs['genome_type']
        rna_dna = [x for x in ['RNA', 'DNA'] if x in configs['genome_type']][0]
        enveloped = configs['enveloped']
        transmission = configs['transmission']
        cellular_receptor = configs['cellular_receptor']
        fusion_protein_class = configs['fusion_protein_class']
        virus_family = configs['virus_family']
        #find the names of the polymerase and receptor-binding genes
        polymerase = configs['polymerase']['virus_gene']
        receptor_binding = configs['receptor_binding']['virus_gene']
        standard_genes = {polymerase.upper():'polymerase', receptor_binding.upper():'receptor_binding'}

        #for viruses with under 12 years of data, use smaller windows
        if virus in ['h1n1pdm', 'mumps']:
            rate_window, rate_min_seqs = 3, 2
        else:
            rate_window, rate_min_seqs = window, min_seqs

        for subtype, virus_and_subtype, legible_name, color in _report_targets(virus, configs):
            #get rates
            for gene in genes:
                #'polymerase', 'receptor_binding', or False for every other gene
                standard_gene = standard_genes.get(gene.upper(), False)
                #NOTE(review): the gene length is always read from the intermediates file for the
                #default window/min_seqs, even when the rates use the smaller window -- confirm intended
                gene_len_in_codons = get_rate_in_muts_per_year(virus, subtype, gene, window, min_seqs)
                #get rate with constant outgroup
                rate_per_codon, lower_95ci, upper_95ci = readin_rates_constant_outgroup(virus_and_subtype, gene,
                                                                                        rate_window, rate_min_seqs)
                #rates are stored as ×10⁻³ per codon, so undo the scaling before multiplying by gene length
                rate_per_gene = rate_per_codon/1000*gene_len_in_codons
                lower_95ci_per_gene = lower_95ci/1000*gene_len_in_codons
                upper_95ci_per_gene = upper_95ci/1000*gene_len_in_codons

                legible_rate = str(round(rate_per_codon, 2))+'×10⁻³ muts per codon per year'
                legible_rate_per_gene = str(round(rate_per_gene,2))+' muts per year'

                #make gene name more legible
                gene_legible = gene.capitalize()
                #NOTE(review): this only applies when the virus key is exactly 'rotavirus'
                if virus == 'rotavirus':
                    gene_legible = rota_segment_to_gene[gene]
                gene_legible = legible_gene_names.get(gene_legible, gene_legible)

                #for polymerase and receptor-binding, make a label with gene name 
                if standard_gene == 'polymerase':
                    legible_gene_name = f'{gene_legible} (Polymerase)'
                elif standard_gene == 'receptor_binding':
                    legible_gene_name = f'{gene_legible} (Receptor-Binding)'
                else:
                    legible_gene_name = gene_legible

                all_data.append({'virus':virus, 'subtype':subtype, 'virus_and_subtype':virus_and_subtype, 'gene': gene_legible,
                                 'adaptive_subs_per_codon_per_year': rate_per_codon, 
                                 'lower_95ci': lower_95ci, 'upper_95ci': upper_95ci, 'ci': [lower_95ci, upper_95ci], 
                                 'len_in_codons': gene_len_in_codons, 'adaptive_muts_per_year': rate_per_gene, 
                                 'lower_95ci_mutspergene':lower_95ci_per_gene, 'upper_95ci_mutspergene':upper_95ci_per_gene, 
                                 'legible_name': legible_name, 'virus_family': virus_family.capitalize(), 'color':color, 
                                 'rna_dna': rna_dna, 'genome_type': genome_type, 'enveloped': enveloped, 'transmission':transmission,
                                 'cellular_receptor': cellular_receptor, 'fusion_protein_class': fusion_protein_class, 
                                 'legible_rate_per_codon': legible_rate, 'legible_rate_per_gene': legible_rate_per_gene, 
                                 'standard_gene':standard_gene, 'legible_gene_name':legible_gene_name})

    df = pd.DataFrame(all_data)

    return df     

In [14]:
#rates from the update outgroup method, for every virus and subtype shown on the website
df_update = gather_all_data_update_outgroup(
    viruses=[
        'h3n2', 'h1n1pdm', 'vic', 'yam', 'influenzaC',
        'measles', 'mumps', 'parainfluenza', 'rsv',
        'oc43', '229e', 'nl63',
        'dengue', 'rotavirusAg1p8', 'rotavirusAg1p4', 'norovirus',
        'enterovirusd68', 'hepatitisA_IA',
        'hepatitisB', 'parvovirusB19', 'adenovirusB7', 'adenovirusB3',
    ],
    to_include=[
        'Influenza A/H3N2', 'Influenza A/H1N1pdm', 'Influenza B/Vic', 'Influenza B/Yam',
        'Influenza C/Yamagata', 'Measles', 'Mumps', 'Parainfluenza-1', 'Parainfluenza-3',
        'RSV-A', 'RSV-B',
        'OC43-A', '229E', 'NL63',
        'Dengue 1-V', 'Dengue 2-AA', 'Dengue 3-III', 'Dengue 4-II',
        'Rotavirus A/P[8]', 'Rotavirus A/P[4]', 'Norovirus GII.4', 'Enterovirus D68',
        'Hepatitis A-IA', 'Parvovirus B19-1A',
        'Adenovirus B-7', 'Adenovirus B-3', 'Hepatitis B-A2', 'Hepatitis B-D3',
    ],
)

In [15]:
#rates from the constant outgroup method, for every virus and subtype shown on the website
df_constant = gather_all_data_constant_outgroup(
    viruses=[
        'h3n2', 'h1n1pdm', 'vic', 'yam', 'influenzaC',
        'measles', 'mumps', 'parainfluenza', 'rsv',
        'oc43', '229e', 'nl63',
        'dengue', 'rotavirusAg1p8', 'rotavirusAg1p4', 'norovirus',
        'enterovirusd68', 'hepatitisA_IA',
        'hepatitisB', 'parvovirusB19', 'adenovirusB7', 'adenovirusB3',
    ],
    to_include=[
        'Influenza A/H3N2', 'Influenza A/H1N1pdm', 'Influenza B/Vic', 'Influenza B/Yam',
        'Influenza C/Yamagata', 'Measles', 'Mumps', 'Parainfluenza-1', 'Parainfluenza-3',
        'RSV-A', 'RSV-B',
        'OC43-A', '229E', 'NL63',
        'Dengue 1-V', 'Dengue 2-AA', 'Dengue 3-III', 'Dengue 4-II',
        'Rotavirus A/P[8]', 'Rotavirus A/P[4]', 'Norovirus GII.4', 'Enterovirus D68',
        'Hepatitis A-IA', 'Parvovirus B19-1A',
        'Adenovirus B-7', 'Adenovirus B-3', 'Hepatitis B-A2', 'Hepatitis B-D3',
    ],
)

Now save all the data to .csv files. The data are partitioned into separate files to make plotting in d3 easier.

In [16]:
def save_polyermase_update_outgroup(filename):
    """
    Write the polymerase rows of the update outgroup results to a csv file for d3.

    Reads the notebook-level DataFrame `df_update`, keeps the rows whose 'standard_gene'
    is 'polymerase', and saves them to `filename` without the index column.
    """
    is_polymerase = df_update['standard_gene'] == 'polymerase'
    df_update[is_polymerase].to_csv(filename, index=False)

In [17]:
#polymerase only, update outgroup method
save_polyermase_update_outgroup(filename='aggregated_data_for_d3/polymerase_updateog_for_d3.csv')

In [18]:
def save_polyermase_constant_outgroup(filename):
    """
    Write the polymerase rows of the constant outgroup results to a csv file for d3.

    Reads the notebook-level DataFrame `df_constant`, keeps the rows whose 'standard_gene'
    is 'polymerase', and saves them to `filename` without the index column.
    """
    is_polymerase = df_constant['standard_gene'] == 'polymerase'
    df_constant[is_polymerase].to_csv(filename, index=False)

In [19]:
#polymerase only, constant outgroup method
save_polyermase_constant_outgroup(filename='aggregated_data_for_d3/polymerase_constantog_for_d3.csv')

In [20]:
def save_rb_update_outgroup(filename):
    """
    Write the receptor-binding rows of the update outgroup results to a csv file for d3.

    Reads the notebook-level DataFrame `df_update`, keeps the rows whose 'standard_gene'
    is 'receptor_binding', and saves them to `filename` without the index column.
    """
    is_receptor_binding = df_update['standard_gene'] == 'receptor_binding'
    df_update.loc[is_receptor_binding].to_csv(filename, index=False)

In [21]:
#receptor-binding only, update outgroup method
save_rb_update_outgroup(filename='aggregated_data_for_d3/rb_updateog_for_d3.csv')

In [22]:
def save_rb_constant_outgroup(filename):
    """
    Write the receptor-binding rows of the constant outgroup results to a csv file for d3.

    Reads the notebook-level DataFrame `df_constant`, keeps the rows whose 'standard_gene'
    is 'receptor_binding', and saves them to `filename` without the index column.
    """
    is_receptor_binding = df_constant['standard_gene'] == 'receptor_binding'
    df_constant.loc[is_receptor_binding].to_csv(filename, index=False)

In [23]:
#receptor-binding only, constant outgroup method
save_rb_constant_outgroup(filename='aggregated_data_for_d3/rb_constantog_for_d3.csv')

In [24]:
def save_all_update_outgroup(filename):
    """
    Write every row of the update outgroup results (all viruses, all genes) to a csv file for d3.

    Saves the notebook-level DataFrame `df_update` to `filename` without the index column.
    """
    df_update.to_csv(path_or_buf=filename, index=False)

In [25]:
#all genes, update outgroup method
save_all_update_outgroup(filename='aggregated_data_for_d3/all_genes_updateog.csv')

In [26]:
def save_all_constant_outgroup(filename):
    """
    Write every row of the constant outgroup results (all viruses, all genes) to a csv file for d3.

    Saves the notebook-level DataFrame `df_constant` to `filename` without the index column.
    """
    df_constant.to_csv(path_or_buf=filename, index=False)

In [27]:
#all genes, constant outgroup method
save_all_constant_outgroup(filename='aggregated_data_for_d3/all_genes_constantog.csv')