In [69]:
import altair as alt
import pandas as pd
import numpy as np
import json

For each virus, plot the rate of adaptation in each gene in the genome

In [70]:
# Load the concatenated results table. keep_default_na=False stops pandas from
# converting strings in its default NA list (e.g. 'NA') into NaN.
results_csv_path = 'bhatt_results_nextstrain/concatenated_results_df/results_for_altair_all_genes.csv'
df = pd.read_csv(results_csv_path, keep_default_na=False)

In [71]:
# Add human-readable rate strings (rate rounded to 2 decimals + unit suffix),
# one column per rate measure.
for rate_column, legible_column, unit_suffix in (
    ('adaptive_subs_per_codon_per_year', 'legible_rate_percodon', '×10⁻³ muts per codon per year'),
    ('adaptive_muts_per_year', 'legible_rate_pergene', ' muts per year'),
):
    rounded_rate = df[rate_column].round(2)
    df[legible_column] = rounded_rate.astype(str) + unit_suffix


In [72]:
# Build the parallel colour-scale lists used by every chart:
#   dom[i] is a virus display name, rng[i] is the colour of its first row in df.
# drop_duplicates keeps first occurrences in file order, so the lists are
# identical on every run (iterating a set() of strings is not order-stable
# across kernel restarts) and the frame is scanned once instead of once per name.
first_row_per_name = df.drop_duplicates(subset='legible_name', keep='first')
dom = first_row_per_name['legible_name'].tolist()
rng = first_row_per_name['color'].tolist()

In [73]:
# Make gene labels readable: capitalise everything first, then apply explicit
# overrides where plain capitalisation gives the wrong spelling.
df['gene'] = df['gene'].str.capitalize()

# For rotavirus rows only, translate the single-letter gene value to a protein name.
rota_segment_to_gene = {
    'A': 'NSP1', 'C': 'VP2', 'E': 'NSP4', 'G': 'VP7', 'I': 'VP6',
    'M': 'VP3', 'N': 'NSP2', 'P': 'VP4', 'R': 'VP1', 'T': 'NSP3',
}
is_rotavirus = df['virus'] == 'rotavirusAg1p8'
df[is_rotavirus] = df[is_rotavirus].replace({"gene": rota_segment_to_gene})

# Names whose readable form is simply the fully upper-cased string ...
legible_gene_names = {
    capitalised: capitalised.upper()
    for capitalised in [
        'Ha1', 'Ha2', 'He', 'Hn', 'Na', 'Nc', 'Np', 'Ns1', 'Ns2',
        'Pa', 'Pb1', 'Pb2', 'Sh', 'Vp1', 'Vp2', 'Vp3', 'Vp4',
        '2a', '2b', '2c', '3a', '3b', '3c', '3d',
        'Hef1', 'Hef2', 'Ns', 'E1a', 'E1b 55k', '100k',
    ]
}
# ... plus the mixed-case spellings that need to be written out by hand.
legible_gene_names.update({
    '3clpro': '3CLpro',
    'Rdrp': 'RdRp',
    'Iva2': 'IVa2',
    'Ptp': 'pTP',
    'Piiia': 'pIIIa',
})

df = df.replace({"gene": legible_gene_names})

In [74]:
def readin_virus_config(virus):
    """Return the parsed JSON config for ``virus``.

    The file is looked up at ``config/adaptive_evo_config_<virus>.json``,
    relative to the current working directory.
    """
    config_path = 'config/adaptive_evo_config_{}.json'.format(virus)
    with open(config_path) as config_file:
        return json.loads(config_file.read())

In [75]:
# Build the 'nextstrain_tree_url' column: one link per row, produced by filling
# the virus's URL template (config key 'nextstrain_tree_url') with the row's
# virus, subtype/serotype and gene.

# Gene spellings used in the URLs where they differ from the lower-cased label.
norovirus_names = {'vp1':'VP1', 'vp2':'VP2', 'ntpase':'NTPase', 'vpg':'VPg', '3clpro':'3CLpro'}
influenza_names = {'ha1':'ha', 'ha2':'ha'}
influenza_viruses = ['h3n2', 'h1n1pdm', 'vic', 'yam']

# Each virus's config is read from disk once and reused for all of its rows.
configs_by_virus = {}

# URLs in row order; assigned to the dataframe in one step after the loop.
url_for_nextstrain_tree = []

for virus, subtype, gene in zip(df['virus'], df['subtype'], df['gene']):
    if virus not in configs_by_virus:
        configs_by_virus[virus] = readin_virus_config(virus)
    url_template = configs_by_virus[virus]['nextstrain_tree_url']

    #edit specific gene names to match URLs:
    gene_name = gene.lower()
    if virus == 'norovirus':
        gene_name = norovirus_names.get(gene_name, gene_name)
    if virus in influenza_viruses:
        gene_name = influenza_names.get(gene_name, gene_name)
    if virus == 'rotavirusAg1p8':
        gene_name = gene_name.upper()

    if virus == 'dengue':
        # dengue templates take the part of the subtype before the first '_'
        serotype = subtype.split('_')[0]
        nextstrain_tree_url = url_template.format(virus=virus, serotype=serotype, gene=gene_name)
    else:
        nextstrain_tree_url = url_template.format(virus=virus, subtype=subtype, gene=gene_name)

    url_for_nextstrain_tree.append(str(nextstrain_tree_url))

#add column for link to nextstrain tree url
df['nextstrain_tree_url'] = url_for_nextstrain_tree

    

In [76]:
def plot_genomewide_rates_percodon(virus_and_subtype, gene_order):
    """Plot the per-codon adaptation rate of each gene of one virus and save it as HTML.

    Reads the module-level ``df`` (results table) and ``dom``/``rng`` (colour
    scale). Unless the plotted virus is itself the benchmark, the measles 'H'
    row and the h3n2 'HA1' row are appended as comparison points, relabelled
    'Measles H' and 'H3N2 HA1'.

    Parameters
    ----------
    virus_and_subtype : str
        Value matched against ``df['virus_and_subtype']``. Also used as the
        output directory name ('rotavirusAg1p8' is written to 'rotavirusA_g1p8').
    gene_order : list of str
        Gene labels in left-to-right plotting order. Labels containing
        'Measles' or 'H3N2' are treated as benchmark points and spaced more
        tightly. Rows whose gene is not listed here are not drawn (inner merge).

    Returns
    -------
    None
        The chart is written to
        ``../atlas-of-viral-adaptation/<dir>/assets/genomewide_rates_percodon_plot.html``.

    Raises
    ------
    ValueError
        If ``gene_order`` is empty.
    """
    if len(gene_order) == 0:
        raise ValueError('gene_order must contain at least one gene label')

    #filename/path to save plot
    if virus_and_subtype == 'rotavirusAg1p8':
        virus_directory = 'rotavirusA_g1p8'
    else:
        virus_directory = virus_and_subtype
    filename = f'../atlas-of-viral-adaptation/{virus_directory}/assets/genomewide_rates_percodon_plot.html'

    #set height and width of plots
    standard_width = 600
    standard_height = 250

    #get datapoints from only this virus/subtype
    df_virus = df[df['virus_and_subtype']==virus_and_subtype]

    #get measles and H3N2 points for comparison
    # Only the 'gene' column is relabelled; a scalar DataFrame.replace would
    # rewrite a matching value in any column of the row.
    benchmarks = [('measles', 'H', 'Measles H'), ('h3n2', 'HA1', 'H3N2 HA1')]
    for bench_virus, bench_gene, bench_label in benchmarks:
        if virus_and_subtype != bench_virus:
            bench_rows = df[(df['virus_and_subtype']==bench_virus)&(df['gene']==bench_gene)]
            df_virus = pd.concat([df_virus, bench_rows.assign(gene=bench_label)])

    #make x-axis values (to allow squishing comparison points(measles and h3n2) closer together)
    regular_spacing = 2
    squished_spacing = 1

    x_tick_pos = []
    last_coord = 0
    # x position of the dashed rule between benchmark points and the plotted
    # virus; stays None when gene_order holds benchmark labels only.
    vert_divider_pos = None

    for gene in gene_order:
        is_benchmark = 'Measles' in gene or 'H3N2' in gene
        if is_benchmark:
            last_coord = last_coord+squished_spacing
        elif vert_divider_pos is None:
            # first gene of the plotted virus: place the divider, then the tick
            vert_divider_pos = last_coord+squished_spacing
            last_coord = vert_divider_pos+squished_spacing
        else:
            last_coord = last_coord+regular_spacing
        x_tick_pos.append({'gene': gene, 'xtick_pos': last_coord})

    all_xticks = [tick['xtick_pos'] for tick in x_tick_pos]

    xtick_df = pd.DataFrame(x_tick_pos)
    df_virus = df_virus.merge(xtick_df, on='gene')

    #make labelExpr: a chain of ternaries mapping each tick position to its
    #gene label, with the last gene as the fall-through value
    *leading_ticks, final_tick = x_tick_pos
    label_expr_str = "".join(
        f"datum.label == {tick['xtick_pos']} ? '{tick['gene']}' : " for tick in leading_ticks
    ) + f"'{final_tick['gene']}'"

    # generate the points
    points = alt.Chart(df_virus).mark_point(
        filled=True,
        size=200,
        opacity=1
    ).encode(
        x=alt.X('xtick_pos:Q', title='',
                axis=alt.Axis(values = all_xticks, labelExpr=label_expr_str, tickCount=len(all_xticks))),
        y=alt.Y('adaptive_subs_per_codon_per_year:Q',
                axis=alt.Axis(title=['Adaptive Mutations', 'per Codon per Year (× 10⁻³)'], format=".1f")),
        color=alt.Color('legible_name', scale=alt.Scale(domain=dom, range=rng), legend=None),
        href='nextstrain_tree_url:N',
        tooltip=[alt.Tooltip('legible_name:N', title='Virus'),
                 alt.Tooltip('legible_rate_percodon', title='Rate')]
    ).properties(
        width=standard_width,
        height=standard_height
    )

    #generate the error bars
    errorbars = alt.Chart(df_virus).mark_errorbar().encode(
        x=alt.X('xtick_pos:Q', title=''),
        y=alt.Y("lower_95ci", title=''),
        y2="upper_95ci",
        color=alt.Color('legible_name', scale=alt.Scale(domain=dom, range=rng), legend=None),
        tooltip=alt.value(None)
    ).properties(
        width=standard_width,
        height=standard_height
    )

    layers = [points, errorbars]
    if vert_divider_pos is not None:
        vert_divider = alt.Chart(pd.DataFrame({'x': [vert_divider_pos]})).mark_rule(strokeDash=[3,5]).encode(x='x')
        layers.append(vert_divider)

    chart = alt.layer(*layers, data=df_virus).configure_axis(
        grid=False
    ).configure_view(
        strokeWidth=0
    ).configure_axisX(
        labelAngle=270, labelFontSize=12
    ).configure_axisY(labelFontSize=12,titleFontSize=14, titlePadding=20)

    chart.save(filename, scale_factor=10.0)

In [80]:
# Per-codon plot for 'h3n2'; 'Measles H' is the only benchmark label listed.
plot_genomewide_rates_percodon(
    virus_and_subtype='h3n2',
    gene_order=[
        'Measles H',
        'HA1', 'HA2', 'NA', 'NP', 'PA', 'PB1', 'PB2',
    ],
)

In [81]:
# Per-codon plot for 'yam': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='yam',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'HA1', 'HA2', 'NA', 'NP', 'PA', 'PB1', 'PB2',
    ],
)

In [82]:
# Per-codon plot for 'vic': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='vic',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'HA1', 'HA2', 'NA', 'NP', 'PA', 'PB1', 'PB2',
    ],
)

In [83]:
# Per-codon plot for 'measles'; 'H3N2 HA1' is the only benchmark label listed.
plot_genomewide_rates_percodon(
    virus_and_subtype='measles',
    gene_order=[
        'H3N2 HA1',
        'N', 'P', 'V', 'C', 'M', 'F', 'H', 'L',
    ],
)

In [84]:
# Per-codon plot for 'mumps': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='mumps',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'NC', 'V', 'I', 'M', 'F', 'SH', 'HN', 'L',
    ],
)

In [85]:
# Per-codon plot for '229e': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='229e',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'Replicase1ab', 'RdRp', 'Protein4a', 'Protein4b',
        'S1', 'S2', 'Envelope', 'Membrane', 'Nucleocapsid',
    ],
)

In [86]:
# Per-codon plot for 'oc43_a': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='oc43_a',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'Replicase1ab', 'RdRp', 'HE',
        'S1', 'S2', 'Membrane', 'Nucleocapsid',
    ],
)

In [87]:
# Per-codon plot for 'nl63': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='nl63',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'Replicase1ab', 'RdRp',
        'S1', 'S2', 'Membrane', 'Nucleocapsid',
    ],
)

In [88]:
# Per-codon plot for 'rsv_A': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='rsv_A',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'NS1', 'NS2', 'N', 'P', 'M', 'SH', 'G', 'F', 'L',
    ],
)

In [89]:
# Per-codon plot for 'rsv_B': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='rsv_B',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'NS1', 'NS2', 'N', 'P', 'M', 'SH', 'G', 'F', 'L',
    ],
)

In [90]:
# Per-codon plot for 'enterovirusd68': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='enterovirusd68',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'VP4', 'VP2', 'VP3', 'VP1',
        '2A', '2B', '2C', '3A', '3B', '3C', '3D',
    ],
)

In [91]:
# Per-codon plot for 'parainfluenza_1': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='parainfluenza_1',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'N', 'P', 'C', 'M', 'F', 'HN', 'L',
    ],
)

In [92]:
# Per-codon plot for 'parainfluenza_3': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='parainfluenza_3',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'N', 'D', 'P', 'C', 'M', 'F', 'HN', 'L',
    ],
)

In [93]:
# Per-codon plot for 'influenzaC_Yamagata': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='influenzaC_Yamagata',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'PB2', 'PB1', 'P3', 'HEF1', 'HEF2', 'NP', 'M', 'NS',
    ],
)

In [94]:
# Per-codon plot for 'rotavirusAg1p8': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='rotavirusAg1p8',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'VP1', 'VP2', 'VP3', 'VP4',
        'NSP1', 'VP6', 'NSP3', 'NSP2', 'VP7', 'NSP4',
    ],
)

In [95]:
# Per-codon plot for 'parvovirusB19': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='parvovirusB19',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'NS', 'U7_5kda', 'VP1', 'X', 'U11kda', 'VP2',
    ],
)

In [61]:
# Per-codon plot for 'adenovirusB7': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='adenovirusB7',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'E1A', 'E1B 55K', 'IVa2', 'Pol', 'pTP',
        'pIIIa', 'Hexon', 'Protease', '100K', 'Fiber',
    ],
)

In [62]:
# Per-codon plot for 'adenovirusB3': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='adenovirusB3',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'E1A', 'E1B 55K', 'IVa2', 'Pol', 'pTP',
        'pIIIa', 'Hexon', 'Protease', '100K', 'Fiber',
    ],
)

In [77]:
# Per-codon plot for 'hepatitisB_A2': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='hepatitisB_A2',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'Polymerase', 'Large', 'Middle', 'Small', 'X', 'Core',
    ],
)

In [78]:
# Per-codon plot for 'hepatitisB_D3': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_percodon(
    virus_and_subtype='hepatitisB_D3',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'Polymerase', 'Large', 'Middle', 'Small', 'X', 'Core',
    ],
)

In [63]:
def plot_genomewide_rates_pergene(virus_and_subtype, gene_order):
    """Plot the per-gene adaptation rate (mutations per year) of one virus and save it as HTML.

    Reads the module-level ``df`` (results table) and ``dom``/``rng`` (colour
    scale). Unless the plotted virus is itself the benchmark, the measles 'H'
    row and the h3n2 'HA1' row are appended as comparison points, relabelled
    'Measles H' and 'H3N2 HA1'.

    Parameters
    ----------
    virus_and_subtype : str
        Value matched against ``df['virus_and_subtype']``. Also used as the
        output directory name ('rotavirusAg1p8' is written to 'rotavirusA_g1p8').
    gene_order : list of str
        Gene labels in left-to-right plotting order. Labels containing
        'Measles' or 'H3N2' are treated as benchmark points and spaced more
        tightly. Rows whose gene is not listed here are not drawn (inner merge).

    Returns
    -------
    None
        The chart is written to
        ``../atlas-of-viral-adaptation/<dir>/assets/genomewide_rates_pergene_plot.html``.

    Raises
    ------
    ValueError
        If ``gene_order`` is empty.
    """
    if len(gene_order) == 0:
        raise ValueError('gene_order must contain at least one gene label')

    #filename/path to save plot
    if virus_and_subtype == 'rotavirusAg1p8':
        virus_directory = 'rotavirusA_g1p8'
    else:
        virus_directory = virus_and_subtype
    filename = f'../atlas-of-viral-adaptation/{virus_directory}/assets/genomewide_rates_pergene_plot.html'

    #set height and width of plots
    standard_width = 600
    standard_height = 250

    #get datapoints from only this virus/subtype
    df_virus = df[df['virus_and_subtype']==virus_and_subtype]

    #get measles and H3N2 points for comparison
    # Only the 'gene' column is relabelled; a scalar DataFrame.replace would
    # rewrite a matching value in any column of the row.
    benchmarks = [('measles', 'H', 'Measles H'), ('h3n2', 'HA1', 'H3N2 HA1')]
    for bench_virus, bench_gene, bench_label in benchmarks:
        if virus_and_subtype != bench_virus:
            bench_rows = df[(df['virus_and_subtype']==bench_virus)&(df['gene']==bench_gene)]
            df_virus = pd.concat([df_virus, bench_rows.assign(gene=bench_label)])

    #make x-axis values (to allow squishing comparison points(measles and h3n2) closer together)
    regular_spacing = 2
    squished_spacing = 1

    x_tick_pos = []
    last_coord = 0
    # x position of the dashed rule between benchmark points and the plotted
    # virus; stays None when gene_order holds benchmark labels only.
    vert_divider_pos = None

    for gene in gene_order:
        is_benchmark = 'Measles' in gene or 'H3N2' in gene
        if is_benchmark:
            last_coord = last_coord+squished_spacing
        elif vert_divider_pos is None:
            # first gene of the plotted virus: place the divider, then the tick
            vert_divider_pos = last_coord+squished_spacing
            last_coord = vert_divider_pos+squished_spacing
        else:
            last_coord = last_coord+regular_spacing
        x_tick_pos.append({'gene': gene, 'xtick_pos': last_coord})

    all_xticks = [tick['xtick_pos'] for tick in x_tick_pos]

    xtick_df = pd.DataFrame(x_tick_pos)
    df_virus = df_virus.merge(xtick_df, on='gene')

    #make labelExpr: a chain of ternaries mapping each tick position to its
    #gene label, with the last gene as the fall-through value
    *leading_ticks, final_tick = x_tick_pos
    label_expr_str = "".join(
        f"datum.label == {tick['xtick_pos']} ? '{tick['gene']}' : " for tick in leading_ticks
    ) + f"'{final_tick['gene']}'"

    # generate the points
    points = alt.Chart(df_virus).mark_point(
        filled=True,
        size=200,
        opacity=1
    ).encode(
        x=alt.X('xtick_pos:Q', title='',
                axis=alt.Axis(values = all_xticks, labelExpr=label_expr_str, tickCount=len(all_xticks))),
        y=alt.Y('adaptive_muts_per_year:Q',
                axis=alt.Axis(title=['Adaptive Mutations', 'per Year'], format=".1f")),
        color=alt.Color('legible_name', scale=alt.Scale(domain=dom, range=rng), legend=None),
        href='nextstrain_tree_url:N',
        # the tooltip shows the per-gene rate string, matching the y-axis measure
        tooltip=[alt.Tooltip('legible_name:N', title='Virus'),
                 alt.Tooltip('legible_rate_pergene', title='Rate')],
    ).properties(
        width=standard_width,
        height=standard_height
    )

    #generate the error bars
    errorbars = alt.Chart(df_virus).mark_errorbar().encode(
        x=alt.X('xtick_pos:Q', title=''),
        y=alt.Y("lower_95ci_mutspergene", title=''),
        y2="upper_95ci_mutspergene",
        color=alt.Color('legible_name', scale=alt.Scale(domain=dom, range=rng), legend=None),
        tooltip=alt.value(None)
    ).properties(
        width=standard_width,
        height=standard_height
    )

    layers = [points, errorbars]
    if vert_divider_pos is not None:
        vert_divider = alt.Chart(pd.DataFrame({'x': [vert_divider_pos]})).mark_rule(strokeDash=[3,5]).encode(x='x')
        layers.append(vert_divider)

    chart = alt.layer(*layers, data=df_virus).configure_axis(
        grid=False
    ).configure_view(
        strokeWidth=0
    ).configure_axisX(
        labelAngle=270, labelFontSize=12
    ).configure_axisY(labelFontSize=12,titleFontSize=14, titlePadding=20)

    chart.save(filename)

In [277]:
# Per-gene plot for 'h3n2'; 'Measles H' is the only benchmark label listed.
plot_genomewide_rates_pergene(
    virus_and_subtype='h3n2',
    gene_order=[
        'Measles H',
        'HA1', 'HA2', 'NA', 'NP', 'PA', 'PB1', 'PB2',
    ],
)

In [278]:
# Per-gene plot for 'yam': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='yam',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'HA1', 'HA2', 'NA', 'NP', 'PA', 'PB1', 'PB2',
    ],
)

In [279]:
# Per-gene plot for 'vic': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='vic',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'HA1', 'HA2', 'NA', 'NP', 'PA', 'PB1', 'PB2',
    ],
)

In [280]:
# Per-gene plot for 'measles'; 'H3N2 HA1' is the only benchmark label listed.
plot_genomewide_rates_pergene(
    virus_and_subtype='measles',
    gene_order=[
        'H3N2 HA1',
        'N', 'P', 'V', 'C', 'M', 'F', 'H', 'L',
    ],
)

In [281]:
# Per-gene plot for 'mumps': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='mumps',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'NC', 'V', 'I', 'M', 'F', 'SH', 'HN', 'L',
    ],
)

In [None]:
# Per-gene plot for '229e': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='229e',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'Replicase1ab', 'RdRp', 'Protein4a', 'Protein4b',
        'S1', 'S2', 'Envelope', 'Membrane', 'Nucleocapsid',
    ],
)

In [None]:
# Per-gene plot for 'oc43_a': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='oc43_a',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'Replicase1ab', 'RdRp', 'HE',
        'S1', 'S2', 'Membrane', 'Nucleocapsid',
    ],
)

In [None]:
# Per-gene plot for 'nl63': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='nl63',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'Replicase1ab', 'RdRp',
        'S1', 'S2', 'Membrane', 'Nucleocapsid',
    ],
)

In [283]:
# Per-gene plot for 'rsv_A': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='rsv_A',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'NS1', 'NS2', 'N', 'P', 'M', 'SH', 'G', 'F', 'L',
    ],
)

In [284]:
# Per-gene plot for 'rsv_B': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='rsv_B',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'NS1', 'NS2', 'N', 'P', 'M', 'SH', 'G', 'F', 'L',
    ],
)

In [282]:
# Per-gene plot for 'enterovirusd68': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='enterovirusd68',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'VP4', 'VP2', 'VP3', 'VP1',
        '2A', '2B', '2C', '3A', '3B', '3C', '3D',
    ],
)

In [13]:
# Per-gene plot for 'parainfluenza_1': benchmark labels first, then the genes in plotting order.
# NOTE(review): this list contains 'D', but the per-codon call for
# 'parainfluenza_1' does not. Confirm which gene list is intended; if there is
# no 'D' row for this virus the plot gets an axis label with no point.
plot_genomewide_rates_pergene(
    virus_and_subtype='parainfluenza_1',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'N', 'D', 'P', 'C', 'M', 'F', 'HN', 'L',
    ],
)

In [14]:
# Per-gene plot for 'parainfluenza_3': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='parainfluenza_3',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'N', 'D', 'P', 'C', 'M', 'F', 'HN', 'L',
    ],
)

In [30]:
# Per-gene plot for 'influenzaC_Yamagata': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='influenzaC_Yamagata',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'PB2', 'PB1', 'P3', 'HEF1', 'HEF2', 'NP', 'M', 'NS',
    ],
)

In [46]:
# Per-gene plot for 'rotavirusAg1p8': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='rotavirusAg1p8',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'VP1', 'VP2', 'VP3', 'VP4',
        'NSP1', 'VP6', 'NSP3', 'NSP2', 'VP7', 'NSP4',
    ],
)

In [61]:
# Per-gene plot for 'parvovirusB19': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='parvovirusB19',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'NS', 'U7_5kda', 'VP1', 'X', 'U11kda', 'VP2',
    ],
)

In [64]:
# Per-gene plot for 'adenovirusB7': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='adenovirusB7',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'E1A', 'E1B 55K', 'IVa2', 'Pol', 'pTP',
        'pIIIa', 'Hexon', 'Protease', '100K', 'Fiber',
    ],
)

In [65]:
# Per-gene plot for 'adenovirusB3': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='adenovirusB3',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'E1A', 'E1B 55K', 'IVa2', 'Pol', 'pTP',
        'pIIIa', 'Hexon', 'Protease', '100K', 'Fiber',
    ],
)

In [79]:
# Per-gene plot for 'hepatitisB_A2': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='hepatitisB_A2',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'Polymerase', 'Large', 'Middle', 'Small', 'X', 'Core',
    ],
)

In [80]:
# Per-gene plot for 'hepatitisB_D3': benchmark labels first, then the genes in plotting order.
plot_genomewide_rates_pergene(
    virus_and_subtype='hepatitisB_D3',
    gene_order=[
        'Measles H', 'H3N2 HA1',
        'Polymerase', 'Large', 'Middle', 'Small', 'X', 'Core',
    ],
)