In [1]:
import altair as alt
import pandas as pd
import numpy as np
import json

In [2]:
#load the concatenated results table that every later cell builds on
results_csv = 'bhatt_results_nextstrain/concatenated_results_df/results_for_altair.csv'
df = pd.read_csv(results_csv)

In [3]:
#make color mapping lists
#dom lists every virus name once (in order of first appearance in df) and rng holds
#the color found on that virus's first row, so dom[i] is drawn with rng[i].
#drop_duplicates keeps the ordering stable between runs (a set does not) and avoids
#re-filtering the whole dataframe once per virus.
first_row_per_virus = df.drop_duplicates(subset='legible_name')
dom = first_row_per_virus['legible_name'].tolist()
rng = first_row_per_virus['color'].tolist()

In [4]:
#order to plot on the x-axis (left to right), one group of related viruses per line
virus_order = [
    'Measles', 'Mumps',
    'Parainfluenza-1', 'Parainfluenza-3',
    'Influenza A/H3N2', 'Influenza A/H1N1pdm', 'Influenza B/Vic', 'Influenza B/Yam',
    'OC43-A', '229E', 'NL63',
    'RSV-A', 'RSV-B',
    'Dengue 1-I', 'Dengue 1-III', 'Dengue 1-IV', 'Dengue 1-V',
    'Dengue 2-AA', 'Dengue 2-AI', 'Dengue 2-C',
    'Dengue 3-II', 'Dengue 3-III',
    'Dengue 4-I', 'Dengue 4-II',
    'Enterovirus D68',
    'Norovirus GII.4',
]

In [5]:
def readin_virus_config(virus):
    """Load the adaptive-evolution config for one virus.

    Opens `config/adaptive_evo_config_{virus}.json` (relative to the current
    working directory) and returns the parsed JSON content.
    """
    config_path = f'config/adaptive_evo_config_{virus}.json'
    with open(config_path) as config_file:
        return json.load(config_file)

In [6]:
#get name of each viral gene and make it legible

#display names for the three gene categories looked up in each virus config
legible_genes = {'polymerase': 'Polymerase', 'membrane_fusion': 'Membrane Fusion', 
                 'receptor_binding':'Receptor Binding'}

all_viruses = list(set(df['virus']))

#one record per (virus, gene category); the label is formatted as '<GENE> (<Category>)'
virus_gene_names = []
for v in all_viruses:
    configs = readin_virus_config(v)
    for gene_key, legible_gene in legible_genes.items():
        gene_name = configs[gene_key]['virus_gene']
        gene_name_legible = f'{gene_name.upper()} ({legible_gene})'
        virus_gene_names.append({'virus': v, 'gene': gene_key, 'legible_gene_name': gene_name_legible})

virus_gene_name_mapper = pd.DataFrame(virus_gene_names)

#add legible gene names to df
#merge on explicit keys, and without reset_index(), so that no stray 'index' column
#is carried into df and the join cannot silently pick up other shared column names
df = df.merge(virus_gene_name_mapper, on=['virus', 'gene'])

#capitalize virus family
df['virus_family'] = df['virus_family'].str.capitalize()

In [7]:
#add information about genome type
#read 'genome_type' and 'enveloped' from each virus config, then attach them to df
virus_genome_types = []
for virus_name in all_viruses:
    virus_config = readin_virus_config(virus_name)
    virus_genome_types.append({
        'virus': virus_name,
        'genome_type': virus_config['genome_type'],
        'enveloped': virus_config['enveloped'],
    })

genome_type_mapper = pd.DataFrame(virus_genome_types)

df = df.merge(genome_type_mapper, on='virus')

In [8]:
#add a column for the url address
#maps each legible virus name to the page slug that is appended to the atlas base url
#when the charts build their links
map_to_url = {
    'Measles': 'measles', 'Mumps': 'mumps',
    'Parainfluenza-1': 'parainfluenza_1', 'Parainfluenza-3': 'parainfluenza_3',
    'Influenza A/H3N2': 'h3n2', 'Influenza A/H1N1pdm': 'h1n1pdm',
    'Influenza B/Vic': 'vic', 'Influenza B/Yam': 'yam',
    'OC43-A': 'oc43_a', 'OC43-B': 'oc43_b', '229E': '229e', 'NL63': 'nl63',
    'Lassa-A': 'lassa_a', 'Lassa-B': 'lassa_b',
    'RSV-A': 'rsv_a', 'RSV-B': 'rsv_b',
    'Dengue 1-I': 'denv1_i', 'Dengue 1-III': 'denv1_iii',
    'Dengue 1-IV': 'denv1_iv', 'Dengue 1-V': 'denv1_v',
    #Dengue 2-AM previously pointed at 'denv1_am'; every other Dengue 2 entry uses denv2_*
    'Dengue 2-AA': 'denv2_aa', 'Dengue 2-AI': 'denv2_ai',
    'Dengue 2-AM': 'denv2_am', 'Dengue 2-C': 'denv2_c',
    'Dengue 3-I': 'denv3_i', 'Dengue 3-II': 'denv3_ii', 'Dengue 3-III': 'denv3_iii',
    'Dengue 4-I': 'denv4_i', 'Dengue 4-II': 'denv4_ii',
    'Zika': 'zika', 'Enterovirus D68': 'enterovirusd68',
    'Norovirus GII.4': 'norovirus_gii4',
}

#names missing from the mapping end up as NaN in url_ending
df['url_ending'] = df['legible_name'].map(map_to_url)


In [9]:
#make x-axis values (to allow squishing viruses closer together or spreading them apart)
regular_spacing = 3
squished_spacing = 1

#first non-enveloped virus: it is pushed a double gap away from the enveloped viruses,
#and the middle of that gap is remembered so a vertical divider can be drawn there
first_nonenveloped = 'Enterovirus D68'

x_tick_pos = []
all_xticks = []
last_coord = 0
dengue_serotypes_already_seen = []

for virus_name in virus_order:
    if 'Dengue' in virus_name:
        #first 8 characters identify the serotype, e.g. 'Dengue 1' from 'Dengue 1-I';
        #regular gap before a new serotype, squished gap between its genotypes
        serotype = virus_name[0:8]
        if serotype in dengue_serotypes_already_seen:
            gap = squished_spacing
        else:
            gap = regular_spacing
            dengue_serotypes_already_seen.append(serotype)
    elif virus_name == first_nonenveloped:
        enveloped_vert_divider = last_coord + regular_spacing
        gap = regular_spacing*2
    else:
        gap = regular_spacing

    tick_pos = last_coord + gap
    last_coord = tick_pos

    all_xticks.append(tick_pos)
    x_tick_pos.append({'legible_name': virus_name, 'xtick_pos': tick_pos})

#attach the tick position of each virus to df
xtick_df = pd.DataFrame(x_tick_pos)

df = df.merge(xtick_df, on='legible_name')

In [10]:
#make labelExpr
#chain of ternaries that turns a numeric tick label back into the virus name:
#every tick except the last gets an explicit test, and the last name is the fallback
label_expr_str = ""

if x_tick_pos:
    tick_tests = [
        f"datum.label == {tick['xtick_pos']} ? '{tick['legible_name']}' : "
        for tick in x_tick_pos[:-1]
    ]
    label_expr_str = "".join(tick_tests) + f"'{x_tick_pos[-1]['legible_name']}'"

In [11]:
#add column for legible rate (rounded to 2 decimals, with the unit appended as text)
percodon_rate_text = df['adaptive_subs_per_codon_per_year'].round(2).astype(str)
df['legible_rate_percodon'] = percodon_rate_text + '×10⁻³ muts per codon per year'

pergene_rate_text = df['adaptive_muts_per_year'].round(2).astype(str)
df['legible_rate_pergene'] = pergene_rate_text + ' muts per year'


#subset dataframes to each gene
df_polymerase = df.loc[df['gene'].eq('polymerase')]
df_fusion = df.loc[df['gene'].eq('membrane_fusion')]
df_receptorbinding = df.loc[df['gene'].eq('receptor_binding')]


In [12]:
#get relative sizes of plot for each gene in order to make y-axis scale the same, 
#but truncate axes of polymerase and fusion

#Series.min()/.max() skip missing values; the builtin min()/max() give
#position-dependent results as soon as a NaN is present in the column
p_floor = df_polymerase['lower_95ci'].min()
p_ceiling = df_polymerase['upper_95ci'].max()

f_floor = df_fusion['lower_95ci'].min()
f_ceiling = df_fusion['upper_95ci'].max()

r_floor = df_receptorbinding['lower_95ci'].min()
r_ceiling = df_receptorbinding['upper_95ci'].max()

#relative extent of y-axes (compared to receptor-binding, which has largest range)
#these ratios are later used to scale panel heights relative to the receptor-binding panel
r_range = r_ceiling-r_floor
p_relative_range = (p_ceiling-p_floor)/r_range
f_relative_range = (f_ceiling-f_floor)/r_range


In [14]:
#set height and width of plots
standard_width = 800
standard_height = 250

#pieces shared by the polymerase and receptor-binding panels
atlas_url_expr = 'https://blab.github.io/atlas-of-viral-adaptation/' + alt.datum.url_ending
point_style = dict(filled=True, size=200, opacity=1)
labelled_x = alt.X(
    'xtick_pos:Q', title='',
    axis=alt.Axis(values=all_xticks, labelExpr=label_expr_str, tickCount=len(all_xticks)),
)
virus_color = alt.Color('legible_name', scale=alt.Scale(domain=dom, range=rng), legend=None)
percodon_tooltip = [
    alt.Tooltip('legible_name:N', title='Virus'),
    alt.Tooltip('virus_family:N', title='Family'),
    alt.Tooltip('genome_type:N', title='Genome Type'),
    alt.Tooltip('enveloped:N', title='Enveloped'),
    alt.Tooltip('legible_gene_name:N', title='Protein/Subunit'),
    alt.Tooltip('legible_rate_percodon', title='Rate'),
]
ci_lower_y = alt.Y("lower_95ci", title='')
divider_label_style = dict(color='#6e6e6e', dy=-120)

# generate the points (each point links to the virus page through the computed 'url' field)
#the polymerase panel is shrunk by p_relative_range relative to the receptor-binding panel
points_p = (
    alt.Chart(df_polymerase)
    .transform_calculate(url=atlas_url_expr)
    .mark_point(**point_style)
    .encode(
        x=labelled_x,
        y=alt.Y('adaptive_subs_per_codon_per_year:Q', title='', axis=alt.Axis(format=".1f", tickMinStep=1)),
        color=virus_color,
        href='url:N',
        tooltip=percodon_tooltip,
    )
    .properties(width=standard_width, height=standard_height*p_relative_range, title='Polymerase')
)

points_r = (
    alt.Chart(df_receptorbinding)
    .transform_calculate(url=atlas_url_expr)
    .mark_point(**point_style)
    .encode(
        x=labelled_x,
        y=alt.Y('adaptive_subs_per_codon_per_year:Q', title='', axis=alt.Axis(format=".1f")),
        color=virus_color,
        href='url:N',
        tooltip=percodon_tooltip,
    )
    .properties(width=standard_width, height=standard_height, title='Receptor-Binding')
)

#generate the error bars (95% CI bounds, no tooltip)
errorbars_p = (
    alt.Chart(df_polymerase)
    .mark_errorbar()
    .encode(
        x=alt.X('xtick_pos:Q', sort=virus_order, title='', axis=alt.Axis(labels=False)),
        y=ci_lower_y,
        y2="upper_95ci",
        color=virus_color,
        tooltip=alt.value(None),
    )
    .properties(width=standard_width, height=standard_height*p_relative_range)
)

errorbars_r = (
    alt.Chart(df_receptorbinding)
    .mark_errorbar()
    .encode(
        x=alt.X('xtick_pos:Q', sort=virus_order, title=''),
        y=ci_lower_y,
        y2="upper_95ci",
        color=virus_color,
        tooltip=alt.value(None),
    )
    .properties(width=standard_width, height=standard_height)
)

#vertical line to divide enveloped from non-enveloped viruses
divider_position = pd.DataFrame({'x': [enveloped_vert_divider]})
vert_divider = alt.Chart(divider_position).mark_rule(strokeDash=[3,5]).encode(x='x')

#captions on either side of the divider; dx is in pixels and pushes the text away
#from the line (negative = to the left, positive = to the right)
enveloped_text = vert_divider.mark_text(
    align='right', **divider_label_style, dx=-10
).encode(text=alt.value(['enveloped', 'viruses']))
nonenveloped_text = vert_divider.mark_text(
    align='left', **divider_label_style, dx=10
).encode(text=alt.value(['non-enveloped', 'viruses']))

#y-axis label for both plots: a larger heading (text) and a smaller line with the
#units (text2), both rotated and placed at fixed pixel offsets
text, text2 = (
    alt.Chart().mark_text(
        align="center", baseline="bottom", fontSize=label_font_size,
        fontWeight=400, angle=270, color='black'
    ).encode(
        x=alt.value(pixels_from_left),
        y=alt.value(160),  # pixels from top
        text=alt.value([label_text])
    )
    for label_font_size, pixels_from_left, label_text in [
        (16, 10, 'Rate of Adaptation'),
        (12, 0, 'Adaptive Mutations per Codon per Year (× 10⁻³)'),
    ]
)

#layer the points and error bars
top = alt.layer(points_p, errorbars_p, vert_divider, data=df_polymerase)
bottom = alt.layer(points_r, errorbars_r, vert_divider, enveloped_text, nonenveloped_text,
                   data=df_receptorbinding)

#stack the genes
plot_layout = alt.vconcat(top, bottom, spacing=10)

#add the yaxis label
chart = (
    alt.hconcat(text, text2, plot_layout, spacing=0)
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
    .configure_title(fontSize=18, anchor='start', color='#c6c6c6')
    .resolve_scale(y='independent', x='independent')
    .configure_axisX(labelAngle=270)
)

chart.save('../atlas-of-viral-adaptation/assets/adaptation_percodon_overview_plot.html')

chart


In [15]:
#set height and width of plots
standard_width = 800
standard_height = 250

#pieces shared by the polymerase and receptor-binding panels
atlas_url_expr = 'https://blab.github.io/atlas-of-viral-adaptation/' + alt.datum.url_ending
point_style = dict(filled=True, size=200, opacity=1)
labelled_x = alt.X(
    'xtick_pos:Q', title='',
    axis=alt.Axis(values=all_xticks, labelExpr=label_expr_str, tickCount=len(all_xticks)),
)
pergene_rate_y = alt.Y('adaptive_muts_per_year:Q', title='', axis=alt.Axis(format=".1f"))
virus_color = alt.Color('legible_name', scale=alt.Scale(domain=dom, range=rng), legend=None)
pergene_tooltip = [
    alt.Tooltip('legible_name:N', title='Virus'),
    alt.Tooltip('virus_family:N', title='Family'),
    alt.Tooltip('genome_type:N', title='Genome Type'),
    alt.Tooltip('enveloped:N', title='Enveloped'),
    alt.Tooltip('legible_gene_name:N', title='Protein/Subunit'),
    alt.Tooltip('legible_rate_pergene', title='Rate'),
]
ci_lower_y = alt.Y("lower_95ci_mutspergene", title='')
divider_label_style = dict(color='#6e6e6e', dy=-120)

# generate the points (each point links to the virus page through the computed 'url' field)
#NOTE(review): the polymerase height reuses p_relative_range, which was computed from the
#per-codon CI columns (lower_95ci/upper_95ci), while this figure plots the per-gene
#columns - confirm that is the intended panel height
points_p = (
    alt.Chart(df_polymerase)
    .transform_calculate(url=atlas_url_expr)
    .mark_point(**point_style)
    .encode(
        x=labelled_x,
        y=pergene_rate_y,
        color=virus_color,
        href='url:N',
        tooltip=pergene_tooltip,
    )
    .properties(width=standard_width, height=standard_height*p_relative_range, title='Polymerase')
)

points_r = (
    alt.Chart(df_receptorbinding)
    .transform_calculate(url=atlas_url_expr)
    .mark_point(**point_style)
    .encode(
        x=labelled_x,
        y=pergene_rate_y,
        color=virus_color,
        href='url:N',
        tooltip=pergene_tooltip,
    )
    .properties(width=standard_width, height=standard_height, title='Receptor-Binding')
)

#generate the error bars (95% CI bounds in mutations per gene, no tooltip)
errorbars_p = (
    alt.Chart(df_polymerase)
    .mark_errorbar()
    .encode(
        x=alt.X('xtick_pos:Q', sort=virus_order, title='', axis=alt.Axis(labels=False)),
        y=ci_lower_y,
        y2="upper_95ci_mutspergene",
        color=virus_color,
        tooltip=alt.value(None),
    )
    .properties(width=standard_width, height=standard_height*p_relative_range)
)

errorbars_r = (
    alt.Chart(df_receptorbinding)
    .mark_errorbar()
    .encode(
        x=alt.X('xtick_pos:Q', sort=virus_order, title=''),
        y=ci_lower_y,
        y2="upper_95ci_mutspergene",
        color=virus_color,
        tooltip=alt.value(None),
    )
    .properties(width=standard_width, height=standard_height)
)

#vertical line to divide enveloped from non-enveloped viruses
divider_position = pd.DataFrame({'x': [enveloped_vert_divider]})
vert_divider = alt.Chart(divider_position).mark_rule(strokeDash=[3,5]).encode(x='x')

#captions on either side of the divider; dx is in pixels and pushes the text away
#from the line (negative = to the left, positive = to the right)
enveloped_text = vert_divider.mark_text(
    align='right', **divider_label_style, dx=-10
).encode(text=alt.value(['enveloped', 'viruses']))
nonenveloped_text = vert_divider.mark_text(
    align='left', **divider_label_style, dx=10
).encode(text=alt.value(['non-enveloped', 'viruses']))

#y-axis label for both plots: a larger heading (text) and a smaller line with the
#units (text2), both rotated and placed at fixed pixel offsets
text, text2 = (
    alt.Chart().mark_text(
        align="center", baseline="bottom", fontSize=label_font_size,
        fontWeight=400, angle=270, color='black'
    ).encode(
        x=alt.value(pixels_from_left),
        y=alt.value(140),  # pixels from top
        text=alt.value([label_text])
    )
    for label_font_size, pixels_from_left, label_text in [
        (16, 10, 'Rate of Adaptation'),
        (12, 0, 'Adaptive Mutations per Year'),
    ]
)

#layer the points and error bars
top = alt.layer(points_p, errorbars_p, vert_divider, data=df_polymerase)
bottom = alt.layer(points_r, errorbars_r, vert_divider, enveloped_text, nonenveloped_text,
                   data=df_receptorbinding)

#stack the genes
plot_layout = alt.vconcat(top, bottom, spacing=10)

#add the yaxis label
chart = (
    alt.hconcat(text, text2, plot_layout, spacing=0)
    .configure_axis(grid=False)
    .configure_view(strokeWidth=0)
    .configure_title(fontSize=18, anchor='start', color='#c6c6c6')
    .resolve_scale(y='independent', x='independent')
    .configure_axisX(labelAngle=270)
)

chart.save('../atlas-of-viral-adaptation/assets/adaptation_pergene_overview_plot.html')

chart
