In [5]:
import pandas as pd
import numpy as np
import json
from Bio import SeqIO
import seaborn as sns
import matplotlib.pyplot as plt
from ipysankeywidget import SankeyWidget
from ipywidgets import Layout
%matplotlib inline

In [6]:
# Load the table analysed below. Later cells read its `<site>`,
# `<site>_lastnode` and `mut<site>` columns (one triple per site label).
df = pd.read_csv('../dataframes/h3n2_ha_12y_hi_egg.csv')

In [336]:
# Site labels are whatever follows the 'mut' prefix in the column names,
# sorted as strings.
sites = sorted(col[3:] for col in df.columns if col[:3] == 'mut')

In [337]:
# Palette used to colour genotypes: nine RGB triples, each rendered as an
# 'rgba(R,G,B, 0.9)' string (i.e. with a fixed 0.9 alpha).
palette_rgb = [
    (26, 152, 80),
    (254, 224, 139),
    (50, 136, 189),
    (244, 109, 67),
    (253, 174, 97),
    (217, 239, 139),
    (102, 189, 99),
    (215, 48, 39),
    (166, 217, 106),
]
colors = ['rgba(%d,%d,%d, 0.9)' % rgb for rgb in palette_rgb]

# Vertical ordering of the nodes is set by hand so the diagrams read cleanly.
# Each entry starts out as the ordered list of "before" node ids for a site.
site_order = {
    '138': ['138A', '138S'],
    '156': ['156H', '156R', '156Q', '156N'],
    '160': ['160T', '160A', '160I', '160R', '160K'],
    '186': ['186G', '186V', '186D'],
    '194': ['194L', '194P', '194M', '194I'],
    '203': ['203T'],
    '219': ['219S', '219F', '219Y'],
    '225': ['225D', '225N', '225G'],
    '246': ['246N', '246H', '246S', '246D'],
}

# Expand every site found in the data into three layers:
# [before nodes, the single '<site>mut' node, '<node>_after' nodes].
for site in sites:
    before_band = site_order[site]
    mutation_band = [site + 'mut']
    after_band = [node + '_after' for node in before_band]
    site_order[site] = [before_band, mutation_band, after_band]

# Site 160 gets a completely different ordering on the "after" side.
site_order['160'][2] = ['160T_after', '160K_after', '160A_after', '160I_after', '160R_after']

# The remaining overrides only append extra "after" nodes that have no
# counterpart in the "before" list.
extra_after_nodes = {
    '186': ['186A_after', '186E_after', '186I_after', '186R_after', '186S_after'],
    '194': ['194V_after'],
    '203': ['203K_after', '203I_after'],
    '246': ['246K_after', '246T_after'],
}
for label, extras in extra_after_nodes.items():
    site_order[label][2] = site_order[label][2] + extras

# Build, for every site, the counts and colours that feed the Sankey diagrams.
#
# sankey_data[site] holds six genotype-keyed dicts plus the node ordering:
#   'before' / 'after'             rows carrying the genotype in `<site>_lastnode` / `<site>`
#   'before_mut' / 'after_mut'     same, restricted to rows where `mut<site>` is True
#   'before_nomut' / 'after_nomut' same, restricted to rows where `mut<site>` is False
#   'order'                        the layer ordering taken from site_order[site]
# Each genotype entry is {'count': int, 'color': rgba string}; only the *_mut
# entries get a genotype-specific colour, everything else is neutral grey.
NEUTRAL_COLOR = 'rgba(175,175,175,0.5)'

sankey_data = {}
for site in sites:
    site_key = str(site)
    before_col = df[site_key + '_lastnode']
    after_col = df[site_key]

    # Hoisted out of the genotype loops: these masks do not depend on genotype.
    # (`== True` / `== False` are element-wise pandas comparisons, not identity tests.)
    mutated = df['mut' + site_key] == True
    unmutated = df['mut' + site_key] == False

    before_genotypes = list(before_col.unique())
    after_genotypes = list(after_col.unique())

    # Sort before assigning colours: iterating a bare set of strings follows
    # hash order, which changes between interpreter sessions, so the same
    # genotype could get a different colour on every kernel restart.
    # `key=str` keeps the sort well-defined if a non-string value is present.
    genotypes = sorted(set(before_genotypes + after_genotypes), key=str)
    # Wrap around the palette rather than raising IndexError when a site has
    # more genotypes than there are colours.
    cmap = {genotype: colors[i % len(colors)] for i, genotype in enumerate(genotypes)}

    site_data = {stage: {} for stage in ('before', 'before_mut', 'before_nomut',
                                         'after', 'after_mut', 'after_nomut')}

    for stage, column, stage_genotypes in (('before', before_col, before_genotypes),
                                           ('after', after_col, after_genotypes)):
        for genotype in stage_genotypes:
            carries = column == genotype
            # int(...) keeps the counts as plain Python ints, as len() did.
            site_data[stage][genotype] = {
                'count': int(carries.sum()),
                'color': NEUTRAL_COLOR}
            site_data[stage + '_mut'][genotype] = {
                'count': int((carries & mutated).sum()),
                'color': cmap[genotype]}
            site_data[stage + '_nomut'][genotype] = {
                'count': int((carries & unmutated).sum()),
                'color': NEUTRAL_COLOR}

    site_data['order'] = site_order[site]
    sankey_data[site] = site_data


In [347]:
# Sankey diagram for one site: genotype before egg-passaging -> the site's
# mutation node -> genotype after egg-passaging.
site = '246'
layout = Layout(width="1000", height="400")

site_data = sankey_data[site]
mut_node = site + 'mut'

# Links come in two types: 'x' for the *_mut counts (drawn in fixed highlight
# colours here) and 'y' for the *_nomut counts (drawn in the stored colour).
into_mut_x = [
    {'source': site + genotype, 'target': mut_node,
     'value': info['count'], 'type': 'x', 'color': 'rgba(253,174,97, 0.8)'}
    for genotype, info in site_data['before_mut'].items()
]
into_mut_y = [
    {'source': site + genotype, 'target': mut_node,
     'value': info['count'], 'type': 'y', 'color': info['color']}
    for genotype, info in site_data['before_nomut'].items()
]
out_of_mut_x = [
    {'source': mut_node, 'target': site + genotype + '_after',
     'value': info['count'], 'type': 'x', 'color': 'rgba(215,48,39, 0.8)'}
    for genotype, info in site_data['after_mut'].items()
]
out_of_mut_y = [
    {'source': mut_node, 'target': site + genotype + '_after',
     'value': info['count'], 'type': 'y', 'color': info['color']}
    for genotype, info in site_data['after_nomut'].items()
]
links = into_mut_x + into_mut_y + out_of_mut_x + out_of_mut_y

# Node ids: '<site><genotype>' on the left, '<site><genotype>_after' on the
# right; both sides are titled '<site><genotype>'.
before_ids = [site + genotype for genotype in site_data['before']]
after_ids = [site + genotype + '_after' for genotype in site_data['after']]

nodes = [{'id': node_id, 'title': node_id, 'style': 'process'} for node_id in before_ids]
nodes = nodes + [{'id': mut_node, 'title': ''}]
nodes = nodes + [{'id': site + genotype + '_after', 'title': site + genotype, 'style': 'process'}
                 for genotype in site_data['after']]

groups = [
    {'id': 'before', 'title': 'before egg-passaging', 'nodes': before_ids},
    {'id': 'mut', 'title': 'mutation', 'nodes': [mut_node]},
    {'id': 'after', 'title': 'after egg-passaging', 'nodes': after_ids},
]

order = site_data['order']

SankeyWidget(links=links, nodes=nodes, groups=groups, order=order,
             align_link_types=True, layout=layout,
             margins=dict(top=20, bottom=0, left=100, right=200))


SankeyWidget(align_link_types=True, groups=[{'id': 'before', 'title': 'before egg-passaging', 'nodes': ['246N'…

In [339]:
# One tall Sankey diagram stacking every site: before -> mutation -> after.
layout = Layout(width="1000", height="2000")

# Links are emitted stage by stage (all sites for 'before_mut', then all sites
# for 'before_nomut', ...). Type 'x' marks the *_mut counts and 'y' the
# *_nomut counts; every link takes the colour stored with its count.
links = [
    {'source': s + genotype, 'target': s + 'mut',
     'value': info['count'], 'type': link_type, 'color': info['color']}
    for stage, link_type in (('before_mut', 'x'), ('before_nomut', 'y'))
    for s in sites
    for genotype, info in sankey_data[s][stage].items()
] + [
    {'source': s + 'mut', 'target': s + genotype + '_after',
     'value': info['count'], 'type': link_type, 'color': info['color']}
    for stage, link_type in (('after_mut', 'x'), ('after_nomut', 'y'))
    for s in sites
    for genotype, info in sankey_data[s][stage].items()
]

# Node ids per column, in site order.
before_ids = [s + genotype for s in sites for genotype in sankey_data[s]['before']]
mut_ids = [s + 'mut' for s in sites]
after_ids = [s + genotype + '_after' for s in sites for genotype in sankey_data[s]['after']]

nodes = [{'id': node_id, 'title': node_id, 'style': 'process'} for node_id in before_ids]
nodes = nodes + [{'id': node_id, 'title': ''} for node_id in mut_ids]
nodes = nodes + [{'id': s + genotype + '_after', 'title': s + genotype, 'style': 'process'}
                 for s in sites for genotype in sankey_data[s]['after']]

groups = [
    {'id': 'before', 'title': 'before egg-passaging', 'nodes': before_ids},
    {'id': 'mut', 'title': 'mutated during egg-passaging', 'nodes': mut_ids},
    {'id': 'after', 'title': 'after egg-passaging', 'nodes': after_ids},
]

# Three layers (before, mutation, after); within each layer, one list of node
# ids per site, taken from that site's stored ordering.
order = [[sankey_data[s]['order'][layer] for s in sites] for layer in range(3)]

SankeyWidget(links=links, nodes=nodes, groups=groups, order=order,
             align_link_types=True, layout=layout,
             margins=dict(top=80, bottom=0, left=100, right=200))


SankeyWidget(align_link_types=True, groups=[{'id': 'before', 'title': 'before egg-passaging', 'nodes': ['138A'…