In [156]:
import pandas as pd
import numpy as np
import json
from Bio import SeqIO

In [3]:
# Read the two augur exports used below: the tree JSON and the sequences JSON
# (the latter is looked up by clade id when tips are collected).
with open('augur_output/flu_seasonal_h3n2_ha_6y_tree.json', 'r') as tree_handle:
    tree_6y = json.load(tree_handle)

with open('augur_output/flu_seasonal_h3n2_ha_6y_sequences.json', 'r') as seq_handle:
    seq_6y = json.load(seq_handle)

In [177]:
#Load genbank file with reference HA sequence
# Slices are 0-based, end-exclusive nucleotide coordinates. Each one spans a
# whole number of codons so translate() never sees a trailing partial codon
# (Biopython drops it with a warning).
for seq_record in SeqIO.parse('h3n2_outgroup.gb', 'genbank'):
    # If the file holds more than one record, the last record's values are kept.
    SP_ref = seq_record.seq[0:48].translate()        # 48 nt = 16 codons; HA1 begins at nt 48
    HA1_ref = seq_record.seq[48:1035].translate()    # 987 nt = 329 codons
    HA2_ref = seq_record.seq[1035:1698].translate()  # 663 nt = 221 codons

In [199]:
#Input desired 0-based HA1 amino acid residue positions, find reference amino acid

positions = [159,185]

# Pair each position (as a string) with the reference residue found at that
# index of the translated HA1 reference.
position_refaa = [(str(pos), HA1_ref[pos]) for pos in positions]


In [195]:
#Pull out tree tips from tree.json and HA1 muts (not just tip-specific) from sequences.json
#Generalize for any aa position, by entering position and ref AA as tuple

# strain name -> record list; filled in place by traverse()
tip_muts = {}

def traverse(branch, seq, pos_list):
    """Collect one record per tip below ``branch`` into the global ``tip_muts``.

    A node without a 'children' key is treated as a tip. For every tip,
    ``tip_muts[strain]`` is set to a list holding, in order:
    the node's HA1, HA2 and SigPep ``aa_muts``, its ``num_date``, its
    ``clade_membership``, the HA1 entry of ``seq`` for that node, and then one
    residue string per entry of ``pos_list``.

    Parameters
    ----------
    branch : dict
        Tree node to start from (nested via 'children' lists).
    seq : dict
        Maps ``str(node['clade'])`` to a dict with an 'HA1' mapping of
        position key -> residue.
    pos_list : iterable of (position_key, ref_aa)
        ``position_key`` is looked up in the tip's HA1 mapping; when it is
        absent, ``ref_aa`` is used instead.

    Returns
    -------
    None
        Results are written to ``tip_muts``.
    """
    # Explicit stack instead of recursion: the depth of the tree no longer
    # counts against the interpreter's recursion limit.
    pending = [branch]
    while pending:
        node = pending.pop()

        if 'children' in node:
            # Push in reverse so children are still visited left-to-right,
            # keeping the insertion order of tip_muts unchanged.
            pending.extend(reversed(node['children']))
            continue

        aa_muts = node['aa_muts']
        attr = node['attr']
        ha1_seq = seq[str(node['clade'])]['HA1']

        genotypes = [str(ha1_seq[pos]) if pos in ha1_seq else str(ref_aa)
                     for pos, ref_aa in pos_list]

        tip_muts[node['strain']] = [aa_muts['HA1'], aa_muts['HA2'],
                                    aa_muts['SigPep'], attr['num_date'],
                                    attr['clade_membership'], ha1_seq
                                    ] + genotypes

    

    
# Walk the loaded tree and fill tip_muts with one entry per tip,
# genotyped at the positions listed in position_refaa.
traverse(tree_6y, seq_6y, position_refaa)

In [196]:
#Organize data in a DF
# tip_muts maps strain -> list of values, so transpose to get one row per
# strain, then move the strain names out of the index into a column.
df = pd.DataFrame(tip_muts).T.reset_index()
df.columns = ['strain', 'tip_HA1_muts', 'tip_HA2_muts', 'tip_SigPep_muts', 'date', 'clade', 'HA1_muts']+positions

# Label passage type from the strain name. Strains matching neither pattern
# get the string '0'. The default is given explicitly as a string because
# recent NumPy releases raise TypeError when the string choices and np.select's
# implicit integer default 0 have no common dtype.
df['passage'] = np.select([df.strain.str.contains('egg'), df.strain.str.contains('cell')],
                          ['egg', 'cell'], default='0')


df.head(20)

Unnamed: 0,strain,tip_HA1_muts,tip_HA2_muts,tip_SigPep_muts,date,clade,HA1_muts,159,185,passage
0,A/AbuDhabi/221/2017-cell,[],[A201T],[],2017.97,A1b/135K,"{u'157': u'N', u'211': u'A', u'159': u'T', u'1...",T,S,cell
1,A/AbuDhabi/258/2018-cell,[],[],[],2018.0,A1b/135K,"{u'157': u'N', u'211': u'A', u'159': u'T', u'1...",T,S,cell
2,A/Acores/SU43/2012-egg,[],[],[A16T],2012.85,unassigned,"{u'229': u'V', u'157': u'N', u'211': u'A', u'1...",K,S,egg
3,A/Adana/A15/2017-cell,[],[],[],2017.09,A3,"{u'157': u'N', u'156': u'L', u'159': u'T', u'1...",T,S,cell
4,A/Afghanistan/243/2016-cell,[],[],[],2016.44,3c3.A,"{u'144': u'S', u'157': u'N', u'211': u'A', u'3...",K,S,cell
5,A/Afghanistan/437/2017,[],[],[],2017.85,A1b/135K,"{u'157': u'N', u'211': u'A', u'159': u'T', u'1...",T,S,0
6,A/Afghanistan/624/2017,[],[],[],2017.9,A1b/135K,"{u'157': u'N', u'211': u'A', u'159': u'T', u'1...",T,S,0
7,A/Aichi/116/2013-cell,[],[],[],2013.43,3c3,"{u'144': u'S', u'157': u'N', u'141': u'G', u'3...",K,S,cell
8,A/Aichi/118/2013-cell,[],[],[],2013.51,3c3,"{u'144': u'S', u'157': u'N', u'141': u'G', u'3...",K,S,cell
9,A/Aichi/119/2013-cell,[],[],[],2013.83,3c3,"{u'144': u'S', u'157': u'N', u'141': u'G', u'3...",K,S,cell


In [197]:
#Fraction (0-1, not percent) of sequences with each genotype in column 185, per passage type
by_passage = df.groupby('passage')
by_passage[185].value_counts(normalize=True)

passage  185
0        S      0.990276
         V      0.008104
         A      0.001621
cell     S      1.000000
egg      S      0.855805
         V      0.125468
         R      0.009363
         D      0.003745
         A      0.001873
         E      0.001873
         I      0.001873
Name: 185, dtype: float64