In [56]:
#Toy example to show mutation-calling method
#Deals with mini clusters of -egg sequences

In [1]:
import pandas as pd
import numpy as np
import ast
import re

In [14]:
tree = {'mut': ['A1B'],
        'node': '1',
        'passage': '',
        'seq':'BBBCE',
        'children': [{'mut': ['B2C'],
                  'node': '2',
                  'passage': '',
                  'seq':'BCBCE',
                  'children': [{'mut': ['B3C'],
                            'node': '3',
                            'passage': '',
                            'seq':'BCCCE',
                            'children': [{'mut':[],
                                        'node':'15',
                                        'seq':'BCCCE',
                                        'passage': 'cell'},
                                        {'mut':['C4D', 'C3D'],
                                      'node': '4',
                                      'passage': '',
                                      'seq':'BCDDE',
                                      'children': [{'mut':['E5A'],
                                               'node':'6',
                                                'seq':'BCDDA',
                                               'passage': 'egg'},
                                               {'mut':['C2F'],
                                               'node':'7',
                                                'seq':'BFDDE',
                                               'passage': 'egg'},
                                               {'mut':[],
                                               'node':'10',
                                               'passage': '',
                                                'seq':'BCDDE',
                                               'children': [{'mut':['D3A', 'C2F', 'B1A'],
                                                        'node': '11',
                                                         'seq':'AFADE',
                                                        'passage':'egg'}, 
                                                        {'mut': [],
                                                        'node': '12',
                                                         'seq':'BCDDE',
                                                        'passage':'egg'}]}]}]},
                           {'mut': ['E5A'],
                            'node': '5',
                            'passage': '',
                            'seq':'BCBCA',
                            'children': [{'mut': ['C2A'],
                                     'node': '13',
                                      'seq':'BCABCA',
                                     'passage': 'egg'},
                                    {'mut': [],
                                     'node': '14',
                                      'seq':'BCBCA',
                                     'passage': 'cell'}]},
                           {'mut': ['B1D'],
                            'node': '8', 
                            'seq':'DCBCE',
                            'passage': 'cell'}]},
                 {'mut': ['B3A'],
                  'node': '9',
                  'seq':'BBACE',
                 'passage': 'cell'}]}

In [15]:
positions = [1, 3, 5]

tip_muts = {}

def traverse(branch, pos_list):

    #keep track of mutations at internal nodes
    if 'children' in branch.keys():
        for child in branch['children']:
            if child['mut']:
                traverse_aa.append({str(child['node']):child['mut']})
                traverse(child, pos_list)
                traverse_aa.remove({str(child['node']):child['mut']})

            else:
                traverse(child, pos_list)

    elif 'children' not in branch.keys():

        aa_mut_clade_list = [str(mut) for mut in traverse_aa]
        muts_list = [str(mut) for sublist in [list(ast.literal_eval(x).values())[0] 
                                              for x in aa_mut_clade_list] for mut in sublist]
        last_node = muts_list
        
        if branch['mut']:
            branch_tip_muts = len(branch['mut'])
            last_node = last_node[:-(branch_tip_muts)]

        tip_sequence = branch['seq']
        last_node_sequence = 'BBBCE'
        
        for mut in last_node:
            internal_mut_pos = int(re.findall('\d+', mut)[0])
            internal_mut_aa = mut[-1:]
            last_node_sequence = (last_node_sequence[:internal_mut_pos-1] + 
                                  internal_mut_aa + last_node_sequence[internal_mut_pos:])

        tip_muts[branch['node']]=([branch['mut'], branch['passage'], branch['seq'], aa_mut_clade_list] + 
                                  [tip_sequence[pos-1] for pos in pos_list] + 
                                  [last_node_sequence[pos-1] for pos in pos_list])

traverse_aa = []
traverse(tree, positions)

In [16]:
df = pd.DataFrame(tip_muts).T
df.reset_index(inplace=True)
df.columns = ['strain', 'tip_muts', 'passage', 'seq', 'aa_mut_list'] + positions + [str(x)+'_lastnode' for x in positions]
df.aa_mut_list = df.aa_mut_list.map(list)

In [17]:
for p in positions:
    df['mut'+str(p)] = np.select(
    (df[p]==df[str(p)+'_lastnode'], df[p]!=df[str(p)+'_lastnode']),
    (False, True))
for p in positions:
    df['aa_mut'+str(p)] = np.where(df['mut'+str(p)]==1, df[str(p)+'_lastnode']+str(p)+df[p], None)

In [18]:
max_internal_length=df['aa_mut_list'].map(len).max()

#Find clusters of egg-passaged sequences,
#allow mutations shared by these clusters to be called as mutations in egg or strains
#If multiple mutations occur at the same site within cluster, most recent mutation should be taken
#Tips=cluster of size 1, so tip mutations override ancestral
for internal_branch in range(0,max_internal_length):
    sub_df = df[df['aa_mut_list'].map(len) > internal_branch]

    group= sub_df.groupby((sub_df.aa_mut_list.apply(lambda col: col[0:(internal_branch+1)])).map(tuple))
    for k, v in group:
        if len(v[v['passage']=='egg']) != 0:
            if len(v.groupby('passage')) == 1:
                recent_muts = list(ast.literal_eval(k[-1]).values())[0]

                #Find most recent mutation(s)
                for recent_mut in recent_muts:
                    site = int(re.findall('\d+', recent_mut)[0])
                    if site in positions:
                        df.at[v.index, 'mut' + str(site)] = 1
                        df.at[v.index, 'aa_mut' + str(site)] = recent_mut
                        df.at[v.index, str(site) + '_lastnode'] = recent_mut[0]

In [19]:
df

Unnamed: 0,strain,tip_muts,passage,seq,aa_mut_list,1,3,5,1_lastnode,3_lastnode,5_lastnode,mut1,mut3,mut5,aa_mut1,aa_mut3,aa_mut5
0,15,[],cell,BCCCE,"[{'2': ['B2C']}, {'3': ['B3C']}]",B,C,E,B,C,E,0,0,0,,,
1,6,[E5A],egg,BCDDA,"[{'2': ['B2C']}, {'3': ['B3C']}, {'4': ['C4D',...",B,D,A,B,C,E,0,1,1,,C3D,E5A
2,7,[C2F],egg,BFDDE,"[{'2': ['B2C']}, {'3': ['B3C']}, {'4': ['C4D',...",B,D,E,B,C,E,0,1,0,,C3D,
3,11,"[D3A, C2F, B1A]",egg,AFADE,"[{'2': ['B2C']}, {'3': ['B3C']}, {'4': ['C4D',...",A,A,E,B,D,E,1,1,0,B1A,D3A,
4,12,[],egg,BCDDE,"[{'2': ['B2C']}, {'3': ['B3C']}, {'4': ['C4D',...",B,D,E,B,C,E,0,1,0,,C3D,
5,13,[C2A],egg,BCABCA,"[{'2': ['B2C']}, {'5': ['E5A']}, {'13': ['C2A']}]",B,A,C,B,B,A,0,1,1,,B3A,A5C
6,14,[],cell,BCBCA,"[{'2': ['B2C']}, {'5': ['E5A']}]",B,B,A,B,B,A,0,0,0,,,
7,8,[B1D],cell,DCBCE,"[{'2': ['B2C']}, {'8': ['B1D']}]",D,B,E,B,B,E,1,0,0,B1D,,
8,9,[B3A],cell,BBACE,[{'9': ['B3A']}],B,A,E,B,B,E,0,1,0,,B3A,
