In [56]:
#Toy example to show mutation-calling method
#Deals with mini clusters of -egg sequences

In [32]:
import pandas as pd
import numpy as np
import ast
import re

In [90]:
tree = {'mut': ['A1B'],
        'node': '1',
        'passage': '',
        'seq':'BBBCE',
        'kids': [{'mut': ['B2C'],
                  'node': '2',
                  'passage': '',
                  'seq':'BCBCE',
                  'kids': [{'mut': ['B3C'],
                            'node': '3',
                            'passage': '',
                            'seq':'BCCCE',
                            'kids': [{'mut':['C4D', 'C3D'],
                                      'node': '4',
                                      'passage': '',
                                      'seq':'BCDDE',
                                      'kids': [{'mut':['E5A'],
                                               'node':'6',
                                                'seq':'BCDDA',
                                               'passage': 'egg'},
                                               {'mut':['C2F'],
                                               'node':'7',
                                                'seq':'BFDDE',
                                               'passage': 'egg'},
                                               {'mut':[],
                                               'node':'10',
                                               'passage': '',
                                                'seq':'BCDDE',
                                               'kids': [{'mut':['D3A', 'C2F', 'B1A'],
                                                        'node': '11',
                                                         'seq':'AFADE',
                                                        'passage':'egg'}, 
                                                        {'mut': [],
                                                        'node': '12',
                                                         'seq':'BCDDE',
                                                        'passage':'egg'}]}]}]},
                           {'mut': ['E5A'],
                            'node': '5',
                            'passage': '',
                            'seq':'BCBCA',
                            'kids': [{'mut': ['C2A'],
                                     'node': '8',
                                      'seq':'BCABCA',
                                     'passage': 'egg'}]},
                           {'mut': ['B1D'],
                            'node': '8', 
                            'seq':'DCBCE',
                            'passage': 'cell'}]},
                 {'mut': ['B3A'],
                  'node': '9',
                  'seq':'BBACE',
                 'passage': 'cell'}]}

In [91]:
positions = [1, 3, 5]

tip_muts = {}

def traverse(branch, pos_list):

    #keep track of mutations at internal nodes
    if 'kids' in branch.keys():
        for child in branch['kids']:
            # if 'aa_muts' in child.keys():
            if child['mut']:
                traverse_aa.append(child['mut'])
                aa_mut_clade.append({str(child['node']):child['mut']})
                traverse(child, pos_list)
                traverse_aa.remove(child['mut'])
                aa_mut_clade.remove({str(child['node']):child['mut']})

            else:
                #Append place holder for branches with no mutations
                traverse_aa.append([])
                traverse(child, pos_list)
                traverse_aa.remove([])

    elif 'children' not in branch.keys():

        muts_list = [str(mut) for sublist in traverse_aa for mut in sublist]
        aa_mut_clade_list = [str(mut) for mut in aa_mut_clade]
        last_node = [str(mut) for sublist in traverse_aa[:-1] for mut in sublist]
        
        tip_sequence = branch['seq']
        last_node_sequence = 'BBBCE'
        
        for mut in last_node:
            internal_mut_pos = int(re.findall('\d+', mut)[0])
            internal_mut_aa = mut[-1:]
            last_node_sequence = (last_node_sequence[:internal_mut_pos-1] + 
                                  internal_mut_aa + last_node_sequence[internal_mut_pos:])

        tip_muts[branch['node']]=([branch['mut'], branch['passage'], branch['seq'], aa_mut_clade_list] + 
                                  [tip_sequence[pos-1] for pos in pos_list] + 
                                  [last_node_sequence[pos-1] for pos in pos_list])

traverse_aa = []
aa_mut_clade = []
traverse(tree, positions)

In [92]:
df = pd.DataFrame(tip_muts).T
df.reset_index(inplace=True)
df.columns = ['strain', 'tip_muts', 'passage', 'seq', 'aa_mut_list'] + positions + [str(x)+'_lastnode' for x in positions]
df.aa_mut_list = df.aa_mut_list.map(list)

In [93]:
for p in positions:
    df['mut'+str(p)] = np.select(
    (df[p]==df[str(p)+'_lastnode'], df[p]!=df[str(p)+'_lastnode']),
    (False, True))
for p in positions:
    df['aa_mut'+str(p)] = np.where(df['mut'+str(p)]==1, df[str(p)+'_lastnode']+str(p)+df[p], None)

In [94]:
max_internal_length=df['aa_mut_list'].map(len).max()

#Find clusters of egg-passaged sequences,
#allow mutations shared by these clusters to be called as mutations in egg or strains
#If multiple mutations occur at the same site within cluster, most recent mutation should be taken
#Tips=cluster of size 1, so tip mutations override ancestral
for internal_branch in range(0,max_internal_length):
    sub_df = df[df['aa_mut_list'].map(len) > internal_branch]

    group= sub_df.groupby((sub_df.aa_mut_list.apply(lambda col: col[0:(internal_branch+1)])).map(tuple))
    for k, v in group:
        if len(v[v['passage']=='egg']) != 0:
            if len(v.groupby('passage')) == 1:

                k = [ast.literal_eval(x) for x in list(k)]
                k_dict = {}
                for d in k:
                    for d_k, d_v in d.items():
                        if d_k not in k_dict.keys():
                            k_dict[d_k] = d_v
                        else:
                            k_dict[d_k]+=d_v

                #Find most recent mutation(s)
                for recent_mut in k_dict[max(k_dict, key=int)]:
                    site = int(re.findall('\d+', recent_mut)[0])
                    if str(site) in str(positions):
                        df.at[v.index, 'mut' + str(site)] = 1
                        df.at[v.index, 'aa_mut' + str(site)] = recent_mut
                        df.at[v.index, str(site) + '_lastnode'] = recent_mut[0]

In [95]:
df

Unnamed: 0,strain,tip_muts,passage,seq,aa_mut_list,1,3,5,1_lastnode,3_lastnode,5_lastnode,mut1,mut3,mut5,aa_mut1,aa_mut3,aa_mut5
0,6,[E5A],egg,BCDDA,"[{'2': ['B2C']}, {'3': ['B3C']}, {'4': ['C4D',...",B,D,A,B,C,E,0,1,1,,C3D,E5A
1,7,[C2F],egg,BFDDE,"[{'2': ['B2C']}, {'3': ['B3C']}, {'4': ['C4D',...",B,D,E,B,C,E,0,1,0,,C3D,
2,11,"[D3A, C2F, B1A]",egg,AFADE,"[{'2': ['B2C']}, {'3': ['B3C']}, {'4': ['C4D',...",A,A,E,B,D,E,1,1,0,B1A,D3A,
3,12,[],egg,BCDDE,"[{'2': ['B2C']}, {'3': ['B3C']}, {'4': ['C4D',...",B,D,E,B,C,E,0,1,0,,C3D,
4,8,[B1D],cell,DCBCE,"[{'2': ['B2C']}, {'8': ['B1D']}]",D,B,E,B,B,E,1,0,0,B1D,,
5,9,[B3A],cell,BBACE,[{'9': ['B3A']}],B,A,E,B,B,E,0,1,0,,B3A,
