In [1]:
import dendropy as dp
import numpy as np
import re, sys, os
import pandas as pd

In [265]:
# Simulated tree in NEXUS format; it is parsed with dendropy further down.
tree_file_name = 'set1_MPXV_common_r0.nexus.tree'
# Value of the 'location' annotation that identifies human tips/nodes.
location_human = '1'
# NOTE(review): not referenced by any visible cell of this notebook — presumably the
# sampling probability for non-human tips after the first human sample; confirm or remove.
psamp_after_first_human = 0.8

- Read tree and attach location to taxon labels

In [266]:
tree = dp.Tree.get_from_path(tree_file_name, 'nexus')

# Rename every tip to <label>_type<location>_<distance from root, 2 d.p.> and keep the
# old -> new mapping so the same renaming can be replayed on another copy of the tree.
renamed_tips = []
for leaf in (n for n in tree.postorder_node_iter() if n.is_leaf()):
    original_label = leaf.taxon.label
    tip_location = leaf.annotations.get_value('location')
    root_distance = str(round(leaf.distance_from_root(), 2))
    leaf.taxon.label = original_label + '_type' + tip_location + '_' + root_distance
    renamed_tips.append([original_label, leaf.taxon.label])

taxon_translation_list = pd.DataFrame(renamed_tips)
taxon_translation_list.columns = ['old_name', 'new_name']

- Get actual number of migration events and their ages

In [267]:
# Walk every node once, recording its age (distance from the root) and, for nodes
# annotated with reaction == 'Migration', the location on either side of the event.
migration_records = []
node_ages = []
for node in tree.postorder_node_iter():
    reaction = node.annotations.get_value('reaction')
    age = node.distance_from_root()
    node_ages.append(age)
    if reaction == 'Migration':
        location = node.annotations.get_value('location')
        # The location of the node's first child is what gets stored as 'child_type'.
        child_location = node.child_nodes()[0].annotations.get_value('location')
        migration_records.append([reaction, location, child_location, age])

# Passing the column names to the constructor keeps this cell working when the tree has
# no migration events; assigning .columns to an empty, column-less frame raises ValueError.
migration_events = pd.DataFrame(
    migration_records,
    columns=['reaction', 'ancestor_type', 'child_type', 'time'],
)
migration_events.head()

Unnamed: 0,reaction,ancestor_type,child_type,time
0,Migration,1,0,1.41063
1,Migration,1,0,1.442894
2,Migration,1,0,1.874215
3,Migration,1,0,1.814711
4,Migration,1,0,2.045148


In [268]:
# Number of migration events whose child side is the human location.
into_human = migration_events.child_type == location_human
num_migration_into_human = into_human.sum()

# Smallest 'time' among migration events whose ancestor side is location '0'.
from_location_0 = migration_events.ancestor_type == '0'
age_first_migration = migration_events.loc[from_location_0, 'time'].min()

# Largest distance-from-tip among internal nodes that have exactly two children.
bifurcation_ages = [
    internal.distance_from_tip()
    for internal in tree.postorder_internal_node_iter()
    if len(internal.child_nodes()) == 2
]
age_of_first_branching = max(bifurcation_ages)
age_of_first_branching

2.03581893082824

- Export newick tree with metadata in name and no internal nodes

In [269]:
# Output file stem: the input name up to its first '.', followed by the ground-truth summary
# (number of migrations into humans, age of first migration, age of first branching).
base_name = re.sub('[.].+', '', tree_file_name)
file_name_with_ground_truth = ''.join([
    base_name,
    '_nMig', str(num_migration_into_human),
    '_migAge', str(round(age_first_migration, 2)),
    '_splitAge', str(round(age_of_first_branching, 2)),
])

In [270]:
# Load the Newick counterpart of the tree (same file name with 'nexus' replaced by
# 'newick') and give its tips the same new labels that were assigned to the NEXUS tree.
newick_file_name = tree_file_name.replace('nexus', 'newick')
tree_collapsed = dp.Tree.get_from_path(newick_file_name, 'newick')
for node in tree_collapsed.leaf_node_iter():
    is_this_tip = taxon_translation_list.old_name == node.taxon.label
    temp = taxon_translation_list.loc[is_this_tip, 'new_name']
    node.taxon.label = temp.values[0]

In [271]:
# Write both versions of the tree under the ground-truth file stem, then report.
nexus_path = file_name_with_ground_truth + '.nexus.tree'
newick_path = file_name_with_ground_truth + '.newick.tree'
tree.write_to_path(nexus_path, 'nexus')
tree_collapsed.write_to_path(newick_path, 'newick')

print('Newick tree exported to:' + newick_path)
print('Number of migration events into humans: ' + str(num_migration_into_human))

Newick tree exported to:set1_MPXV_common_r0_nMig3_migAge0.09_splitAge2.04.newick.tree
Number of migration events into humans: 3


In [263]:
# Scratch check: `temp` is the last `new_name` lookup left over from the Newick relabeling loop.
temp.values[0]

'159_type1_0.63'

# Workflow testing complete up to here
1. ~~export tree with the data above.~~ 
1. simulate sequences
2. prune tips with removed reaction and location 0. These are animals 
2.1 Prune tips with removed reaction and location 1. These are humans
2.3 prune nodes that are not leaves and which have a single descendant (migrations)
2.4 Calculate number of imports, date of first import, and tmrca of sampled tree
2.5 Export tree in nexus. This is the 'sampled' tree

3 take initial tree and remove only location 1 from the removed reaction
3.1 prune all tips with location 0 prior to the first location 1
3.2 Export tree in nexus. This is the 'opportunistically sampled' tree


In [233]:
# Scratch export of the current `tree` to a fixed test file in NEXUS format.
tree.write_to_path('test_no_internal_nodes.tree', 'nexus')

In [223]:
# Scratch check: count the successes in 100 Bernoulli(0.8) draws. No seed is set, so the
# count differs between runs.
sum( [bool(np.random.binomial(1, 0.8)) for i in range(100)] )

78

In [224]:
# Show the rows whose third column equals `location_human`.
# NOTE(review): `tree_reaction_location_age` is not created by any visible cell of this
# notebook, so this fails on a fresh kernel — restore the cell that builds it or drop this one.
tree_reaction_location_age.loc[tree_reaction_location_age.iloc[:, 2] == location_human, ]

Unnamed: 0,0,1,2,3
2,3_1,Removed,1,1.927923
5,6_1,Removed,1,1.94393
13,14_1,Removed,1,1.979502
17,18_1,Sampling,1,1.976157
18,19_1,Removed,1,1.918691
21,22_1,Sampling,1,1.904313
22,23_1,Sampling,1,1.890275
23,24_1,Removed,1,1.892277
33,34_1,Sampling,1,1.87446
34,35_1,Removed,1,1.727671


### Also need to get the following stats:
- Time of origin (age of root, really)
- date of first human transmission
- number of migrations to human
- total number of species jumps (this to be compared with the number of monophyletic groups)


In [203]:
# NOTE(review): the configuration cells of this notebook set location_human = '1', which
# conflicts with the "bat"/"human" labels below — confirm which location code is which.
location_target = '1' # The bat
location_trigger_sampling = '0' # After human sample
sampling_prob = 0.1 # sample bats with probability of 0.1

In [204]:
# Smallest age (column 3) among the rows whose location (column 2) is the trigger location.
in_trigger_location = tree_reaction_location_age.iloc[:, 2] == location_trigger_sampling
first_target = tree_reaction_location_age.loc[in_trigger_location, 3].min()
print(first_target)

1.2868389592339013


In [205]:
# Preview the table. NOTE(review): `tree_reaction_location_age` is not built by any visible cell.
tree_reaction_location_age.head()

Unnamed: 0,0,1,2,3
0,1,Sampling,0,1.73508
1,2,Sampling,0,2.038165
2,3,Sampling,0,1.856948
3,4,Sampling,0,1.876671
4,5,Sampling,1,1.867781


In [70]:
# Work in progress: collect the positions of rows at the target location (column 2) whose
# age (column 3) is greater than first_target. No random thinning happens here; the rows
# at or below first_target are simply left out of the result.
n_rows = tree_reaction_location_age.shape[0]
match_target_and_age = np.array([
    row
    for row in range(n_rows)
    if tree_reaction_location_age.iloc[row, 2] == location_target
    and tree_reaction_location_age.iloc[row, 3] > first_target
])

NameError: name 'tree_reaction_location_age' is not defined

In [71]:
# One Bernoulli(sampling_prob) draw per candidate row; keep the rows whose draw is 1.
draws = np.random.binomial(1, sampling_prob, len(match_target_and_age))
sampled = np.where(draws == 1)
sampled_rows = match_target_and_age[sampled]
print(sampled_rows)
tree_reaction_location_age.loc[sampled_rows, ]

NameError: name 'sampling_prob' is not defined

In [72]:
# Inspect the row positions selected so far.
match_target_and_age

[]

In [73]:
# Scratch check: pick the positions 0..99 whose Bernoulli(0.05) draw is 1 (unseeded, so it varies per run).
np.array([i for i in range(100)])[np.random.binomial(1, 0.05, 100) == True]

array([32, 96])

In [74]:
# Scratch arithmetic; the result is not assigned to any name.
(1.8623 + 0.1382) - 0.92749

1.07301

In [75]:
# Scratch arithmetic; the result is not assigned to any name.
1.8-0.9

0.9

## Prune tips from locations with different rates:

In [14]:
import dendropy as dp
import numpy as np
import re, sys, os
import pandas as pd

In [15]:
# Inputs for the pruning experiments: a NEXUS tree and the matching FASTA alignment.
tree_file_name = 'set1_MPXV_common_r0_nMig16_migAge0.19_splitAge1.91.nexus.tree'
aln_file_name = 'set1_MPXV_common_r0_nMig16_migAge0.19_splitAge1.91.fasta'
# Value of the 'location' annotation that identifies human tips; these are never pruned below.
location_human = '1'
# Probability of keeping each non-human tip in the high-rate and low-rate scenarios.
non_human_sampling_prop_high = 0.8
non_human_sampling_prop_low = 0.05

In [16]:
# Parse the tree (NEXUS) and its sequence alignment (FASTA) with dendropy.
tree = dp.Tree.get_from_path(tree_file_name, 'nexus')
alignment = dp.DnaCharacterMatrix.get_from_path(aln_file_name, 'fasta')

- sample constantly at high and low rate anything that is not human

In [17]:
# High-rate scenario: one Bernoulli(non_human_sampling_prop_high) draw per non-human tip;
# a tip is marked for pruning when its draw is 0. Human tips are skipped without a draw.
tips_to_remove = [
    leaf.taxon.label
    for leaf in tree.leaf_node_iter()
    if leaf.annotations.get_value('location') != location_human
    and np.random.binomial(1, non_human_sampling_prop_high) == 0
]

# Prune a clone (depth = 1) rather than `tree` itself, then write it to a fixed test file.
tree_high_sampling = tree.clone(depth = 1)
tree_high_sampling.prune_taxa_with_labels(tips_to_remove)
tree_high_sampling.write_to_path('testPrunedHigh.tree', 'nexus')

def prune_alignment(aln, labels_to_remove):
    """Remove from ``aln`` every entry whose label is listed in ``labels_to_remove``.

    Parameters
    ----------
    aln
        Object that yields entries with a ``.label`` attribute when iterated and that
        provides ``remove_sequences(entries)``. It is modified in place.
    labels_to_remove
        Container of labels to drop. Single quotes in an entry's label are ignored
        when matching against this container.

    Returns
    -------
    The same ``aln`` object, to allow call chaining.
    """
    # Decide what to drop before touching `aln`: removing entries while iterating over
    # the same container depends on its iterator tolerating mutation and can skip entries.
    doomed = [
        entry for entry in aln
        if entry.label.replace("'", '') in labels_to_remove
    ]
    if doomed:
        aln.remove_sequences(doomed)
    return aln

In [18]:
# Low-rate scenario: each non-human tip is kept with probability
# non_human_sampling_prop_low; tips whose draw is 0 are pruned.
tips_to_remove = []
for leaf in tree.leaf_node_iter():
    if leaf.annotations.get_value('location') == location_human:
        continue  # human tips are never pruned here (and consume no random draw)
    if np.random.binomial(1, non_human_sampling_prop_low) == 0:
        tips_to_remove.append(leaf.taxon.label)

# Escape and anchor the extensions: in a regex a bare '.' matches any character, and an
# unanchored pattern could also rewrite a look-alike substring earlier in the file name.
tree_low_sampling_path = re.sub(r'\.nexus\.tree$', '_prunedLowSamp.nexus.tree', tree_file_name)
aln_low_sampling_path = re.sub(r'\.fasta$', '_prunedLowSamp.fasta', aln_file_name)

# Prune clones (depth = 1) of the tree and of the alignment, then write both.
tree_low_sampling = tree.clone(depth = 1)
tree_low_sampling.prune_taxa_with_labels(tips_to_remove)
tree_low_sampling.write_to_path(tree_low_sampling_path, 'nexus')

alignment_low_sampling = alignment.clone(depth = 1)
prune_alignment(alignment_low_sampling, tips_to_remove).write_to_path(aln_low_sampling_path, 'fasta')

- sample after first human sample

In [20]:
# Tabulate label, location and distance from the root for every tip; the dates are
# needed before tips can be selected relative to the first human sample.
tip_records = [
    [leaf.taxon.label, leaf.annotations.get_value('location'), leaf.distance_from_root()]
    for leaf in tree.leaf_node_iter()
]

tip_ages = pd.DataFrame(tip_records)
tip_ages.columns = ['tip_label', 'type', 'date']
tip_ages.head()

Unnamed: 0,tip_label,type,date
0,1_type0_1.62,0,1.623965
1,2_type1_1.85,1,1.845936
2,3_type0_1.35,0,1.349154
3,4_type0_1.57,0,1.573901
4,5_type0_2.06,0,2.059624


In [28]:
# Smallest 'date' (distance from the root) among the human tips.
human_tip_dates = tip_ages.loc[tip_ages.type == location_human, 'date']
first_human_case = human_tip_dates.min()
first_human_case

0.4858398734331313

In [29]:
# Opportunistic sampling: non-human tips dated before the first human case are always
# pruned; from the first human case onwards each non-human tip is kept with probability
# non_human_sampling_prop_high. Human tips are never pruned.
tips_to_remove = []
for tip_label, tip_type, tip_date in zip(tip_ages.tip_label, tip_ages.type, tip_ages.date):
    if tip_type == location_human:
        continue
    if tip_date < first_human_case:
        drop_tip = True
    elif tip_date >= first_human_case:
        drop_tip = np.random.binomial(1, non_human_sampling_prop_high) == 0
    else:
        # Neither comparison holds (e.g. a NaN date or no human tips): keep the tip.
        drop_tip = False
    if drop_tip:
        tips_to_remove.append(tip_label)
        print('Prunning ' + tip_label)

# Escape and anchor the extensions: in a regex a bare '.' matches any character, and an
# unanchored pattern could also rewrite a look-alike substring earlier in the file name.
tree_opp_sampling_path = re.sub(r'\.nexus\.tree$', '_oppSamp.nexus.tree', tree_file_name)
aln_opp_sampling_path = re.sub(r'\.fasta$', '_oppSamp.fasta', aln_file_name)

# Prune clones (depth = 1) of the tree and of the alignment, then write both.
tree_opportunistic_sampling = tree.clone(depth = 1)
tree_opportunistic_sampling.prune_taxa_with_labels(tips_to_remove)
tree_opportunistic_sampling.write_to_path(tree_opp_sampling_path, 'nexus')

aln_opportunistic_sampling = alignment.clone(depth = 1)
prune_alignment(aln_opportunistic_sampling, tips_to_remove).write_to_path(aln_opp_sampling_path, 'fasta')

Prunning 8_type0_1.87
Prunning 10_type0_1.99
Prunning 20_type0_1.66
Prunning 26_type0_1.41
Prunning 37_type0_1.96
Prunning 118_type0_1.76
Prunning 124_type0_1.92


In [30]:
# Inspect the labels most recently selected for pruning.
tips_to_remove

['8_type0_1.87',
 '10_type0_1.99',
 '20_type0_1.66',
 '26_type0_1.41',
 '37_type0_1.96',
 '118_type0_1.76',
 '124_type0_1.92']

removing '8_type0_1.87'
removing '10_type0_1.99'
removing '20_type0_1.66'
removing '26_type0_1.41'
removing '37_type0_1.96'
removing '118_type0_1.76'
removing '124_type0_1.92'


In [32]:
# NOTE(review): in the visible cells `alignment` is the full matrix as loaded, and pruning is
# only ever applied to clones of it, so this file is not pruned despite its name — confirm
# which alignment should be written here.
alignment.write_to_path('pruned_aln.fasta', 'fasta')