In [1]:
import dendropy
import pandas as pd
import re

## Getting cluster assignments from MASTER simulations
The following script takes in a nexus file from a MASTER simulation of the transmission dynamics in King County (KC), Washington, where a three-deme model is assumed to represent North KC, South KC, and an external source providing a high rate of introductions into both locations. This external deme represents everything outside of KC. The script reads the nexus tree into a `dendropy` tree object and then does a postorder traversal of the tree, creating a subtree whenever the location of the ancestral node is found to be in this external, introduction-providing deme. A cluster number is then assigned to every tip, and a `.tsv` is produced with two columns: one for the tip label, the other for the cluster assignment.

In [62]:
# Path to the nexus tree file that is parsed with dendropy further down.
# NOTE(review): this is an absolute, machine-specific path; it must be
# adjusted before the notebook can run on another machine.
nexusPath = "/Users/miguelparedes/Desktop/StateClocks_S1_master.untyped.tree"

In [63]:
# Parse the nexus tree into a dendropy Tree.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes (previously it was opened inline and never closed).
# extract_comment_metadata=True asks dendropy to keep the comment metadata;
# later cells search the node annotations for `location=...` markers.
with open(nexusPath, "r") as nexus_file:
    tree2 = dendropy.Tree.get(
        file=nexus_file,
        schema="nexus",
        extract_comment_metadata=True,
    )

In [65]:
# Ask dendropy for the node ages of the tree and keep the returned values
# for inspection in the next cell.
# NOTE(review): ultrametricity_precision=False is presumably meant to skip
# dendropy's ultrametricity check -- confirm against the dendropy docs.
distance_list = tree2.calc_node_ages(ultrametricity_precision=False)

In [55]:
# Markers searched for in the stringified leaf annotations. Leaves whose
# annotations contain the first marker get the "|nKC" suffix, leaves that
# contain the second get "|sKC".
n_kc = "location='0'"
s_kc = "location='1'"

# Checked in order, so the first matching marker wins (same precedence as
# an if/elif chain).
deme_suffixes = ((n_kc, "|nKC"), (s_kc, "|sKC"))

for leaf in tree2.leaf_node_iter():
    leaf_annotations = str(leaf.annotations)
    suffix = next(
        (sfx for marker, sfx in deme_suffixes if marker in leaf_annotations),
        None,
    )
    if suffix is None:
        # Neither marker present: leave this leaf's taxon untouched.
        continue

    # Pull the text between the first pair of single quotes in str(taxon).
    quoted_labels = re.findall(r"'(.*?)'", str(leaf.taxon), re.DOTALL)
    if not quoted_labels:
        # Nothing quoted to extract -- e.g. this cell already ran and the
        # taxon is now the plain "<label>|<deme>" string. Skip instead of
        # raising IndexError so the cell can safely be re-run.
        continue

    # NOTE: this replaces the taxon with a plain string (not a Taxon
    # object); the later cells collect these strings as tip names.
    leaf.taxon = quoted_labels[0] + suffix
       

In [52]:
# Display the values returned by calc_node_ages.
distance_list

[0.0622798501006962,
 0.16694497145369608,
 0.6238031545408812,
 0.7440138594081331,
 0.8363155626381827,
 0.8008931736500469,
 0.7735621667389572,
 0.29202472251508094,
 0.05363240207194264,
 0.04746737163545254,
 0.05500422039871661,
 0.46008932091010696,
 0.8065849600604799,
 0.7416026480027097,
 0.3060736464872969,
 0.08763221512894881,
 0.018986953537521396,
 0.07967014863478729,
 0.06405536511882429,
 0.16377278492130132,
 0.04179380069761595,
 0.12060876753823659,
 0.10964537155269971,
 0.10972996156770934,
 0.31074706102604444,
 0.3593965639381727,
 0.2754758195135026,
 0.47720375425413014,
 0.3893151141001478,
 0.37909678472596053,
 0.7678829450709185,
 0.713058691072264,
 0.7299775997782877,
 0.798143592839215,
 0.7576329856226122,
 0.6321645800368482,
 0.6740718179882608,
 0.6960636967152614,
 0.7681398880577532,
 0.6152742433064344,
 0.5915255531876631,
 0.548837223997049,
 0.3964436567950887,
 0.07442697788695295,
 0.09985353720321878,
 0.4570267873785717,
 0.3981136890785

In [56]:
# Marker identifying nodes whose annotations place them in location 2.
location = "location='2'"

# cluster number -> list of tip taxa assigned to that cluster
clusters = {}
# Every taxon already assigned to some cluster. A set keeps the
# "already assigned?" check O(1) instead of scanning a growing list.
previous_taxons = set()
count = 1

# Postorder visits descendants before their ancestors, so a nested
# location-2 node claims its tips first and an enclosing location-2 node
# only receives the tips that are still unassigned.
for node in tree2.postorder_node_iter():
    # Test the node once, so a node carrying more than one matching
    # annotation cannot be counted as several clusters.
    if not any(location in str(a) for a in node.annotations):
        continue

    subtree = node.extract_subtree()
    taxons = [leaf.taxon for leaf in subtree.leaf_iter()]

    # Keep only tips not claimed by an earlier cluster. For the very first
    # cluster nothing has been claimed yet, so every tip is kept.
    clusters[count] = [elem for elem in taxons if elem not in previous_taxons]
    previous_taxons.update(clusters[count])
    count += 1

In [57]:
# Display the mapping from cluster number to the list of tips in that cluster.
clusters

{1: ['699|sKC', '700|nKC'],
 2: ['698|sKC'],
 3: ['697|nKC'],
 4: ['696|sKC'],
 5: ['695|sKC'],
 6: ['694|nKC'],
 7: ['693|sKC'],
 8: ['692|nKC'],
 9: ['691|nKC'],
 10: ['690|nKC'],
 11: ['689|sKC'],
 12: ['688|nKC'],
 13: ['687|nKC'],
 14: ['686|nKC'],
 15: ['685|sKC'],
 16: ['684|sKC'],
 17: ['683|sKC'],
 18: ['682|sKC'],
 19: ['681|nKC'],
 20: ['680|sKC'],
 21: ['679|nKC'],
 22: ['678|nKC'],
 23: ['677|nKC'],
 24: ['676|sKC'],
 25: ['674|sKC', '675|sKC'],
 26: ['673|nKC'],
 27: ['672|nKC'],
 28: ['669|nKC', '670|sKC', '671|sKC'],
 29: ['668|nKC'],
 30: ['667|nKC'],
 31: ['666|sKC'],
 32: ['665|sKC'],
 33: ['664|nKC'],
 34: ['663|nKC'],
 35: ['662|sKC'],
 36: ['661|sKC'],
 37: ['660|nKC'],
 38: ['658|nKC', '659|nKC'],
 39: ['657|nKC'],
 40: ['653|sKC', '654|nKC', '655|sKC', '656|sKC'],
 41: ['652|sKC'],
 42: ['649|sKC', '650|sKC', '651|sKC'],
 43: ['646|nKC', '647|sKC', '648|nKC'],
 44: ['645|nKC'],
 45: ['640|sKC', '641|sKC', '642|sKC', '643|sKC', '644|sKC'],
 46: ['639|nKC'],
 47: 

In [58]:
# Flatten the cluster mapping into [cluster number, tip] pairs, one pair
# per tip, keeping the dictionary's iteration order.
clean_list = [
    [cluster_id, tip]
    for cluster_id, members in clusters.items()
    for tip in members
]

In [59]:
# Display the flattened [cluster, tip] pairs.
clean_list

[[1, '699|sKC'],
 [1, '700|nKC'],
 [2, '698|sKC'],
 [3, '697|nKC'],
 [4, '696|sKC'],
 [5, '695|sKC'],
 [6, '694|nKC'],
 [7, '693|sKC'],
 [8, '692|nKC'],
 [9, '691|nKC'],
 [10, '690|nKC'],
 [11, '689|sKC'],
 [12, '688|nKC'],
 [13, '687|nKC'],
 [14, '686|nKC'],
 [15, '685|sKC'],
 [16, '684|sKC'],
 [17, '683|sKC'],
 [18, '682|sKC'],
 [19, '681|nKC'],
 [20, '680|sKC'],
 [21, '679|nKC'],
 [22, '678|nKC'],
 [23, '677|nKC'],
 [24, '676|sKC'],
 [25, '674|sKC'],
 [25, '675|sKC'],
 [26, '673|nKC'],
 [27, '672|nKC'],
 [28, '669|nKC'],
 [28, '670|sKC'],
 [28, '671|sKC'],
 [29, '668|nKC'],
 [30, '667|nKC'],
 [31, '666|sKC'],
 [32, '665|sKC'],
 [33, '664|nKC'],
 [34, '663|nKC'],
 [35, '662|sKC'],
 [36, '661|sKC'],
 [37, '660|nKC'],
 [38, '658|nKC'],
 [38, '659|nKC'],
 [39, '657|nKC'],
 [40, '653|sKC'],
 [40, '654|nKC'],
 [40, '655|sKC'],
 [40, '656|sKC'],
 [41, '652|sKC'],
 [42, '649|sKC'],
 [42, '650|sKC'],
 [42, '651|sKC'],
 [43, '646|nKC'],
 [43, '647|sKC'],
 [43, '648|nKC'],
 [44, '645|nKC'],
 [

In [49]:
# Turn the [cluster, tip] pairs into a two-column table.
column_names = ['Cluster', 'Tip Name']
df = pd.DataFrame(clean_list, columns=column_names)

In [50]:
# Display the cluster-assignment table.
df

Unnamed: 0,Cluster,Tip Name
0,1,'699'|sKC
1,1,'700'|nKC
2,2,'698'|sKC
3,3,'697'|nKC
4,4,'696'|sKC
...,...,...
695,206,'7'|nKC
696,206,'8'|sKC
697,206,'9'|nKC
698,207,'1'|nKC


In [11]:
# Write the table as tab-separated text, leaving out the DataFrame index.
df.to_csv(
    'sims_cluster_assignment_final.tsv',
    sep="\t",
    index=False,
)