In [2]:
import re
import json
import treeswift as ts
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict

In [6]:
with open(f"../results/placement_comparison/epa_results-reads16S.jplace", 'r') as file:
    data = json.load(file)
backbone = ts.read_tree_newick("../misc_data/backboneU-WoLv1.nwk")

In [7]:
def make_newick(tree_str: str) -> str:
    subs_tree = re.sub("\{(\d+)\}", '', tree_str)
    return subs_tree

def map_branches(data) -> dict:
    """
    Build dictionary with edge/branch numbers as keys and 
    reference tree leaves as values
    """
    def get_id(s):
        return int(re.search("\{(\d+)\}", s).group(1))
    
    extnwk_str = data['tree']
    tree = ts.read_tree_newick(make_newick(extnwk_str))
    lbl_to_nd = tree.label_to_node(selection='leaves')
    branch_to_lbl = {
        get_id(extnwk_str[extnwk_str.find(lbl):]): lbl
        for lbl, nd in lbl_to_nd.items()
    }
    for ix, nd in enumerate(tree.traverse_postorder()):
        if (nd.is_leaf()):
            assert(branch_to_lbl[ix] == nd.label)
        else:
            nd.set_label(f"N{ix}")
            branch_to_lbl[ix] = nd.get_label()
    return tree, branch_to_lbl

In [8]:
tree, branch_to_lbl = map_branches(data)
lbl_to_nd = backbone.label_to_node(selection='all')
p_to_nd = tree.label_to_node(selection='all')

In [10]:
error_pdcit = defaultdict(float)
for placement in tqdm(data["placements"]):
    rid = placement["n"][0]
    total_lwr = 0
    total_error = 0
    qid, ii, qname = rid.split("_")
    for p in placement["p"]:
        backbone = ts.read_tree_newick("../misc_data/backboneU-WoLv1.nwk")
        lbl_to_nd = backbone.label_to_node(selection='all')
        edge_num, likelihood, like_weight_ratio, distal_length, pendant_length = p
        plbl = branch_to_lbl[edge_num]
        pnd = backbone.mrca(tree.extract_subtree(p_to_nd[plbl]).labels(internal=False))
        nnd = ts.Node(label=f"x{plbl}", edge_length=pnd.get_edge_length() - distal_length)
        pnd.set_edge_length(distal_length)
        nnd.set_parent(pnd.get_parent())
        nnd.get_parent().remove_child(pnd)
        nnd.get_parent().add_child(nnd)
        nnd.add_child(pnd)
        pnd.set_parent(nnd)
        error = backbone.distance_between(nnd, lbl_to_nd[qid])
        total_error = error * like_weight_ratio
        total_lwr += like_weight_ratio
    error_pdcit[rid] = total_error / 100 / total_lwr

 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 1455/2000 [06:27<02:25,  3.76it/s]


RuntimeError: Failed to parse string as Newick: ../misc_data/backboneU-WoLv1.nwk

In [89]:
pd.DataFrame({"rid": error_pdcit.keys(), "error":error_pdcit.values()}).to_csv("../results/placement_comparison/epa_error-lwr_blen.tsv", sep="\t")