In [23]:
import os
import gzip
from Bio import SeqIO
import re
from newick import read, dump


def clean_node(node_name):
    """Shorten a node's name to the identifier found in its string form.

    Despite the parameter name, the argument is an object with a writable
    ``name`` attribute; in this file the function is passed to ``visit`` as
    the per-node callback.

    The pattern is searched in ``str(node_name)``. On a match, the matched
    text is assigned to ``node_name.name``; otherwise the object is left
    untouched. Nothing is returned.
    """
    # NOTE(review): the dots in this pattern are unescaped (they match any
    # character) and the classes use 1-9 rather than 0-9 -- confirm this is
    # intended before tightening it, since it affects which names match.
    found = re.search(
        r"[a-z]_[a-zA-Z]+_[a-zA-Z1-9]+.[a-zA-Z1-9]+.[a-zA-Z1-9]+.[a-zA-Z1-9]+",
        str(node_name),
    )
    if found is None:
        return
    node_name.name = found.group(0)
        
        
# Maps each gene-tree file name to the name of the node whose leaves are kept
# when the tree is processed below. The value 'n0' is treated specially by the
# processing loop: every leaf of the tree is kept.
main_node = {
    'OG0000002_tree.txt': 'n441',
    'OG0006648_tree.txt': 'n0',
    'OG0000300_tree.txt': 'n0',
    'OG0000622_tree.txt': 'n0',
    'OG0011556_tree.txt': 'n0',
    'OG0000239_tree.txt': 'n99',
    'OG0000834_tree.txt': 'n0',
    'OG0000822_tree.txt': 'n0',
    'OG0002813_tree.txt': 'n0',
    'OG0000006_tree.txt': 'n836',
    'OG0002713_tree.txt': 'n0',
    'OG0000571_tree.txt': 'n0',
    'OG0000446_tree.txt': 'n0',
}

# Collect the tree files to process. Hidden entries (names starting with '.',
# such as macOS '.DS_Store') are filtered out rather than removed with
# list.remove(), which raised ValueError whenever '.DS_Store' was absent.
filename = [
    entry
    for entry in os.listdir('./dataset/gene_tree_required_proteins/')
    if not entry.startswith('.')
]

# Patterns are compiled once here instead of on every loop iteration.
# Key under which leaves are treated as duplicates of one another.
_GENE_KEY_RE = re.compile(r"[a-z]_[a-zA-Z]+_[A-Z0-9]+")
# Names beginning with n<digits>; used to recognise leftover internal nodes.
_INTERNAL_NAME_RE = re.compile(r"^n[0-9]+")

# Make sure the output directory exists before any tree is written.
os.makedirs('./results', exist_ok=True)

for file in filename:
    print(file)

    # Read Dataset
    trees = read('./dataset/gene_tree_required_proteins/{}'.format(file))
    tree = trees[0]

    # Keep only the leaves under the configured node. 'n0' means "keep every
    # leaf of the tree". A KeyError here means the file has no entry in
    # main_node.
    root_name = main_node[file]
    subtree = tree if root_name == 'n0' else tree.get_node(root_name)
    node_needed = subtree.get_leaves()
    tree.prune(node_needed, inverse=True)

    # Rename Nodes
    tree.visit(clean_node)

    # Pair every leaf with its duplicate-detection key. Leaves whose name does
    # not contain the pattern (e.g. internal nodes left childless by the prune
    # above) are skipped, so they can neither reuse the previous leaf's key
    # nor distort another key's maximum length.
    leaves = tree.get_leaves()
    keyed_leaves = []
    for node in leaves:
        found = _GENE_KEY_RE.search(str(node.name))
        if found:
            keyed_leaves.append((found.group(0), node))

    # Longest branch length (node.length) seen for each key.
    unique = {}
    for key, node in keyed_leaves:
        unique[key] = max(unique.get(key, node.length), node.length)

    # Prune to keep only the longest leaf per key. The node objects are passed
    # directly: handing a bare string to prune_by_names() turns its membership
    # test into substring matching and can prune unrelated nodes.
    # NOTE: leaves tied at the maximum length for a key are all kept.
    shorter = [node for key, node in keyed_leaves if node.length != unique[key]]
    tree.prune(shorter)

    # Remove c elegans ref
    tree.prune_by_names(['c_elegans_ref_protein_PAR-1'])

    # Remove internal nodes (named n<digits>) that no longer have children.
    # Removing one can leave its parent childless too, so repeat until a full
    # pass removes nothing.
    removed = True
    while removed:
        removed = False
        for n in tree.walk(mode='postorder'):
            if (n.ancestor and not n.descendants
                    and _INTERNAL_NAME_RE.search(str(n.name))):
                n.ancestor.descendants.remove(n)
                removed = True

    # Dump the tree
    with open('./results/required_tree_{}'.format(file), 'w') as fobj:
        dump(trees, fobj)

OG0000002_tree.txt
OG0006648_tree.txt
OG0000300_tree.txt
OG0000622_tree.txt
OG0011556_tree.txt
OG0000239_tree.txt
OG0000834_tree.txt
OG0000822_tree.txt
OG0002813_tree.txt
OG0000006_tree.txt
OG0002713_tree.txt
OG0000571_tree.txt
OG0000446_tree.txt
