In [1]:
import ete3
import xml.etree.cElementTree as ET
import time

In [2]:
ls

ortho2newick.ipynb  orthoxmlexample.txt


In [55]:

with open('orthoxmlexample.txt' , 'r') as samplefile:
    testhog = samplefile.read()


In [56]:

def get_species_from_orthoxml(orthoxml):
    NCBITaxId2name = {}
    root = ET.fromstring(orthoxml)
    for child in root:
        if 'species' in child.tag:
            NCBITaxId2name[child.attrib['NCBITaxId']] = child.attrib['name']
    return NCBITaxId2name



def get_species_tree_from_orthoxml(orthoxml , verbose = True):
    if verbose == True:
        start = time.clock()
    
    species = get_species_from_orthoxml(orthoxml)
    
    if verbose == True:
        print(time.clock()-start)
    
    ncbi = ete3.NCBITaxa()
    tree = ncbi.get_topology(species)
    
    if verbose == True:
        print(time.clock()-start)
    
    orphans  = (set(species)- set( [node.name for node in tree.get_leaves()]))
    
    if verbose == True:
        print(time.clock()-start)
    if len(orphans) == 0:
        return tree
    else:
        return orphans
    if verbose == True:
        print(time.clock()-start)
    tree_string = tree.write(format=1)

def get_species_tree_from_orthoxml_test(orthoxml , verbose = True):
    if verbose == True:
        start = time.clock()
    
    species = get_species_from_orthoxml(orthoxml)
    
    if verbose == True:
        print(time.clock()-start)
    
    ncbi = ete3.NCBITaxa()
    tree = ncbi.get_topology(species)
    
    print(dir(tree))
    
    print(tree.sci_name)
    print(tree.name)



In [57]:
orphans = get_species_tree_from_orthoxml(testhog)
print(orphans)

1.211507
1.522955
1.530648
set(['623', '162425', '367830', '5297', '4641', '305', '99287', '107806', '190650', '196627', '109871', '271065', '5807', '13684'])


In [259]:
def getParents(orphans, hog , verbose):
    parentDict={}
    genes={}
    root = ET.fromstring(hog)
    for elem in root:
        if 'species' in elem.tag:
            if elem.attrib['NCBITaxId'] in orphans:
                if elem.attrib['NCBITaxId'] not in genes:
                    genes[elem.attrib['NCBITaxId']] =[]
                for gene in elem.iter():
                    if 'gene' in gene.tag:
                        try:
                            genes[gene.attrib['id']] = elem.attrib['NCBITaxId']
                        except KeyError:
                            pass
        if 'groups' in elem.tag:
            parent_map = dict((c, p) for p in elem.getiterator() for c in p)
            for groups in elem.iter():
                if 'geneRef' in groups.tag:
                    if groups.attrib['id'] in genes:
                        species = genes[groups.attrib['id']]
                        if species not in parentDict:
                            orthogroup = parent_map[groups]
                            for prop in orthogroup:
                                if 'property' in prop.tag:
                                    sciname = prop.get('value')
                                    parentDict[sciname] = species
                                    break

    return parentDict

def addOrphans(parentDict, t , verbose = False):
    added =[]
    newdict = parentDict
    leaves = set([leaf.name for leaf in t.get_leaves()])
            
    if verbose == True:
        print(newdict)
    for n in t.traverse():
        try:
            if n.sci_name in newdict and newdict[n.sci_name] not in leaves:
                n.add_child(name = newdict[n.sci_name])
                added.append(n.sci_name)
                leaves.add(newdict[n.sci_name])
        except AttributeError:
            pass
        #second attempt shortening the names...
        leftovers = set(newdict.keys()) - set(added)
    if len(leftovers)>0:
        if verbose == True:

            print('iterative start with leftovers:')
            print(leftovers)


        values = [ newdict[leftover] for leftover in leftovers]
        reduced = [ ''.join([word+' ' for word in leftover.split()[0:max(1,len(leftover.split())-1)]]).strip() for leftover in leftovers ]
        newdict = dict(zip(reduced,values))

        reducedSet = set(reduced)
        reducedOld = set([])
            
        while reducedSet != reducedOld :
            leaves = set([leaf.name for leaf in t.get_leaves()])
            for n in t.traverse():
                try:
                    if n.sci_name in newdict and newdict[n.sci_name] not in leaves:
                        n.add_child(name = newdict[n.sci_name])
                        leaves.add(newdict[n.sci_name])
                        added.append(n.sci_name)
                        if verbose == True:
                            print(n.sci_name)
                except AttributeError:
                    pass
            
            leftoversNew = set(newdict.keys()) - set(added)
            if verbose == True:
                print(leftoversNew)
            if len(leftoversNew) ==0:
                if verbose == True:
                    print('DONE!')
                break
            values = [ newdict[leftover] for leftover in leftoversNew]
            reduced = [ ''.join([word+' ' for word in leftover.split()[0:max(1,len(leftover.split())-1)]]).strip() for leftover in leftoversNew ]
            reducedOld = reducedSet
            reducedSet = set(reduced)
            newdict = dict(zip(reduced,values))
            if verbose == True:
                print('newdict')
                print(newdict)

                print('newleftovers')
                print(leftoversNew)

    return t


In [260]:
parents = getParents(orphans , testhog)
print(orphans - set(parents.values()))


TypeError: getParents() takes exactly 3 arguments (2 given)

In [261]:

def get_species_tree_from_orthoxmlv2(orthoxml , verbose = False):

    species = get_species_from_orthoxml(orthoxml)
    ncbi = ete3.NCBITaxa()
    tree = ncbi.get_topology(species)
    orphans  = (set(species)- set( [node.name for node in tree.get_leaves()]))    
    if len(orphans) == 0:
        return tree
    else:
        parents = getParents(orphans, orthoxml , verbose)
        tree = addOrphans( parents , tree, verbose)
        orphans = (set(species)- set( [node.name for node in tree.get_leaves()]))
        if verbose ==True:
            print(orphans)
        return tree
    
    tree_string = tree.write(format=1)


In [266]:
start = time.clock()
treeout= get_species_tree_from_orthoxmlv2(testhog, False)
print(time.clock()- start)

set(['162425'])
3.855087


In [265]:
print(treeout)


                        /-868595
                       |
                       |--485916
                       |
                     /-|--349161
                    |  |
                    |  |--760568
                    |  |
                    |   \-696281
                    |
                    |      /-272564
                    |   /-|
                    |--|   \-138119
                    |  |
                    |   \-871963
                    |
                  /-|   /-768704
                 |  |  |
                 |  |--|--768706
                 |  |  |
                 |  |   \-646529
                 |  |
                 |  |--645991
                 |  |
                 |  |--370438
                 |  |
                 |  |--477974
                 |  |
                 |   \-635013
                 |
                 |      /-213810
                 |   /-|
                 |  |   \-697329
                 |  |
                 |  |      /-203119
       

In [268]:
leaves = {}
for l in treeout.get_leaves():
    if l.name not in leaves:
        leaves[l.name] = 1
    else:
        leaves[l.name] += 1
print(max(leaves.values()))

3
