## Script for converting a tree of life from the Tree of Life Project in xml to a networkx graph and to a graphml file

This script converts a tree of life from an XML format to a networkx graph, then to several file formats (JSON, GraphML).
The original data can be found here http://tolweb.org/tree/home.pages/downloadtree.html
The XML file available on the above website is licensed under the Creative Commons Attribution 3.0 license (https://creativecommons.org/licenses/by/3.0/); the copyright is owned by the Tree of Life Project.


Copyright 2017 Benjamin Ricaud

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
# Tools for parsing xml
import xml.etree.ElementTree as ET

In [None]:
# Parse the Tree of Life XML dump into an ElementTree held in 'tree'.
# If parsing fails, check that the file is encoded in UTF-8; re-saving it
# as UTF-8 from a text editor may be necessary.
xml_file_to_load = '../data/tolskeletaldumpUTF8.xml'
tree = ET.ElementTree(file=xml_file_to_load)

In [None]:
# The data will be loaded in a networkx graph
# The networkx module can be installed using 'pip install networkx'
import networkx as nx

In [None]:
# Build a directed NetworkX graph from the parsed XML held in 'tree'.
# Every <NODE> element becomes a graph node keyed by its 'ID' attribute; the
# node carries the element's XML attributes plus a 'name' entry read from its
# <NAME> child. Each parent -> child link becomes an edge with weight 1.


def _node_record(element):
    """Return ``(node_id, attributes)`` for one <NODE> element.

    ``attributes`` is a copy of the element's XML attributes (so the parsed
    tree itself is not modified) extended with a 'name' entry. When the
    <NAME> child is missing or has no text, the string 'None' is stored
    instead of the ``None`` object.

    Raises:
        KeyError: if the element has no 'ID' attribute.
    """
    name_element = element.find('NAME')
    name = None if name_element is None else name_element.text
    attributes = dict(element.attrib)
    attributes['name'] = 'None' if name is None else name
    return attributes['ID'], attributes


# The counter is incremented once per child visited; it starts at 1,
# presumably to account for the root, which is never visited as a child.
i = 1
G = nx.DiGraph()
root = tree.getroot()
for livingElement in root.iter('NODE'):
    node_id, data_dic = _node_record(livingElement)
    name = data_dic['name']
    if not G.has_node(node_id):
        G.add_nodes_from([(node_id, data_dic)])
    if data_dic['CHILDCOUNT'] == '0':
        continue
    # Children are the <NODE> elements nested one level down, inside a
    # container element. Selecting them by path rather than by position
    # keeps this working if another element precedes the container.
    for child in livingElement.findall('*/NODE'):
        child_id, child_data_dic = _node_record(child)
        child_name = child_data_dic['name']
        if not G.has_node(child_id):
            G.add_nodes_from([(child_id, child_data_dic)])
        if G.has_edge(node_id, child_id):
            # The same parent/child pair appearing twice is reported, then
            # the edge is simply written again.
            print('found existing edge', name, child_name)
            print('data: ', data_dic, child_data_dic)
        G.add_edge(node_id, child_id, weight=1)
        i += 1
print('Number of nodes processed:', i)
print('Number of nodes in the graph:', G.number_of_nodes())
print('Number of edges in the graph:', G.number_of_edges())
print('The graph is a tree?', nx.is_tree(G))

In [None]:
# Locate the root of the tree: a node with no incoming edge (in-degree 0).
# All such nodes are collected and the first one is taken as the root.
root_node_list = [node for node in G.nodes if G.in_degree(node) == 0]
root_node_id = root_node_list[0]
print('Root node id:', root_node_id)

In [None]:
# Show the root node's attributes, its degree and the names of its children
root_attributes = G.nodes[root_node_id]
print(root_attributes)
root_degree = G.degree(root_node_id)
print('Degree:', root_degree)
successor_names = [G.nodes[child_id]['name'] for child_id in G.successors(root_node_id)]
print('Successors: ', successor_names)

In [None]:
# Print a particular node, then its chain of ancestors up to the root.
start_node_id = '16421' # Homo sapiens
# Upper bound on the number of steps, so that a malformed graph (for example
# one containing a cycle) cannot make the walk run forever.
max_steps = 200
node_data = G.nodes[start_node_id]
print(node_data['ID'], node_data['name'])
node_id = start_node_id
print('Ancestors:')
for _ in range(max_steps):
    # Test before climbing: the root has no predecessor to look up.
    if node_id == root_node_id:
        break
    predecessors = list(G.predecessors(node_id))
    if not predecessors:
        # A parentless node other than the detected root: nothing to climb.
        break
    # Follow the first predecessor only.
    node_id = predecessors[0]
    node_data = G.nodes[node_id]
    print(node_data['ID'], node_data['name'])

In [None]:
# Export the full graph to a JSON file in node-link form
from networkx.readwrite import json_graph
import json
with open('../data/treeoflife.json', 'w') as outfile1:
    node_link_payload = json_graph.node_link_data(G)
    serialized_graph = json.dumps(node_link_payload)
    outfile1.write(serialized_graph)

In [None]:
# Export the full graph to a GraphML file
graphml_path = "../data/treeoflife.graphml"
nx.write_graphml(G, graphml_path)

See https://networkx.github.io/ for more file formats and additional details on the handling of the graph.

## Saving a subgraph of the full graph

In [None]:
# Create a subgraph
# The subgraph contains the root and its descendants up to 'depth' levels below it
# First step: select the nodes
depth = 2
# Use a set literal here: set(root_node_id) would split the id string into
# its individual characters instead of holding the id itself.
set_of_nodes = {root_node_id}
# 'frontier' holds the nodes added during the previous level, so each node's
# neighbours are looked up only once.
frontier = {root_node_id}
for _ in range(depth):
    reached = set()
    for parent_id in frontier:
        reached.update(G.neighbors(parent_id))
    frontier = reached - set_of_nodes
    set_of_nodes |= frontier
print('Number of nodes selected:', len(set_of_nodes))
print(set_of_nodes)
# Second step: extract the subgraph
G_sub = G.subgraph(list(set_of_nodes))

In [None]:
# Export the subgraph to a JSON file in node-link form
from networkx.readwrite import json_graph
import json
with open('../data/treeoflife_subset.json', 'w') as outfile1:
    subset_payload = json_graph.node_link_data(G_sub)
    serialized_subset = json.dumps(subset_payload)
    outfile1.write(serialized_subset)

In [None]:
# Sanity checks on the extracted subgraph (both should print True)
subgraph_is_tree = nx.is_tree(G_sub)
print('The graph is a tree?', subgraph_is_tree)
subgraph_is_connected = nx.is_weakly_connected(G_sub)
print('The graph is weakly connected?', subgraph_is_connected)