In [53]:
import sys, os, optparse
import obonet
import networkx
import pprint
import inspect
import time
from helpformat import PrettyHelpFormatter
import logfile

def lineno():
    """Return a 'Line number N: ' prefix for the line that called this function."""
    # f_back is the frame of whoever called lineno(), i.e. the line being reported.
    caller_frame = inspect.currentframe().f_back
    return "Line number {:d}: ".format(caller_frame.f_lineno)

def read_tree_file(file):
    """Reads in a .obo ontology tree file.

    Args:
        file: The .obo source; it is passed unchanged to obonet.read_obo.

    Returns:
        The graph object returned by obonet.read_obo.

    Raises:
        ValueError: If obonet.read_obo raises ValueError. The original
            error is chained as the cause.
    """
    try:
        return obonet.read_obo(file)
    except ValueError as err:
        # Adjacent literals are used instead of a backslash continuation
        # inside the string, which used to embed the source indentation
        # (a run of spaces) in the middle of the message.
        raise ValueError(
            'File: %s has unknown file structure for .obo file. '
            '\n \t It may be the wrong file type.' % file
        ) from err

def read_entries_list(file):
    """Reads in a list of values of interest, one per line.

    Values may be names of entries in the ontology, or identifiers.
    Surrounding whitespace is stripped from every line, and lines that
    are empty after stripping are skipped: an empty string is never a
    valid name or identifier, so keeping it would only make get_id raise
    a KeyError later on.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of the non-empty, stripped lines in file order.
    """
    with open(file) as f:
        stripped_lines = (line.strip() for line in f)
        entries = [entry for entry in stripped_lines if entry]
    print(lineno(), 'entries:', entries)
    return entries

def get_name(node):
    """Returns the name associated with node.

    If node is a key of the module-level id_to_name mapping, the mapped
    name is returned. Otherwise, if node is a key of name_to_id, node is
    already a name and is returned unchanged.

    Raises a KeyError if node is in neither mapping."""
    if node in id_to_name:
        return id_to_name[node]
    if node in name_to_id:
        return node
    raise KeyError('Name: %s is not in the dataset' % node)
        
def get_id(node):
    """Returns the identifier associated with node.

    If node is a key of the module-level name_to_id mapping, the mapped
    identifier is returned. Otherwise, if node is a key of id_to_name,
    node is already an identifier and is returned unchanged.

    Raises a KeyError if node is in neither mapping."""
    is_known_name = node in name_to_id
    if not is_known_name and node not in id_to_name:
        raise KeyError('Identifier: %s is not in the dataset' % node)
    id_num = name_to_id[node] if is_known_name else node
    print(lineno(), 'id_num:', id_num)
    return id_num

def get_category_list(umbrella_node=None):
    """Gets the categories to which the selected entries should be matched.

    The categories are the source nodes of every edge of the module-level
    graph that points into umbrella_node.

    Args:
        umbrella_node: Identifier of the supracategory node. When None,
            the identifier of the module-level supra_cat is used.

    Returns:
        A dict mapping each category name (via get_name) to its node.

    Raises:
        KeyError: Propagated from get_id / get_name when a node is not in
            the dataset.
    """
    if umbrella_node is None:
        umbrella_node = get_id(supra_cat)
    print(lineno(), 'umbrella_node:', umbrella_node)
    # in_edges yields (source, target, key) triples; only the source node
    # (the category itself) is needed here.
    return {
        get_name(child): child
        for child, _parent, _key in graph.in_edges(umbrella_node, keys=True)
    }

def get_paths(entry, umbrella_node=None):
    """Gets all simple paths from entry to the category nodes.

    Args:
        entry: Name or identifier of the starting node; it is resolved to
            an identifier with get_id.
        umbrella_node: Identifier of the supracategory whose categories
            are the path targets. When None, the identifier of the
            module-level supra_cat is used.

    Returns:
        The iterator returned by networkx.all_simple_paths over the
        module-level graph.

    Raises:
        KeyError: Propagated from get_id when entry (or supra_cat) is not
            in the dataset.
    """
    if umbrella_node is None:
        umbrella_node = get_id(supra_cat)
    # Resolved unconditionally. This assignment used to sit inside the
    # branch above, so passing an explicit umbrella_node left origin
    # unbound and the call below raised UnboundLocalError.
    origin = get_id(entry)
    print(lineno(), 'umbrella_node:', umbrella_node)
    destinations = get_category_list(umbrella_node).values()
    paths = networkx.all_simple_paths(graph, source=origin, target=destinations)
    print(lineno(), 'paths:', paths)
    return paths

def get_start_and_end(path):
    """Returns the (entry, category) names for a path.

    The entry is the name of the first node in path and the category is
    the name of the last node; both are looked up with get_name."""
    entry, category = get_name(path[0]), get_name(path[-1])
    print(lineno(), 'entry, category:', entry, category)
    return (entry, category)

def match_categories(nodes=None):
    """Finds the categories that each entry falls under.

    For every node, all paths returned by get_paths are walked, and the
    category at the end of each path is recorded under the entry name at
    its start.

    Args:
        nodes: Iterable of entry names or identifiers. When None, the
            module-level entries list is used.

    Returns:
        A dict keyed by entry name. Each value is a set of category names
        when the module-level duplicates flag is falsy (the default), or
        a list (which may repeat a category) when it is truthy. Entries
        for which no path is found do not appear in the dict.

    Raises:
        KeyError: Propagated from get_paths / get_start_and_end when a
            node is not in the dataset.
    """
    if nodes is None:
        nodes = entries
    results = {}
    for node in nodes:
        paths = get_paths(node)
        print('hydra')
        for path in paths:
            print(lineno(), 'path: ', path[0], path[-1])
            entry, category = get_start_and_end(path)
            # Branch on truthiness. The previous `== True` / `== False`
            # checks matched neither branch for a truthy non-bool flag
            # (e.g. 'yes'), which silently discarded every result.
            if duplicates:
                results.setdefault(entry, []).append(category)
            else:
                results.setdefault(entry, set()).add(category)
    print(lineno(), 'results:', results)
    return results

# Debate: multiple output functions, or one that implements a 'switch/case'?
def write_json(results, mode='w'):
    """Writes the entries and their categories to out_file as JSON.

    Each value in results is converted to a list first, because sets are
    not JSON-serializable. The file named by the module-level out_file is
    opened with the given mode. Afterwards the unconverted results are
    passed to logfile.write_out together with the module-level log_file."""
    import json
    with open(out_file, mode) as handle:
        serializable = {}
        for entry, categories in results.items():
            serializable[entry] = list(categories)
        json.dump(serializable, handle, indent=4, sort_keys=True)
    logfile.write_out(results, log_file)

def write_json_plus_input(results, mode='w'):
    """Appends results to out_file by calling write_json in 'a' mode.

    The mode argument is accepted but not used; write_json is always
    called with 'a'. Writing out the inputs is not implemented yet."""
    # print out inputs
    write_json(results, mode='a')

In [60]:
# Module-level settings and lookup tables read by the functions above.
log_file = 'log.txt'
graph = read_tree_file('efo.obo')
entries = read_entries_list('diseases.txt')
supra_cat = 'measurement'
out_file = 'entry_categories.json'
duplicates = False
work_dir = os.getcwd().lstrip()
# Two lookup tables over the graph nodes that carry a 'name' attribute:
# identifier -> name and name -> identifier.
id_to_name = {
    id_: data.get('name')
    for id_, data in graph.nodes(data=True)
    if 'name' in data
}
name_to_id = {
    data['name']: id_
    for id_, data in graph.nodes(data=True)
    if 'name' in data
}


Line number 31:  entries: ['EFO:0007878', 'alcohol consumption measurement', 'interleukin-6 measurement']


In [59]:
# Spot check: look up the identifier stored in name_to_id for one trait name.
trait = 'interleukin-6 measurement'
name_to_id[trait]

'EFO:0004810'

In [56]:
# NOTE(review): match_categories() is called without arguments in this
# notebook, so it falls back to the global entries list rather than this
# one -- confirm whether nodes was meant to be passed in.
nodes = ['interleukin-6 measurement']

In [61]:
# NOTE(review): match_categories builds and returns its own local results
# dict, so this module-level results is not read or filled by the call.
results = {}
match_categories()

Line number 61:  id_num: EFO:0001444
Line number 61:  id_num: EFO:0007878
Line number 87:  umbrella_node: EFO:0001444
Line number 72:  umbrella_node: EFO:0001444
Line number 91:  paths: <generator object empty_generator at 0x11fd03c10>
hydra
Line number 61:  id_num: EFO:0001444
Line number 61:  id_num: EFO:0007878
Line number 87:  umbrella_node: EFO:0001444
Line number 72:  umbrella_node: EFO:0001444
Line number 91:  paths: <generator object empty_generator at 0x11fd033c0>
hydra
Line number 61:  id_num: EFO:0001444
Line number 61:  id_num: EFO:0004810
Line number 87:  umbrella_node: EFO:0001444
Line number 72:  umbrella_node: EFO:0001444
Line number 91:  paths: <generator object _all_simple_paths_multigraph at 0x11fd03c10>
hydra
athena
Line number 116:  path:  EFO:0004810 EFO:0004872
Line number 99:  entry, category: interleukin-6 measurement inflammatory biomarker measurement
athena
Line number 116:  path:  EFO:0004810 EFO:0004747
Line number 99:  entry, category: interleukin-6 measur

{'interleukin-6 measurement': {'inflammatory biomarker measurement',
  'protein measurement'}}

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adj',
 '_node',
 '_pred',
 '_succ',
 'add_edge',
 'add_edges_from',
 'add_node',
 'add_nodes_from',
 'add_weighted_edges_from',
 'adj',
 'adjacency',
 'adjlist_inner_dict_factory',
 'adjlist_outer_dict_factory',
 'clear',
 'clear_edges',
 'copy',
 'degree',
 'edge_attr_dict_factory',
 'edge_key_dict_factory',
 'edge_subgraph',
 'edges',
 'get_edge_data',
 'graph',
 'graph_attr_dict_factory',
 'has_edge',
 'has_node',
 'has_predecessor',
 'has_successor',
 'in_degree',
 'in_edges',
 'is_directed',
 'is_multigraph',
 'name',
 'nbunch_iter',
 'neighbors',
 'new_edge

http://www.ebi.ac.uk/efo/efo.owl
