/
entity_search.py
468 lines (387 loc) · 22.2 KB
/
entity_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#######################################################################################################################
# purpose #
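# This script serves as development space for exploring different ways to characterize the types of paths that exist
# between two input entities. It works on a networkx MultiDiGraph representation of a knowledge graph and also
# provides helpers for formatting and visualizing the search results.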
#######################################################################################################################
# import needed libraries
import json
import matplotlib.pyplot as plt
import networkx as nx
import random
# from networkx.drawing.nx_pydot import graphviz_layout
from pyvis.network import Network # type: ignore
from rdflib import URIRef # type: ignore
from rdflib.namespace import OWL, RDF, RDFS # type: ignore
from typing import Callable, Dict, List, Optional, Tuple, Union
def format_path_ancestors(anc_dict: Dict, node_metadata: Dict) -> List:
    """Converts a level-keyed dictionary of ancestors into a nested list of display strings.

    Args:
        anc_dict: A dictionary where keys are hierarchy levels (ints formatted as strings) and values are the
            collections of concept identifiers found at that level.
        node_metadata: A nested dictionary keyed by identifier string; every entry must provide a 'label'.

    Returns:
        ancestors: A nested list holding one inner list per level, ordered by ascending numeric level. Each entry is
            formatted as '<label> (<identifier>)'.
    """

    ancestors = []
    # levels are stored as strings, so order them numerically ('10' has to come after '2')
    for level in sorted(int(key) for key in anc_dict):
        members = anc_dict[str(level)]
        ancestors.append(['{} ({})'.format(node_metadata[str(member)]['label'], member) for member in members])

    return ancestors


def formats_node_information(node: URIRef, neighborhood: List, metadata_dict: Dict, verbose: bool = False) -> None:
    """Prints one formatted line for every (edge, object) pair in a node's neighborhood.

    Args:
        node: The searched node. Only node[0] is read, so this is expected to be a sequence whose first item is the
            node URL (NOTE(review): the URIRef annotation does not match this usage -- confirm against callers).
        neighborhood: A nested list of [edge, object] pairs, where each item is a relation or node identifier.
        metadata_dict: A nested dictionary keyed by identifier string. Every entry must provide a 'label' and every
            object entry must also provide a 'description'.
        verbose: A bool indicating whether or not object definitions should be printed along with each row.

    Returns:
        None
    """

    if not neighborhood:
        return None

    # the subject is identical for every row, so resolve it a single time
    s, s_lab = str(node[0]).split('/')[-1], metadata_dict[str(node[0])]['label']

    for row, (e, o) in enumerate(neighborhood):
        # track the running position instead of calling neighborhood.index(), which is quadratic, raises a
        # ValueError when the pairs are tuples and mistakes a repeated pair for the first row
        spe = '\n' if row == 0 else '\n\n'
        e_lab = metadata_dict[str(e)]['label']
        # TODO -- remove this line once the metadata are fixed
        e_lab = 'causally related to' if e_lab == 'is substance that treats' else e_lab
        o_meta = metadata_dict[str(o)]
        o_cut, o_lab, o_def = str(o).split('/')[-1], o_meta['label'], o_meta['description']
        if verbose:
            # the literal string 'None' appears to mark an object without a definition
            if o_def != 'None':
                print(spe + '>>> {} ({}) - {} - {} ({})\n{} Definition: {}'.format(s_lab, s, e_lab, o_lab, o_cut,
                                                                                    o_cut, o_def))
            else:
                print(spe + '>>> {} ({}) - {} - {} ({})'.format(s_lab, s, e_lab, o_lab, o_cut))
        else:
            print('>>> {} ({}) - {} - {} ({})'.format(s_lab, s, e_lab, o_lab, o_cut))

    return None


def metadata_formatter(s: str, o: str, metadata_dict: Dict, out_type: bool = False) -> Optional[str]:
    """Looks up the edge-level metadata stored for a subject-object pair and either prints or returns it.

    Args:
        s: A string containing the identifier for the subject node of a predicate or triple.
        o: A string containing the identifier for the object node of a predicate or triple.
        metadata_dict: A nested dictionary containing node and edge-level metadata; edge metadata are read from
            metadata_dict['edges'] using '<subject>-<object>' keys.
        out_type: A bool indicating how to return metadata (default=False). When False the metadata are printed,
            when True they are returned.

    Returns:
        The metadata serialized as an indented JSON string when out_type is True and an entry exists, otherwise None.
    """

    # identifiers containing 'R-HSA' carry a '-reactome_' suffix in the edge keys
    subj = s + '-reactome_' if 'R-HSA' in s else s
    obj = o + '-reactome_' if 'R-HSA' in o else o
    edge_metadata = metadata_dict['edges']

    # an edge may be stored under either orientation; the subject-object ordering is tried first
    for edge_key in (subj + '-' + obj, obj + '-' + subj):
        if edge_key in edge_metadata:
            evidence = json.dumps(edge_metadata[edge_key], indent=4)
            if out_type:
                return evidence
            print('\nEdge Evidence')
            print(evidence)
            return None

    return None


def _edge_keys_between(kg: nx.multidigraph.MultiDiGraph, s, o) -> List:
    """Returns the keys of the edges joining s and o, looking up s -> o first and o -> s second."""

    edge_data = kg.get_edge_data(s, o)
    if edge_data is None:
        # a path can traverse an edge against its direction, so retry with the endpoints swapped
        edge_data = kg.get_edge_data(o, s)

    return [] if edge_data is None else list(edge_data.keys())


def formats_path_information(kg: nx.multidigraph.MultiDiGraph, paths: List, path_type: str, metadata_func: Callable,
                             metadata_dict: Dict, node_metadata: Dict, verbose: bool = False, rand: bool = False,
                             sample_size: int = 10) -> None:
    """Prints shortest and simple path results, one banner-delimited section per path.

    Args:
        kg: A networkx MultiDiGraph object.
        paths: A nested list of strings, where each string contains an entity identifier.
        path_type: A string, either 'simple' or 'shortest'. It is kept for backward compatibility; both kinds of
            paths are processed identically, including the reversed-edge lookup that 'shortest' used to lack.
        metadata_func: A function that processes edge metadata; called as metadata_func(subject, object,
            metadata_dict) with the shortened identifiers.
        metadata_dict: A nested dictionary containing node and edge-level metadata.
        node_metadata: A nested dictionary containing node attributes; nodes and edge keys are looked up by their
            string form and must provide a 'label'.
        verbose: A bool indicating whether or not node and edge metadata should be printed.
        rand: A bool indicating whether or not to draw random samples from the path.
        sample_size: An integer used when rand is True to specify the size of the random sample to draw.

    Returns:
        None
    """

    if rand:
        # never request more paths than are available
        paths = random.sample(paths, min(sample_size, len(paths)))

    banner = '*' * 100
    for path in paths:
        print(banner)
        for s, o in zip(path, path[1:]):
            edge_keys = _edge_keys_between(kg, s, o)
            if not edge_keys:
                # no edge in either direction: there is nothing to report for this hop
                continue
            s_cut, s_label = str(s).split('/')[-1], node_metadata[str(s)]['label']
            o_cut, o_label = str(o).split('/')[-1], node_metadata[str(o)]['label']
            for e in edge_keys:
                e_label = node_metadata[str(e)]['label']
                # TODO -- remove this line once the metadata are fixed
                e_label = 'causally related to' if e_label == 'is substance that treats' else e_label
                print('>>> {} ({}) - {} - {} ({})'.format(s_label, s_cut, e_label, o_label, o_cut))
                if verbose:
                    metadata_func(s_cut, o_cut, metadata_dict)
        print(banner)
        print('\n')

    return None


def nx_ancestor_search(kg: nx.multidigraph.MultiDiGraph, nodes: List, prefix: str,
                       anc_list: Optional[List] = None) -> List:
    """Collects the ancestors of the input nodes that are reachable by following rdfs:subClassOf edges.

    Args:
        kg: A networkx MultiDiGraph object.
        nodes: A list of nodes to search from. The list is used as the work queue and is consumed in place, so it
            is empty when the function returns.
        prefix: A string containing an ontology prefix (e.g., MONDO); only neighbors whose string form contains the
            prefix are treated as ancestors.
        anc_list: A list that is empty or that contains previously collected ancestor lists. When provided it is
            extended in place and returned.

    Returns:
        ancestor_list: A nested list in which every inner list holds the ancestor identifiers (as strings) found for
            one processed node, in the order the nodes were processed.
    """

    ancestor_list = [] if anc_list is None else anc_list

    # iterate rather than recurse once per node, so that large hierarchies cannot exhaust the recursion limit
    # NOTE(review): a node is only skipped while it is still queued, so a subClassOf cycle would never terminate
    while nodes:
        node = nodes.pop()
        ancestors = []
        for neighbor in kg.neighbors(node):
            if prefix not in str(neighbor):
                continue
            if any(key == RDFS.subClassOf for key in kg.get_edge_data(node, neighbor).keys()):
                ancestors.append(neighbor)
        if ancestors:
            ancestor_list.append([str(x) for x in ancestors])
            nodes += [x for x in ancestors if x not in nodes]

    return ancestor_list


def processes_ancestor_path_list(path_list: List) -> Dict:
    """Processes a nested list of ancestor paths into a dictionary keyed by level.

    Args:
        path_list: A nested list of ontology URLs, where each list represents a set of ancestors.

    Returns:
        anc_dict: A dictionary where keys are ints formatted as strings and values are sets of URL strings. A
            concept that occurs in several inner lists is only stored under the highest list index it occurs at.
    """

    # later occurrences overwrite earlier ones, which leaves every concept mapped to the last list containing it;
    # computing this once avoids rescanning the whole path_list for every single concept
    deepest_level = {x: level for level, path in enumerate(path_list) for x in path}

    anc_dict: Dict = dict()
    for path in path_list:
        for x in path:
            anc_dict.setdefault(str(deepest_level[x]), set()).add(x)

    return anc_dict


def nudge(pos: Dict, x_shift: int, y_shift: int) -> Dict:
    """Offsets node label positions on the plot so that the labels do not overlap the arrows.

    Args:
        pos: A dictionary containing x and y axis information for each node in a graph.
        x_shift: An integer specifying the amount of x-axis shift.
        y_shift: An integer specifying the amount of y-axis shift.

    Returns:
        A new dictionary holding the shifted (x, y) position of every node; pos itself is left unchanged.
    """

    shifted = {}
    for node, (x_coord, y_coord) in pos.items():
        shifted[node] = (x_coord + x_shift, y_coord + y_shift)

    return shifted


def hierarchy_pos(g, root=None, width=1., vert_gap=0.2, vert_loc=0, xcenter=0.5):
    """From Joel's answer at https://stackoverflow.com/a/29597209/2966723.
    Licensed under Creative Commons Attribution-Share Alike

    If the graph is a tree this will return the positions to plot this in a hierarchical layout.

    Args:
        g: the graph (must be a tree)
        root: the root node of current branch
            - if the tree is directed and this is not given, the root will be found and used
            - if the tree is directed and this is given, then the positions will be just for the descendants of
              this node.
            - if the tree is undirected and not given, then a random choice will be used.
        width: horizontal space allocated for this branch - avoids overlap with other branches
        vert_gap: gap between levels of hierarchy
        vert_loc: vertical location of root
        xcenter: horizontal location of root

    Returns:
        A dictionary mapping every reached node to its (x, y) position.
    """

    if root is None:
        if isinstance(g, nx.DiGraph):
            root = next(iter(nx.topological_sort(g)))  # allows back compatibility with nx version 1.11
        else:
            root = random.choice(list(g.nodes))

    def _hierarchy_pos(g0, root_node, wid=1., v_gap=0.2, v_loc=0, xcent=0.5, pos=None, parent=None):
        """see hierarchy_pos docstring for most arguments

        Args:
            pos: a dict saying where all nodes go if they have been assigned
            parent: parent of this branch. - only affects it if non-directed
        """

        if pos is None:
            pos = {root_node: (xcent, v_loc)}
        else:
            pos[root_node] = (xcent, v_loc)

        children = list(g0.neighbors(root_node))
        if not isinstance(g0, nx.DiGraph) and parent is not None:
            # in an undirected graph the parent shows up as a neighbor and must not be laid out again
            children.remove(parent)

        if children:
            # split the width of THIS branch (wid), not the width of the whole plot, among the children so
            # that sibling subtrees do not overlap
            dx = wid / len(children)
            nextx = xcent - wid / 2 - dx / 2
            for child in children:
                nextx += dx
                # the parent of each child is the node being laid out here, not the overall root
                pos = _hierarchy_pos(g0, child, wid=dx, v_gap=v_gap, v_loc=v_loc - v_gap, xcent=nextx,
                                     pos=pos, parent=root_node)

        return pos

    return _hierarchy_pos(g, root, width, vert_gap, vert_loc, xcenter)


def visualize_ancestor_tree(node_list: List) -> None:
    """Draws a hierarchical tree from a nested list of ancestor information.

    Args:
        node_list: A nested list of ancestor information, one inner list of '<label> (<identifier>)' strings per
            level.

    Returns:
        None.
    """

    # walk the levels in reverse order, linking every entry of a level to every entry of the level that follows;
    # nodes are named by the text that precedes the first '(' of each entry
    levels = node_list[::-1]
    tree_edges = [(upper_entry.split('(')[0].rstrip(), lower_entry.split('(')[0].rstrip())
                  for upper, lower in zip(levels, levels[1:])
                  for upper_entry in upper
                  for lower_entry in lower]

    # convert to graph object
    tree = nx.DiGraph()
    tree.add_edges_from(tree_edges)

    # compute the layout; labels currently share the node positions (zero shift)
    layout = hierarchy_pos(tree)
    label_layout = nudge(layout, 0, 0)

    # set up the canvas
    plt.figure(figsize=(14, 14))
    axes = plt.subplot(1, 1, 1)
    plt.margins(0.1)

    # render image
    nx.draw(tree, pos=layout, ax=axes, with_labels=False, arrows=True, arrowsize=20, edge_color='gray')
    nx.draw_networkx_nodes(tree, pos=layout, node_color='lightblue', node_size=600)
    nx.draw_networkx_labels(tree, pos=label_layout, font_weight='bold', ax=axes)

    return None


def pyvis_visualizer(node_info: list, edge_info: list) -> Network:
    """Builds a pyvis network from processed node and edge data.

    Args:
        node_info: A nested list of node information holding four parallel lists: ids, titles, labels and values.
        edge_info: A nested list of edge information holding two parallel lists: the edges (pairs of node ids) and
            the edge titles.

    Returns:
        A pyvis Network object containing the nodes and edges.
    """

    # unpack the parallel lists
    node_ids, node_titles, node_labels, node_values = node_info
    edges, edge_captions = edge_info

    # create graph
    net = Network(height="750px", width="100%", notebook=True, bgcolor="#222222", font_color="white")
    net.add_nodes(node_ids, value=node_values, title=node_titles, label=node_labels)
    for position, edge in enumerate(edges):
        net.add_edge(edge[0], edge[1], title=edge_captions[position])

    return net


def _split_display_entry(entry: str) -> Tuple[str, str]:
    """Splits a '<label> (<uri>)' display string into its (label, uri) parts at the last opening parenthesis."""

    label, _, tail = entry.rpartition('(')

    return label.rstrip(), tail.strip(')')


def visualize_kg_output(kg: nx.multidigraph.MultiDiGraph, node_list: list, node_metadata: dict) -> Network:
    """Processes a nested list of nodes in order to visualize them.

    Args:
        kg: A networkx MultiDiGraph object.
        node_list: A nested list of ancestor information, one inner list of '<label> (<uri>)' strings per level.
        node_metadata: A nested dictionary containing node attributes; used to look up the label of edge keys.

    Returns:
        A pyvis Network object in which every entry of a level is connected to every entry of the next level.
    """

    # process edges
    edge_list = []
    edge_titles = []
    for upper, lower in zip(node_list, node_list[1:]):
        for i in upper:
            # take the text after the last '(' as the uri; rpartition(...)[1] would only return the '(' itself
            s = URIRef(_split_display_entry(i)[1])
            for j in lower:
                edge_list.append([i, j])
                o = URIRef(_split_display_entry(j)[1])
                edge_data = kg.get_edge_data(s, o)
                if not edge_data:
                    # no edge stored between the two nodes: fall back to the generic hierarchy relation
                    edge_titles.append('subClassOf')
                else:
                    edge = next(iter(edge_data))
                    # show the raw edge identifier when the edge has no label in the metadata
                    edge_titles.append(node_metadata.get(str(edge), {}).get('label', str(edge)))

    # process nodes
    ids = []
    titles = []
    labels = []
    values = []
    for level in node_list:
        for entry in level:
            label, uri = _split_display_entry(entry)
            ids.append(entry)
            titles.append(uri)
            labels.append(label)
            # the number of incoming edges is passed on as the node's value
            values.append(len(kg.in_edges(URIRef(uri))))

    return pyvis_visualizer([ids, titles, labels, values], [edge_list, edge_titles])


def visualize_pheknowlator_schema() -> Network:
    """Visualizes the PheKnowLator v3.0.2 schema with node and edge count.

    Returns:
        pyvis Network object.
    """

    # (node name, node count shown as title, node color); the node name doubles as the label
    node_specs = [
        ('Phenotypes', '16,291', '#fbafd1ff'),
        ('Diseases', '22,334', '#e06666ff'),
        ('Chemicals', '150,080', '#bc376aff'),
        ('Variants', '145,156', '#e2d3e7ff'),
        ('Transcripts', '190,829', '#88d8ffff'),
        ('Genes', '26,532', '#b7b7b7ff'),
        ('Molecular Functions', '4,442', '#8e7cc3ff'),
        ('Biological Processes', '12,329', '#8e7cc3ff'),
        ('Cellular Components', '1,801', '#8e7cc3ff'),
        ('Cell Lines', '41,791', '#76a5afff'),
        ('Cells', '2,368', '#f6b26bff'),
        ('Proteins', '96,197', '#6aa84fff'),
        ('Anatomical Entities', '14,181', '#3d85c6ff'),
        ('Cofactors', '44', '#bc376aff'),
        ('Catalysts', '3,749', '#bc376aff'),
        ('Pathways', '13,794', '#e7e6e6ff'),
    ]

    # (source node, target node, relation with edge count shown as title), in insertion order
    edge_specs = [
        ('Chemicals', 'Diseases', 'causally related to (n=172,573)'),
        ('Chemicals', 'Genes', 'interacts with (n=16,708)'),
        ('Chemicals', 'Biological Processes', 'molecularly interacts with (n=288,873)'),
        ('Chemicals', 'Cellular Components', 'molecularly interacts with (n=47,716)'),
        ('Chemicals', 'Molecular Functions', 'molecularly interacts with (n=28,077)'),
        ('Chemicals', 'Pathways', 'participates in (n=29,988)'),
        ('Chemicals', 'Phenotypes', 'causally related to (n=110,898)'),
        ('Chemicals', 'Proteins', 'interacts with (n=71,679)'),
        ('Chemicals', 'Transcripts', 'interacts with (n=0)'),
        ('Diseases', 'Phenotypes', 'has phenotype (n=435,102)'),
        ('Genes', 'Diseases', 'causes or contributes to (n=12,842)'),
        ('Genes', 'Genes', 'genetically interacts with (n=1,694)'),
        ('Genes', 'Pathways', 'participates in (n=107,009)'),
        ('Genes', 'Phenotypes', 'cause or contributes to (n=24,760)'),
        ('Genes', 'Proteins', 'has gene product (n=19,521)'),
        ('Genes', 'Transcripts', 'transcribed to (n=182,692)'),
        ('Biological Processes', 'Pathways', 'realized in response to (n=672)'),
        ('Pathways', 'Cellular Components', 'has component (n=16,014)'),
        ('Pathways', 'Molecular Functions', 'has function (n=2,426)'),
        ('Proteins', 'Anatomical Entities', 'located in (n=30,681)'),
        ('Proteins', 'Catalysts', 'molecularly interacts with (n=25,136)'),
        ('Proteins', 'Cells', 'located in (n=75,313)'),
        ('Proteins', 'Cell Lines', 'located in (n=75,313)'),
        ('Proteins', 'Cofactors', 'molecularly interacts with (n=1,998)'),
        ('Proteins', 'Biological Processes', 'participates in (n=129,424)'),
        ('Proteins', 'Cellular Components', 'located in (n=82,526)'),
        ('Proteins', 'Molecular Functions', 'has function (n=69,801)'),
        ('Proteins', 'Pathways', 'participates in (n=117,813)'),
        ('Proteins', 'Proteins', 'molecularly interacts with (n=618,069)'),
        ('Transcripts', 'Anatomical Entities', 'located in (n=444,974)'),
        ('Transcripts', 'Cells', 'located in (n=65,180)'),
        ('Transcripts', 'Cell Lines', 'located in (n=65,180)'),
        ('Transcripts', 'Proteins', 'ribosomally translates to (n=44,205)'),
        ('Variants', 'Diseases', 'causes or contributes to (n=43,439)'),
        ('Variants', 'Genes', 'causally influences (n=145,129)'),
        ('Variants', 'Phenotypes', 'causes or contributes to (n=3,081)'),
    ]

    net = Network(height="750px", width="100%", notebook=True, bgcolor="#222222", font_color="white")
    net.add_nodes([name for name, _, _ in node_specs],
                  title=[count for _, count, _ in node_specs],
                  label=[name for name, _, _ in node_specs],
                  color=[shade for _, _, shade in node_specs])
    for source, target, caption in edge_specs:
        net.add_edge(source, target, title=caption)

    return net


def visualize_pheknowlator_ontologies() -> Network:
    """Visualizes a set of ontologies and the import relationships declared between them.

    Returns:
        pyvis Network object.
    """

    # (ontology abbreviation, full name shown as title, node color); the abbreviation doubles as the label
    node_specs = [
        ('CLO', 'Cell Line Ontology', '#76a5afff'),
        ('Mondo', 'Mondo Disease Ontology', '#e06666ff'),
        ('Uberon', 'Uber Anatomy Ontology', '#3d85c6ff'),
        ('CL', 'Cell Ontology', '#f6b26bff'),
        ('PRO', 'Protein Ontology', '#6aa84fff'),
        ('GO', 'Gene Ontology', '#8e7cc3ff'),
        ('PW', 'Pathway Ontology', '#ffe599ff'),
        ('ChEBI', 'Chemical Entities of Biological Interest', '#bc376aff'),
        ('HPO', 'Human Phenotype Ontology', '#fbafd1ff'),
        ('SO', 'Sequence Ontology', '#d9ead3ff'),
        ('RO', 'Relations Ontology', '#666666ff'),
        ('VO', 'Vaccine Ontology', '#b59e7dff'),
    ]

    # (source ontology, target ontologies), kept in insertion order
    import_specs = [
        ('HPO', ['Uberon', 'CL', 'GO', 'ChEBI', 'VO', 'PRO']),
        ('CLO', ['PRO', 'RO', 'ChEBI', 'GO', 'CL']),
        ('Mondo', ['SO', 'RO', 'GO', 'ChEBI', 'CL', 'Uberon']),
        ('Uberon', ['PRO', 'ChEBI', 'GO', 'CL']),
        ('CL', ['RO', 'GO', 'ChEBI']),
        ('PRO', ['GO', 'ChEBI']),
        ('GO', ['CL', 'RO', 'ChEBI', 'VO']),
        ('PW', ['GO']),
        ('VO', ['ChEBI', 'GO', 'Uberon', 'PRO']),
    ]

    net = Network(height="750px", width="100%", notebook=True, bgcolor="#222222", font_color="white")
    net.add_nodes([abbrev for abbrev, _, _ in node_specs],
                  title=[full_name for _, full_name, _ in node_specs],
                  label=[abbrev for abbrev, _, _ in node_specs],
                  color=[shade for _, _, shade in node_specs])
    for source, targets in import_specs:
        for target in targets:
            net.add_edge(source, target, title='ontology import')

    return net