In [1]:
from biothings_explorer.user_query_dispatcher import SingleEdgeQueryDispatcher
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
seqd = SingleEdgeQueryDispatcher(input_cls='Gene',
                                 output_cls='ChemicalSubstance',
                                 input_id='NCBIGene',
                                 values='1017')

In [3]:
seqd.query(verbose=True)

==== Step #1: Query path planning ====

Because NCBIGene:1017 is of type 'Gene', BTE will query our meta-KG for APIs that can take 'Gene' as input and 'ChemicalSubstance' as output

BTE found 10 apis:

API 1. pharos(1 API call)
API 2. hmdb(1 API call)
API 3. opentarget(1 API call)
API 4. mychem(3 API calls)
API 5. cord_gene(1 API call)
API 6. chembio(1 API call)
API 7. scigraph(1 API call)
API 8. dgidb(1 API call)
API 9. semmed_gene(10 API calls)
API 10. scibite(1 API call)


==== Step #2: Query path execution ====
NOTE: API requests are dispatched in parallel, so the list of APIs below is ordered by query time.

API 2.1: https://platform-api.opentargets.io/v3/platform/public/evidence/filter?target=ENSG00000123374&datasource=chembl&size=100&fields=drug
API 9.1: https://biothings.ncats.io/semmedgene/query?fields=negatively_regulates (POST -d q=C0108855,C1332733&scopes=umls)
API 9.2: https://biothings.ncats.io/semmedgene/query?fields=physically_interacts_with (POST -d q=C0108855,C1332733

In [11]:
seqd.input_cls

'Gene'

In [12]:
print(seqd.G.number_of_edges())

1215


In [46]:
for i in seqd.G.nodes:
    print(i, seqd.G.degree(i), seqd.G.number_of_edges('NCBIGene:1017',i))

NCBIGene:1017 1215 0
CHEMBL1197513 1 1
CHEBI:52214 1 1
CHEBI:36080 1 1
CHEBI:33252 1 1
CHEBI:35222 2 2
CHEMBL2163995 1 1
CHEMBL1964246 1 1
CHEMBL213713 1 1
CHEMBL272833 1 1
CHEMBL259838 1 1
CHEMBL257831 1 1
CHEMBL3354187 1 1
CHEMBL1964244 1 1
CHEMBL455195 1 1
CHEMBL405145 1 1
CHEMBL1964259 1 1
CHEMBL1761791 1 1
CHEMBL3642647 1 1
CHEMBL411426 1 1
CHEMBL3655765 1 1
CHEMBL3298984 1 1
CHEMBL79498 1 1
CHEMBL255463 1 1
CHEMBL485618 1 1
CHEMBL181114 1 1
CHEMBL424696 1 1
CHEMBL186288 1 1
CHEMBL488085 1 1
CHEMBL265903 1 1
CHEMBL415471 1 1
CHEMBL215086 1 1
CHEMBL496785 1 1
CHEMBL1171949 1 1
CHEMBL2158843 1 1
CHEMBL1964242 1 1
CHEMBL103285 1 1
CHEMBL295484 1 1
CHEMBL3357950 1 1
CHEMBL291324 1 1
CHEMBL79356 1 1
CHEMBL515001 1 1
CHEMBL260163 1 1
CHEMBL268507 1 1
CHEMBL255263 1 1
CHEMBL485619 1 1
CHEMBL3648117 1 1
CHEMBL520187 1 1
CHEMBL361833 1 1
CHEMBL476993 1 1
CHEMBL3644022 1 1
CHEMBL1964260 1 1
CHEMBL3648114 1 1
CHEMBL603097 1 1
CHEMBL509012 1 1
CHEMBL3648123 1 1
CHEMBL187750 1 1
CHEMBL260103 1

4-{5-[(1Z)-1-(2-IMINO-4-OXO-1,3-THIAZOLIDIN-5-YLIDENE)ETHYL]-2-FURYL}BENZENESULFONAMIDE 2 2
N-[4-(2,4-DIMETHYL-THIAZOL-5-YL)-PYRIMIDIN-2-YL]-N',N'-DIMETHYL-BENZENE-1,4-DIAMINE 2 2
6-(3,4-DIHYDROXYBENZYL)-3-ETHYL-1-(2,4,6-TRICHLOROPHENYL)-1H-PYRAZOLO[3,4-D]PYRIMIDIN-4(5H)-ONE 2 2
6-(3-AMINOPHENYL)-N-(TERT-BUTYL)-2-(TRIFLUOROMETHYL)QUINAZOLIN-4-AMINE 2 2
2-(4-(AMINOMETHYL)PIPERIDIN-1-YL)-N-(3_CYCLOHEXYL-4-OXO-2,4-DIHYDROINDENO[1,2-C]PYRAZOL-5-YL)ACETAMIDE 2 2
1-(3-(2,4-DIMETHYLTHIAZOL-5-YL)-4-OXO-2,4-DIHYDROINDENO[1,2-C]PYRAZOL-5-YL)-3-(4-METHYLPIPERAZIN-1-YL)UREA 2 2
4-{[5-(CYCLOHEXYLMETHOXY)[1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL]AMINO}BENZENESULFONAMIDE 2 2
4-{[5-(CYCLOHEXYLAMINO)[1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL]AMINO}BENZENESULFONAMIDE 2 2
4-({5-[(4-AMINOCYCLOHEXYL)AMINO][1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL}AMINO)BENZENESULFONAMIDE 2 2
4-{[5-(CYCLOHEXYLOXY)[1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL]AMINO}BENZENESULFONAMIDE 2 2
CAN-508 2 2
N-[3-(1H-BENZIMIDAZOL-2-YL)-1H-PYRAZOL-4-YL]BENZAM

# Filter 1
### Filter by node degree (number of edges attached to each node)

In [175]:
def filter_node_degree(G, count=50):
    """Keep the `count` highest-degree nodes of `G` and rank them.

    Args:
        G: networkx graph.
        count: number of nodes to keep (default 50). Passed to a slice, so
            `None` keeps every node.

    Returns:
        `G.subgraph(...)` over the selected nodes. Each selected node gets two
        attributes: 'filteredBy' = 'NodeDegree' and 'rank' (1 = highest degree,
        measured on `G`, not on the returned subgraph).

    Nodes with equal degree are ranked in the order `G.nodes` yields them.
    `sorted` is stable, so the ranking is reproducible from run to run.

    NOTE(review): networkx documents `subgraph` as returning a view that shares
    attribute dictionaries with the parent graph, so the attributes written
    here presumably show up on `G` as well -- confirm this is intended.
    """
    ranked = sorted(G.nodes, key=lambda node: G.degree(node), reverse=True)[:count]
    subG = G.subgraph(ranked)

    for rank, node in enumerate(ranked, start=1):
        attrs = subG.nodes[node]
        attrs['filteredBy'] = 'NodeDegree'
        attrs['rank'] = rank

    return subG

In [185]:
x = filter_node_degree(seqd.G,10)
len(x.nodes)

10

In [210]:
for i,node in enumerate(x.nodes):
    print(i,node,x.degree(node),x.nodes.data()[node]['rank'], '\n')

0 DINACICLIB 11 5 

1 AT-7519 7 9 

2 STAUROSPORINE 7 10 

3 ALVOCIDIB 38 2 

4 OLOMOUCINE 12 4 

5 SELICICLIB 9 6 

6 NCBIGene:1017 129 1 

7 ROSCOVITINE 8 7 

8 UCN-01 30 3 

9 RONICICLIB 7 8 



In [233]:
# List the filtered nodes in ascending 'rank' order: for each rank value
# 1..N, collect the node(s) carrying that rank.
l = []
for rank in range(1, len(x.nodes) + 1):
    l.extend(n for n in x.nodes if x.nodes.data()[n]['rank'] == rank)
l

['NCBIGene:1017',
 'ALVOCIDIB',
 'UCN-01',
 'OLOMOUCINE',
 'DINACICLIB',
 'SELICICLIB',
 'ROSCOVITINE',
 'RONICICLIB',
 'AT-7519',
 'STAUROSPORINE']

In [206]:
x.nodes(3)

NodeDataView({'DINACICLIB': None, 'AT-7519': None, 'STAUROSPORINE': None, 'ALVOCIDIB': None, 'OLOMOUCINE': None, 'SELICICLIB': None, 'NCBIGene:1017': None, 'ROSCOVITINE': None, 'UCN-01': None, 'RONICICLIB': None}, data=3)

In [143]:
assert seqd.G['NCBIGene:1017']['AT-7519']== x['NCBIGene:1017']['AT-7519']

In [180]:
import unittest

In [None]:
class TestFilterEdges(unittest.TestCase):

    # test for count values
    def test_count_values(self):
        counts = [10, 20, 40, 50, 100, 1050]

        seqd = SingleEdgeQeryDispatcher(output_cls='GenomicEntity',
                                        input_cls='Gene',
                                        input_id='HGNC',
                                        values='7890')
        seqd.query()
        for count in counts:
            newG = filter_node_degree(seqd.G, count)
            self.assertEqual(len(newG.nodes), 2)

# Filter 2
### Filter by type of relation

Takes a graph and the relation label(s) we are looking for, and returns a subgraph
containing only the edges with those labels; the remaining nodes are then ranked by degree.

In [2]:
seqd = SingleEdgeQueryDispatcher(input_cls='Gene',
                                 output_cls='ChemicalSubstance',
                                 input_id='NCBIGene',
                                 values='1017')
seqd.query()

In [412]:
# Collect the distinct edge labels in the query graph.
# seqd.G.edges yields (u, v, key) triples; look each edge up by its own key
# instead of a hard-coded 0, so labels that only occur on parallel edges
# between the same two nodes are not missed.
x = {seqd.G[u][v][key]['label'] for u, v, key in seqd.G.edges}
len(seqd.G.nodes)

3

In [397]:
def filter_label(G, label, count=50):
    """Keep only edges whose 'label' matches, then rank the nodes by degree.

    Args:
        G: networkx multigraph whose edges iterate as (u, v, key) triples and
            carry a 'label' attribute.
        label: a single label string, or an iterable of label strings.
        count: forwarded to `filter_node_degree` (default 50).

    Returns:
        The graph returned by `filter_node_degree` for the label-restricted
        edge subgraph. Every node keeps the 'rank' set by `filter_node_degree`,
        and 'filteredBy' is overwritten with 'EdgeLabel'.

    NOTE(review): `edge_subgraph` is documented by networkx as a view sharing
    attribute dictionaries with `G`, so the node attributes written here
    presumably appear on `G` too -- confirm this is intended.
    """
    # A bare string must be matched as a whole label. The previous
    # `edge_label in label` did a substring test on strings, so an edge
    # labelled 'related' matched label='related_to'.
    wanted = (label,) if isinstance(label, str) else tuple(label)

    val_edges = [
        (u, v, key)
        for u, v, key in G.edges
        if G[u][v][key]['label'] in wanted
    ]

    subG = G.edge_subgraph(val_edges)
    subG = filter_node_degree(subG, count)

    for node in subG.nodes:
        subG.nodes[node]['filteredBy'] = 'EdgeLabel'

    return subG

In [411]:
labels = ['related_to', 'negatively_regulated_by']
subG = filter_label(seqd.G, labels)

subG = filter_label(seqd.G, 'related_to')
for edge in subG.edges.data():
    assert 'related_to' == edge[2]['label']


In [404]:
len(seqd.G.edges)

1215

In [359]:
label = 'related_to'
# Test every (u, v, key) edge against its OWN attributes. Indexing with a
# hard-coded key 0 judged each parallel edge by the label of the first edge
# between the same two nodes.
val = [
    (u, v, key)
    for u, v, key in seqd.G.edges
    if seqd.G[u][v][key]['label'] == label
]

sG = seqd.G.edge_subgraph(val)

len(val)


582

In [63]:
seqd2 = SingleEdgeQueryDispatcher(output_cls='GenomicEntity',
                                         input_cls='ChemicalSubstance',
                                         pred="related_to",
                                         input_id='CHEBI',
                                         values='CHEBI:28640')
seqd2.query()

# Filter 3: Co-occurrence
### Using NIH MRCOC co-occurrence files, query MRCOC API

In [2]:
import time
import requests

In [12]:
# helper funcs
def get_ids(node, G=None):
    """Return the MESH and UMLS identifiers recorded for `node`.

    Args:
        node: node key in the graph.
        G: graph whose `nodes[node]['equivalent_ids']` mapping is read.
            Defaults to the notebook-level `seqd.G`, which is what the
            original one-argument form used.

    Returns:
        A flat list of identifiers (MESH first, then UMLS), or the integer 0
        when the node has neither. Callers test the result with `== 0`.
    """
    if G is None:
        G = seqd.G

    ids = []
    for prefix in ('MESH', 'UMLS'):
        # Look each prefix up on its own. Previously one try/except wrapped
        # both lookups, so a node without MESH ids lost its UMLS ids too.
        try:
            values = G.nodes[node]['equivalent_ids'][prefix]
        except (KeyError, TypeError):
            continue
        if isinstance(values, str):
            # Iterating a bare string would split it into characters.
            values = [values]
        ids.extend(values)

    return ids if ids else 0
    
def make_combo(id1, id2):
    """Build every 'a-b' term for a in id1, b in id2, followed by every 'b-a' term.

    Both orderings are produced so a lookup matches whichever order the
    co-occurrence index stores the pair in.
    """
    pairs = []
    # First pass: id1 element first.
    for first in id1:
        for second in id2:
            pairs.append('-'.join((first, second)))
    # Second pass: same traversal, id2 element first.
    for first in id1:
        for second in id2:
            pairs.append('-'.join((second, first)))

    return pairs

In [233]:
# takes in a networkX graph, outputs a networkX graph with ranks
# this function finds the MeSH/UMLS ID pairs, then sends batch queries to the MRCOC co-occurrence API
# rank is based on the NGD score of each EDGE (not node)

def filter_co_occur_1(G, count=50):
    """Rank node pairs of `G` by co-occurrence score, one API request per pair.

    For each distinct (source, target) pair, the identifiers of both nodes are
    fetched with `get_ids`, all ordered pairings are built with `make_combo`
    and POSTed to the MRCOC query endpoint. The `ngd_overall` of the first
    result not flagged 'notfound' is the pair's score. Lower scores rank first.

    Sentinel scores push unscored pairs to the end of the ranking:
        100 -- `get_ids` returned 0 for at least one of the two nodes
        200 -- no result for any pairing

    Args:
        G: networkx multigraph (edges between a node pair are keyed).
        count: number of best-scoring pairs to keep (default 50).

    Returns:
        `G.subgraph(...)` over the nodes of the kept pairs. Every edge between
        a kept pair gets 'rank', 'filteredBy' = 'CoOccurrence' and
        'ngd_overall'. Other edges between retained nodes are left untouched.

    Raises:
        requests.HTTPError: if the API answers with an error status.

    NOTE(review): networkx subgraph views share attribute dictionaries with
    the parent graph, so these edge attributes presumably appear on `G` too.
    """
    # dict.fromkeys drops parallel edges in O(1) each and keeps first-seen order.
    pairs = list(dict.fromkeys((edge[0], edge[1]) for edge in G.edges))

    scored = []
    for source, target in pairs:
        id1 = get_ids(source)
        id2 = get_ids(target)

        if id1 == 0 or id2 == 0:
            score = 100
        else:
            response = requests.post('https://biothings.ncats.io/mrcoc/query',
                                     json={'scopes': 'combo', 'q': make_combo(id1, id2)})
            response.raise_for_status()
            # Start from the "no match" sentinel and overwrite it on the first
            # hit. The old isinstance(..., float) check mistook an integer
            # score (JSON 0 or 1) for "no match" and corrupted the row layout.
            score = 200
            for hit in response.json():
                if 'notfound' not in hit:
                    score = hit['ngd_overall']
                    break

        # Row layout [score, source, target]: sorted() orders by score first.
        scored.append([score, source, target])

    results = sorted(scored)[:count]
    filtered = list({node for _, source, target in results for node in (source, target)})
    subG = G.subgraph(filtered)

    for rank, (score, source, target) in enumerate(results, start=1):
        for attrs in subG[source][target].values():
            attrs['rank'] = rank
            attrs['filteredBy'] = 'CoOccurrence'
            attrs['ngd_overall'] = score

    return subG


In [234]:
def filter_co_occur(G, count=50):
    """Rank node pairs of `G` by co-occurrence score using batched API requests.

    Same ranking as the per-pair variant, but the lookup terms of all pairs
    are concatenated and POSTed to the MRCOC query endpoint in chunks of 1000.
    The `ngd_overall` of the first result not flagged 'notfound' within a
    pair's slice of the results is the pair's score. Lower scores rank first.

    Sentinel scores push unscored pairs to the end of the ranking:
        100 -- `get_ids` returned 0 for at least one of the two nodes
        200 -- no result for any pairing

    Args:
        G: networkx multigraph (edges between a node pair are keyed).
        count: number of best-scoring pairs to keep (default 50).

    Returns:
        `G.subgraph(...)` over the nodes of the kept pairs. Every edge between
        a kept pair gets 'rank', 'filteredBy' = 'CoOccurrence' and
        'ngd_overall'. Other edges between retained nodes are left untouched.

    Raises:
        requests.HTTPError: if the API answers with an error status.

    NOTE(review): results are matched to pairs by position, which assumes the
    API returns exactly one entry per submitted term, in submission order --
    confirm against the API's batch-query behaviour.
    NOTE(review): networkx subgraph views share attribute dictionaries with
    the parent graph, so these edge attributes presumably appear on `G` too.
    """
    # dict.fromkeys drops parallel edges in O(1) each and keeps first-seen order.
    pairs = list(dict.fromkeys((edge[0], edge[1]) for edge in G.edges))

    scores = {}    # pair -> score
    lookups = []   # (pair, number of terms submitted for it), in submission order
    terms = []     # all terms of all pairs, flattened for batching
    for pair in pairs:
        id1 = get_ids(pair[0])
        id2 = get_ids(pair[1])

        if id1 == 0 or id2 == 0:
            scores[pair] = 100
        else:
            combo = make_combo(id1, id2)
            lookups.append((pair, len(combo)))
            terms.extend(combo)

    hits = []
    for start in range(0, len(terms), 1000):
        response = requests.post('https://biothings.ncats.io/mrcoc/query',
                                 json={'scopes': 'combo', 'q': terms[start:start + 1000]})
        response.raise_for_status()
        hits.extend(response.json())

    offset = 0
    for pair, n_terms in lookups:
        # Start from the "no match" sentinel and overwrite it on the first hit.
        # The old isinstance(..., float) check mistook an integer score
        # (JSON 0 or 1) for "no match" and corrupted the row layout.
        scores[pair] = 200
        for hit in hits[offset:offset + n_terms]:
            if 'notfound' not in hit:
                scores[pair] = hit['ngd_overall']
                break
        offset += n_terms

    # Row layout [score, source, target]: sorted() orders by score first.
    results = sorted([scores[pair], pair[0], pair[1]] for pair in pairs)[:count]
    filtered = list({node for _, source, target in results for node in (source, target)})
    subG = G.subgraph(filtered)

    for rank, (score, source, target) in enumerate(results, start=1):
        for attrs in subG[source][target].values():
            attrs['rank'] = rank
            attrs['filteredBy'] = 'CoOccurrence'
            attrs['ngd_overall'] = score

    return subG

In [235]:
G = filter_co_occur(seqd.G)

In [236]:
G.edges.data()

OutMultiEdgeDataView([('MESH:D000755', 'BONE DISEASE', {'info': {'@type': 'Disease', 'name': 'Bone Diseases', 'UMLS': 'C0005940', 'pubmed': ['7282730'], '$api': 'SEMMED Disease API', '$source': 'SEMMED'}, 'label': 'coexists_with', 'source': 'SEMMED', 'rank': 25, 'filteredBy': 'CoOccurrence', 'ngd_overall': 0.47739189906836266}), ('MESH:D000755', 'MULTIPLE ORGAN FAILURE', {'info': {'@type': 'Disease', 'name': 'Multiple Organ Failure', 'UMLS': 'C0026766', 'pubmed': ['26283706'], '$api': 'SEMMED Disease API', '$source': 'SEMMED'}, 'label': 'coexists_with', 'source': 'SEMMED', 'rank': 44, 'filteredBy': 'CoOccurrence', 'ngd_overall': 0.5655872915508444}), ('MESH:D000755', 'SPLENIC INFARCTION', {'info': {'@type': 'Disease', 'name': 'Splenic Infarction', 'UMLS': 'C0037998', 'pubmed': ['13409094'], '$api': 'SEMMED Disease API', '$source': 'SEMMED'}, 'label': 'causes', 'source': 'SEMMED', 'rank': 13, 'filteredBy': 'CoOccurrence', 'ngd_overall': 0.41255997965932023}), ('MESH:D000755', 'SPLENIC I

In [218]:
start_1 = time.time()
G1 = filter_co_occur(seqd.G)
end_1 = time.time()
start_2 = time.time()
G2 = filter2(seqd.G)
end_2 = time.time()

print('G1 time: {}\t G2 time: {}'.format(end_1-start_1, end_2-start_2))

G1 time: 41.207399129867554	 G2 time: 1.273540735244751


In [232]:
import networkx as nx
nx.is_isomorphic(G1,G2)

True

In [228]:
G1.edges.data()

OutMultiEdgeDataView([('MESH:D000755', 'BONE DISEASE', {'info': {'@type': 'Disease', 'name': 'Bone Diseases', 'UMLS': 'C0005940', 'pubmed': ['7282730'], '$api': 'SEMMED Disease API', '$source': 'SEMMED'}, 'label': 'coexists_with', 'source': 'SEMMED', 'rank': 25, 'filteredBy': 'CoOccurrence', 'ngd_overall': 0.47739189906836266}), ('MESH:D000755', 'MULTIPLE ORGAN FAILURE', {'info': {'@type': 'Disease', 'name': 'Multiple Organ Failure', 'UMLS': 'C0026766', 'pubmed': ['26283706'], '$api': 'SEMMED Disease API', '$source': 'SEMMED'}, 'label': 'coexists_with', 'source': 'SEMMED', 'rank': 44, 'filteredBy': 'CoOccurrence', 'ngd_overall': 0.5655872915508444}), ('MESH:D000755', 'SPLENIC INFARCTION', {'info': {'@type': 'Disease', 'name': 'Splenic Infarction', 'UMLS': 'C0037998', 'pubmed': ['13409094'], '$api': 'SEMMED Disease API', '$source': 'SEMMED'}, 'label': 'causes', 'source': 'SEMMED', 'rank': 13, 'filteredBy': 'CoOccurrence', 'ngd_overall': 0.41255997965932023}), ('MESH:D000755', 'SPLENIC I

In [229]:
G2.edges.data()

OutMultiEdgeDataView([('MESH:D000755', 'BONE DISEASE', {'info': {'@type': 'Disease', 'name': 'Bone Diseases', 'UMLS': 'C0005940', 'pubmed': ['7282730'], '$api': 'SEMMED Disease API', '$source': 'SEMMED'}, 'label': 'coexists_with', 'source': 'SEMMED', 'rank': 25, 'filteredBy': 'CoOccurrence', 'ngd_overall': 0.47739189906836266}), ('MESH:D000755', 'MULTIPLE ORGAN FAILURE', {'info': {'@type': 'Disease', 'name': 'Multiple Organ Failure', 'UMLS': 'C0026766', 'pubmed': ['26283706'], '$api': 'SEMMED Disease API', '$source': 'SEMMED'}, 'label': 'coexists_with', 'source': 'SEMMED', 'rank': 44, 'filteredBy': 'CoOccurrence', 'ngd_overall': 0.5655872915508444}), ('MESH:D000755', 'SPLENIC INFARCTION', {'info': {'@type': 'Disease', 'name': 'Splenic Infarction', 'UMLS': 'C0037998', 'pubmed': ['13409094'], '$api': 'SEMMED Disease API', '$source': 'SEMMED'}, 'label': 'causes', 'source': 'SEMMED', 'rank': 13, 'filteredBy': 'CoOccurrence', 'ngd_overall': 0.41255997965932023}), ('MESH:D000755', 'SPLENIC I

In [208]:
pre_res = filter_co_occur(seqd.G)

In [5]:
seqd = SingleEdgeQueryDispatcher(input_cls='Disease', output_cls='Disease', input_id='MESH', values='D000755')
seqd.query()

In [201]:
unique_edges = []
for edge in seqd.G.edges:
    if [edge[0], edge[1]] in unique_edges:
        continue
    else:
        unique_edges.append([edge[0], edge[1]])

In [202]:
num_combs, combos = [], []
for edge in unique_edges:
    id1 = get_ids(edge[0])
    id2 = get_ids(edge[1])

    if (id1 == 0) | (id2 == 0):
        edge.insert(0, 100)
    else:
        combo = make_combo(id1, id2)
        combos.append(combo)
        num_combs.append(len(combo))

In [203]:
combos = [i for j in combos for i in j]
chunks = [combos[x:x+1000] for x in range(0,len(combos),1000)]

In [204]:
x = []
for chunk in chunks:
    x+= requests.post('https://biothings.ncats.io/mrcoc/query', json={'scopes':'combo', 'q': chunk}).json()

In [205]:
end, i = 0, 0
for edge in unique_edges:   
    if isinstance(edge[0],int):
        continue
        
    start = end
    end += num_combs[i]
    for query in x[start:end]:
        if not 'notfound' in query:
            edge.insert(0, query['ngd_overall'])
            print(edge)
            break
    if not isinstance(edge[0], float):
        edge.insert(0, 200)   
    i+=1

[0.3786116229133875, 'MESH:D000755', 'ALPHA THALASSEMIA']
[0.3520097217194215, 'MESH:D000755', 'ANEMIA']
[0.4193873106695017, 'MESH:D000755', 'ANEMIA, HEMOLYTIC']
[0.7189390880973388, 'MESH:D000755', 'ANKYLOSIS']
[0.8365177066376366, 'MESH:D000755', 'CARDIAC ARRHYTHMIA']
[0.6780323851612118, 'MESH:D000755', 'ARTHRITIS']
[0.6750172809472789, 'MESH:D000755', 'ASTHMA']
[0.7177989989115081, 'MESH:D000755', 'BACK PAIN']
[0.47847706925878947, 'MESH:D000755', 'CEREBRAL INFARCTION']
[0.6397025858996038, 'MESH:D000755', 'BENIGN RECURRENT INTRAHEPATIC CHOLESTASIS']
[0.7634033302810889, 'MESH:D000755', 'DISEASE OR DISORDER']
[0.6320661320820271, 'MESH:D000755', 'BRIGHT DISEASE']
[0.6155801802017089, 'MESH:D000755', 'HEADACHE']
[0.6029340682160718, 'MESH:D000755', 'NONSYNDROMIC GENETIC DEAFNESS']
[0.7488520180595014, 'MESH:D000755', 'HEREDITARY HYPERBILIRUBINEMIA']
[0.4658974351897532, 'MESH:D000755', 'PULMONARY HYPERTENSION']
[0.7182471219773823, 'MESH:D000755', 'HYPERTROPHY']
[0.3925935858047931

In [212]:
for i in range(len(pre_res)):
    assert pre_res[i] == unique_edges[i]

In [84]:
x = requests.post('https://biothings.ncats.io/mrcoc/query', json={'scopes':'combo', 'q': comb}).json()

In [101]:
isinstance('n', int)

False

In [164]:
x = [1,2,7,4,3,9,2354,98,2,111,39]

for i in x[5:]:
    print(i)

9
2354
98
2
111
39


In [58]:
x

[1, 2, 2, 3, 4, 7, 9, 39, 98]