In [115]:
from biothings_explorer.user_query_dispatcher import SingleEdgeQueryDispatcher
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
# Set up a single-edge query from the gene NCBIGene:1017 to ChemicalSubstance outputs.
seqd = SingleEdgeQueryDispatcher(input_cls='Gene',
                                 output_cls='ChemicalSubstance',
                                 input_id='NCBIGene',
                                 values='1017')

In [5]:
# Execute the query; verbose=True prints the planning/execution log shown below.
seqd.query(verbose=True)

==== Step #1: Query path planning ====

Because NCBIGene:1017 is of type 'Gene', BTE will query our meta-KG for APIs that can take 'Gene' as input and 'ChemicalSubstance' as output

BTE found 10 apis:

API 1. dgidb(1 API call)
API 2. scigraph(1 API call)
API 3. pharos(1 API call)
API 4. opentarget(1 API call)
API 5. scibite(1 API call)
API 6. chembio(1 API call)
API 7. mychem(3 API calls)
API 8. hmdb(1 API call)
API 9. cord_gene(1 API call)
API 10. semmed_gene(10 API calls)


==== Step #2: Query path execution ====
NOTE: API requests are dispatched in parallel, so the list of APIs below is ordered by query time.

API 5.1: https://platform-api.opentargets.io/v3/platform/public/evidence/filter?target=ENSG00000123374&datasource=chembl&size=100&fields=drug
API 8.2: https://mychem.info/v1/query?fields=drugbank.id&size=250 (POST -d q=CDK2&scopes=drugbank.enzymes.gene_name)
API 8.1: https://mychem.info/v1/query?fields=drugbank.id&size=250 (POST -d q=CDK2&scopes=drugbank.targets.gene_name)
API

In [11]:
# Inspect the input class the dispatcher was configured with.
seqd.input_cls

'Gene'

In [12]:
# Total number of edges in the result graph seqd.G.
print(seqd.G.number_of_edges())

1215


In [46]:
# For every node: its degree, and how many edges run to it from the query gene.
for node_id in seqd.G.nodes:
    node_degree = seqd.G.degree(node_id)
    edges_from_gene = seqd.G.number_of_edges('NCBIGene:1017', node_id)
    print(f"{node_id} {node_degree} {edges_from_gene}")

NCBIGene:1017 1215 0
CHEMBL1197513 1 1
CHEBI:52214 1 1
CHEBI:36080 1 1
CHEBI:33252 1 1
CHEBI:35222 2 2
CHEMBL2163995 1 1
CHEMBL1964246 1 1
CHEMBL213713 1 1
CHEMBL272833 1 1
CHEMBL259838 1 1
CHEMBL257831 1 1
CHEMBL3354187 1 1
CHEMBL1964244 1 1
CHEMBL455195 1 1
CHEMBL405145 1 1
CHEMBL1964259 1 1
CHEMBL1761791 1 1
CHEMBL3642647 1 1
CHEMBL411426 1 1
CHEMBL3655765 1 1
CHEMBL3298984 1 1
CHEMBL79498 1 1
CHEMBL255463 1 1
CHEMBL485618 1 1
CHEMBL181114 1 1
CHEMBL424696 1 1
CHEMBL186288 1 1
CHEMBL488085 1 1
CHEMBL265903 1 1
CHEMBL415471 1 1
CHEMBL215086 1 1
CHEMBL496785 1 1
CHEMBL1171949 1 1
CHEMBL2158843 1 1
CHEMBL1964242 1 1
CHEMBL103285 1 1
CHEMBL295484 1 1
CHEMBL3357950 1 1
CHEMBL291324 1 1
CHEMBL79356 1 1
CHEMBL515001 1 1
CHEMBL260163 1 1
CHEMBL268507 1 1
CHEMBL255263 1 1
CHEMBL485619 1 1
CHEMBL3648117 1 1
CHEMBL520187 1 1
CHEMBL361833 1 1
CHEMBL476993 1 1
CHEMBL3644022 1 1
CHEMBL1964260 1 1
CHEMBL3648114 1 1
CHEMBL603097 1 1
CHEMBL509012 1 1
CHEMBL3648123 1 1
CHEMBL187750 1 1
CHEMBL260103 1

4-{5-[(1Z)-1-(2-IMINO-4-OXO-1,3-THIAZOLIDIN-5-YLIDENE)ETHYL]-2-FURYL}BENZENESULFONAMIDE 2 2
N-[4-(2,4-DIMETHYL-THIAZOL-5-YL)-PYRIMIDIN-2-YL]-N',N'-DIMETHYL-BENZENE-1,4-DIAMINE 2 2
6-(3,4-DIHYDROXYBENZYL)-3-ETHYL-1-(2,4,6-TRICHLOROPHENYL)-1H-PYRAZOLO[3,4-D]PYRIMIDIN-4(5H)-ONE 2 2
6-(3-AMINOPHENYL)-N-(TERT-BUTYL)-2-(TRIFLUOROMETHYL)QUINAZOLIN-4-AMINE 2 2
2-(4-(AMINOMETHYL)PIPERIDIN-1-YL)-N-(3_CYCLOHEXYL-4-OXO-2,4-DIHYDROINDENO[1,2-C]PYRAZOL-5-YL)ACETAMIDE 2 2
1-(3-(2,4-DIMETHYLTHIAZOL-5-YL)-4-OXO-2,4-DIHYDROINDENO[1,2-C]PYRAZOL-5-YL)-3-(4-METHYLPIPERAZIN-1-YL)UREA 2 2
4-{[5-(CYCLOHEXYLMETHOXY)[1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL]AMINO}BENZENESULFONAMIDE 2 2
4-{[5-(CYCLOHEXYLAMINO)[1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL]AMINO}BENZENESULFONAMIDE 2 2
4-({5-[(4-AMINOCYCLOHEXYL)AMINO][1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL}AMINO)BENZENESULFONAMIDE 2 2
4-{[5-(CYCLOHEXYLOXY)[1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL]AMINO}BENZENESULFONAMIDE 2 2
CAN-508 2 2
N-[3-(1H-BENZIMIDAZOL-2-YL)-1H-PYRAZOL-4-YL]BENZAM

# Filter 1
### Filter by node degree: keep the `count` nodes with the most edges

In [175]:
def filter_node_degree(G, count=50):
    """Return the subgraph induced by the ``count`` highest-degree nodes of ``G``.

    Each node of the returned graph is annotated with two attributes:
    ``'filteredBy'`` (always ``'NodeDegree'``) and ``'rank'`` (1 for the
    highest-degree node, 2 for the next, and so on).

    Parameters
    ----------
    G : networkx graph
        Graph to filter. It is not modified.
    count : int, default 50
        Maximum number of nodes to keep. If ``G`` has fewer nodes, all of
        them are kept.

    Returns
    -------
    networkx graph
        An independent copy of the induced subgraph (same graph class as
        ``G``), carrying the annotations described above.
    """
    # Rank nodes by degree, highest first. sorted() is stable, so nodes with
    # equal degree keep the order in which G reports them.
    ranked = sorted(G.degree, key=lambda pair: pair[1], reverse=True)
    filtered = [node for node, _ in ranked[:count]]

    # G.subgraph() returns a view that shares node attribute dicts with G;
    # copying first keeps the annotations below from leaking into G.
    subG = G.subgraph(filtered).copy()

    for rank, node in enumerate(filtered, start=1):
        subG.nodes[node]['filteredBy'] = 'NodeDegree'
        subG.nodes[node]['rank'] = rank

    return subG

In [185]:
# Keep only the 10 highest-degree nodes of the query graph and confirm the size.
x = filter_node_degree(seqd.G,10)
len(x.nodes)

10

In [210]:
# Position in the node view, node id, degree within the filtered graph, assigned rank.
for position, node_id in enumerate(x.nodes):
    node_rank = x.nodes[node_id]['rank']
    print(f"{position} {node_id} {x.degree(node_id)} {node_rank} \n")

0 DINACICLIB 11 5 

1 AT-7519 7 9 

2 STAUROSPORINE 7 10 

3 ALVOCIDIB 38 2 

4 OLOMOUCINE 12 4 

5 SELICICLIB 9 6 

6 NCBIGene:1017 129 1 

7 ROSCOVITINE 8 7 

8 UCN-01 30 3 

9 RONICICLIB 7 8 



In [233]:
# List the filtered nodes in rank order (rank 1 first). A single sort on the
# 'rank' attribute replaces the nested scan over every (rank, node) pair.
l = sorted(x.nodes, key=lambda node: x.nodes[node]['rank'])
l

['NCBIGene:1017',
 'ALVOCIDIB',
 'UCN-01',
 'OLOMOUCINE',
 'DINACICLIB',
 'SELICICLIB',
 'ROSCOVITINE',
 'RONICICLIB',
 'AT-7519',
 'STAUROSPORINE']

In [206]:
# NOTE: calling the node view with a positional argument sets data=3, i.e. it
# looks up a node attribute named 3 (hence every value in the output is None);
# it does not return the first three nodes.
x.nodes(3)

NodeDataView({'DINACICLIB': None, 'AT-7519': None, 'STAUROSPORINE': None, 'ALVOCIDIB': None, 'OLOMOUCINE': None, 'SELICICLIB': None, 'NCBIGene:1017': None, 'ROSCOVITINE': None, 'UCN-01': None, 'RONICICLIB': None}, data=3)

In [143]:
# The edge data between the gene and AT-7519 must be the same in the full graph
# and in the degree-filtered graph x.
assert seqd.G['NCBIGene:1017']['AT-7519']== x['NCBIGene:1017']['AT-7519']

In [180]:
import unittest

In [None]:
class TestFilterEdges(unittest.TestCase):
    """Checks filter_node_degree against the result of a live BTE query."""

    def test_count_values(self):
        """Each requested count must return a graph with exactly two nodes.

        NOTE(review): the expected size of 2 presumably reflects a query
        result with only two nodes, so every count in the list is at least
        the graph size -- confirm against the live result.
        """
        counts = [10, 20, 40, 50, 100, 1050]

        # The dispatcher class name was previously misspelled
        # (SingleEdgeQeryDispatcher), which raised NameError before any
        # assertion could run.
        seqd = SingleEdgeQueryDispatcher(output_cls='GenomicEntity',
                                         input_cls='Gene',
                                         input_id='HGNC',
                                         values='7890')
        seqd.query()
        for count in counts:
            # subTest reports every failing count instead of stopping at the first.
            with self.subTest(count=count):
                newG = filter_node_degree(seqd.G, count)
                self.assertEqual(len(newG.nodes), 2)

# Filter 2
### Filter by type of relation

Takes a graph and the relation (edge label) we are looking for, and returns a
subgraph containing only the edges with that relation, restricted to the `count` highest-degree nodes.

In [339]:
# Same Gene -> ChemicalSubstance query for NCBIGene:1017 as at the top of the
# notebook, rebuilt and executed here before the label filter is applied.
seqd = SingleEdgeQueryDispatcher(input_cls='Gene',
                                 output_cls='ChemicalSubstance',
                                 input_id='NCBIGene',
                                 values='1017')
seqd.query(verbose=True)

==== Step #1: Query path planning ====

Because NCBIGene:1017 is of type 'Gene', BTE will query our meta-KG for APIs that can take 'Gene' as input and 'ChemicalSubstance' as output

BTE found 10 apis:

API 1. dgidb(1 API call)
API 2. scigraph(1 API call)
API 3. pharos(1 API call)
API 4. opentarget(1 API call)
API 5. scibite(1 API call)
API 6. chembio(1 API call)
API 7. mychem(3 API calls)
API 8. hmdb(1 API call)
API 9. cord_gene(1 API call)
API 10. semmed_gene(10 API calls)


==== Step #2: Query path execution ====
NOTE: API requests are dispatched in parallel, so the list of APIs below is ordered by query time.

API 5.1: https://platform-api.opentargets.io/v3/platform/public/evidence/filter?target=ENSG00000123374&datasource=chembl&size=100&fields=drug
API 1.8: https://biothings.ncats.io/semmedgene/query?fields=disrupts (POST -d q=C0108855,C1332733&scopes=umls)
API 8.1: https://mychem.info/v1/query?fields=drugbank.id&size=250 (POST -d q=CDK2&scopes=drugbank.targets.gene_name)
API 1.3: 

In [284]:
# Collect the distinct 'label' values over every edge of the multigraph.
# edges(data='label') yields one (source, target, label) triple per edge,
# including parallel edges; indexing key 0 only (as before) skipped the labels
# of any parallel edge with key >= 1.
x = {edge_label for _, _, edge_label in seqd.G.edges(data='label')}
x

{'negatively_regulated_by',
 'negatively_regulates',
 'physically_interacts_with',
 'positively_regulated_by',
 'positively_regulates',
 'related_to'}

In [387]:
def filter_label(G, label, count=50):
    """Keep only edges whose 'label' equals ``label``, then the top nodes by degree.

    The edges of ``G`` are first restricted to those whose ``'label'``
    attribute equals ``label``. The result is passed to
    ``filter_node_degree`` to keep at most ``count`` nodes, and every
    remaining node gets ``'filteredBy'`` set to ``'EdgeLabel'``.

    Parameters
    ----------
    G : networkx multigraph
        Graph to filter; edges are addressed as (source, target, key).
        It is not modified.
    label : str
        Edge label to keep, e.g. ``'related_to'``.
    count : int, default 50
        Maximum number of nodes to keep, forwarded to ``filter_node_degree``.

    Returns
    -------
    networkx graph
        The filtered graph with the node annotations described above.
    """
    # edges(keys=True, data='label') yields (source, target, key, label) for
    # every edge, parallel edges included; an edge without a 'label' attribute
    # reports None here instead of raising KeyError.
    val_edges = [(u, v, key)
                 for u, v, key, edge_label in G.edges(keys=True, data='label')
                 if edge_label == label]

    # edge_subgraph() returns a view sharing attribute dicts with G; copy it so
    # that none of the annotations written below end up on the caller's graph.
    labelled = G.edge_subgraph(val_edges).copy()
    subG = filter_node_degree(labelled, count)

    for node in subG.nodes:
        subG.nodes[node]['filteredBy'] = 'EdgeLabel'

    return subG

In [393]:
sub = filter_label(seqd.G, 'related_to')

# Every edge that survived the filter must carry the requested label.
for _src, _dst, attrs in sub.edges.data():
    assert attrs['label'] == 'related_to'


In [345]:
# Edge count of the full query graph, for comparison with the filtered results.
len(seqd.G.edges)

1215

In [359]:
label = 'related_to'
# Check each edge's own label: edges(keys=True, data='label') yields
# (source, target, key, label) per edge. Always indexing key 0 (as before)
# tested the first parallel edge's label for every edge between the same nodes.
val = [(u, v, key)
       for u, v, key, edge_label in seqd.G.edges(keys=True, data='label')
       if edge_label == label]

sG = seqd.G.edge_subgraph(val)

len(val)


582

KeyError: ('NCBIGene:1017', 'CHEMBL1197513', 0)

In [394]:
# Show the edge view; each entry is a (source, target, key) triple.
seqd.G.edges

OutMultiEdgeView([('NCBIGene:1017', 'CHEMBL1197513', 0), ('NCBIGene:1017', 'CHEBI:52214', 0), ('NCBIGene:1017', 'CHEBI:33252', 0), ('NCBIGene:1017', 'CHEBI:36080', 0), ('NCBIGene:1017', 'CHEBI:35222', 0), ('NCBIGene:1017', 'CHEBI:35222', 1), ('NCBIGene:1017', 'CHEMBL186708', 0), ('NCBIGene:1017', 'CHEMBL360206', 0), ('NCBIGene:1017', 'CHEMBL412097', 0), ('NCBIGene:1017', 'CHEMBL1964249', 0), ('NCBIGene:1017', 'CHEMBL3648108', 0), ('NCBIGene:1017', 'CHEMBL257831', 0), ('NCBIGene:1017', 'CHEMBL3644028', 0), ('NCBIGene:1017', 'CHEMBL312726', 0), ('NCBIGene:1017', 'CHEMBL2163995', 0), ('NCBIGene:1017', 'CHEMBL508129', 0), ('NCBIGene:1017', 'CHEMBL3823659', 0), ('NCBIGene:1017', 'CHEMBL181114', 0), ('NCBIGene:1017', 'CHEMBL2035044', 0), ('NCBIGene:1017', 'CHEMBL2158857', 0), ('NCBIGene:1017', 'CHEMBL319685', 0), ('NCBIGene:1017', 'CHEMBL268507', 0), ('NCBIGene:1017', 'CHEMBL457979', 0), ('NCBIGene:1017', 'CHEMBL1081295', 0), ('NCBIGene:1017', 'CHEMBL1964259', 0), ('NCBIGene:1017', 'CHEMBL36

In [370]:
# NOTE(review): test1 and test2 are not defined in any cell visible here; this
# cell relies on leftover kernel state -- confirm where they come from.
print(len(test1), len(test2))

433 582
