In [1]:
from biothings_explorer.user_query_dispatcher import SingleEdgeQueryDispatcher
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
seqd = SingleEdgeQueryDispatcher(input_cls='Gene',
                                 output_cls='ChemicalSubstance',
                                 input_id='NCBIGene',
                                 values='1017')

In [3]:
seqd.query(verbose=True)

==== Step #1: Query path planning ====

Because NCBIGene:1017 is of type 'Gene', BTE will query our meta-KG for APIs that can take 'Gene' as input and 'ChemicalSubstance' as output

BTE found 10 apis:

API 1. chembio(1 API call)
API 2. mychem(3 API calls)
API 3. hmdb(1 API call)
API 4. semmed_gene(10 API calls)
API 5. opentarget(1 API call)
API 6. pharos(1 API call)
API 7. dgidb(1 API call)
API 8. cord_gene(1 API call)
API 9. scibite(1 API call)
API 10. scigraph(1 API call)


==== Step #2: Query path execution ====
NOTE: API requests are dispatched in parallel, so the list of APIs below is ordered by query time.

API 6.1: https://platform-api.opentargets.io/v3/platform/public/evidence/filter?target=ENSG00000123374&datasource=chembl&size=100&fields=drug
API 3.2: https://mychem.info/v1/query?fields=drugbank.id&size=250 (POST -d q=CDK2&scopes=drugbank.enzymes.gene_name)
API 5.4: https://biothings.ncats.io/semmedgene/query?fields=affected_by (POST -d q=C0108855,C1332733&scopes=umls)
API 3.

In [11]:
seqd.input_cls

'Gene'

In [12]:
print(seqd.G.number_of_edges())

1215


In [46]:
for i in seqd.G.nodes:
    print(i, seqd.G.degree(i), seqd.G.number_of_edges('NCBIGene:1017',i))

NCBIGene:1017 1215 0
CHEMBL1197513 1 1
CHEBI:52214 1 1
CHEBI:36080 1 1
CHEBI:33252 1 1
CHEBI:35222 2 2
CHEMBL2163995 1 1
CHEMBL1964246 1 1
CHEMBL213713 1 1
CHEMBL272833 1 1
CHEMBL259838 1 1
CHEMBL257831 1 1
CHEMBL3354187 1 1
CHEMBL1964244 1 1
CHEMBL455195 1 1
CHEMBL405145 1 1
CHEMBL1964259 1 1
CHEMBL1761791 1 1
CHEMBL3642647 1 1
CHEMBL411426 1 1
CHEMBL3655765 1 1
CHEMBL3298984 1 1
CHEMBL79498 1 1
CHEMBL255463 1 1
CHEMBL485618 1 1
CHEMBL181114 1 1
CHEMBL424696 1 1
CHEMBL186288 1 1
CHEMBL488085 1 1
CHEMBL265903 1 1
CHEMBL415471 1 1
CHEMBL215086 1 1
CHEMBL496785 1 1
CHEMBL1171949 1 1
CHEMBL2158843 1 1
CHEMBL1964242 1 1
CHEMBL103285 1 1
CHEMBL295484 1 1
CHEMBL3357950 1 1
CHEMBL291324 1 1
CHEMBL79356 1 1
CHEMBL515001 1 1
CHEMBL260163 1 1
CHEMBL268507 1 1
CHEMBL255263 1 1
CHEMBL485619 1 1
CHEMBL3648117 1 1
CHEMBL520187 1 1
CHEMBL361833 1 1
CHEMBL476993 1 1
CHEMBL3644022 1 1
CHEMBL1964260 1 1
CHEMBL3648114 1 1
CHEMBL603097 1 1
CHEMBL509012 1 1
CHEMBL3648123 1 1
CHEMBL187750 1 1
CHEMBL260103 1

4-{5-[(1Z)-1-(2-IMINO-4-OXO-1,3-THIAZOLIDIN-5-YLIDENE)ETHYL]-2-FURYL}BENZENESULFONAMIDE 2 2
N-[4-(2,4-DIMETHYL-THIAZOL-5-YL)-PYRIMIDIN-2-YL]-N',N'-DIMETHYL-BENZENE-1,4-DIAMINE 2 2
6-(3,4-DIHYDROXYBENZYL)-3-ETHYL-1-(2,4,6-TRICHLOROPHENYL)-1H-PYRAZOLO[3,4-D]PYRIMIDIN-4(5H)-ONE 2 2
6-(3-AMINOPHENYL)-N-(TERT-BUTYL)-2-(TRIFLUOROMETHYL)QUINAZOLIN-4-AMINE 2 2
2-(4-(AMINOMETHYL)PIPERIDIN-1-YL)-N-(3_CYCLOHEXYL-4-OXO-2,4-DIHYDROINDENO[1,2-C]PYRAZOL-5-YL)ACETAMIDE 2 2
1-(3-(2,4-DIMETHYLTHIAZOL-5-YL)-4-OXO-2,4-DIHYDROINDENO[1,2-C]PYRAZOL-5-YL)-3-(4-METHYLPIPERAZIN-1-YL)UREA 2 2
4-{[5-(CYCLOHEXYLMETHOXY)[1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL]AMINO}BENZENESULFONAMIDE 2 2
4-{[5-(CYCLOHEXYLAMINO)[1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL]AMINO}BENZENESULFONAMIDE 2 2
4-({5-[(4-AMINOCYCLOHEXYL)AMINO][1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL}AMINO)BENZENESULFONAMIDE 2 2
4-{[5-(CYCLOHEXYLOXY)[1,2,4]TRIAZOLO[1,5-A]PYRIMIDIN-7-YL]AMINO}BENZENESULFONAMIDE 2 2
CAN-508 2 2
N-[3-(1H-BENZIMIDAZOL-2-YL)-1H-PYRAZOL-4-YL]BENZAM

# Filter 1
### Filter by number of edges overlapping between nodes

In [175]:
def filter_node_degree(G, count=50): #takes input G as networkX graph

    degrees = []
    for node in G.nodes:
        degrees.append(G.degree(node))

    data = {'node':G.nodes, 'degree':degrees}
    deg_count = pd.DataFrame(data=data)
    deg_count.sort_values(by='degree', inplace=True, ascending=False)

    filtered = list(deg_count.head(count)['node'])
    subG = G.subgraph(filtered)

    for i,node in enumerate(filtered):
        subG.nodes.data()[node]['filteredBy'] = 'NodeDegree'
        subG.nodes.data()[node]['rank'] = i+1

    return subG

In [185]:
x = filter_node_degree(seqd.G,10)
len(x.nodes)

10

In [210]:
for i,node in enumerate(x.nodes):
    print(i,node,x.degree(node),x.nodes.data()[node]['rank'], '\n')

0 DINACICLIB 11 5 

1 AT-7519 7 9 

2 STAUROSPORINE 7 10 

3 ALVOCIDIB 38 2 

4 OLOMOUCINE 12 4 

5 SELICICLIB 9 6 

6 NCBIGene:1017 129 1 

7 ROSCOVITINE 8 7 

8 UCN-01 30 3 

9 RONICICLIB 7 8 



In [233]:
l = []
for i in range(len(x.nodes)):
    for node in x.nodes:
        if x.nodes.data()[node]['rank'] == i+1:
            l.append(node)
l

['NCBIGene:1017',
 'ALVOCIDIB',
 'UCN-01',
 'OLOMOUCINE',
 'DINACICLIB',
 'SELICICLIB',
 'ROSCOVITINE',
 'RONICICLIB',
 'AT-7519',
 'STAUROSPORINE']

In [206]:
x.nodes(3)

NodeDataView({'DINACICLIB': None, 'AT-7519': None, 'STAUROSPORINE': None, 'ALVOCIDIB': None, 'OLOMOUCINE': None, 'SELICICLIB': None, 'NCBIGene:1017': None, 'ROSCOVITINE': None, 'UCN-01': None, 'RONICICLIB': None}, data=3)

In [143]:
assert seqd.G['NCBIGene:1017']['AT-7519']== x['NCBIGene:1017']['AT-7519']

In [180]:
import unittest

In [None]:
class TestFilterEdges(unittest.TestCase):

    # test for count values
    def test_count_values(self):
        counts = [10, 20, 40, 50, 100, 1050]

        seqd = SingleEdgeQeryDispatcher(output_cls='GenomicEntity',
                                        input_cls='Gene',
                                        input_id='HGNC',
                                        values='7890')
        seqd.query()
        for count in counts:
            newG = filter_node_degree(seqd.G, count)
            self.assertEqual(len(newG.nodes), 2)

# Filter 2
### Filter by type of relation

Essentially takes in graph input, the relation we are looking for
and returns a graph with only the edges with said relation, ordered then by degree?

In [413]:
seqd = SingleEdgeQueryDispatcher(input_cls='Gene',
                                 output_cls='ChemicalSubstance',
                                 input_id='NCBIGene',
                                 values='1017')
seqd.query(verbose=True)

==== Step #1: Query path planning ====

Because NCBIGene:1017 is of type 'Gene', BTE will query our meta-KG for APIs that can take 'Gene' as input and 'ChemicalSubstance' as output

BTE found 10 apis:

API 1. dgidb(1 API call)
API 2. scigraph(1 API call)
API 3. pharos(1 API call)
API 4. opentarget(1 API call)
API 5. scibite(1 API call)
API 6. chembio(1 API call)
API 7. mychem(3 API calls)
API 8. hmdb(1 API call)
API 9. cord_gene(1 API call)
API 10. semmed_gene(10 API calls)


==== Step #2: Query path execution ====
NOTE: API requests are dispatched in parallel, so the list of APIs below is ordered by query time.

API 1.1: https://biothings.ncats.io/semmedgene/query?fields=negatively_regulates (POST -d q=C0108855,C1332733&scopes=umls)
API 1.4: https://biothings.ncats.io/semmedgene/query?fields=affected_by (POST -d q=C0108855,C1332733&scopes=umls)
API 1.3: https://biothings.ncats.io/semmedgene/query?fields=affects (POST -d q=C0108855,C1332733&scopes=umls)
API 1.5: https://biothings.ncats

In [412]:
x = []
for i in seqd.G.edges:
    x.append(seqd.G[i[0]][i[1]][0]['label'])
    
x = set(x)
len(seqd.G.nodes)

3

In [397]:
def filter_label(G, label, count=50):
    
    val_edges = []
    for edge in G.edges:
        if G[edge[0]][edge[1]][edge[2]]['label'] in label:
            val_edges.append(edge)
    
    subG = G.edge_subgraph(val_edges)
    subG = filter_node_degree(subG, count)
    
    for node in subG.nodes:
        subG.nodes.data()[node]['filteredBy'] = 'EdgeLabel'
        
    return subG

In [411]:
labels = ['related_to', 'negatively_regulated_by']
subG = filter_label(seqd.G, labels)

subG = filter_label(seqd.G, 'related_to')
for edge in subG.edges.data():
    assert 'related_to' == edge[2]['label']


In [404]:
len(seqd.G.edges)

1215

In [359]:
val = []
label = 'related_to'
for edge in seqd.G.edges:
    if seqd.G[edge[0]][edge[1]][0]['label'] == label:
        val.append(edge)
        
sG = seqd.G.edge_subgraph(val)

len(val)


582

In [405]:
seqd = SingleEdgeQueryDispatcher(output_cls='GenomicEntity',
                                         input_cls='ChemicalSubstance',
                                         pred="related_to",
                                         input_id='CHEBI',
                                         values='CHEBI:28640')
seqd.query()

In [408]:
seqd.G.edges.data()

OutMultiEdgeDataView([('CHEBI:CHEBI:28640', 'SO:0000165', {'info': {'@type': 'GenomicEntity', 'pmc': ['PMC6759490'], 'SO': 'SO:0000165', '$api': 'CORD Chemical API', '$source': 'Translator Text Mining Provider'}, 'label': 'related_to', 'source': 'Translator Text Mining Provider'}), ('CHEBI:CHEBI:28640', 'SO:0000331', {'info': {'@type': 'GenomicEntity', 'pmc': ['PMC6566617'], 'SO': 'SO:0000331', '$api': 'CORD Chemical API', '$source': 'Translator Text Mining Provider'}, 'label': 'related_to', 'source': 'Translator Text Mining Provider'})])

In [417]:
subG = filter_label(seqd.G, 'related_to')

for node in subG.nodes.data():
    print(node[1]['filteredBy'])

EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel
EdgeLabel


# Filter 3: Co-occurrence
### Using NIH MRCOC co-occurrence files

In [3]:
import time

In [13]:
df = pd.read_csv(path+'co_occurs_1', sep='|')

In [36]:
df[(df['DUI1'] == 'D000001') & (df['DUI2'] == 'D000252')]

Unnamed: 0,DUI1,CUI1,DUI2,CUI2,Freq,StarFreq(COF),Year,TimeFrame(SAB),numNoSH,StarnumNoSH,numSH,BothMain?,Star1OnlyFreq,Star2OnlyFreq,SH1OnlyFreq,SH2OnlyFreq
245,D000001,C0000699,D000252,C0001476,3,0,1979,RST,0,0,0,ZN,0,2,0,0
246,D000001,C0000699,D000252,C0001476,4,0,1980,RST,0,0,0,ZN,0,2,0,0
247,D000001,C0000699,D000252,C0001476,11,0,1981,RST,0,0,0,ZN,0,11,0,0
248,D000001,C0000699,D000252,C0001476,10,0,1982,RST,0,0,0,ZN,1,4,0,0
249,D000001,C0000699,D000252,C0001476,14,0,1983,RST,0,0,0,ZN,0,9,0,0
250,D000001,C0000699,D000252,C0001476,9,0,1984,RST,0,0,0,ZN,0,5,0,0
251,D000001,C0000699,D000252,C0001476,7,1,1985,RST,0,0,1,ZY,1,4,0,0
252,D000001,C0000699,D000252,C0001476,12,0,1986,RST,0,0,0,ZN,0,6,1,0
253,D000001,C0000699,D000252,C0001476,11,1,1987,RST,0,0,1,ZY,0,7,0,0
254,D000001,C0000699,D000252,C0001476,11,0,1988,RST,0,0,0,ZN,1,7,0,0


In [39]:
# read through file line by line until found, then go until it isn't the same anymore

# algorithm for co-occurrence coefficient?
    # keep running tally for. # of years
    
# parameters: takes in 'mesh' as a LIST of 2 meshIDs    
    


In [40]:
filter_co_occurs(['D000001', 'D000252'])

9.137931034482758

In [36]:
# simple filter that ranks items (edges of graph) by co-occurrence in literature using only average frequency per year

def filter_co_occurs_avg(G, count=50):
    
    def get_ids(node):
        ids = []
        try:
            ids.append(seqd.G.nodes[node]['equivalent_ids']['MESH'])
            ids.append(seqd.G.nodes[node]['equivalent_ids']['UMLS'])
        except:
            pass
        
        ids = [i for sub in ids for i in sub] # flatten and get rid of set()
            
        if len(ids) == 0:
            return 0
        else:
            return ids
    
    avgs = []
    num = 0
    for edge in G.edges.data():
        
        # get the ids
        ids = []
        ids.append(get_ids(edge[0]))
        ids.append(get_ids(edge[1]))
        
        
        if 0 in ids: # at least 1 doesn't have an ID
            edge[2]['rank'] = 0
            edge[2]['filteredBy'] = 'CoOccurrence'
            continue
    
        freq, numYears = 0,0
        
        linenum = 0
        # look for IDs in file
        with open('NIH_CoOccurs/summary_CoOccurs_2019.txt') as fp:
            for line in fp:
                line = line.strip().split('|')

                if (((line[0] in ids[0]) | (line[1] in ids[0])) & ((line[2] in ids[1]) | (line[3] in ids[1]))) | \
                (((line[0] in ids[1]) | (line[1] in ids[1])) & ((line[2] in ids[0]) | (line[3] in ids[0]))):
                    freq += int(line[4])
                    numYears += 1
                else:
                    if numYears > 0:
                        break # seems like all co-occs for a given pair are adjacent, don't need to go thru whole file

                linenum += 1
                if linenum % 1000000 == 0:
                    print('Line {}'.format(linenum))
        fp.close()
                    
        if numYears > 0:
            avgs.append([freq/numYears, edge[0], edge[1]]) #avg,node1,node2
            
        num += 1
        print('Edge {} done'.format(num))
    
    print('Exit loop')
    avgs.sort(reverse=True)
    
    print('Ranking nodes')
    # rank them
    for i in range(count):
        for edge in range(len(G[avgs[i][0]][avgs[i][1]])): # account for nodes w/ >1 edge
            G[avgs[i][0]][avgs[i][1]][edge]['rank'] = i+1
            G[avgs[i][0]][avgs[i][1]][edge]['filteredBy'] = 'CoOccurrence'
            G[avgs[i][0]][avgs[i][1]][edge]['avg'] = avgs[i][0]
        
    fp.close()
    return G
    

In [37]:
newG = filter_co_occurs_avg(seqd.G)

Line 1000000
Line 2000000
Line 3000000
Line 4000000
Line 5000000
Line 6000000
Line 7000000
Line 8000000
Line 9000000
Line 10000000
Line 11000000
Line 12000000
Line 13000000
Line 14000000
Line 15000000
Line 16000000
Line 17000000
Line 18000000
Line 19000000
Line 20000000
Line 21000000
Line 22000000
Line 23000000
Line 24000000
Line 25000000
Line 26000000
Line 27000000
Line 28000000
Line 29000000
Line 30000000
Line 31000000
Line 32000000
Line 33000000
Line 34000000
Line 35000000
Line 36000000
Line 37000000
Line 38000000
Line 39000000
Line 40000000
Line 41000000
Line 42000000
Line 43000000
Line 44000000
Line 45000000
Line 46000000
Line 47000000
Line 48000000
Line 49000000
Line 50000000
Line 51000000
Line 52000000
Line 53000000
Line 54000000
Line 55000000
Line 56000000
Line 57000000
Line 58000000
Line 59000000
Line 60000000
Line 61000000
Line 62000000
Line 63000000
Line 64000000
Line 65000000
Line 66000000
Line 67000000
Line 68000000
Line 69000000
Line 70000000
Line 71000000
Line 72000000
L

Line 65000000
Line 66000000
Line 67000000
Line 68000000
Line 69000000
Line 70000000
Line 71000000
Line 72000000
Line 73000000
Line 74000000
Line 75000000
Line 76000000
Line 77000000
Line 78000000
Line 79000000
Line 80000000
Line 81000000
Line 82000000
Line 83000000
Line 84000000
Line 85000000
Line 86000000
Line 87000000
Line 88000000
Line 89000000
Line 90000000
Line 91000000
Line 92000000
Line 93000000
Line 94000000
Line 95000000
Line 96000000
Line 97000000
Line 98000000
Line 99000000
Line 100000000
Line 101000000
Line 102000000
Line 103000000
Line 104000000
Line 105000000
Line 106000000
Line 107000000
Line 108000000
Line 109000000
Line 110000000
Line 111000000
Line 112000000
Line 113000000
Line 114000000
Line 115000000
Line 116000000
Line 117000000
Line 118000000
Line 119000000
Line 120000000
Line 121000000
Line 122000000
Line 123000000
Line 124000000
Line 125000000
Line 126000000
Line 127000000
Line 128000000
Line 129000000
Line 130000000
Line 131000000
Line 132000000
Line 133000000


Line 127000000
Line 128000000
Line 129000000
Line 130000000
Line 131000000
Line 132000000
Line 133000000
Line 134000000
Line 135000000
Line 136000000
Line 137000000
Line 138000000
Line 139000000
Line 140000000
Line 141000000
Line 142000000
Line 143000000
Line 144000000
Line 145000000
Line 146000000
Line 147000000
Line 148000000
Line 149000000
Line 150000000
Line 151000000
Line 152000000
Line 153000000
Line 154000000
Line 155000000
Line 156000000
Line 157000000
Line 158000000
Line 159000000
Line 160000000
Line 161000000
Line 162000000
Line 163000000
Line 164000000
Line 165000000
Line 166000000
Line 167000000
Line 168000000
Line 169000000
Line 170000000
Line 171000000
Line 172000000
Line 173000000
Line 174000000
Line 175000000
Line 176000000
Line 177000000
Line 178000000
Line 179000000
Line 180000000
Line 181000000
Line 182000000
Line 183000000
Line 184000000
Line 185000000
Line 186000000
Line 187000000
Line 188000000
Line 189000000
Line 190000000
Line 191000000
Line 192000000
Line 19300

Line 186000000
Line 187000000
Line 188000000
Line 189000000
Line 190000000
Line 191000000
Line 192000000
Line 193000000
Line 194000000
Line 195000000
Line 196000000
Line 197000000
Line 198000000
Line 199000000
Line 200000000
Line 201000000
Line 202000000
Line 203000000
Line 204000000
Line 205000000
Line 206000000
Line 207000000
Line 208000000
Line 209000000
Line 210000000
Line 211000000
Line 212000000
Line 213000000
Line 214000000
Line 215000000
Line 216000000
Line 217000000
Line 218000000
Line 219000000
Line 220000000
Line 221000000
Line 222000000
Line 223000000
Line 224000000
Line 225000000
Line 226000000
Line 227000000
Line 228000000
Line 229000000
Line 230000000
Line 231000000
Line 232000000
Line 233000000
Line 234000000
Line 235000000
Line 236000000
Line 237000000
Line 238000000
Line 239000000
Line 240000000
Line 241000000
Line 242000000
Line 243000000
Line 244000000
Line 245000000
Line 246000000
Line 247000000
Line 248000000
Line 249000000
Line 250000000
Edge 7 done
Line 1000000


Line 245000000
Line 246000000
Line 247000000
Line 248000000
Line 249000000
Line 250000000
Edge 9 done
Line 1000000
Line 2000000
Line 3000000
Line 4000000
Line 5000000
Line 6000000
Line 7000000
Line 8000000
Line 9000000
Line 10000000
Line 11000000
Line 12000000
Line 13000000
Line 14000000
Line 15000000
Line 16000000
Line 17000000
Line 18000000
Line 19000000
Line 20000000
Line 21000000
Line 22000000
Line 23000000
Line 24000000
Line 25000000
Line 26000000
Line 27000000
Line 28000000
Line 29000000
Line 30000000
Line 31000000
Line 32000000
Line 33000000
Line 34000000
Line 35000000
Line 36000000
Line 37000000
Line 38000000
Line 39000000
Line 40000000
Line 41000000
Line 42000000
Line 43000000
Line 44000000
Line 45000000
Line 46000000
Line 47000000
Line 48000000
Line 49000000
Line 50000000
Line 51000000
Line 52000000
Line 53000000
Line 54000000
Line 55000000
Line 56000000
Line 57000000
Line 58000000
Line 59000000
Line 60000000
Line 61000000
Line 62000000
Line 63000000
Line 64000000
Line 650000

KeyboardInterrupt: 

In [24]:
seqd = SingleEdgeQueryDispatcher(input_cls='Gene',
                                 output_cls='ChemicalSubstance',
                                 input_id='NCBIGene',
                                 values='1017')
seqd.query()

In [25]:
newG = filter_co_occurs_avg(seqd.G)

[['C0108855', 'C1332733'], ['C553669', 'C2698720']]


KeyboardInterrupt: 

In [157]:
for edge in range(len(seqd.G['NCBIGene:1017']['CHEBI:35222'])):
    seqd.G['NCBIGene:1017']['CHEBI:35222'][edge]['rank'] = 10000
    
seqd.G['NCBIGene:1017']['CHEBI:35222']

AtlasView({0: {'info': {'CHEBI': 'CHEBI:35222', '$api': 'Automat CORD19 Scigraph API', '$source': 'scigraph', '@type': 'ChemicalSubstance'}, 'label': 'related_to', 'source': 'scigraph', 'rank': 10000}, 1: {'info': {'@type': 'ChemicalSubstance', 'pmc': ['PMC6522292'], 'CHEBI': 'CHEBI:35222', '$api': 'CORD Gene API', '$source': 'Translator Text Mining Provider'}, 'label': 'related_to', 'source': 'Translator Text Mining Provider', 'rank': 10000}})

In [21]:
import time
start = time.time()
fp = open('NIH_CoOccurs/summary_CoOccurs_2019.txt')
print(time.time() - start)

0.0005218982696533203


In [105]:
def get_ids(node):
        ids = []
        try:
            ids.append(seqd.G.nodes[node]['equivalent_ids']['MESH'])
            ids.append(seqd.G.nodes[node]['equivalent_ids']['UMLS'])
        except:
            pass
        
        ids = [i for sub in ids for i in sub]
            
        if len(ids) == 0:
            return 0
        else:
            return ids

In [106]:
for i in seqd.G.nodes:
    print(get_ids(i))

['C0108855', 'C1332733']
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
['C553669', 'C2698720']
['C077990', 'C0174903']
0
0
0
0
0
0
['C000601854', 'C4078976']
0
['C546217', 'C2744793']
0
['C015330', 'C0065335']
['C067713', 'C0125179']
0
['D013936', 'C0040077']
0
['C120793', 'C0908

In [128]:


l = [[2, 'egg'], [30, 'b'], [15, 'a']]
l.sort(reverse=True)
l


[[30, 'b'], [15, 'a'], [2, 'egg']]