In [1]:
import os

from kgx import Transformer, NeoTransformer



In [63]:
# SemMedDB

# Connection settings for the SemMedDB Neo4j instance.
# Credentials can be found from 'Registry of Biolink-compatible Neo4 instances' spreadsheet.
# They are read from environment variables so that real values never need to be typed
# into (and saved with) the notebook. Each setting falls back to an empty string when
# its variable is unset.
semmeddb_uri = os.environ.get('SEMMEDDB_URI', '')
semmeddb_username = os.environ.get('SEMMEDDB_USERNAME', '')
semmeddb_password = os.environ.get('SEMMEDDB_PASSWORD', '')

# Initialize NeoTransformer
semmeddb_transformer = NeoTransformer(uri=semmeddb_uri, username=semmeddb_username, password=semmeddb_password)

In [64]:
# Define filters
# Each entry is a (filter name, value) pair; they are applied in the order listed.
semmeddb_filters = (
    ("subject_category", "gene"),
    ("object_category", "chemical_substance"),
    ("edge_label", "coexists_with"),
)
for filter_name, filter_value in semmeddb_filters:
    semmeddb_transformer.set_filter(filter_name, filter_value)

In [65]:
# Load nodes and edges from SemMedDB for the requested start/end window,
# then report what ended up in the transformer.
start, end = 0, 5000
semmeddb_transformer.load(
    start=start,
    end=end,
)
semmeddb_transformer.report()

2018-06-08 17:09:13,275 root       DEBUG      MATCH (n:gene:chemical_substance ) RETURN n SKIP 0 LIMIT 1000;
2018-06-08 17:09:13,444 root       DEBUG      MATCH (s:gene)-[p:coexists_with ]->(o:chemical_substance)
            RETURN s,p,o
            SKIP 0 LIMIT 1000;
2018-06-08 17:09:14,400 root       DEBUG      MATCH (s:gene)-[p:coexists_with ]->(o:chemical_substance)
            RETURN s,p,o
            SKIP 1000 LIMIT 1000;
2018-06-08 17:09:15,255 root       DEBUG      MATCH (s:gene)-[p:coexists_with ]->(o:chemical_substance)
            RETURN s,p,o
            SKIP 2000 LIMIT 1000;
2018-06-08 17:09:16,249 root       DEBUG      MATCH (s:gene)-[p:coexists_with ]->(o:chemical_substance)
            RETURN s,p,o
            SKIP 3000 LIMIT 1000;
2018-06-08 17:09:17,000 root       DEBUG      MATCH (s:gene)-[p:coexists_with ]->(o:chemical_substance)
            RETURN s,p,o
            SKIP 4000 LIMIT 1000;
Total number of nodes: 2375
Total number of edges: 3162


In [66]:
# Nodes of type 'gene' have 'id' in the UMLS namespace
# Each graph node is an (id, attributes) pair. .get() is used so that a node
# without a 'labels' attribute is skipped instead of raising KeyError.
# gene_nodes is not echoed here: only the last expression of a cell is displayed.
gene_nodes = [
    (node_id, attrs)
    for node_id, attrs in semmeddb_transformer.graph.nodes(data=True)
    if 'gene' in (attrs.get('labels') or [])
]

# But each node has an 'xrefs' attribute which defines one (or more) mapping to other ID namespace
# Nodes without a 'name' attribute simply do not match.
cd47_gene_node = [
    (node_id, attrs)
    for node_id, attrs in semmeddb_transformer.graph.nodes(data=True)
    if attrs.get('name') == 'CD47 gene'
]
cd47_gene_node

[('UMLS:C1332713',
  {'name': 'CD47 gene',
   'xrefs': ['OMIM:601028', 'MTH:NOCODE', 'NCI:C24278', 'HGNC:HGNC:1682'],
   'id': 'UMLS:C1332713',
   'umls_type': ['Gene or Genome'],
   'category': 'gene',
   'labels': ['gene']})]

In [67]:
# Remap the 'id' of 'gene' nodes from the UMLS namespace to the HGNC namespace,
# taking the new identifier from each node's 'xrefs' entry with the "HGNC:" prefix.
# Note: not every node has an HGNC xref; only those that do can be remapped.
semmeddb_transformer.remap_node_identifier(
    "gene",
    "xrefs",
    "HGNC:",
)

In [69]:
# We can confirm that the remap was successful by looking the CD47 gene node up
# again by name and checking the namespace of its 'id'.
# .get() is used so that nodes without a 'name' attribute do not match, instead of
# raising KeyError.
cd47_gene_node = [
    (node_id, attrs)
    for node_id, attrs in semmeddb_transformer.graph.nodes(data=True)
    if attrs.get('name') == 'CD47 gene'
]
cd47_gene_node

[('HGNC:1682',
  {'name': 'CD47 gene',
   'xrefs': ['OMIM:601028', 'MTH:NOCODE', 'NCI:C24278', 'HGNC:HGNC:1682'],
   'id': 'HGNC:1682',
   'umls_type': ['Gene or Genome'],
   'category': 'gene',
   'labels': ['gene']})]