# BTE metakg visualization

The goal of this notebook is to visualize the metaKG that BTE uses.  The result is similar to the [subway diagram](https://raw.githubusercontent.com/biothings/BioThings_Explorer_TRAPI/main/diagrams/smartapi_metagraph.png) we've used before, but updated to the size and scale of the current metakg.

This notebook takes as input an ndjson file with the SmartAPI metakg (originally provided by Chunlei on 2023-03-01).

Optimizations
* remove less-commonly-used node types from subject/object
* only count in one direction (`A-treats-B` gets merged with `B-treated_by-A`)

In [1]:
import re
from pathlib import Path

import json5
import networkx as nx
import pandas as pd
import requests


## Read in data

First, read in the SmartAPI ndjson file.

In [2]:
# Load the SmartAPI metakg dump; `lines=True` reads one JSON record per line.
metakg_path = 'data/smartapi_metakg_03012023.ndjson.gz'
df = pd.read_json(metakg_path, lines=True)
df

Unnamed: 0,subject,object,predicate,api,provided_by
0,AnatomicalEntity,MolecularActivity,affects_activity_of,"{'name': 'CAM-KP API', 'smartapi': {'metadata'...",
1,AnatomicalEntity,MolecularEntity,affects_activity_of,"{'name': 'CAM-KP API', 'smartapi': {'metadata'...",
2,AnatomicalEntity,NamedThing,affects_activity_of,"{'name': 'CAM-KP API', 'smartapi': {'metadata'...",
3,AnatomicalEntity,NucleicAcidEntity,affects_activity_of,"{'name': 'CAM-KP API', 'smartapi': {'metadata'...",
4,AnatomicalEntity,Occurrent,affects_activity_of,"{'name': 'CAM-KP API', 'smartapi': {'metadata'...",
...,...,...,...,...,...
175543,Cell,Phenomenon,causes,{'name': 'ARAX Translator Reasoner - TRAPI 1.3...,
175544,Transcript,CellLine,physically_interacts_with,{'name': 'ARAX Translator Reasoner - TRAPI 1.3...,
175545,Disease,PhenotypicFeature,entity_positively_regulates_entity,{'name': 'ARAX Translator Reasoner - TRAPI 1.3...,
175546,Device,Vitamin,entity_negatively_regulates_entity,{'name': 'ARAX Translator Reasoner - TRAPI 1.3...,


Parse the API name and SmartAPI ID out of the nested `api` column into their own columns.

In [3]:
# Flatten the nested `api` dicts a single time; both new columns are read
# from the same flattened frame instead of normalizing the column twice.
api_fields = pd.json_normalize(df['api'])
df = df.assign(
    api_name=api_fields['name'],
    api_id=api_fields['smartapi.id'],
)
df

Unnamed: 0,subject,object,predicate,api,provided_by,api_name,api_id
0,AnatomicalEntity,MolecularActivity,affects_activity_of,"{'name': 'CAM-KP API', 'smartapi': {'metadata'...",,CAM-KP API,c409da43f5177ae52392e2b346dbc585
1,AnatomicalEntity,MolecularEntity,affects_activity_of,"{'name': 'CAM-KP API', 'smartapi': {'metadata'...",,CAM-KP API,c409da43f5177ae52392e2b346dbc585
2,AnatomicalEntity,NamedThing,affects_activity_of,"{'name': 'CAM-KP API', 'smartapi': {'metadata'...",,CAM-KP API,c409da43f5177ae52392e2b346dbc585
3,AnatomicalEntity,NucleicAcidEntity,affects_activity_of,"{'name': 'CAM-KP API', 'smartapi': {'metadata'...",,CAM-KP API,c409da43f5177ae52392e2b346dbc585
4,AnatomicalEntity,Occurrent,affects_activity_of,"{'name': 'CAM-KP API', 'smartapi': {'metadata'...",,CAM-KP API,c409da43f5177ae52392e2b346dbc585
...,...,...,...,...,...,...,...
175543,Cell,Phenomenon,causes,{'name': 'ARAX Translator Reasoner - TRAPI 1.3...,,ARAX Translator Reasoner - TRAPI 1.3.0,e248aefca0f469229e82cca80fbabc89
175544,Transcript,CellLine,physically_interacts_with,{'name': 'ARAX Translator Reasoner - TRAPI 1.3...,,ARAX Translator Reasoner - TRAPI 1.3.0,e248aefca0f469229e82cca80fbabc89
175545,Disease,PhenotypicFeature,entity_positively_regulates_entity,{'name': 'ARAX Translator Reasoner - TRAPI 1.3...,,ARAX Translator Reasoner - TRAPI 1.3.0,e248aefca0f469229e82cca80fbabc89
175546,Device,Vitamin,entity_negatively_regulates_entity,{'name': 'ARAX Translator Reasoner - TRAPI 1.3...,,ARAX Translator Reasoner - TRAPI 1.3.0,e248aefca0f469229e82cca80fbabc89


Read in the BTE config file that specifies currently-allowed APIs

In [4]:
bte_config_url = "https://raw.githubusercontent.com/biothings/BioThings_Explorer_TRAPI/main/src/config/apis.js"

# Bound the wait and surface HTTP errors here, rather than letting an error
# page reach the JSON5 parser below and fail with a confusing message.
r = requests.get(bte_config_url, timeout=30)
r.raise_for_status()
str_bte_config = r.text

# The fetched file is JavaScript source, so reduce it to the bare object
# literal before parsing.  Patterns are raw strings so that `\s` and `\.` are
# regex escapes rather than (invalid) Python string escapes.
# NOTE: the `//` rule would also cut a string value that contains `//`.
str_bte_config = re.sub(r"exports\.API_LIST = ", "", str_bte_config)                       # remove variable assignment step
str_bte_config = re.sub(r"\s*//.*",              "", str_bte_config)                       # remove `//` comments and the whitespace before them
str_bte_config = re.sub(r'^$\n',                '', str_bte_config, flags=re.MULTILINE)   # remove blank lines
str_bte_config = re.sub(r',\s*exclude:[^\]]*]', '', str_bte_config, flags=re.MULTILINE)   # remove 'exclude' section
str_bte_config = re.sub(r';$',                  '', str_bte_config, flags=re.MULTILINE)   # remove trailing semicolons at line ends

bte_config = json5.loads(str_bte_config)
bte_config

{'include': [{'id': 'd22b657426375a5295e7da8a303b9893', 'name': 'BioLink API'},
  {'id': '43af91b3d7cae43591083bff9d75c6dd', 'name': 'EBI Proteins API'},
  {'id': 'dca415f2d792976af9d642b7e73f7a41', 'name': 'LitVar API'},
  {'id': '1f277e1563fcfd124bfae2cc3c4bcdec', 'name': 'QuickGO API'},
  {'id': '1c056ffc7ed0dd1229e71c4752239465',
   'name': 'Ontology Lookup Service API'},
  {'id': '38e9e5169a72aee3659c9ddba956790d',
   'name': 'BioThings BindingDB API'},
  {'id': '55a223c6c6e0291dbd05f2faf27d16f4',
   'name': 'BioThings BioPlanet Pathway-Disease API'},
  {'id': 'b99c6dd64abcefe87dcd0a51c249ee6d',
   'name': 'BioThings BioPlanet Pathway-Gene API'},
  {'id': '00fb85fc776279163199e6c50f6ddfc6', 'name': 'BioThings DDInter API'},
  {'id': 'e3edd325c76f2992a111b43a907a4870', 'name': 'BioThings DGIdb API'},
  {'id': 'a7f784626a426d054885a5f33f17d3f8', 'name': 'BioThings DISEASES API'},
  {'id': '1f47552dabd67351d4c625adb0a10d00',
   'name': 'BioThings EBIgene2phenotype API'},
  {'id': 'cc

In [5]:
# Collect the `id` of every entry in the config's `include` list.
bte_config_ids = [api_entry['id'] for api_entry in bte_config['include']]
bte_config_ids

['d22b657426375a5295e7da8a303b9893',
 '43af91b3d7cae43591083bff9d75c6dd',
 'dca415f2d792976af9d642b7e73f7a41',
 '1f277e1563fcfd124bfae2cc3c4bcdec',
 '1c056ffc7ed0dd1229e71c4752239465',
 '38e9e5169a72aee3659c9ddba956790d',
 '55a223c6c6e0291dbd05f2faf27d16f4',
 'b99c6dd64abcefe87dcd0a51c249ee6d',
 '00fb85fc776279163199e6c50f6ddfc6',
 'e3edd325c76f2992a111b43a907a4870',
 'a7f784626a426d054885a5f33f17d3f8',
 '1f47552dabd67351d4c625adb0a10d00',
 'cc857d5b7c8b7609b5bbb38ff990bfff',
 'f339b28426e7bf72028f60feefcd7465',
 '34bad236d77bea0a0ee6c6cba5be54a6',
 '316eab811fd9ef1097df98bcaa9f7361',
 'a5b0ec6bfde5008984d4b6cde402d61f',
 '32f36164fabed5d3abe6c2fd899c9418',
 '77ed27f111262d0289ed4f4071faa619',
 'edeb26858bd27d0322af93e7a9e08761',
 '03283cc2b21c077be6794e1704b1d230',
 '1d288b3a3caf75d541ffaae3aab386c8',
 'ec6d76016ef40f284359d17fbf78df20',
 '8f08d1446e0bb9c2b323713ce83e2bd3',
 '671b45c0301c8624abbd26ae78449ca2',
 '59dce17363dce279d389100834e43648',
 '09c8782d9f4027712e65b95424adba79',
 

## Join SmartAPI data with BTE config IDs

In [6]:
# Keep only the operations whose API ID appears in the BTE config.
in_bte_config = df['api_id'].isin(bte_config_ids)
df_bte = df[in_bte_config]
df_bte

Unnamed: 0,subject,object,predicate,api,provided_by,api_name,api_id
3000,SequenceVariant,Gene,is_sequence_variant_of,"{'name': 'LitVar API', 'smartapi': {'metadata'...",infores:dbsnp,LitVar API,dca415f2d792976af9d642b7e73f7a41
5115,Gene,Gene,orthologous_to,"{'name': 'Automat-biolink(Trapi v1.3.0)', 'sma...",,Automat-biolink(Trapi v1.3.0),25085b05fd1afcebb497724d147cfb44
5116,Gene,Gene,superclass_of,"{'name': 'Automat-biolink(Trapi v1.3.0)', 'sma...",,Automat-biolink(Trapi v1.3.0),25085b05fd1afcebb497724d147cfb44
5117,Gene,Gene,biomarker_for,"{'name': 'Automat-biolink(Trapi v1.3.0)', 'sma...",,Automat-biolink(Trapi v1.3.0),25085b05fd1afcebb497724d147cfb44
5118,Gene,Gene,phenotype_of,"{'name': 'Automat-biolink(Trapi v1.3.0)', 'sma...",,Automat-biolink(Trapi v1.3.0),25085b05fd1afcebb497724d147cfb44
...,...,...,...,...,...,...,...
130260,PhenotypicFeature,SmallMolecule,contribution_from,{'name': 'Text Mining Targeted Association API...,,Text Mining Targeted Association API,978fe380a147a8641caf72320862697b
130261,SmallMolecule,Disease,contributes_to,{'name': 'Text Mining Targeted Association API...,,Text Mining Targeted Association API,978fe380a147a8641caf72320862697b
130262,Disease,SmallMolecule,contribution_from,{'name': 'Text Mining Targeted Association API...,,Text Mining Targeted Association API,978fe380a147a8641caf72320862697b
130263,SmallMolecule,Disease,contributes_to,{'name': 'Text Mining Targeted Association API...,,Text Mining Targeted Association API,978fe380a147a8641caf72320862697b


In [7]:
# Make sure the output directory exists before the first write, so the
# notebook also runs on a checkout that has no `results/` folder yet.
Path("results").mkdir(parents=True, exist_ok=True)
df_bte.to_csv("results/bte_operations.tsv", sep="\t")

In [8]:
# Number of operations contributed by each API, most frequent first.
df_bte.loc[:, 'api_name'].value_counts()

BioThings SEMMEDDB API                     2497
Automat-icees-kg(Trapi v1.3.0)              454
Automat-ctd(Trapi v1.3.0)                   441
Multiomics EHR Risk KP API                  204
Multiomics Wellness KP API                  196
Automat-hetio(Trapi v1.3.0)                 190
Automat-biolink(Trapi v1.3.0)               134
Automat-pharos(Trapi v1.3.0)                 99
Automat-hmdb(Trapi v1.3.0)                   66
Automat-human-goa(Trapi v1.3.0)              58
COHD TRAPI 1.3                               50
Automat-gtopdb(Trapi v1.3.0)                 42
Automat-viral-proteome(Trapi v1.3.0)         35
Automat-panther(Trapi v1.3.0)                34
Text Mining Targeted Association API         32
MyDisease.info API                           26
BioThings DGIdb API                          24
MyChem.info API                              20
BioLink API                                  19
Multiomics BigGIM-DrugResponse KP API        18
MyGene.info API                         

## Summarization

### by subject, object; count # of APIs

In [9]:
df1 = df_bte[["subject", "object", "api_name"]]

# For every (subject, object) pair: how many distinct APIs provide it, and
# which ones.  The grouped selection is built once and reused for both columns.
apis_by_pair = df1.groupby(['subject', 'object'], group_keys=False)['api_name']
api_stats = pd.DataFrame({
    'count': apis_by_pair.nunique(),
    'list': apis_by_pair.unique().apply(list),
}).reset_index()

display_options = (
    'display.min_rows', 20,
    'display.max_columns', None,
    'display.precision', 3,
)
with pd.option_context(*display_options):
    print(api_stats)


           subject                           object  count  \
0         Activity                         Activity      1   
1         Activity                       Annotation      1   
2         Activity                BiologicalProcess      1   
3         Activity                   ChemicalEntity      1   
4         Activity                 ChemicalExposure      1   
5         Activity                ClinicalAttribute      1   
6         Activity             ClinicalIntervention      1   
7         Activity          ComplexMolecularMixture      1   
8         Activity                           Device      1   
9         Activity                          Disease      1   
..             ...                              ...    ...   
672  SmallMolecule                       Phenomenon      1   
673  SmallMolecule                PhenotypicFeature      9   
674  SmallMolecule             PhysiologicalProcess      3   
675  SmallMolecule                      Polypeptide      7   
676  Sma

In [10]:
# Save the per-pair API summary as a tab-separated file.
api_stats.to_csv(path_or_buf="results/api_stats.tsv", sep="\t")


### by subject, object; count # of predicates

In [11]:
df1 = df_bte[["subject", "object", "predicate"]]

# For every (subject, object) pair: how many distinct predicates connect it,
# and which ones.  The grouped selection is built once and reused for both columns.
predicates_by_pair = df1.groupby(['subject', 'object'], group_keys=False)['predicate']
predicate_stats = pd.DataFrame({
    'count': predicates_by_pair.nunique(),
    'list': predicates_by_pair.unique().apply(list),
}).reset_index()

display_options = (
    'display.min_rows', 20,
    'display.max_columns', None,
    'display.precision', 3,
)
with pd.option_context(*display_options):
    print(predicate_stats)


           subject                           object  count  \
0         Activity                         Activity      1   
1         Activity                       Annotation      1   
2         Activity                BiologicalProcess      1   
3         Activity                   ChemicalEntity      1   
4         Activity                 ChemicalExposure      1   
5         Activity                ClinicalAttribute      1   
6         Activity             ClinicalIntervention      1   
7         Activity          ComplexMolecularMixture      1   
8         Activity                           Device      1   
9         Activity                          Disease      1   
..             ...                              ...    ...   
672  SmallMolecule                       Phenomenon      1   
673  SmallMolecule                PhenotypicFeature     16   
674  SmallMolecule             PhysiologicalProcess      9   
675  SmallMolecule                      Polypeptide     13   
676  Sma

In [12]:
# Save the per-pair predicate summary as a tab-separated file.
predicate_stats.to_csv(path_or_buf="results/predicate_stats.tsv", sep="\t")

## Filter by most common types

Filter to only include the most common types of entities.  Also, since we _mostly_ have the same info in both directions, only keep one direction to simplify visualization

In [13]:
NUM_TYPES_TO_KEEP = 10

# Count how often each entity type appears on either side of a
# (subject, object) pair, then keep the most frequent ones.
type_counts = pd.concat(
    [predicate_stats['subject'], predicate_stats['object']]
).value_counts()
keep = set(type_counts.head(NUM_TYPES_TO_KEEP).index)
keep

{'BiologicalProcess',
 'ChemicalEntity',
 'Disease',
 'Gene',
 'MolecularMixture',
 'PhenotypicFeature',
 'PhysiologicalProcess',
 'Procedure',
 'Protein',
 'SmallMolecule'}

In [14]:
# Keep pairs whose subject and object are both in `keep`.  The
# `subject <= object` comparison retains a single orientation of each pair
# (the one whose subject sorts first) plus self-pairs; a pair that exists
# only in the other orientation is dropped.
predicate_stats_filt = predicate_stats.loc[
    lambda pairs: pairs['subject'].isin(keep)
    & pairs['object'].isin(keep)
    & (pairs['subject'] <= pairs['object'])
]
predicate_stats_filt.to_csv("results/predicate_stats_filt.tsv", sep="\t")
predicate_stats_filt

Unnamed: 0,subject,object,count,list
56,BiologicalProcess,BiologicalProcess,10,"[subclass_of, superclass_of, process_negativel..."
57,BiologicalProcess,ChemicalEntity,1,[correlated_with]
62,BiologicalProcess,Disease,3,"[subclass_of, superclass_of, correlated_with]"
65,BiologicalProcess,Gene,3,"[correlated_with, actively_involves, has_parti..."
69,BiologicalProcess,MolecularMixture,1,[correlated_with]
71,BiologicalProcess,PhenotypicFeature,2,"[superclass_of, correlated_with]"
72,BiologicalProcess,PhysiologicalProcess,1,[correlated_with]
74,BiologicalProcess,Procedure,1,[correlated_with]
75,BiologicalProcess,Protein,6,"[actively_involved_in, correlated_with, preced..."
77,BiologicalProcess,SmallMolecule,1,[correlated_with]


In [15]:
# Keep pairs whose subject and object are both in `keep`.  The
# `subject <= object` comparison retains a single orientation of each pair
# (the one whose subject sorts first) plus self-pairs; a pair that exists
# only in the other orientation is dropped.
api_stats_filt = api_stats.loc[
    lambda pairs: pairs['subject'].isin(keep)
    & pairs['object'].isin(keep)
    & (pairs['subject'] <= pairs['object'])
]
api_stats_filt.to_csv("results/api_stats_filt.tsv", sep="\t")
api_stats_filt

Unnamed: 0,subject,object,count,list
56,BiologicalProcess,BiologicalProcess,5,"[Automat-viral-proteome(Trapi v1.3.0), Automat..."
57,BiologicalProcess,ChemicalEntity,1,[Automat-icees-kg(Trapi v1.3.0)]
62,BiologicalProcess,Disease,3,"[Automat-viral-proteome(Trapi v1.3.0), Automat..."
65,BiologicalProcess,Gene,3,"[Automat-icees-kg(Trapi v1.3.0), Automat-hetio..."
69,BiologicalProcess,MolecularMixture,1,[Automat-icees-kg(Trapi v1.3.0)]
71,BiologicalProcess,PhenotypicFeature,3,"[Automat-viral-proteome(Trapi v1.3.0), Automat..."
72,BiologicalProcess,PhysiologicalProcess,1,[Automat-icees-kg(Trapi v1.3.0)]
74,BiologicalProcess,Procedure,1,[Automat-icees-kg(Trapi v1.3.0)]
75,BiologicalProcess,Protein,3,"[Automat-viral-proteome(Trapi v1.3.0), Automat..."
77,BiologicalProcess,SmallMolecule,1,[Automat-icees-kg(Trapi v1.3.0)]


## Export to graphml

In [16]:
def create_graph(df2, filename):
    """Write an undirected graph of entity-type pairs to a GraphML file.

    Every distinct value found in the ``subject`` and ``object`` columns of
    ``df2`` becomes a node whose ``label`` attribute is the result of
    ``add_spacing`` for that value.  Every row becomes an edge between its
    subject and object, with the row's ``count`` stored as the edge's
    ``weight`` attribute.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Frame with ``subject``, ``object`` and ``count`` columns.
    filename : str
        Destination passed to ``nx.write_graphml``.
    """
    graph = nx.Graph()

    node_types = set(pd.concat([df2['subject'], df2['object']]))
    graph.add_nodes_from(
        (node_type, {"label": add_spacing(node_type)})
        for node_type in node_types
    )

    # One (subject, object, weight) triple per row, in row order.
    graph.add_weighted_edges_from(
        zip(df2['subject'], df2['object'], df2['count'])
    )

    nx.write_graphml(graph, filename, infer_numeric_types=True)

In [17]:
# Multi-line display labels for selected entity type names.  Built once at
# definition time; each type name appears exactly once.
_NODE_LABEL_OVERRIDES = {
    "BiologicalProcess":               "Biological\nProcess",
    "ChemicalEntity":                  "Chemical\nEntity",
    "MolecularMixture":                "Molecular\nMixture",
    "PhysiologicalProcess":            "Physiological\nProcess",
    "SmallMolecule":                   "Small\nMolecule",
    "PhenotypicFeature":               "Phenotypic\nFeature",
    "ChemicalExposure":                "Chemical\nExposure",
    "ClinicalAttribute":               "Clinical\nAttribute",
    "ClinicalIntervention":            "Clinical\nIntervention",
    "ComplexMolecularMixture":         "Complex\nMolecular\nMixture",
    "EnvironmentalExposure":           "Environmental\nExposure",
    "InformationContentEntity":        "Information\nContentEntity",
    "PopulationOfIndividualOrganisms": "PopulationOf\nIndividualOrganisms",
}


def add_spacing(str):
    """Return the display label for an entity type name.

    Names listed in ``_NODE_LABEL_OVERRIDES`` are returned with embedded
    newlines; any other name is returned unchanged.

    Parameters
    ----------
    str : str
        Entity type name, e.g. ``"BiologicalProcess"``.  The parameter name
        shadows the builtin ``str``; it is kept so existing callers are
        unaffected.

    Returns
    -------
    str
        The mapped label, or the input itself when no mapping exists.
    """
    return _NODE_LABEL_OVERRIDES.get(str, str)

In [18]:
# Export each filtered summary table as its own GraphML file.
graph_exports = (
    (api_stats_filt, "results/api_stats_filt.graphml"),
    (predicate_stats_filt, "results/predicate_stats_filt.graphml"),
)
for stats_table, graphml_path in graph_exports:
    create_graph(stats_table, graphml_path)

