# Radiate Analysis

In [1]:
# install lifelike_gds package if not already installed (e.g. running in Google Colab)
import importlib

if importlib.util.find_spec('lifelike_gds') is None:
  !pip install git+https://github.com/SBRG/GDS-Public

# provide the path to the notebook folder in the github repository in case the notebook is run in Google Colab
github_path = 'SBRG/GDS-Public/main/notebooks/generic'

In [2]:
import pandas as pd
import networkx as nx
import os
import warnings
from pathlib import PurePosixPath

In [3]:
from lifelike_gds.arango_network.biocyc import *
from lifelike_gds.arango_network.radiate_trace import RadiateTrace
from lifelike_gds.arango_network.trace_graph_utils import *



In [4]:
# Silence all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

## Settings
Adjust the values below to match your local environment.

In [5]:
# Name of the GDS database; used as the default when connecting.
db_name = 'ecocyc-secondaries'
# GDS database version: free text that can be used to describe the graph.
db_version = 'ecocyc 25.5 secondaries'

# Folders for the input files and for the exported results.
input_dir = PurePosixPath('input')
output_dir = PurePosixPath('output')
# Create the output folder up front; an already existing folder is not an error.
os.makedirs(output_dir, mode=0o777, exist_ok=True)

## Parameters
Set parameters
- source_name: name for the source entities.
- source_file: name of the source file in the input directory.
- source_ids: list of source biocyc_ids. You need either source_file or source_ids to get the source nodes.


In [6]:
# Label for this set of source entities; it names the node set and is part of
# the exported Excel file name.
source_name = 'Tyr_genes'
# CSV file (looked up in the input directory) that lists the source entities.
source_file = 'Tyr_gene_table.csv'
# Alternative to a source file: give the sources directly as a list of biocyc_ids.
# source_biocyc_ids = ['ZN+2', 'FE+2', 'FE+3']

### Define functions to get nodes from an input file or from a list of biocyc_ids or names

In [7]:
"""
Read column 'biocyc_id' from the file and find the matching nodes.
""" 
def get_nodes_by_biocyc_id_from_file(csv_filename, id_column='biocyc_id'):
    """
    Read biocyc_ids from a CSV file and look up the matching nodes.

    The file is looked up in ``input_dir`` first; if it is not found there, it
    is read from the notebook folder of the GitHub repository instead.

    csv_filename: file name relative to ``input_dir``
    id_column: name of the CSV column that holds the biocyc_ids
    Returns the result of ``database.get_nodes_by_attr`` for those ids on the
    'biocyc_id' attribute.
    """
    csv_file_path = input_dir / csv_filename
    if os.path.isfile(csv_file_path):
        csv_file_ref = csv_file_path
    else:
        # if the file does not exist locally, pull it from github
        csv_file_ref = f'https://raw.githubusercontent.com/{github_path}/{csv_file_path}'

    df = pd.read_csv(csv_file_ref)
    # pandas reads blank cells as NaN; skip them so only real ids are queried.
    ids = df[id_column].dropna().tolist()
    nodes = database.get_nodes_by_attr(ids, 'biocyc_id')
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes


"""
Read column 'name' from the file.  You need to pass the entity type, such as Gene, Compound, Protein.
This method only matches the name, not the synonyms.
""" 
def get_nodes_by_name_from_file(csv_filename, nodeLabel, name_column='name'):
    """
    Read entity names from a CSV file and look up the matching nodes.

    The file is looked up in ``input_dir`` first; if it is not found there, it
    is read from the notebook folder of the GitHub repository instead.
    Only the 'name' attribute is queried, not the synonyms.

    csv_filename: file name relative to ``input_dir``
    nodeLabel: entity label, e.g. Gene, Compound, Protein
    name_column: name of the CSV column that holds the entity names
    Returns the result of ``database.get_nodes_by_attr`` for those names.
    """
    csv_file_path = input_dir / csv_filename
    if os.path.isfile(csv_file_path):
        csv_file_ref = csv_file_path
    else:
        # if the file does not exist locally, pull it from github
        csv_file_ref = f'https://raw.githubusercontent.com/{github_path}/{csv_file_path}'

    df = pd.read_csv(csv_file_ref)
    # pandas reads blank cells as NaN; skip them so only real names are queried.
    names = df[name_column].dropna().tolist()
    nodes = database.get_nodes_by_attr(names, 'name', nodeLabel)
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes

"""
Get list of nodes from list of biocyc_ids
""" 
def get_nodes_by_biocyc_id(biocyc_ids: list):
    """
    Look up nodes for a list of biocyc_ids.

    biocyc_ids: list of biocyc_ids to query
    Returns the result of ``database.get_nodes_by_attr`` for those ids on the
    'biocyc_id' attribute; the number of ids and of matched nodes is printed.
    """
    nodes = database.get_nodes_by_attr(biocyc_ids, 'biocyc_id')
    print('ids provided:', len(biocyc_ids), ', nodes matched:', len(nodes))
    return nodes

"""
Get list of nodes by name
names: list of names, e.g. gene names
nodeLabel: entity label, e.g. Gene, Protein, Compound, Reaction
"""
def get_nodes_by_name(names: list, nodeLabel: str):
    """
    Look up nodes by name.

    names: list of names, e.g. gene names
    nodeLabel: entity label, e.g. Gene, Protein, Compound, Reaction
    Returns the result of ``database.get_nodes_by_attr`` for those names on the
    'name' attribute; the number of names and of matched nodes is printed.
    """
    nodes = database.get_nodes_by_attr(names, 'name', nodeLabel)
    print('names provided:', len(names), ', nodes matched:', len(nodes))
    return nodes

### Define function to run radiate analysis and export to excel file

In [8]:
"""
Perform radiate analysis from the given source_nodes, and export pageranks and rev_pageranks
into excel file. The excel file contains two tabs, one for pageranks and one for reverse pageranks.
The data are sorted by pagerank (or rev_pagerank)
rows_export: define the top ranked rows exported into file
"""
def export_radiate_analysis(tracegraph, source_name, source_nodes, rows_export=4000):
    """
    Run radiate analysis from the given source nodes and export the result.

    Pageranks and reverse pageranks are exported into one excel file named
    ``Radiate_analysis_for_<source_name>.xlsx``. The file contains two tabs,
    one for pageranks and one for reverse pageranks, each sorted by its rank.

    tracegraph: trace graph object whose graph has already been loaded
    source_name: name used for the source node set and in the file name
    source_nodes: arango nodes to radiate from
    rows_export: number of top-ranked rows exported into the file
    """
    # Work on a fresh copy of the originally loaded graph
    # (presumably so that earlier runs do not affect this one).
    tracegraph.graph = tracegraph.orig_graph.copy()
    tracegraph.set_node_set_from_arango_nodes(source_nodes, source_name, source_name)
    outfile_name = f"Radiate_analysis_for_{source_name}.xlsx"
    tracegraph.export_pagerank_data(source_name, outfile_name, direction='both', num_nodes=rows_export)

## Connect to arango database

In [9]:
# The ARANGO_DATABASE environment variable, when set, takes precedence over db_name.
dbname = os.environ.get('ARANGO_DATABASE', db_name)
print(dbname)

# Database handle used by the node lookup functions.
database = BiocycDB(dbname)

ecocyc-secondaries


## Load graph from arango graph database into memory

In [10]:
# create the trace graph object on top of the Biocyc wrapper for the database
tracegraph = RadiateTrace(Biocyc(database))
# set up the output directory where the excel and graph files will be written
tracegraph.datadir = output_dir
# initialise tracegraph by loading the graph data from arango;
# a networkx graph is created here.
tracegraph.init_default_graph()

INFO: MultiDirectedGraph with 33428 nodes and 37417 edges


## Get source nodes, run radiate analysis, and export data to excel

In [11]:
# Look up the Gene nodes named in the 'gene_name' column of the source file,
# then run the radiate analysis and export the result.
source_nodes = get_nodes_by_name_from_file(
    source_file, nodeLabel='Gene', name_column='gene_name'
)
export_radiate_analysis(
    tracegraph, source_name=source_name, source_nodes=source_nodes
)

file_rows: 10 , nodes matched: 16


INFO: set pagerank and num reach for Tyr_genes
INFO: export top 4000 pagerank data into output/Radiate_analysis_for_Tyr_genes.xlsx
