# Reactome Radiate Analysis

In [1]:
import os
import sys
import warnings

# Explicit import: the CSV helper functions in this notebook call `pd.read_csv`.
import pandas as pd

# Locate the project root: everything before the first '/notebooks/' path
# component of the working directory. A trailing '/' is appended before
# searching so that a working directory that IS the notebooks folder
# (e.g. '/repo/notebooks') is also recognised; if there is no such component,
# the working directory itself is used as root.
_cwd = os.getcwd()
_marker_idx = (_cwd + '/').find('/notebooks/')
root = _cwd[:_marker_idx] if _marker_idx >= 0 else _cwd
sys.path.append(os.path.join(root, 'src'))

# Import GDS modules (must come after the sys.path update above)
from lifelike_gds.arango_network.radiate_trace import RadiateTrace
from lifelike_gds.arango_network.reactome import *
from lifelike_gds.arango_network.trace_graph_utils import *

# Ignore warnings
warnings.filterwarnings('ignore')

## Settings
Make sure to change these settings to the correct values for your local setup.

In [2]:
# Directory where the input data files are read from
input_dir = './input'

# Directory where results are written; created here if it does not exist yet
output_dir = './output'
os.makedirs(output_dir, exist_ok=True)

# Reactome arango database connection parameters
# If you are running this notebook from Lifelike's online training BinderHub website,
# then these parameters are already set for you in the environment.
# Falls back to 'reactome' when the REACTOME_DATABASE environment variable is not set.
arango_dbname = os.getenv('REACTOME_DATABASE', 'reactome')

## Parameters
Set parameters
- source_name: label for the set of source entities (also used in the name of the exported Excel file)
- source_file: name of the source file inside the input directory

In [3]:
# Label for the source node set; it is also embedded in the exported Excel file name
source_name = 'endo-updown-genes'
# File in `input_dir` that lists the source ids
source_file = 'updown.entrez'

## Define functions to get source/target nodes
- read input file with ids (stId or dbId)
- read input file with reference ids (gene_id or chebi_id)

In [4]:
"""
read csv file to get list of reactome nodes

Parameters
----------
csv_filename:  the input file
id_name: the property name in reactome db, e.g. stId, dbId
id_column: the column name for the id property
"""


def _read_id_column(csv_filename, id_column, sep=','):
    """
    Read the ids listed in one column of a delimited file under ``input_dir``.

    Parameters
    ----------
    csv_filename: the input file name, relative to ``input_dir``
    id_column: the column name holding the ids
    sep: the field delimiter used in the file

    Returns
    -------
    (ids, file_rows): the non-empty ids as strings, and the total number of
    data rows in the file (rows with an empty id cell are still counted)
    """
    df = pd.read_csv(os.path.join(input_dir, csv_filename), sep=sep, dtype='str')
    # Even with dtype='str', empty cells are read as NaN; drop them so that
    # only real ids are sent to the database query.
    ids = df[id_column].dropna().tolist()
    return ids, len(df)


def get_nodes_by_identity_from_file(csv_filename, id_name, id_column, sep=','):
    """
    Read a csv file and look up reactome nodes by an identity property.

    Parameters
    ----------
    csv_filename: the input file
    id_name: the property name in reactome db, e.g. stId, dbId
    id_column: the column name for the id property
    sep: the field delimiter used in the file

    Returns
    -------
    The result of ``database.get_nodes_by_attr`` for the ids in the file.
    """
    ids, file_rows = _read_id_column(csv_filename, id_column, sep)
    nodes = database.get_nodes_by_attr(ids, id_name)
    print('file_rows:', file_rows, ', nodes matched:', len(nodes))
    return nodes


def get_chemical_nodes_by_chebi(csv_filename, chebi_id_column, sep=','):
    """
    Read a csv file of chebi ids and look up the matching entity nodes.

    Parameters
    ----------
    csv_filename: the input file
    chebi_id_column: the column name holding the chebi ids
    sep: the field delimiter used in the file

    Returns
    -------
    The result of ``database.get_entity_nodes_by_chebi_ids`` for the ids in the file.
    """
    ids, file_rows = _read_id_column(csv_filename, chebi_id_column, sep)
    nodes = database.get_entity_nodes_by_chebi_ids(ids)
    print('file_rows:', file_rows, ', nodes matched:', len(nodes))
    return nodes


def get_protein_nodes_by_gene_id(csv_filename, gene_id_column, sep=','):
    """
    Read a csv file of gene ids and look up the matching entity nodes.

    Parameters
    ----------
    csv_filename: the input file
    gene_id_column: the column name holding the gene ids
    sep: the field delimiter used in the file

    Returns
    -------
    The result of ``database.get_entity_nodes_by_gene_ids`` for the ids in the file.
    """
    ids, file_rows = _read_id_column(csv_filename, gene_id_column, sep)
    nodes = database.get_entity_nodes_by_gene_ids(ids)
    print('file_rows:', file_rows, ', nodes matched:', len(nodes))
    return nodes


def get_reference_nodes_by_chebi(csv_filename, chebi_id_column, sep=','):
    """
    Read a csv file of chebi ids and look up the matching reference nodes.

    Parameters
    ----------
    csv_filename: the input file
    chebi_id_column: the column name holding the chebi ids
    sep: the field delimiter used in the file

    Returns
    -------
    The result of ``database.get_reference_nodes_by_chebi_ids`` for the ids in the file.
    """
    ids, file_rows = _read_id_column(csv_filename, chebi_id_column, sep)
    nodes = database.get_reference_nodes_by_chebi_ids(ids)
    print('file_rows:', file_rows, ', nodes matched:', len(nodes))
    return nodes


def get_reference_nodes_by_gene_id(csv_filename, gene_id_column, sep=','):
    """
    Read a csv file of gene ids and look up the matching reference nodes.

    Parameters
    ----------
    csv_filename: the input file
    gene_id_column: the column name holding the gene ids
    sep: the field delimiter used in the file

    Returns
    -------
    The result of ``database.get_reference_nodes_by_gene_ids`` for the ids in the file.
    """
    ids, file_rows = _read_id_column(csv_filename, gene_id_column, sep)
    nodes = database.get_reference_nodes_by_gene_ids(ids)
    print('file_rows:', file_rows, ', nodes matched:', len(nodes))
    return nodes

### Define function to run radiate analysis and export to excel file

In [5]:
"""
Perform radiate analysis from the given source_nodes, and export pageranks and rev_pageranks
into excel file. The excel file contains two tabs, one for pageranks and one for reverse pageranks.
The data are sorted by pagerank (or rev_pagerank)
rows_export: define the top ranked rows exported into file
"""


def export_radiate_analysis(
    tracegraph,
    source_name,
    source_nodes,
    exclude_sources_from_file=False,
    rows_export=4000,
):
    """
    Run a radiate analysis from ``source_nodes`` and export the ranked results.

    Parameters
    ----------
    tracegraph: the trace graph object; its working graph is reset from
        ``orig_graph`` before the analysis
    source_name: name of the source node set, also used in the output file name
    source_nodes: the nodes to radiate from
    exclude_sources_from_file: forwarded as ``exclude_sources`` to the export call
    rows_export: number of top ranked rows exported into the file
    """
    workbook_name = f"Radiate_analysis_for_{source_name}.xlsx"
    export_options = {
        'direction': 'both',
        'num_nodes': rows_export,
        'exclude_sources': exclude_sources_from_file,
    }

    # Work on a fresh copy so the original graph is left untouched.
    tracegraph.graph = tracegraph.orig_graph.copy()
    # The source name serves as both the second and third argument here.
    tracegraph.set_node_set_from_arango_nodes(source_nodes, source_name, source_name)
    tracegraph.export_pagerank_data(source_name, workbook_name, **export_options)

## Connect to arango database

In [6]:
# Create the Reactome database handle. The node-lookup helper functions defined
# earlier read this module-level `database` variable, so run this cell before
# calling any of them.
# NOTE(review): `arango_dbname` from the settings cell is not passed here;
# presumably ReactomeDB picks up its connection settings on its own - confirm.
database = ReactomeDB()

## Load graph from arango graph database into memory

In [7]:
# Build the radiate-trace object on top of the Reactome database handle
tracegraph = RadiateTrace(Reactome(database))

# set the output directory that the excel and graph files will be written to
tracegraph.datadir = output_dir

# initialize tracegraph by loading graph data from arango
# a networkx graph is created here.
tracegraph.init_default_graph()

INFO: load reactome graph
INFO: MultiDirectedGraph with 71225 nodes and 112575 edges


## Get source nodes, run radiate analysis, and export data to excel

In [None]:
# Look up the source nodes from the ids in the 'gene_id' column of the source file
source_nodes = get_protein_nodes_by_gene_id(source_file, 'gene_id')
# Run the analysis with the default export settings (top 4000 rows, sources not excluded)
export_radiate_analysis(tracegraph, source_name, source_nodes)


INFO: 92 gene_ids, matched to 119 nodes


file_rows: 92 , nodes matched: 119


INFO: set pagerank and num reach for endo-updown-genes
INFO: export top 4000 pagerank data into ./output/Radiate_analysis_for_endo-updown-genes.xlsx
