# Radiate Traces

In [1]:
# install lifelike_gds package if not already installed (e.g. running in Google Colab)
import importlib

if importlib.util.find_spec('lifelike_gds') is None:
  !pip install git+https://github.com/SBRG/GDS-Public

# provide the path to the notebook folder in the github repository in case the notebook is run in Google Colab
github_path = 'SBRG/GDS-Public/main/notebooks/reactome'

In [2]:
import os
import warnings
from pathlib import PurePosixPath

In [3]:
# Import GDS modules
from lifelike_gds.arango_network.radiate_trace import RadiateTrace
from lifelike_gds.arango_network.reactome import *
from lifelike_gds.arango_network.trace_graph_utils import *



In [4]:
# Ignore all warnings from here on (presumably to keep the notebook output
# uncluttered)
warnings.filterwarnings('ignore')

## Settings
Adjust the values below to match your local environment.

In [5]:
# Directory where to look for input data.
# PurePosixPath always renders with forward slashes; this path is also
# interpolated into the GitHub raw URL used when a file is missing locally.
input_dir = PurePosixPath('input')

# Directory where to output results (created if it does not exist yet)
output_dir = PurePosixPath('output')
os.makedirs(output_dir, exist_ok=True)

# Reactome arango database connection parameters
# If you are running this notebook from Lifelike's online training BinderHub website,
# then these parameters are already set for you in the environment.
# When REACTOME_DATABASE is not set, the database name falls back to 'reactome'.
arango_dbname = os.getenv('REACTOME_DATABASE', 'reactome')


## Parameters
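Set parameters
- source_name: name for the source entities.
- source_file: name of the source file in the input directory
- nodes_select_file: name of the Excel file in the input directory that holds the node selection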


In [6]:
# Name of the source node set; also used in the name of the output graph file
source_name = 'endo-updown-genes'
# File in the input directory whose 'gene_id' column lists the source genes
source_file = 'updown.entrez'

# Excel workbook with the node selection; its 'pageranks' and
# 'reverse pageranks' sheets are read below
nodes_select_file = 'Radiate_analysis_for_endo-updown-genes.xlsx'


## Define functions to get source/target nodes
- read input file with ids (stId or dbId)
- read input file with reference ids (gene_id or chebi_id)

In [7]:
"""
read csv file to get list of reactome nodes

Parameters
----------
csv_filename:  the input file
id_name: the property name in reactome db, e.g. stId, dbId
id_column: the column name for the id property
"""


def get_nodes_by_identity_from_file(csv_filename, id_name, id_column, sep=','):
    """
    Look up nodes whose identifiers are listed in a delimited text file.

    The file is read from the input directory when it exists there; otherwise
    it is read from the notebook folder of the GitHub repository.

    Parameters
    ----------
    csv_filename: name of the input file, relative to the input directory
    id_name: the property name in reactome db, e.g. stId, dbId
    id_column: the column of the file that holds the id values
    sep: column delimiter of the file

    Returns
    -------
    the result of database.get_nodes_by_attr for the ids found in the file
    """
    local_path = input_dir / csv_filename
    # fall back to the copy on GitHub when the file does not exist locally
    source = (
        local_path
        if os.path.isfile(local_path)
        else f'https://raw.githubusercontent.com/{github_path}/{local_path}'
    )

    # every column is read as text so that ids keep their exact spelling
    table = pd.read_csv(source, sep=sep, dtype='str')
    identifiers = list(table[id_column])
    matched = database.get_nodes_by_attr(identifiers, id_name)
    print('file_rows:', len(table), ', nodes matched:', len(matched))
    return matched


def get_chemical_nodes_by_chebi(csv_filename, chebi_id_column, sep=','):
    """
    Look up entity nodes for the ChEBI ids listed in a delimited text file.

    The file is read from the input directory when it exists there; otherwise
    it is read from the notebook folder of the GitHub repository.

    Parameters
    ----------
    csv_filename: name of the input file, relative to the input directory
    chebi_id_column: the column of the file that holds the ChEBI ids
    sep: column delimiter of the file

    Returns
    -------
    the result of database.get_entity_nodes_by_chebi_ids for the ids in the file
    """
    local_path = input_dir / csv_filename
    # fall back to the copy on GitHub when the file does not exist locally
    source = (
        local_path
        if os.path.isfile(local_path)
        else f'https://raw.githubusercontent.com/{github_path}/{local_path}'
    )

    # every column is read as text so that ids keep their exact spelling
    table = pd.read_csv(source, sep=sep, dtype='str')
    chebi_ids = list(table[chebi_id_column])
    matched = database.get_entity_nodes_by_chebi_ids(chebi_ids)
    print('file_rows:', len(table), ', nodes matched:', len(matched))
    return matched


def get_protein_nodes_by_gene_id(csv_filename, gene_id_column, sep=','):
    """
    Look up entity nodes for the gene ids listed in a delimited text file.

    The file is read from the input directory when it exists there; otherwise
    it is read from the notebook folder of the GitHub repository.

    Parameters
    ----------
    csv_filename: name of the input file, relative to the input directory
    gene_id_column: the column of the file that holds the gene ids
    sep: column delimiter of the file

    Returns
    -------
    the result of database.get_entity_nodes_by_gene_ids for the ids in the file
    """
    local_path = input_dir / csv_filename
    # fall back to the copy on GitHub when the file does not exist locally
    source = (
        local_path
        if os.path.isfile(local_path)
        else f'https://raw.githubusercontent.com/{github_path}/{local_path}'
    )

    # every column is read as text so that ids keep their exact spelling
    table = pd.read_csv(source, sep=sep, dtype='str')
    gene_ids = list(table[gene_id_column])
    matched = database.get_entity_nodes_by_gene_ids(gene_ids)
    print('file_rows:', len(table), ', nodes matched:', len(matched))
    return matched


def get_nodes_by_stId(stIds):
    """Return what database.get_nodes_by_attr finds for the given 'stId' values."""
    id_property = 'stId'
    return database.get_nodes_by_attr(stIds, id_property)


"""
Read file for nodes selection.  The file was generated from radiate analysis but contains user's node selection. 
Any columns after 'nReach' or 'rev_nReach' will be scanned for value '1' as selected rows.
Users can use the column name to specific the selected nodes, such as 'selected_genes', 'selected_compounds' 

filename: the file with nodes selection based on radiate analysis
sheet_name: 'pageranks' or 'reverse pageranks'
return dict with column name as key, and selected node eids as value
"""


def get_selected_nodes(filename, sheet_name):
    """
    Read the user's node selection from one sheet of an Excel workbook.

    Every column located after the 'nReach' column (or, when the sheet has no
    'nReach' column, after 'rev_nReach') is treated as a selection column.
    A row is selected in a column when its cell value equals 1.

    The workbook is read from the input directory when it exists there;
    otherwise it is read from the notebook folder of the GitHub repository.

    Parameters
    ----------
    filename: name of the workbook, relative to the input directory
    sheet_name: the sheet to read, e.g. 'pageranks' or 'reverse pageranks'

    Returns
    -------
    dict with the selection column name as key and the list of 'stId' values
    of the selected rows as value

    Raises
    ------
    ValueError: if the sheet has neither an 'nReach' nor a 'rev_nReach' column
    """
    file_path = input_dir / filename
    if os.path.isfile(file_path):
        file_ref = file_path
    else:
        # if the file does not exist locally, pull it from github
        file_ref = f'https://raw.githubusercontent.com/{github_path}/{file_path}'
    df = pd.read_excel(file_ref, sheet_name=sheet_name)
    colnames = list(df.columns)

    # 'nReach' takes precedence over 'rev_nReach' as the marker column
    for marker in ('nReach', 'rev_nReach'):
        if marker in colnames:
            select_cols = colnames[colnames.index(marker) + 1:]
            break
    else:
        # fail with an explicit message instead of the bare
        # "'rev_nReach' is not in list" raised by list.index
        raise ValueError(
            f"sheet {sheet_name!r} of {filename!r} has neither an 'nReach' "
            "nor a 'rev_nReach' column; cannot locate the selection columns"
        )

    selected_nodes = {}
    for col in select_cols:
        chosen_rows = df[df[col] == 1]
        selected_nodes[col] = list(chosen_rows['stId'])
    print(f'selected {sheet_name} nodes:\n', selected_nodes)
    return selected_nodes


### Define function to add radiate traces and export them to a graph file

In [8]:
"""
source_name: the data set name for radiate analysis
source_nodes: list of source nodes used for radiate analysis
forward_selection: dict for col_name:eids for nodes selected based on pageranks
reverse_selection: dict for col_name:eids for nodes selected based on rev_pageranks
"""


def export_radiate_traces(
    tracegraph,
    source_name,
    source_nodes,
    forward_selection: dict,
    reverse_selection: dict,
):
    """
    Add forward and reverse traces for the selected nodes to the trace graph
    and write all of them into one graph file.

    tracegraph: the trace graph object; its working graph is first reset to a
        copy of its orig_graph
    source_name: the data set name, used as the name of the source node set
        and in the name of the output file
    source_nodes: list of source nodes
    forward_selection: dict of column name -> stId values of the nodes selected
        based on pageranks; a falsy value skips the forward traces
    reverse_selection: dict of column name -> stId values of the nodes selected
        based on rev_pageranks; a falsy value skips the reverse traces
    """
    forward_prop = 'pagerank'
    reverse_prop = 'rev_pagerank'

    # start from a fresh copy of the original graph and register the sources
    tracegraph.graph = tracegraph.orig_graph.copy()
    tracegraph.set_node_set_from_arango_nodes(source_nodes, source_name, source_name)

    # compute only the pagerank properties that are needed;
    # the last argument presumably selects the reversed direction
    if forward_selection:
        tracegraph.set_pagerank(source_name, forward_prop, False)
    if reverse_selection:
        tracegraph.set_pagerank(source_name, reverse_prop, True)

    tracegraph.add_graph_description('Reactome')

    # forward traces: from the sources to the selected nodes
    for column, st_ids in (forward_selection or {}).items():
        targets = get_nodes_by_stId(st_ids)
        set_name = 'forward ' + column
        tracegraph.set_node_set_from_arango_nodes(targets, set_name, set_name)

        # one trace from the sources to each selected node
        tracegraph.add_traces_from_sources_to_each_selected_nodes(
            targets,
            source_name,
            weighted_prop=forward_prop,
            selected_nodes_name=set_name,
        )
        # one combined trace from the sources to all selected nodes
        tracegraph.add_trace_from_sources_to_all_selected_nodes(
            set_name,
            source_name,
            weighted_prop=forward_prop,
            trace_name=f'Forward combined {column}',
        )

    # reverse traces: from the selected nodes to the source set
    for column, st_ids in (reverse_selection or {}).items():
        origins = get_nodes_by_stId(st_ids)
        set_name = 'reverse ' + column
        tracegraph.set_node_set_from_arango_nodes(origins, set_name, set_name)

        # one trace from each selected node to the source set
        tracegraph.add_traces_from_each_selected_nodes_to_targets(
            origins,
            source_name,
            weighted_prop=reverse_prop,
            selected_nodes_name=set_name,
        )
        # one combined trace from all selected nodes to the source set
        tracegraph.add_trace_from_all_selected_nodes_to_targets(
            set_name,
            source_name,
            weighted_prop=reverse_prop,
            trace_name=f"Reverse combined {column}",
        )

    # write all traces into one graph file
    tracegraph.write_to_sankey_file(f'Radiate_traces_for_{source_name}.graph')


## Connect to arango database

In [9]:
# Create the ReactomeDB handle for the database named by arango_dbname;
# the helper functions above use it as the global `database`
database = ReactomeDB(arango_dbname)

## Load graph from arango graph database into memory

In [10]:
# wrap the Reactome database in a RadiateTrace object
tracegraph = RadiateTrace(Reactome(database))
# set up the output directory where the excel and graph files will be written
tracegraph.datadir = output_dir
# initialize tracegraph by loading graph data from arango
# a networkx graph is created here.
tracegraph.init_default_graph()

INFO: load reactome graph
INFO: MultiDirectedGraph with 71225 nodes and 112575 edges


## Get source and selected nodes, add radiate traces, and export them to a graph file

In [None]:
# source nodes: matched to the gene ids in the 'gene_id' column of source_file
source_nodes = get_protein_nodes_by_gene_id(source_file, 'gene_id')
# user-selected nodes, read from the two sheets of the selection workbook
selected_forward_nodes = get_selected_nodes(nodes_select_file, 'pageranks')
selected_reverse_nodes = get_selected_nodes(nodes_select_file, 'reverse pageranks')

# add the forward and reverse traces and write them to the graph file
export_radiate_traces(tracegraph, source_name, source_nodes, selected_forward_nodes, selected_reverse_nodes)

INFO: 92 gene_ids, matched to 119 nodes


file_rows: 92 , nodes matched: 119
selected pageranks nodes:
 {'select': ['R-HSA-6793931', 'R-HSA-549533', 'R-HSA-400373']}
selected reverse pageranks nodes:
 {'select': ['R-HSA-6791223', 'R-HSA-5660404', 'R-HSA-4568733']}


INFO: Adding trace network endo-updown-genes to PER1 [cytosol] #1
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: Target 1336810 cannot be reachedfrom given sources
ERROR: