# Create all shortest path traces with reactome db

The biggest difference from the BioCyc workflow is entity mapping, which is a lot more complicated here because Reactome does not have a single consistent ID for its entities.

In [1]:
import os

# Import GDS modules
from lifelike_gds.arango_network.shortest_paths_trace import ShortestPathTrace
from lifelike_gds.arango_network.reactome import *
from lifelike_gds.arango_network.trace_graph_utils import *

# Ignore warnings
import warnings

warnings.filterwarnings('ignore')

## Settings
Make sure to change these values to match your local environment.

In [2]:
# Directory where input files are looked up (the reader functions defined
# later in this notebook join it with each file name)
input_dir = './input'

# Directory where results are written; created here if it does not exist yet
output_dir = './output'
os.makedirs(output_dir, exist_ok=True)

# Reactome arango database connection parameters
# If you are running this notebook from Lifelike's online training BinderHub website,
# then these parameters are already set for you in the environment.
# The database name is read from REACTOME_DATABASE and falls back to 'reactome'.
arango_dbname = os.getenv('REACTOME_DATABASE', 'reactome')

## Parameters
Set parameters
- source_name: name for the source entities. 
- target_name: name for the target entities. 
- source_file: source file name in input dir
- target_file: target file name in input dir


In [3]:
# Label for the source node set; it also appears in the output graph file name
source_name = 'endo-down-genes'
# File inside input_dir that lists the source entities
source_file = 'down.tsv'

# Label for the target node set; it also appears in the output graph file name
target_name = 'metabs'
# File inside input_dir that lists the target entities
target_file = 'metabolite.txt'

## Define functions to get source/target nodes
- read input file with ids (stId or dbId)
- read input file with reference ids (gene_id or chebi_id)

In [4]:
# Create the output directory if it doesn't exist
# (repeats the call made in the settings cell; exist_ok=True makes it a no-op
# when the directory is already there)
os.makedirs(output_dir, exist_ok=True)

"""
read csv file to get list of reactome nodes

Parameters
----------
csv_filename:  the input file
id_name: the property name in reactome db, e.g. stId, dbId
id_column: the column name for the id property
"""


def get_nodes_by_identity_from_file(csv_filename, id_name, id_column, sep=','):
    """
    Read a delimited file and look up the nodes for the ids in one column.

    Parameters
    ----------
    csv_filename: name of the input file, resolved relative to ``input_dir``
    id_name: the property name in reactome db, e.g. stId, dbId
    id_column: the column name for the id property
    sep: column delimiter of the input file (default ',')

    Returns
    -------
    The result of ``database.get_nodes_by_attr(ids, id_name)``.
    """
    df = pd.read_csv(os.path.join(input_dir, csv_filename), sep=sep, dtype='str')
    # Blank cells are read as NaN even with dtype='str'; drop them and trim
    # stray whitespace so only real identifiers are sent to the database.
    id_values = df[id_column].dropna().str.strip()
    ids = id_values[id_values != ''].tolist()
    nodes = database.get_nodes_by_attr(ids, id_name)
    # file_rows still counts every row read, including rows without an id.
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes


def get_chemical_nodes_by_chebi(csv_filename, chebi_id_column, sep=','):
    """
    Read ChEBI ids from a delimited file and look up the matching entity nodes.

    Parameters
    ----------
    csv_filename: name of the input file, resolved relative to ``input_dir``
    chebi_id_column: the column name holding the ChEBI ids
    sep: column delimiter of the input file (default ',')

    Returns
    -------
    The result of ``database.get_entity_nodes_by_chebi_ids(ids)``.
    """
    df = pd.read_csv(os.path.join(input_dir, csv_filename), sep=sep, dtype='str')
    # Blank cells are read as NaN even with dtype='str'; drop them and trim
    # stray whitespace so only real identifiers are sent to the database.
    id_values = df[chebi_id_column].dropna().str.strip()
    ids = id_values[id_values != ''].tolist()
    nodes = database.get_entity_nodes_by_chebi_ids(ids)
    # file_rows still counts every row read, including rows without an id.
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes


def get_protein_nodes_by_gene_id(csv_filename, gene_id_column, sep=','):
    """
    Read gene ids from a delimited file and look up the matching entity nodes.

    Parameters
    ----------
    csv_filename: name of the input file, resolved relative to ``input_dir``
    gene_id_column: the column name holding the gene ids
    sep: column delimiter of the input file (default ',')

    Returns
    -------
    The result of ``database.get_entity_nodes_by_gene_ids(ids)``.
    """
    df = pd.read_csv(os.path.join(input_dir, csv_filename), sep=sep, dtype='str')
    # Blank cells are read as NaN even with dtype='str'; drop them and trim
    # stray whitespace so only real identifiers are sent to the database.
    id_values = df[gene_id_column].dropna().str.strip()
    ids = id_values[id_values != ''].tolist()
    nodes = database.get_entity_nodes_by_gene_ids(ids)
    # file_rows still counts every row read, including rows without an id.
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes


def get_reference_nodes_by_chebi(csv_filename, chebi_id_column, sep=','):
    """
    Read ChEBI ids from a delimited file and look up the matching reference nodes.

    Parameters
    ----------
    csv_filename: name of the input file, resolved relative to ``input_dir``
    chebi_id_column: the column name holding the ChEBI ids
    sep: column delimiter of the input file (default ',')

    Returns
    -------
    The result of ``database.get_reference_nodes_by_chebi_ids(ids)``.
    """
    df = pd.read_csv(os.path.join(input_dir, csv_filename), sep=sep, dtype='str')
    # Blank cells are read as NaN even with dtype='str'; drop them and trim
    # stray whitespace so only real identifiers are sent to the database.
    id_values = df[chebi_id_column].dropna().str.strip()
    ids = id_values[id_values != ''].tolist()
    nodes = database.get_reference_nodes_by_chebi_ids(ids)
    # file_rows still counts every row read, including rows without an id.
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes


def get_reference_nodes_by_gene_id(csv_filename, gene_id_column, sep=','):
    """
    Read gene ids from a delimited file and look up the matching reference nodes.

    Parameters
    ----------
    csv_filename: name of the input file, resolved relative to ``input_dir``
    gene_id_column: the column name holding the gene ids
    sep: column delimiter of the input file (default ',')

    Returns
    -------
    The result of ``database.get_reference_nodes_by_gene_ids(ids)``.
    """
    df = pd.read_csv(os.path.join(input_dir, csv_filename), sep=sep, dtype='str')
    # Blank cells are read as NaN even with dtype='str'; drop them and trim
    # stray whitespace so only real identifiers are sent to the database.
    id_values = df[gene_id_column].dropna().str.strip()
    ids = id_values[id_values != ''].tolist()
    nodes = database.get_reference_nodes_by_gene_ids(ids)
    # file_rows still counts every row read, including rows without an id.
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes

## Define functions to export shortest paths traces to graph file 

In [5]:
"""
Simplest way to generate shortest paths. 
Return all paths from all source nodes to all target_nodes in one trace graph
"""


def write_shortest_paths(
    tracegraph, source_name, source_nodes, target_name, target_nodes
):
    """
    Trace shortest paths from every source node to every target node and
    write them to a single graph file.

    Parameters
    ----------
    tracegraph: trace object providing ``orig_graph`` and the node-set,
        shortest-path and sankey-file methods called below
    source_name: label for the source node set (also used in the file name)
    source_nodes: nodes to register as the source set
    target_name: label for the target node set (also used in the file name)
    target_nodes: nodes to register as the target set

    Returns
    -------
    None. A message is printed when no paths were added.
    """
    # Work on a fresh copy of the original graph on every call.
    tracegraph.graph = tracegraph.orig_graph.copy()

    # Register both node sets; each label is passed as the second and third
    # argument, exactly as for a single set.
    for label, members in ((source_name, source_nodes), (target_name, target_nodes)):
        tracegraph.set_node_set_from_arango_nodes(members, label, label)
    tracegraph.add_graph_description('Reactome')

    # The third argument is True only when there are more source nodes than
    # target nodes.
    found = tracegraph.add_shortest_paths(
        source_name, target_name, len(source_nodes) > len(target_nodes)
    )
    if not found:
        print(f"No paths found from {source_name} to {target_name}")
        return

    tracegraph.write_to_sankey_file(
        f"Shortest_paths_from_{source_name}_to_{target_name}.graph"
    )

## Connect to arango database

In [6]:
# Create the ReactomeDB handle for the database named by arango_dbname.
# The node-lookup functions defined above read this module-level `database`.
database = ReactomeDB(arango_dbname)

## Load graph from arango graph database into memory

In [7]:
# Build the shortest-path trace object on top of the Reactome wrapper for `database`
tracegraph = ShortestPathTrace(Reactome(database))

# set the output directory that the excel and graph files will be written to
tracegraph.datadir = output_dir

# initialize tracegraph by loading graph data from arango
# a networkx graph is created here.
tracegraph.init_default_graph()

INFO: load reactome graph
INFO: MultiDirectedGraph with 71225 nodes and 112575 edges


## Get source and target nodes
Make sure to choose the right method to get the nodes

In [8]:
# Both lookups read the 'entrez' column of the tab-separated source file.
# Reference nodes for the source gene ids (not used again in the cells shown below)
source_refs = get_reference_nodes_by_gene_id(source_file, 'entrez', sep='\t')
# Protein entity nodes for the same gene ids; these are passed to write_shortest_paths
source_nodes = get_protein_nodes_by_gene_id(source_file, 'entrez', sep='\t')

INFO: 22 gene_ids, matched to 15 nodes
INFO: 22 gene_ids, matched to 32 nodes


file_rows: 22 , nodes matched: 15
file_rows: 22 , nodes matched: 32


In [9]:
# Both lookups read the 'chebi' column of the target file (default ',' separator).
# Reference nodes for the target ChEBI ids (not used again in the cells shown below)
target_refs = get_reference_nodes_by_chebi(target_file, 'chebi')
# Chemical entity nodes for the same ChEBI ids; these are passed to write_shortest_paths
target_nodes = get_chemical_nodes_by_chebi(target_file, 'chebi')

INFO: 23 chebi_ids, matched to 7 nodes


file_rows: 23 , nodes matched: 7


INFO: 23 chebi_ids, matched to 13 nodes


file_rows: 23 , nodes matched: 13


## Create (sankey) graph file for shortest paths

In [10]:
# Trace shortest paths from the source protein nodes to the target chemical nodes.
# When paths are found, the function writes
# "Shortest_paths_from_<source_name>_to_<target_name>.graph" through tracegraph;
# otherwise it prints a "No paths found" message.
write_shortest_paths(
    tracegraph, source_name, source_nodes, target_name, target_nodes
)

ERROR: Target 854528 cannot be reachedfrom given sources
ERROR: Target 878946 cannot be reachedfrom given sources
ERROR: Target 1864290 cannot be reachedfrom given sources
ERROR: Target 196388 cannot be reachedfrom given sources
ERROR: Target 778052 cannot be reachedfrom given sources
ERROR: Target 1746180 cannot be reachedfrom given sources
ERROR: Target 1685994 cannot be reachedfrom given sources
ERROR: Target 778124 cannot be reachedfrom given sources
ERROR: Target 198286 cannot be reachedfrom given sources
ERROR: Target 869649 cannot be reachedfrom given sources
ERROR: Target 878776 cannot be reachedfrom given sources
ERROR: Target 295580 cannot be reachedfrom given sources
ERROR: Target 854528 cannot be reachedfrom given sources
ERROR: Target 878946 cannot be reachedfrom given sources
ERROR: Target 1864290 cannot be reachedfrom given sources
ERROR: Target 196388 cannot be reachedfrom given sources
ERROR: Target 778052 cannot be reachedfrom given sources
ERROR: Target 1746180 canno