# Create all shortest-path traces from sources to targets

In [10]:
import os
import sys
# Take the part of the working directory that precedes '/notebooks/' as the
# repository root and make its 'src' folder importable.
# NOTE(review): when the working directory contains no '/notebooks/' component
# (e.g. it ends in '/notebooks' with nothing after it), split() returns the path
# unchanged and 'src' is looked up under the working directory itself - confirm
# this matches where the notebook is run from.
root = os.getcwd().split('/notebooks/')[0]
sys.path.append(os.path.join(root, 'src'))

from lifelike_gds.arango_network.biocyc import *
from lifelike_gds.arango_network.shortest_paths_trace import ShortestPathTrace
from lifelike_gds.arango_network.trace_graph_utils import *
import pandas as pd
import networkx as nx

import warnings
warnings.filterwarnings('ignore')

## Settings
Make sure to change these values to match your local setup.

In [11]:
# Folders for the input CSV files and for the generated output files
input_dir = 'input'
output_dir = 'output'
os.makedirs(output_dir, mode=0o777, exist_ok=True)

# Name of the gds database
db_name = 'ecocyc-secondaries'
# Free-text version of the gds database; it is written into the graph description
db_version = 'ecocyc 25.5 secondaries'
# Forwarded as `shortest_paths_plus_n` when the traces are generated
sppn = 3

## Parameters
Set the parameters:
- source_name: name for the source entities.
- target_name: name for the target entities.
- source_file: source file name in the input dir
- source_biocyc_ids: list of source biocyc_ids. You need either source_file or source_biocyc_ids to get the source nodes
- target_file: target file name in the input dir
- target_biocyc_ids: list of target biocyc_ids. You need either target_file or target_biocyc_ids to get the target nodes


In [12]:
# The source and target data files must contain a 'biocyc_id' or a 'name' column.
source_name = 'metals'
source_file = 'metals.csv'
# Alternative to source_file: list the source biocyc_ids directly
# source_biocyc_ids = ['ZN+2', 'FE+2', 'FE+3']

target_name = 'biomass-metabolites'
target_file = 'biomass_precursors.csv'
# Alternative to target_file: list the target biocyc_ids directly
# target_biocyc_ids = ['FRUCTOSE-6P','GAP', 'PYRUVATE', 'SUC-COA']

## Define functions to get source/target nodes
- read input file with biocyc_id
- read input file with entity name
- get nodes by list of biocyc_ids
- get nodes by list of names

In [13]:
def get_nodes_by_biocyc_id_from_file(csv_filename, id_column='biocyc_id'):
    """
    Read the id column of a CSV file in the input directory and find the
    nodes whose 'biocyc_id' matches one of the values.

    csv_filename: file name, relative to input_dir
    id_column: name of the column that holds the biocyc ids
    Prints the number of file rows and of matched nodes, and returns the nodes.
    """
    csv_path = os.path.join(input_dir, csv_filename)
    table = pd.read_csv(csv_path)
    biocyc_ids = list(table[id_column])
    matched = database.get_nodes_by_attr(biocyc_ids, 'biocyc_id')
    print('file_rows:', len(table), ', nodes matched:', len(matched))
    return matched


def get_nodes_by_name_from_file(csv_filename, nodeLabel, name_column='name'):
    """
    Read the name column of a CSV file in the input directory and find the
    nodes with a matching 'name'.

    You need to pass the entity type, such as Gene, Compound or Protein.
    This method only matches the name, not the synonyms.

    csv_filename: file name, relative to input_dir
    nodeLabel: entity label passed on to the node lookup
    name_column: name of the column that holds the entity names
    Prints the number of file rows and of matched nodes, and returns the nodes.
    """
    csv_path = os.path.join(input_dir, csv_filename)
    table = pd.read_csv(csv_path)
    entity_names = list(table[name_column])
    matched = database.get_nodes_by_attr(entity_names, 'name', nodeLabel)
    print('file_rows:', len(table), ', nodes matched:', len(matched))
    return matched

def get_nodes_by_biocyc_id(biocyc_ids: list):
    """
    Get the list of nodes for a list of biocyc_ids.

    biocyc_ids: list of biocyc ids to look up
    Prints the number of ids provided and of nodes matched, and returns the nodes.
    """
    nodes = database.get_nodes_by_attr(biocyc_ids, 'biocyc_id')
    print('ids provided:', len(biocyc_ids), ', nodes matched:', len(nodes))
    return nodes
    
def get_nodes_by_name(names: list, nodeLabel: str):
    """
    Get the list of nodes by name.

    names: list of names, e.g. gene names
    nodeLabel: entity label, e.g. Gene, Protein, Compound, Reaction
    Prints the number of names provided and of nodes matched, and returns the nodes.
    """
    nodes = database.get_nodes_by_attr(names, 'name', nodeLabel)
    print('names provided:', len(names), ', nodes matched:', len(nodes))
    return nodes
    

## Define functions to export shortest paths traces to graph file 

In [14]:
def write_shortest_paths(tracegraph, source_name, source_nodes, target_name, target_nodes, sppn=3):
    """
    Simplest way to generate shortest paths: one trace holding the paths from
    all source nodes to all target nodes, written to a sankey graph file.

    tracegraph: trace object; its working graph is reset to a copy of orig_graph
    source_name / target_name: names under which the two node sets are registered
    source_nodes / target_nodes: arango nodes forming the two node sets
    sppn: forwarded as `shortest_paths_plus_n`
    Prints a message instead of writing a file when no paths were added.
    """
    # Start from a fresh copy of the original graph
    tracegraph.graph = tracegraph.orig_graph.copy()

    # Register both node sets and describe the graph
    tracegraph.set_node_set_from_arango_nodes(source_nodes, source_name, source_name)
    tracegraph.set_node_set_from_arango_nodes(target_nodes, target_name, target_name)
    tracegraph.add_graph_description(f'database: {db_version}\n')

    paths_added = tracegraph.add_shortest_paths(
        source_name,
        target_name,
        len(source_nodes) > len(target_nodes),
        shortest_paths_plus_n=sppn,
    )
    if not paths_added:
        print(f"No paths found from {source_name} to {target_name}")
        return

    tracegraph.write_to_sankey_file(f"Shortest_paths_from_{source_name}_to_{target_name}.graph")
    

def write_shortest_paths_from_each_source(tracegraph, source_name, source_nodes, target_name, target_nodes, sppn=3):
    """
    Get traces from each source node to all the target nodes as separate
    traces (dropdown list), plus the combined trace from all source nodes to
    all target nodes, and write them to one sankey graph file.

    tracegraph: trace object; its working graph is reset to a copy of orig_graph
    source_name / target_name: names under which the two node sets are registered
    source_nodes / target_nodes: arango nodes forming the two node sets
    sppn: forwarded as `shortest_paths_plus_n` to every trace, per-source and combined
    Prints a message instead of writing a file when no trace added any path.
    """
    tracegraph.graph = tracegraph.orig_graph.copy()
    tracegraph.set_node_set_from_arango_nodes(source_nodes, source_name, source_name)
    tracegraph.set_node_set_from_arango_nodes(target_nodes, target_name, target_name)
    tracegraph.add_graph_description(f'database: {db_version}\n')

    ok = False
    for node in source_nodes:
        node_key = tracegraph.set_node_set_for_node(node)
        # `|=` (not `or`) so that every trace is attempted even after one succeeded.
        # sppn is forwarded here as well, so the per-source traces use the same
        # setting as the combined trace below.
        ok |= tracegraph.add_shortest_paths(
            node_key, target_name, sources_as_query=False, shortest_paths_plus_n=sppn
        )
    ok |= tracegraph.add_shortest_paths(
        source_name, target_name, len(source_nodes) > len(target_nodes), shortest_paths_plus_n=sppn
    )
    if ok:
        graphfile = f"Shortest_paths_from_each_{source_name}_to_{target_name}.graph"
        tracegraph.write_to_sankey_file(graphfile)
    else:
        print(f"No paths found from {source_name} to {target_name}")
    
def write_shortest_paths_to_each_traget(tracegraph, source_name, source_nodes, target_name, target_nodes, sppn=3):
    """
    Get traces from all source nodes to each target node as separate traces
    (dropdown list), plus the combined trace from all source nodes to all
    target nodes, and write them to one sankey graph file.

    tracegraph: trace object; its working graph is reset to a copy of orig_graph
    source_name / target_name: names under which the two node sets are registered
    source_nodes / target_nodes: arango nodes forming the two node sets
    sppn: forwarded as `shortest_paths_plus_n` to every trace
    Prints a message instead of writing a file when no trace added any path.
    """
    # Start from a fresh copy of the original graph
    tracegraph.graph = tracegraph.orig_graph.copy()

    # Register both node sets and describe the graph
    tracegraph.set_node_set_from_arango_nodes(source_nodes, source_name, source_name)
    tracegraph.set_node_set_from_arango_nodes(target_nodes, target_name, target_name)
    tracegraph.add_graph_description(f'database: {db_version}\n')

    any_paths = False
    # One trace per target node; `|=` keeps every trace attempted
    for target_node in target_nodes:
        target_key = tracegraph.set_node_set_for_node(target_node)
        any_paths |= tracegraph.add_shortest_paths(
            source_name, target_key, sources_as_query=True, shortest_paths_plus_n=sppn
        )

    # Combined trace over the whole source and target sets
    any_paths |= tracegraph.add_shortest_paths(
        source_name, target_name, len(source_nodes) > len(target_nodes), shortest_paths_plus_n=sppn
    )

    if not any_paths:
        print(f"No paths found from {source_name} to {target_name}")
        return

    tracegraph.write_to_sankey_file(f"Shortest_paths_from_{source_name}_to_each_{target_name}.graph")

## Connect to arango database

In [15]:
# The ARANGO_DATABASE environment variable, when set, takes precedence over db_name
dbname = os.environ.get('ARANGO_DATABASE', db_name)

database = BiocycDB(dbname)

## Load graph from arango graph database to memory

In [16]:
# Create the shortest-path trace object on top of the Biocyc database wrapper
tracegraph = ShortestPathTrace(Biocyc(database), multigraph=False)
# set up the output directory where the excel and graph files will be written to
tracegraph.datadir = output_dir
# initialize tracegraph by loading graph data from arango
# a networkx graph is created here.
tracegraph.init_default_graph()

INFO:root:MultiDirectedGraph with 33428 nodes and 37886 edges


## Get source and target nodes
Make sure to choose the right method to get the nodes

In [17]:
# Look up the source and target nodes from the 'biocyc_id' column of the input files
source_nodes = get_nodes_by_biocyc_id_from_file(source_file)
# Alternative: look the nodes up by name and entity label
# source_nodes = get_nodes_by_name(['csgD'], 'Gene')
target_nodes = get_nodes_by_biocyc_id_from_file(target_file)

file_rows: 3 , nodes matched: 3
file_rows: 12 , nodes matched: 12


## Create (sankey) graph file for shortest paths

In [18]:
# One combined trace from all source nodes to all target nodes, using the sppn setting
write_shortest_paths(tracegraph, source_name, source_nodes, target_name, target_nodes, sppn)

INFO:root:add Shortest paths from metals to biomass-metabolites: 422 paths
INFO:root:clean graph: number of graph nodes decreased from 33428 to 127
INFO:root:writing output/Shortest_paths_from_metals_to_biomass-metabolites


In [19]:
# write_shortest_paths_from_each_source(tracegraph, source_name, source_nodes, target_name, target_nodes)

In [20]:
# write_shortest_paths_to_each_traget(tracegraph, source_name, source_nodes, target_name, target_nodes)