# All Shortest Paths for curli use case

#### Library:  NetworkX
NetworkX is a Python package for the creation, manipulation, and study of the structure,
dynamics, and functions of complex networks.  

#### Unweighted shortest paths
This demo finds all shortest paths for every pair of nodes, from each node in group S (sources) to each node in group T (targets). 

Given source S and target T in the example below, the shortest paths have 3 hops (2 nodes in between), including S->1->2->T and S->1->3->T.   
<img align='left' src="img/shortest_paths.png" width='500'> 

In [31]:
import os
import sys
# Take the part of the current working directory that precedes '/notebooks/'
# as the project root. If the path contains no '/notebooks/' segment, split()
# returns it unchanged and root is the working directory itself.
root = os.getcwd().split('/notebooks/')[0]
# Add <root>/src to the module search path
# (presumably where the lifelike_gds package imported below lives; confirm).
sys.path.append(os.path.join(root, 'src'))

from lifelike_gds.arango_network.biocyc import *
from lifelike_gds.arango_network.shortest_paths_trace import ShortestPathTrace
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

## Settings
Make sure to change these values to match your local environment.

In [32]:
# directories for the input files and for the generated results
input_dir = 'input'
output_dir = 'output'
# create the output directory (with any missing parents); do nothing if it already exists
os.makedirs(output_dir, mode=0o777, exist_ok=True)

# name of the gds database
db_name = 'ecocyc'
# free-text version label of the gds database, usable to describe the graph
db_version = 'ecocyc 25.5'

## Parameters
Set parameters
- source_name: name for the source entities. 
- target_name: name for the target entities. 
- source_file: source file name in input dir
- target_file: target file name in input dir


In [33]:
# shortest paths sources are pheno type1 knock-out genes
# (source_file is looked up in input_dir)
source_name = 'pheno1_genes'
source_file = 'curli_genes_pheno_1.csv'

# shortest paths targets are curli genes
# (target_file is looked up in input_dir)
target_name = 'curli_genes'
target_file = 'csg_genes.csv'

## Define functions to get nodes from an input file or from a list of biocyc_ids

In [34]:
def get_nodes_by_biocyc_id_from_file(csv_filename, id_column='biocyc_id'):
    """
    Read ids from a CSV file in the input directory and return the matched nodes.

    Uses the notebook-level variables ``input_dir`` (where the file is looked up)
    and ``database`` (which performs the node lookup). Prints the number of rows
    in the file and the number of nodes matched, so missing ids are easy to spot.

    :param csv_filename: name of the CSV file, relative to ``input_dir``
    :param id_column: name of the CSV column holding the ids (default 'biocyc_id')
    :return: the nodes returned by ``database.get_nodes_by_attr`` for those ids
    """
    df = pd.read_csv(os.path.join(input_dir, csv_filename))
    ids = df[id_column].tolist()
    # ids are always matched against the 'biocyc_id' node attribute,
    # whatever the CSV column is called
    nodes = database.get_nodes_by_attr(ids, 'biocyc_id')
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes


## Define functions to export shortest paths traces to graph file 

In [35]:
def write_shortest_paths(tracegraph, source_name, source_nodes, target_name, target_nodes):
    """
    Simplest way to generate shortest paths.

    Adds all shortest paths from all source nodes to all target nodes to one
    trace graph and writes that graph to a sankey file named
    ``Shortest_paths_from_{source_name}_to_{target_name}.graph``.
    Nothing is returned; the result is the written file.

    Uses the notebook-level variable ``db_version`` for the graph description.

    :param tracegraph: trace graph object; its ``graph`` attribute is replaced
        by a copy of ``orig_graph`` before the paths are added
    :param source_name: name of the source node set (also used as its description)
    :param source_nodes: arango nodes that make up the source node set
    :param target_name: name of the target node set (also used as its description)
    :param target_nodes: arango nodes that make up the target node set
    """
    # reset the working graph to a copy of the originally loaded graph
    tracegraph.graph = tracegraph.orig_graph.copy()
    tracegraph.set_node_set_from_arango_nodes(source_nodes, source_name, source_name)
    tracegraph.set_node_set_from_arango_nodes(target_nodes, target_name, target_name)
    tracegraph.add_graph_description(f'database: {db_version}\n')
    tracegraph.add_shortest_paths(source_name, target_name)
    graphfile = f"Shortest_paths_from_{source_name}_to_{target_name}.graph"
    tracegraph.write_to_sankey_file(graphfile)


## Connect to arango database

In [36]:
# database name: the ARANGO_DATABASE environment variable takes precedence,
# db_name from the settings is the fallback when it is not set
dbname = os.getenv('ARANGO_DATABASE', db_name)

# BiocycDB object for that database; used below to look up nodes and to load the graph
database = BiocycDB(dbname)

## Load graph from arango graph database to memory

In [37]:
# wrap the database in a Biocyc object and build the shortest path tracer from it
tracegraph = ShortestPathTrace(Biocyc(database))
# set up the output directory where the excel and graph files will be written
tracegraph.datadir = output_dir
# initiate tracegraph by loading graph data from arango
# a networkx graph is created here.  
tracegraph.init_default_graph()

INFO:root:MultiDirectedGraph with 33428 nodes and 37886 edges


## Get source and target nodes

In [38]:
# look up the source and target nodes from the ids listed in the two input files;
# each call prints the file row count and the number of nodes matched
source_nodes = get_nodes_by_biocyc_id_from_file(source_file)
target_nodes = get_nodes_by_biocyc_id_from_file(target_file)

file_rows: 35 , nodes matched: 35
file_rows: 7 , nodes matched: 7


## Create (sankey) graph file for shortest paths

In [39]:
# add the shortest paths from the source nodes to the target nodes and write them to one sankey graph file
write_shortest_paths(tracegraph, source_name, source_nodes, target_name, target_nodes)

INFO:root:add Shortest paths from pheno1_genes to curli_genes: 230 paths
INFO:root:clean graph: number of graph nodes decreased from 33428 to 210
INFO:root:writing output/Shortest_paths_from_pheno1_genes_to_curli_genes
