# Radiate Traces for curli phenotype-1 genes

In [1]:
import os
import sys
# Take the part of the working directory that precedes '/notebooks/' as the project
# root and add its 'src' folder to the import path (presumably where lifelike_gds
# lives -- confirm). If the working directory contains no '/notebooks/' segment,
# split() returns the whole path and root is simply the working directory.
root = os.getcwd().split('/notebooks/')[0]
sys.path.append(os.path.join(root, 'src'))

from lifelike_gds.arango_network.biocyc import *
from lifelike_gds.arango_network.radiate_trace import RadiateTrace
import pandas as pd

import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

## Settings
Change the values below to match your local setup.

In [2]:
# Folders, relative to the working directory, for input files and generated results.
input_dir, output_dir = 'input', 'output'
# Create the output folder when it is missing; an existing folder is not an error.
os.makedirs(output_dir, mode=0o777, exist_ok=True)
# Name of the gds database.
db_name = 'ecocyc-25'
# Version of the gds database: free text that can be used to describe the graph.
db_version = 'ecocyc 25.5'

## Parameters
Set the parameters below:
- source_name: name for the set of source entities.
- source_file: name of the source file in the input directory.
- nodes_select_file: radiate analysis output file to which the user has added selection columns (e.g. 'xx_select') that flag the selected nodes with the value 1.


In [3]:
# Sources for the personalized pagerank analysis.
# source_name labels the source node set and is embedded in the output graph file name.
source_name = 'pheno1_genes'
# CSV file inside input_dir that lists the source entities.
source_file = 'curli_genes_pheno_1.csv'

# Excel file inside input_dir that holds the user's node selection.
nodes_select_file = 'Radiate_analysis_for_pheno1_genes_select.xlsx'

### Define functions to get nodes from reading input file or list of biocyc_ids

In [4]:
def get_nodes_by_biocyc_id_from_file(csv_filename, id_column='biocyc_id'):
    """
    Read biocyc ids from a csv file in input_dir and return the matching nodes.

    csv_filename: name of the csv file inside input_dir
    id_column: name of the column that holds the biocyc ids (default 'biocyc_id')
    return: the result of database.get_nodes_by_attr for the ids, matched on 'biocyc_id'

    Prints the number of rows in the file and the number of nodes matched.
    """
    df = pd.read_csv(os.path.join(input_dir, csv_filename))
    # Blank cells are read by pandas as NaN; they cannot match any id, so they
    # are left out of the query. tolist() also yields plain Python values.
    ids = df[id_column].dropna().tolist()
    nodes = database.get_nodes_by_attr(ids, 'biocyc_id')
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes


def get_nodes_by_name_from_file(csv_filename, nodeLabel, name_column='name'):
    """
    Read entity names from a csv file in input_dir and return the matching nodes.

    Only the 'name' attribute is matched, not the synonyms.

    csv_filename: name of the csv file inside input_dir
    nodeLabel: the entity type to match, such as Gene, Compound, Protein
    name_column: name of the column that holds the names (default 'name')
    return: the result of database.get_nodes_by_attr for the names, matched on 'name'

    Prints the number of rows in the file and the number of nodes matched.
    """
    df = pd.read_csv(os.path.join(input_dir, csv_filename))
    # Blank cells are read by pandas as NaN; they cannot match any name, so they
    # are left out of the query. tolist() also yields plain Python values.
    names = df[name_column].dropna().tolist()
    nodes = database.get_nodes_by_attr(names, 'name', nodeLabel)
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes

def get_nodes_by_biocyc_id(biocyc_ids: list):
    """
    Get the list of nodes for a list of biocyc_ids.

    biocyc_ids: list of biocyc ids to look up
    return: the result of database.get_nodes_by_attr for the ids, matched on 'biocyc_id'

    Prints the number of ids provided and the number of nodes matched.
    """
    nodes = database.get_nodes_by_attr(biocyc_ids, 'biocyc_id')
    print('ids provided:', len(biocyc_ids), ', nodes matched:', len(nodes))
    return nodes
    

def get_nodes_by_eid(eids: list):
    """
    Get the list of nodes for a list of 'eid' values.

    eid is the unique identity in the ecocyc graph database; most of the time it
    matches the biocyc_id. For reversible reactions, two nodes (forward and
    reverse reactions) have the same biocyc_id but different eids.

    eids: list of eids to look up
    return: the result of database.get_nodes_by_attr for the eids, matched on 'eid'
    """
    return database.get_nodes_by_attr(eids, 'eid')


def get_selected_nodes(filename, sheet_name):
    """
    Read the user's node selection from a radiate analysis excel file in input_dir.

    The file was generated by the radiate analysis and then edited by the user.
    Every column after 'nReach' (or, when that column is absent, after
    'rev_nReach') is scanned, and rows whose value equals 1 are taken as selected.
    Users can use the column name to describe the selection, such as
    'selected_genes' or 'selected_compounds'.

    filename: the file with the node selection based on radiate analysis
    sheet_name: 'pageranks' or 'reverse pageranks'
    return: dict with column name as key and the list of selected node eids as value
    raise: ValueError when the sheet has neither an 'nReach' nor a 'rev_nReach' column
    """
    df = pd.read_excel(os.path.join(input_dir, filename), sheet_name=sheet_name)
    colnames = list(df.columns)
    # 'nReach' takes precedence; 'rev_nReach' is the fallback marker column.
    for marker in ('nReach', 'rev_nReach'):
        if marker in colnames:
            select_cols = colnames[colnames.index(marker) + 1:]
            break
    else:
        raise ValueError(
            f"Sheet '{sheet_name}' of '{filename}' has neither an 'nReach' "
            "nor a 'rev_nReach' column"
        )
    selected_nodes = {
        col: df[df[col] == 1]['eid'].tolist()
        for col in select_cols
    }
    print(f'selected {sheet_name} nodes:\n', selected_nodes)
    return selected_nodes
    

### Define function to build radiate traces and export them to a graph file

In [5]:
def export_radiate_traces(tracegraph, source_name, source_nodes,
                          forward_selection: dict, reverse_selection: dict):
    """
    Add forward and reverse traces for the selected nodes and write them to one graph file.

    tracegraph: the trace graph object; its working graph is first reset to a copy of orig_graph
    source_name: the data set name for radiate analysis, also used to name the source node set
    source_nodes: list of source nodes used for radiate analysis
    forward_selection: dict for col_name:eids for nodes selected based on pageranks
    reverse_selection: dict for col_name:eids for nodes selected based on rev_pageranks

    A selection that is None or empty is skipped, and so is any column without
    selected eids. The result is written to 'Radiate_traces_for_<source_name>.graph'.
    """
    tracegraph.graph = tracegraph.orig_graph.copy()
    tracegraph.set_node_set_from_arango_nodes(source_nodes, source_name, source_name)

    # Property names that hold the pagerank values used to weight the traces.
    pagerank_prop = 'pagerank'
    rev_pagerank_prop = 'rev_pagerank'
    # Pageranks are only computed for the directions that have a selection.
    # NOTE(review): the third argument looks like a "reverse" flag -- confirm.
    if forward_selection:
        tracegraph.set_pagerank(source_name, pagerank_prop, False)
    if reverse_selection:
        tracegraph.set_pagerank(source_name, rev_pagerank_prop, True)

    # add graph description
    tracegraph.add_graph_description(f'Database: {db_version}\n')

    # add forward traces
    for set_name, eids in (forward_selection or {}).items():
        if not eids:
            # nothing was flagged in this column, so there is no trace to add
            continue
        selected_nodes = get_nodes_by_eid(eids)
        tracegraph.set_node_set_from_arango_nodes(selected_nodes, set_name, set_name)
        # add traces from sources to each selected node
        tracegraph.add_traces_from_sources_to_each_selected_nodes(
            selected_nodes, source_name,
            weighted_prop=pagerank_prop,
            selected_nodes_name=set_name)
        # add one trace from sources to all selected nodes
        tracegraph.add_trace_from_sources_to_all_selected_nodes(
            set_name, source_name,
            weighted_prop=pagerank_prop,
            trace_name=f'forward combined {set_name}')

    # add reverse traces
    for set_name, eids in (reverse_selection or {}).items():
        if not eids:
            # nothing was flagged in this column, so there is no trace to add
            continue
        selected_nodes = get_nodes_by_eid(eids)
        tracegraph.set_node_set_from_arango_nodes(selected_nodes, set_name, set_name)
        # add traces from each selected node to the source set
        tracegraph.add_traces_from_each_selected_nodes_to_targets(
            selected_nodes, source_name,
            weighted_prop=rev_pagerank_prop,
            selected_nodes_name=set_name)
        # add one trace from all reverse-selected nodes to the source set
        tracegraph.add_trace_from_all_selected_nodes_to_targets(
            set_name, source_name,
            weighted_prop=rev_pagerank_prop,
            trace_name=f"reverse combined {set_name}")

    # write all traces into one graph file
    graph_file = f'Radiate_traces_for_{source_name}.graph'
    tracegraph.write_to_sankey_file(graph_file)
    
    

## Connect to arango database

In [6]:
# The ARANGO_DATABASE environment variable, when set, takes precedence over db_name.
dbname = os.environ.get('ARANGO_DATABASE', db_name)
database = BiocycDB(dbname)

## Load graph from arango graph database into memory

In [7]:
# Wrap the database connection in a Biocyc object and build the tracer on top of it.
biocyc_network = Biocyc(database)
tracegraph = RadiateTrace(biocyc_network)
# Output directory that the generated files are written to.
tracegraph.datadir = output_dir
# Initialise the tracegraph by loading the graph data from arango
# (a networkx graph is created here).
tracegraph.init_default_graph()

INFO: MultiDirectedGraph with 33428 nodes and 37886 edges


## Get source nodes and selected nodes, build radiate traces, and export them to a graph file

In [8]:
# Source nodes come from the csv file; the selections come from the two
# sheets of the selection workbook.
source_nodes = get_nodes_by_biocyc_id_from_file(source_file)
selected_forward_nodes = get_selected_nodes(nodes_select_file, 'pageranks')
selected_reverse_nodes = get_selected_nodes(nodes_select_file, 'reverse pageranks')

# Build the traces for both directions and write them to the graph file.
export_radiate_traces(
    tracegraph,
    source_name,
    source_nodes,
    forward_selection=selected_forward_nodes,
    reverse_selection=selected_reverse_nodes,
)

file_rows: 35 , nodes matched: 35
selected pageranks nodes:
 {'select': ['G6545', 'G6543', 'G6544', 'G6546']}
selected reverse pageranks nodes:
 {'select': ['PD00353', 'CPLX-123', 'G7072']}


INFO: Adding trace network pheno1_genes to csgG #1
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
ERROR: Target 10480 cannot be reachedfrom given sources
INFO: Adding trace network pheno1_genes to csgF #2
ER