# Radiate Analysis of curli phenotype-1 knockout genes

In [43]:
import os
import sys
# Project root = the part of the current working directory that precedes
# '/notebooks/'; '<root>/src' is appended to sys.path, presumably so the
# lifelike_gds imports below resolve.
# NOTE(review): if the cwd contains no '/notebooks/' segment (e.g. it is
# exactly '<root>/notebooks'), split() returns the whole cwd unchanged and
# 'src' is looked up under it -- confirm the expected launch directory.
root = os.getcwd().split('/notebooks/')[0]
sys.path.append(os.path.join(root, 'src'))

from lifelike_gds.arango_network.biocyc import *
from lifelike_gds.arango_network.radiate_trace import RadiateTrace
from lifelike_gds.arango_network.trace_graph_utils import *
import pandas as pd

import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

## Settings
Make sure to change these values to match your local setup.

In [44]:
# Folder names: the source CSV is read from input_dir, results go to output_dir.
input_dir = "input"
output_dir = "output"
# Create the output folder up front; an already existing folder is not an error.
os.makedirs(output_dir, mode=0o777, exist_ok=True)
# Name of the gds database.
db_name = "ecocyc"
# Free-text gds database version that can be used to describe the graph.
db_version = "ecocyc 25.5"

## Parameters
Set parameters
- source_name: name for the source entities. 
- source_file: source file name in input dir


In [45]:
# Source entities for the personalized pagerank analysis:
# a label for the set, and the CSV file (inside the input dir) that lists them.
source_name = "pheno1_genes"
source_file = "curli_genes_pheno_1.csv"

### Define functions to get nodes from reading input file or list of biocyc_ids

In [46]:
def get_nodes_by_biocyc_id_from_file(csv_filename, id_column='biocyc_id'):
    """Look up the database nodes whose biocyc ids are listed in a CSV file.

    Relies on the notebook-level ``input_dir`` and ``database`` variables.

    Args:
        csv_filename: Name of a CSV file located in ``input_dir``.
        id_column: Name of the CSV column that holds the biocyc ids.

    Returns:
        The result of ``database.get_nodes_by_attr`` for those ids. Its
        length is printed next to the number of rows read from the file.
    """
    df = pd.read_csv(os.path.join(input_dir, csv_filename))
    ids = df[id_column].tolist()
    # The CSV column name is configurable, but the lookup itself always
    # matches against the 'biocyc_id' node attribute.
    nodes = database.get_nodes_by_attr(ids, 'biocyc_id')
    # Printing both counts makes ids without a matching node easy to spot.
    print('file_rows:', len(df), ', nodes matched:', len(nodes))
    return nodes


### Define function to run radiate analysis and export to excel file

In [47]:
def export_radiate_analysis(tracegraph, source_name, source_nodes, rows_export=4000):
    """Run radiate analysis from the given source nodes and export it to Excel.

    Pageranks and reverse pageranks are exported into one excel file named
    ``Radiate_analysis_for_<source_name>.xlsx``. The file contains two tabs,
    one for pageranks and one for reverse pageranks, each sorted by its
    pagerank (or rev_pagerank) value.

    Args:
        tracegraph: Trace object that holds the loaded graph in
            ``orig_graph`` and performs the export.
        source_name: Name of the source node set; also used in the output
            file name.
        source_nodes: Nodes the analysis starts from.
        rows_export: Number of top ranked rows exported into the file.
    """
    # Work on a fresh copy of the originally loaded graph, presumably so that
    # state from an earlier analysis does not carry over.
    tracegraph.graph = tracegraph.orig_graph.copy()
    # The source name is passed for both the second and third argument.
    tracegraph.set_node_set_from_arango_nodes(source_nodes, source_name, source_name)
    outfile_name = f"Radiate_analysis_for_{source_name}.xlsx"
    # direction='both' requests the pagerank and reverse pagerank tabs.
    tracegraph.export_pagerank_data(source_name, outfile_name, direction='both', num_nodes=rows_export)

## Connect to arango database

In [48]:
# Database to open: the ARANGO_DATABASE environment variable takes precedence,
# with the db_name configured in this notebook as the fallback.
dbname = os.environ.get('ARANGO_DATABASE', db_name)

database = BiocycDB(dbname)

## Load graph from arango graph database into memory

In [49]:
# Build the trace object from a Biocyc instance created on the database connection.
tracegraph = RadiateTrace(Biocyc(database))
# Output directory that the excel and graph files are written to.
tracegraph.datadir = output_dir
# Initialise the tracegraph by loading graph data from arango;
# a networkx graph is created here.
tracegraph.init_default_graph()

INFO: MultiDirectedGraph with 33428 nodes and 37886 edges


## Get source nodes, run radiate analysis, and export data to excel

In [50]:
# Resolve the ids listed in source_file to database nodes, then run the
# radiate analysis from those nodes and export it to excel
# (rows_export is left at its default).
source_nodes = get_nodes_by_biocyc_id_from_file(source_file)
export_radiate_analysis(tracegraph, source_name, source_nodes)

file_rows: 35 , nodes matched: 35


INFO: set pagerank and num reach for pheno1_genes
Traceback (most recent call last):
  File "/Users/dommas/Library/Application Support/JetBrains/Toolbox/apps/PyCharm-P/ch-0/223.8214.51/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydevd_bundle/pydevd_xml.py", line 264, in frame_vars_to_xml
    type_handler.handle(k, v, hidden_ns, eval_full_val, user_type_renderers=user_type_renderers)
  File "/Users/dommas/Library/Application Support/JetBrains/Toolbox/apps/PyCharm-P/ch-0/223.8214.51/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydevd_bundle/pydevd_frame_type_handler.py", line 142, in handle
    super(DummyVarHandler, self).handle(key, value, hidden_ns, evaluate_full_value, user_type_renderers)
  File "/Users/dommas/Library/Application Support/JetBrains/Toolbox/apps/PyCharm-P/ch-0/223.8214.51/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydevd_bundle/pydevd_frame_type_handler.py", line 26, in handle
    self.give_to_next(key, value, hidden_ns, evaluate_full_value, user

KeyboardInterrupt: 