## Preamble

This file is mostly a test/proof-of-concept for some of the Neo4j module's functions.

To start, we test setting up a connection to a local server. We then test the creation commands necessary to make nodes for each type of database. Lastly, we test the commands necessary to create relations between the nodes, which is the ultimate goal of this project.

Since this is more of a test or sandbox environment, we use fake data and lots of print messages to understand and manually inspect each step of the process.

## Neo4j Setup and Basic Functions

In [1]:
import time
from dotenv import dotenv_values
from neo4j import GraphDatabase

# Get authentication credentials from the local ".env" file; it must define
# the "neo4j_user" and "neo4j_pw" keys that are read below.
env_v = dotenv_values(".env")
# (user, password) pair passed as `auth` when GraphDriver creates its driver.
neo4j_auth = (env_v["neo4j_user"], env_v["neo4j_pw"])

class GraphDriver:
    """Small wrapper around a neo4j driver that runs Cypher command strings.

    Every command is run in a session that is closed again once the call
    finishes, and the elapsed wall-clock time of each call is printed.
    """

    def __init__(self, uri):
        """Creates an authorized API to neo4j server at uri.

        Args:
            uri: the address (scheme, host and port) of the neo4j server,
                e.g. "bolt://localhost:7687".
        """
        self._driver = GraphDatabase.driver(uri, auth=neo4j_auth)

    @staticmethod
    def _try_run(session, cmd: str) -> bool:
        """Runs one command in the given session and reports whether it succeeded.

        Args:
            session: an open neo4j session.
            cmd: the Cypher-formatted command to run.

        Returns:
            True if the command ran without raising, False otherwise.
        """
        try:
            # consume() waits for the command to finish, so an error is
            # reported for this command rather than surfacing later.
            session.run(cmd).consume()
        except Exception:
            # Best effort by design: a failure is reported, not raised.
            # Unlike a bare except, this lets KeyboardInterrupt through.
            return False
        return True

    def execute_command(self, cmd:str)-> bool:
        """Runs command on neo4j server.

        Args:
            cmd: the Cypher-formatted command to run.

        Returns:
            A boolean value specifying whether the command was run successfully.
        """
        # The context manager closes the session even if the command fails.
        with self._driver.session() as session:
            start = time.perf_counter()
            res = self._try_run(session, cmd)
            print(f"Elapsed time: {round(time.perf_counter() - start, 2)} seconds.")

        return res

    def execute_commands(self, cmds:list[str])-> tuple[int, list[int]]:
        """Runs commands passed in list.

        Args:
            cmds: a list of properly formatted Cypher commands.

        Returns:
            A tuple specifying how many commands failed and a list of the
            indices of the failed cmds.

        Note:
            All commands are attempted regardless of previous failure.
        """
        failed_cmd_indices = []

        # One session is shared by all commands and closed when the batch ends.
        with self._driver.session() as session:
            start = time.perf_counter()

            # attempt to execute all cmds on server.
            for i, cmd in enumerate(cmds):
                if not self._try_run(session, cmd):
                    failed_cmd_indices.append(i)

            print(f"Elapsed time: {round(time.perf_counter() - start, 2)} seconds.")

        return (len(failed_cmd_indices), failed_cmd_indices)

    def delete_data(self):
        """Deletes all nodes (and their relationships) in database.
        """
        self.execute_commands(["MATCH (n) DETACH DELETE n"])

    def close(self):
        """Closes the underlying neo4j driver and releases its connections.
        """
        self._driver.close()


## Cypher Command Creation

In [2]:
def create_node(row, category:str, col_labels: list[tuple[str, type]]) -> str:
    """Creates a Cypher CREATE command for one node with its properties.

    Args:
        row: the data extracted from a dataframe. Index 0 is skipped (it holds
            the dask index, which is unimportant here), so the value for
            col_labels[i] is read from row[i + 1].
        category: the category of the data (i.e. "predication", "entity", or "sentence").
            It is capitalized and used as the node label.
        col_labels: a list of tuples specifying the labels and data type of each value.

    Returns:
        A string containing a properly formatted Cypher command, e.g.
        'CREATE(:Entity{entity_id:1, name:"Test"})'.
    """

    def _format_prop(label: str, val, dtype: type) -> str:
        """Formats a single "label:value" property of the Cypher command.

        Args:
            label: the label of the passed value (lower-cased in the output).
            val: the integer or string data.
            dtype: the type of the passed value (i.e. int or str).

        Returns:
            A string containing the property combo in the proper format.
        """
        if dtype is str:
            # Quote str values. Backslashes and double quotes are escaped
            # (backslash first) so they cannot end the literal early.
            escaped = str(val).replace("\\", "\\\\").replace('"', '\\"')
            return f'{label.lower()}:"{escaped}"'
        return f"{label.lower()}:{val}"

    # create node properties in form "prop:val, prop:val,..."
    # join() places the separators, so no "is this the last label" check is needed.
    props = ", ".join(
        _format_prop(label, row[i + 1], dtype)
        for i, (label, dtype) in enumerate(col_labels)
    )

    return f"CREATE(:{category.capitalize()}{{{props}}})"

def create_relation(cat_one:str, cat_two:str, relation:str) -> str:
    """Builds the Cypher command that links two node categories.

    Args:
        cat_one: name of the category the relation starts from.
        cat_two: name of the category the relation points to.
        relation: the relationship type to create between the nodes.

    Returns:
        A string containing a properly formatted Cypher command.

    Note:
        The relationship is directed from the category one node to the
        category two node, for every pair of nodes whose "sentence_id"
        properties are equal.
    """
    source_label = cat_one.capitalize()
    target_label = cat_two.capitalize()

    clauses = (
        # select every node of both categories
        f"MATCH (a:{source_label}), (b:{target_label}) ",
        # keep only the pairs that share a sentence ID
        "WHERE a.sentence_id = b.sentence_id ",
        # create the relation (MERGE avoids duplicating an existing one)
        f"MERGE (a)-[:{relation}]->(b)",
    )

    return "".join(clauses)

## Data Setup 

In [3]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Address of the local neo4j server (bolt scheme, port 7687).
uri = "bolt://localhost:7687"

# Create driver and clear any existing nodes.
# NOTE: delete_data() runs "MATCH (n) DETACH DELETE n", i.e. it removes every
# node in the database the driver connects to, not just earlier test data.
driver = GraphDriver(uri)
driver.delete_data()


# Node categories, in the order their commands are generated.
categories = ["predication", "sentence", "entity"]

# For each category: (column label, data type) pairs in the order the columns
# appear in the dataframes.
col_labels = {
    "predication": [
        ("PREDICATION_ID", int),   # Auto-generated primary key for each unique predication
        ("SENTENCE_ID", int),      # Foreign key to the SENTENCE table
        ("PMID", int),             # The PubMed identifier of the citation to which the predication belongs
        ("PREDICATE", str),        # The string representation of each predicate (for example TREATS, PROCESS_OF)
        ("SUBJECT_CUI", str),      # The CUI of the subject of the predication
        ("SUBJECT_NAME", str),     # The preferred name of the subject of the predication
        ("SUBJECT_SEMTYPE", str),  # The semantic type of the subject of the predication
        ("SUBJECT_NOVELTY", int),  # The novelty of the subject of the predication
        ("OBJECT_CUI", str),       # The CUI of the object of the predication
        ("OBJECT_NAME", str),      # The preferred name of the object of the predication
        ("OBJECT_SEMTYPE", str),   # The semantic type of the object of the predication
        ("OBJECT_NOVELTY", int),   # The novelty of the object of the predication
    ],
    "sentence": [
        ("SENTENCE_ID", int),       # Auto-generated primary key for each sentence
        ("PMID", int),              # The PubMed identifier of the citation to which the sentence belongs
        ("TYPE", str),              # 'ti' for the title of the citation, 'ab' for the abstract
        ("NUMBER", int),            # The location of the sentence within the title or abstract
        ("SENT_START_INDEX", int),  # The character position within the text of the MEDLINE citation of the first character of the sentence
        ("SENT_END_INDEX", int),    # The character position within the text of the MEDLINE citation of the last character of the sentence
        ("SENTENCE", str),          # The actual string or text of the sentence
    ],
    "entity": [
        ("ENTITY_ID", int),    # Auto-generated primary key for each unique entity
        ("SENTENCE_ID", int),  # The foreign key to SENTENCE table
        ("CUI", str),          # The CUI of the entity
        ("NAME", str),         # The preferred name of the entity
        ("TYPE", str),         # The semantic type of the entity
        ("TEXT", str),         # The text in the utterance that maps to the entity
        ("START_INDEX", int),  # The first character position (in document) of the text denoting the entity
        ("END_INDEX", int),    # The last character position (in document) of the text denoting the entity
        ("SCORE", int),        # The confidence score
    ],
}

Elapsed time: 0.06 seconds.


## Spoof Data and Command Creation

In [4]:
# Spoof rows for each category. Every row starts with a None placeholder for
# the dataframe index that create_node skips. The only info we really care
# about is the sentence ID, which is what ties rows of different categories
# together; the sentence IDs are listed in each enumerate() call.
test_data = {
    "predication": [
        [None, n, sentence_id, 0, f"Test Predicate {n}", "Test", "Test", "Test", 1, "Test", "Test", "Test", 1]
        for n, sentence_id in enumerate([6, 102, 34, 18, 82], start=1)
    ],
    "sentence": [
        [None, sentence_id, 0, "Test", 0, 0, 0, f"Test Sentence {n}"]
        for n, sentence_id in enumerate([102, 18, 34, 82, 6], start=1)
    ],
    "entity": [
        [None, n, sentence_id, "Test", f"Test Entity {n}", "Test", "Test", 0, 0, 0]
        for n, sentence_id in enumerate([82, 102, 34, 6, 18], start=1)
    ],
}

# Build one CREATE command per spoof row and print each for manual inspection.
# The section headers come from `categories` rather than fixed indices, so
# they stay aligned if the number of rows in a category changes.
test_cmds = []
for cat in categories:
    print(f"\n{'-'*20}{cat.capitalize()} Cmds{'-'*20}")

    for row in test_data[cat]:
        cmd = create_node(row, cat, col_labels[cat])
        # len(test_cmds) is the index this command is about to occupy.
        print(f"{len(test_cmds)}.\t{cmd}\n")
        test_cmds.append(cmd)

# Specification of each relation to create. The key is a display title; the
# code that prints it appends " Cmd", so keys must not carry that suffix.
# "cmd" is a placeholder filled in later with the generated Cypher command.
relation_cmds = {
    "Predication -> Sentence": {
        "category_one": "predication",
        "category_two": "sentence",
        "relation": "PREDICATE_OF",
        "cmd": None
    },
    "Predication -> Entity": {
        "category_one": "predication",
        "category_two": "entity",
        "relation": "PREDICATES",
        "cmd": None
    },
    "Entity -> Sentence": {
        "category_one": "entity",
        "category_two": "sentence",
        "relation": "SUBJECT_OF",
        "cmd": None
    },
}

# Generate the Cypher command for every relation spec, store it back on the
# spec, and print it under its title for manual inspection.
for title, spec in relation_cmds.items():
    spec["cmd"] = create_relation(
        spec["category_one"],
        spec["category_two"],
        spec["relation"],
    )

    print(f"\n{'-'*20}{title} Cmd{'-'*20}")
    print(f"{spec['cmd']}\n")


--------------------Predication Cmds--------------------
0.	CREATE(:Predication{predication_id:1, sentence_id:6, pmid:0, predicate:"Test Predicate 1", subject_cui:"Test", subject_name:"Test", subject_semtype:"Test", subject_novelty:1, object_cui:"Test", object_name:"Test", object_semtype:"Test", object_novelty:1})

1.	CREATE(:Predication{predication_id:2, sentence_id:102, pmid:0, predicate:"Test Predicate 2", subject_cui:"Test", subject_name:"Test", subject_semtype:"Test", subject_novelty:1, object_cui:"Test", object_name:"Test", object_semtype:"Test", object_novelty:1})

2.	CREATE(:Predication{predication_id:3, sentence_id:34, pmid:0, predicate:"Test Predicate 3", subject_cui:"Test", subject_name:"Test", subject_semtype:"Test", subject_novelty:1, object_cui:"Test", object_name:"Test", object_semtype:"Test", object_novelty:1})

3.	CREATE(:Predication{predication_id:4, sentence_id:18, pmid:0, predicate:"Test Predicate 4", subject_cui:"Test", subject_name:"Test", subject_semtype:"Test",

## Command Execution

In [5]:
# Create all spoof nodes. The returned (number of failures, failed indices)
# tuple is displayed as the cell output; (0, []) means every command ran.
driver.execute_commands(test_cmds)

Elapsed time: 0.11 seconds.


(0, [])

In [10]:
%%html
<img src="../images/Example_CreatedNodes.png" width="1080" height="720">

In [11]:
# Create the relations between the nodes made above, one command per spec.
# Only the spec values are needed; iterating .values() also avoids assigning
# to `_`, which IPython uses for the last cell output.
for spec in relation_cmds.values():
    # execute_command reports failure through its return value, so check it
    # here instead of letting a failed relation pass unnoticed.
    if not driver.execute_command(spec["cmd"]):
        print(f"Failed to run relation command: {spec['cmd']}")

Elapsed time: 0.02 seconds.
Elapsed time: 0.01 seconds.
Elapsed time: 0.01 seconds.


In [12]:
%%html
<img src="../images/Example_CreatedRelations.png" width="1080" height="720">