# Reactome database for GDS

## Setup

### Install neo4j driver

In [1]:
!pip install neo4j pandas

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.9 -m pip install --upgrade pip[0m


### Import needed libraries

In [2]:
import os
import re
import pandas as pd
from dotenv import load_dotenv
from neo4j import GraphDatabase, Result
from IPython.display import display, Markdown, Code, DisplayObject

### Connect

In [3]:
load_dotenv()  # These params are set in .env file

URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD"))
DATABASE = os.getenv("NEO4J_DATABASE")

# Fail early with a readable message instead of an obscure driver error.
if not URI:
    raise RuntimeError("NEO4J_URI is not set; define it in the environment or in the .env file")

# Do not wrap the driver in a `with` block here: leaving the block closes the
# driver, while the helper functions defined below keep using it for every query.
driver = GraphDatabase.driver(URI, auth=AUTH)
driver.verify_connectivity()

### Define utils

In [4]:
def _split_statements(query: str):
    return filter(
        lambda statement: statement,
        map(
            # deal with comments
            lambda statement: re.sub(r'\/\/.*[$\n]', '', statement).strip(),
            query.split(';')
        )
    )

counter_attrs = [
    'nodes_created',
    'nodes_deleted',
    'relationships_created',
    'relationships_deleted',
    'properties_set',
    'labels_added',
    'labels_removed',
    'indexes_added',
    'indexes_removed',
    'constraints_added',
    'constraints_removed'
]

def _get_counters(counters):
    for key in counter_attrs:
        value = getattr(counters, key)
        if isinstance(value, int):
            yield key, value

def _result_transformer(result: Result):
    """Turn a query result into a dataframe that also carries update counters.

    When the query returned no records, the frame holds a single row with
    every counter. Otherwise the records are returned with one extra column
    per counter whose value is non-zero.
    """
    frame = result.to_df()
    counters = dict(_get_counters(result.consume().counters))
    if frame.empty:
        # Nothing came back: report all counters, zero or not.
        return pd.DataFrame.from_dict({name: [value] for name, value in counters.items()})
    # Records came back: append only the counters that actually changed.
    changed = {name: value for name, value in counters.items() if value}
    return frame.assign(**changed)

def _execute_with_feedback(execute, statements):
    """Lazily run ``execute`` on each statement while showing the one in progress.

    Two display areas are created on first iteration: an 'Executing:' caption
    and a code view that is updated with the current statement. Both are
    blanked once the last statement has been executed.

    Yields:
        Whatever ``execute`` returns for each statement, in order.
    """
    caption_handle = display(Markdown('Executing:'), display_id=True)
    statement_handle = display(display_id=True)
    for cypher in statements:
        statement_handle.update(Code(cypher, language='cypher'))
        yield execute(cypher)
    # All statements done: clear the caption, then the code view.
    for handle in (caption_handle, statement_handle):
        handle.update(Markdown(''))

def _execute_statement(statement: str):
    """Run one statement on the module-level driver against ``DATABASE``.

    The result is passed through ``_result_transformer`` and returned.
    """
    return driver.execute_query(
        statement,
        database_=DATABASE,
        result_transformer_=_result_transformer,
    )

def execute_query(query: str):
    """
    Helper function to execute multiple statements at once
    and return results as dataframe

    The query is split into statements, each statement is executed in order,
    and the per-statement frames are concatenated with the statement text as
    the outer index level.

    Args:
        query: One or more Cypher statements separated by ``;``.

    Returns:
        The concatenated dataframe, or an empty dataframe when ``query``
        contains no statement (blank or comments only).
    """
    statements = list(_split_statements(query))
    if not statements:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(
        _execute_with_feedback(_execute_statement, statements),
        keys=statements,
    )

## Input data profiling

In [5]:
# Baseline graph size: total number of nodes, then total number of relationships.
execute_query("""
    MATCH (n) RETURN count(n) AS count;
    MATCH ()-[r]->() RETURN count(*) AS count;
""")





Unnamed: 0,Unnamed: 1,count
MATCH (n) RETURN count(n) AS count,0,2442415
MATCH ()-[r]->() RETURN count(*) AS count,0,10210779


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>MATCH (n) RETURN count(n)</th>
            <td><small>-</small></td>
            <td>2442415</td>
        </tr>
        <tr>
            <th>MATCH ()-[r]->() RETURN count(*)</th>
            <td><small>-</small></td>
            <td>10210779</td>
        </tr>
    </tbody>
</table>

In [6]:
# Number of nodes carrying the PhysicalEntity label.
execute_query("""
    MATCH (n:PhysicalEntity) RETURN COUNT(n)
""")





Unnamed: 0,Unnamed: 1,COUNT(n)
MATCH (n:PhysicalEntity) RETURN COUNT(n),0,405149


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>MATCH (n:PhysicalEntity) RETURN COUNT(n)</th>
            <td><small>-</small></td>
            <td>405149</td>
        </tr>
    </tbody>
</table>

In [7]:
# Node count per distinct label combination (grouped by the full labels(n) list).
execute_query("""
    MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes;
""")





Unnamed: 0,Unnamed: 1,NodeType,NumberOfNodes
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",0,[DBInfo],1
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",1,"[Event, Pathway, TopLevelPathway]",406
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",2,"[Event, Pathway]",21634
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",3,"[Event, ReactionLikeEvent, Reaction]",80813
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",4,"[PhysicalEntity, Complex]",102348
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",...,...,...
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",66,"[AbstractModifiedResidue, GeneticallyModifiedR...",1880
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",67,"[AbstractModifiedResidue, ReplacedResidue, Gen...",1093
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",68,"[ExternalOntology, CellType]",14
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",69,"[ControlledVocabulary, DrugActionType, Reactio...",3


Expected output:
<table>
    <thead>
        <tr>
            <th>NodeType</th>
            <th colspan=2>NumberOfNodes</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>DBInfo</th>
            <td><small>-</small></td>
            <td>1</td>
        </tr>
        <tr>
            <th>DatabaseObject,Event,Pathway,TopLevelPathway</th>
            <td><small>-</small></td>
            <td>406</td>
        </tr>
        <tr>
            <th>DatabaseObject,Event,Pathway</th>
            <td><small>-</small></td>
            <td>21634</td>
        </tr>
        <tr>
            <th>DatabaseObject,Event,ReactionLikeEvent,Reaction</th>
            <td><small>-</small></td>
            <td>80813</td>
        </tr>
        <tr>
            <th>DatabaseObject,PhysicalEntity,Complex</th>
            <td><small>-</small></td>
            <td>102348</td>
        </tr>
        <tr>
            <th>DatabaseObject,PhysicalEntity,GenomeEncodedEntity,EntityWithAccessionedSequence</th>
            <td><small>-</small></td>
            <td>245399</td>
        </tr>
        <tr>
            <th>DatabaseObject,ReferenceEntity,ReferenceGeneProduct,ReferenceSequence</th>
            <td><small>-</small></td>
            <td>102065</td>
        </tr>
        <tr>
            <th>DatabaseObject,ReferenceEntity,ReferenceSequence,ReferenceDNASequence</th>
            <td><small>-</small></td>
            <td>509585</td>
        </tr>
        <tr>
            <th>DatabaseObject,Taxon,Species</th>
            <td><small>-</small></td>
            <td>84</td>
        </tr>
        <tr>
            <th>DatabaseObject,DatabaseIdentifier</th>
            <td><small>-</small></td>
            <td>711831</td>
        </tr>
        <tr>
            <th>DatabaseObject,ReferenceDatabase</th>
            <td><small>-</small></td>
            <td>154</td>
        </tr>
        <tr>
            <th>DatabaseObject,InstanceEdit</th>
            <td><small>-</small></td>
            <td>117465</td>
        </tr>
        <tr>
            <th>DatabaseObject,Person</th>
            <td><small>-</small></td>
            <td>14588</td>
        </tr>
        <tr>
            <th>DatabaseObject,Affiliation</th>
            <td><small>-</small></td>
            <td>327</td>
        </tr>
        <tr>
            <th>DatabaseObject,Taxon</th>
            <td><small>-</small></td>
            <td>306</td>
        </tr>
        <tr>
            <th>DatabaseObject,ReferenceEntity,ReferenceSequence,ReferenceRNASequence</th>
            <td><small>-</small></td>
            <td>119956</td>
        </tr>
        <tr>
            <th>DatabaseObject,Compartment,GO_CellularComponent,GO_Term</th>
            <td><small>-</small></td>
            <td>149</td>
        </tr>
        <tr>
            <th>DatabaseObject,GO_CellularComponent,GO_Term</th>
            <td><small>-</small></td>
            <td>31</td>
        </tr>
        <tr>
            <th>DatabaseObject,ModifiedResidue,TranslationalModification,AbstractModifiedResidue</th>
            <td><small>-</small></td>
            <td>35419</td>
        </tr>
        <tr>
            <th>DatabaseObject,PsiMod,ExternalOntology</th>
            <td><small>-</small></td>
            <td>164</td>
        </tr>
        <tr>
            <th>DatabaseObject,LiteratureReference,Publication</th>
            <td><small>-</small></td>
            <td>36269</td>
        </tr>
        <tr>
            <th>DatabaseObject,Summation</th>
            <td><small>-</small></td>
            <td>19025</td>
        </tr>
        <tr>
            <th>DatabaseObject,PhysicalEntity,SimpleEntity</th>
            <td><small>-</small></td>
            <td>3648</td>
        </tr>
        <tr>
            <th>DatabaseObject,ReferenceEntity,ReferenceMolecule</th>
            <td><small>-</small></td>
            <td>2071</td>
        </tr>
        <tr>
            <th>DatabaseObject,PhysicalEntity,EntitySet,DefinedSet</th>
            <td><small>-</small></td>
            <td>3633</td>
        </tr>
        <tr>
            <th>DatabaseObject,Event,ReactionLikeEvent,Depolymerisation</th>
            <td><small>-</small></td>
            <td>3</td>
        </tr>
        <tr>
            <th>DatabaseObject,PhysicalEntity,Polymer</th>
            <td><small>-</small></td>
            <td>1503</td>
        </tr>
        <tr>
            <th>DatabaseObject,TranslationalModification,AbstractModifiedResidue,GroupModifiedResidue</th>
            <td><small>-</small></td>
            <td>12308</td>
        </tr>
        <tr>
            <th>DatabaseObject,AbstractModifiedResidue,ReplacedResidue,GeneticallyModifiedResidue</th>
            <td><small>-</small></td>
            <td>192</td>
        </tr>
        <tr>
            <th>DatabaseObject,PhysicalEntity,GenomeEncodedEntity</th>
            <td><small>-</small></td>
            <td>5844</td>
        </tr>
        <tr>
            <th>DatabaseObject,Event,ReactionLikeEvent,Polymerisation</th>
            <td><small>-</small></td>
            <td>236</td>
        </tr>
        <tr>
            <th>DatabaseObject,Event,ReactionLikeEvent,BlackBoxEvent</th>
            <td><small>-</small></td>
            <td>8734</td>
        </tr>
        <tr>
            <th>DatabaseObject,ReferenceEntity,ReferenceIsoform,ReferenceGeneProduct,ReferenceSequence</th>
            <td><small>-</small></td>
            <td>2228</td>
        </tr>
        <tr>
            <th>DatabaseObject,CatalystActivity</th>
            <td><small>-</small></td>
            <td>37914</td>
        </tr>
        <tr>
            <th>DatabaseObject,GO_Term,GO_MolecularFunction</th>
            <td><small>-</small></td>
            <td>1516</td>
        </tr>
        <tr>
            <th>DatabaseObject,PhysicalEntity,EntitySet,CandidateSet</th>
            <td><small>-</small></td>
            <td>8624</td>
        </tr>
        <tr>
            <th>DatabaseObject,PositiveRegulation,Regulation</th>
            <td><small>-</small></td>
            <td>2195</td>
        </tr>
        <tr>
            <th>DatabaseObject,RegulationReference,ControlReference</th>
            <td><small>-</small></td>
            <td>1414</td>
        </tr>
        <tr>
            <th>DatabaseObject,EvidenceType</th>
            <td><small>-</small></td>
            <td>1</td>
        </tr>
        <tr>
            <th>DatabaseObject,GO_Term,GO_BiologicalProcess</th>
            <td><small>-</small></td>
            <td>944</td>
        </tr>
        <tr>
            <th>DatabaseObject,PositiveRegulation,Regulation,PositiveGeneExpressionRegulation</th>
            <td><small>-</small></td>
            <td>916</td>
        </tr>
        <tr>
            <th>DatabaseObject,Figure</th>
            <td><small>-</small></td>
            <td>864</td>
        </tr>
        <tr>
            <th>DatabaseObject,TranslationalModification,AbstractModifiedResidue,InterChainCrosslinkedResidue,CrosslinkedResidue</th>
            <td><small>-</small></td>
            <td>669</td>
        </tr>
        <tr>
            <th>DatabaseObject,ReferenceEntity,ReferenceGroup</th>
            <td><small>-</small></td>
            <td>111</td>
        </tr>
        <tr>
            <th>DatabaseObject,NegativePrecedingEvent</th>
            <td><small>-</small></td>
            <td>307</td>
        </tr>
        <tr>
            <th>DatabaseObject,NegativePrecedingEventReason,ControlledVocabulary</th>
            <td><small>-</small></td>
            <td>0,6</td>
        </tr>
        <tr>
            <th>DatabaseObject,ControlReference,CatalystActivityReference</th>
            <td><small>-</small></td>
            <td>906</td>
        </tr>
        <tr>
            <th>DatabaseObject,Regulation,NegativeGeneExpressionRegulation,NegativeRegulation</th>
            <td><small>-</small></td>
            <td>238</td>
        </tr>
        <tr>
            <th>DatabaseObject,PhysicalEntity,OtherEntity</th>
            <td><small>-</small></td>
            <td>34</td>
        </tr>
        <tr>
            <th>DatabaseObject,Regulation,NegativeRegulation</th>
            <td><small>-</small></td>
            <td>2448</td>
        </tr>
        <tr>
            <th>DatabaseObject,PhysicalEntity,ChemicalDrug,Drug</th>
            <td><small>-</small></td>
            <td>1027</td>
        </tr>
        <tr>
            <th>DatabaseObject,ReferenceEntity,ReferenceTherapeutic</th>
            <td><small>-</small></td>
            <td>1036</td>
        </tr>
        <tr>
            <th>DatabaseObject,ExternalOntology,Disease</th>
            <td><small>-</small></td>
            <td>744</td>
        </tr>
        <tr>
            <th>DatabaseObject,Publication,URL</th>
            <td><small>-</small></td>
            <td>43</td>
        </tr>
        <tr>
            <th>DatabaseObject,Publication,Book</th>
            <td><small>-</small></td>
            <td>126</td>
        </tr>
        <tr>
            <th>DatabaseObject,PositiveRegulation,Regulation,Requirement</th>
            <td><small>-</small></td>
            <td>706</td>
        </tr>
        <tr>
            <th>DatabaseObject,PhysicalEntity,Drug,ProteinDrug</th>
            <td><small>-</small></td>
            <td>86</td>
        </tr>
        <tr>
            <th>DatabaseObject,TranslationalModification,AbstractModifiedResidue,CrosslinkedResidue,IntraChainCrosslinkedResidue</th>
            <td><small>-</small></td>
            <td>401</td>
        </tr>
        <tr>
            <th>DatabaseObject,AbstractModifiedResidue,GeneticallyModifiedResidue,FragmentDeletionModification,FragmentModification</th>
            <td><small>-</small></td>
            <td>81</td>
        </tr>
        <tr>
            <th>DatabaseObject,AbstractModifiedResidue,ModifiedNucleotide,TranscriptionalModification</th>
            <td><small>-</small></td>
            <td>15</td>
        </tr>
        <tr>
            <th>DatabaseObject,AbstractModifiedResidue,GeneticallyModifiedResidue,FragmentModification,FragmentInsertionModification</th>
            <td><small>-</small></td>
            <td>163</td>
        </tr>
        <tr>
            <th>DatabaseObject,Event,ReactionLikeEvent,FailedReaction</th>
            <td><small>-</small></td>
            <td>445</td>
        </tr>
        <tr>
            <th>DatabaseObject,EntityFunctionalStatus</th>
            <td><small>-</small></td>
            <td>677</td>
        </tr>
        <tr>
            <th>DatabaseObject,FunctionalStatus</th>
            <td><small>-</small></td>
            <td>27</td>
        </tr>
        <tr>
            <th>DatabaseObject,FunctionalStatusType</th>
            <td><small>-</small></td>
            <td>5</td>
        </tr>
        <tr>
            <th>DatabaseObject,ExternalOntology,SequenceOntology</th>
            <td><small>-</small></td>
            <td>15</td>
        </tr>
        <tr>
            <th>DatabaseObject,AbstractModifiedResidue,GeneticallyModifiedResidue,FragmentModification,FragmentReplacedModification</th>
            <td><small>-</small></td>
            <td>188</td>
        </tr>
        <tr>
            <th>DatabaseObject,AbstractModifiedResidue,ReplacedResidue,GeneticallyModifiedResidue,NonsenseMutation</th>
            <td><small>-</small></td>
            <td>1093</td>
        </tr>
        <tr>
            <th>DatabaseObject,ExternalOntology,CellType</th>
            <td><small>-</small></td>
            <td>14</td>
        </tr>
        <tr>
            <th>DatabaseObject,ControlledVocabulary,DrugActionType,ReactionType</th>
            <td><small>-</small></td>
            <td>3</td>
        </tr>
        <tr>
            <th>DatabaseObject,UndirectedInteraction,Interaction</th>
            <td><small>-</small></td>
            <td>49973</td>
        </tr>
    </tbody>
</table>

## Reproduction steps

### 1. Reverse inputs so relationships represent flow of reactions

In [8]:
# Flip the direction of every :input relationship and report how many were inverted.
# Not idempotent: running this cell a second time flips them back.
execute_query("""
    MATCH ()-[r:input]->() call apoc.refactor.invert(r) yield input, output return count(*)
""")





Unnamed: 0,Unnamed: 1,count(*)
"MATCH ()-[r:input]->() call apoc.refactor.invert(r) yield input, output return count(*)",0,176305


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>count(*)</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>MATCH ()-[r:input]->() call apoc.refactor.invert(r) yield input, output return count(*)</th>
            <td><small>-</small></td>
            <td>176305</td>
        </tr>
    </tbody>
</table>

### 2. Remove "un-needed" nodes and labels

+ Remove label: DatabaseObject (removed label for 2442414 nodes)
    > @Nathan This query removes the DatabaseObject label from the nodes. It runs easily enough.

In [9]:
# Strip the DatabaseObject label from every node that has it, in parallel batches of 5000.
execute_query("""
    call apoc.periodic.iterate(
    "match(n:DatabaseObject) return n", 
    'remove n:DatabaseObject', 
    {batchSize: 5000, parallel:True}
    )
""")





Unnamed: 0,Unnamed: 1,batches,total,timeTaken,committedOperations,failedOperations,failedBatches,retries,errorMessages,batch,operations,wasTerminated,failedParams,updateStatistics
"call apoc.periodic.iterate(\n ""match(n:DatabaseObject) return n"", \n 'remove n:DatabaseObject', \n {batchSize: 5000, parallel:True}\n )",0,1,0,0,0,0,0,0,{},"{'total': 1, 'committed': 1, 'failed': 0, 'err...","{'total': 0, 'committed': 0, 'failed': 0, 'err...",False,{},"{'nodesDeleted': 0, 'labelsAdded': 0, 'relatio..."


+ Remove nodes: InstanceEdit (parallel failed)
    > @Nathan This query removes the InstanceEdit nodes. Some of these nodes have hundreds of thousands of connecting edges. This query fails due to running out of memory for one batch. For the last batch I increased the maximum heap size setting and tried different batch sizes, running the next query with different limits until finding that with 5 it runs OK:
    >
    > ```cypher
    > MATCH (n:InstanceEdit)
    > WITH n
    > LIMIT 5
    > DETACH DELETE n
    > ```
    >
    > <br>Then I ran:
    >
    > ```cypher
    > call apoc.periodic.iterate(
    > "match(n:InstanceEdit) return n",
    > 'detach delete n',
    > {batchSize: 5}
    > )
    > ```

In [10]:
# Delete every InstanceEdit node together with its relationships, in batches of 5000 (not parallel).
execute_query("""
    call apoc.periodic.iterate(
        'match(n:InstanceEdit) return n',
        'detach delete n',
        {batchSize: 5000}
    )
""")





Unnamed: 0,Unnamed: 1,batches,total,timeTaken,committedOperations,failedOperations,failedBatches,retries,errorMessages,batch,operations,wasTerminated,failedParams,updateStatistics
"call apoc.periodic.iterate(\n 'match(n:InstanceEdit) return n',\n 'detach delete n',\n {batchSize: 5000}\n )",0,23,112465,13,112465,0,0,0,{},"{'total': 23, 'committed': 23, 'failed': 0, 'e...","{'total': 112465, 'committed': 112465, 'failed...",False,{},"{'nodesDeleted': 112465, 'labelsAdded': 0, 're..."


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>nodes_deleted</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>
                call apoc.periodic.iterate(
                    "match(n:InstanceEdit) return n", 
                    'detach delete n', 
                    {batchSize: 5000}
                )
            </th>
            <td><small>-</small></td>
            <td>112465</td>
        </tr>
    </tbody>
</table>

+ Remove person, publication etc.
    > @Nathan I ran each query separately. They run OK.

In [11]:
# Delete Affiliation, Person and Publication nodes with all their relationships, one statement per label.
execute_query("""
    match(n:Affiliation) detach delete n;
    match(n:Person) detach delete n;
    match(n:Publication) detach delete n;
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
match(n:Affiliation) detach delete n,0,0,327,0,556,0,0,0,0,0,0,0
match(n:Person) detach delete n,0,0,145880,0,222947,0,0,0,0,0,0,0
match(n:Publication) detach delete n,0,0,36438,0,89396,0,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>nodes_deleted</th>
            <th colspan=2>relationships_deleted</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:Affiliation) detach delete n</th>
            <td><small>-</small></td>
            <td>327</td>
            <td><small>-</small></td>
            <td>556</td>
        </tr>
        <tr>
            <th>match(n:Person) detach delete n</th>
            <td><small>-</small></td>
            <td>145880</td>
            <td><small>-</small></td>
            <td>222947</td>
        </tr>
        <tr>
            <th>match(n:Publication) detach delete n</th>
            <td><small>-</small></td>
            <td>36438</td>
            <td><small>-</small></td>
            <td>89396</td>
        </tr>
    </tbody>
</table>

+ Remove all Taxon nodes
    > @Nathan This again had insufficient memory. So I ran it in batches
    >
    > ```cypher
    > call apoc.periodic.iterate(
    > "match(n:Taxon) return n",
    > 'detach delete n',
    > {batchSize: 1000}
    > )
    > ```

In [12]:
# Delete every Taxon node and its relationships with a single, unbatched statement.
execute_query("""
    match(n:Taxon) detach delete n
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
match(n:Taxon) detach delete n,0,0,390,0,1249099,0,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>nodes_deleted</th>
            <th colspan=2>relationships_deleted</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:Taxon) detach delete n</th>
            <td><small>-</small></td>
            <td>390</td>
            <td><small>-</small></td>
            <td>1249099</td>
        </tr>
    </tbody>
</table>

+ Remove all non-human events. All events have the property 'speciesName'.
    > @Nathan This again had insufficient memory. So I ran it in batches
    > 
    > ```cypher
    > call apoc.periodic.iterate(
    > "match(n:Event) where n.speciesName <> 'Homo sapiens' return n",
    > 'detach delete n',
    > {batchSize: 1000}
    > )
    > ```

In [13]:
# Delete Event nodes (and their relationships) whose speciesName is set to anything other than 'Homo sapiens'.
execute_query("""
    match(n:Event) where n.speciesName <> 'Homo sapiens' detach delete n
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
match(n:Event) where n.speciesName <> 'Homo sapiens' detach delete n,0,0,95291,0,904579,0,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>nodes_deleted</th>
            <th colspan=2>relationships_deleted</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:Event) where n.speciesName <> 'Homo sapiens' detach delete n</th>
            <td><small>-</small></td>
            <td>95291</td>
            <td><small>-</small></td>
            <td>904579</td>
        </tr>
    </tbody>
</table>

+ Remove non-human PhysicalEntity nodes. 4783 physical entities do not have a speciesName, including SimpleEntity and others; keep them.
    > @Nathan This again had insufficient memory (removes about 350000 nodes). So I ran it in batches
    > 
    > ```cypher
    > call apoc.periodic.iterate(
    > "match(n:PhysicalEntity) where exists (n.speciesName) and n.speciesName <> 'Homo sapiens' return n",
    > 'detach delete n',
    > {batchSize: 1000}
    > )
    > ```

In [14]:
# Delete PhysicalEntity nodes that have a speciesName other than 'Homo sapiens';
# entities without a speciesName are kept.
# `IS NOT NULL` replaces the property form of exists(), which Neo4j 5 no longer accepts.
execute_query("""
    match(n:PhysicalEntity) where n.speciesName is not null and n.speciesName <> 'Homo sapiens' detach delete n
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
match(n:PhysicalEntity) where exists (n.speciesName) and n.speciesName <> 'Homo sapiens' detach delete n,0,0,349063,0,1575214,0,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>nodes_deleted</th>
            <th colspan=2>relationships_deleted</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:PhysicalEntity) where  exists (n.speciesName) and n.speciesName <> 'Homo sapiens' detach delete n</th>
            <td><small>-</small></td>
            <td>349063</td>
            <td><small>-</small></td>
            <td>1575214</td>
        </tr>
    </tbody>
</table>

### 3. Set commonName and compartment properties

+ Set PhysicalEntity common name, and create index
    > @Nathan This runs without problem. The text reads: Set PhysicalEntity common name, and create index but I do not see a query to create an index. Should we create one?

In [15]:
# Copy the first entry of each PhysicalEntity's name list into a commonName property.
execute_query("""
    match(n:PhysicalEntity) set n.commonName = n.name[0]
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
match(n:PhysicalEntity) set n.commonName = n.name[0],0,0,0,0,0,56086,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>properties_set</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:PhysicalEntity) set n.commonName = n.name[0]</th>
            <td><small>-</small></td>
            <td>56086</td>
        </tr>
    </tbody>
</table>

+ Set compartment property
    > @Nathan This transforms the compartment from a relation to a property. The 4 queries run without problems.

In [16]:
# Turn :compartment relationships into a list-valued `compartment` property:
# collect the neighbour names per node, store them, then delete the relationships.
# Done first for PhysicalEntity, then for Event. Note that the final statement only
# deletes relationships whose other end is a :GO_Term, unlike the PhysicalEntity one.
execute_query("""
    match(n:PhysicalEntity)-[r:compartment]-(x)
    with n, collect(x.name) as gos set n.compartment = gos;
    match(n:PhysicalEntity)-[r:compartment]-(x) delete r;

    match(n:Event)-[r:compartment]-(x)  
    with n, collect(x.name) as gos set n.compartment = gos;
    match(n:Event)-[r:compartment]-(:GO_Term) delete r
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
"match(n:PhysicalEntity)-[r:compartment]-(x)\n with n, collect(x.name) as gos set n.compartment = gos",0,0,0,0,0,56086,0,0,0,0,0,0
match(n:PhysicalEntity)-[r:compartment]-(x) delete r,0,0,0,0,56171,0,0,0,0,0,0,0
"match(n:Event)-[r:compartment]-(x) \n with n, collect(x.name) as gos set n.compartment = gos",0,0,0,0,0,15264,0,0,0,0,0,0
match(n:Event)-[r:compartment]-(:GO_Term) delete r,0,0,0,0,22531,0,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>relationships_deleted</th>
            <th colspan=2>properties_set</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:PhysicalEntity)-[r:compartment]-(x)
    with n, collect(x.name) as gos set n.compartment = gos</th>
            <td><small>-</small></td>
            <td></td>
            <td><small>-</small></td>
            <td>56086</td>
        </tr>
        <tr>
            <th>match(n:PhysicalEntity)-[r:compartment]-(x) delete r</th>
            <td><small>-</small></td>
            <td>56171</td>
            <td><small>-</small></td>
            <td></td>
        </tr>
        <tr>
            <th>match(n:Event)-[r:compartment]-(x)  
    with n, collect(x.name) as gos set n.compartment = gos</th>
            <td><small>-</small></td>
            <td></td>
            <td><small>-</small></td>
            <td>15264</td>
        </tr>
        <tr>
            <th>match(n:Event)-[r:compartment]-(:GO_Term) delete r</th>
            <td><small>-</small></td>
            <td>22531</td>
            <td><small>-</small></td>
            <td></td>
        </tr>
    </tbody>
</table>

### 4. Set Gene, RNA, Protein and Chemical labels

> @Nathan The queries in this section run without problems. The definition of a Gene is a bit complicated: it is a DNA sequence that encodes mRNA, but OK. A gene product can be miRNA or lncRNA; I guess those are covered by the RNA label.

In [17]:
# Label each EntityWithAccessionedSequence by the kind of reference entity it is linked to
# (ReferenceDNASequence -> Gene, ReferenceRNASequence -> RNA, ReferenceGeneProduct -> Protein),
# then label every SimpleEntity as Chemical.
execute_query("""
    match(n:EntityWithAccessionedSequence) where (n)-[:referenceEntity]-(:ReferenceDNASequence) set n:Gene;
    match(n:EntityWithAccessionedSequence) where (n)-[:referenceEntity]-(:ReferenceRNASequence) set n:RNA;
    match(n:EntityWithAccessionedSequence) where (n)-[:referenceEntity]-(:ReferenceGeneProduct) set n:Protein;

    match(n:SimpleEntity) set n:Chemical;
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
match(n:EntityWithAccessionedSequence) where (n)-[:referenceEntity]-(:ReferenceDNASequence) set n:Gene,0,0,0,0,0,0,1336,0,0,0,0,0
match(n:EntityWithAccessionedSequence) where (n)-[:referenceEntity]-(:ReferenceRNASequence) set n:RNA,0,0,0,0,0,0,309,0,0,0,0,0
match(n:EntityWithAccessionedSequence) where (n)-[:referenceEntity]-(:ReferenceGeneProduct) set n:Protein,0,0,0,0,0,0,29011,0,0,0,0,0
match(n:SimpleEntity) set n:Chemical,0,0,0,0,0,0,3648,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>labels_added</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:EntityWithAccessionedSequence) where (n)-[:referenceEntity]-(:ReferenceDNASequence) set n:Gene</th>
            <td><small>-</small></td>
            <td>1336</td>
        </tr>
        <tr>
            <th>match(n:EntityWithAccessionedSequence) where (n)-[:referenceEntity]-(:ReferenceRNASequence) set n:RNA</th>
            <td><small>-</small></td>
            <td>309</td>
        </tr>
        <tr>
            <th>match(n:EntityWithAccessionedSequence) where (n)-[:referenceEntity]-(:ReferenceGeneProduct) set n:Protein</th>
            <td><small>-</small></td>
            <td>29011</td>
        </tr>
        <tr>
            <th>match(n:SimpleEntity) set n:Chemical</th>
            <td><small>-</small></td>
            <td>3648</td>
        </tr>
    </tbody>
</table>

### 5. Reverse a few relationships for better traversal

In [18]:
# Reverse the direction of several relationship types and rename them to match
# the new direction. Each statement matches (n)-[r:old]->(x), merges
# (x)-[:new]->(n) and deletes the original relationship r:
#   hasComponent     -> componentOf
#   hasMember        -> memberOf
#   catalystActivity -> catalyzes
#   physicalEntity   -> catalystOf   (from CatalystActivity only)
#   activeUnit       -> activeUnitOf (from CatalystActivity and from Regulation)
#   regulatedBy      -> regulates
#   regulator        -> regulatorOf
# Because the reversed relationship is created with MERGE, an already existing
# one is reused rather than duplicated.
execute_query("""
    match(n:Complex)-[r:hasComponent]->(x) merge (x)-[:componentOf]->(n) delete r;
    match(n:EntitySet)-[r:hasMember]->(x) merge (x)-[:memberOf]->(n) delete r;
    match(n:ReactionLikeEvent)-[r:catalystActivity]->(x) merge (x)-[:catalyzes]->(n) delete r;
    match (n:CatalystActivity)-[r:physicalEntity]->(x) merge (x)-[:catalystOf]->(n) delete r;
    match (n:CatalystActivity)-[r:activeUnit]->(x) merge (x)-[:activeUnitOf]->(n) delete r;
    match (n)-[r:regulatedBy]->(x:Regulation) merge (x)-[:regulates]->(n) delete r;
    match(n:Regulation)-[r:regulator]->(x) merge (x)-[:regulatorOf]->(n) delete r;
    match(n:Regulation)-[r:activeUnit]->(x) merge (x)-[:activeUnitOf]->(n) delete r;
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
match(n:Complex)-[r:hasComponent]->(x) merge (x)-[:componentOf]->(n) delete r,0,0,0,32536,32536,0,0,0,0,0,0,0
match(n:EntitySet)-[r:hasMember]->(x) merge (x)-[:memberOf]->(n) delete r,0,0,0,25906,25906,0,0,0,0,0,0,0
match(n:ReactionLikeEvent)-[r:catalystActivity]->(x) merge (x)-[:catalyzes]->(n) delete r,0,0,0,6158,6158,0,0,0,0,0,0,0
match (n:CatalystActivity)-[r:physicalEntity]->(x) merge (x)-[:catalystOf]->(n) delete r,0,0,0,4724,4724,0,0,0,0,0,0,0
match (n:CatalystActivity)-[r:activeUnit]->(x) merge (x)-[:activeUnitOf]->(n) delete r,0,0,0,1156,1156,0,0,0,0,0,0,0
match (n)-[r:regulatedBy]->(x:Regulation) merge (x)-[:regulates]->(n) delete r,0,0,0,2381,2381,0,0,0,0,0,0,0
match(n:Regulation)-[r:regulator]->(x) merge (x)-[:regulatorOf]->(n) delete r,0,0,0,2284,2284,0,0,0,0,0,0,0
match(n:Regulation)-[r:activeUnit]->(x) merge (x)-[:activeUnitOf]->(n) delete r,0,0,0,834,834,0,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>relationships_deleted</th>
            <th colspan=2>relationships_created</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:Complex)-[r:hasComponent]->(x) merge (x)-[:componentOf]->(n) delete r</th>
            <td><small>-</small></td>
            <td>32536</td>
            <td><small>-</small></td>
            <td>32536</td>
        </tr>
        <tr>
            <th>match(n:EntitySet)-[r:hasMember]->(x) merge (x)-[:memberOf]->(n) delete r</th>
            <td><small>-</small></td>
            <td>25906</td>
            <td><small>-</small></td>
            <td>25906</td>
        </tr>
        <tr>
            <th>match(n:ReactionLikeEvent)-[r:catalystActivity]->(x) merge (x)-[:catalyzes]->(n) delete r</th>
            <td><small>-</small></td>
            <td>6158</td>
            <td><small>-</small></td>
            <td>6158</td>
        </tr>
        <tr>
            <th>match (n:CatalystActivity)-[r:physicalEntity]->(x) merge (x)-[:catalystOf]->(n) delete r</th>
            <td><small>-</small></td>
            <td>4724</td>
            <td><small>-</small></td>
            <td>4724</td>
        </tr>
        <tr>
            <th>match (n:CatalystActivity)-[r:activeUnit]->(x) merge (x)-[:activeUnitOf]->(n) delete r</th>
            <td><small>-</small></td>
            <td>1156</td>
            <td><small>-</small></td>
            <td>1156</td>
        </tr>
        <tr>
            <th>match (n)-[r:regulatedBy]->(x:Regulation) merge (x)-[:regulates]->(n) delete r</th>
            <td><small>-</small></td>
            <td>2381</td>
            <td><small>-</small></td>
            <td>2381</td>
        </tr>
        <tr>
            <th>match(n:Regulation)-[r:regulator]->(x) merge (x)-[:regulatorOf]->(n) delete r</th>
            <td><small>-</small></td>
            <td>2284</td>
            <td><small>-</small></td>
            <td>2284</td>
        </tr>
        <tr>
            <th>match(n:Regulation)-[r:activeUnit]->(x) merge (x)-[:activeUnitOf]->(n) delete r</th>
            <td><small>-</small></td>
            <td>834</td>
            <td><small>-</small></td>
            <td>834</td>
        </tr>
    </tbody>
</table>

### 6. Remove inferredTo relationships

In [19]:
# Delete every inferredTo relationship; the nodes on both ends are kept.
execute_query("""
    match (n)-[r:inferredTo]->(m) delete r
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
match (n)-[r:inferredTo]->(m) delete r,0,0,0,0,455,0,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>relationships_deleted</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match (n)-[r:inferredTo]->(m) delete r</th>
            <td><small>-</small></td>
            <td>455</td>
        </tr>
    </tbody>
</table>

### 7. Refactor 'translocate' and 'transport' reactions with EntitySet in both input and output

+ Mark reaction nodes to refactor

In [20]:
# Flag the reactions that the following cells will clone: ReactionLikeEvent
# nodes with category 'transition' whose displayName contains 'transport' or
# 'translocate' and that are connected to an EntitySet through both an input
# and an output relationship. The flag is stored as
# refactorStatus = 'refactored', which the cloning cell matches on.
execute_query("""
    match(n:ReactionLikeEvent {category: 'transition'})
     where (n.displayName contains 'transport') or (n.displayName contains 'translocate') 
    with n match (s1:EntitySet)-[:input]-(n)-[:output]-(s2:EntitySet) 
    set n.refactorStatus = 'refactored'
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
match(n:ReactionLikeEvent {category: 'transition'})\n where (n.displayName contains 'transport') or (n.displayName contains 'translocate') \n with n match (s1:EntitySet)-[:input]-(n)-[:output]-(s2:EntitySet) \n set n.refactorStatus = 'refactored',0,0,0,0,0,88,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>properties_set</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:ReactionLikeEvent {category: 'transition'})
     where (n.displayName contains 'transport') or (n.displayName contains 'translocate') 
    with n match (s1:EntitySet)-[:input]-(n)-[:output]-(s2:EntitySet) 
    set n.refactorStatus = 'refactored'</th>
            <td>67</td>
            <td>88</td>
        </tr>
    </tbody>
</table>

+ Drop constraints and create indexes instead. With unique constraints for dbId and stId, cloning nodes would fail.
    > @Nathan `DROP CONSTRAINT ON` is deprecated. A solution for this is to use:
    >
    > ```cypher
    > SHOW ALL CONSTRAINTS;
    > // Choose the right constraint names and then
    > DROP CONSTRAINT constraint_name IF EXISTS;
    > ```

In [21]:
# Remove the uniqueness constraints on dbId and stId for the Event,
# ReactionLikeEvent and Reaction labels, then create plain (non-unique) indexes
# on ReactionLikeEvent.dbId and ReactionLikeEvent.stId.
# The constraints have to go because the cloned reaction nodes are later looked
# up by the same dbId as their original, i.e. dbId values are duplicated.
# NOTE(review): `DROP CONSTRAINT ON ... ASSERT ...` is deprecated Cypher syntax
# (see the markdown note preceding this cell); newer Neo4j versions require
# dropping constraints by name.
execute_query("""
    drop constraint on (n:Event) assert n.dbId is unique;
    drop constraint on (n:Event) assert n.stId is unique;
    drop constraint on (n:ReactionLikeEvent) assert n.dbId is unique;
    drop constraint on (n:ReactionLikeEvent) assert n.stId is unique;
    drop constraint on (n:Reaction) assert n.dbId is unique;
    drop constraint on (n:Reaction) assert n.stId is unique;

    create index for (n:ReactionLikeEvent) on (n.dbId);
    create index for (n:ReactionLikeEvent) on (n.stId);
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
drop constraint on (n:Event) assert n.dbId is unique,0,0,0,0,0,0,0,0,0,0,0,1
drop constraint on (n:Event) assert n.stId is unique,0,0,0,0,0,0,0,0,0,0,0,1
drop constraint on (n:ReactionLikeEvent) assert n.dbId is unique,0,0,0,0,0,0,0,0,0,0,0,1
drop constraint on (n:ReactionLikeEvent) assert n.stId is unique,0,0,0,0,0,0,0,0,0,0,0,1
drop constraint on (n:Reaction) assert n.dbId is unique,0,0,0,0,0,0,0,0,0,0,0,1
drop constraint on (n:Reaction) assert n.stId is unique,0,0,0,0,0,0,0,0,0,0,0,1
create index for (n:ReactionLikeEvent) on (n.dbId),0,0,0,0,0,0,0,0,1,0,0,0
create index for (n:ReactionLikeEvent) on (n.stId),0,0,0,0,0,0,0,0,1,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>indexes_added</th>
            <th colspan=2>constraints_removed</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>drop constraint on (n:Event) assert n.dbId is unique</th>
            <td><small>-</small></td>
            <td></td>
            <td><small>-</small></td>
            <td>1</td>
        </tr>
        <tr>
            <th>drop constraint on (n:Event) assert n.stId is unique</th>
            <td><small>-</small></td>
            <td></td>
            <td><small>-</small></td>
            <td>1</td>
        </tr>
        <tr>
            <th>drop constraint on (n:ReactionLikeEvent) assert n.dbId is unique</th>
            <td><small>-</small></td>
            <td></td>
            <td><small>-</small></td>
            <td>1</td>
        </tr>
        <tr>
            <th>drop constraint on (n:ReactionLikeEvent) assert n.stId is unique</th>
            <td><small>-</small></td>
            <td></td>
            <td><small>-</small></td>
            <td>1</td>
        </tr>
        <tr>
            <th>drop constraint on (n:Reaction) assert n.dbId is unique</th>
            <td><small>-</small></td>
            <td></td>
            <td><small>-</small></td>
            <td>1</td>
        </tr>
        <tr>
            <th>drop constraint on (n:Reaction) assert n.stId is unique</th>
            <td><small>-</small></td>
            <td></td>
            <td><small>-</small></td>
            <td>1</td>
        </tr>
        <tr>
            <th>create index for (n:ReactionLikeEvent) on (n.dbId)</th>
            <td><small>-</small></td>
            <td>1</td>
            <td><small>-</small></td>
            <td></td>
        </tr>
        <tr>
            <th>create index for (n:ReactionLikeEvent) on (n.stId)</th>
            <td><small>-</small></td>
            <td>1</td>
            <td><small>-</small></td>
            <td></td>
        </tr>
    </tbody>
</table>

+ Clone reactions and add input-output relationships

In [22]:
# Expand each flagged set-to-set reaction into member-to-member reactions.
# For every reaction n with refactorStatus 'refactored':
#   1. take its input EntitySet s1 and output EntitySet s2;
#   2. pair members m1 of s1 with members m2 of s2 (via memberOf) that are
#      linked through a common node by referenceEntity relationships;
#   3. for each (n, m1, m2) row, clone n with apoc.refactor.cloneNodes, mark the
#      clone with refactorStatus = 'added', and connect it as
#      (m1)-[:input]->(clone)-[:output]->(m2).
# Requires the APOC plugin. The query returns the number of rows processed.
execute_query("""
    match(n:ReactionLikeEvent {refactorStatus: 'refactored'}) 
    with n match (s1:EntitySet)-[:input]-(n)-[:output]-(s2:EntitySet) 
    with n, s1, s2 match (s1)<-[:memberOf]-(m1), (s2)<-[:memberOf]-(m2) 
    where (m1)-[:referenceEntity]-()-[:referenceEntity]-(m2)
    with n, m1, m2 call apoc.refactor.cloneNodes([n]) yield input, output as n2
    set n2.refactorStatus = 'added'
    merge (m1)-[:input]->(n2)-[:output]->(m2)
    return count(*)
""")





Unnamed: 0,Unnamed: 1,count(*),relationships_created,properties_set
"match(n:ReactionLikeEvent {refactorStatus: 'refactored'}) \n with n match (s1:EntitySet)-[:input]-(n)-[:output]-(s2:EntitySet) \n with n, s1, s2 match (s1)<-[:memberOf]-(m1), (s2)<-[:memberOf]-(m2) \n where (m1)-[:referenceEntity]-()-[:referenceEntity]-(m2)\n with n, m1, m2 call apoc.refactor.cloneNodes([n]) yield input, output as n2\n set n2.refactorStatus = 'added'\n merge (m1)-[:input]->(n2)-[:output]->(m2)\n return count(*)",0,373,746,373


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>count(*)</th>
            <th colspan=2>relationships_created</th>
            <th colspan=2>properties_set</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:ReactionLikeEvent {refactorStatus: 'refactored'}) 
    with n match (s1:EntitySet)-[:input]-(n)-[:output]-(s2:EntitySet) 
    with n, s1, s2 match (s1)<-[:memberOf]-(m1), (s2)<-[:memberOf]-(m2) 
    where (m1)-[:referenceEntity]-()-[:referenceEntity]-(m2)
    with n, m1, m2 call apoc.refactor.cloneNodes([n]) yield input, output as n2
    set n2.refactorStatus = 'added'
    merge (m1)-[:input]->(n2)-[:output]->(m2)
    return count(*)</th>
            <td>276</td>
            <td>373</td>
            <td><small>-</small></td>
            <td>746</td>
            <td>276</td>
            <td>373</td>
        </tr>
    </tbody>
</table>

+ add regulates relationships to newly created nodes

In [23]:
# Give the cloned reactions the same regulators as their originals: for every
# node r connected by a regulates relationship to a reaction flagged
# 'refactored', merge (r)-[:regulates]->(n2) for each ReactionLikeEvent n2 that
# has refactorStatus 'added' and the same dbId as the original reaction.
execute_query("""
    match(n:ReactionLikeEvent {refactorStatus: 'refactored'})-[:regulates]-(r) 
    with n, r match (n2:ReactionLikeEvent) where n2.dbId = n.dbId and n2.refactorStatus = 'added'
    merge (r)-[:regulates]->(n2)
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
"match(n:ReactionLikeEvent {refactorStatus: 'refactored'})-[:regulates]-(r) \n with n, r match (n2:ReactionLikeEvent) where n2.dbId = n.dbId and n2.refactorStatus = 'added'\n merge (r)-[:regulates]->(n2)",0,0,0,30,0,0,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>relationships_created</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:ReactionLikeEvent {refactorStatus: 'refactored'})-[:regulates]-(r) 
    with n, r match (n2:ReactionLikeEvent) where n2.dbId = n.dbId and n2.refactorStatus = 'added'
    merge (r)-[:regulates]->(n2)</th>
            <td>2</td>
            <td>30</td>
        </tr>
    </tbody>
</table>

+ add catalyzes relationships to newly created nodes

In [24]:
# Give the cloned reactions the same catalysts as their originals: for every
# node c connected by a catalyzes relationship to a reaction flagged
# 'refactored', merge (c)-[:catalyzes]->(n2) for each ReactionLikeEvent n2 that
# has refactorStatus 'added' and the same dbId as the original reaction.
execute_query("""
    match(n:ReactionLikeEvent {refactorStatus: 'refactored'})-[:catalyzes]-(c) 
    with n, c match (n2:ReactionLikeEvent) where n2.dbId = n.dbId and n2.refactorStatus = 'added'
    merge (c)-[:catalyzes]->(n2)
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
"match(n:ReactionLikeEvent {refactorStatus: 'refactored'})-[:catalyzes]-(c) \n with n, c match (n2:ReactionLikeEvent) where n2.dbId = n.dbId and n2.refactorStatus = 'added'\n merge (c)-[:catalyzes]->(n2)",0,0,0,331,0,0,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>relationships_created</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match(n:ReactionLikeEvent {refactorStatus: 'refactored'})-[:catalyzes]-(c) 
    with n, c match (n2:ReactionLikeEvent) where n2.dbId = n.dbId and n2.refactorStatus = 'added'
    merge (c)-[:catalyzes]->(n2)</th>
            <td>236</td>
            <td>331</td>
        </tr>
    </tbody>
</table>

### 8. Remove "un-needed" nodes and labels for version > 83

This is a separate step so that the resulting database matches, as closely as possible, how it looked when it was built from version 75.

In [25]:
# Strip node types, relationships and labels that should not be present in the
# processed database:
#   * NegativePrecedingEvent, NegativePrecedingEventReason and DrugActionType
#     nodes are deleted together with their relationships (detach delete);
#   * all cellType relationships are deleted;
#   * the CellType, ControlledVocabulary and ReactionType labels are removed
#     from the nodes carrying them (the nodes themselves are kept).
# The `//` remarks inside the query are Cypher comments written by the author.
execute_query("""
    match (n:NegativePrecedingEvent) detach delete n; // safe to delete nodes (does not share nodes with robin db)
    match (n:NegativePrecedingEventReason) detach delete n; // safe to delete nodes (does not share nodes with robin db)
    match (n:DrugActionType) detach delete n; // safe to delete nodes (does not share nodes with robin db)
    match ()-[r:cellType]->() delete r; // safe to delete relations?

    match (n:CellType) remove n:CellType;
    match (n:ControlledVocabulary) remove n:ControlledVocabulary;
    match (n:ReactionType) remove n:ReactionType;
""")





Unnamed: 0,Unnamed: 1,nodes_created,nodes_deleted,relationships_created,relationships_deleted,properties_set,labels_added,labels_removed,indexes_added,indexes_removed,constraints_added,constraints_removed
match (n:NegativePrecedingEvent) detach delete n,0,0,307,0,911,0,0,0,0,0,0,0
match (n:NegativePrecedingEventReason) detach delete n,0,0,6,0,0,0,0,0,0,0,0,0
match (n:DrugActionType) detach delete n,0,0,3,0,15,0,0,0,0,0,0,0
match ()-[r:cellType]->() delete r,0,0,0,0,58,0,0,0,0,0,0,0
match (n:CellType) remove n:CellType,0,0,0,0,0,0,0,14,0,0,0,0
match (n:ControlledVocabulary) remove n:ControlledVocabulary,0,0,0,0,0,0,0,0,0,0,0,0
match (n:ReactionType) remove n:ReactionType,0,0,0,0,0,0,0,0,0,0,0,0


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th colspan=2>nodes_deleted</th>
            <th colspan=2>relationships_deleted</th>
            <th colspan=2>labels_removed</th>
        </tr>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
            <th>v75</th>
            <th>v83</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>match (n:NegativePrecedingEvent) detach delete n</th>
            <td><small>-</small></td>
            <td>307</td>
            <td><small>-</small></td>
            <td>911</td>
            <td><small>-</small></td>
            <td>0</td>
        </tr>
        <tr>
            <th>match (n:NegativePrecedingEventReason) detach delete n</th>
            <td><small>-</small></td>
            <td>6</td>
            <td><small>-</small></td>
            <td>0</td>
            <td><small>-</small></td>
            <td>0</td>
        </tr>
        <tr>
            <th>match (n:DrugActionType) detach delete n</th>
            <td><small>-</small></td>
            <td>3</td>
            <td><small>-</small></td>
            <td>15</td>
            <td><small>-</small></td>
            <td>0</td>
        </tr>
        <tr>
            <th>match ()-[r:cellType]->() delete r</th>
            <td><small>-</small></td>
            <td>0</td>
            <td><small>-</small></td>
            <td>58</td>
            <td><small>-</small></td>
            <td>0</td>
        </tr>
        <tr>
            <th>match (n:CellType) remove n:CellType</th>
            <td><small>-</small></td>
            <td>0</td>
            <td><small>-</small></td>
            <td>0</td>
            <td><small>-</small></td>
            <td>14</td>
        </tr>
        <tr>
            <th>match (n:ControlledVocabulary) remove n:ControlledVocabulary</th>
            <td><small>-</small></td>
            <td>0</td>
            <td><small>-</small></td>
            <td>0</td>
            <td><small>-</small></td>
            <td>0</td>
        </tr>
        <tr>
            <th>match (n:ReactionType) remove n:ReactionType</th>
            <td><small>-</small></td>
            <td>0</td>
            <td><small>-</small></td>
            <td>0</td>
            <td><small>-</small></td>
            <td>0</td>
        </tr>
    </tbody>
</table>

### 9. For reactome-human, label secondary metabolites

In [None]:
# Only for the 'reactome-human' database: add the SecondaryMetabolite label to
# every node whose stId appears in the hard-coded list `sm` below.
# The MATCH has no label filter, so nodes of any type are selected purely by
# stId. The query ends with `RETURN n`, i.e. it returns the labelled nodes
# themselves rather than a count.
if DATABASE == 'reactome-human':
    execute_query("""
        WITH ["R-ALL-29438", "R-HSA-9660007", "R-HSA-113595", "R-ALL-76577", "R-ALL-113592", "R-ALL-29370", "R-ALL-29420", "R-ALL-29356", "R-ALL-29372", "R-HSA-9660032", "R-ALL-29358", "R-ALL-113582", "R-HSA-68524", "R-ALL-113518", "R-ALL-113550", "R-ALL-2485002", "R-ALL-113541", "R-ALL-164121", "R-ALL-427523", "R-ALL-9668967", "R-ALL-113571", "R-ALL-74016", "R-ALL-205687", "R-ALL-76194", "R-ALL-1500625", "R-ALL-111294", "R-ALL-109276", "R-ALL-73473", "R-ALL-70106", "R-ALL-29360", "R-ALL-29364", "R-ALL-29366", "R-ALL-113528", "R-ALL-31633", "R-ALL-29804", "R-ALL-113573", "R-ALL-29376", "R-ALL-113525", "R-ALL-113593", "R-ALL-113581", "R-ALL-113548", "R-ALL-163953", "R-ALL-113529", "R-ALL-29404", "R-ALL-113552", "R-ALL-29362", "R-ALL-113526", "R-ALL-113521", "R-ALL-113597", "R-ALL-29374", "R-ALL-113596", "R-ALL-31649", "R-ALL-74113", "R-ALL-188972", "R-ALL-352022", "R-ALL-83910", "R-ALL-164934", "R-ALL-141335", "R-ALL-193514", "R-ALL-159450", "R-ALL-189422", "R-ALL-1524102", "R-ALL-1806221", "R-ALL-111875", "R-ALL-74112", "R-ALL-74722", "R-ALL-114549", "R-ALL-194653", "R-ALL-114564", "R-ALL-29368", "R-ALL-374900", "R-ALL-2318767", "R-ALL-29496", "R-ALL-162743", "R-ALL-351626", "R-ALL-417888", "R-ALL-114570", "R-ALL-216801", "R-ALL-140648", "R-ALL-113519", "R-ALL-113551", "R-ALL-141090", "R-ALL-139827", "R-ALL-158602", "R-ALL-1467290", "R-ALL-113685", "R-ALL-159942", "R-ALL-5632460", "R-ALL-5632457", "R-ALL-29386", "R-ALL-29408", "R-ALL-1222424", "R-ALL-113533", "R-ALL-113600", "R-ALL-113563", "R-ALL-159448", "R-ALL-113561", "R-ALL-1132084", "R-ALL-1614617", "R-ALL-3299683", "R-ALL-3341383", "R-ALL-3299686", "R-ALL-352327", "R-ALL-351629", "R-ALL-189461", "R-ALL-189481", "R-ALL-113602", "R-ALL-113575", "R-ALL-141343", "R-ALL-113535", "R-ALL-193465", "R-ALL-389556", "R-ALL-389843", "R-ALL-29896", "R-ALL-2160490", "R-ALL-2022135", "R-ALL-29382", "R-ALL-109277", "R-ALL-114623", "R-ALL-109275", "R-ALL-8878981", "R-ALL-352330", "R-ALL-389620", "R-ALL-192305", 
"R-ALL-156540", "R-ALL-113534", "R-ALL-8938078", "R-ALL-158466", "R-ALL-517496", "R-ALL-517495", "R-ALL-113601", "R-ALL-113564", "R-ALL-1614597", "R-ALL-351593", "R-ALL-351603", "R-ALL-351627", "R-ALL-351628", "R-ALL-5693978", "R-ALL-5668577", "R-ALL-5668566", "R-ALL-5668574", "R-ALL-5668565", "R-ALL-1605715", "R-ALL-6781870", "R-ALL-110096", "R-ALL-428040", "R-ALL-205689", "R-ALL-742343", "R-ALL-2022884", "R-ALL-1606834", "R-ALL-194688", "R-ALL-194725", "R-ALL-194668", "R-ALL-111627", "R-ALL-110732", "R-ALL-8851519", "R-ALL-8851508", "R-ALL-8851517", "R-ALL-8851514", "R-ALL-8851528", "R-ALL-8851513", "R-ALL-167012", "R-ALL-8851242", "R-ALL-8851226", "R-ALL-114626", "R-ALL-74126", "R-ALL-9631150", "R-ALL-211579", "R-ALL-211606", "R-ALL-9683057", "R-ALL-9683078", "R-ALL-76230", "R-ALL-189385", "R-ALL-372511", "R-ALL-141707", "R-ALL-8931885", "R-ALL-8931884", "R-ALL-1470067", "R-ALL-6788973", "R-ALL-1222719", "R-ALL-1222475", "R-ALL-1222561", "R-ALL-1222461", "R-ALL-1132304", "R-ALL-1132121", "R-ALL-1236709", "R-ALL-2000345", "R-ALL-983318", "R-ALL-2000347", "R-ALL-2000348", "R-ALL-2000349", "R-ALL-389536", "R-ALL-139836", "R-ALL-8953399", "R-ALL-140912", "R-ALL-194697", "R-ALL-159751", "R-ALL-6806656", "R-ALL-5228597", "R-HSA-8943136", "R-ALL-111349", "R-ALL-5210949", "R-ALL-5244410", "R-ALL-1132287", "R-ALL-8939024", "R-ALL-8942170", "R-ALL-2429665", "R-ALL-2429673", "R-ALL-428206", "R-ALL-428218", "R-ALL-217259", "R-ALL-29572", "R-ALL-8953493", "R-ALL-189484", "R-ALL-189480", "R-ALL-192307", "R-ALL-5685882", "R-ALL-425425", "R-ALL-196180", "R-ALL-425978", "R-ALL-425971", "R-ALL-425969", "R-ALL-425958", "R-ALL-425999", "R-ALL-425977", "R-ALL-428548", "R-ALL-5626313", "R-ALL-1996291", "R-ALL-1237009", "R-ALL-114565", "R-ALL-114625", "R-ALL-114571", "R-ALL-114640", "R-ALL-114582", "R-ALL-114622", "R-ALL-114654", "R-ALL-113542", "R-ALL-427899", "R-ALL-2213216", "R-ALL-1678675", "R-ALL-5693747", "R-ALL-389573", "R-ALL-389593", "R-ALL-6809360", "R-ALL-2046049", 
"R-ALL-2046064", "R-ALL-5228339", "R-ALL-5696069", "R-ALL-5696026", "R-ALL-2130170", "R-ALL-5278291", "R-ALL-1132163", "R-ALL-1132345", "R-ALL-1131511", "R-ALL-6790135", "R-ALL-1132417", "R-ALL-1130844", "R-ALL-6790191", "R-ALL-1132064", "R-ALL-1130860", "R-ALL-2855229", "R-ALL-5617813", "R-ALL-5617810", "R-ALL-5688282", "R-ALL-5623644", "R-ALL-5623650", "R-ALL-879867", "R-ALL-8949220", "R-ALL-2889072", "R-ALL-2889083", "R-ALL-8949232", "R-ALL-947590", "R-ALL-2731020", "R-ALL-2892454", "R-ALL-2872447", "R-ALL-2872443", "R-ALL-2730999", "R-ALL-2731008", "R-ALL-1996292", "R-ALL-9628545", "R-ALL-2255331", "R-ALL-2230955"] AS sm
        MATCH (n)
        WHERE n.stId IN sm
        SET n:SecondaryMetabolite
        RETURN n;
    """)

## Incremental Validation

Validate counts for processed database

In [27]:
# Sanity check on the processed database: total number of nodes and total
# number of (directed) relationships, each reported under the column `count`.
execute_query("""
    MATCH (n) RETURN count(n) AS count;
    MATCH ()-[r]->() RETURN count(*) AS count;
""")





Unnamed: 0,Unnamed: 1,count
MATCH (n) RETURN count(n) AS count,0,1697618
MATCH ()-[r]->() RETURN count(*) AS count,0,3383356


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>Difference v83 - v75</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>MATCH (n) RETURN count(n)</th>
            <td>1703054</td>
            <td>1697618</td>
            <td>-5436</td>
        </tr>
        <tr>
            <th>MATCH ()-[r]->() RETURN count(*)</th>
            <td>3368926</td>
            <td>3383356</td>
            <td>14430</td>
        </tr>
    </tbody>
</table>

In [28]:
# Sanity check: number of nodes carrying the PhysicalEntity label.
execute_query("""
    MATCH (n:PhysicalEntity) RETURN COUNT(n)
""")





Unnamed: 0,Unnamed: 1,COUNT(n)
MATCH (n:PhysicalEntity) RETURN COUNT(n),0,56086


Expected output:
<table>
    <thead>
        <tr>
            <th></th>
            <th>v75</th>
            <th>v83</th>
            <th>Difference v83 - v75</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <th>MATCH (n:PhysicalEntity) RETURN COUNT(n)</th>
            <td>50891</td>
            <td>56086</td>
            <td>5195</td>
        </tr>
    </tbody>
</table>

In [29]:
# Sanity check: number of nodes per distinct label combination. labels(n) is
# the grouping key, so each row is one set of labels (NodeType) together with
# how many nodes carry exactly that set (NumberOfNodes).
execute_query("""
    MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes;
""")





Unnamed: 0,Unnamed: 1,NodeType,NumberOfNodes
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",0,[DBInfo],1
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",1,"[Event, Pathway, TopLevelPathway]",29
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",2,"[Event, Pathway]",2581
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",3,"[Event, ReactionLikeEvent, Reaction]",12003
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",4,"[PhysicalEntity, Complex]",14133
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",...,...,...
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",57,"[ExternalOntology, SequenceOntology]",15
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",58,"[AbstractModifiedResidue, GeneticallyModifiedR...",1880
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",59,"[AbstractModifiedResidue, ReplacedResidue, Gen...",1093
"MATCH (n) RETURN labels(n) AS NodeType, count(n) AS NumberOfNodes",60,[ExternalOntology],14


Expected output:
<table>
    <thead>
        <tr>
            <th>NodeType</th>
            <th colspan=2>NumberOfNodes</th>
            <th>Difference v83 - v75</th>
        </tr>
        <tr>
            <th></th>
            <th>v83</th>
            <th>v75</th>
            <th></th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>PhysicalEntity,Complex</td>
            <td>14133</td>
            <td>13187</td>
            <td>946</td>
        </tr>
        <tr>
            <td>PhysicalEntity,GenomeEncodedEntity,EntityWithAccessionedSequence,Protein</td>
            <td>29011</td>
            <td>26537</td>
            <td>2474</td>
        </tr>
        <tr>
            <td>PhysicalEntity,SimpleEntity,Chemical, SecondaryMetabolite</td>
            <td>268</td>
            <td>271</td>
            <td>-3</td>
        </tr>
        <tr>
            <td>PhysicalEntity,SimpleEntity,Chemical</td>
            <td>3380</td>
            <td>3178</td>
            <td>202</td>
        </tr>
        <tr>
            <td>PhysicalEntity,Polymer</td>
            <td>234</td>
            <td>218</td>
            <td>16</td>
        </tr>
        <tr>
            <td>PhysicalEntity,EntitySet,DefinedSet, CommonEntity,SecondaryMetabolite</td>
            <td>0</td>
            <td>5</td>
            <td>-5</td>
        </tr>
        <tr>
            <td>PhysicalEntity,EntitySet,DefinedSet,SecondaryMetabolite</td>
            <td>11</td>
            <td>6</td>
            <td>5</td>
        </tr>
        <tr>
            <td>PhysicalEntity,EntitySet,DefinedSet</td>
            <td>4190</td>
            <td>3916</td>
            <td>274</td>
        </tr>
        <tr>
            <td>PhysicalEntity,EntitySet,CandidateSet</td>
            <td>1311</td>
            <td>1192</td>
            <td>119</td>
        </tr>
        <tr>
            <td>PhysicalEntity,GenomeEncodedEntity,EntityWithAccessionedSequence,Gene</td>
            <td>1336</td>
            <td>893</td>
            <td>443</td>
        </tr>
        <tr>
            <td>PhysicalEntity,OtherEntity</td>
            <td>340</td>
            <td>335</td>
            <td>5</td>
        </tr>
        <tr>
            <td>PhysicalEntity,GenomeEncodedEntity</td>
            <td>450</td>
            <td>440</td>
            <td>10</td>
        </tr>
        <tr>
            <td>PhysicalEntity,GenomeEncodedEntity,EntityWithAccessionedSequence,RNA</td>
            <td>309</td>
            <td>229</td>
            <td>80</td>
        </tr>
        <tr>
            <td>PhysicalEntity,ChemicalDrug,Drug</td>
            <td>1027</td>
            <td>387</td>
            <td>640</td>
        </tr>
        <tr>
            <td>PhysicalEntity,Drug,ProteinDrug</td>
            <td>86</td>
            <td>27</td>
            <td>59</td>
        </tr>
    </tbody>
</table>

-------------------

## Outdated steps

+ additional changes (local lifelike-stg instance reactome-human)

In [None]:
# Outdated step (kept for reference): build Synonym nodes.
#   1. Create a uniqueness constraint on Synonym.name.
#   2. For each ReferenceGeneProduct, unwind its geneName values, merge a
#      Synonym node per value and link it with HAS_SYNONYM.
#   3. Do the same for each name value of every PhysicalEntity, batched through
#      apoc.periodic.iterate with batchSize 5000 (requires the APOC plugin).
# The `//` remarks inside the query are the author's Cypher comments recording
# the counters observed on an earlier run.
# NOTE(review): `CREATE CONSTRAINT ... ON ... ASSERT ...` is the older Cypher
# constraint syntax - confirm it is still accepted by the target Neo4j version.
execute_query("""
    create constraint constraint_synonym_name on (n:Synonym) assert (n.name) is Unique; // Added 1 constraint
    match(n:ReferenceGeneProduct) with n unwind n.geneName as synonym 
    merge (s:Synonym {name:synonym}) merge (n)-[:HAS_SYNONYM]->(s); // Added 60893 labels, created 60893 nodes, set 60893 properties, created 94021 relationships

    call apoc.periodic.iterate(
        "match(n:PhysicalEntity) unwind n.name as syn return n, syn",
        "merge(s:Synonym {name:syn}) merge (n)-[:HAS_SYNONYM]->(s)",
        {batchSize: 5000} // ??
    )
""")

+ Based on Christian's code, the following changes were made on 8/24/2021

  + Change hasCandidate to candidateOf, reverse
  + Change requiredInputComponent to requiredInput, reverse
  + Change repeatedUnit to repeatedUnitOf, reverse

In [None]:
# Outdated step (kept for reference): reverse and rename three more
# relationship types, using the same merge-then-delete pattern as step 5:
#   hasCandidate           -> candidateOf
#   requiredInputComponent -> requiredInput
#   repeatedUnit           -> repeatedUnitOf
# The `//` remarks inside the query record the counters from an earlier run.
execute_query("""
    match(n)-[r:hasCandidate]->(x) merge (x)-[:candidateOf]->(n) delete r; // Deleted 8653 relationships, created 8653 relationships
    match(n)-[r:requiredInputComponent]->(x) merge (x)-[:requiredInput]->(n) delete r; // Deleted 62 relationships, created 62 relationships
    match(n)-[r:repeatedUnit]->(x) merge (x)-[:repeatedUnitOf]->(n) delete r // Deleted 240 relationships, created 240 relationships
""")

+ 9/15/2021 set nodeLabel for display

In [None]:
# Outdated step (kept for reference): set a `nodeLabel` display property on
# nodes according to their label. Noteworthy mappings:
#   * ReferenceGeneProduct is displayed as 'Gene';
#   * ProteinDrug is displayed as 'Protein', ChemicalDrug as 'Chemical';
#   * any PhysicalEntity that still has no nodeLabel after the specific
#     statements gets the fallback value 'Entity';
#   * every ReactionLikeEvent is displayed as 'Reaction'.
# Assuming the statements run in the listed order, a node matching several of
# them ends up with the value set by the last matching statement.
# The `//` remarks inside the query record the counters from an earlier run.
# NOTE(review): `exists(n.prop)` is older Cypher syntax - confirm it is still
# supported by the target Neo4j version.
execute_query("""
    match(n:Protein) set n.nodeLabel ='Protein'; // Set 29011 properties
    match(n:Gene) set n.nodeLabel='Gene'; // Set 1336 properties
    match(n:ReferenceGeneProduct) set n.nodeLabel = 'Gene'; // Set 104293 properties
    match(n:Chemical) set n.nodeLabel = 'Chemical'; // Set 3648 properties
    match(n:Complex) set n.nodeLabel = 'Complex'; // Set 14133 properties
    match(n:EntitySet) set n.nodeLabel = 'EntitySet'; // Set 5512 properties
    match(n:Polymer) set n.nodeLabel = 'Polymer'; // Set 234 properties
    match(n:ProteinDrug) set n.nodeLabel = 'Protein'; // Set 86 properties
    match(n:ChemicalDrug) set n.nodeLabel = 'Chemical'; // Set 1027 properties
    match(n:RNA) set n.nodeLabel = 'RNA'; // Set 309 properties
    match(n:PhysicalEntity) where not exists (n.nodeLabel) set n.nodeLabel = 'Entity'; // Set 790 properties

    match(n:ReactionLikeEvent) set n.nodeLabel = 'Reaction'; // Set 14770 properties
    match(n:CatalystActivity) set n.nodeLabel = 'CatalystActivity'; // Set 37914 properties
    match(n:Regulation) set n.nodeLabel = 'Regulation'; // Set 6503 properties
    match(n:Pathway) set n.nodeLabel = 'Pathway'; // Set 2610 properties
""")

<br>