In [10]:
import time
import pandas as pd
import numpy as np
from datetime import datetime
import yaml
from pathlib import Path
from neo4j_lib import NeoApp

In [11]:
# Connect to Neo4j.
# The connection settings (uri / user / password) are read from a local YAML
# file so that no credentials are hard-coded in the notebook.
config_path = Path("neo4j_config.yaml")
with config_path.open() as config_file:
    neo_config = yaml.safe_load(config_file)

neo_con = NeoApp(
    neo_config["uri"],
    neo_config["user"],
    neo_config["password"],
)

In [12]:
# Load data
df = pd.read_parquet("data/pubmed_dump_clean.parquet")
df_entit = pd.read_parquet("data/entities_05.parquet")
# Left merge keeps every paper; a paper without a matching row in the entities
# table ends up with NaN in the entity columns.
df = df.merge(df_entit, on="pmid", how="left")


def _array_cell_to_list(value):
    """Convert one cell of an array-valued column into a plain Python list.

    Parameters
    ----------
    value : object
        A single cell: a ``numpy.ndarray``, a missing marker (``None`` / NaN)
        or any other object.

    Returns
    -------
    object
        ``value.tolist()`` for arrays, ``[]`` for missing entries, and the
        value itself otherwise.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    if value is None or (isinstance(value, float) and np.isnan(value)):
        # Missing entry introduced by the left merge: treat as "no items".
        return []
    return value


for col in df.columns:
    # Scan the whole column instead of only row 0: the first row may be NaN
    # after the left merge (the column would then be skipped), and an empty
    # frame has no row 0 at all.
    if df[col].map(lambda value: isinstance(value, np.ndarray)).any():
        # Per-cell conversion; calling np.ndarray.tolist directly on a NaN
        # cell would raise TypeError.
        df[col] = df[col].map(_array_cell_to_list)

In [13]:
# Clean previous graph.
# Two statements are run in order:
#   1. delete every node together with its relationships;
#   2. call apoc.schema.assert with empty index/constraint maps (per the APOC
#      documentation this drops the existing indexes and constraints).
for query_str in (
    """
    MATCH (n)
    DETACH DELETE n
    """,
    """
    CALL apoc.schema.assert({}, {})
    """,
):
    neo_con.query(query_str)


In [14]:
# Uniqueness constraints, one per node label:
# (constraint name, Cypher variable, node label, unique property)
constraint_specs = [
    ("papers", "p", "Paper", "pmid"),
    ("authors", "a", "Author", "name"),
    ("keywords", "k", "Keyword", "name"),
    ("diseases", "d", "Disease", "name"),
    ("organizations", "o", "Organization", "name"),
    ("chemicals", "c", "Chemical", "name"),
    ("locations", "l", "Location", "name"),
]

for constraint_name, alias, label, unique_key in constraint_specs:
    constraint_result = neo_con.query(
        f"CREATE CONSTRAINT {constraint_name} IF NOT EXISTS "
        f"ON ({alias}:{label}) ASSERT {alias}.{unique_key} IS UNIQUE"
    )

# Show the result of the last statement as the cell output.
constraint_result


[]

In [15]:
# Create (Paper) nodes
# Columns of `df` that are shipped to Neo4j, one dict per paper.
paper_columns = [
    "title",
    "pmid",
    "author_list",
    "keywords",
    "disease",
    "chemical",
    "location",
    "organization",
    "date",
    "url",
]
rows = df[paper_columns].to_dict(orient="records")

# One Paper node per pmid. MERGE keeps the operation idempotent, and the
# properties are only written when the node is newly created (ON CREATE SET).
query = """
        UNWIND $rows AS row
        MERGE (p:Paper {pmid:toInteger(row.pmid)})
        ON CREATE SET
        p.title = row.title,
        p.authors = row.author_list,
        p.keywords = row.keywords,
        p.diseases = row.disease,
        p.chemicals = row.chemical,
        p.locations = row.location,
        p.organizations = row.organization,
        p.date = toInteger(row.date),
        p.url = row.url
        RETURN count(*) as total
        """
neo_con.query(query, parameters={"rows": rows})


[{'total': 928}]

In [16]:
# Expand selected properties to nodes.
# Each spec describes how one list-valued Paper property becomes its own node
# type: the node label, the Cypher variable used for it, the Paper property
# holding the values, and the relationship type linking Paper -> node.
author_dic = dict(
    node_type="Author",
    node_label="a",
    property_name="authors",
    rel_name="AUTHORED",
)
chem_dic = dict(
    node_type="Chemical",
    node_label="c",
    property_name="chemicals",
    rel_name="ABOUT_CHEMICAL",
)
dis_dic = dict(
    node_type="Disease",
    node_label="d",
    property_name="diseases",
    rel_name="ABOUT_DISEASE",
)
# NOTE(review): the Paper-creation query in this notebook does not set a
# `genetics` property, so this entry presumably matches nothing - confirm.
gen_dic = dict(
    node_type="Genetic",
    node_label="g",
    property_name="genetics",
    rel_name="ABOUT_GENETIC",
)
loc_dic = dict(
    node_type="Location",
    node_label="l",
    property_name="locations",
    rel_name="HAS_LOCATION",
)
org_dic = dict(
    node_type="Organization",
    node_label="o",
    property_name="organizations",
    rel_name="HAS_ORG",
)
key_dic = dict(
    node_type="Keyword",
    node_label="k",
    property_name="keywords",
    rel_name="HAS_KEYWORD",
)
col_list = [author_dic, chem_dic, dis_dic, gen_dic, loc_dic, org_dic, key_dic]

for spec in col_list:
    alias = spec["node_label"]
    label = spec["node_type"]
    prop = spec["property_name"]
    rel = spec["rel_name"]

    # Step 1: one node per distinct property value, linked from every paper
    # that lists that value.
    query = f"""
            MATCH (p:Paper)
            UNWIND p.{prop} AS property
            WITH  property, collect(p) AS papers
            MERGE ({alias}:{label} {{name:property}})
            WITH {alias}, papers
            UNWIND papers AS p
            WITH {alias},p
            MERGE (p)-[:{rel}]->({alias});
            """
    neo_con.query(query)

    # Step 2: the values now live in dedicated nodes, so remove the list
    # property from the Paper nodes.
    query = f"""
            MATCH (p:Paper)
            SET p.{prop} = null
            """
    neo_con.query(query)


In [17]:
# The expansion step above merges AUTHORED relationships in the direction
# (Paper)-[:AUTHORED]->(Author). Invert each of them with
# apoc.refactor.invert so the relationship starts at the author node.
# The trailing ';' on the Python call suppresses the cell output.
query = """
        MATCH ()-[rel:AUTHORED]->()
        CALL apoc.refactor.invert(rel)
        YIELD input, output
        RETURN input, output;
        """
neo_con.query(query);

In [18]:
# Add node similarities
degree_cutoff = 2
similarity_cutoff = 0.2

drop_graph_query = """
CALL gds.graph.drop('myGraph') YIELD graphName;
"""
# preview_query = f"""
# CALL gds.nodeSimilarity.stream('myGraph', {{ degreeCutoff: {degree_cutoff} , similarityCutoff: {similarity_cutoff}}})
# YIELD node1, node2, similarity
# RETURN gds.util.asNode(node1).title AS Paper1, gds.util.asNode(node2).title AS Paper2, similarity
# ORDER BY similarity DESC
# """


def write_paper_similarity(entity_label, rel_type, similarity_rel_type):
    """Project Paper/entity nodes, write node-similarity relationships, drop the projection.

    Uses the module-level ``degree_cutoff`` and ``similarity_cutoff`` values and
    prints the result of each of the three queries (project, write, drop).

    Parameters
    ----------
    entity_label : str
        Node label projected together with ``Paper`` (e.g. ``"Keyword"``).
    rel_type : str
        Relationship type projected between the two labels
        (e.g. ``"HAS_KEYWORD"``).
    similarity_rel_type : str
        Type of the relationships written back to the database; the similarity
        value is stored in their ``score`` property.
    """
    project_query = f"""
    CALL gds.graph.project(
        'myGraph',
        ['Paper','{entity_label}'],
        ['{rel_type}']
    );
    """
    write_similarity_query = f"""
    CALL gds.nodeSimilarity.write('myGraph', {{
        writeRelationshipType: '{similarity_rel_type}',
        writeProperty: 'score',
        degreeCutoff: {degree_cutoff},
        similarityCutoff: {similarity_cutoff}
    }})
    YIELD nodesCompared, relationshipsWritten
    """
    print(neo_con.query(project_query))
    try:
        print(neo_con.query(write_similarity_query))
    finally:
        # Always remove the named projection: if the write step raised, a
        # leftover 'myGraph' would make the next projection (or a re-run of
        # this cell) fail because the graph name is already taken.
        print(neo_con.query(drop_graph_query))


# (projected entity label, projected relationship type, written relationship type)
similarity_specs = [
    ("Keyword", "HAS_KEYWORD", "SIMILAR_KEYWORDS"),  # 1. Keywords
    ("Chemical", "ABOUT_CHEMICAL", "SIMILAR_CHEMICAL"),  # 2. Chemical
    ("Disease", "ABOUT_DISEASE", "SIMILAR_DISEASE"),  # 3. Disease
]

for entity_label, rel_type, similarity_rel_type in similarity_specs:
    write_paper_similarity(entity_label, rel_type, similarity_rel_type)


[{'nodeProjection': {'Keyword': {'label': 'Keyword', 'properties': {}}, 'Paper': {'label': 'Paper', 'properties': {}}}, 'relationshipProjection': {'HAS_KEYWORD': {'orientation': 'NATURAL', 'aggregation': 'DEFAULT', 'type': 'HAS_KEYWORD', 'properties': {}}}, 'graphName': 'myGraph', 'nodeCount': 2360, 'relationshipCount': 1950, 'projectMillis': 14}]
[{'nodesCompared': 326, 'relationshipsWritten': 493}]
[{'graphName': 'myGraph'}]
[{'nodeProjection': {'Chemical': {'label': 'Chemical', 'properties': {}}, 'Paper': {'label': 'Paper', 'properties': {}}}, 'relationshipProjection': {'ABOUT_CHEMICAL': {'orientation': 'NATURAL', 'aggregation': 'DEFAULT', 'type': 'ABOUT_CHEMICAL', 'properties': {}}}, 'graphName': 'myGraph', 'nodeCount': 1452, 'relationshipCount': 798, 'projectMillis': 17}]
[{'nodesCompared': 167, 'relationshipsWritten': 309}]
[{'graphName': 'myGraph'}]
[{'nodeProjection': {'Disease': {'label': 'Disease', 'properties': {}}, 'Paper': {'label': 'Paper', 'properties': {}}}, 'relationsh