In [130]:
import getpass
import logging
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from neo4j.exceptions import ServiceUnavailable

In [131]:
def _to_list(value):
    """Return ``value`` as a plain Python list; a missing value becomes ``[]``.

    Parquet list columns come back as ``numpy.ndarray`` cells, while rows that
    found no partner in the left merge below carry None/NaN instead.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, (list, tuple)):
        return list(value)
    if pd.isna(value):
        return []
    raise TypeError(f"Expected an array-like or missing value, got {type(value).__name__}")


# Sampling parameters: a fixed seed keeps the sample identical across
# Restart & Run All; the size is capped by the number of available rows.
SAMPLE_SIZE = 50
SAMPLE_SEED = 42

df = pd.read_parquet("pubmed_dump.parquet")
df_entit = pd.read_parquet('entities_09.parquet')
# Left merge keeps every row of `df`; rows without a match in `df_entit`
# get missing values in the columns contributed by `df_entit`.
df = df.merge(df_entit, on='pmid', how='left')

# Normalise every list-valued column to plain Python lists (missing -> []).
for list_col in ['author_list', 'keywords', 'disease', 'chemical', 'organization', 'location']:
    df[list_col] = df[list_col].apply(_to_list)

df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

In [132]:
# Display the merged, sampled DataFrame as this cell's output.
df

Unnamed: 0,pmid,title,abstract,keywords,author_list,url,citation,date,location,organization,person,disease,genetic,chemical
490,19793089,Histological structure and distribution of car...,While the mandibular glands usually consist of...,[],"[T Mizuno, A McKinnon, N Ichihara, T Amasaki, ...",https://pubmed.ncbi.nlm.nih.gov/19793089/,2009 Nov;38(6):449-54.,2009.0,[],[],,[],,[]
615,17177685,Cytokines and soluble CD14 in breast milk in r...,Background: Conflicting evi...,[],"[B E P Snijders, J G M C Damoiseaux, J Penders...",https://pubmed.ncbi.nlm.nih.gov/17177685/,2006 Dec;36(12):1609-15.,2006.0,[],[],,[],,[]
68,31162024,Prevalence and clinical significance of koala ...,Purpose: Koala retrovirus (...,"[Chlamydia pecorum, Phascolarctidae, South Aus...","[Jessica Fabijan, Darren Miller, Olusola Olago...",https://pubmed.ncbi.nlm.nih.gov/31162024/,2019 Jul;68(7):1072-1080.,2019.0,[],[],,[],,[]
123,22551982,Expression and in vitro upregulation of MHCII ...,Understanding and measuring immune activity of...,[],"[Quintin Lau, Paul J Canfield, Damien P Higgins]",https://pubmed.ncbi.nlm.nih.gov/22551982/,2012 Jun 15;147(1-2):35-43.,2012.0,[],[],,[],,[]
707,26713222,Evidence for horizontal gene transfer between ...,"Chlamydia-infecting bacteriophages, members of...","[gokushovirinae, horizontal gene transfer, mic...","[Anne G Rosenwald, Bradley Murray, Theodore To...",https://pubmed.ncbi.nlm.nih.gov/26713222/,2014 Dec 15;4(4):e965076.,2014.0,[],[],,[],,[]
614,29412005,Citizen Science to Communicate about Public He...,There is a lack of research on how to communic...,[],"[Katrien De Cocker, Sebastien F M Chastin, Ils...",https://pubmed.ncbi.nlm.nih.gov/29412005/,2019 Jun;34(7):720-725.,2019.0,[],[],,[],,[]
291,34827969,An Analysis of Demographic and Triage Assessme...,"In the 2019-2020 Australian bushfires, Kangaro...","[Phascolarctidae, burn, mortality, rescue, tra...","[Evie Dunstan, Oliver Funnell, Jenny McLelland...",https://pubmed.ncbi.nlm.nih.gov/34827969/,2021 Nov 12;11(11):3237.,2021.0,[],[],,[],,[]
593,26748912,Obstructive sleep apnoea in children with obesity,Aims: The aim of this study...,"[child, hypoventilation, obesity, sleep apnoea...","[Rubina Kassim, Margaret-Anne Harris, Gary M L...",https://pubmed.ncbi.nlm.nih.gov/26748912/,2016 Mar;52(3):284-90.,2016.0,[],[],,[],,[]
658,1537048,Retinal topography in the koala (Phascolarctos...,Nissl-stained retinal wholemounts were used to...,[],"[K L Schmid, L M Schmid, C F Wildsoet, J D Pet...",https://pubmed.ncbi.nlm.nih.gov/1537048/,1992;39(1):8-16.,1992.0,[],[],,[],,[]
606,9765820,Lipopolysaccharide biosynthesis genes in koala...,We showed in 1988 that there are two strains o...,[],"[A A Girjes, F N Carrick, M F Lavin]",https://pubmed.ncbi.nlm.nih.gov/9765820/,1997 Jun;148(5):413-25.,1997.0,[],[],,[],,[]


In [133]:
class NeoApp:
    """Small convenience wrapper around a Neo4j driver.

    Holds one driver created from ``uri``/``user``/``password`` and offers:

    * ``create_friendship`` / ``find_person`` -- the Person/KNOWS example
      operations, executed inside managed transactions;
    * ``query`` -- run an arbitrary Cypher statement and return its records.

    The instance can be used as a context manager; leaving the ``with`` block
    closes the driver.
    """

    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` block.
        return False

    def close(self):
        """Close the driver connection; call this when finished with the instance."""
        self.driver.close()

    @staticmethod
    def _run_managed(session, mode, work, *args):
        """Run ``work`` in a managed ``mode`` ("read" or "write") transaction.

        Uses ``session.execute_<mode>`` when the session offers it and falls
        back to ``session.<mode>_transaction`` otherwise, so the class works
        with either naming of the managed-transaction API.
        """
        runner = getattr(session, "execute_" + mode, None)
        if runner is None:
            runner = getattr(session, mode + "_transaction")
        return runner(work, *args)

    def create_friendship(self, person1_name, person2_name):
        """Create two Person nodes joined by KNOWS and print the created pair(s)."""
        with self.driver.session(database="neo4j") as session:
            # Managed write transactions let the driver handle retries and transient errors.
            result = self._run_managed(
                session, "write",
                self._create_and_return_friendship, person1_name, person2_name)
            for row in result:
                print("Created friendship between: {p1}, {p2}".format(p1=row['p1'], p2=row['p2']))

    @staticmethod
    def _create_and_return_friendship(tx, person1_name, person2_name):
        """Transaction function: create both persons and the KNOWS relationship.

        Returns a list of ``{"p1": name, "p2": name}`` dicts, one per result row.
        Re-raises ``ServiceUnavailable`` after logging the query that triggered it.
        """
        # To learn more about the Cypher syntax, see https://neo4j.com/docs/cypher-manual/current/
        # The Reference Card is also a good resource for keywords https://neo4j.com/docs/cypher-refcard/current/
        query = (
            "CREATE (p1:Person { name: $person1_name }) "
            "CREATE (p2:Person { name: $person2_name }) "
            "CREATE (p1)-[:KNOWS]->(p2) "
            "RETURN p1, p2"
        )
        try:
            # Keep tx.run inside the try so a failure while sending the query
            # is logged together with the query text, not only failures while
            # consuming the result.
            result = tx.run(query, person1_name=person1_name, person2_name=person2_name)
            return [{"p1": row["p1"]["name"], "p2": row["p2"]["name"]}
                    for row in result]
        # Capture any errors along with the query and data for traceability
        except ServiceUnavailable as exception:
            logging.error("{query} raised an error: \n {exception}".format(
                query=query, exception=exception))
            raise

    def find_person(self, person_name):
        """Print the name of every Person node whose ``name`` equals ``person_name``."""
        with self.driver.session(database="neo4j") as session:
            result = self._run_managed(
                session, "read", self._find_and_return_person, person_name)
            for row in result:
                print("Found person: {row}".format(row=row))

    @staticmethod
    def _find_and_return_person(tx, person_name):
        """Transaction function: return the list of matching person names."""
        query = (
            "MATCH (p:Person) "
            "WHERE p.name = $person_name "
            "RETURN p.name AS name"
        )
        result = tx.run(query, person_name=person_name)
        return [row["name"] for row in result]

    def query(self, query, parameters=None, db=None):
        """Run ``query`` in an auto-commit session and return its records as a list.

        Parameters
        ----------
        query : str
            Cypher statement to execute.
        parameters : dict, optional
            Query parameters passed through to ``session.run``.
        db : str, optional
            Database name; when omitted the session is opened without a
            ``database`` argument.

        Returns
        -------
        list or None
            The materialised records, or ``None`` if anything went wrong.
            Failures are logged and swallowed, so callers must check for
            ``None`` before doing follow-up work that assumes success.
        """
        session_kwargs = {} if db is None else {"database": db}
        try:
            # The context manager closes the session on success and on failure.
            with self.driver.session(**session_kwargs) as session:
                return list(session.run(query, parameters))
        except Exception as exception:
            logging.error("Query failed: %s", exception)
            return None

In [134]:
# Connection settings come from the environment so that no secret lives in the
# notebook. NEO4J_URI / NEO4J_USER fall back to the values used so far;
# the password has no fallback and is prompted for when NEO4J_PASSWORD is unset.
# NOTE: the password that used to be written in this cell has been exposed in
# the notebook's history and should be rotated.
uri = os.environ.get("NEO4J_URI", "neo4j+s://206c60f6.databases.neo4j.io")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
neo_con = NeoApp(uri, user, password)

In [135]:
# Clean graph: delete every node and, through DETACH, every relationship
# attached to it, so the load below starts from an empty graph.
# NeoApp.query is called without `db`, so the session is opened without an
# explicit database name.
query_str = """
    MATCH (n)
    DETACH DELETE n
    """
neo_con.query(query_str)


[]

In [136]:
# Paper nodes are merged and keyed on `pmid` by the load query, so that is the
# property that must be unique. The constraint used to be declared on `p.id`,
# which Paper nodes never carry; a constraint with that definition may still
# exist under the same name from an earlier run, so drop it before recreating.
neo_con.query('DROP CONSTRAINT papers IF EXISTS')
neo_con.query('CREATE CONSTRAINT papers IF NOT EXISTS ON (p:Paper)     ASSERT p.pmid IS UNIQUE')
# Author nodes are merged on `name` by the entity-linking loop.
neo_con.query('CREATE CONSTRAINT authors IF NOT EXISTS ON (a:Author) ASSERT a.name IS UNIQUE')
# NOTE(review): no Category nodes are created in the visible cells -- confirm this is still needed.
neo_con.query('CREATE CONSTRAINT categories IF NOT EXISTS ON (c:Category) ASSERT c.category IS UNIQUE')

[]

In [137]:
# (Paper)
# One Paper node per row, keyed on the integer pmid. The list-valued
# properties are stored on the node first and later turned into separate
# nodes/relationships by the linking queries.
query = '''
        UNWIND $rows AS row
        MERGE (p:Paper {pmid:toInteger(row.pmid)})
        ON CREATE SET
        p.title = row.title,
        p.authors = row.author_list,
        p.keywords = row.keywords,
        p.diseases = row.disease,
        p.chemicals = row.chemical,
        p.locations = row.location,
        p.organizations = row.organization,
        p.date = toInteger(row.date),
        p.url = row.url
        RETURN count(*) as total
        '''
# Ship only the columns the query reads, as one dict per paper.
rows = df.loc[:, [
    'title', 'pmid', 'author_list', 'keywords', 'disease',
    'chemical', 'location', 'organization', 'date', 'url',
]].to_dict(orient='records')
neo_con.query(query, {'rows': rows})

[<Record total=50>]

# (Paper)-[:ABOUT_DISEASE]->(Disease): one Disease node per distinct entry of
# p.diseases, linked to every paper that lists it.
query = '''
        MATCH (p:Paper)
        UNWIND p.diseases AS disease
        WITH  disease, collect(p) AS papers
        MERGE (d:Disease {name:disease})
        WITH d, papers
        UNWIND papers AS p
        WITH d,p
        MERGE (p)-[:ABOUT_DISEASE]->(d);
        '''
link_result = neo_con.query(query)

# NeoApp.query swallows errors and returns None on failure. Only drop the
# source lists once the relationships exist; otherwise the disease data would
# be erased without ever having been linked.
if link_result is None:
    logging.error("Linking Disease nodes failed; leaving p.diseases in place so it can be retried")
else:
    query = '''
        MATCH (p:Paper)
        SET p.diseases = null
        '''
    neo_con.query(query)

In [138]:
# One spec per entity kind that is split out of the Paper nodes:
#   node_type      label of the entity node to merge
#   node_label     Cypher variable used for that node in the generated query
#   property_name  list property on Paper that holds the entity names
#   rel_name       relationship type from Paper to the entity node
author_dic = dict(node_type="Author", node_label="a", property_name="authors", rel_name="AUTHORED")
chem_dic = dict(node_type="Chemical", node_label="c", property_name="chemicals", rel_name="ABOUT_CHEMICAL")
dis_dic = dict(node_type="Disease", node_label="d", property_name="diseases", rel_name="ABOUT_DISEASE")
gen_dic = dict(node_type="Genetic", node_label="g", property_name="genetics", rel_name="ABOUT_GENETIC")
loc_dic = dict(node_type="Location", node_label="l", property_name="locations", rel_name="HAS_LOCATION")
org_dic = dict(node_type="Organization", node_label="o", property_name="organizations", rel_name="HAS_ORG")
key_dic = dict(node_type="Keyword", node_label="k", property_name="keywords", rel_name="HAS_KEYWORD")

# Processing order for the linking loop.
col_list = [
    author_dic,
    chem_dic,
    dis_dic,
    gen_dic,
    loc_dic,
    org_dic,
    key_dic,
]

In [139]:
# For every entity spec: turn the list property stored on Paper into separate
# entity nodes plus Paper->entity relationships, then remove the list property.
# Notes on the current inputs:
#   * `genetics` is not among the properties the Paper load query sets, so the
#     Genetic pass has nothing to unwind;
#   * `diseases` has already been linked and nulled by the dedicated disease
#     queries earlier in the notebook, so the Disease pass is a repeat.
for col in col_list:
    query = f'''
            MATCH (p:Paper)
            UNWIND p.{col["property_name"]} AS property
            WITH  property, collect(p) AS papers
            MERGE ({col["node_label"]}:{col["node_type"]} {{name:property}})
            WITH {col["node_label"]}, papers
            UNWIND papers AS p
            WITH {col["node_label"]},p
            MERGE (p)-[:{col["rel_name"]}]->({col["node_label"]});
            '''
    link_result = neo_con.query(query)

    # NeoApp.query returns None when the statement failed. Skip the destructive
    # clean-up in that case so the source list survives and can be re-linked.
    if link_result is None:
        logging.error(
            "Linking %s nodes failed; leaving p.%s in place so it can be retried",
            col["node_type"], col["property_name"])
        continue

    query = f'''
            MATCH (p:Paper)
            SET p.{col["property_name"]} = null
            '''
    neo_con.query(query)