# SciPi Spark Implementation - Part 2

## Load CSV in Neo4j

#### Joseph Azzopardi & Andrew Cachia

In [None]:
from py2neo import Graph
from py2neo import Node
from py2neo import Relationship

In [None]:
import time
# Wall-clock timestamp taken before any Neo4j work; the final cell subtracts
# it from a second timestamp to print the total runtime in seconds.
start = time.time()

# Connection Configuration

In [None]:
#connect to Neo4j API

def neo4jConnect(IP, boltPort, username, pwd, httpPort):
    """Create a py2neo ``Graph`` for a Neo4j server reached over Bolt.

    Args:
        IP: Host name or IP address of the Neo4j server.
        boltPort: Bolt port, given either as a string or as an integer.
        username: Neo4j user name.
        pwd: Password for ``username``.
        httpPort: Value forwarded to ``Graph`` as ``http_port``.

    Returns:
        The ``Graph`` object built for ``bolt://<IP>:<boltPort>``. It is also
        printed before being returned.
    """
    # str.format() instead of "+" concatenation: an integer port (the form
    # already used for httpPort) would otherwise raise TypeError.
    bolturl = "bolt://{}:{}".format(IP, boltPort)
    mygraph = Graph(bolturl, user=username, password=pwd, bolt=True, secure=False, http_port=httpPort)
    print(mygraph)

    return mygraph

In [None]:
#Connect to Neo4j Enterprise on Azure

# Each setting can be overridden through an environment variable so that
# host names and credentials do not have to be edited into the notebook.
# The fallback values are the ones that were previously hard-coded, so the
# behaviour is unchanged when none of the variables is set.
import os

IP = os.environ.get("NEO4J_HOST", "neo4j-custom") # IP Address of Neo4j Container
boltPort = os.environ.get("NEO4J_BOLT_PORT", "7687")
httpPort = int(os.environ.get("NEO4J_HTTP_PORT", "7474"))
user = os.environ.get("NEO4J_USER", "neo4j")
pwd  = os.environ.get("NEO4J_PASSWORD", "test")

# Shared connection used by every later cell.
mygraph = neo4jConnect(IP, boltPort, user, pwd, httpPort)

# Create Constraints

In [None]:
# One uniqueness constraint per node label. The constrained property is the
# key that the later import queries use to look nodes up.

# Author.authorid
cquery1 = """
CREATE 
CONSTRAINT ON (n:Author) 
ASSERT n.authorid IS UNIQUE
"""

# Paper.paperid
cquery2 = """
CREATE 
CONSTRAINT ON (n:Paper)
ASSERT n.paperid IS UNIQUE
"""

# Publishers.publisherid
cquery3 = """
CREATE 
CONSTRAINT ON (n:Publishers)
ASSERT n.publisherid IS UNIQUE
"""

# ConfInstance.confid
cquery4 = """
CREATE 
CONSTRAINT ON (n:ConfInstance) 
ASSERT n.confid IS UNIQUE
"""

# Journal.journalid
cquery5 = """
CREATE 
CONSTRAINT ON (n:Journal)
ASSERT n.journalid IS UNIQUE
"""

# Institution.institutionid
cquery6 = """
CREATE 
CONSTRAINT ON (n:Institution)
ASSERT n.institutionid IS UNIQUE
"""

# Keyword.name
cquery7 = """
CREATE 
CONSTRAINT ON (n:Keyword) 
ASSERT n.name IS UNIQUE
"""

# Submit all seven statements, in definition order, inside one transaction.
tx = mygraph.begin()
for constraint_query in (cquery1, cquery2, cquery3, cquery4, cquery5, cquery6, cquery7):
    tx.run(constraint_query)
tx.commit()

## Node Queries

#### Author Nodes 

In [None]:
# sample: 2797,Elena Frantova,3,28

# Cypher template run by importCsv with {csvfile} bound to the CSV URL.
# The node is merged on the constrained key (authorid) only and the name is
# assigned when the node is created. Merging on the whole property map would
# try to create a second node - and violate the uniqueness constraint -
# whenever the same authorid shows up with a different name, and MERGE also
# rejects null property values (e.g. a row with a missing column).
query_author_nodes = """
    USING PERIODIC COMMIT 1000
    LOAD CSV FROM {csvfile}  AS line
    MERGE (a:Author { authorid: line[0] })
    ON CREATE SET a.name = line[1]
    """
#CREATE (:author { authorid: line[0], name: line[1], PaperCount: toInteger(line[2]), CiteCount: toInteger(line[3]) })

#### Paper Nodes 

In [None]:
# sample: 1979425243,Journal,Kinesin superfamily protein member 4 (KIF4) is localized to midzone and midbody in dividing cells,2004

# Cypher template run by importCsv with {csvfile} bound to the CSV URL.
# The node is merged on the constrained key (paperid) only; the remaining
# columns are written with SET. Merging on the whole property map fails with
# a constraint violation when a Paper with the same paperid but different (or
# no) other properties already exists - for example a Paper created with only
# its paperid by the keyword import - and MERGE also rejects null property
# values. Plain SET (rather than ON CREATE SET) is used so that such
# pre-existing papers still receive their doc_type, title and year.
query_paper_nodes = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MERGE (p:Paper { paperid: line[0] })
        SET p.doc_type = line[1], p.title = line[2], p.year = line[3]
        """

##### Publisher Nodes

In [None]:
# sample: 8589934592,Western Economic Association International

# Cypher template run by importCsv with {csvfile} bound to the CSV URL.
# The node is merged on the constrained key (publisherid) only and the name
# is assigned on creation; merging on the whole property map would violate
# the uniqueness constraint when the same publisherid appears with a
# different name, and MERGE rejects null property values.
query_publisher_nodes = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MERGE (p:Publishers { publisherid: line[0] })
        ON CREATE SET p.name = line[1]
        """

##### ConferenceInstance  Nodes

In [None]:
# sample: 31227610,eurocon 2011,"Lisbon, Portugal"

# Cypher template run by importCsv with {csvfile} bound to the CSV URL.
# The node is merged on the constrained key (confid) only; name and location
# are assigned on creation. Merging on the whole property map would violate
# the uniqueness constraint when the same confid appears with a different
# name or location, and MERGE rejects null property values (e.g. a row
# without a location column).
query_conferenceinstance_nodes = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MERGE (c:ConfInstance { confid: line[0] })
        ON CREATE SET c.name = line[1], c.location = line[2]
        """

#### Journal Nodes

In [None]:
# sample: 18204665,international journal of multiphase flow

# Cypher template run by importCsv with {csvfile} bound to the CSV URL.
# The node is merged on the constrained key (journalid) only and the name is
# assigned on creation; merging on the whole property map would violate the
# uniqueness constraint when the same journalid appears with a different
# name, and MERGE rejects null property values.
query_journal_nodes = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MERGE (j:Journal { journalid: line[0] })
        ON CREATE SET j.name = line[1]
        """

##### Institutions Nodes

In [None]:
# sample: 4,USSR Academy of Medical Sciences

# Cypher template run by importCsv with {csvfile} bound to the CSV URL.
# The node is merged on the constrained key (institutionid) only and the
# name is assigned on creation; merging on the whole property map would
# violate the uniqueness constraint when the same institutionid appears with
# a different name, and MERGE rejects null property values.
query_institution_nodes = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MERGE (i:Institution { institutionid: line[0] })
        ON CREATE SET i.name = line[1]
        """

##### Keywords

In [None]:
# Cypher template for the keyword CSVs ({csvfile} is supplied as a query
# parameter). For each row, column 0 is used as a paperid and column 1 as a
# keyword name. All three steps use MERGE, so the Paper node, the Keyword
# node and the Contains relationship are each created only if they do not
# exist yet. A Paper created here carries only its paperid.
# Commits every 5000 rows (the other import queries use 1000).
query_keywords = """
        USING PERIODIC COMMIT 5000
        LOAD CSV FROM {csvfile}  AS line
        MERGE (paper:Paper{paperid:line[0]})
        MERGE (keyword:Keyword{name:line[1]})
        MERGE (paper)-[:Contains]->(keyword)
        """

## Relationships Queries

##### Author-Author Relationships - Author Collaborations

In [None]:
# sample: 

# Earlier variant, kept for reference: it additionally stored column 2 as a
# "collaborations" property on the relationship.
#query_rel_auth_auth = """
#        USING PERIODIC COMMIT 1000
#        LOAD CSV FROM {csvfile}  AS line
#        MATCH (a:Author { authorid: line[0] })
#        MATCH (b:Author { authorid: line[1] })
#        CREATE (a)-[r:co_author { collaborations: line[2] }]->(b);
#        """

# Cypher template for the author-author CSVs ({csvfile} is supplied as a
# query parameter). Columns 0 and 1 are both looked up as Author.authorid;
# a co_author relationship is created from the first author to the second.
# Rows for which either MATCH finds no node produce nothing. CREATE always
# adds a new relationship, so loading the same file twice duplicates them.
query_rel_auth_auth = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MATCH (a:Author { authorid: line[0] })
        MATCH (b:Author { authorid: line[1] })
        CREATE (a)-[r:co_author]->(b);
        """

##### Paper to Publisher Relationship

In [None]:
# sample:  1968760085,1709396983808   (paperId, publisherId)

# Cypher template for the paper-publisher CSVs ({csvfile} is supplied as a
# query parameter). Column 0 is looked up as Paper.paperid and column 1 as
# Publishers.publisherid; a published_by relationship is created from the
# paper to the publisher. Rows for which either MATCH finds no node produce
# nothing. CREATE always adds a new relationship, so loading the same file
# twice duplicates them.
query_rel_pub_paper = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MATCH (p:Paper {paperid: line[0]} )
        MATCH (publisher:Publishers {publisherid: line[1]} )
        CREATE (p)-[:published_by ]->(publisher);
        """

##### Paper to ConfInstance Relationship

In [None]:
# sample: 2140101510,2624888355   (paperId, conferenceInstanceId)

# Cypher template for the paper-conference CSVs ({csvfile} is supplied as a
# query parameter). Column 0 is looked up as Paper.paperid and column 1 as
# ConfInstance.confid; a conf_part_of relationship is created from the paper
# to the conference instance. Rows for which either MATCH finds no node
# produce nothing. CREATE always adds a new relationship, so loading the same
# file twice duplicates them.
query_rel_conf_paper = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MATCH (p:Paper {paperid: line[0]} )
        MATCH (c:ConfInstance {confid: line[1]} )
        CREATE (p)-[:conf_part_of]->(c);
        """

##### Paper to Journal Relationship

In [None]:
# sample: 2374592160,2764507941 (paperId, journalId)

# Cypher template for the paper-journal CSVs ({csvfile} is supplied as a
# query parameter). Column 0 is looked up as Paper.paperid and column 1 as
# Journal.journalid; a journal_part_of relationship is created from the paper
# to the journal. Rows for which either MATCH finds no node produce nothing.
# CREATE always adds a new relationship, so loading the same file twice
# duplicates them.
query_rel_journal_paper = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MATCH (p:Paper {paperid: line[0]} )
        MATCH (j:Journal {journalid: line[1]} )
        CREATE (p)-[:journal_part_of]->(j);
        """

##### Paper-Author Relationships

In [None]:
# sample author:    15,199142497,A      (paperid, authorid, relationship_type)
# sample co-author: 15,680395887,CO_A   (paperid, authorid, relationship_type) 


# Cypher template for the paper-author CSVs ({csvfile} is supplied as a query
# parameter). Column 0 is looked up as Paper.paperid and column 1 as
# Author.authorid; an Authored relationship is created from the paper to the
# author, with column 2 stored in its Type property. Rows for which either
# MATCH finds no node produce nothing. CREATE always adds a new relationship,
# so loading the same file twice duplicates them.
query_rel_paper_author = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MATCH (p:Paper { paperid: line[0]})
        MATCH (a:Author { authorid: line[1] })
        CREATE (p)-[ :Authored{Type:line[2]} ]->(a);
        """


##### Author-Institution Relationships

In [None]:
# sample: 2430849057,93   (distData.authorId, distData.institutionId)

# Cypher template for the author-institution CSVs ({csvfile} is supplied as
# a query parameter). Column 0 is looked up as Author.authorid and column 1
# as Institution.institutionid; a member_of relationship is created from the
# author to the institution. Rows for which either MATCH finds no node
# produce nothing. CREATE always adds a new relationship, so loading the same
# file twice duplicates them.
query_rel_author_inst = """
        USING PERIODIC COMMIT 1000
        LOAD CSV FROM {csvfile}  AS line
        MATCH (a:Author {authorid: line[0]} )
        MATCH (i:Institution {institutionid: line[1]} )
        CREATE (a)-[:member_of]->(i);
        """



## Import CSV to Neo4j Function

In [None]:
def importCsv(filename, query):
    """Run one LOAD CSV query on ``mygraph`` for a single file.

    The file name is printed, prefixed with ``file:///`` and handed to the
    query as its ``csvfile`` parameter.

    Args:
        filename: Path of the CSV file, appended verbatim to ``file:///``.
        query: Cypher text that reads the ``csvfile`` parameter.
    """
    print(filename)
    # Alternative base URL (Azure blob storage), currently unused:
    #csv_file_base = "https://ics5114mag.blob.core.windows.net/parsed-csv-files/"
    mygraph.run(query, parameters={"csvfile": "file:///" + filename})

# Loading CSV Files

In [None]:
### From Azure
#from azure.storage.blob import BlockBlobService
#blob_service = BlockBlobService(account_name="ics5114mag",account_key="<AZURE_STORAGE_ACCOUNT_KEY>")  # key redacted: never commit the real key; load it from the environment
#generator = blob_service.list_blobs("parsed-csv-files")

### Locally
import os
root_dir = "/home/data"

# Every file below root_dir, stored as a path relative to root_dir
# (directory part from os.path.relpath, joined with the file name).
generator = {
    os.path.join(os.path.relpath(parent, root_dir), entry)
    for parent, _, entries in os.walk(root_dir)
    for entry in entries
}

In [None]:
def filterCsv(directory, blob, query):
    """Import ``blob`` with ``query`` if it is a CSV below ``results/<directory>``.

    Args:
        directory: Sub-directory (relative to ``results/``) the file must be in.
        blob: File path to test; passed on to ``importCsv`` unchanged.
        query: Cypher text forwarded to ``importCsv``.

    Nothing happens (and ``None`` is returned) when ``blob`` does not end in
    ``.csv`` or does not start with ``"results/" + directory``.
    """
    # With the Azure listing the path would come from the blob object instead:
    #filename = blob.name
    if not blob.endswith(".csv"):
        return
    if not blob.startswith("results/" + directory):
        return
    importCsv(blob, query)

In [None]:
# Load Author nodes from every CSV under results/authors/.
for csv_path in generator:
    filterCsv("authors/", csv_path, query_author_nodes)

In [None]:
# Load Paper nodes from every CSV under results/papers/.
for csv_path in generator:
    filterCsv("papers/", csv_path, query_paper_nodes)

In [None]:
# Load Publishers nodes from every CSV under results/publishers/.
for csv_path in generator:
    filterCsv("publishers/", csv_path, query_publisher_nodes)

In [None]:
# Load ConfInstance nodes from every CSV under results/conferenceinstance/.
for csv_path in generator:
    filterCsv("conferenceinstance/", csv_path, query_conferenceinstance_nodes)

In [None]:
# Load Journal nodes from every CSV under results/journals/.
for csv_path in generator:
    filterCsv("journals/", csv_path, query_journal_nodes)

In [None]:
# Load Institution nodes from every CSV under results/institutions/.
for csv_path in generator:
    filterCsv("institutions/", csv_path, query_institution_nodes)

In [None]:
# Load keywords (and their links to papers) from every CSV under results/keywords/.
for csv_path in generator:
    filterCsv("keywords/", csv_path, query_keywords)

In [None]:
# Create paper -> publisher relationships from results/paper-publisher-rel/.
for csv_path in generator:
    filterCsv("paper-publisher-rel/", csv_path, query_rel_pub_paper)

In [None]:
# Create paper -> conference instance relationships from results/paper-confinstance-rel/.
for csv_path in generator:
    filterCsv("paper-confinstance-rel/", csv_path, query_rel_conf_paper)

In [None]:
# Create paper -> journal relationships from results/paper-journal-rel/.
for csv_path in generator:
    filterCsv("paper-journal-rel/", csv_path, query_rel_journal_paper)

In [None]:
# Create paper -> author relationships from results/papers-author-rel/.
for csv_path in generator:
    filterCsv("papers-author-rel/", csv_path, query_rel_paper_author)

In [None]:
# Create author -> institution relationships from results/author-institution-rel/.
for csv_path in generator:
    filterCsv("author-institution-rel/", csv_path, query_rel_author_inst)

In [None]:
# Create author -> author (co_author) relationships from results/author-author-rel/.
for csv_path in generator:
    filterCsv("author-author-rel/", csv_path, query_rel_auth_auth)

In [None]:
# Total wall-clock runtime in seconds since `start` was recorded.
end = time.time()
elapsed_seconds = end - start
print(elapsed_seconds)