In [46]:
import pandas as pd
import json
from tqdm import tqdm

In [47]:
# Database wrapper from: https://towardsdatascience.com/create-a-graph-database-in-neo4j-using-python-4172d40f89c4
from neo4j import GraphDatabase


class Neo4jConnection:

    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

port = 7687 # Check if is the case for your server!

conn = Neo4jConnection(uri="bolt://localhost:"+str(port),
                       user="driver",
                       pwd="driver")

In [48]:
import time


def insert_data(query, rows, batch_size = 10000):
    # Function to handle the updating the Neo4j database in batch mode.

    total = 0
    batch = 0
    start = time.time()
    result = None

    while batch * batch_size < len(rows):
        batch_start = time.time()
        res = conn.query(query,
                         parameters = {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')})
        total += res[0]['total']
        batch += 1
        result = {"batch_size": batch_size,
                  "batches_done": batch,
                  "batch_time": time.time() - batch_start,
                  "total_time": time.time()-start}
        print(result)

    return result

def add_papers(rows, batch_size=5000):
   # Adds paper nodes and relationships (:Author)-[:AUTHORED]-(:Paper), (:Paper)-[:REFERENCES]-(:Paper)
   query = '''
    // Create papers
    UNWIND $rows as paper
    MERGE (p:Paper {paperid: paper.id})
    ON CREATE SET
    p.title = paper.title,
    p.year = paper.year,
    p.n_citation = paper.n_citation,
    p.doi = paper.doi

    // Match authors
    WITH paper, p
    UNWIND  paper.authors AS author
    MERGE (a:Author {authorid: author.id})
    ON CREATE SET a.name = author.name
    MERGE (a)-[:AUTHORED]->(p)

    // Match references
    WITH paper, p
    UNWIND  paper.references AS refid
    MATCH (r:Paper {paperid:refid})
    MERGE (p)-[:REFERENCES]->(r)
    RETURN count(p:Paper) as total
   '''

   return insert_data(query, rows, batch_size)

In [51]:
file = "data/dblp_papers_v11.txt"

subset = ["id", "title", "year", "n_citation", "doi", "authors", "references"]

# TODO: Might be possible to speed up inserts by using just the name for matching instead of ID https://stackoverflow.com/a/23609143/9994398
with open(file, 'r') as f:
    while True:
        try:
            lines = 10000
            rows  = []
            for line in tqdm(f):
                rows.append(json.loads(line))
                lines -= 1
                if lines == 0: break
            df = pd.DataFrame(rows)
            add_papers(df[subset], 500)
        except Exception as e:
            print(e)
            break

9999it [00:00, 10176.36it/s]


{'total_inserted': 3, 'batches_done': 1, 'total_time': 4.5319390296936035}
{'total_inserted': 3, 'batches_done': 2, 'total_time': 10.221335411071777}
{'total_inserted': 9, 'batches_done': 3, 'total_time': 21.46748113632202}
{'total_inserted': 15, 'batches_done': 4, 'total_time': 34.82120728492737}
{'total_inserted': 15, 'batches_done': 5, 'total_time': 51.59878206253052}
{'total_inserted': 20, 'batches_done': 6, 'total_time': 71.70696544647217}
{'total_inserted': 32, 'batches_done': 7, 'total_time': 95.49982571601868}
{'total_inserted': 32, 'batches_done': 8, 'total_time': 125.41570115089417}
{'total_inserted': 49, 'batches_done': 9, 'total_time': 162.8287422657013}
{'total_inserted': 60, 'batches_done': 10, 'total_time': 196.98083209991455}
{'total_inserted': 72, 'batches_done': 11, 'total_time': 246.89355731010437}
{'total_inserted': 81, 'batches_done': 12, 'total_time': 296.36041474342346}
{'total_inserted': 93, 'batches_done': 13, 'total_time': 342.64794850349426}
{'total_inserted'

9999it [00:00, 11209.04it/s]


{'total_inserted': 10, 'batches_done': 1, 'total_time': 67.93622064590454}
{'total_inserted': 34, 'batches_done': 2, 'total_time': 141.68059134483337}
{'total_inserted': 54, 'batches_done': 3, 'total_time': 222.92155408859253}
{'total_inserted': 94, 'batches_done': 4, 'total_time': 302.74795722961426}
{'total_inserted': 109, 'batches_done': 5, 'total_time': 378.2459383010864}
{'total_inserted': 127, 'batches_done': 6, 'total_time': 456.53497982025146}
{'total_inserted': 133, 'batches_done': 7, 'total_time': 544.5719645023346}
{'total_inserted': 163, 'batches_done': 8, 'total_time': 627.1456837654114}
{'total_inserted': 198, 'batches_done': 9, 'total_time': 732.5805420875549}
{'total_inserted': 220, 'batches_done': 10, 'total_time': 845.0952622890472}



In [53]:
def get_stack_exchange_df():
    path = "data/Posts.xml"
    with open(path, 'r', encoding='utf8') as f:
        try:
            raw_data = f.read()
        except Exception as e:
            print(e)
    return pd.read_xml(raw_data)

df = get_stack_exchange_df()
df.head()

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,...,CommentCount,FavoriteCount,ContentLicense,LastEditorDisplayName,LastEditDate,LastEditorUserId,CommunityOwnedDate,ParentId,ClosedDate,OwnerDisplayName
0,1,1,15.0,2010-07-19T19:12:12.510,49,5012.0,<p>How should I elicit prior distributions fro...,8.0,2020-11-05T09:44:51.710,Eliciting priors from experts,...,1,34.0,CC BY-SA 2.5,,,,,,,
1,2,1,59.0,2010-07-19T19:12:57.157,34,33292.0,<p>In many different statistical methods there...,24.0,2022-01-29T15:05:59.723,What is normality?,...,1,12.0,CC BY-SA 2.5,user88,2010-08-07T17:56:44.800,,,,,
2,3,1,5.0,2010-07-19T19:13:28.577,71,6528.0,<p>What are some valuable Statistical Analysis...,18.0,2013-05-27T14:48:36.927,What are some valuable Statistical Analysis op...,...,3,40.0,CC BY-SA 2.5,,2011-02-12T05:50:03.667,183.0,2010-07-19T19:13:28.577,,,
3,4,1,135.0,2010-07-19T19:13:31.617,23,43160.0,<p>I have two groups of data. Each with a dif...,23.0,2010-09-08T03:00:19.690,Assessing the significance of differences in d...,...,2,12.0,CC BY-SA 2.5,,,,,,,
4,5,2,,2010-07-19T19:14:43.050,90,,"<p>The R-project</p>\n\n<p><a href=""http://www...",23.0,2010-07-19T19:21:15.063,,...,3,,CC BY-SA 2.5,,2010-07-19T19:21:15.063,23.0,2010-07-19T19:14:43.050,3.0,,
