In [19]:
import pandas as pd
import json
from tqdm import tqdm


# file = "sample_data/citation_sample.txt"
file = "data/dblp_papers_v11.txt"

metadata  = []

lines = 5000    # 100k for testing

with open(file, 'r') as f:

    for line in tqdm(f):
        metadata.append(json.loads(line))
        lines -= 1
        if lines == 0: break

df = pd.DataFrame(metadata)

4999it [00:01, 4640.99it/s]


In [20]:
df.head()

Unnamed: 0,id,title,authors,venue,year,n_citation,page_start,page_end,doc_type,publisher,volume,issue,fos,doi,references,indexed_abstract
0,100001334,Ontologies in HYDRA - Middleware for Ambient I...,"[{'name': 'Peter Kostelnik', 'id': '2702511795...",{'raw': 'AMIF'},2009,2,43,46,,,,,"[{'name': 'Lernaean Hydra', 'w': 0.4178039}, {...",,,
1,1000018889,Remote Policy Enforcement for Trusted Applicat...,"[{'name': 'Fabio Martinelli', 'id': '210743870...",{'raw': 'international conference on trusted s...,2013,2,70,84,Conference,"Springer, Cham",,,"[{'name': 'Trusted Computing', 'w': 0.6314859}...",10.1007/978-3-319-03491-1_5,"[94181602, 1504669610, 1542792105, 1639158619,...","{'IndexLength': 173, 'InvertedIndex': {'Both':..."
2,1000022707,A SIMPLE OBSERVATION REGARDING ITERATIONS OF F...,"[{'name': 'Jerzy Mycka', 'id': '263067851'}]","{'raw': 'Reports on Mathematical Logic', 'id':...",2009,0,19,29,Journal,,44.0,,"[{'name': 'Discrete mathematics', 'w': 0.47368...",,"[1972178849, 2069792094]","{'IndexLength': 49, 'InvertedIndex': {'A': [0]..."
3,100004108,Gait based human identity recognition from mul...,"[{'name': 'Emdad Hossain', 'id': '2017661848',...",{'raw': 'international conference on algorithm...,2012,0,319,328,Conference,"Springer, Berlin, Heidelberg",,,"[{'name': 'Biometrics', 'w': 0.529778063}, {'n...",10.1007/978-3-642-33065-0_34,"[1578000111, 2120433720, 2136461127, 213893135...","{'IndexLength': 82, 'InvertedIndex': {'In': [0..."
4,10000571,The GAME Algorithm Applied to Complex Fraction...,"[{'name': 'Pavel Kordík', 'id': '419063071', '...",{'raw': 'international conference on artificia...,2008,5,859,868,Conference,"Springer, Berlin, Heidelberg",,,"[{'name': 'Pattern recognition', 'w': 0.453429...",10.1007/978-3-540-87559-8_89,"[291899685, 1964166287, 2135293965, 2146842127...","{'IndexLength': 171, 'InvertedIndex': {'Comple..."


In [6]:
from neo4j import GraphDatabase


class Neo4jConnection:

    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

port = 7687 # Check if is the case for your server!

conn = Neo4jConnection(uri="bolt://localhost:"+str(port),
                       user="driver",
                       pwd="driver")

In [8]:
conn.query('CREATE CONSTRAINT papers IF NOT EXISTS ON (p:Paper)     ASSERT p.id IS UNIQUE')
conn.query('CREATE CONSTRAINT authors IF NOT EXISTS ON (a:Author) ASSERT a.name IS UNIQUE')
conn.query('CREATE CONSTRAINT categories IF NOT EXISTS ON (c:Category) ASSERT c.category IS UNIQUE')

[]

In [59]:
conn.query('DROP CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE')

[]

In [90]:
import time


def insert_data(query, rows, batch_size = 10000):
    # Function to handle the updating the Neo4j database in batch mode.

    total = 0
    batch = 0
    start = time.time()
    result = None

    while batch * batch_size < len(rows):

        res = conn.query(query,
                         parameters = {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')})
        total += res[0]['total']
        batch += 1
        result = {"total":total,
                  "batches":batch,
                  "time":time.time()-start}
        print(result)

    return result

def add_categories(categories):
    # Adds category nodes to the Neo4j graph.
    query = '''
            UNWIND $rows AS row
            MERGE (c:Category {category: row.category})
            RETURN count(*) as total
            '''
    return conn.query(query, parameters = {'rows':categories.to_dict('records')})


def add_authors(rows, batch_size=10000):
    # Adds author nodes to the Neo4j graph as a batch job.
    query = '''
            UNWIND $rows AS row
            MERGE (a:Author {name: row.name})
            RETURN count(*) as total
            '''
    return insert_data(query, rows, batch_size)

def add_papers(rows, batch_size=1000):
   # Adds paper nodes and (:Author)--(:Paper)
   query = '''
    // Create papers
    UNWIND $rows as paper
    MERGE (p:Paper {paperid: paper.id})
    ON CREATE SET
    p.title = paper.title,
    p.year = paper.year,
    p.n_citation = paper.n_citation,
    p.doi = paper.doi

    // Match authors
    // WITH paper, p
    // UNWIND  paper.authors AS author
    // MATCH (a:Author {name: author})
    // MERGE (a)-[:AUTHORED]->(p)

    // Match references
    WITH paper, p
    UNWIND  paper.references AS refid
    OPTIONAL MATCH (r:Paper {paperid:refid})
    MERGE (p)-[:references]->(r)
    RETURN count(p:Paper) as total
   '''

   return insert_data(query, rows, batch_size)


In [72]:
# categories = pd.DataFrame(df[['category_list']])
# categories.rename(columns={'category_list':'category'}, inplace=True)
# categories = categories.explode('category').drop_duplicates(subset=['category'])
#
authors = pd.DataFrame.from_records(
    df[['authors']].explode('authors').authors.values.tolist()
).drop_duplicates(subset=['name'])

# add_categories(categories)
add_authors(authors)
add_papers(df)

{'total': 10000, 'batches': 1, 'time': 14.639893770217896}
{'total': 12781, 'batches': 2, 'time': 26.440998792648315}


{'total': 12781, 'batches': 2, 'time': 26.440998792648315}

In [91]:
add_papers(df)

{'total': 7, 'batches': 1, 'time': 15.533691644668579}
{'total': 17, 'batches': 2, 'time': 32.37181043624878}
{'total': 22, 'batches': 3, 'time': 48.01698684692383}
{'total': 28, 'batches': 4, 'time': 63.595189571380615}
{'total': 37, 'batches': 5, 'time': 79.46612858772278}


{'total': 37, 'batches': 5, 'time': 79.46612858772278}