In [1]:
from neo4j import GraphDatabase
import pandas as pd
import json
from tqdm import tqdm

In [2]:
# Database wrapper from: https://towardsdatascience.com/create-a-graph-database-in-neo4j-using-python-4172d40f89c4


class Neo4jConnection:

    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

port = 7687 # Check if is the case for your server!

conn = Neo4jConnection(uri="bolt://localhost:"+str(port),
                       user="driver",
                       pwd="driver")

In [3]:
import time


def insert_data(query, rows, batch_size = 10000):
    # Function to handle the updating the Neo4j database in batch mode.

    total = 0
    batch = 0
    start = time.time()
    result = None

    for batch in tqdm(range(len(rows) // batch_size + 1)):
        batch_start = time.time()
        res = conn.query(query,
                         parameters = {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')})
        total += res[0]['total']
        result = {"batch_size": batch_size,
                  "batches_done": batch,
                  "batch_time": time.time() - batch_start,
                  "total_time": time.time()-start}
        # print(result)

    return result

def add_papers(rows, batch_size=5000):
   # Adds paper nodes and relationships (:Author)-[:AUTHORED]-(:Paper), (:Paper)-[:REFERENCES]-(:Paper)
   query = '''
    // Create papers
    UNWIND $rows as paper
    MERGE (p:Paper {paperid: paper.id})
    ON CREATE SET
    p.title = paper.title,
    p.year = paper.year,
    p.n_citation = paper.n_citation,
    p.doi = paper.doi

    // Match authors
    WITH paper, p
    UNWIND  paper.authors AS author
    MERGE (a:Author {authorid: author.id})
    ON CREATE SET a.name = author.name
    MERGE (a)-[:AUTHORED]->(p)

    // Match references
    WITH paper, p
    UNWIND  paper.references AS refid
    MATCH (r:Paper {paperid:refid})
    MERGE (p)-[:REFERENCES]->(r)
    RETURN count(p:Paper) as total
   '''

   return insert_data(query, rows, batch_size)

In [4]:
conn.query('CREATE INDEX paper_id_index IF NOT EXISTS FOR (p:Paper) ON (p.paperid);')
conn.query('CREATE INDEX author_id_index IF NOT EXISTS FOR (a:Author) ON (a.authorid);')

Query failed: Couldn't connect to localhost:7687 (resolved to ('[::1]:7687', '127.0.0.1:7687')):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Query failed: Couldn't connect to localhost:7687 (resolved to ('[::1]:7687', '127.0.0.1:7687')):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)


In [5]:
file = "data/dblp_papers_v11.txt"
n_papers = 4107340 # Number of papers in the dataset
subset = ["id", "title", "year", "n_citation", "doi", "authors", "references"]

# TODO: Add tqdm max of the number of papers
with open(file, 'r') as f:
    for episode in tqdm(range(n_papers // 100000)):
        try:
            lines = 100000
            rows  = []
            for line in f:
                rows.append(json.loads(line))
                lines -= 1
                if lines == 0: break
            df = pd.DataFrame(rows)
            add_papers(df[subset], 1000)
        except Exception as e:
            print(e)
            break

  0%|          | 0/41 [00:00<?, ?it/s]
  0%|          | 0/100 [00:04<?, ?it/s][A
  0%|          | 0/41 [00:14<?, ?it/s]

Query failed: Couldn't connect to localhost:7687 (resolved to ('[::1]:7687', '127.0.0.1:7687')):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
'NoneType' object is not subscriptable





In [4]:
import re


def process_stackexchange(board: str) -> list[pd.DataFrame]:
    """_summary_

    Args:
        board (str): Stackexchange board to load. Expected file structure is
            ./data/`board`/*.xml

    Returns:
        list[pd.DataFrame]: List of pandas dataframes with Comments and
            PostHistory data with extracted DOIs and markdown links.
    """
    # Regex patterns
    DOI_PATTERN = "10\.\d{4,9}/[-._;\(\)/:A-Z0-9]+[/A-Z0-9]"
    MD_PATTERN = "\[([\w\s\d]+)\](https?:\/\/[\w\d./?=#]+)"

    def process_set(set: str) -> pd.DataFrame:
        """Processes single file in board data

        Args:
            set (str): One of "Comments" or "PostHistory"

        Returns:
            pd.DataFrame: Original data with extracted DOIs and Markdown links
        """
        # Load xml file
        path = f"data/{board}/{set}.xml"
        with open(path, encoding="utf8") as file:
            df = pd.read_xml(file.read())

        # Extract all DOIs using pattern defined before to list per row
        df["DOIs"] = df["Text"].apply(lambda text: str(re.findall(DOI_PATTERN, str(text))))
        df = df[df["DOIs"] != "[]"]
        return df

    return process_set("Comments"), process_set("PostHistory")

In [10]:
def add_boards(rows, batch_size=5000):
    # Adds board nodes
    query = '''
    UNWIND $rows as board
    MERGE (b:Board {boardname: board.name})
    RETURN count(b:Board) as total
    '''
    return insert_data(query, rows, batch_size)

In [6]:
def add_posts(rows, batch_size=5000):
   # Adds post nodes
   query = '''
    // Create posts
    UNWIND $rows as post
    MERGE (p:Post {postid: post.PostId})
    ON CREATE SET
    p.text = post.Text,

    WITH post, p
    UNWIND post.DOIs AS doi
    MATCH (r:Paper {doi:doi})
    MERGE (p)-[:REFERENCES]->(r)
    RETURN count(p:Post) as total
   '''
   return insert_data(query, rows, batch_size)

In [7]:
def add_comments(rows, batch_size=500):
   # Adds comment nodes
   query = '''
    // Create comments
    UNWIND $rows as comment
    MERGE (c:Comment {commentid: comment.id})
    ON CREATE SET
    p.title = paper.title

    // Match posts
    WITH comment, c
    MATCH (p:Post {postid: comment.PostId})
    MERGE (c)-[:RESPONDS]->(p)

    // Match references
    WITH comment, c
    UNWIND comment.DOIs AS doi
    MATCH (r:Paper {doi:doi})
    MERGE (c)-[:REFERENCES]->(r)
    RETURN count(c:Comment) as total
   '''
   return insert_data(query, rows, batch_size)

In [8]:
def add_post_links(rows, batch_size=5000):
   # Adds post nodes
   query = '''
    // Create posts
    UNWIND $rows as post_link
    MATCH (p:Post {postid:post_link.postid})
    MERGE (c)-[:REFERENCES]->(r)
    RETURN count(p:Post) as total
   '''
   return insert_data(query, rows, batch_size)

In [12]:
se_boards = ["ai", "cstheory", "datascience", "softwarerecs", "stats"]
add_boards(pd.DataFrame({"name": se_boards}), batch_size=1)

100%|██████████| 5/5 [00:02<00:00,  2.00it/s]


{'batch_size': 1,
 'batches_done': 4,
 'batch_time': 0.005155801773071289,
 'total_time': 2.497079610824585}

In [None]:
for board in se_boards:
    comments_df, posthistory_df = process_stackexchange("stats")
    posthistory_df = posthistory_df.sort_values('CreationDate').drop_duplicates(subset=["PostId", "DOIs"])
    add_posts(posthistory_df)
    add_comments(comments_df)
    # add_post_links(posthistory_df)
    del(comments_df, posthistory_df)