In [None]:
from neo4j import GraphDatabase
import pandas as pd
import json
from tqdm import tqdm

In [2]:
# Database wrapper from: https://towardsdatascience.com/create-a-graph-database-in-neo4j-using-python-4172d40f89c4


class Neo4jConnection:

    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

port = 7687 # Check if is the case for your server!

conn = Neo4jConnection(uri="bolt://localhost:"+str(port),
                       user="driver",
                       pwd="driver")

In [2]:
import time


def insert_data(query, rows, batch_size = 10000):
    # Function to handle the updating the Neo4j database in batch mode.

    total = 0
    batch = 0
    start = time.time()
    result = None

    for batch in tqdm(range(len(rows) // batch_size)):
        batch_start = time.time()
        res = conn.query(query,
                         parameters = {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')})
        total += res[0]['total']
        result = {"batch_size": batch_size,
                  "batches_done": batch,
                  "batch_time": time.time() - batch_start,
                  "total_time": time.time()-start}
        # print(result)

    return result

def add_papers(rows, batch_size=5000):
   # Adds paper nodes and relationships (:Author)-[:AUTHORED]-(:Paper), (:Paper)-[:REFERENCES]-(:Paper)
   query = '''
    // Create papers
    UNWIND $rows as paper
    MERGE (p:Paper {paperid: paper.id})
    ON CREATE SET
    p.title = paper.title,
    p.year = paper.year,
    p.n_citation = paper.n_citation,
    p.doi = paper.doi

    // Match authors
    WITH paper, p
    UNWIND  paper.authors AS author
    MERGE (a:Author {authorid: author.id})
    ON CREATE SET a.name = author.name
    MERGE (a)-[:AUTHORED]->(p)

    // Match references
    WITH paper, p
    UNWIND  paper.references AS refid
    MATCH (r:Paper {paperid:refid})
    MERGE (p)-[:REFERENCES]->(r)
    RETURN count(p:Paper) as total
   '''

   return insert_data(query, rows, batch_size)

In [3]:
conn.query('CREATE INDEX paper_id_index IF NOT EXISTS FOR (n:Paper) ON (n.paperid);')
conn.query('CREATE INDEX author_id_index IF NOT EXISTS FOR (a:Author) ON (a.authorid);')

[]

In [9]:
file = "data/dblp_papers_v11.txt"
n_papers = 4107340 # Number of papers in the dataset
subset = ["id", "title", "year", "n_citation", "doi", "authors", "references"]

# TODO: Add tqdm max of the number of papers
with open(file, 'r') as f:
    for episode in tqdm(range(n_papers // 100000)):
        try:
            lines = 100000
            rows  = []
            for line in f:
                rows.append(json.loads(line))
                lines -= 1
                if lines == 0: break
            df = pd.DataFrame(rows)
            add_papers(df[subset], 1000)
        except Exception as e:
            print(e)
            break

  0%|          | 0/41 [00:05<?, ?it/s]


KeyboardInterrupt: 

In [5]:
import re


def process_stackexchange(board: str) -> list[pd.DataFrame]:
    """_summary_

    Args:
        board (str): Stackexchange board to load. Expected file structure is
            ./data/`board`/*.xml

    Returns:
        list[pd.DataFrame]: List of pandas dataframes with Comments and
            PostHistory data with extracted DOIs and markdown links.
    """
    # Regex patterns
    DOI_PATTERN = "10\.\d{4,9}/[-._;\(\)/:A-Z0-9]+[/A-Z0-9]"
    MD_PATTERN = "\[([\w\s\d]+)\](https?:\/\/[\w\d./?=#]+)"

    def process_set(set: str) -> pd.DataFrame:
        """Processes single file in board data

        Args:
            set (str): One of "Comments" or "PostHistory"

        Returns:
            pd.DataFrame: Original data with extracted DOIs and Markdown links
        """
        # Load xml file
        path = f"data/{board}/{set}.xml"
        with open(path, encoding="utf8") as file:
            df = pd.read_xml(file.read())

        # Extract all DOIs using pattern defined before to list per row
        df["DOI"] = df["Text"].apply(re.findall, args=(DOI_PATTERN,))

        # Extract Markdown links
        markdown_df = df["Text"].apply(re.findall, args=(MD_PATTERN,))
        markdown_df.columns = ["LinkTitle", "LinkURL"]

        return pd.concat([df
                             # , markdown_df
                          ], axis=1)


    return process_set("Comments"), process_set("PostHistory")


In [7]:
comments_df, posthistory_df = process_stackexchange("ai")

error: unbalanced parenthesis at position 114

In [None]:
se_boards = ["ai", "cstheory", "datascience", "softwarerecs", "stats"]
for board in se_boards:
    comments_df, posthistory_df = process_stackexchange("stats")
    comments_df = comments_df[~comments_df["DOIs"].isna()]
    posthistory_df = posthistory_df.sort_values('CreationDate').drop_duplicates(subset=["PostId", "DOIs"]).shape