In [None]:
import os

# Work from the parent of the directory the notebook was started in, so the
# relative data path and package imports used below resolve (presumably this
# is the project root - confirm against the repository layout).
# The starting directory is remembered on first execution: a bare relative
# chdir would climb one more level every time this cell is re-run in the
# same kernel.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, "../"))

In [None]:
from knowledge_graph_creator.doc_extractor.pdf_extractor import PyMuPDFReader
from knowledge_graph_creator.patterns import ReferencePattern
from knowledge_graph_creator.extractors.reference_extractor import ReferenceExtractor
from knowledge_graph_creator.extractors.reference_details import (
    ReferenceDetailsExtractor,
)

In [None]:
# Path of the PDF to process. It is relative, so it is resolved against the
# current working directory (which the first cell moves to the parent folder).
DOC_PATH = "data/3643806.pdf"

In [None]:
# Only a ten-page window of the document is read: page numbers 32 through 41.
# NOTE(review): whether these numbers are 0- or 1-based depends on
# PyMuPDFReader.to_list - confirm before changing the window.
selected_pages = list(range(32, 42))

pdf_reader = PyMuPDFReader()
pages = pdf_reader.to_list(path=DOC_PATH, select_pages=selected_pages)

In [None]:
# Extractor configured with the BRACKETED_NUMBER pattern.
# NOTE(review): presumably this matches bibliography entries numbered like
# "[12]" - confirm against ReferencePattern.
reference_extractor = ReferenceExtractor(ReferencePattern.BRACKETED_NUMBER)

In [None]:
# Fold the extraction result of every page into one mapping. dict.update is
# used, so if the same key is produced for more than one page the value from
# the later page replaces the earlier one.
references = {}
for single_page in pages:
    page_references = reference_extractor.extract(text=single_page)
    references.update(page_references)

In [None]:
references_details_extractor = ReferenceDetailsExtractor()

# Run the regex-based parser over every extracted reference and keep the
# result under the same key the reference had in `references`.
references_details = {
    reference_id: references_details_extractor.parse_with_regex(
        ref_id=reference_id, ref_text=raw_reference
    )
    for reference_id, raw_reference in references.items()
}

In [None]:
# Spot-check the parsed authors of reference 25. The entry is indexed
# directly instead of via .get(): if the key is missing this raises a
# KeyError naming the key, rather than an AttributeError on None.
references_details[25].authors

In [None]:
# Split each reference's author string into one record per author.
from knowledge_graph_creator.extractors.reference_details import ReferenceDetails

# Every parsed reference is expanded into one ReferenceDetails per author
# name: the copy keeps the reference's id, title, publish, year and
# page_or_volume, while `authors` holds a single name. The list is not
# de-duplicated - an author appears once for every reference they are on.
all_authors = []
for ref_id, ref_details in references_details.items():
    for raw_author in ref_details.authors.split(", "):
        author = raw_author.strip()
        if not author:
            # "".split(", ") yields [""], and a trailing separator leaves an
            # empty tail; skip these so no record with a blank name is made.
            continue
        all_authors.append(
            ReferenceDetails(
                id_=ref_id,
                authors=author,
                title=ref_details.title,
                publish=ref_details.publish,
                year=ref_details.year,
                page_or_volume=ref_details.page_or_volume.strip(),
            )
        )

In [None]:
# Report how many per-author records the split produced.
print(len(all_authors))

In [None]:
# Display one per-author record as a sanity check (index 5 is presumably just
# an arbitrary sample).
all_authors[5]

In [None]:
from neo4j import GraphDatabase

# Connect to Neo4j.
# The connection settings are read from the environment so that real
# credentials never have to be written into the notebook; the literals are
# only local placeholder defaults used when a variable is not set.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "your_password")
driver = GraphDatabase.driver(uri, auth=(user, password))


def write_references_to_neo4j(references_details, limit=1000, neo4j_driver=None):
    """Write reference records to Neo4j as Reference, Author and Year nodes.

    One Cypher statement is run per record. It merges a ``Reference`` node
    keyed by ``id``, sets its ``title`` and ``year``, links it through ``YOP``
    to the ``Year`` node for that year, and links every author to the
    reference through ``AUTHORED``.

    Args:
        references_details: Sliceable sequence of records exposing ``id_``,
            ``title``, ``year`` and ``authors``. ``authors`` may be a single
            name (``str``) or an iterable of names.
        limit: Maximum number of leading records to write; ``None`` writes
            all of them. Defaults to 1000.
        neo4j_driver: Driver used to open the session. When ``None`` the
            module-level ``driver`` is used.

    Returns:
        None.
    """
    active_driver = driver if neo4j_driver is None else neo4j_driver

    # The Year node is merged on its own before the relationship is merged.
    # Merging the relationship and an unbound Year node as one pattern would
    # create a new Year node for every reference that is not linked yet,
    # leaving many duplicate nodes for the same year.
    # NOTE(review): MERGE rejects null property values, so a record whose
    # year or author name is None would make the statement fail - confirm
    # the parser never yields None for these fields.
    query = """
                MERGE (r:Reference {id: $ref_id})
                SET r.title = $title, r.year = $year
                MERGE (y:Year {value: $year})
                MERGE (r)-[:YOP]->(y)
                WITH r
                UNWIND $authors AS author
                MERGE (a:Author {name: author})
                MERGE (a)-[:AUTHORED]->(r)
                """

    with active_driver.session() as session:
        for details in references_details[:limit]:
            # Always hand UNWIND a list, whether the record carries a single
            # name or several.
            if isinstance(details.authors, str):
                author_names = [details.authors]
            else:
                author_names = list(details.authors)
            session.run(
                query,
                ref_id=details.id_,
                title=details.title,
                year=details.year,
                authors=author_names,
            )


try:
    write_references_to_neo4j(all_authors)
finally:
    # Close the driver even when the write fails part-way, so the connection
    # is not left open in the kernel.
    driver.close()

In [None]:
# DANGER: this cell wipes the database - the query below matches every node
# and deletes it together with its relationships. Run it only against a
# scratch instance.
driver = GraphDatabase.driver(uri, auth=(user, password))
try:
    with driver.session() as session:
        # Delete all nodes and relationships
        session.run("MATCH (n) DETACH DELETE n")
finally:
    # The driver opened for this cell is closed whether or not the delete
    # succeeded.
    driver.close()