In [13]:
from __future__ import annotations

import asyncio
import logging
import os

import neo4j
from neo4j_graphrag.experimental.components.entity_relation_extractor import (
    LLMEntityRelationExtractor,
    OnError,
)
from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter
from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader
from neo4j_graphrag.experimental.components.schema import (
    SchemaBuilder,
    SchemaEntity,
    SchemaRelation,
)
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import (
    FixedSizeSplitter,
)
from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult
from neo4j_graphrag.llm import LLMInterface, OpenAILLM
from neo4j_graphrag.llm import LLMResponse, OllamaLLM


# Configure the root logger at INFO level so library log records emitted at
# INFO and above (e.g. the httpx request lines seen in the cell outputs) are shown.
logging.basicConfig(level=logging.INFO)

In [31]:
# Model name handed to OllamaLLM as `model_name`.
LLM_MODEL = "deepseek-r1:7b"
# Value sent to the LLM under the "max_tokens" model parameter.
MAX_TOKENS = 2000
# PDF to ingest. Set the KG_PDF_PATH environment variable to point at a
# different file without editing the notebook; the default keeps the
# previously hard-coded location so existing runs behave the same.
FILE_PATH = os.environ.get(
    "KG_PDF_PATH",
    "/Users/anunay.aatipamula/Downloads/LLM Engineer's Handbook-9781836200079.pdf",
)

In [32]:
async def define_and_run_pipeline(
    neo4j_driver: neo4j.Driver, llm: LLMInterface
) -> PipelineResult:
    """Assemble the PDF -> knowledge-graph pipeline and run it once.

    The pipeline chains five components: a ``PdfLoader``, a
    ``FixedSizeSplitter`` (chunk_size=4000, chunk_overlap=200), a
    ``SchemaBuilder`` fed with a document-centric schema, an
    ``LLMEntityRelationExtractor`` configured with ``OnError.RAISE``, and a
    ``Neo4jWriter`` bound to ``neo4j_driver``.

    Args:
        neo4j_driver: Driver passed to ``Neo4jWriter``.
        llm: LLM passed to ``LLMEntityRelationExtractor``.

    Returns:
        The result of ``Pipeline.run`` for the PDF at ``FILE_PATH``.
    """
    from neo4j_graphrag.experimental.pipeline import Pipeline

    # Node types the extractor may emit, keyed by label.
    entity_descriptions = {
        "DOCUMENT": "A written or digital work, such as a book, article, report, or web page.",
        "AUTHOR": "An individual or group responsible for creating the content of a document.",
        "PUBLISHER": "An entity that publishes or distributes the document.",
        "SECTION": "A distinct subdivision within a document, such as a chapter or heading.",
        "REFERENCE": "A citation or link to another document or source.",
        "URL": "The web address where the document is located.",
    }
    entities = [
        SchemaEntity(label=name, description=text)
        for name, text in entity_descriptions.items()
    ]

    # Relationship types, keyed by label.
    relation_descriptions = {
        "AUTHORED_BY": "Connects a document to its author(s).",
        "PUBLISHED_BY": "Links a document to its publisher.",
        "CONTAINS_SECTION": "Indicates that a document includes specific sections.",
        "REFERENCES": "Denotes that a document cites another document or source.",
        "LOCATED_AT": "Specifies the URL where the document can be accessed.",
    }
    relations = [
        SchemaRelation(label=name, description=text)
        for name, text in relation_descriptions.items()
    ]

    # Every allowed triple starts at DOCUMENT; only relation and target vary.
    document_links = (
        ("AUTHORED_BY", "AUTHOR"),
        ("PUBLISHED_BY", "PUBLISHER"),
        ("CONTAINS_SECTION", "SECTION"),
        ("REFERENCES", "REFERENCE"),
        ("LOCATED_AT", "URL"),
    )
    potential_schema = [
        ("DOCUMENT", relation, target) for relation, target in document_links
    ]

    # Register the components under the names used by the wiring below.
    pipeline = Pipeline()
    stages = (
        ("pdf_loader", PdfLoader()),
        (
            "splitter",
            FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=False),
        ),
        ("schema", SchemaBuilder()),
        ("extractor", LLMEntityRelationExtractor(llm=llm, on_error=OnError.RAISE)),
        ("writer", Neo4jWriter(neo4j_driver)),
    )
    for stage_name, component in stages:
        pipeline.add_component(component, stage_name)

    # (upstream, downstream, input mapping) for each edge of the pipeline.
    wiring = (
        ("pdf_loader", "splitter", {"text": "pdf_loader.text"}),
        ("splitter", "extractor", {"chunks": "splitter"}),
        (
            "schema",
            "extractor",
            {
                "schema": "schema",
                "document_info": "pdf_loader.document_info",
            },
        ),
        ("extractor", "writer", {"graph": "extractor"}),
    )
    for upstream, downstream, mapping in wiring:
        pipeline.connect(upstream, downstream, input_config=mapping)

    # Only the loader and the schema builder take user-supplied inputs.
    run_inputs = {
        "pdf_loader": {"filepath": FILE_PATH},
        "schema": {
            "entities": entities,
            "relations": relations,
            "potential_schema": potential_schema,
        },
    }
    return await pipeline.run(run_inputs)


In [33]:
async def main() -> PipelineResult:
    """Create the LLM and Neo4j driver, run the pipeline, and always clean up.

    Returns:
        The ``PipelineResult`` returned by ``define_and_run_pipeline``.

    Raises:
        Any exception raised by ``define_and_run_pipeline``. The Neo4j driver
        and the LLM's async client are closed before it propagates.
    """
    llm = OllamaLLM(
        model_name=LLM_MODEL,
        # NOTE(review): "max_tokens" and "response_format" look like
        # OpenAI-style parameter names; confirm against the OllamaLLM docs that
        # Ollama honours them. The recorded run failed with "LLM response has
        # improper format", which may mean JSON output is not being enforced.
        model_params={
            "max_tokens": MAX_TOKENS,
            "response_format": {"type": "json_object"},
        },
    )
    driver = neo4j.GraphDatabase.driver(
        "bolt://localhost:7687", auth=("neo4j", "learning")
    )
    try:
        return await define_and_run_pipeline(driver, llm)
    finally:
        # The extractor is configured to raise on error, so failures are
        # expected; release the connections on that path too instead of
        # leaking them in the long-lived notebook kernel.
        driver.close()
        await llm.async_client.close()

In [34]:
# Run the whole pipeline and keep its result; `await` at the top level of a
# cell is supported by the Jupyter/IPython kernel.
res = await main()

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


LLMGenerationError: LLM response has improper format

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


In [22]:
# Display the pipeline result.
# NOTE(review): the recorded output is a bare coroutine object and this cell's
# execution count (22) is lower than the cell that assigns `res` (34), so the
# output appears to come from an earlier un-awaited `main()` call — re-run
# after the awaited cell succeeds.
res 

<coroutine object main at 0x11cb3e730>