# Document Processing with GraphRAG

Let's see how to process documents and connect them to our graph using Neo4j's GraphRAG library.

In [None]:
from neo4j import GraphDatabase
from dotenv import load_dotenv
import os
import openai
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embedder import OpenAIEmbedder

# Load environment variables (NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD,
# OPENAI_API_KEY) so the os.getenv() calls below can see them.
load_dotenv()

# Connect to Neo4j using the credentials read from the environment
driver = GraphDatabase.driver(
    os.getenv('NEO4J_URI'),
    auth=(os.getenv('NEO4J_USERNAME'), os.getenv('NEO4J_PASSWORD'))
)

# Initialize OpenAI
openai.api_key = os.getenv('OPENAI_API_KEY')
# OpenAILLM has no default model: `model_name` is a required argument, so a
# bare OpenAILLM() call fails with a TypeError before any document is processed.
llm = OpenAILLM(
    model_name='gpt-4o',
    # Temperature 0 keeps entity/relationship extraction as repeatable as possible.
    model_params={'temperature': 0},
)
embedder = OpenAIEmbedder()

## Configure Knowledge Graph Builder

First, let's configure our Knowledge Graph Builder with the entities and relationships we want to extract:

In [None]:
# Define node labels and relationship types to extract
node_labels = ['Product', 'Feature', 'Issue', 'Solution']
rel_types = ['HAS_FEATURE', 'HAS_ISSUE', 'RESOLVES']

# Define prompt template for entity and relationship extraction.
#
# A custom template replaces the library's default extraction prompt, so it has
# to carry everything the extractor relies on:
#   * the JSON output structure (nodes / relationships) that the pipeline parses,
#   * the {schema} placeholder, which is how node_labels and rel_types reach the LLM,
#   * the {text} placeholder for the chunk being processed.
# The template is rendered with str.format(), so literal JSON braces are doubled.
prompt_template = """
You are extracting a knowledge graph from product documentation.

Given the following text, identify:
1. Products and their features
2. Issues or problems mentioned
3. Solutions or resolutions described

Create nodes for each entity and connect them with appropriate relationships.

Return the result as JSON using the following format:
{{"nodes": [{{"id": "0", "label": "Product", "properties": {{"name": "Laptop Pro"}}}}],
"relationships": [{{"type": "HAS_FEATURE", "start_node_id": "0", "end_node_id": "1", "properties": {{}}}}]}}

Use only the following node and relationship types:
{schema}

Rules:
- Assign a unique string ID to each node and reuse it to define relationships.
- Give every node a "name" property containing the entity's name as written in the text.
- Relationship directions: (Product)-[HAS_FEATURE]->(Feature), (Product)-[HAS_ISSUE]->(Issue), (Solution)-[RESOLVES]->(Issue).
- Return only the JSON object, with no additional text and no backticks.

Text: {text}
"""

# Initialize Knowledge Graph Builder
kg_builder = SimpleKGPipeline(
    llm=llm,
    driver=driver,
    # 500-character chunks with a 100-character overlap between neighbours
    text_splitter=FixedSizeSplitter(chunk_size=500, chunk_overlap=100),
    embedder=embedder,
    entities=node_labels,
    relations=rel_types,
    prompt_template=prompt_template,
    from_pdf=True  # inputs are passed as PDF file paths rather than raw text
)

## Process Product Documentation

Now let's process our product documentation PDFs and add them to the graph:

In [None]:
# List of PDF files to process (relative paths to the product manuals)
pdf_files = [
    '../data/documents/laptop_pro_manual.pdf',
    '../data/documents/wireless_mouse_guide.pdf',
    '../data/documents/monitor_4k_manual.pdf'
]

# Process each PDF one after another: each run is awaited before the next
# file starts, and its result is printed.
# NOTE: top-level `await` works in a Jupyter/IPython cell but is a SyntaxError
# in a plain Python script; there this loop would have to live inside an
# `async def` that is started with `asyncio.run(...)`.
for pdf_file in pdf_files:
    print(f"Processing: {pdf_file}")
    result = await kg_builder.run_async(file_path=pdf_file)
    print(f"Result: {result}\n")

## Process Support Cases

Next, let's process our support case documents:

In [None]:
# Update prompt template for support cases
support_prompt_template = """
Given the following support case text, identify:
1. Products mentioned
2. Issues or problems reported
3. Solutions provided or attempted

Create nodes for each entity and connect them with appropriate relationships.

Text: {text}
"""

# Initialize Knowledge Graph Builder for support cases
support_kg_builder = SimpleKGPipeline(
    llm=llm,
    driver=driver,
    text_splitter=FixedSizeSplitter(chunk_size=500, chunk_overlap=100),
    embedder=embedder,
    entities=node_labels,
    relations=rel_types,
    prompt_template=support_prompt_template,
    from_pdf=True
)

# List of support case PDFs
support_files = [
    '../data/documents/support_case_1.pdf',
    '../data/documents/support_case_2.pdf'
]

# Process each support case
for support_file in support_files:
    print(f"Processing: {support_file}")
    result = await support_kg_builder.run_async(file_path=support_file)
    print(f"Result: {result}\n")

## Query the Knowledge Graph

Let's verify that our documents were processed and connected correctly:

In [None]:
def explore_product_knowledge(product_name: str) -> None:
    """Print the features, issues, and solutions stored for one product.

    Looks up ``Product`` nodes whose ``name`` property equals *product_name*
    exactly, then prints the names of their ``HAS_FEATURE`` features and
    their ``HAS_ISSUE`` issues, each issue paired with any ``Solution`` that
    ``RESOLVES`` it.

    Args:
        product_name: Value matched against the ``name`` property of
            ``Product`` nodes.

    Returns:
        None. All output goes to stdout. If no product matches, a
        "not found" message is printed instead of silently printing nothing.
    """
    with driver.session() as session:
        result = session.run("""
        MATCH (p:Product {name: $name})
        OPTIONAL MATCH (p)-[:HAS_FEATURE]->(f:Feature)
        OPTIONAL MATCH (p)-[:HAS_ISSUE]->(i:Issue)
        OPTIONAL MATCH (s:Solution)-[:RESOLVES]->(i)
        RETURN p.name as product,
               collect(DISTINCT f.name) as features,
               collect(DISTINCT {issue: i.name, solution: s.name}) as issues_and_solutions
        """, name=product_name)

        # Read all rows while the session is still open.
        records = list(result)

    # The leading MATCH is mandatory, so an unknown product yields zero rows.
    # Say so explicitly: an empty printout is indistinguishable from a failure.
    if not records:
        print(f"\nNo product named '{product_name}' was found in the graph.")
        return

    for record in records:
        print(f"\nProduct: {record['product']}")
        print("\nFeatures:")
        for feature in record['features']:
            print(f"- {feature}")
        print("\nIssues and Solutions:")
        for item in record['issues_and_solutions']:
            # A product without issues still produces one map with null
            # values (from the OPTIONAL MATCH); skip those placeholders.
            if item['issue']:
                print(f"- Issue: {item['issue']}")
                if item['solution']:
                    print(f"  Solution: {item['solution']}")

# Explore knowledge about the Laptop Pro
explore_product_knowledge('Laptop Pro')

## Next Steps

Now that we have processed our documents and connected them to the graph, let's explore GraphRAG patterns for retrieving and using this knowledge!