In [25]:
import os
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

# Access the variables
neo4j_uri = os.getenv("NEO4J_URI")
neo4j_username = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")
neo4j_database = os.getenv("NEO4J_DATABASE")

google_api_key = os.getenv("GOOGLE_API_KEY")
tavily_api_key = os.getenv("TAVILY_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

langsmith_endpoint = os.getenv("LANGSMITH_ENDPOINT")
langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
langsmith_project = os.getenv("LANGSMITH_PROJECT")

# 🔐 Step 1: Environment Setup

Load API keys and database credentials from `.env` file for:
- **Neo4j** - Graph database for knowledge storage
- **Google Gemini** - LLM for entity extraction
- **LangSmith** - LLM observability and tracing
- **Other APIs** - Tavily, Groq for extended functionality

# 🔬 Knowledge Graph RAG Pipeline for Medical Research

This notebook demonstrates a **Knowledge Graph-based Retrieval Augmented Generation (GraphRAG)** system for medical research papers.

---

## 📋 Pipeline Overview

1. **📥 Load Data** - Import medical research papers from JSON files
2. **✂️ Split Documents** - Chunk papers into processable segments  
3. **🧠 Extract Knowledge** - Use LLM to identify entities and relationships
4. **📊 Build Graph** - Store knowledge in Neo4j graph database
5. **🔍 Query & Visualize** - Interactive query-based subgraph exploration

---

## 🎯 Key Features

- **Multi-section document parsing** with metadata preservation
- **Automated entity/relationship extraction** using LLM transformers
- **Graph-based retrieval** for more contextual answers
- **Interactive visualization** using yFiles graph widgets
- **Query-driven subgraph extraction** for focused exploration

---

In [2]:
from langchain_community.graphs import Neo4jGraph
graph = Neo4jGraph(neo4j_uri,
                neo4j_username,
                neo4j_password)

  graph = Neo4jGraph(neo4j_uri,


# 🗄️ Step 2: Connect to Neo4j Graph Database

Initialize connection to Neo4j for storing and querying the knowledge graph.

In [3]:
from langchain.schema import Document
import json
from pathlib import Path

# Load all JSON files from the papers directory
def load_publication_documents(json_directory="papers", max_docs=1, include_sections=True):
    """
    Load publications from JSON files and convert to LangChain Documents
    
    Args:
        json_directory: Directory containing JSON files (default: "papers")
        max_docs: Maximum number of documents to load
        include_sections: If True, include structured sections; if False, use title+abstract only
    
    Returns:
        List of LangChain Document objects
    """
    documents = []
    json_files = list(Path(json_directory).glob("*.json"))[:max_docs]
    
    print(f"Found {len(json_files)} JSON files in '{json_directory}' folder")
    
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                
                # Handle both single publication and list of publications
                publications = data if isinstance(data, list) else [data]
                
                for pub in publications:
                    # Extract fields
                    title = pub.get('title', 'No title')
                    abstract = pub.get('abstract', 'No abstract')
                    sections = pub.get('sections', {})
                    authors = pub.get('authors', [])
                    pub_id = pub.get('pmc_id') or pub.get('pmid', 'Unknown')
                    
                    # Create document content
                    content_parts = [f"Title: {title}", f"\nAbstract: {abstract}"]
                    
                    # Add structured sections if available and requested
                    if include_sections and sections:
                        content_parts.append("\n\nSections:")
                        for section_name, section_content in sections.items():
                            content_parts.append(f"\n\n## {section_name}\n{section_content}")
                    
                    content = ''.join(content_parts)
                    
                    # Create metadata with section names
                    metadata = {
                        'id': pub_id,
                        'title': title,
                        'authors': ', '.join(authors),
                        'source': str(json_file.name),
                        'num_authors': len(authors),
                        'sections': list(sections.keys()) if sections else [],
                        'num_sections': len(sections) if sections else 0
                    }
                    
                    # Create LangChain Document
                    doc = Document(page_content=content, metadata=metadata)
                    documents.append(doc)
                    
            print(f"✓ Loaded: {json_file.name}")
        
        except Exception as e:
            print(f"✗ Error loading {json_file.name}: {e}")
    
    print(f"\nTotal: {len(documents)} publication documents loaded")
    
    # Show section statistics
    all_sections = []
    for doc in documents:
        all_sections.extend(doc.metadata.get('sections', []))
    
    if all_sections:
        unique_sections = set(all_sections)
        print(f"Unique sections found: {len(unique_sections)}")
        print(f"Common sections: {', '.join(sorted(unique_sections)[:10])}")
    
    return documents

# Load documents with structured sections
raw_documents = load_publication_documents(json_directory="papers", max_docs=1, include_sections=True)

# Now you can use raw_documents with your GraphRAG pipeline
if raw_documents:
    print(f"\nFirst document preview:")
    print(f"ID: {raw_documents[0].metadata['id']}")
    print(f"Title: {raw_documents[0].metadata['title']}")
    print(f"Number of sections: {raw_documents[0].metadata['num_sections']}")
    print(f"Sections: {', '.join(raw_documents[0].metadata['sections'])}")
    print(f"Content length: {len(raw_documents[0].page_content)} characters")

Found 1 JSON files in 'papers' folder
✓ Loaded: PMC10114433.json

Total: 1 publication documents loaded
Unique sections found: 28
Common sections: Acknowledgements, Associated Data, Availability of data and materials, Background, Clinicopathological factors of postoperative liver dysfunction, Competing interests, Conclusions, Contributor Information, Data Availability Statement, Discussion

First document preview:
ID: PMC10114433
Title: Postoperative liver dysfunction is associated with poor long-term outcomes in patients with colorectal cancer: a retrospective cohort study - PMC
Number of sections: 28
Sections: Eigo Otsuji, Background, Methods, Results, Conclusions, Patients, Fig. 1., Risk factor identification and prognostic impact, Table 1., Table 2., Table 3., Fig. 2., Table 4., Fatty liver assessment, Statistical analysis, Clinicopathological factors of postoperative liver dysfunction, Risk factors influencing the severity of postoperative liver dysfunction, Hepatobiliary enzyme a

# 📄 Step 3: Load Medical Research Papers

Load publication JSON files from the `papers/` directory and convert them to LangChain Documents.

**Features:**
- 📚 Parses structured sections (Background, Methods, Results, etc.)
- 🏷️ Extracts metadata (authors, IDs, section names)
- 🔢 Configurable document limit and section inclusion
- 📊 Statistics on sections and content

In [4]:
from langchain.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
documents = text_splitter.split_documents(raw_documents)

# ✂️ Step 4: Split Documents into Chunks

Split large documents into smaller chunks for better LLM processing.

**Parameters:**
- `chunk_size=512` - Maximum tokens per chunk
- `chunk_overlap=24` - Overlap between chunks to preserve context

In [5]:
len(raw_documents)

1

In [6]:
raw_documents[0].page_content

"Title: Postoperative liver dysfunction is associated with poor long-term outcomes in patients with colorectal cancer: a retrospective cohort study - PMC\nAbstract: BackgroundPostoperative hepatobiliary enzyme abnormalities often present as postoperative liver dysfunction in patients with colorectal cancer. This study aimed to clarify the risk factors of postoperative liver dysfunction and its prognostic impact following colorectal cancer surgery. MethodsWe retrospectively analyzed data from 360 consecutive patients who underwent radical resection for Stage I–IV colorectal cancer between 2015 and 2019. A subset of 249 patients with Stage III colorectal cancer were examined to assess the prognostic impact of liver dysfunction. ResultsForty-eight (13.3%) colorectal cancer patients (Stages I–IV) developed postoperative liver dysfunction (Common Terminology Criteria for Adverse Events version 5.0 CTCAE v5.0\u2009≥\u2009Grade 2). Univariate and multivariate analyses identified the liver-to-

In [7]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash" )
# from langchain_ollama import ChatOllama
# llm_ollama = ChatOllama(model="llama3.1:70b", temperature=0)

# 🤖 Step 5: Initialize LLM

Set up **Google Gemini 2.5 Flash** for entity extraction and relationship identification.

> **Note:** Can switch to local Ollama models (commented out) for privacy/cost considerations.

In [8]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
llm_transformer = LLMGraphTransformer(llm)


# 🔄 Step 6: Create Graph Transformer

Initialize the **LLMGraphTransformer** to automatically extract:
- 🏷️ **Entities** (people, concepts, diseases, treatments)
- 🔗 **Relationships** (causes, treats, associates with)

This converts unstructured text into structured knowledge graph format.

In [9]:
graph_documents = llm_transformer.convert_to_graph_documents(documents)

# 🧬 Step 7: Extract Graph Documents

Transform text chunks into graph documents containing nodes (entities) and edges (relationships).

> ⏱️ **Note:** This step may take a few minutes depending on document size and LLM speed.

In [10]:
graph_documents

[GraphDocument(nodes=[Node(id='Postoperative Liver Dysfunction', type='Condition', properties={}), Node(id='Poor Long-Term Outcomes', type='Outcome', properties={}), Node(id='Colorectal Cancer', type='Disease', properties={}), Node(id='Study', type='Study', properties={}), Node(id='Postoperative Hepatobiliary Enzyme Abnormalities', type='Condition', properties={}), Node(id='Colorectal Cancer Surgery', type='Procedure', properties={}), Node(id='Radical Resection', type='Procedure', properties={}), Node(id='Stage I–Iv Colorectal Cancer', type='Disease', properties={}), Node(id='Stage Iii Colorectal Cancer', type='Disease', properties={}), Node(id='Ctcae V5.0', type='Standard', properties={}), Node(id='Liver-To-Spleen Ratio', type='Measurement', properties={}), Node(id='Preoperative Plain Computed Tomography', type='Medical test', properties={}), Node(id='Disease-Free Survival', type='Outcome', properties={}), Node(id='Eigo Otsuji', type='Person', properties={}), Node(id='360 Patients', t

In [15]:
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

# 💾 Step 8: Store Graph in Neo4j

Upload the extracted knowledge graph to Neo4j database.

**Options:**
- `baseEntityLabel=True` - Add base labels for entity categorization
- `include_source=True` - Link entities to source documents for traceability

In [16]:
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t "

In [23]:
from neo4j import GraphDatabase
from IPython.display import display
from yfiles_jupyter_graphs import GraphWidget


def showGraph(cypher: str = "MATCH (n)-[r]->(m) RETURN n,r,m LIMIT 20"):
    driver = GraphDatabase.driver(
        uri=neo4j_uri,
        auth=(neo4j_username, neo4j_password)
    )
    with driver.session() as session:
        result = session.run(cypher)
        widget = GraphWidget(graph=result.graph())
        display(widget)


# 📊 Step 9: Graph Visualization Helper

Create a function to visualize graph data using **yFiles** interactive graph widget.

**Usage:** `showGraph(cypher_query)` - Execute Cypher and display results as an interactive graph.

In [24]:
from neo4j import GraphDatabase
showGraph()

GraphWidget(layout=Layout(height='710px', width='100%'))

# 🎨 Step 10: Visualize Full Graph

Display a sample of the knowledge graph (first 20 nodes and relationships).

> 🔍 **Tip:** Click and drag nodes to explore the graph structure interactively!

# 🧹 Graph Cleanup

Before building a new knowledge graph, we need to clear any existing data in Neo4j to avoid conflicts and duplicates.

In [18]:
graph.query("MATCH (n) DETACH DELETE n")
print("✓ Cleared existing graph data")

✓ Cleared existing graph data


In [None]:
# Query-based Knowledge Graph Extraction and Visualization

def extract_relevant_subgraph(query: str, top_k: int = 5):
    """
    Extract entities and relationships relevant to the user query
    
    Args:
        query: User's natural language query
        top_k: Number of most relevant nodes to return
    
    Returns:
        Cypher query results for visualization
    """
    
    # Use LLM to extract key entities/concepts from the query
    extraction_prompt = f"""
    From this query, extract the main medical/scientific concepts, entities, or keywords:
    Query: "{query}"
    
    Return only a comma-separated list of keywords (no explanation).
    Example: "colorectal cancer, liver dysfunction, prognosis"
    """
    
    response = llm.invoke(extraction_prompt)
    keywords = [k.strip().lower() for k in response.content.split(',')]
    
    print(f"Extracted keywords: {keywords}")
    
    # Build Cypher query to find relevant nodes and relationships
    # Search for nodes whose text contains any of the keywords
    keyword_conditions = " OR ".join([f"toLower(n.id) CONTAINS '{kw}'" for kw in keywords])
    
    cypher_query = f"""
    MATCH (n)
    WHERE {keyword_conditions}
    WITH n LIMIT {top_k}
    MATCH path = (n)-[r]-(connected)
    RETURN n, r, connected
    """
    
    return cypher_query, keywords


def visualize_query_subgraph(query: str, top_k: int = 5):
    """
    Visualize the knowledge graph relevant to the user's query
    """
    print(f"\n🔍 Query: {query}\n")
    
    cypher_query, keywords = extract_relevant_subgraph(query, top_k)
    
    print(f"\n📊 Visualizing subgraph...\n")
    print(f"Cypher query:\n{cypher_query}\n")
    
    # Execute and visualize
    driver = GraphDatabase.driver(
        uri=neo4j_uri,
        auth=(neo4j_username, neo4j_password)
    )
    
    with driver.session() as session:
        result = session.run(cypher_query)
        graph_data = result.graph()
        
        if len(graph_data.nodes) == 0:
            print("⚠️ No relevant entities found for this query.")
            return
        
        print(f"✓ Found {len(graph_data.nodes)} nodes and {len(graph_data.relationships)} relationships")
        
        widget = GraphWidget(graph=graph_data)
        display(widget)
        
    driver.close()

# 🔍 Step 11: Query-Based Subgraph Extraction

Build an **intelligent query system** that:

1. 📝 Analyzes user's natural language query
2. 🎯 Extracts key medical concepts/entities using LLM
3. 🔎 Searches knowledge graph for matching nodes
4. 📊 Visualizes relevant subgraph with relationships

**This enables focused exploration of specific topics within the larger knowledge graph.**

In [None]:
# Example queries - try these!

# Query 1: Liver-related research
visualize_query_subgraph("What research exists about liver dysfunction and cancer?", top_k=10)

# Query 2: Prognosis factors
# visualize_query_subgraph("What are the prognostic factors in colorectal cancer?", top_k=10)

# Query 3: Treatment outcomes
# visualize_query_subgraph("Tell me about postoperative outcomes", top_k=10)

# 🚀 Step 12: Interactive Query Examples

Try these example queries to explore different aspects of the research:

1. **Liver dysfunction research** - Disease-specific entities and relationships
2. **Prognostic factors** - Outcome predictors and clinical markers  
3. **Treatment outcomes** - Intervention results and patient outcomes

> 💡 **Tip:** Modify queries or add your own to explore different research angles!