# 01 - Combined Extract and Build using extract providers

## Setup

If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook.

## Continuous ingest - Using native llama readers

See [Continuous ingest](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/lexical-graph/indexing.md#continous-ingest).

In [None]:
%reload_ext dotenv
%dotenv

import os
import logging
import warnings

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.neo4j_graph_store_factory import Neo4jGraphStoreFactory

# Quieten the warning and the two loggers that otherwise flood the cell output
warnings.filterwarnings("ignore", message=".*Removing unpickleable private attribute.*")
for noisy_logger in ("neo4j.notifications", "botocore.tokens"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)

# The Neo4j factory must be registered before a graph store is requested
GraphStoreFactory.register(Neo4jGraphStoreFactory)

from llama_index.readers.web import SimpleWebPageReader

# Both store locations come from the GRAPH_STORE / VECTOR_STORE environment variables
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])
graph_index = LexicalGraphIndex(graph_store, vector_store)

# Pages to ingest
doc_urls = [
    'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html'
]

# Load the pages as text, tagging every document with the URL it came from
web_page_reader = SimpleWebPageReader(
    html_to_text=True,
    metadata_fn=lambda url: {'url': url},
)
docs = web_page_reader.load_data(doc_urls)

# Extract and build in a single pass
graph_index.extract_and_build(docs, show_progress=True)

print('Complete')

## Optional - Learning how to use Readers
### WebReaderProvider

### Set up the graph index

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.neo4j_graph_store_factory import Neo4jGraphStoreFactory

# The Neo4j factory has to be registered before a graph store is requested
GraphStoreFactory.register(Neo4jGraphStoreFactory)

# Resolve both stores from the GRAPH_STORE / VECTOR_STORE environment
# variables and wrap them in the lexical graph index used by the cells below
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])
graph_index = LexicalGraphIndex(graph_store, vector_store)

In [None]:
%reload_ext dotenv
%dotenv

import os
from graphrag_toolkit.lexical_graph.indexing.load.readers import WebReaderProvider, WebReaderConfig


# Amazon Bedrock user-guide pages to ingest
doc_urls = [
    'https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html',
    'https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html',
    'https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails.html',
    'https://docs.aws.amazon.com/bedrock/latest/userguide/model-evaluation.html'
]

# Web reader that converts HTML to text and attaches descriptive metadata,
# including the URL each document was read from
web_config = WebReaderConfig(
    html_to_text=True,
    metadata_fn=lambda url: {
        'source': 'web',
        'url': url,
        'document_type': 'aws_documentation',
        'service': 'amazon_bedrock',
        'domain': 'aws.amazon.com'
    }
)
web_reader = WebReaderProvider(web_config)

# The reader is called once per URL; collect everything into a single list
all_docs = []
for page_url in doc_urls:
    all_docs += web_reader.read(page_url)

print(f"Loaded {len(all_docs)} web documents")
if all_docs:
    first_doc = all_docs[0]
    print(f"First document metadata: {first_doc.metadata}")

# Extract and build the graph from the collected pages
graph_index.extract_and_build(all_docs, show_progress=True)

print('Complete')


### PDFReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os
from graphrag_toolkit.lexical_graph.indexing.load.readers import PDFReaderProvider, PDFReaderConfig

# Location of the sample PDF, relative to the notebook's working directory
pdf_path = "pdf/sample.pdf"

# PDF reader that tags each document with the file it was read from
pdf_config = PDFReaderConfig(
    return_full_document=False,
    metadata_fn=lambda path: {
        'source': 'pdf',
        'file_path': path,
        'document_type': 'pdf_document',
        'content_category': 'technical_documentation'
    }
)
pdf_reader = PDFReaderProvider(pdf_config)

# Load the file and report what came back
pdf_docs = pdf_reader.read(pdf_path)
print(f"Loaded {len(pdf_docs)} PDF documents")
if pdf_docs:
    first_pdf_doc = pdf_docs[0]
    print(f"First document metadata: {first_pdf_doc.metadata}")

# Extract and build the graph from the PDF documents
graph_index.extract_and_build(pdf_docs, show_progress=True)

print("PDF Extraction Complete")


### YouTubeReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os
from graphrag_toolkit.lexical_graph.indexing.load.readers import YouTubeReaderProvider, YouTubeReaderConfig

# Video whose transcript will be ingested
youtube_url = "https://www.youtube.com/watch?v=YmR2_zlQO5w"

# English-language reader; the metadata is fixed and does not use the URL
# that is passed to metadata_fn
youtube_config = YouTubeReaderConfig(
    language="en",
    metadata_fn=lambda url: {
        'source': 'youtube',
        'content_type': 'video_transcript',
        'platform': 'youtube'
    }
)
provider = YouTubeReaderProvider(youtube_config)

# Read the video, then extract and build the graph from the result
docs = provider.read(youtube_url)
print(f"Loaded {len(docs)} YouTube documents")

graph_index.extract_and_build(docs, show_progress=True)

print("YouTube Extraction Complete")


### DocxReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os
from graphrag_toolkit.lexical_graph.indexing.load.readers import DocxReaderProvider, DocxReaderConfig

# Location of the sample Word document, relative to the working directory
docx_path = "docs/story.docx"

# DOCX reader that tags each document with the file it was read from
docx_config = DocxReaderConfig(
    metadata_fn=lambda path: {
        'source': 'docx',
        'file_path': path,
        'document_type': 'word_document',
        'content_category': 'office_document',
        'file_extension': '.docx'
    }
)
docx_reader = DocxReaderProvider(docx_config)

# Load the file and report what came back
docx_docs = docx_reader.read(docx_path)
print(f"Loaded {len(docx_docs)} DOCX documents")
if docx_docs:
    first_docx_doc = docx_docs[0]
    print(f"First document metadata: {first_docx_doc.metadata}")

# Extract and build the graph from the DOCX documents
graph_index.extract_and_build(docx_docs, show_progress=True)

print("Microsoft Word Extraction Complete")


### GithubRepositoryReader

In [None]:
%reload_ext dotenv
%dotenv

import os
import nest_asyncio
nest_asyncio.apply()  # Fix for asyncio event loop conflict in Jupyter


# Use the LlamaIndex GitHub reader directly
from llama_index.readers.github import GithubRepositoryReader, GithubClient

# Repository to read. The same values are used for the metadata attached to
# each document, so the recorded repository always matches what was read.
# NOTE(review): the metadata was previously hard-coded to
# 'awslabs/graphrag-toolkit' while the reader pointed at this owner - confirm
# which repository is intended.
repo_owner = "evanerwee"
repo_name = "graphrag-toolkit"

# Read the token from the environment instead of pasting a secret into the
# notebook. Falls back to an empty string when GITHUB_TOKEN is not set.
github_token = os.environ.get("GITHUB_TOKEN", "")

if github_token and github_token != "ghp_your_token_here":
    print("Using authenticated GitHub access with token.")
else:
    print("No valid GITHUB_TOKEN found — using unauthenticated access. You may be rate-limited.")
    print("To add a token:")
    print("Set GITHUB_TOKEN in your environment (for example in your .env file) to your GitHub personal access token")

# Create GitHub client and reader
# NOTE(review): confirm GithubClient accepts an empty token for anonymous access
github_client = GithubClient(github_token=github_token, verbose=True)
reader = GithubRepositoryReader(
    github_client=github_client,
    owner=repo_owner,
    repo=repo_name,
    use_parser=False,
    verbose=False,
    filter_directories=(
        ["docs"],  # Only read specific directories
        GithubRepositoryReader.FilterType.INCLUDE,
    ),
    filter_file_extensions=(
        [".md", ".py", ".txt", ".rst"],  # Only read specific file types
        GithubRepositoryReader.FilterType.INCLUDE,
    ),
)

# Load documents from the main branch
print("Loading GitHub repository documents...")
github_docs = reader.load_data(branch="main")

print(f"Loaded {len(github_docs)} GitHub documents")

if github_docs:
    print(f"First document metadata: {github_docs[0].metadata}")

    # Add enhanced metadata on top of whatever the reader already attached
    for doc in github_docs:
        doc.metadata.update({
            'source': 'github',
            'repository': f'{repo_owner}/{repo_name}',
            'document_type': 'source_code',
            'content_category': 'repository_content',
            'platform': 'github'
        })

# Index the GitHub documents
graph_index.extract_and_build(github_docs, show_progress=True)

print("GitHub Extraction Complete")


### PPTXReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os
from graphrag_toolkit.lexical_graph.indexing.load.readers import PPTXReaderProvider, PPTXReaderConfig

# Location of the sample presentation, relative to the working directory
pptx_path = "pptx/sample.pptx"

# PPTX reader that tags each document with the file it was read from
pptx_config = PPTXReaderConfig(
    metadata_fn=lambda path: {
        'source': 'pptx',
        'file_path': path,
        'document_type': 'powerpoint_presentation',
        'content_category': 'office_document',
        'file_extension': '.pptx'
    }
)
pptx_reader = PPTXReaderProvider(pptx_config)

# Load the file and report what came back
pptx_docs = pptx_reader.read(pptx_path)
print(f"Loaded {len(pptx_docs)} PPTX documents")
if pptx_docs:
    first_pptx_doc = pptx_docs[0]
    print(f"First document metadata: {first_pptx_doc.metadata}")

# Extract and build the graph from the PPTX documents
graph_index.extract_and_build(pptx_docs, show_progress=True)

print("PowerPoint Extraction Complete")


### Markdown Reader Provider

In [None]:
from graphrag_toolkit.lexical_graph.indexing.load.readers import MarkdownReaderProvider, MarkdownReaderConfig

# Configure Markdown reader: hyperlinks and images are removed, and each
# document is tagged with the file it was read from
md_config = MarkdownReaderConfig(
    remove_hyperlinks=True,
    remove_images=True,
    metadata_fn=lambda path: {'source': 'markdown', 'file_path': path}
)

md_reader = MarkdownReaderProvider(md_config)

# Read your actual Markdown file
md_docs = md_reader.read('artifacts/sample.md')

print(f"Loaded {len(md_docs)} Markdown documents")
# Guard the preview so an empty result does not raise IndexError
if md_docs:
    print(f"Document content preview: {md_docs[0].text[:200]}...")

# Index the documents
graph_index.extract_and_build(md_docs, show_progress=True)

print("Markdown Extraction Complete")

### JSON Reader Provider

In [None]:
# Set up cache directories and environment
import os
import nltk

# Writable locations for the LlamaIndex cache and the NLTK data. Each path is
# defined once and reused for the environment variable, the directory
# creation and the NLTK download.
cache_dir = '/home/jovyan/work/.cache/llama_index'
nltk_dir = '/home/jovyan/work/nltk_data'

os.environ['LLAMA_INDEX_CACHE_DIR'] = cache_dir
os.environ['NLTK_DATA'] = nltk_dir

# Create cache directories
for directory in (cache_dir, nltk_dir):
    os.makedirs(directory, exist_ok=True)

# Download NLTK data; only register the directory once so re-running the cell
# does not keep appending the same entry to the search path
if nltk_dir not in nltk.data.path:
    nltk.data.path.append(nltk_dir)
nltk.download('punkt', download_dir=nltk_dir)

# Import and configure JSON reader
from graphrag_toolkit.lexical_graph.indexing.load.readers import JSONReaderProvider, JSONReaderConfig

# Configure JSON reader: each document is tagged with the file it was read from
json_config = JSONReaderConfig(
    metadata_fn=lambda path: {'source': 'json', 'file_path': path}
)

json_reader = JSONReaderProvider(json_config)

# Read your actual JSON file
json_docs = json_reader.read('artifacts/sample.json')

print(f"Loaded {len(json_docs)} JSON documents")
# Guard the preview so an empty result does not raise IndexError
if json_docs:
    print(f"Document content preview: {json_docs[0].text[:200]}...")

# Index the documents
graph_index.extract_and_build(json_docs, show_progress=True)

print("JSON Extraction Complete")


### Wikipedia Reader Provider

In [None]:
from graphrag_toolkit.lexical_graph.indexing.load.readers import WikipediaReaderProvider, WikipediaReaderConfig
import wikipedia

# Configure Wikipedia reader: each document is tagged with the page title
wiki_config = WikipediaReaderConfig(
    lang='en',
    metadata_fn=lambda title: {'source': 'wikipedia', 'title': title, 'language': 'en'}
)

wiki_reader = WikipediaReaderProvider(wiki_config)

# Search for candidate page titles
search_results = wikipedia.search('Llama language model')
print(f"Search results: {search_results}")

# Use the first search result that resolves to a concrete page
page_title = None
for result in search_results:
    try:
        wikipedia.page(result)
    except (wikipedia.exceptions.PageError,
            wikipedia.exceptions.DisambiguationError):
        # A missing page or an ambiguous title is not usable; try the next
        # candidate instead of aborting the cell
        continue
    page_title = result
    break

# Fallback to a known valid page
if not page_title:
    page_title = 'Large language model'

print(f"Using page: {page_title}")

# Read Wikipedia article
wiki_docs = wiki_reader.read(page_title)

print(f"Loaded {len(wiki_docs)} Wikipedia documents")
# Guard the preview so an empty result does not raise IndexError
if wiki_docs:
    print(f"Document content preview: {wiki_docs[0].text[:200]}...")
    print(f"Document metadata: {wiki_docs[0].metadata}")

# Index the documents
graph_index.extract_and_build(wiki_docs, show_progress=True)

print("Wikipedia Extraction Complete")


### CSV Reader Provider

In [None]:
from graphrag_toolkit.lexical_graph.indexing.load.readers import CSVReaderProvider, CSVReaderConfig

# Configure CSV reader: rows are concatenated (concat_rows=True) and each
# document is tagged with the file it was read from
csv_config = CSVReaderConfig(
    concat_rows=True,
    metadata_fn=lambda path: {'source': 'csv', 'file_path': path}
)

csv_reader = CSVReaderProvider(csv_config)

# Read your actual CSV file
csv_docs = csv_reader.read('artifacts/sample.csv')

print(f"Loaded {len(csv_docs)} CSV documents")
# Guard the preview so an empty result does not raise IndexError
if csv_docs:
    print(f"Document content preview: {csv_docs[0].text[:200]}...")

# Index the documents
graph_index.extract_and_build(csv_docs, show_progress=True)

print("CSV Extraction Complete")


## Complete

This notebook demonstrated various reader providers available in the GraphRAG Toolkit:

- **WebReaderProvider**: For reading web pages
- **PDFReaderProvider**: For reading PDF documents
- **CSVReaderProvider**: For reading CSV files
- **JSONReaderProvider**: For reading JSON files
- **MarkdownReaderProvider**: For reading Markdown files
- **WikipediaReaderProvider**: For reading Wikipedia articles
- **YouTubeReaderProvider**: For reading YouTube video transcripts
- **DocxReaderProvider**: For reading Microsoft Word documents
- **GithubRepositoryReader**: For reading GitHub repositories
- **PPTXReaderProvider**: For reading Microsoft PowerPoint presentations

Each provider can be configured with specific options and metadata functions to customize the document loading process.

### Readers for structured data

In [None]:
import os
from graphrag_toolkit.lexical_graph.indexing.load.readers import StructuredDataReaderProvider, StructuredDataReaderConfig

# Files to read: one CSV and one Excel workbook (supports both local files and S3 URLs)
data_files = ["artifacts/sample.csv", "artifacts/sample.xlsx"]

# Structured-data reader settings plus the metadata attached to each document
structured_config = StructuredDataReaderConfig(
    col_index=0,  # Column to use as index (0 = first column)
    col_joiner=', ',  # String to join multiple columns
    col_metadata=None,  # Optional column metadata configuration
    pandas_config={"sep": ","},  # CSV separator and other pandas options
    metadata_fn=lambda path: {
        'source': 'structured_data',
        'file_path': path,
        'document_type': 'structured_data',
        'content_category': 'tabular_data'
    }
)
structured_reader = StructuredDataReaderProvider(structured_config)

# Both files are handed to the reader in a single call
structured_docs = structured_reader.read(data_files)
print(f"Loaded {len(structured_docs)} structured data documents")
if structured_docs:
    first_structured_doc = structured_docs[0]
    print(f"First document metadata: {first_structured_doc.metadata}")

# Extract and build the graph from the structured data documents
graph_index.extract_and_build(structured_docs, show_progress=True)

print("Structured Data Extraction Complete")


### DirectoryReaderProvider

In [None]:
from graphrag_toolkit.lexical_graph.indexing.load.readers import DirectoryReaderProvider, DirectoryReaderConfig
import os
from pathlib import Path

# Scratch directory (relative to the working directory) that holds the
# sample files; it is left in place if it already exists
test_dir = Path('dir_reader')
test_dir.mkdir(exist_ok=True)

# Sample corpus: file name -> text written into that file
sample_files = {
    'document1.txt': 'This is the first document. It contains information about artificial intelligence and machine learning.',
    'document2.txt': 'This is the second document. It discusses natural language processing and deep learning techniques.',
    'notes.md': '# Meeting Notes\n\n## Key Points\n- Discussed project timeline\n- Reviewed technical requirements\n- Planned next steps',
    'data.json': '{"name": "Sample Data", "type": "test", "values": [1, 2, 3, 4, 5]}'
}

# Write (or overwrite) every sample file inside the scratch directory
for filename in sample_files:
    test_dir.joinpath(filename).write_text(sample_files[filename])

file_count = len(sample_files)
print(f"Created test directory '{test_dir}' with {file_count} files")
print(f"Files: {list(sample_files)}")

#### Basic Directory Reading

In [None]:
# Reader over the whole test directory: hidden files excluded, subdirectories included
dir_config = DirectoryReaderConfig(
    input_dir=str(test_dir),
    exclude_hidden=True,
    recursive=True,
    metadata_fn=lambda path: {
        'source': 'directory',
        'directory_path': path,
        'reader_type': 'basic'
    }
)
dir_reader = DirectoryReaderProvider(dir_config)

# The directory comes from the config, so no source argument is supplied
dir_docs = dir_reader.read(None)

print(f"Loaded {len(dir_docs)} documents from directory")
print("\nDocument details:")
for position, doc in enumerate(dir_docs, start=1):
    metadata = doc.metadata
    file_name = metadata.get('file_name', 'unknown')
    file_path = metadata.get('file_path', 'unknown')
    print(f"  Document {position}: {file_name}")
    print(f"    Path: {file_path}")
    print(f"    Content preview: {doc.text[:100]}...")
    print(f"    Custom metadata: {metadata.get('reader_type')}")
    print()

#### Filtered Directory Reading

Read only specific file types from the directory.

In [None]:
# Same directory as before, restricted to .txt and .md files
filtered_config = DirectoryReaderConfig(
    input_dir=str(test_dir),
    exclude_hidden=True,
    recursive=True,
    required_exts=[".txt", ".md"],  # Only read .txt and .md files
    metadata_fn=lambda path: {
        'source': 'directory_filtered',
        'directory_path': path,
        'filter': 'text_files_only'
    }
)
filtered_reader = DirectoryReaderProvider(filtered_config)

# The directory comes from the config, so no source argument is supplied
filtered_docs = filtered_reader.read(None)

print(f"Loaded {len(filtered_docs)} filtered documents (txt and md only)")
print("\nFiltered document details:")
for position, doc in enumerate(filtered_docs, start=1):
    file_name = doc.metadata.get('file_name', 'unknown')
    # Show the extension next to the name to make the filter visible
    file_ext = Path(file_name).suffix
    print(f"  Document {position}: {file_name} ({file_ext})")
    print(f"    Content preview: {doc.text[:80]}...")
    print()

#### Nested Directory Structure

Test recursive reading with nested directories.

In [None]:
# Two extra levels below the test directory: subdirectory/ and subdirectory/deep/
nested_dir = test_dir / 'subdirectory'
deep_dir = nested_dir / 'deep'
nested_dir.mkdir(exist_ok=True)
deep_dir.mkdir(exist_ok=True)

# One text file per nesting level, keyed by its full path
nested_files = {
    nested_dir / 'nested_doc.txt': 'This document is in a subdirectory. It contains nested content.',
    deep_dir / 'deep_doc.txt': 'This document is deeply nested. It demonstrates recursive directory reading.'
}
for file_path in nested_files:
    file_path.write_text(nested_files[file_path])

print("Created nested structure:")
print(f"  {nested_dir}/nested_doc.txt")
print(f"  {deep_dir}/deep_doc.txt")

# Reader over the test directory and its subdirectories, .txt files only
recursive_config = DirectoryReaderConfig(
    input_dir=str(test_dir),
    exclude_hidden=True,
    recursive=True,  # Enable recursive reading
    required_exts=[".txt"],
    metadata_fn=lambda path: {
        'source': 'directory_recursive',
        'directory_path': path,
        'scan_type': 'recursive'
    }
)
recursive_reader = DirectoryReaderProvider(recursive_config)

# The directory comes from the config, so no source argument is supplied
recursive_docs = recursive_reader.read(None)

print(f"\nLoaded {len(recursive_docs)} documents recursively")
print("\nRecursive document details:")
for position, doc in enumerate(recursive_docs, start=1):
    metadata = doc.metadata
    file_path = metadata.get('file_path', 'unknown')
    file_name = metadata.get('file_name', 'unknown')
    print(f"  Document {position}: {file_name}")
    print(f"    Full path: {file_path}")
    print(f"    Content preview: {doc.text[:60]}...")
    print()

#### Index Directory Documents

Index all the documents we've read from the directory.

In [None]:
# Gather the results of the three directory reads into one list. All three
# readers were pointed at the same directory, so the same file may appear
# more than once.
all_directory_docs = [*dir_docs, *filtered_docs, *recursive_docs]

print(f"Total documents to index: {len(all_directory_docs)}")

# Tally documents by their 'source' metadata value, in first-seen order
source_counts = {}
for doc in all_directory_docs:
    label = doc.metadata.get('source', 'unknown')
    if label in source_counts:
        source_counts[label] += 1
    else:
        source_counts[label] = 1

print("\nDocument sources:")
for label in source_counts:
    print(f"  {label}: {source_counts[label]} documents")

# Extract and build the graph from the combined list
print("\nIndexing directory documents...")
graph_index.extract_and_build(all_directory_docs, show_progress=True)

print("Directory documents indexed successfully!")