# 01 - Combined Extract and Build using extract providers

## Setup

If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook.

## Continuous ingest

See [Continuous ingest](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/lexical-graph/indexing.md#continous-ingest).

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory

# Register the FalkorDB factory with GraphStoreFactory before a graph store is requested.
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

from llama_index.readers.web import SimpleWebPageReader

# Both stores are configured through the GRAPH_STORE and VECTOR_STORE
# environment variables (typically supplied by the .env file loaded with %dotenv).
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])

graph_index = LexicalGraphIndex(
    graph_store,
    vector_store
)

# Web pages to ingest.
doc_urls = [
    'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html'
]

# Load every page, converting HTML to text and attaching the source URL as
# metadata on each document.
docs = SimpleWebPageReader(
    html_to_text=True,
    metadata_fn=lambda url: {'url': url}
).load_data(doc_urls)

# Run the combined extract-and-build over the loaded documents. This call was
# previously commented out, so the cell reported 'Complete' without ingesting
# anything.
graph_index.extract_and_build(docs, show_progress=True)

print('Complete')

## LlamaIndex reader providers
### WebReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load.readers.llama_providers.web_reader_provider import WebReaderProvider

# Pages that the web reader provider will be asked to read.
doc_urls = [
    'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html'
]

# Register the FalkorDB factory, then create both stores from the
# GRAPH_STORE / VECTOR_STORE environment variables.
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

graph_store = GraphStoreFactory.for_graph_store(os.environ["GRAPH_STORE"])
vector_store = VectorStoreFactory.for_vector_store(os.environ["VECTOR_STORE"])
graph_index = LexicalGraphIndex(graph_store, vector_store)

# Read the pages through the provider, then extract and build in a single call.
provider = WebReaderProvider()
docs = provider.read(doc_urls)
graph_index.extract_and_build(docs, show_progress=True)

print('Complete')

### PDFReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load.readers.llama_providers.pdf_reader_provider import PDFReaderProvider

# Location of the PDF document to ingest.
pdf_path = "pdf/sample.pdf"

# Make the FalkorDB factory known to GraphStoreFactory.
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

# Create the graph and vector stores from their environment settings and
# wrap them in a lexical graph index.
graph_store = GraphStoreFactory.for_graph_store(os.environ["GRAPH_STORE"])
vector_store = VectorStoreFactory.for_vector_store(os.environ["VECTOR_STORE"])
graph_index = LexicalGraphIndex(
    graph_store,
    vector_store,
)

# Read the PDF through the provider and run extract-and-build on the result.
provider = PDFReaderProvider()
docs = provider.read(pdf_path)
graph_index.extract_and_build(docs, show_progress=True)

print("PDF Extraction Complete")


### YouTubeReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load.readers.llama_providers.youtube_reader_provider import YouTubeReaderProvider

# Video handed to the YouTube reader provider.
youtube_url = "https://www.youtube.com/watch?v=YmR2_zlQO5w"

# Make the FalkorDB factory known to GraphStoreFactory.
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

# Create the graph and vector stores from their environment settings and
# wrap them in a lexical graph index.
graph_store = GraphStoreFactory.for_graph_store(os.environ["GRAPH_STORE"])
vector_store = VectorStoreFactory.for_vector_store(os.environ["VECTOR_STORE"])
graph_index = LexicalGraphIndex(
    graph_store,
    vector_store,
)

# Read the video through the provider and run extract-and-build on the result.
provider = YouTubeReaderProvider()
docs = provider.read(youtube_url)
graph_index.extract_and_build(docs, show_progress=True)

print("YouTube Extraction Complete")


### DocxReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load.readers.llama_providers.docx_reader_provider import DocxReaderProvider

# Location of the Word document to ingest.
docx_path = "docs/story.docx"

# Make the FalkorDB factory known to GraphStoreFactory.
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

# Create the graph and vector stores from their environment settings and
# wrap them in a lexical graph index.
graph_store = GraphStoreFactory.for_graph_store(os.environ["GRAPH_STORE"])
vector_store = VectorStoreFactory.for_vector_store(os.environ["VECTOR_STORE"])
graph_index = LexicalGraphIndex(
    graph_store,
    vector_store,
)

# Read the document through the provider and run extract-and-build on the result.
provider = DocxReaderProvider()
docs = provider.read(docx_path)
graph_index.extract_and_build(docs, show_progress=True)

print("Microsoft Word Extraction Complete")


### GitHubReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os
from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load.readers.llama_providers.github_repo_provider import GitHubReaderProvider

# Public repository to ingest and the branch to read from.
github_repo = "https://github.com/awslabs/graphrag-toolkit"
branch = "main"

# Make the FalkorDB factory known to GraphStoreFactory.
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

# Create the graph and vector stores from their environment settings and
# wrap them in a lexical graph index.
graph_store = GraphStoreFactory.for_graph_store(os.environ["GRAPH_STORE"])
vector_store = VectorStoreFactory.for_vector_store(os.environ["VECTOR_STORE"])
graph_index = LexicalGraphIndex(
    graph_store,
    vector_store,
)

# The token is optional: report which access mode will be used.
github_token = os.environ.get("GITHUB_TOKEN")
print(
    "Using authenticated GitHub access with token."
    if github_token
    else "No GITHUB_TOKEN found — using unauthenticated access. You may be rate-limited."
)

# Read the repository branch through the provider and run extract-and-build on the result.
provider = GitHubReaderProvider(github_token=github_token)
docs = provider.read(github_repo, branch=branch)
graph_index.extract_and_build(docs, show_progress=True)

print("GitHub Extraction Complete")


### PPTXReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load.readers.llama_providers.pptx_reader_provider import PPTXReaderProvider

# Location of the PowerPoint deck to ingest.
pptx_path = "pptx/sample.pptx"

# Make the FalkorDB factory known to GraphStoreFactory.
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

# Create the graph and vector stores from their environment settings and
# wrap them in a lexical graph index.
graph_store = GraphStoreFactory.for_graph_store(os.environ["GRAPH_STORE"])
vector_store = VectorStoreFactory.for_vector_store(os.environ["VECTOR_STORE"])
graph_index = LexicalGraphIndex(
    graph_store,
    vector_store,
)

# Read the deck through the provider and run extract-and-build on the result.
provider = PPTXReaderProvider()
docs = provider.read(pptx_path)
graph_index.extract_and_build(docs, show_progress=True)

print("PowerPoint Extraction Complete")


### DirectoryReaderProvider

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load.readers.llama_providers.directory_reader_provider import DirectoryReaderProvider

# Folder to ingest; it should hold a mix of .pdf, .pptx, .docx, etc.
directory_path = "soup/"

# Make the FalkorDB factory known to GraphStoreFactory.
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

# Create the graph and vector stores from their environment settings and
# wrap them in a lexical graph index.
graph_store = GraphStoreFactory.for_graph_store(os.environ["GRAPH_STORE"])
vector_store = VectorStoreFactory.for_vector_store(os.environ["VECTOR_STORE"])
graph_index = LexicalGraphIndex(
    graph_store,
    vector_store,
)

# The provider is given the directory up front, so read() takes no arguments here.
provider = DirectoryReaderProvider(data_dir=directory_path)
docs = provider.read()
graph_index.extract_and_build(docs, show_progress=True)

print("Directory Extraction Complete")


### S3DirectoryReaderProvider (with S3DirectoryReaderConfig)

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load.readers.llama_providers.s3_directory_reader_provider import S3DirectoryReaderProvider
from graphrag_toolkit.lexical_graph.indexing.load.readers.reader_provider_config import S3DirectoryReaderConfig

# Register FalkorDB
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

# Build index from the GRAPH_STORE / VECTOR_STORE environment settings
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])

graph_index = LexicalGraphIndex(graph_store, vector_store)

# S3 configuration.
# The bucket and prefix can be overridden with the S3_BUCKET and S3_PREFIX
# environment variables (for example in the .env file), so the notebook does
# not have to be edited to point at your own bucket. When they are not set,
# the original sample values are used.
s3_config = S3DirectoryReaderConfig(
    bucket=os.environ.get("S3_BUCKET", "ccms-rag-extract-188967239867"),  # Replace it with your actual bucket
    prefix=os.environ.get("S3_PREFIX", "soup/"),                          # Folder inside the bucket
    region="us-east-1",                      # Optional, defaults to us-east-1
    profile=None                             # Optional, use None to rely on default credentials
)

# Extract and build: the provider reads from the configured S3 location,
# and the resulting documents are passed to the index.
provider = S3DirectoryReaderProvider(config=s3_config)
docs = provider.read()

graph_index.extract_and_build(docs, show_progress=True)

print("S3 Directory Extraction Complete")
