From 1fa02a6f800003309bae3a642371a060095031c8 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 28 May 2025 12:44:26 -0400 Subject: [PATCH 1/9] initial commit --- .github/workflows/docs.yml | 26 +++++ .gitignore | 1 + libs/arangodb/doc/Makefile | 20 ++++ libs/arangodb/doc/conf.py | 39 +++++++ libs/arangodb/doc/index.rst | 127 +++++++++++++++++++++ libs/arangodb/doc/langchain-arangodb.ipynb | 0 libs/arangodb/doc/make.bat | 35 ++++++ libs/arangodb/doc/quickstart.rst | 69 +++++++++++ libs/arangodb/doc/requirements.txt | 3 + 9 files changed, 320 insertions(+) create mode 100644 .github/workflows/docs.yml create mode 100644 libs/arangodb/doc/Makefile create mode 100644 libs/arangodb/doc/conf.py create mode 100644 libs/arangodb/doc/index.rst create mode 100644 libs/arangodb/doc/langchain-arangodb.ipynb create mode 100644 libs/arangodb/doc/make.bat create mode 100644 libs/arangodb/doc/quickstart.rst create mode 100644 libs/arangodb/doc/requirements.txt diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..ed8eeff --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,26 @@ +name: docs + +on: + pull_request: + workflow_dispatch: + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Fetch all tags and branches + run: git fetch --prune --unshallow + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install dependencies + run: cd libs/arangodb &&pip install .[dev] && pip install -r doc/requirements.txt + + - name: Generate Sphinx HTML + run: cd libs/arangodb/doc && make html \ No newline at end of file diff --git a/.gitignore b/.gitignore index 45d553b..10cf23e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ __pycache__ .mypy_cache_test .env .venv* +libs/arangodb/doc/_build \ No newline at end of file diff --git a/libs/arangodb/doc/Makefile b/libs/arangodb/doc/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/libs/arangodb/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/libs/arangodb/doc/conf.py b/libs/arangodb/doc/conf.py new file mode 100644 index 0000000..e45051f --- /dev/null +++ b/libs/arangodb/doc/conf.py @@ -0,0 +1,39 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +import os +import sys + +sys.path.insert(0, os.path.abspath("..")) + +project = 'langchain-arangodb' +copyright = '2025, ArangoDB' +author = 'ArangoDB' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx_rtd_theme", + "sphinx.ext.autodoc", + "sphinx.ext.viewcode", + "sphinx.ext.autosummary", + "sphinx.ext.inheritance_diagram", +] +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'sphinx_rtd_theme' +html_static_path = ['_static'] +autodoc_member_order = "bysource" +autodoc_inherit_docstrings = True +autosummary_generate = True diff --git a/libs/arangodb/doc/index.rst b/libs/arangodb/doc/index.rst new file mode 100644 index 0000000..a145453 --- /dev/null +++ b/libs/arangodb/doc/index.rst @@ -0,0 +1,127 @@ +langchain-arangodb HELLO WORLD!!! +============ + +.. raw:: html + +
+ + NetworkX + + + ArangoDB + + + RAPIDS + + + NVIDIA + +
+ +.. raw:: html + +
+ +.. image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/arangodb/nx-arangodb/blob/main/doc/nx_arangodb.ipynb + :alt: Open In Colab + +.. image:: https://dl.circleci.com/status-badge/img/gh/arangodb/nx-arangodb/tree/main.svg?style=svg + :target: https://dl.circleci.com/status-badge/redirect/gh/arangodb/nx-arangodb/tree/main + :alt: CircleCI + +.. image:: https://github.com/arangodb/nx-arangodb/actions/workflows/analyze.yml/badge.svg + :target: https://github.com/arangodb/nx-arangodb/actions/workflows/analyze.yml + :alt: CodeQL + +.. image:: https://github.com/arangodb/nx-arangodb/actions/workflows/docs.yaml/badge.svg + :target: https://github.com/arangodb/nx-arangodb/actions/workflows/docs.yaml + :alt: Docs + +.. raw:: html + +
+ +.. image:: https://img.shields.io/pypi/v/nx-arangodb?color=3775A9&style=for-the-badge&logo=pypi&logoColor=FFD43B + :target: https://pypi.org/project/nx-arangodb/ + :alt: PyPI version badge + +.. image:: https://img.shields.io/pypi/pyversions/nx-arangodb?color=3776AB&style=for-the-badge&logo=python&logoColor=FFD43B + :target: https://pypi.org/project/nx-arangodb/ + :alt: Python versions badge + +.. raw:: html + +
+ +.. image:: https://img.shields.io/github/license/arangodb/nx-arangodb?color=9E2165&style=for-the-badge + :target: https://github.com/arangodb/nx-arangodb/blob/main/LICENSE + :alt: License + +.. image:: https://img.shields.io/static/v1?style=for-the-badge&label=code%20style&message=black&color=black + :target: https://github.com/psf/black + :alt: Code style: black + +.. image:: https://img.shields.io/pepy/dt/nx-arangodb?style=for-the-badge&color=282661 + :target: https://pepy.tech/project/nx-arangodb + :alt: Downloads + +This is a `backend to NetworkX `_ that offers `ArangoDB `_ as a `Persistence Layer to NetworkX Graphs `_: + +1. Persist NetworkX Graphs to ArangoDB. +2. Reload NetworkX Graphs from ArangoDB. +3. Perform CRUD on ArangoDB Graphs via NetworkX. +4. Run algorithms (CPU & GPU) on ArangoDB Graphs via NetworkX. + +Benefits of having ArangoDB as a backend to NetworkX include: + +1. No need to re-create the graph every time you start a new session. +2. Access to GPU-accelerated graph analytics (`nx-cugraph `_). +3. Access to a database query language (`Arango Query Language `_). +4. Access to a visual interface for graph exploration (`ArangoDB Web UI `_). +5. Access to cross-collaboration on the same graph (`ArangoDB Cloud `_). +6. Access to efficient distribution of graph data (`ArangoDB SmartGraphs `_). + +.. image:: ./_static/nxadb.png + :align: center + :alt: nx-arangodb Diagram + :height: 200px + +Requirements +------------ +- Python 3.10+ +- NetworkX 3.0+ +- ArangoDB 3.10+ + +Installation +------------ + +Latest Release + +.. code-block:: + + pip install nx-arangodb + +Current State + +.. code-block:: + + pip install git+https://github.com/arangodb/nx-arangodb + +Contents +-------- + +The UX of NetworkX-ArangoDB is similar to that of NetworkX, but with the +added functionality of persisting graphs to ArangoDB. For an understanding +of how to use NetworkX, refer to the `NetworkX Documentation `_. + +Expect documentation to grow over time: + +.. toctree:: + :maxdepth: 2 + + quickstart + classes/index + dict/index + algorithms/index + views/index \ No newline at end of file diff --git a/libs/arangodb/doc/langchain-arangodb.ipynb b/libs/arangodb/doc/langchain-arangodb.ipynb new file mode 100644 index 0000000..e69de29 diff --git a/libs/arangodb/doc/make.bat b/libs/arangodb/doc/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/libs/arangodb/doc/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/libs/arangodb/doc/quickstart.rst b/libs/arangodb/doc/quickstart.rst new file mode 100644 index 0000000..3a83b4c --- /dev/null +++ b/libs/arangodb/doc/quickstart.rst @@ -0,0 +1,69 @@ +Quickstart +========== + +1. Set up ArangoDB +2. Set environment variables +3. 
Instantiate an ArangoGraph
+4. Instantiate an ArangoGraphQAChain
+5. Instantiate a VectorStore
+
+1. Set up ArangoDB
+------------------
+
+**Option A: Local Instance via Docker**
+
+The instance runs at ``localhost:8529`` with user ``root`` and password ``openSesame``.
+
+More info: `arangodb.com/download-major <https://arangodb.com/download-major>`_.
+
+.. code-block:: bash
+
+    docker run -e ARANGO_ROOT_PASSWORD=openSesame -p 8529:8529 arangodb/arangodb
+
+**Option B: ArangoDB Cloud Trial**
+
+ArangoGraph is ArangoDB's managed cloud service.
+
+A 14-day trial is available upon sign-up.
+
+**Option C: Temporary Cloud Instance via Python**
+
+A temporary cloud database can be provisioned using the ``adb-cloud-connector`` Python package.
+
+.. code-block:: bash
+
+    pip install adb-cloud-connector
+
+.. code-block:: python
+
+    from adb_cloud_connector import get_temp_credentials
+
+    credentials = get_temp_credentials()
+
+    print(credentials)
+
+2. Set environment variables
+----------------------------
+
+Set your LLM environment variables:
+
+.. code-block:: bash
+
+    export OPENAI_API_KEY=sk-proj-....
+
+Or via Python:
+
+.. code-block:: python
+
+    import os
+    os.environ["OPENAI_API_KEY"] = "sk-proj-...."
+
+3. Instantiate an ArangoGraph
+----------------------------------------
+
+4. Instantiate an ArangoGraphQAChain
+----------------------------------------
+
+5. Instantiate a VectorStore
+----------------------------------------
+
+...
\ No newline at end of file
diff --git a/libs/arangodb/doc/requirements.txt b/libs/arangodb/doc/requirements.txt
new file mode 100644
index 0000000..d4f677a
--- /dev/null
+++ b/libs/arangodb/doc/requirements.txt
@@ -0,0 +1,3 @@
+sphinx
+sphinx_rtd_theme
+langchain-arangodb
\ No newline at end of file

From 20a53c1da4f26c2ea60b796499657129df19c101 Mon Sep 17 00:00:00 2001
From: Anthony Mahanna
Date: Wed, 28 May 2025 12:54:41 -0400
Subject: [PATCH 2/9] docs skeleton

---
 .gitignore                              |  3 +-
 .readthedocs.yaml                       | 29 +++++++++++++
 docs/api_reference.rst                  | 44 ++++++++++++++++++++
 docs/chains.rst                         | 50 ++++++++++++++++++++++
 docs/chat_message_histories.rst         | 53 ++++++++++++++++++++++++
 docs/graphs.rst                         | 52 +++++++++++++++++++++++
 docs/index.rst                          | 23 +++++++++++
 docs/installation.rst                   | 32 ++++++++++++++
 docs/query_constructors.rst             | 53 ++++++++++++++++++++++++
 docs/vectorstores.rst                   | 55 +++++++++++++++++++++++++
 libs/arangodb/doc/index.rst             |  5 +--
 libs/arangodb/doc/mydirectory/index.rst |  4 ++
 libs/arangodb/pyproject.toml            |  3 ++
 13 files changed, 401 insertions(+), 5 deletions(-)
 create mode 100644 .readthedocs.yaml
 create mode 100644 docs/api_reference.rst
 create mode 100644 docs/chains.rst
 create mode 100644 docs/chat_message_histories.rst
 create mode 100644 docs/graphs.rst
 create mode 100644 docs/index.rst
 create mode 100644 docs/installation.rst
 create mode 100644 docs/query_constructors.rst
 create mode 100644 docs/vectorstores.rst
 create mode 100644 libs/arangodb/doc/mydirectory/index.rst

diff --git a/.gitignore b/.gitignore
index 10cf23e..44cc9c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@ __pycache__
 .mypy_cache_test
 .env
 .venv*
-libs/arangodb/doc/_build
\ No newline at end of file
+libs/arangodb/doc/_build
+.DS_Store
\ No newline at end of file
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..8678149
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,29 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
+build: + os: ubuntu-22.04 + tools: + python: "3.12" + +# Build documentation in the "doc/" directory with Sphinx +sphinx: + configuration: libs/arangodb/doc/conf.py + fail_on_warning: false + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: libs/arangodb/doc/requirements.txt \ No newline at end of file diff --git a/docs/api_reference.rst b/docs/api_reference.rst new file mode 100644 index 0000000..e33cd9e --- /dev/null +++ b/docs/api_reference.rst @@ -0,0 +1,44 @@ +API Reference +============ + +This section provides detailed API documentation for all modules in LangChain ArangoDB. + +Vector Stores +------------ + +.. automodule:: langchain_arangodb.vectorstores + :members: + :undoc-members: + :show-inheritance: + +Chat Message Histories +-------------------- + +.. automodule:: langchain_arangodb.chat_message_histories + :members: + :undoc-members: + :show-inheritance: + +Graphs +------ + +.. automodule:: langchain_arangodb.graphs + :members: + :undoc-members: + :show-inheritance: + +Chains +------ + +.. automodule:: langchain_arangodb.chains + :members: + :undoc-members: + :show-inheritance: + +Query Constructors +---------------- + +.. automodule:: langchain_arangodb.query_constructors + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/chains.rst b/docs/chains.rst new file mode 100644 index 0000000..a247847 --- /dev/null +++ b/docs/chains.rst @@ -0,0 +1,50 @@ +Chains +====== + +LangChain ArangoDB provides chain implementations that integrate with ArangoDB for various operations. + +ArangoDBChain +------------ + +The main chain implementation that uses ArangoDB for storing and retrieving chain data. + +.. code-block:: python + + from langchain_arangodb.chains import ArangoDBChain + from langchain.llms import OpenAI + + # Initialize the chain + chain = ArangoDBChain( + llm=OpenAI(), + arango_url="http://localhost:8529", + username="root", + password="", + database="langchain", + collection_name="chain_data" + ) + + # Run the chain + result = chain.run("What is the capital of France?") + +Features +-------- + +- Chain execution with ArangoDB storage +- Integration with LangChain's chain interfaces +- Support for various chain types +- Persistent storage of chain data +- Configurable chain parameters + +Configuration Options +-------------------- + +The chain implementation can be configured with various options: + +- ``llm``: The language model to use +- ``arango_url``: URL of the ArangoDB instance +- ``username``: ArangoDB username +- ``password``: ArangoDB password +- ``database``: Database name +- ``collection_name``: Collection name for storing chain data +- ``chain_type``: Type of chain to use +- ``chain_kwargs``: Additional chain parameters \ No newline at end of file diff --git a/docs/chat_message_histories.rst b/docs/chat_message_histories.rst new file mode 100644 index 0000000..a135047 --- /dev/null +++ b/docs/chat_message_histories.rst @@ -0,0 +1,53 @@ +Chat Message Histories +==================== + +LangChain ArangoDB provides chat message history implementations that allow you to store and retrieve chat messages using ArangoDB. 
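+
+These histories plug directly into LangChain's memory classes. A minimal
+sketch, assuming the ``ArangoDBChatMessageHistory`` constructor documented
+below:
+
+.. code-block:: python
+
+    from langchain.memory import ConversationBufferMemory
+    from langchain_arangodb.chat_message_histories import ArangoDBChatMessageHistory
+
+    history = ArangoDBChatMessageHistory(
+        arango_url="http://localhost:8529",
+        username="root",
+        password="",
+        database="langchain",
+        collection_name="chat_history",
+        session_id="user123",
+    )
+
+    # The persisted messages back the conversation buffer
+    memory = ConversationBufferMemory(chat_memory=history, return_messages=True)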
+ +ArangoDBChatMessageHistory +------------------------- + +The main chat message history implementation that uses ArangoDB for storing and retrieving chat messages. + +.. code-block:: python + + from langchain_arangodb.chat_message_histories import ArangoDBChatMessageHistory + from langchain.schema import HumanMessage, AIMessage + + # Initialize the chat message history + history = ArangoDBChatMessageHistory( + arango_url="http://localhost:8529", + username="root", + password="", + database="langchain", + collection_name="chat_history", + session_id="user123" + ) + + # Add messages + history.add_user_message("Hello!") + history.add_ai_message("Hi there!") + + # Get all messages + messages = history.messages + +Features +-------- + +- Persistent storage of chat messages +- Session-based message organization +- Support for different message types +- Efficient message retrieval +- Integration with LangChain's chat interfaces + +Configuration Options +-------------------- + +The chat message history can be configured with various options: + +- ``arango_url``: URL of the ArangoDB instance +- ``username``: ArangoDB username +- ``password``: ArangoDB password +- ``database``: Database name +- ``collection_name``: Collection name for storing messages +- ``session_id``: Unique identifier for the chat session +- ``ttl``: Time-to-live for messages (optional) \ No newline at end of file diff --git a/docs/graphs.rst b/docs/graphs.rst new file mode 100644 index 0000000..9946367 --- /dev/null +++ b/docs/graphs.rst @@ -0,0 +1,52 @@ +Graphs +====== + +LangChain ArangoDB provides graph implementations that allow you to work with graph data in ArangoDB. + +ArangoDBGraph +------------ + +The main graph implementation that uses ArangoDB for storing and querying graph data. + +.. code-block:: python + + from langchain_arangodb.graphs import ArangoDBGraph + + # Initialize the graph + graph = ArangoDBGraph( + arango_url="http://localhost:8529", + username="root", + password="", + database="langchain", + graph_name="knowledge_graph" + ) + + # Add nodes and edges + graph.add_node("person", {"name": "John", "age": 30}) + graph.add_node("person", {"name": "Alice", "age": 25}) + graph.add_edge("knows", "person/John", "person/Alice") + + # Query the graph + results = graph.query("FOR v IN person RETURN v") + +Features +-------- + +- Graph data modeling +- Node and edge management +- AQL query support +- Graph traversal capabilities +- Integration with LangChain's graph interfaces + +Configuration Options +-------------------- + +The graph implementation can be configured with various options: + +- ``arango_url``: URL of the ArangoDB instance +- ``username``: ArangoDB username +- ``password``: ArangoDB password +- ``database``: Database name +- ``graph_name``: Name of the graph +- ``edge_definitions``: Edge collection definitions +- ``orphan_collections``: Collections that can contain orphan vertices \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..5a3b537 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,23 @@ +Welcome to LangChain ArangoDB's documentation! +========================================= + +LangChain ArangoDB is a Python package that provides ArangoDB integrations for LangChain, enabling vector storage, graph operations, and chat message history management. + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + installation + vectorstores + chat_message_histories + graphs + chains + query_constructors + api_reference + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` \ No newline at end of file diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..0add556 --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,32 @@ +Installation +============ + +You can install LangChain ArangoDB using pip: + +.. code-block:: bash + + pip install langchain-arangodb + +Or using poetry: + +.. code-block:: bash + + poetry add langchain-arangodb + +Requirements +----------- + +- Python 3.8+ +- ArangoDB 3.9+ +- LangChain + +Configuration +------------ + +To use LangChain ArangoDB, you'll need to have an ArangoDB instance running. You can either: + +1. Use a local ArangoDB instance +2. Use ArangoDB Oasis (cloud service) +3. Use a self-hosted ArangoDB instance + +The connection details will be required when initializing the various components. \ No newline at end of file diff --git a/docs/query_constructors.rst b/docs/query_constructors.rst new file mode 100644 index 0000000..fb782fe --- /dev/null +++ b/docs/query_constructors.rst @@ -0,0 +1,53 @@ +Query Constructors +================ + +LangChain ArangoDB provides query constructor implementations that help build AQL queries for ArangoDB. + +ArangoDBQueryConstructor +----------------------- + +The main query constructor implementation that helps build AQL queries. + +.. code-block:: python + + from langchain_arangodb.query_constructors import ArangoDBQueryConstructor + + # Initialize the query constructor + constructor = ArangoDBQueryConstructor( + collection_name="documents", + filter_fields=["category", "tags"], + sort_fields=["created_at", "updated_at"] + ) + + # Build a query + query = constructor.construct_query( + filter_criteria={ + "category": "news", + "tags": ["important", "urgent"] + }, + sort_by="created_at", + sort_order="DESC", + limit=10 + ) + +Features +-------- + +- AQL query construction +- Support for filtering +- Support for sorting +- Support for pagination +- Support for aggregation +- Integration with LangChain's query interfaces + +Configuration Options +-------------------- + +The query constructor can be configured with various options: + +- ``collection_name``: Name of the collection to query +- ``filter_fields``: Fields that can be used for filtering +- ``sort_fields``: Fields that can be used for sorting +- ``default_limit``: Default number of results to return +- ``default_sort_field``: Default field to sort by +- ``default_sort_order``: Default sort order (ASC/DESC) \ No newline at end of file diff --git a/docs/vectorstores.rst b/docs/vectorstores.rst new file mode 100644 index 0000000..3d7989f --- /dev/null +++ b/docs/vectorstores.rst @@ -0,0 +1,55 @@ +Vector Stores +============ + +LangChain ArangoDB provides vector store implementations that allow you to store and retrieve embeddings using ArangoDB. + +ArangoDBVectorStore +------------------ + +The main vector store implementation that uses ArangoDB for storing and retrieving vector embeddings. + +.. 
code-block:: python + + from langchain_arangodb.vectorstores import ArangoDBVectorStore + from langchain.embeddings import OpenAIEmbeddings + + # Initialize the vector store + vectorstore = ArangoDBVectorStore( + embedding=OpenAIEmbeddings(), + arango_url="http://localhost:8529", + username="root", + password="", + database="langchain", + collection_name="vectors" + ) + + # Add texts to the vector store + texts = ["Hello world", "How are you"] + vectorstore.add_texts(texts) + + # Search for similar texts + results = vectorstore.similarity_search("Hello", k=2) + +Features +-------- + +- Efficient vector similarity search +- Support for metadata filtering +- Batch operations for adding texts +- Configurable collection settings +- Integration with LangChain's embedding interfaces + +Configuration Options +-------------------- + +The vector store can be configured with various options: + +- ``embedding``: The embedding model to use +- ``arango_url``: URL of the ArangoDB instance +- ``username``: ArangoDB username +- ``password``: ArangoDB password +- ``database``: Database name +- ``collection_name``: Collection name for storing vectors +- ``index_name``: Name of the vector index (default: "vector_index") +- ``index_type``: Type of vector index to use +- ``index_fields``: Fields to include in the index \ No newline at end of file diff --git a/libs/arangodb/doc/index.rst b/libs/arangodb/doc/index.rst index a145453..cfdbe49 100644 --- a/libs/arangodb/doc/index.rst +++ b/libs/arangodb/doc/index.rst @@ -121,7 +121,4 @@ Expect documentation to grow over time: :maxdepth: 2 quickstart - classes/index - dict/index - algorithms/index - views/index \ No newline at end of file + mydirectory/index \ No newline at end of file diff --git a/libs/arangodb/doc/mydirectory/index.rst b/libs/arangodb/doc/mydirectory/index.rst new file mode 100644 index 0000000..7e344ae --- /dev/null +++ b/libs/arangodb/doc/mydirectory/index.rst @@ -0,0 +1,4 @@ +.. _mydirectory: + +Hello World +============ \ No newline at end of file diff --git a/libs/arangodb/pyproject.toml b/libs/arangodb/pyproject.toml index 9350b88..f890022 100644 --- a/libs/arangodb/pyproject.toml +++ b/libs/arangodb/pyproject.toml @@ -93,3 +93,6 @@ markers = [ "compile: mark placeholder test used to compile integration tests without running them", ] asyncio_mode = "auto" + +[tool.poetry.extras] +docs = ["sphinx", "sphinx-rtd-theme", "myst-parser"] From 3a831711b15e5aa5793770b6dfad12d601bb0541 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli Date: Mon, 9 Jun 2025 08:13:22 -0700 Subject: [PATCH 3/9] Initial vector stores docs --- libs/arangodb/doc/index.rst | 142 ++++---- libs/arangodb/doc/vectorstores.rst | 536 +++++++++++++++++++++++++++++ 2 files changed, 597 insertions(+), 81 deletions(-) create mode 100644 libs/arangodb/doc/vectorstores.rst diff --git a/libs/arangodb/doc/index.rst b/libs/arangodb/doc/index.rst index cfdbe49..4170e35 100644 --- a/libs/arangodb/doc/index.rst +++ b/libs/arangodb/doc/index.rst @@ -1,124 +1,104 @@ -langchain-arangodb HELLO WORLD!!! -============ +LangChain ArangoDB +================== + +LangChain ArangoDB is a Python package that provides ArangoDB integrations for LangChain, enabling vector storage, graph operations, and chat message history management. .. raw:: html .. raw:: html
-.. image:: https://colab.research.google.com/assets/colab-badge.svg - :target: https://colab.research.google.com/github/arangodb/nx-arangodb/blob/main/doc/nx_arangodb.ipynb - :alt: Open In Colab - -.. image:: https://dl.circleci.com/status-badge/img/gh/arangodb/nx-arangodb/tree/main.svg?style=svg - :target: https://dl.circleci.com/status-badge/redirect/gh/arangodb/nx-arangodb/tree/main - :alt: CircleCI - -.. image:: https://github.com/arangodb/nx-arangodb/actions/workflows/analyze.yml/badge.svg - :target: https://github.com/arangodb/nx-arangodb/actions/workflows/analyze.yml - :alt: CodeQL - -.. image:: https://github.com/arangodb/nx-arangodb/actions/workflows/docs.yaml/badge.svg - :target: https://github.com/arangodb/nx-arangodb/actions/workflows/docs.yaml - :alt: Docs - -.. raw:: html - -
- -.. image:: https://img.shields.io/pypi/v/nx-arangodb?color=3775A9&style=for-the-badge&logo=pypi&logoColor=FFD43B - :target: https://pypi.org/project/nx-arangodb/ - :alt: PyPI version badge - -.. image:: https://img.shields.io/pypi/pyversions/nx-arangodb?color=3776AB&style=for-the-badge&logo=python&logoColor=FFD43B - :target: https://pypi.org/project/nx-arangodb/ - :alt: Python versions badge - -.. raw:: html - -
- -.. image:: https://img.shields.io/github/license/arangodb/nx-arangodb?color=9E2165&style=for-the-badge - :target: https://github.com/arangodb/nx-arangodb/blob/main/LICENSE - :alt: License - -.. image:: https://img.shields.io/static/v1?style=for-the-badge&label=code%20style&message=black&color=black - :target: https://github.com/psf/black - :alt: Code style: black - -.. image:: https://img.shields.io/pepy/dt/nx-arangodb?style=for-the-badge&color=282661 - :target: https://pepy.tech/project/nx-arangodb - :alt: Downloads +Key Features +------------ -This is a `backend to NetworkX `_ that offers `ArangoDB `_ as a `Persistence Layer to NetworkX Graphs `_: +LangChain ArangoDB provides comprehensive integrations for building AI applications: -1. Persist NetworkX Graphs to ArangoDB. -2. Reload NetworkX Graphs from ArangoDB. -3. Perform CRUD on ArangoDB Graphs via NetworkX. -4. Run algorithms (CPU & GPU) on ArangoDB Graphs via NetworkX. +**Vector Operations** + - High-performance vector similarity search + - Support for multiple distance metrics (cosine, Euclidean) + - Approximate and exact nearest neighbor search + - Maximal marginal relevance (MMR) search for diverse results -Benefits of having ArangoDB as a backend to NetworkX include: +**Graph Operations** + - Knowledge graph construction and querying + - Graph-based question answering chains + - Integration with LangChain's graph interfaces -1. No need to re-create the graph every time you start a new session. -2. Access to GPU-accelerated graph analytics (`nx-cugraph `_). -3. Access to a database query language (`Arango Query Language `_). -4. Access to a visual interface for graph exploration (`ArangoDB Web UI `_). -5. Access to cross-collaboration on the same graph (`ArangoDB Cloud `_). -6. Access to efficient distribution of graph data (`ArangoDB SmartGraphs `_). +**Chat Memory** + - Persistent chat message history storage + - Session-based conversation management + - Efficient message retrieval and filtering -.. image:: ./_static/nxadb.png - :align: center - :alt: nx-arangodb Diagram - :height: 200px +**Query Construction** + - AQL (ArangoDB Query Language) integration + - Structured query generation from natural language Requirements ------------ - Python 3.10+ -- NetworkX 3.0+ +- LangChain - ArangoDB 3.10+ +- python-arango Installation ------------ Latest Release -.. code-block:: +.. code-block:: bash + + pip install langchain-arangodb - pip install nx-arangodb +Current Development State -Current State +.. code-block:: bash -.. code-block:: + pip install git+https://github.com/arangodb/langchain-arangodb - pip install git+https://github.com/arangodb/nx-arangodb +Quick Start +----------- -Contents --------- +.. code-block:: python -The UX of NetworkX-ArangoDB is similar to that of NetworkX, but with the -added functionality of persisting graphs to ArangoDB. For an understanding -of how to use NetworkX, refer to the `NetworkX Documentation `_. + from arango import ArangoClient + from langchain_openai import OpenAIEmbeddings + from langchain_arangodb.vectorstores import ArangoVector -Expect documentation to grow over time: + # Connect to ArangoDB + client = ArangoClient("http://localhost:8529") + db = client.db("langchain", username="root", password="openSesame") + + # Create vector store + vectorstore = ArangoVector.from_texts( + texts=["Hello world", "LangChain with ArangoDB"], + embedding=OpenAIEmbeddings(), + database=db + ) + + # Search + results = vectorstore.similarity_search("greeting", k=1) + +Documentation Contents .. 
toctree:: :maxdepth: 2 + :caption: User Guide: quickstart + vectorstores + +.. toctree:: + :maxdepth: 2 + :caption: Advanced: + mydirectory/index \ No newline at end of file diff --git a/libs/arangodb/doc/vectorstores.rst b/libs/arangodb/doc/vectorstores.rst new file mode 100644 index 0000000..bf7b571 --- /dev/null +++ b/libs/arangodb/doc/vectorstores.rst @@ -0,0 +1,536 @@ +Vector Stores +============ + +LangChain ArangoDB provides powerful vector store implementations that allow you to store, index, and retrieve embeddings using ArangoDB's native vector search capabilities. + +Overview +-------- + +The ``ArangoVector`` class is the main vector store implementation that integrates with LangChain's embedding interfaces and provides: + +- Efficient vector similarity search with cosine and Euclidean distance metrics +- Approximate and exact nearest neighbor search +- Maximal marginal relevance (MMR) search for diverse results +- Batch operations for adding and managing documents +- Configurable vector indexing with customizable parameters +- Integration with ArangoDB's distributed architecture + +Quick Start +----------- + +.. code-block:: python + + from arango import ArangoClient + from langchain_openai import OpenAIEmbeddings + from langchain_arangodb.vectorstores import ArangoVector + + # Connect to ArangoDB + client = ArangoClient("http://localhost:8529") + db = client.db("langchain", username="root", password="openSesame") + + # Initialize embeddings + embeddings = OpenAIEmbeddings(model="text-embedding-3-small") + + # Create vector store from texts + texts = [ + "ArangoDB is a multi-model database", + "LangChain provides tools for AI applications", + "Vector search enables semantic similarity" + ] + + vectorstore = ArangoVector.from_texts( + texts=texts, + embedding=embeddings, + database=db, + collection_name="my_documents" + ) + + # Search for similar documents + results = vectorstore.similarity_search("database technology", k=2) + for doc in results: + print(doc.page_content) + +Configuration +------------- + +Constructor Parameters +~~~~~~~~~~~~~~~~~~~~~ + +.. 
py:class:: ArangoVector(embedding, embedding_dimension, database, **kwargs)
+
+   :param embedding: Any embedding function implementing LangChain's Embeddings interface
+   :param embedding_dimension: The dimension of the embedding vectors (must match your embedding model)
+   :param database: ArangoDB database instance from python-arango
+   :param collection_name: Name of the collection to store documents (default: "documents")
+   :param search_type: Type of search to perform - ``"vector"`` (default) or ``"hybrid"`` (see Hybrid Search below)
+   :param embedding_field: Field name for storing embedding vectors (default: "embedding")
+   :param text_field: Field name for storing text content (default: "text")
+   :param index_name: Name of the vector index (default: "vector_index")
+   :param distance_strategy: Distance metric for similarity calculation (default: "COSINE")
+   :param num_centroids: Number of centroids for vector index clustering (default: 1)
+   :param relevance_score_fn: Custom function to normalize relevance scores (optional)
+
+Distance Strategies
+~~~~~~~~~~~~~~~~~~~
+
+The vector store supports multiple distance metrics:
+
+- **COSINE**: Cosine similarity (default) - good for normalized vectors
+- **EUCLIDEAN_DISTANCE**: L2 distance - good for absolute distance measurements
+- **MAX_INNER_PRODUCT**: Maximum inner product similarity
+- **DOT_PRODUCT**: Dot product similarity
+- **JACCARD**: Jaccard similarity coefficient
+
+*Note: Currently only COSINE and EUCLIDEAN_DISTANCE are fully supported in the vector search implementation.*
+
+.. code-block:: python
+
+    from langchain_arangodb.vectorstores.utils import DistanceStrategy
+
+    # Using Euclidean distance
+    vectorstore = ArangoVector(
+        embedding=embeddings,
+        embedding_dimension=1536,
+        database=db,
+        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE
+    )
+
+Search Methods
+--------------
+
+Basic Similarity Search
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    # Simple similarity search
+    results = vectorstore.similarity_search("your query", k=5)
+
+    # Search with additional fields returned
+    results = vectorstore.similarity_search(
+        "your query",
+        k=5,
+        return_fields={"metadata_field", "custom_field"}
+    )
+
+    # Search with a pre-computed query embedding
+    custom_embedding = embeddings.embed_query("your query")
+    results = vectorstore.similarity_search_by_vector(custom_embedding, k=5)
+
+Search with Scores
+~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    # Get similarity scores with results
+    docs_and_scores = vectorstore.similarity_search_with_score("your query", k=5)
+
+    for doc, score in docs_and_scores:
+        print(f"Score: {score:.3f} - Content: {doc.page_content[:100]}...")
+
+Maximal Marginal Relevance (MMR)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+MMR search helps ensure diverse results by balancing relevance and diversity:
+
+.. code-block:: python
+
+    # MMR search for diverse results
+    diverse_results = vectorstore.max_marginal_relevance_search(
+        query="your query",
+        k=5,             # Number of final results
+        fetch_k=20,      # Number of initial candidates to fetch
+        lambda_mult=0.5  # 1 = pure relevance, 0 = maximum diversity
+    )
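+
+Internally, MMR re-ranks the ``fetch_k`` candidates by trading query relevance
+off against redundancy with the results already selected. A sketch of the
+standard objective, where ``lambda_mult`` plays the role of :math:`\lambda`:
+
+.. math::
+
+   \operatorname{MMR}(d) = \lambda \cdot \operatorname{sim}(d, q) - (1 - \lambda) \cdot \max_{s \in S} \operatorname{sim}(d, s)
+
+so ``lambda_mult=1`` reduces to plain similarity ranking, while ``lambda_mult=0``
+maximizes diversity among the returned documents.
+
+Approximate vs Exact Search
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. 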
code-block:: python + + # Approximate search (faster, requires ArangoDB >= 3.12.4) + results = vectorstore.similarity_search("query", use_approx=True) + + # Exact search (slower but precise) + results = vectorstore.similarity_search("query", use_approx=False) + +Hybrid Search +~~~~~~~~~~~~~ + +Hybrid search combines vector similarity with traditional keyword search using Reciprocal Rank Fusion (RRF), providing more comprehensive and accurate results: + +.. code-block:: python + + from langchain_arangodb.vectorstores import ArangoVector, SearchType + + # Create vector store with hybrid search enabled + vectorstore = ArangoVector.from_texts( + texts=["Machine learning algorithms", "AI-powered applications"], + embedding=embeddings, + database=db, + search_type=SearchType.HYBRID, + insert_text=True, # Required for hybrid search + ) + + # Create both vector and keyword indexes + vectorstore.create_vector_index() + vectorstore.create_keyword_index() + + # Perform hybrid search + results = vectorstore.similarity_search_with_score( + query="AI technology", + k=3, + search_type=SearchType.HYBRID, + vector_weight=1.0, # Weight for vector similarity + keyword_weight=1.0, # Weight for keyword matching + ) + +**Hybrid Search Parameters:** + +- ``search_type=SearchType.HYBRID``: Enables hybrid search mode +- ``vector_weight``: Weight for vector similarity scores (default: 1.0) +- ``keyword_weight``: Weight for keyword search scores (default: 1.0) +- ``rrf_search_limit``: Number of top results to consider for RRF fusion (default: 50) +- ``keyword_search_clause``: Custom AQL search clause for keyword matching (optional) + +**Custom Keyword Search:** + +.. code-block:: python + + # Custom keyword search with metadata filtering + custom_keyword_clause = f""" + SEARCH ANALYZER( + doc.{vectorstore.text_field} IN TOKENS(@query, @analyzer), + @analyzer + ) AND doc.category == "technology" + """ + + results = vectorstore.similarity_search_with_score( + query="machine learning", + k=5, + search_type=SearchType.HYBRID, + keyword_search_clause=custom_keyword_clause, + ) + +**Keyword Index Management:** + +.. code-block:: python + + # Create keyword index for hybrid search + vectorstore.create_keyword_index() + + # Check if keyword index exists + keyword_index = vectorstore.retrieve_keyword_index() + if keyword_index: + print(f"Keyword index: {keyword_index['name']}") + + # Delete keyword index + vectorstore.delete_keyword_index() + +**Weight Balancing Examples:** + +.. code-block:: python + + # Favor vector similarity (semantic search) + semantic_results = vectorstore.similarity_search_with_score( + query="artificial intelligence", + k=3, + search_type=SearchType.HYBRID, + vector_weight=10.0, + keyword_weight=1.0, + ) + + # Favor keyword matching (traditional search) + keyword_results = vectorstore.similarity_search_with_score( + query="machine learning algorithms", + k=3, + search_type=SearchType.HYBRID, + vector_weight=1.0, + keyword_weight=10.0, + ) + + # Balanced hybrid approach + balanced_results = vectorstore.similarity_search_with_score( + query="AI applications", + k=3, + search_type=SearchType.HYBRID, + vector_weight=1.0, + keyword_weight=1.0, + ) + +Document Management +------------------ + +Adding Documents +~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + # Add texts with metadata + texts = ["Document 1", "Document 2"] + metadatas = [{"category": "tech"}, {"category": "science"}] + ids = vectorstore.add_texts(texts, metadatas=metadatas) + + # Add pre-computed embeddings + embeddings_list = [embedding.embed_query(text) for text in texts] + ids = vectorstore.add_embeddings( + texts=texts, + embeddings=embeddings_list, + metadatas=metadatas, + ids=["custom_id_1", "custom_id_2"], # Optional custom IDs + batch_size=1000, # Custom batch size + use_async_db=True # Use async operations + ) + + # IDs are automatically generated using farmhash if not provided + # This ensures consistent, deterministic IDs based on content + auto_ids = vectorstore.add_texts(["New document"]) + print(f"Auto-generated ID: {auto_ids[0]}") + +Retrieving Documents +~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Get documents by IDs + documents = vectorstore.get_by_ids(["doc_id_1", "doc_id_2"]) + +Deleting Documents +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Delete specific documents + vectorstore.delete(ids=["doc_id_1", "doc_id_2"]) + + # Delete all documents in collection + vectorstore.delete() + +Advanced Configuration +--------------------- + +Vector Index Management +~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Check if vector index exists + index_info = vectorstore.retrieve_vector_index() + if index_info: + print(f"Index exists: {index_info['name']}") + + # Create vector index manually + vectorstore.create_vector_index() + + # Delete vector index + vectorstore.delete_vector_index() + +Batch Operations +~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Large batch insertion with custom batch size + large_texts = ["text"] * 10000 + vectorstore.add_texts( + texts=large_texts, + batch_size=1000, # Process in batches of 1000 + use_async_db=True # Use async operations for better performance + ) + +Custom Collection Setup +~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Initialize with specific collection settings + vectorstore = ArangoVector( + embedding=embeddings, + embedding_dimension=1536, + database=db, + collection_name="custom_vectors", + index_name="my_vector_index", + num_centroids=10, # More centroids for larger datasets + search_type=SearchType.VECTOR, # or SearchType.HYBRID + ) + + # Create from texts with index management + vectorstore = ArangoVector.from_texts( + texts=["Document content"], + embedding=embeddings, + database=db, + collection_name="my_collection", + overwrite_index=True, # Recreate existing indexes + embedding_field="custom_embedding", + text_field="custom_text", + ids=["custom_id_1"], # Custom document IDs + ) + +Custom Relevance Scoring +~~~~~~~~~~~~~~~~~~~~~~~ + +You can provide custom relevance score normalization functions: + +.. code-block:: python + + def custom_relevance_function(score: float) -> float: + """Custom normalization that inverts and scales scores.""" + return 1.0 / (1.0 + score) + + vectorstore = ArangoVector( + embedding=embeddings, + embedding_dimension=1536, + database=db, + relevance_score_fn=custom_relevance_function + ) + + # Get relevance scores with custom normalization + docs_with_scores = vectorstore.similarity_search_with_score("query", k=3) + for doc, score in docs_with_scores: + print(f"Custom score: {score}") + +Performance Tips +--------------- + +1. **Choose the right distance strategy**: Use COSINE for normalized embeddings, EUCLIDEAN for raw distances +2. 
**Use approximate search**: Enable ``use_approx=True`` for large datasets (requires ArangoDB >= 3.12.4)
+3. **Optimize batch size**: Use larger batch sizes (500-1000) for bulk operations
+4. **Configure centroids**: Increase ``num_centroids`` for larger collections (rule of thumb: sqrt(num_documents))
+5. **Use async operations**: Enable ``use_async_db=True`` for non-blocking operations
+6. **Hybrid search optimization**: For hybrid search, ensure both vector and keyword indexes are created
+7. **Custom field names**: Use descriptive field names for embedding and text fields to avoid conflicts
+8. **Memory management**: Use ``import_bulk`` operations with appropriate batch sizes for large datasets
+
+Example: Complete Workflow
+--------------------------
+
+.. code-block:: python
+
+    from arango import ArangoClient
+    from langchain_openai import OpenAIEmbeddings
+    from langchain_arangodb.vectorstores import ArangoVector, SearchType
+    from langchain_arangodb.vectorstores.utils import DistanceStrategy
+
+    # Setup
+    client = ArangoClient("http://localhost:8529")
+    db = client.db("vectorstore_demo", username="root", password="openSesame")
+    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+
+    # Create vector store with hybrid search support
+    vectorstore = ArangoVector(
+        embedding=embeddings,
+        embedding_dimension=1536,
+        database=db,
+        collection_name="knowledge_base",
+        search_type=SearchType.HYBRID,
+        distance_strategy=DistanceStrategy.COSINE,
+        num_centroids=5,
+        insert_text=True  # Required for hybrid search
+    )
+
+    # Add documents with metadata
+    documents = [
+        "Python is a programming language",
+        "Machine learning uses algorithms",
+        "Databases store structured data",
+        "APIs enable system integration"
+    ]
+
+    metadatas = [
+        {"topic": "programming", "difficulty": "beginner"},
+        {"topic": "ai", "difficulty": "intermediate"},
+        {"topic": "database", "difficulty": "beginner"},
+        {"topic": "integration", "difficulty": "intermediate"}
+    ]
+
+    # Add to vector store
+    doc_ids = vectorstore.add_texts(documents, metadatas=metadatas)
+    print(f"Added {len(doc_ids)} documents")
+
+    # Create indexes for hybrid search
+    vectorstore.create_vector_index()
+    vectorstore.create_keyword_index()
+
+    # Perform searches
+    print("\n--- Vector Search ---")
+    vector_results = vectorstore.similarity_search(
+        "programming languages",
+        k=2,
+        search_type=SearchType.VECTOR
+    )
+    for doc in vector_results:
+        print(f"- {doc.page_content}")
+
+    print("\n--- Hybrid Search ---")
+    hybrid_results = vectorstore.similarity_search_with_score(
+        "data storage algorithms",
+        k=2,
+        search_type=SearchType.HYBRID,
+        vector_weight=1.0,
+        keyword_weight=1.0
+    )
+    for doc, score in hybrid_results:
+        print(f"Score: {score:.3f} - {doc.page_content}")
+
+    print("\n--- MMR Search for Diversity ---")
+    diverse_results = vectorstore.max_marginal_relevance_search(
+        "technology concepts",
+        k=3,
+        lambda_mult=0.7
+    )
+    for doc in diverse_results:
+        print(f"- {doc.page_content}")
+
+API Reference
+-------------
+
+.. automodule:: langchain_arangodb.vectorstores.arangodb_vector
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+.. 
automodule:: langchain_arangodb.vectorstores.utils + :members: + :undoc-members: + :show-inheritance: + +Future Enhancements +------------------- + +Additional Distance Strategies +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Support for additional distance strategies is planned: + +- **MAX_INNER_PRODUCT**: Maximum inner product similarity +- **DOT_PRODUCT**: Dot product similarity +- **JACCARD**: Jaccard similarity coefficient + +Graph-Enhanced Search +~~~~~~~~~~~~~~~~~~~~ + +Integration with ArangoDB's graph capabilities for enhanced semantic search: + +.. code-block:: python + + # Future graph-enhanced search API (planned) + # results = vectorstore.graph_enhanced_search( + # query="your query", + # k=5, + # graph_traversal_depth=2, + # include_connected_nodes=True + # ) + +Multi-Modal Search +~~~~~~~~~~~~~~~~~ + +Support for multi-modal embeddings and cross-modal search capabilities: + +.. code-block:: python + + # Future multi-modal search API (planned) + # results = vectorstore.multi_modal_search( + # query="text query", + # image_query=image_embedding, + # modality_weights={"text": 0.7, "image": 0.3} + # ) \ No newline at end of file From 14aa831254cfe94a5fe1cc0820f9b9f366e3c54b Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli Date: Mon, 9 Jun 2025 08:23:14 -0700 Subject: [PATCH 4/9] Updating quickstart guide to langchain-arangodb --- libs/arangodb/doc/quickstart.rst | 282 +++++++++++++++++++++++++++++-- 1 file changed, 271 insertions(+), 11 deletions(-) diff --git a/libs/arangodb/doc/quickstart.rst b/libs/arangodb/doc/quickstart.rst index 3a83b4c..d695aac 100644 --- a/libs/arangodb/doc/quickstart.rst +++ b/libs/arangodb/doc/quickstart.rst @@ -1,9 +1,13 @@ Quickstart ========== +Get started with LangChain ArangoDB in 5 simple steps: + 1. Set up ArangoDB -2. Set environment variables -3. Instantiate a NetworkX-ArangoDB Graph +2. Set environment variables +3. Instantiate a Vector Store +4. Instantiate an ArangoDB Graph +5. Instantiate an ArangoDB Graph QA Chain 1. Set up ArangoDB ------------------ @@ -28,7 +32,6 @@ A 14-day trial is available upon sign up. A temporary cloud database can be provisioned using the `adb-cloud-connector `_ Python package. - .. code-block:: bash pip install adb-cloud-connector @@ -38,7 +41,6 @@ A temporary cloud database can be provisioned using the `adb-cloud-connector Date: Mon, 9 Jun 2025 08:49:08 -0700 Subject: [PATCH 5/9] Adding chat_message history --- libs/arangodb/doc/chat_message_histories.rst | 528 +++++++++++++++++++ libs/arangodb/doc/index.rst | 1 + 2 files changed, 529 insertions(+) create mode 100644 libs/arangodb/doc/chat_message_histories.rst diff --git a/libs/arangodb/doc/chat_message_histories.rst b/libs/arangodb/doc/chat_message_histories.rst new file mode 100644 index 0000000..f2c5208 --- /dev/null +++ b/libs/arangodb/doc/chat_message_histories.rst @@ -0,0 +1,528 @@ +Chat Message Histories +===================== + +LangChain ArangoDB provides persistent chat message history storage using ArangoDB's document database capabilities. The ``ArangoChatMessageHistory`` class enables you to store, retrieve, and manage conversation history across sessions. 
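+
+Because messages are stored in the database rather than in process memory, a
+conversation can be re-opened later under the same ``session_id``. A minimal
+sketch, assuming the connection settings used in the Quick Start below:
+
+.. code-block:: python
+
+    from arango import ArangoClient
+    from langchain_arangodb.chat_message_histories import ArangoChatMessageHistory
+
+    db = ArangoClient("http://localhost:8529").db(
+        "langchain_demo", username="root", password="openSesame"
+    )
+
+    # Re-attach to an existing conversation, e.g. after a process restart
+    history = ArangoChatMessageHistory(
+        session_id="user_123",
+        db=db,
+        collection_name="chat_sessions",
+    )
+    print(len(history.messages))  # previously stored messages are still available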
+ +Overview +-------- + +The ``ArangoChatMessageHistory`` class integrates with LangChain's chat memory system to provide: + +- **Persistent Storage**: Chat messages are stored permanently in ArangoDB +- **Session Management**: Organize conversations by session ID +- **Automatic Indexing**: Efficient retrieval with automatic session-based indexing +- **Message Ordering**: Messages are retrieved in chronological order +- **Memory Integration**: Works seamlessly with LangChain's memory components + +Quick Start +----------- + +.. code-block:: python + + from arango import ArangoClient + from langchain_arangodb.chat_message_histories import ArangoChatMessageHistory + from langchain_core.messages import HumanMessage, AIMessage + + # Connect to ArangoDB + client = ArangoClient("http://localhost:8529") + db = client.db("langchain_demo", username="root", password="openSesame") + + # Initialize chat history for a specific session + chat_history = ArangoChatMessageHistory( + session_id="user_123", + db=db, + collection_name="chat_sessions" + ) + + # Add messages to the conversation + chat_history.add_message(HumanMessage(content="Hello, how are you?")) + chat_history.add_message(AIMessage(content="I'm doing well, thank you! How can I help you today?")) + + # Retrieve all messages in the session + messages = chat_history.messages + for message in messages: + print(f"{message.type}: {message.content}") + +Configuration +------------- + +Constructor Parameters +~~~~~~~~~~~~~~~~~~~~~ + +.. py:class:: ArangoChatMessageHistory(session_id, db, collection_name="ChatHistory", window=3) + + :param session_id: Unique identifier for the chat session (string or int) + :param db: ArangoDB database instance from python-arango + :param collection_name: Name of the collection to store messages (default: "ChatHistory") + :param window: Message window size for future windowing feature (default: 3) + +The class automatically: + +- Creates the collection if it doesn't exist +- Creates a persistent index on ``session_id`` for efficient queries +- Handles message serialization and deserialization + +Core Methods +------------ + +Adding Messages +~~~~~~~~~~~~~~ + +.. code-block:: python + + from langchain_core.messages import HumanMessage, AIMessage, SystemMessage + + # Add different types of messages + chat_history.add_message(HumanMessage(content="What is machine learning?")) + chat_history.add_message(AIMessage(content="Machine learning is a subset of AI...")) + chat_history.add_message(SystemMessage(content="System: Conversation started")) + + # Messages are automatically timestamped and stored with session context + +Retrieving Messages +~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Get all messages for the current session + all_messages = chat_history.messages + + # Messages are returned in chronological order (most recent first in database, + # but converted to proper order for LangChain) + for i, message in enumerate(all_messages): + print(f"Message {i+1}: [{message.type}] {message.content}") + +Clearing History +~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Clear all messages for the current session + chat_history.clear() + + # Verify the session is cleared + print(f"Messages after clear: {len(chat_history.messages)}") + +Integration with LangChain Memory +--------------------------------- + +Conversation Buffer Memory +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + from langchain.memory import ConversationBufferMemory + from langchain_openai import ChatOpenAI + + # Create chat history + chat_history = ArangoChatMessageHistory( + session_id="conversation_1", + db=db, + collection_name="conversations" + ) + + # Create memory with persistent storage + memory = ConversationBufferMemory( + chat_memory=chat_history, + return_messages=True, + memory_key="chat_history" + ) + + # Use with any LangChain chain + llm = ChatOpenAI(model="gpt-3.5-turbo") + + # The memory will automatically persist conversations + conversation_input = {"input": "Tell me about Python programming"} + +Conversation Summary Memory +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from langchain.memory import ConversationSummaryMemory + + # Create summary memory with persistent storage + summary_memory = ConversationSummaryMemory( + llm=ChatOpenAI(model="gpt-3.5-turbo"), + chat_memory=chat_history, + return_messages=True + ) + + # Conversation summaries are also persisted + +Integration with Chains +----------------------- + +QA Chain with Memory +~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from langchain_arangodb.chains import ArangoGraphQAChain + from langchain_arangodb.graphs import ArangoGraph + + # Set up graph and chat history + graph = ArangoGraph(database=db) + chat_history = ArangoChatMessageHistory( + session_id="qa_session_1", + db=db, + collection_name="qa_conversations" + ) + + # Create memory + memory = ConversationBufferMemory( + chat_memory=chat_history, + return_messages=True + ) + + # Create QA chain with persistent memory + qa_chain = ArangoGraphQAChain.from_llm( + llm=ChatOpenAI(model="gpt-3.5-turbo"), + graph=graph, + memory=memory, + verbose=True + ) + + # Conversations are automatically persisted + response1 = qa_chain.run("What entities are in our knowledge graph?") + response2 = qa_chain.run("Tell me more about the first one you mentioned") + +Conversation Chain +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from langchain.chains import ConversationChain + + # Create a simple conversation chain with persistent memory + conversation = ConversationChain( + llm=ChatOpenAI(model="gpt-3.5-turbo"), + memory=ConversationBufferMemory( + chat_memory=ArangoChatMessageHistory( + session_id="simple_chat", + db=db + ), + return_messages=True + ), + verbose=True + ) + + # Each interaction is persisted + response1 = conversation.predict(input="Hi, I'm interested in learning about databases") + response2 = conversation.predict(input="What makes ArangoDB special?") + response3 = conversation.predict(input="Can you elaborate on the multi-model aspect?") + +Advanced Usage +-------------- + +Multiple Sessions +~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Manage different conversation sessions + user_sessions = {} + + def get_chat_history(user_id: str) -> ArangoChatMessageHistory: + if user_id not in user_sessions: + user_sessions[user_id] = ArangoChatMessageHistory( + session_id=f"user_{user_id}", + db=db, + collection_name="user_conversations" + ) + return user_sessions[user_id] + + # Use for different users + alice_history = get_chat_history("alice") + bob_history = get_chat_history("bob") + + # Each user maintains separate conversation history + alice_history.add_message(HumanMessage(content="Hello from Alice")) + bob_history.add_message(HumanMessage(content="Hello from Bob")) + +Custom Collection Management +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + # Use different collections for different purposes + support_history = ArangoChatMessageHistory( + session_id="support_ticket_123", + db=db, + collection_name="customer_support" + ) + + training_history = ArangoChatMessageHistory( + session_id="training_session_1", + db=db, + collection_name="ai_training_conversations" + ) + + # Each collection can have different retention policies or indexes + +Session Analytics +~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Query conversation statistics directly from ArangoDB + def get_session_stats(db, collection_name: str, session_id: str) -> dict: + query = """ + FOR doc IN @@collection + FILTER doc.session_id == @session_id + COLLECT WITH COUNT INTO length + RETURN { + message_count: length, + session_id: @session_id + } + """ + + bind_vars = { + "@collection": collection_name, + "session_id": session_id + } + + result = list(db.aql.execute(query, bind_vars=bind_vars)) + return result[0] if result else {"message_count": 0, "session_id": session_id} + + # Get conversation statistics + stats = get_session_stats(db, "chat_sessions", "user_123") + print(f"Session user_123 has {stats['message_count']} messages") + +Data Structure +-------------- + +Storage Format +~~~~~~~~~~~~~ + +Messages are stored in ArangoDB with the following structure: + +.. code-block:: json + + { + "_key": "auto_generated_key", + "_id": "collection_name/auto_generated_key", + "_rev": "revision_id", + "session_id": "user_123", + "role": "human", + "content": "Hello, how are you?", + "time": "2024-01-01T12:00:00Z" + } + +**Field Descriptions:** + +- ``session_id``: Groups messages by conversation session +- ``role``: Message type (human, ai, system, etc.) +- ``content``: The actual message content +- ``time``: Timestamp for message ordering (automatically added by ArangoDB) + +Indexing Strategy +~~~~~~~~~~~~~~~ + +The class automatically creates a persistent index on ``session_id`` to ensure efficient retrieval: + +.. code-block:: aql + + // Automatic index creation + CREATE INDEX session_idx ON ChatHistory (session_id) OPTIONS {type: "persistent", unique: false} + +This index enables fast filtering of messages by session while maintaining good performance even with large message volumes. + +Best Practices +-------------- + +Session ID Management +~~~~~~~~~~~~~~~~~~~ + +1. **Use descriptive session IDs**: Include user context or conversation type +2. **Avoid special characters**: Stick to alphanumeric characters and underscores +3. **Include timestamps for analytics**: Consider formats like ``user_123_2024_01_01`` + +.. code-block:: python + + # Good session ID patterns + session_id = f"user_{user_id}_{datetime.now().strftime('%Y_%m_%d')}" + session_id = f"support_ticket_{ticket_id}" + session_id = f"training_{model_version}_{session_counter}" + +Memory Management +~~~~~~~~~~~~~~~ + +1. **Choose appropriate memory types** based on conversation length +2. **Implement session cleanup** for privacy or storage management +3. **Monitor collection size** and implement archiving if needed + +.. code-block:: python + + # Cleanup old sessions + def cleanup_old_sessions(db, collection_name: str, days_old: int = 30): + cutoff_date = datetime.now() - timedelta(days=days_old) + + query = """ + FOR doc IN @@collection + FILTER doc.time < @cutoff_date + REMOVE doc IN @@collection + """ + + bind_vars = { + "@collection": collection_name, + "cutoff_date": cutoff_date.isoformat() + } + + db.aql.execute(query, bind_vars=bind_vars) + +Error Handling +~~~~~~~~~~~~~ + +.. 
code-block:: python
+
+    from arango.exceptions import ArangoError
+    from langchain_core.messages import HumanMessage
+
+    try:
+        chat_history = ArangoChatMessageHistory(
+            session_id="test_session",
+            db=db,
+            collection_name="chat_test"
+        )
+
+        chat_history.add_message(HumanMessage(content="Test message"))
+        messages = chat_history.messages
+
+    except ValueError as e:
+        print(f"Invalid session ID: {e}")
+    except ArangoError as e:
+        print(f"Database error: {e}")
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+
+Performance Considerations
+-------------------------
+
+1. **Session ID indexing**: Automatic indexing ensures O(log n) lookup performance
+2. **Message ordering**: Uses ArangoDB's built-in sorting capabilities
+3. **Batch operations**: Consider bulk operations for high-volume scenarios
+4. **Collection sizing**: Monitor and archive old conversations as needed
+
+Example: Complete Chat Application
+---------------------------------
+
+.. code-block:: python
+
+    from arango import ArangoClient
+    from langchain_openai import ChatOpenAI
+    from langchain.chains import ConversationChain
+    from langchain.memory import ConversationBufferMemory
+    from langchain_arangodb.chat_message_histories import ArangoChatMessageHistory
+
+    class ChatApplication:
+        def __init__(self, db_url: str, username: str, password: str):
+            # Initialize ArangoDB connection
+            self.client = ArangoClient(db_url)
+            self.db = self.client.db("chat_app", username=username, password=password)
+
+            # Initialize LLM
+            self.llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)
+
+            # Session storage
+            self.sessions = {}
+
+        def get_conversation(self, session_id: str) -> ConversationChain:
+            """Get or create a conversation for a session."""
+            if session_id not in self.sessions:
+                # Create persistent chat history
+                chat_history = ArangoChatMessageHistory(
+                    session_id=session_id,
+                    db=self.db,
+                    collection_name="app_conversations"
+                )
+
+                # Create memory with chat history
+                memory = ConversationBufferMemory(
+                    chat_memory=chat_history,
+                    return_messages=True
+                )
+
+                # Create conversation chain
+                conversation = ConversationChain(
+                    llm=self.llm,
+                    memory=memory,
+                    verbose=True
+                )
+
+                self.sessions[session_id] = conversation
+
+            return self.sessions[session_id]
+
+        def chat(self, session_id: str, message: str) -> str:
+            """Send a message and get a response."""
+            conversation = self.get_conversation(session_id)
+            return conversation.predict(input=message)
+
+        def get_history(self, session_id: str) -> list:
+            """Get conversation history for a session."""
+            chat_history = ArangoChatMessageHistory(
+                session_id=session_id,
+                db=self.db,
+                collection_name="app_conversations"
+            )
+            return chat_history.messages
+
+        def clear_session(self, session_id: str):
+            """Clear a conversation session."""
+            if session_id in self.sessions:
+                del self.sessions[session_id]
+
+            chat_history = ArangoChatMessageHistory(
+                session_id=session_id,
+                db=self.db,
+                collection_name="app_conversations"
+            )
+            chat_history.clear()
+
+    # Usage example
+    app = ChatApplication("http://localhost:8529", "root", "openSesame")
+
+    # Start conversations with different users
+    response1 = app.chat("user_alice", "Hello, I need help with Python programming")
+    response2 = app.chat("user_bob", "What's the weather like?")
+    response3 = app.chat("user_alice", "Can you explain list comprehensions?")
+
+    # Get conversation history
+    alice_history = app.get_history("user_alice")
+    print(f"Alice has {len(alice_history)} messages in her conversation")
+
+    # Clear a session when done
+    app.clear_session("user_bob")
+
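+For high-volume scenarios (see "Batch operations" above), past messages can be
+seeded in bulk through python-arango rather than one call per message. A
+minimal sketch, assuming the target collection already exists and mirroring
+the storage format described above:
+
+.. code-block:: python
+
+    from datetime import datetime, timezone
+
+    def bulk_import_messages(db, collection_name: str, session_id: str, texts: list) -> None:
+        now = datetime.now(timezone.utc).isoformat()
+        docs = [
+            {"session_id": session_id, "role": "human", "content": text, "time": now}
+            for text in texts
+        ]
+        # One server round-trip for the whole batch instead of one insert per message
+        db.collection(collection_name).insert_many(docs)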
+API Reference +------------- + +.. automodule:: langchain_arangodb.chat_message_histories.arangodb + :members: + :undoc-members: + :show-inheritance: + +Troubleshooting +--------------- + +Common Issues +~~~~~~~~~~~~ + +**ValueError: Please ensure that the session_id parameter is provided** + - Ensure session_id is not None, empty string, or 0 + - Use descriptive, non-empty session identifiers + +**Database connection errors** + - Verify ArangoDB is running and accessible + - Check connection credentials and database permissions + - Ensure the database exists or the user has create permissions + +**Index creation failures** + - Verify the user has index creation permissions + - Check if the collection already has conflicting indexes + - Ensure adequate disk space for index creation + +**Message retrieval issues** + - Verify session_id matches exactly (case-sensitive) + - Check if messages exist in the collection using ArangoDB web interface + - Ensure proper message format in the database diff --git a/libs/arangodb/doc/index.rst b/libs/arangodb/doc/index.rst index 4170e35..06f2799 100644 --- a/libs/arangodb/doc/index.rst +++ b/libs/arangodb/doc/index.rst @@ -96,6 +96,7 @@ Documentation Contents quickstart vectorstores + chat_message_histories .. toctree:: :maxdepth: 2 From 99e3be72c15aa13f39c89c760776eb0254883310 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 9 Jun 2025 13:02:34 -0400 Subject: [PATCH 6/9] bump: docs --- {docs => libs/arangodb/doc}/api_reference.rst | 10 +++++----- libs/arangodb/doc/conf.py | 3 ++- libs/arangodb/doc/index.rst | 8 +++++++- 3 files changed, 14 insertions(+), 7 deletions(-) rename {docs => libs/arangodb/doc}/api_reference.rst (63%) diff --git a/docs/api_reference.rst b/libs/arangodb/doc/api_reference.rst similarity index 63% rename from docs/api_reference.rst rename to libs/arangodb/doc/api_reference.rst index e33cd9e..063d103 100644 --- a/docs/api_reference.rst +++ b/libs/arangodb/doc/api_reference.rst @@ -6,7 +6,7 @@ This section provides detailed API documentation for all modules in LangChain Ar Vector Stores ------------ -.. automodule:: langchain_arangodb.vectorstores +.. automodule:: langchain_arangodb.vectorstores.arangodb_vector :members: :undoc-members: :show-inheritance: @@ -14,7 +14,7 @@ Vector Stores Chat Message Histories -------------------- -.. automodule:: langchain_arangodb.chat_message_histories +.. automodule:: langchain_arangodb.chat_message_histories.arangodb :members: :undoc-members: :show-inheritance: @@ -22,7 +22,7 @@ Chat Message Histories Graphs ------ -.. automodule:: langchain_arangodb.graphs +.. automodule:: langchain_arangodb.graphs.arangodb_graph :members: :undoc-members: :show-inheritance: @@ -30,7 +30,7 @@ Graphs Chains ------ -.. automodule:: langchain_arangodb.chains +.. automodule:: langchain_arangodb.chains.graph_qa.arangodb :members: :undoc-members: :show-inheritance: @@ -38,7 +38,7 @@ Chains Query Constructors ---------------- -.. automodule:: langchain_arangodb.query_constructors +.. 
automodule:: langchain_arangodb.query_constructors.arangodb :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/libs/arangodb/doc/conf.py b/libs/arangodb/doc/conf.py index e45051f..a2849d3 100644 --- a/libs/arangodb/doc/conf.py +++ b/libs/arangodb/doc/conf.py @@ -33,7 +33,8 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = 'sphinx_rtd_theme' -html_static_path = ['_static'] +html_static_path = [] # ['_static'] autodoc_member_order = "bysource" autodoc_inherit_docstrings = True autosummary_generate = True +master_doc = "index" diff --git a/libs/arangodb/doc/index.rst b/libs/arangodb/doc/index.rst index 06f2799..b134c58 100644 --- a/libs/arangodb/doc/index.rst +++ b/libs/arangodb/doc/index.rst @@ -102,4 +102,10 @@ Documentation Contents :maxdepth: 2 :caption: Advanced: - mydirectory/index \ No newline at end of file + mydirectory/index + +.. toctree:: + :maxdepth: 2 + :caption: API Reference: + + api_reference \ No newline at end of file From 6f839c6e5d9a01e816eb8830a102bed85b61f0e7 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 11 Jun 2025 16:10:38 -0400 Subject: [PATCH 7/9] fix: pyproject --- libs/arangodb/poetry.lock | 347 ++++++++++++++++++++++++++++++++++- libs/arangodb/pyproject.toml | 5 +- 2 files changed, 346 insertions(+), 6 deletions(-) diff --git a/libs/arangodb/poetry.lock b/libs/arangodb/poetry.lock index 0f3e0cd..20d17b8 100644 --- a/libs/arangodb/poetry.lock +++ b/libs/arangodb/poetry.lock @@ -1,5 +1,17 @@ # This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +[[package]] +name = "alabaster" +version = "0.7.16" +description = "A light, configurable Sphinx theme" +optional = false +python-versions = ">=3.9" +groups = ["test"] +files = [ + {file = "alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92"}, + {file = "alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65"}, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -48,6 +60,21 @@ files = [ {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] +[[package]] +name = "babel" +version = "2.17.0" +description = "Internationalization utilities" +optional = false +python-versions = ">=3.8" +groups = ["test"] +files = [ + {file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"}, + {file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"}, +] + +[package.extras] +dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""] + [[package]] name = "certifi" version = "2025.1.31" @@ -409,6 +436,18 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli ; python_full_version <= \"3.11.0a6\""] +[[package]] +name = "docutils" +version = "0.20.1" +description = "Docutils -- Python Documentation Utilities" +optional = false +python-versions = ">=3.7" +groups = ["test"] +files = [ + {file = "docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6"}, + {file = "docutils-0.20.1.tar.gz", hash = 
"sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -587,17 +626,30 @@ files = [ [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] +[[package]] +name = "imagesize" +version = "1.4.1" +description = "Getting image size from png/jpeg/jpeg2000/gif file" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["test"] +files = [ + {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"}, + {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, +] + [[package]] name = "importlib-metadata" version = "8.6.1" description = "Read metadata from Python packages" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "test"] files = [ {file = "importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e"}, {file = "importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580"}, ] +markers = {test = "python_version < \"3.10\""} [package.dependencies] zipp = ">=3.20" @@ -623,6 +675,24 @@ files = [ {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] +[[package]] +name = "jinja2" +version = "3.1.6" +description = "A very fast and expressive template engine." +optional = false +python-versions = ">=3.7" +groups = ["test"] +files = [ + {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"}, + {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + [[package]] name = "jsonpatch" version = "1.33" @@ -764,6 +834,77 @@ openai-agents = ["openai-agents (>=0.0.3,<0.1)"] otel = ["opentelemetry-api (>=1.30.0,<2.0.0)", "opentelemetry-exporter-otlp-proto-http (>=1.30.0,<2.0.0)", "opentelemetry-sdk (>=1.30.0,<2.0.0)"] pytest = ["pytest (>=7.0.0)", "rich (>=13.9.4,<14.0.0)"] +[[package]] +name = "markupsafe" +version = "3.0.2" +description = "Safely add untrusted strings to HTML/XML markup." 
+optional = false +python-versions = ">=3.9" +groups = ["test"] +files = [ + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, + {file = 
"MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d"}, + {file = 
"MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, + {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, +] + [[package]] name = "mypy" version = "1.15.0" @@ -1130,6 +1271,21 @@ files = [ [package.dependencies] typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" +[[package]] +name = "pygments" +version = "2.19.1" +description = "Pygments is a syntax highlighting package written in Python." 
+optional = false +python-versions = ">=3.8" +groups = ["test"] +files = [ + {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, + {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] + [[package]] name = "pyjwt" version = "2.10.1" @@ -1425,6 +1581,190 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "snowballstemmer" +version = "3.0.1" +description = "This package provides 32 stemmers for 30 languages generated from Snowball algorithms." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*" +groups = ["test"] +files = [ + {file = "snowballstemmer-3.0.1-py3-none-any.whl", hash = "sha256:6cd7b3897da8d6c9ffb968a6781fa6532dce9c3618a4b127d920dab764a19064"}, + {file = "snowballstemmer-3.0.1.tar.gz", hash = "sha256:6d5eeeec8e9f84d4d56b847692bacf79bc2c8e90c7f80ca4444ff8b6f2e52895"}, +] + +[[package]] +name = "sphinx" +version = "7.4.7" +description = "Python documentation generator" +optional = false +python-versions = ">=3.9" +groups = ["test"] +files = [ + {file = "sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239"}, + {file = "sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe"}, +] + +[package.dependencies] +alabaster = ">=0.7.14,<0.8.0" +babel = ">=2.13" +colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""} +docutils = ">=0.20,<0.22" +imagesize = ">=1.3" +importlib-metadata = {version = ">=6.0", markers = "python_version < \"3.10\""} +Jinja2 = ">=3.1" +packaging = ">=23.0" +Pygments = ">=2.17" +requests = ">=2.30.0" +snowballstemmer = ">=2.2" +sphinxcontrib-applehelp = "*" +sphinxcontrib-devhelp = "*" +sphinxcontrib-htmlhelp = ">=2.0.0" +sphinxcontrib-jsmath = "*" +sphinxcontrib-qthelp = "*" +sphinxcontrib-serializinghtml = ">=1.1.9" +tomli = {version = ">=2", markers = "python_version < \"3.11\""} + +[package.extras] +docs = ["sphinxcontrib-websupport"] +lint = ["flake8 (>=6.0)", "importlib-metadata (>=6.0)", "mypy (==1.10.1)", "pytest (>=6.0)", "ruff (==0.5.2)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-docutils (==0.21.0.20240711)", "types-requests (>=2.30.0)"] +test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools (>=70.0)", "typing_extensions (>=4.9)"] + +[[package]] +name = "sphinx-rtd-theme" +version = "2.0.0" +description = "Read the Docs theme for Sphinx" +optional = false +python-versions = ">=3.6" +groups = ["test"] +files = [ + {file = "sphinx_rtd_theme-2.0.0-py2.py3-none-any.whl", hash = "sha256:ec93d0856dc280cf3aee9a4c9807c60e027c7f7b461b77aeffed682e68f0e586"}, + {file = "sphinx_rtd_theme-2.0.0.tar.gz", hash = "sha256:bd5d7b80622406762073a04ef8fadc5f9151261563d47027de09910ce03afe6b"}, +] + +[package.dependencies] +docutils = "<0.21" +sphinx = ">=5,<8" +sphinxcontrib-jquery = ">=4,<5" + +[package.extras] +dev = ["bump2version", "sphinxcontrib-httpdomain", "transifex-client", "wheel"] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "2.0.0" +description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" +optional = false +python-versions = ">=3.9" +groups = ["test"] +files = [ + {file = "sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = 
"sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5"}, + {file = "sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "2.0.0" +description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp documents" +optional = false +python-versions = ">=3.9" +groups = ["test"] +files = [ + {file = "sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2"}, + {file = "sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.1.0" +description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" +optional = false +python-versions = ">=3.9" +groups = ["test"] +files = [ + {file = "sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8"}, + {file = "sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["html5lib", "pytest"] + +[[package]] +name = "sphinxcontrib-jquery" +version = "4.1" +description = "Extension to include jQuery on newer Sphinx releases" +optional = false +python-versions = ">=2.7" +groups = ["test"] +files = [ + {file = "sphinxcontrib-jquery-4.1.tar.gz", hash = "sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a"}, + {file = "sphinxcontrib_jquery-4.1-py2.py3-none-any.whl", hash = "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae"}, +] + +[package.dependencies] +Sphinx = ">=1.8" + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +description = "A sphinx extension which renders display math in HTML via JavaScript" +optional = false +python-versions = ">=3.5" +groups = ["test"] +files = [ + {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, + {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, +] + +[package.extras] +test = ["flake8", "mypy", "pytest"] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "2.0.0" +description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp documents" +optional = false +python-versions = ">=3.9" +groups = ["test"] +files = [ + {file = "sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb"}, + {file = "sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["defusedxml (>=0.7.1)", "pytest"] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "2.0.0" +description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)" 
+optional = false +python-versions = ">=3.9" +groups = ["test"] +files = [ + {file = "sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331"}, + {file = "sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + [[package]] name = "sqlalchemy" version = "2.0.40" @@ -1698,11 +2038,12 @@ version = "3.21.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "test"] files = [ {file = "zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"}, {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"}, ] +markers = {test = "python_version < \"3.10\""} [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] @@ -1828,4 +2169,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = ">=3.9,<4.0" -content-hash = "33a9211b419562aead7c2d717db154f768753d06ea2048c3947d23cdc7dc0e87" +content-hash = "eb2055bbaaa7c14f8b98adfc3541d5316d9c47771c516cacca1f1a39583e9a0a" diff --git a/libs/arangodb/pyproject.toml b/libs/arangodb/pyproject.toml index 8bde512..ec537d1 100644 --- a/libs/arangodb/pyproject.toml +++ b/libs/arangodb/pyproject.toml @@ -29,6 +29,8 @@ pytest-socket = "^0.7.0" pytest-watcher = "^0.3.4" langchain-core = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/core"} pytest-cov = "^6.0.0" +sphinx = "^7.0.0" +sphinx-rtd-theme = "^2.0.0" [tool.poetry.group.codespell] optional = true @@ -93,6 +95,3 @@ markers = [ "compile: mark placeholder test used to compile integration tests without running them", ] asyncio_mode = "auto" - -[tool.poetry.extras] -docs = ["sphinx", "sphinx-rtd-theme", "myst-parser"] From 55a4eba1ef1a007fc10c3d0fe8a5ead99c549b25 Mon Sep 17 00:00:00 2001 From: SLasyaN Date: Wed, 11 Jun 2025 15:18:10 -0700 Subject: [PATCH 8/9] Docs lasya (#10) * grapgh integration and unit tests * tests for graph(integration and unit- 98% each) and graph_qa(integration and unit- 100% each) * fix: lint PT1 * lint tests * remove: AQL_WRITE_OPERATIONS * lint changes * lint tests * lint tests * lint tests * lint tests * fix: lint * Squashed commit of the following: commit f5e7cc1e65373cca6f60b8370e6c57adb0991781 Author: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Mon Jun 9 07:39:38 2025 -0700 Coverage for Hybrid search added (#8) * Coverage for Hybrid search added * Fix lint commit a25e8715bfce633dbeaae1ef0080a962f2754357 Merge: 8d3cbd4 d00bd64 Author: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Mon Jun 2 09:19:58 2025 -0700 Merge pull request #3 from arangoml/chat_vector_tests unit and integration tests for chat message histories and vector stores commit 8d3cbd49ecd50703f015f13448a15a1d2ff1f28f Author: Anthony Mahanna Date: Fri May 30 13:30:22 2025 -0400 bump: version commit cae31bf5a1349cf46be182a689a87e72b5a9ccb4 Author: Anthony Mahanna Date: Fri May 30 13:30:17 2025 -0400 fix: _release commit 7d857512f3aa93363506218e65ad15b351d3ca60 Author: Anthony Mahanna Date: Fri May 30 13:11:10 2025 -0400 bump: version commit d00bd64feba336b5d88086d7e72a1406f74d3658 Merge: 
70e6cfd 994b540 Author: Anthony Mahanna Date: Fri May 30 13:07:54 2025 -0400 Merge branch 'main' into chat_vector_tests commit 70e6cfd6cc512dcea29cf1d2c8864f81f6b9347e Author: Anthony Mahanna Date: Fri May 30 13:04:37 2025 -0400 fix: ci commit b7b53f230c97d1bcd98b149e6f2d0357f04ec8d7 Merge: 24a28ac 61950e2 Author: Anthony Mahanna Date: Fri May 30 11:58:54 2025 -0400 Merge branch 'tests' into chat_vector_tests commit 24a28ac3398ddfaae228a75fc2d75ca514aa7381 Author: Ajay Kallepalli Date: Wed May 28 08:07:17 2025 -0700 ruff format locally failing CI/CD commit 65aace7421bd2436941c5d02fff7bc9873735cd2 Author: Ajay Kallepalli Date: Wed May 28 07:55:39 2025 -0700 Updating assert statements commit 5906fbfedf770b882bd048af91b59ebb4267ef45 Author: Ajay Kallepalli Date: Wed May 28 07:51:53 2025 -0700 Updating assert statements commit 9e0031ade5e6a6184f1cc5250ba96274538dae22 Author: Ajay Kallepalli Date: Wed May 21 10:18:05 2025 -0700 make format py312 commit 8ceac2db818af3c85c9bb1ff5744c48f09ad7f52 Author: Ajay Kallepalli Date: Wed May 21 09:58:41 2025 -0700 Updating assert statements to match latest ruff requirements python 12 commit bbbcecc24b84c02be6c63d7ef1dd79a2a879b6ed Author: Ajay Kallepalli Date: Wed May 21 09:45:32 2025 -0700 Updating assert statements to match latest ruff requirements commit cde5615f95214a74845baa80cc7fe055aefdf7e4 Merge: 5034e4a 9344bf6 Author: Ajay Kallepalli Date: Wed May 21 09:36:41 2025 -0700 Merge branch 'tests' into chat_vector_tests commit 5034e4ad60e30ef5ef6108aaf37fd83f9aaa080a Author: Ajay Kallepalli Date: Wed May 21 08:38:23 2025 -0700 No lint errors, all tests pass commit 9c35b8ff29428c0a7581b8c5cfe5024d7b61ee74 Author: Ajay Kallepalli Date: Wed May 21 08:37:40 2025 -0700 No lint errors commit ccad356c2b21997780da5039565ee613c0f5a125 Author: Ajay Kallepalli Date: Sun May 18 20:21:57 2025 -0700 Fixing linting and formatting errors commit 581808f59abb4278121909751f1426c596ffffd4 Author: Ajay Kallepalli Date: Sun May 18 20:01:12 2025 -0700 Testing from existing collection, all major coverage complete commit 4025fb72006af54d792a669a11ddea8622db3dfe Author: Ajay Kallepalli Date: Sun May 18 18:23:29 2025 -0700 Adding unit tests and integration tests for get by id commit 895a97af20f87354cc1bb68d6d48df6637d7555f Author: Ajay Kallepalli Date: Sun May 18 17:43:07 2025 -0700 All integration test and unit test passing, coverage 73% and 66% commit 5679003017b5bec5957979ffe922fc53b5ff74e0 Merge: b95bb04 b361cd2 Author: Ajay Kallepalli Date: Wed May 14 10:50:48 2025 -0700 Merge branch 'tests' into chat_vector_tests commit b95bb047a856df61e3c3ba40b01f5d0a3b242562 Author: Ajay Kallepalli Date: Wed May 14 09:03:20 2025 -0700 No changes to arangodb_vector commit 11a08fe8b674b1c65015b56160ef1a0c7a1a09d0 Author: Ajay Kallepalli Date: Wed May 14 08:41:58 2025 -0700 minimal changes to arangodb_vector.py commit 4560c9ccc7f32cf7534b61817cf92cb6c6fef4cb Author: Ajay Kallepalli Date: Wed May 14 08:12:52 2025 -0700 All 18 tests pass commit 463ea0859cb6e4fe4a29e6cb89d7a4aa3ecebb62 Author: Ajay Kallepalli Date: Wed May 7 08:45:41 2025 -0700 Adding chat history unit tests commit f7fa9d9a2dba199ea99421ee84e3b5315cc0bb70 Author: Ajay Kallepalli Date: Wed May 7 07:47:37 2025 -0700 integration_tests_chat_history all passing * Revert "Squashed commit of the following:" This reverts commit b6f7716cacc3850d4dfb12cdd393a582072a5477. 
* comments addressed except .DS store file removal

* update

* docs update

* changed docs

* Warning reolution

* format

* Delete .DS_Store

* cleanup

---------

Co-authored-by: lasyasn
Co-authored-by: Anthony Mahanna
---
 libs/arangodb/doc/api_reference.rst          |   1 +
 libs/arangodb/doc/arangoqachain.rst          | 252 +++++++++
 libs/arangodb/doc/chat_message_histories.rst |  16 +-
 libs/arangodb/doc/conf.py                    |  14 +-
 libs/arangodb/doc/graph.rst                  | 524 ++++++++++++++++++
 libs/arangodb/doc/index.rst                  |  11 +-
 libs/arangodb/doc/mydirectory/index.rst      |   4 -
 .../chains/graph_qa/arangodb.py              |  57 +-
 .../graphs/arangodb_graph.py                 | 483 ++++++++++++----
 9 files changed, 1199 insertions(+), 163 deletions(-)
 create mode 100644 libs/arangodb/doc/arangoqachain.rst
 create mode 100644 libs/arangodb/doc/graph.rst
 delete mode 100644 libs/arangodb/doc/mydirectory/index.rst

diff --git a/libs/arangodb/doc/api_reference.rst b/libs/arangodb/doc/api_reference.rst
index 063d103..a02b288 100644
--- a/libs/arangodb/doc/api_reference.rst
+++ b/libs/arangodb/doc/api_reference.rst
@@ -27,6 +27,7 @@ Graphs
    :undoc-members:
    :show-inheritance:
 
+
 Chains
 ------
 
diff --git a/libs/arangodb/doc/arangoqachain.rst b/libs/arangodb/doc/arangoqachain.rst
new file mode 100644
index 0000000..ac0da13
--- /dev/null
+++ b/libs/arangodb/doc/arangoqachain.rst
@@ -0,0 +1,252 @@
+ArangoGraphQAChain
+========================
+
+This guide demonstrates how to use the ArangoGraphQAChain for question-answering against an ArangoDB graph database.
+
+Basic Setup
+-----------
+
+First, let's set up the necessary imports and create a basic instance:
+
+.. code-block:: python
+
+    from langchain_arangodb.chains.graph_qa.arangodb import ArangoGraphQAChain
+    from langchain_arangodb.graphs.arangodb_graph import ArangoGraph
+    from langchain_openai import ChatOpenAI
+    from arango import ArangoClient
+
+    # Initialize ArangoDB connection
+    client = ArangoClient()
+    db = client.db("your_database", username="user", password="pass")
+
+    # Create graph instance
+    graph = ArangoGraph(db)
+
+    # Initialize LLM
+    llm = ChatOpenAI(temperature=0)
+
+    # Create the chain
+    chain = ArangoGraphQAChain.from_llm(
+        llm=llm,
+        graph=graph,
+        allow_dangerous_requests=True  # Be cautious with this setting
+    )
+
+Individual Method Usage
+-----------------------
+
+1. Basic Query Execution
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The simplest way to use the chain is with a direct query:
+
+.. code-block:: python
+
+    response = chain.invoke({"query": "Who starred in Pulp Fiction?"})
+    print(response["result"])
+
+2. Using Custom Input/Output Keys
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can customize the input and output keys:
+
+.. code-block:: python
+
+    chain = ArangoGraphQAChain.from_llm(
+        llm=llm,
+        graph=graph,
+        allow_dangerous_requests=True,
+        input_key="question",
+        output_key="answer"
+    )
+
+    response = chain.invoke({"question": "Who directed Inception?"})
+    print(response["answer"])
+
+3. Limiting Results
+~~~~~~~~~~~~~~~~~~~
+
+Control the number of results returned:
+
+.. code-block:: python
+
+    chain = ArangoGraphQAChain.from_llm(
+        llm=llm,
+        graph=graph,
+        allow_dangerous_requests=True,
+        top_k=5,  # Return only top 5 results
+        output_list_limit=16,  # Limit list length in response
+        output_string_limit=128  # Limit string length in response
+    )
+
+4. Query Explanation Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Get query explanation without execution:
+
+.. 
code-block:: python
+
+    chain = ArangoGraphQAChain.from_llm(
+        llm=llm,
+        graph=graph,
+        allow_dangerous_requests=True,
+        execute_aql_query=False  # Only explain, don't execute
+    )
+
+    explanation = chain.invoke({"query": "Find all movies released after 2020"})
+    print(explanation["aql_result"])  # Contains query plan
+
+5. Read-Only Mode
+~~~~~~~~~~~~~~~~~
+
+Enforce read-only operations:
+
+.. code-block:: python
+
+    chain = ArangoGraphQAChain.from_llm(
+        llm=llm,
+        graph=graph,
+        allow_dangerous_requests=True,
+        force_read_only_query=True  # Prevents write operations
+    )
+
+6. Custom AQL Examples
+~~~~~~~~~~~~~~~~~~~~~~
+
+Provide example AQL queries for better generation:
+
+.. code-block:: python
+
+    example_queries = """
+    FOR m IN Movies
+        FILTER m.year > 2020
+        RETURN m.title
+
+    FOR a IN Actors
+        FILTER a.awards > 0
+        RETURN a.name
+    """
+
+    chain = ArangoGraphQAChain.from_llm(
+        llm=llm,
+        graph=graph,
+        allow_dangerous_requests=True,
+        aql_examples=example_queries
+    )
+
+7. Detailed Output
+~~~~~~~~~~~~~~~~~~
+
+Get more detailed output including AQL query and results:
+
+.. code-block:: python
+
+    chain = ArangoGraphQAChain.from_llm(
+        llm=llm,
+        graph=graph,
+        allow_dangerous_requests=True,
+        return_aql_query=True,
+        return_aql_result=True
+    )
+
+    response = chain.invoke({"query": "Who acted in The Matrix?"})
+    print("Query:", response["aql_query"])
+    print("Raw Results:", response["aql_result"])
+    print("Final Answer:", response["result"])
+
+Complete Workflow Example
+-------------------------
+
+Here's a complete workflow showing how to use multiple features together:
+
+.. code-block:: python
+
+    from langchain_arangodb.chains.graph_qa.arangodb import ArangoGraphQAChain
+    from langchain_arangodb.graphs.arangodb_graph import ArangoGraph
+    from langchain_openai import ChatOpenAI
+    from arango import ArangoClient
+
+    # 1. Setup Database Connection
+    client = ArangoClient()
+    db = client.db("movies_db", username="user", password="pass")
+
+    # 2. Initialize Graph
+    graph = ArangoGraph(db)
+
+    # 3. Create Collections and Sample Data
+    if not db.has_collection("Movies"):
+        movies = db.create_collection("Movies")
+        movies.insert({"_key": "matrix", "title": "The Matrix", "year": 1999})
+
+    if not db.has_collection("Actors"):
+        actors = db.create_collection("Actors")
+        actors.insert({"_key": "keanu", "name": "Keanu Reeves"})
+
+    if not db.has_collection("ActedIn"):
+        acted_in = db.create_collection("ActedIn", edge=True)
+        acted_in.insert({
+            "_from": "Actors/keanu",
+            "_to": "Movies/matrix"
+        })
+
+    # 4. Refresh Schema
+    graph.refresh_schema()
+
+    # 5. Initialize Chain with Advanced Features
+    llm = ChatOpenAI(temperature=0)
+    chain = ArangoGraphQAChain.from_llm(
+        llm=llm,
+        graph=graph,
+        allow_dangerous_requests=True,
+        top_k=5,
+        force_read_only_query=True,
+        return_aql_query=True,
+        return_aql_result=True,
+        output_list_limit=20,
+        output_string_limit=200
+    )
+
+    # 6. Run Multiple Queries
+    queries = [
+        "Who acted in The Matrix?",
+        "What movies were released in 1999?",
+        "List all actors in the database"
+    ]
+
+    for query in queries:
+        print(f"\nProcessing query: {query}")
+        response = chain.invoke({"query": query})
+
+        print("AQL Query:", response["aql_query"])
+        print("Raw Results:", response["aql_result"])
+        print("Final Answer:", response["result"])
+        print("-" * 50)
+
+Security Considerations
+-----------------------
+
+1. Always use appropriate database credentials with minimal required permissions
+2. Be cautious with ``allow_dangerous_requests=True``
+3. 
Use ``force_read_only_query=True`` when only read operations are needed +4. Monitor and log query execution in production environments +5. Regularly review and update AQL examples to prevent injection risks + +Error Handling +-------------- + +The chain includes built-in error handling: + +.. code-block:: python + + try: + response = chain.invoke({"query": "Find all movies"}) + except ValueError as e: + if "Maximum amount of AQL Query Generation attempts" in str(e): + print("Failed to generate valid AQL after multiple attempts") + elif "Write operations are not allowed" in str(e): + print("Attempted write operation in read-only mode") + else: + print(f"Other error: {e}") + +The chain will automatically attempt to fix invalid AQL queries up to +``max_aql_generation_attempts`` times (default: 3) before raising an error. \ No newline at end of file diff --git a/libs/arangodb/doc/chat_message_histories.rst b/libs/arangodb/doc/chat_message_histories.rst index f2c5208..b391966 100644 --- a/libs/arangodb/doc/chat_message_histories.rst +++ b/libs/arangodb/doc/chat_message_histories.rst @@ -317,11 +317,11 @@ Messages are stored in ArangoDB with the following structure: - ``time``: Timestamp for message ordering (automatically added by ArangoDB) Indexing Strategy -~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~ The class automatically creates a persistent index on ``session_id`` to ensure efficient retrieval: -.. code-block:: aql +.. code-block:: python // Automatic index creation CREATE INDEX session_idx ON ChatHistory (session_id) OPTIONS {type: "persistent", unique: false} @@ -332,7 +332,7 @@ Best Practices -------------- Session ID Management -~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~ 1. **Use descriptive session IDs**: Include user context or conversation type 2. **Avoid special characters**: Stick to alphanumeric characters and underscores @@ -346,7 +346,7 @@ Session ID Management session_id = f"training_{model_version}_{session_counter}" Memory Management -~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~ 1. **Choose appropriate memory types** based on conversation length 2. **Implement session cleanup** for privacy or storage management @@ -372,7 +372,7 @@ Memory Management db.aql.execute(query, bind_vars=bind_vars) Error Handling -~~~~~~~~~~~~~ +~~~~~~~~~~~~~~ .. code-block:: python @@ -396,7 +396,7 @@ Error Handling print(f"Unexpected error: {e}") Performance Considerations -------------------------- +-------------------------- 1. **Session ID indexing**: Automatic indexing ensures O(log n) lookup performance 2. **Message ordering**: Uses ArangoDB's built-in sorting capabilities @@ -404,7 +404,7 @@ Performance Considerations 4. **Collection sizing**: Monitor and archive old conversations as needed Example: Complete Chat Application ---------------------------------- +---------------------------------- .. 
code-block:: python
@@ -506,7 +506,7 @@ Troubleshooting
 ---------------
 
 Common Issues
-~~~~~~~~~~~~
+~~~~~~~~~~~~~
 
 **ValueError: Please ensure that the session_id parameter is provided**
    - Ensure session_id is not None, empty string, or 0
diff --git a/libs/arangodb/doc/conf.py b/libs/arangodb/doc/conf.py
index a2849d3..175ad18 100644
--- a/libs/arangodb/doc/conf.py
+++ b/libs/arangodb/doc/conf.py
@@ -11,9 +11,9 @@
 
 sys.path.insert(0, os.path.abspath(".."))
 
-project = 'langchain-arangodb'
-copyright = '2025, ArangoDB'
-author = 'ArangoDB'
+project = "langchain-arangodb"
+copyright = "2025, ArangoDB"
+author = "ArangoDB"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -25,15 +25,15 @@
     "sphinx.ext.autosummary",
     "sphinx.ext.inheritance_diagram",
 ]
-templates_path = ['_templates']
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+templates_path = ["_templates"]
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
 
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
-html_theme = 'sphinx_rtd_theme'
-html_static_path = [] # ['_static']
+html_theme = "sphinx_rtd_theme"
+html_static_path = []  # type: ignore
 autodoc_member_order = "bysource"
 autodoc_inherit_docstrings = True
 autosummary_generate = True
diff --git a/libs/arangodb/doc/graph.rst b/libs/arangodb/doc/graph.rst
new file mode 100644
index 0000000..e7d49db
--- /dev/null
+++ b/libs/arangodb/doc/graph.rst
@@ -0,0 +1,524 @@
+ArangoGraph
+===========
+
+The ``ArangoGraph`` class provides an interface to interact with ArangoDB for graph operations in LangChain.
+
+Installation
+------------
+
+.. code-block:: bash
+
+    pip install langchain-arangodb
+
+Basic Usage
+-----------
+
+.. code-block:: python
+
+    from langchain_arangodb.graphs.arangodb_graph import ArangoGraph, get_arangodb_client
+
+    # Connect to ArangoDB
+    db = get_arangodb_client(
+        url="http://localhost:8529",
+        dbname="_system",
+        username="root",
+        password="password"
+    )
+
+    # Initialize ArangoGraph
+    graph = ArangoGraph(db)
+
+
+Factory Methods
+---------------
+
+get_arangodb_client
+~~~~~~~~~~~~~~~~~~~~
+
+Creates a connection to ArangoDB.
+
+.. code-block:: python
+
+    from langchain_arangodb.graphs.arangodb_graph import get_arangodb_client
+
+    # Using direct credentials
+    db = get_arangodb_client(
+        url="http://localhost:8529",
+        dbname="_system",
+        username="root",
+        password="password"
+    )
+
+    # Using environment variables
+    # ARANGODB_URL
+    # ARANGODB_DBNAME
+    # ARANGODB_USERNAME
+    # ARANGODB_PASSWORD
+    db = get_arangodb_client()
+
+from_db_credentials
+~~~~~~~~~~~~~~~~~~~
+
+Alternative constructor that creates an ArangoGraph instance directly from credentials.
+
+.. code-block:: python
+
+    graph = ArangoGraph.from_db_credentials(
+        url="http://localhost:8529",
+        dbname="_system",
+        username="root",
+        password="password"
+    )
+
+Core Methods
+------------
+
+add_graph_documents
+~~~~~~~~~~~~~~~~~~~
+
+Adds graph documents to the database.
+
+.. 
code-block:: python
+
+    from langchain_core.documents import Document
+    from langchain_arangodb.graphs.graph_document import GraphDocument, Node, Relationship
+
+    # Create nodes and relationships
+    nodes = [
+        Node(id="1", type="Person", properties={"name": "Alice"}),
+        Node(id="2", type="Company", properties={"name": "Acme"})
+    ]
+
+    relationship = Relationship(
+        source=nodes[0],
+        target=nodes[1],
+        type="WORKS_AT",
+        properties={"since": 2020}
+    )
+
+    # Create graph document
+    doc = GraphDocument(
+        nodes=nodes,
+        relationships=[relationship],
+        source=Document(page_content="Employee record")
+    )
+
+    # Add to database
+    graph.add_graph_documents(
+        graph_documents=[doc],
+        include_source=True,
+        graph_name="EmployeeGraph",
+        update_graph_definition_if_exists=True,
+        capitalization_strategy="lower"
+    )
+
+Example: Using LLMGraphTransformer
+
+.. code-block:: python
+
+    from langchain_core.documents import Document
+    from langchain_experimental.graph_transformers import LLMGraphTransformer
+    from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+
+    # Text to transform into a graph
+    text = "Bob knows Alice, John knows Bob."
+
+    # Initialize transformer with ChatOpenAI
+    transformer = LLMGraphTransformer(
+        llm=ChatOpenAI(temperature=0)
+    )
+
+    # Create graph documents from the text
+    graph_docs = transformer.convert_to_graph_documents(
+        [Document(page_content=text)]
+    )
+
+    # Add to ArangoDB with embeddings
+    graph.add_graph_documents(
+        graph_docs,
+        graph_name="people_graph",
+        use_one_entity_collection=False,  # Creates 'Person' node collection and 'KNOWS' edge collection
+        update_graph_definition_if_exists=True,
+        include_source=True,
+        embeddings=OpenAIEmbeddings(),
+        embed_nodes=True  # Embeds 'Alice' and 'Bob' nodes
+    )
+
+query
+~~~~~
+
+Executes AQL queries against the database.
+
+.. code-block:: python
+
+    # Simple query
+    result = graph.query("FOR doc IN users RETURN doc")
+
+    # Query with parameters
+    result = graph.query(
+        "FOR u IN users FILTER u.age > @min_age RETURN u",
+        params={"min_age": 21}
+    )
+
+explain
+~~~~~~~
+
+Gets the query execution plan.
+
+.. code-block:: python
+
+    plan = graph.explain(
+        "FOR doc IN users RETURN doc"
+    )
+
+Schema Management
+-----------------
+
+refresh_schema
+~~~~~~~~~~~~~~
+
+Updates the internal schema representation.
+
+.. code-block:: python
+
+    graph.refresh_schema(
+        sample_ratio=0.1,  # Sample 10% of documents
+        graph_name="MyGraph",
+        include_examples=True
+    )
+
+generate_schema
+~~~~~~~~~~~~~~~
+
+Generates a schema representation of the database.
+
+.. code-block:: python
+
+    schema = graph.generate_schema(
+        sample_ratio=0.1,
+        graph_name="MyGraph",
+        include_examples=True,
+        list_limit=32
+    )
+
+set_schema
+~~~~~~~~~~
+
+Sets a custom schema.
+
+.. code-block:: python
+
+    custom_schema = {
+        "collections": {
+            "users": {"fields": ["name", "age"]},
+            "products": {"fields": ["name", "price"]}
+        }
+    }
+
+    graph.set_schema(custom_schema)
+
+Schema Properties
+-----------------
+
+schema
+~~~~~~
+
+Gets the current schema as a dictionary.
+
+.. code-block:: python
+
+    current_schema = graph.schema
+
+schema_json
+~~~~~~~~~~~~
+
+Gets the schema as a JSON string.
+
+.. code-block:: python
+
+    schema_json = graph.schema_json
+
+schema_yaml
+~~~~~~~~~~~
+
+Gets the schema as a YAML string.
+
+.. code-block:: python
+
+    schema_yaml = graph.schema_yaml
+
+get_structured_schema
+~~~~~~~~~~~~~~~~~~~~~
+
+Gets the schema in a structured format.
+
+.. 
code-block:: python
+
+    structured_schema = graph.get_structured_schema
+
+Internal Utility Methods
+------------------------
+
+These methods are used internally but may be useful for advanced use cases:
+
+_sanitize_collection_name
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sanitizes collection names to be valid in ArangoDB.
+
+.. code-block:: python
+
+    safe_name = graph._sanitize_collection_name("My Collection!")
+    # Returns: "My_Collection_"
+
+_sanitize_input
+~~~~~~~~~~~~~~~~
+
+Sanitizes input data by truncating long strings and lists.
+
+.. code-block:: python
+
+    sanitized = graph._sanitize_input(
+        {"list": [1,2,3,4,5,6]},
+        list_limit=5,
+        string_limit=100
+    )
+
+_hash
+~~~~~
+
+Generates a hash string for a value.
+
+.. code-block:: python
+
+    hash_str = graph._hash("some value")
+
+_process_source
+~~~~~~~~~~~~~~~~
+
+Processes a source document for storage.
+
+.. code-block:: python
+
+    from langchain_core.documents import Document
+
+    source = Document(
+        page_content="test content",
+        metadata={"author": "Alice"}
+    )
+
+    source_id = graph._process_source(
+        source=source,
+        source_collection_name="sources",
+        source_embedding=[0.1, 0.2, 0.3],
+        embedding_field="embedding",
+        insertion_db=db
+    )
+
+_import_data
+~~~~~~~~~~~~~
+
+Bulk imports data into collections.
+
+.. code-block:: python
+
+    data = {
+        "users": [
+            {"_key": "1", "name": "Alice"},
+            {"_key": "2", "name": "Bob"}
+        ]
+    }
+
+    graph._import_data(db, data, is_edge=False)
+
+
+Example Workflow
+----------------
+
+Here's a complete example demonstrating a typical workflow using ArangoGraph to create a knowledge graph from documents:
+
+.. code-block:: python
+
+    from langchain_core.documents import Document
+    from langchain_core.embeddings import Embeddings
+    from langchain_arangodb.graphs.arangodb_graph import ArangoGraph, get_arangodb_client
+    from langchain_arangodb.graphs.graph_document import GraphDocument, Node, Relationship
+
+    # 1. Setup embeddings (example using OpenAI - you can use any embeddings model)
+    from langchain_openai import OpenAIEmbeddings
+    embeddings = OpenAIEmbeddings()
+
+    # 2. Connect to ArangoDB and initialize graph
+    db = get_arangodb_client(
+        url="http://localhost:8529",
+        dbname="knowledge_base",
+        username="root",
+        password="password"
+    )
+    graph = ArangoGraph(db)
+
+    # 3. Create sample documents with relationships
+    documents = [
+        Document(
+            page_content="Alice is a software engineer at Acme Corp.",
+            metadata={"source": "employee_records", "date": "2024-01-01"}
+        ),
+        Document(
+            page_content="Bob is a project manager working with Alice on Project X.",
+            metadata={"source": "project_docs", "date": "2024-01-02"}
+        )
+    ]
+
+    # 4. 
Create nodes and relationships for each document
+    graph_documents = []
+    for doc in documents:
+        # Extract entities and relationships (simplified example)
+        if "Alice" in doc.page_content:
+            alice_node = Node(id="alice", type="Person", properties={"name": "Alice", "role": "Software Engineer"})
+            company_node = Node(id="acme", type="Company", properties={"name": "Acme Corp"})
+            works_at_rel = Relationship(
+                source=alice_node,
+                target=company_node,
+                type="WORKS_AT"
+            )
+            graph_doc = GraphDocument(
+                nodes=[alice_node, company_node],
+                relationships=[works_at_rel],
+                source=doc
+            )
+            graph_documents.append(graph_doc)
+
+        if "Bob" in doc.page_content:
+            bob_node = Node(id="bob", type="Person", properties={"name": "Bob", "role": "Project Manager"})
+            project_node = Node(id="project_x", type="Project", properties={"name": "Project X"})
+            manages_rel = Relationship(
+                source=bob_node,
+                target=project_node,
+                type="MANAGES"
+            )
+            works_with_rel = Relationship(
+                source=bob_node,
+                target=alice_node,
+                type="WORKS_WITH"
+            )
+            graph_doc = GraphDocument(
+                nodes=[bob_node, project_node],
+                relationships=[manages_rel, works_with_rel],
+                source=doc
+            )
+            graph_documents.append(graph_doc)
+
+    # 5. Add documents to the graph with embeddings
+    graph.add_graph_documents(
+        graph_documents=graph_documents,
+        include_source=True,  # Store original documents
+        graph_name="CompanyGraph",
+        update_graph_definition_if_exists=True,
+        embed_source=True,  # Generate embeddings for documents
+        embed_nodes=True,  # Generate embeddings for nodes
+        embed_relationships=True,  # Generate embeddings for relationships
+        embeddings=embeddings,
+        batch_size=100,
+        capitalization_strategy="lower"
+    )
+
+    # 6. Query the graph
+    # Find all people who work at Acme Corp
+    # (WORKS_AT edges point person -> company, so traverse INBOUND from the company)
+    employees = graph.query("""
+        FOR c IN ENTITY
+            FILTER c.type == 'Company' AND c.name == 'Acme Corp'
+            FOR v, e IN 1..1 INBOUND c._id ENTITY_EDGE
+                RETURN {
+                    name: v.name,
+                    role: v.role,
+                    company: c.name
+                }
+    """)
+
+    # Find all projects and their managers
+    projects = graph.query("""
+        FOR p IN ENTITY
+            FILTER p.type == 'Project'
+            FOR v, e IN 1..1 INBOUND p._id ENTITY_EDGE
+                FILTER e.type == 'MANAGES'
+                RETURN {
+                    project: p.name,
+                    manager: v.name
+                }
+    """)
+
+    # 7. Generate and inspect schema
+    schema = graph.generate_schema(
+        sample_ratio=1.0,  # Use all documents for schema
+        graph_name="CompanyGraph",
+        include_examples=True
+    )
+
+    print("Schema:", schema)
+
+    # 8. Error handling for queries
+    from arango.exceptions import ArangoServerError
+
+    try:
+        # Complex query with potential for errors
+        result = graph.query("""
+            FOR start IN ENTITY
+                FILTER start.name == 'Alice'
+                FOR v, e, p IN 1..3 OUTBOUND start._id ENTITY_EDGE
+                    RETURN p
+        """)
+    except ArangoServerError as e:
+        print(f"Query error: {e}")
+
+This workflow demonstrates:
+
+1. Setting up the environment with embeddings
+2. Connecting to ArangoDB
+3. Creating documents with structured relationships
+4. Adding documents to the graph with embeddings
+5. Querying the graph using AQL
+6. Schema management
+7. Error handling
+
+The example creates a simple company knowledge graph with:
+
+- People (employees)
+- Companies
+- Projects
+- Various relationships (WORKS_AT, MANAGES, WORKS_WITH)
+- Document sources with embeddings
+
+Key Features Used:
+
+- Document embedding
+- Node and relationship embedding
+- Source document storage
+- Graph schema management
+- AQL queries
+- Error handling
+- Batch processing
+
+
+Best Practices
+--------------
+
+1. Always use appropriate capitalization strategy for consistency
+2. 

Best Practices
--------------

1. Always use an appropriate capitalization strategy for consistency
2. Use batch operations for large data imports
3. Consider using embeddings for semantic search capabilities
4. Implement proper error handling for database operations
5. Use schema management for better data organization

Error Handling
--------------

.. code-block:: python

    from arango.exceptions import ArangoServerError

    try:
        result = graph.query("FOR doc IN nonexistent RETURN doc")
    except ArangoServerError as e:
        print(f"Database error: {e}")


--------------




diff --git a/libs/arangodb/doc/index.rst b/libs/arangodb/doc/index.rst
index b134c58..7c458fa 100644
--- a/libs/arangodb/doc/index.rst
+++ b/libs/arangodb/doc/index.rst
@@ -97,15 +97,12 @@ Documentation Contents
 
    quickstart
    vectorstores
    chat_message_histories
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Advanced:
-
-   mydirectory/index
+   graph
+   arangoqachain
 
 .. toctree::
    :maxdepth: 2
    :caption: API Reference:
 
-   api_reference
\ No newline at end of file
+   api_reference
+
diff --git a/libs/arangodb/doc/mydirectory/index.rst b/libs/arangodb/doc/mydirectory/index.rst
deleted file mode 100644
index 7e344ae..0000000
--- a/libs/arangodb/doc/mydirectory/index.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-.. _mydirectory:
-
-Hello World
-============
\ No newline at end of file
diff --git a/libs/arangodb/langchain_arangodb/chains/graph_qa/arangodb.py b/libs/arangodb/langchain_arangodb/chains/graph_qa/arangodb.py
index fefa6be..f2050af 100644
--- a/libs/arangodb/langchain_arangodb/chains/graph_qa/arangodb.py
+++ b/libs/arangodb/langchain_arangodb/chains/graph_qa/arangodb.py
@@ -105,14 +105,17 @@ def __init__(self, **kwargs: Any) -> None:
 
     @property
     def input_keys(self) -> List[str]:
+        """Get the input keys for the chain."""
         return [self.input_key]
 
     @property
     def output_keys(self) -> List[str]:
+        """Get the output keys for the chain."""
         return [self.output_key]
 
     @property
     def _chain_type(self) -> str:
+        """Get the chain type."""
         return "graph_aql_chain"
 
     @classmethod
@@ -120,12 +123,34 @@ def from_llm(
         cls,
         llm: BaseLanguageModel,
         *,
-        qa_prompt: BasePromptTemplate = AQL_QA_PROMPT,
-        aql_generation_prompt: BasePromptTemplate = AQL_GENERATION_PROMPT,
-        aql_fix_prompt: BasePromptTemplate = AQL_FIX_PROMPT,
+        qa_prompt: Optional[BasePromptTemplate] = None,
+        aql_generation_prompt: Optional[BasePromptTemplate] = None,
+        aql_fix_prompt: Optional[BasePromptTemplate] = None,
         **kwargs: Any,
     ) -> ArangoGraphQAChain:
-        """Initialize from LLM."""
+        """Initialize from LLM.
+
+        :param llm: The language model to use.
+        :type llm: BaseLanguageModel
+        :param qa_prompt: The prompt to use for the QA chain.
+            Defaults to AQL_QA_PROMPT.
+        :type qa_prompt: Optional[BasePromptTemplate]
+        :param aql_generation_prompt: The prompt to use for the AQL generation chain.
+            Defaults to AQL_GENERATION_PROMPT.
+        :type aql_generation_prompt: Optional[BasePromptTemplate]
+        :param aql_fix_prompt: The prompt to use for the AQL fix chain.
+            Defaults to AQL_FIX_PROMPT.
+        :type aql_fix_prompt: Optional[BasePromptTemplate]
+        :param kwargs: Additional keyword arguments.
+        :type kwargs: Any
+        :return: The initialized ArangoGraphQAChain.
+        :rtype: ArangoGraphQAChain
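+
+        Example (a minimal sketch, assuming an existing ``ArangoGraph``
+        instance and chat model, as shown in the quickstart):
+
+        .. code-block:: python
+
+            chain = ArangoGraphQAChain.from_llm(
+                llm=llm,
+                graph=graph,
+                allow_dangerous_requests=True,
+            )
+            response = chain.invoke("Who works at Acme Corp?")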
+        """
+        if qa_prompt is None:
+            qa_prompt = AQL_QA_PROMPT
+        if aql_generation_prompt is None:
+            aql_generation_prompt = AQL_GENERATION_PROMPT
+        if aql_fix_prompt is None:
+            aql_fix_prompt = AQL_FIX_PROMPT
+
         qa_chain = qa_prompt | llm
         aql_generation_chain = aql_generation_prompt | llm
         aql_fix_chain = aql_fix_prompt | llm
@@ -149,37 +174,37 @@ def _call(
 
         Users can modify the following ArangoGraphQAChain Class Variables:
 
-        :var top_k: The maximum number of AQL Query Results to return
+        :param top_k: The maximum number of AQL Query Results to return
         :type top_k: int
 
-        :var aql_examples: A set of AQL Query Examples that are passed to
+        :param aql_examples: A set of AQL Query Examples that are passed to
             the AQL Generation Prompt Template to promote few-shot-learning.
             Defaults to an empty string.
         :type aql_examples: str
 
-        :var return_aql_query: Whether to return the AQL Query in the
+        :param return_aql_query: Whether to return the AQL Query in the
             output dictionary. Defaults to False.
         :type return_aql_query: bool
 
-        :var return_aql_result: Whether to return the AQL Query in the
+        :param return_aql_result: Whether to return the AQL Query Result in the
             output dictionary. Defaults to False
         :type return_aql_result: bool
 
-        :var max_aql_generation_attempts: The maximum amount of AQL
+        :param max_aql_generation_attempts: The maximum number of AQL
             Generation attempts to be made prior to raising the last
             AQL Query Execution Error. Defaults to 3.
         :type max_aql_generation_attempts: int
 
-        :var execute_aql_query: If False, the AQL Query is only
+        :param execute_aql_query: If False, the AQL Query is only
             explained & returned, not executed. Defaults to True.
         :type execute_aql_query: bool
 
-        :var output_list_limit: The maximum list length to display
+        :param output_list_limit: The maximum list length to display
             in the output. If the list is longer, it will be truncated.
             Defaults to 32.
         :type output_list_limit: int
 
-        :var output_string_limit: The maximum string length to display
+        :param output_string_limit: The maximum string length to display
             in the output. If the string is longer, it will be truncated.
             Defaults to 256.
         :type output_string_limit: int
@@ -348,11 +373,11 @@ def _call(
     def _is_read_only_query(self, aql_query: str) -> Tuple[bool, Optional[str]]:
         """Check if the AQL query is read-only.
 
-        Args:
-            aql_query: The AQL query to check.
+        :param aql_query: The AQL query to check.
+        :type aql_query: str
 
-        Returns:
-            bool: True if the query is read-only, False otherwise.
+        :return: Whether the query is read-only, and the write
+            operation found in the query, if any.
+        :rtype: Tuple[bool, Optional[str]]
         """
         normalized_query = aql_query.upper()
 
diff --git a/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py b/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py
index f128242..17be379 100644
--- a/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py
+++ b/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py
@@ -29,18 +29,24 @@ def get_arangodb_client(
 ) -> Any:
     """Get the Arango DB client from credentials.
 
-    Args:
-        url: Arango DB url. Can be passed in as named arg or set as environment
-            var ``ARANGODB_URL``. Defaults to "http://localhost:8529".
-        dbname: Arango DB name. Can be passed in as named arg or set as
-            environment var ``ARANGODB_DBNAME``. Defaults to "_system".
-        username: Can be passed in as named arg or set as environment var
-            ``ARANGODB_USERNAME``. Defaults to "root".
-        password: Can be passed ni as named arg or set as environment var
-            ``ARANGODB_PASSWORD``. Defaults to "".
- - Returns: - An arango.database.StandardDatabase. + :param url: Arango DB url. Can be passed in as named arg or set as environment + var ``ARANGODB_URL``. Defaults to "http://localhost:8529". + :type url: str + :param dbname: Arango DB name. Can be passed in as named arg or set as + environment var ``ARANGODB_DBNAME``. Defaults to "_system". + :type dbname: str + :param username: Can be passed in as named arg or set as environment var + ``ARANGODB_USERNAME``. Defaults to "root". + :type username: str + :param password: Can be passed in as named arg or set as environment var + ``ARANGODB_PASSWORD``. Defaults to "". + :type password: str + + :return: An arango.database.StandardDatabase. + :rtype: Any + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. """ _url: str = url or str(os.environ.get("ARANGODB_URL", "http://localhost:8529")) _dbname: str = dbname or str(os.environ.get("ARANGODB_DBNAME", "_system")) @@ -53,24 +59,39 @@ def get_arangodb_client( class ArangoGraph(GraphStore): """ArangoDB wrapper for graph operations. - Parameters: - - db (arango.database.StandardDatabase): ArangoDB database instance. - - generate_schema_on_init (bool): Whether to generate the graph schema + :param db: The ArangoDB database instance. + :type db: StandardDatabase + :param generate_schema_on_init: Whether to generate the graph schema on initialization. Defaults to True. - - schema_sample_ratio (float): A float (0 to 1) to determine the - ratio of documents/edges sampled in relation to the Collection size - to generate each Collection Schema. If 0, one document/edge + :type generate_schema_on_init: bool + :param schema_sample_ratio: The ratio of documents/edges to sample in relation to + the Collection size to generate each Collection Schema. If 0, one document/edge is used per Collection. Defaults to 0. - - schema_graph_name (str): The name of an existing ArangoDB Graph to specifically + :type schema_sample_ratio: float + :param schema_graph_name: The name of an existing ArangoDB Graph to specifically use to generate the schema. If None, the entire database will be used. Defaults to None. - - schema_include_examples (bool): Whether to include example values fetched from + :type schema_graph_name: Optional[str] + :param schema_include_examples: Whether to include example values fetched from a sample documents as part of the schema. Defaults to True. Lists of size higher than **schema_list_limit** will be excluded from the schema, even if **schema_include_examples** is set to True. Defaults to True. - - schema_list_limit (int): The maximum list size the schema will include as part + :type schema_include_examples: bool + :param schema_list_limit: The maximum list size the schema will include as part of the example values. If the list is longer than this limit, a string describing the list will be used in the schema instead. Default is 32. + :type schema_list_limit: int + :param schema_string_limit: The maximum number of characters to include + in a string. If the string is longer than this limit, a string + describing the string will be used in the schema instead. Default is 256. + :type schema_string_limit: int + + :return: None + :rtype: None + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. 
+ :raises ArangoCollectionError: If the collection cannot be created. + *Security note*: Make sure that the database connection uses credentials that are narrowly-scoped to only include necessary permissions. @@ -82,6 +103,8 @@ class ArangoGraph(GraphStore): limit the permissions granted to the credentials used with this tool. See https://python.langchain.com/docs/security for more information. + + """ def __init__( @@ -94,6 +117,10 @@ def __init__( schema_list_limit: int = 32, schema_string_limit: int = 256, ) -> None: + """ + Initializes the ArangoGraph instance. + + """ self.__db: StandardDatabase = db self.__async_db = db.begin_async_execution() @@ -123,16 +150,30 @@ def get_structured_schema(self) -> Dict[str, Any]: @property def schema_json(self) -> str: - """Returns the schema of the Graph Database as a JSON string""" + """Returns the schema of the Graph Database as a JSON string + + :return: The schema of the Graph Database as a JSON string + :rtype: str + """ return json.dumps(self.__schema) @property def schema_yaml(self) -> str: - """Returns the schema of the Graph Database as a YAML string""" + """Returns the schema of the Graph Database as a YAML string + + :return: The schema of the Graph Database as a YAML string + :rtype: str + """ return yaml.dump(self.__schema, sort_keys=False) def set_schema(self, schema: Dict[str, Any]) -> None: - """Sets a custom schema for the ArangoDB Database.""" + """Sets a custom schema for the ArangoDB Database. + + :param schema: The schema to set. + :type schema: Dict[str, Any] + :return: None + :rtype: None + """ self.__schema = schema def refresh_schema( @@ -146,20 +187,31 @@ def refresh_schema( Refresh the graph schema information. Parameters: - - sample_ratio (float): A float (0 to 1) to determine the + + :param sample_ratio: A float (0 to 1) to determine the ratio of documents/edges sampled in relation to the Collection size to generate each Collection Schema. If 0, one document/edge is used per Collection. Defaults to 0. - - graph_name (str): The name of an existing ArangoDB Graph to specifically + :type sample_ratio: float + :param graph_name: The name of an existing ArangoDB Graph to specifically use to generate the schema. If None, the entire database will be used. Defaults to None. - - include_examples (bool): Whether to include example values fetched from + :type graph_name: Optional[str] + :param include_examples: Whether to include example values fetched from a sample documents as part of the schema. Defaults to True. Lists of size higher than **list_limit** will be excluded from the schema, even if **schema_include_examples** is set to True. Defaults to True. - - list_limit (int): The maximum list size the schema will include as part + :type include_examples: bool + :param list_limit: The maximum list size the schema will include as part of the example values. If the list is longer than this limit, a string describing the list will be used in the schema instead. Default is 32. + :type list_limit: int + + :return: None + :rtype: None + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. 
""" self.__schema = self.generate_schema( sample_ratio, graph_name, include_examples, list_limit @@ -176,21 +228,31 @@ def generate_schema( """ Generates the schema of the ArangoDB Database and returns it - Parameters: - - sample_ratio (float): A ratio (0 to 1) to determine the - ratio of documents/edges used (in relation to the Collection size) - to render each Collection Schema. If 0, one document/edge - is used per Collection. - - graph_name (str): The name of the graph to use to generate the schema. If + :param sample_ratio: A ratio (0 to 1) to determine the + ratio of documents/edges used (in relation to the Collection size) + to render each Collection Schema. If 0, one document/edge + is used per Collection. + :type sample_ratio: float + :param graph_name: The name of the graph to use to generate the schema. If None, the entire database will be used. - - include_examples (bool): A flag whether to scan the database for + :type graph_name: Optional[str] + :param include_examples: A flag whether to scan the database for example values and use them in the graph schema. Default is True. - - list_limit (int): The maximum number of elements to include in a list. + :type include_examples: bool + :param list_limit: The maximum number of elements to include in a list. If the list is longer than this limit, a string describing the list will be used in the schema instead. Default is 32. - - schema_string_limit (int): The maximum number of characters to include + :type list_limit: int + :param schema_string_limit: The maximum number of characters to include in a string. If the string is longer than this limit, a string describing the string will be used in the schema instead. Default is 128. + :type schema_string_limit: int + :return: A dictionary containing the graph schema and collection schema. + :rtype: Dict[str, List[Dict[str, Any]]] + :raises ValueError: If the sample ratio is not between 0 and 1. + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. """ if not 0 <= sample_ratio <= 1: raise ValueError("**sample_ratio** value must be in between 0 to 1") @@ -273,18 +335,26 @@ def query( Execute an AQL query and return the results. Parameters: - - query (str): The AQL query to execute. - - params (dict): Additional arguments piped to the function. - - top_k: Number of results to process from the AQL cursor. - Defaults to None. - - list_limit: Removes lists above **list_limit** size - that have been returned from the AQL query. - - string_limit: Removes strings above **string_limit** size - that have been returned from the AQL query. - - Remaining params are passed to the AQL query execution. - - Returns: - - A list of dictionaries containing the query results. + :param query: The AQL query to execute. + :type query: str + :param params: Additional arguments piped to the function. + Defaults to None. + :type params: dict + :param list_limit: Removes lists above **list_limit** size + that have been returned from the AQL query. + :type list_limit: Optional[int] + :param string_limit: Removes strings above **string_limit** size + that have been returned from the AQL query. + :type string_limit: Optional[int] + :param remaining_params: Remaining params are passed to the AQL query execution. + Defaults to None. + :type remaining_params: Optional[dict] + + :return: A list of dictionaries containing the query results. 
+ :rtype: List[Any] + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. """ top_k = params.pop("top_k", None) list_limit = params.pop("list_limit", 32) @@ -308,11 +378,16 @@ def explain(self, query: str, params: dict = {}) -> List[Dict[str, Any]]: """ Explain an AQL query without executing it. - Parameters: - - query (str): The AQL query to explain. - - Returns: - - A list of dictionaries containing the query explanation. + :param query: The AQL query to explain. + :type query: str + :param params: Additional arguments piped to the function. + Defaults to None. + :type params: dict + :return: A list of dictionaries containing the query explanation. + :rtype: List[Dict[str, Any]] + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. """ return self.__db.aql.explain(query) # type: ignore @@ -340,50 +415,51 @@ def add_graph_documents( Constructs nodes & relationships in the graph based on the provided GraphDocument objects. - Parameters: - - graph_documents (List[GraphDocument]): A list of GraphDocument objects - that contain the nodes and relationships to be added to the graph. Each - GraphDocument should encapsulate the structure of part of the graph, - including nodes, relationships, and the source document information. - - include_source (bool, optional): If True, stores the source document - and links it to nodes in the graph using the HAS_SOURCE relationship. - This is useful for tracing back the origin of data. Merges source - documents based on the `id` property from the source document if available, - otherwise it calculates the Farmhash hash of `page_content` - for merging process. Defaults to False. - - graph_name (str): The name of the ArangoDB General Graph to create. If None, - no graph will be created. - - update_graph_definition_if_exists (bool): If True, updates the graph - Edge Definitions - if it already exists. Defaults to False. Not used if `graph_name` is None. It is - recommended to set this to True if `use_one_entity_collection` is set to False. - - batch_size (int): The number of nodes/edges to insert in a single batch. - - use_one_entity_collection (bool): If True, all nodes are stored in a single - entity collection. If False, nodes are stored in separate collections based - on their type. Defaults to True. - - insert_async (bool): If True, inserts data asynchronously. Defaults to False. - - source_collection_name (str): The name of the collection to store the source - documents. Defaults to "SOURCE". - - source_edge_collection_name (str): The name of the edge collection to store - the relationships between source documents and nodes. Defaults to "HAS_SOURCE". - - entity_collection_name (str): The name of the collection to store the nodes. - Defaults to "ENTITY". Only used if `use_one_entity_collection` is True. - - entity_edge_collection_name (str): The name of the edge collection to store - the relationships between nodes. Defaults to "LINKS_TO". Only used if - `use_one_entity_collection` is True. - - embeddings (Embeddings): An Embeddings object to use for embedding the source, - nodes and relationships. Defaults to None. - - embedding_field (set[str]): The field name to store the embedding. Defaults - to "embedding". 
Only used if `embedding` is not None, and `embed_source`, - `embed_nodes`, or `embed_relationships` is True. - - embed_source (bool): If True, embeds the source document. Defaults to False. - - embed_nodes (bool): If True, embeds the nodes. Defaults to False. - - embed_relationships (bool): If True, embeds the relationships. - Defaults to False. - - capitalization_strategy (str): The capitalization strategy applied on the - node and edge keys. Can be "lower", "upper", or "none". Defaults to "none". - Useful as a basic Entity Resolution technique to avoid duplicates based - on capitalization. + :param graph_documents: The GraphDocument objects to add to the graph. + :type graph_documents: List[GraphDocument] + :param include_source: Whether to include the source document in the graph. + :type include_source: bool + :param graph_name: The name of the graph to add the documents to. + :type graph_name: Optional[str] + :param update_graph_definition_if_exists: Whether to update the graph definition + if it already exists. + :type update_graph_definition_if_exists: bool + :param batch_size: The number of documents to process in each batch. + :type batch_size: int + :param use_one_entity_collection: Whether to use one entity collection + for all nodes. + :type use_one_entity_collection: bool + :param insert_async: Whether to insert the documents asynchronously. + :type insert_async: bool + :param source_collection_name: The name of the source collection. + :type source_collection_name: Union[str, None] + :param source_edge_collection_name: The name of the source edge collection. + :type source_edge_collection_name: Union[str, None] + :param entity_collection_name: The name of the entity collection. + :type entity_collection_name: Union[str, None] + :param entity_edge_collection_name: The name of the entity edge collection. + :type entity_edge_collection_name: Union[str, None] + :param embeddings: The embeddings model to use. + :type embeddings: Union[Embeddings, None] + :param embedding_field: The field to use for the embedding. + :type embedding_field: str + :param embed_source: Whether to embed the source document. + :type embed_source: bool + :param embed_nodes: Whether to embed the nodes. + :type embed_nodes: bool + :param embed_relationships: Whether to embed the relationships. + :type embed_relationships: bool + :param capitalization_strategy: The capitalization strategy to use. + :type capitalization_strategy: str + + :return: None + :rtype: None + :raises ValueError: If the capitalization strategy is not 'lower', + 'upper', or 'none'. + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + """ if not graph_documents: return @@ -624,18 +700,24 @@ def from_db_credentials( ) -> Any: """Convenience constructor that builds Arango DB from credentials. - Args: - url: Arango DB url. Can be passed in as named arg or set as environment + :param url: Arango DB url. Can be passed in as named arg or set as environment var ``ARANGODB_URL``. Defaults to "http://localhost:8529". - dbname: Arango DB name. Can be passed in as named arg or set as + :type url: str + :param dbname: Arango DB name. Can be passed in as named arg or set as environment var ``ARANGODB_DBNAME``. Defaults to "_system". 
-        username: Can be passed in as named arg
+    :type dbname: str
+    :param username: Can be passed in as named arg or set as environment var
         ``ARANGODB_USERNAME``. Defaults to "root".
-        password: Can be passed ni as named arg or set as environment var
-            ``ARANGODB_PASSWORD``. Defaults to "".
+    :type username: str
+    :param password: Can be passed in as named arg or set as environment var
+        ``ARANGODB_PASSWORD``. Defaults to "".
+    :type password: str
+
+    :return: An arango.database.StandardDatabase.
+    :rtype: Any
+    :raises ArangoClientError: If the ArangoDB client cannot be created.
+    :raises ArangoServerError: If the ArangoDB server cannot be reached.
 
-    Returns:
-        An arango.database.StandardDatabase.
     """
     db = get_arangodb_client(
         url=url, dbname=dbname, username=username, password=password
@@ -648,7 +730,21 @@ def _import_data(
         data: Dict[str, List[Dict[str, Any]]],
         is_edge: bool,
     ) -> None:
-        """Imports data into the ArangoDB database in bulk."""
+        """Imports data into the ArangoDB database in bulk.
+
+        :param db: The ArangoDB database instance.
+        :type db: Database
+        :param data: The data to import.
+        :type data: Dict[str, List[Dict[str, Any]]]
+        :param is_edge: Whether the data contains edge documents.
+        :type is_edge: bool
+
+        :return: None
+        :rtype: None
+        :raises ArangoClientError: If the ArangoDB client cannot be created.
+        :raises ArangoServerError: If the ArangoDB server cannot be reached.
+        :raises ArangoCollectionError: If the collection cannot be created.
+        """
         for collection, batch in data.items():
             self._create_collection(collection, is_edge)
             db.collection(collection).import_bulk(batch, on_duplicate="update")
@@ -658,7 +754,19 @@ def _import_data(
     def _create_collection(
         self, collection_name: str, is_edge: bool = False, **kwargs: Any
     ) -> None:
-        """Creates a collection in the ArangoDB database if it does not exist."""
+        """Creates a collection in the ArangoDB database if it does not exist.
+
+        :param collection_name: The name of the collection to create.
+        :type collection_name: str
+        :param is_edge: Whether the collection is an edge collection.
+        :type is_edge: bool
+
+        :return: None
+        :rtype: None
+        :raises ArangoClientError: If the ArangoDB client cannot be created.
+        :raises ArangoServerError: If the ArangoDB server cannot be reached.
+        :raises ArangoCollectionError: If the collection cannot be created.
+        """
         if not self.db.has_collection(collection_name):
             self.db.create_collection(collection_name, edge=is_edge, **kwargs)
 
@@ -669,7 +777,20 @@ def _process_node_as_entity(
         nodes: DefaultDict[str, list],
         entity_collection_name: str,
     ) -> str:
-        """Processes a Graph Document Node into ArangoDB as a unanimous Entity."""
+        """Processes a Graph Document Node into ArangoDB as a unanimous Entity.
+
+        :param node_key: The key of the node.
+        :type node_key: str
+        :param node: The node to process.
+        :type node: Node
+        :param nodes: The nodes to process.
+        :type nodes: DefaultDict[str, list]
+        :param entity_collection_name: The name of the entity collection.
+        :type entity_collection_name: str
+
+        :return: The name of the entity collection.
+        :rtype: str
+        """
         nodes[entity_collection_name].append(
             {
                 "_key": node_key,
@@ -683,7 +804,20 @@ def _process_node_as_type(
     def _process_node_as_type(
         self, node_key: str, node: Node, nodes: DefaultDict[str, list], _: str
     ) -> str:
-        """Processes a Graph Document Node into ArangoDB based on its Node Type."""
+        """Processes a Graph Document Node into ArangoDB based on its Node Type.
+
+        :param node_key: The key of the node.
+ :type node_key: str + :param node: The node to process. + :type node: Node + :param nodes: The nodes to process. + :type nodes: DefaultDict[str, list] + :param _: The name of the node type. + :type _: str + + :return: The name of the node type. + :rtype: str + """ node_type = self._sanitize_collection_name(node.type) nodes[node_type].append({"_key": node_key, "text": node.id, **node.properties}) return node_type @@ -700,7 +834,34 @@ def _process_edge_as_entity( entity_edge_collection_name: str, _: DefaultDict[str, DefaultDict[str, set[str]]], ) -> None: - """Processes a Graph Document Edge into ArangoDB as a unanimous Entity.""" + """Processes a Graph Document Edge into ArangoDB as a unanimous Entity. + + :param edge: The edge to process. + :type edge: Relationship + :param edge_str: The string representation of the edge. + :type edge_str: str + :param edge_key: The key of the edge. + :type edge_key: str + :param source_key: The key of the source node. + :type source_key: str + :param target_key: The key of the target node. + :type target_key: str + :param edges: The edges to process. + :type edges: DefaultDict[str, list] + :param entity_collection_name: The name of the entity collection. + :type entity_collection_name: str + :param entity_edge_collection_name: The name of the entity edge collection. + :type entity_edge_collection_name: str + :param _: The name of the edge type. + :type _: DefaultDict[str, DefaultDict[str, set[str]]] + + :return: None + :rtype: None + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + + """ edges[entity_edge_collection_name].append( { "_key": edge_key, @@ -724,7 +885,29 @@ def _process_edge_as_type( _2: str, edge_definitions_dict: DefaultDict[str, DefaultDict[str, set[str]]], ) -> None: - """Processes a Graph Document Edge into ArangoDB based on its Edge Type.""" + """Processes a Graph Document Edge into ArangoDB based on its Edge Type. + + :param edge: The edge to process. + :type edge: Relationship + :param edge_str: The string representation of the edge. + :type edge_str: str + :param edge_key: The key of the edge. + :type edge_key: str + :param source_key: The key of the source node. + :type source_key: str + :param target_key: The key of the target node. + :type target_key: str + :param edges: The edges to process. + :type edges: DefaultDict[str, list] + :param edge_definitions_dict: The edge definitions dictionary. + :type edge_definitions_dict: DefaultDict[str, DefaultDict[str, set[str]]] + + :return: None + :rtype: None + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + """ source: Node = edge.source target: Node = edge.target @@ -753,7 +936,25 @@ def _get_node_key( entity_collection_name: str, process_node_fn: Any, ) -> str: - """Gets the key of a node and processes it if it doesn't exist.""" + """Gets the key of a node and processes it if it doesn't exist. + + :param node: The node to process. + :type node: Node + :param nodes: The nodes to process. + :type nodes: DefaultDict[str, list] + :param node_key_map: The node key map. + :type node_key_map: Dict[str, str] + :param entity_collection_name: The name of the entity collection. + :type entity_collection_name: str + :param process_node_fn: The function to process the node. 
+ :type process_node_fn: Any + + :return: The key of the node. + :rtype: str + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + """ node.id = str(node.id) if node.id in node_key_map: return node_key_map[node.id] @@ -772,7 +973,25 @@ def _process_source( embedding_field: str, insertion_db: Database, ) -> str: - """Processes a Graph Document Source into ArangoDB.""" + """Processes a Graph Document Source into ArangoDB. + + :param source: The source to process. + :type source: Document + :param source_collection_name: The name of the source collection. + :type source_collection_name: str + :param source_embedding: The embedding of the source. + :type source_embedding: Union[list[float], None] + :param embedding_field: The field name to store the embedding. + :type embedding_field: str + :param insertion_db: The database to insert the source into. + :type insertion_db: Database + + :return: The key of the source. + :rtype: str + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + """ source_id = self._hash( source.id if source.id else source.page_content.encode("utf-8") ) @@ -792,7 +1011,16 @@ def _process_source( return source_id def _hash(self, value: Any) -> str: - """Applies the Farmhash hash function to a value.""" + """Applies the Farmhash hash function to a value. + + :param value: The value to hash. + :type value: Any + + :return: The hashed value. + :rtype: str + :raises ValueError: If the value is not a string or has no + string representation. + """ try: value_str = str(value) except Exception: @@ -807,6 +1035,13 @@ def _sanitize_collection_name(self, name: str) -> str: - Trims the name to 256 characters if it's too long. - Replaces invalid characters with underscores (_). - Ensures the name starts with a letter (prepends 'a' if needed). + + :param name: The name to sanitize. + :type name: str + + :return: The sanitized name. + :rtype: str + :raises ValueError: If the collection name is empty. """ if not name: raise ValueError("Collection name cannot be empty.") @@ -831,13 +1066,19 @@ def _sanitize_input(self, d: Any, list_limit: int, string_limit: int) -> Any: results, can occupy significant context space and detract from the LLM's performance by introducing unnecessary noise and cost. - Args: - d (Any): The input dictionary or list to sanitize. - list_limit (int): The limit for the number of elements in a list. - string_limit (int): The limit for the number of characters in a string. + :param d: The input dictionary or list to sanitize. + :type d: Any + :param list_limit: The limit for the number of elements in a list. + :type list_limit: int + :param string_limit: The limit for the number of characters in a string. + :type string_limit: int + + :return: The sanitized dictionary or list. + :rtype: Any + :raises ValueError: If the input is not a dictionary or list. + :raises ValueError: If the list limit is less than 0. + :raises ValueError: If the string limit is less than 0. - Returns: - Any: The sanitized dictionary or list. 
""" if isinstance(d, dict): From b729ffd2df1e3cad8c2bfc5a6112073e0cbc4f3f Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Wed, 11 Jun 2025 15:30:57 -0700 Subject: [PATCH 9/9] Create Documentation for ArangoDB Vector Stores and Chat Message Histories (#11) * Update API documentation to match ArangoClient format - Convert vector stores and chat message histories docstrings to use :param: and :type: format - Add comprehensive examples from integration tests directly in API documentation - Enhanced with rich code examples showing real usage patterns * Fixing params formatting * removing empty notebook * Removing placeholders updating content to match test cases * Updating langchain logo, adding indexing for API reference * Fix doc issues * removing API reference from quickstart guide * Fixing lint line length issues * updating docstring and imports for documentation * Removing advanced index * fix: invoke * rm: docs * bump * cleanup --------- Co-authored-by: Anthony Mahanna --- docs/chains.rst | 50 -- docs/chat_message_histories.rst | 53 -- docs/graphs.rst | 52 -- docs/index.rst | 23 - docs/installation.rst | 32 - docs/query_constructors.rst | 53 -- docs/vectorstores.rst | 55 -- libs/arangodb/doc/api_reference.rst | 52 +- libs/arangodb/doc/arangoqachain.rst | 12 +- libs/arangodb/doc/chat_message_histories.rst | 9 +- libs/arangodb/doc/graph.rst | 8 +- libs/arangodb/doc/index.rst | 11 +- libs/arangodb/doc/langchain-arangodb.ipynb | 0 libs/arangodb/doc/quickstart.rst | 107 ++- libs/arangodb/doc/requirements.txt | 3 + libs/arangodb/doc/vectorstores.rst | 131 +++- .../chat_message_histories/arangodb.py | 119 ++- .../graphs/arangodb_graph.py | 1 - .../vectorstores/arangodb_vector.py | 739 +++++++++++------- 19 files changed, 800 insertions(+), 710 deletions(-) delete mode 100644 docs/chains.rst delete mode 100644 docs/chat_message_histories.rst delete mode 100644 docs/graphs.rst delete mode 100644 docs/index.rst delete mode 100644 docs/installation.rst delete mode 100644 docs/query_constructors.rst delete mode 100644 docs/vectorstores.rst delete mode 100644 libs/arangodb/doc/langchain-arangodb.ipynb diff --git a/docs/chains.rst b/docs/chains.rst deleted file mode 100644 index a247847..0000000 --- a/docs/chains.rst +++ /dev/null @@ -1,50 +0,0 @@ -Chains -====== - -LangChain ArangoDB provides chain implementations that integrate with ArangoDB for various operations. - -ArangoDBChain ------------- - -The main chain implementation that uses ArangoDB for storing and retrieving chain data. - -.. 
code-block:: python - - from langchain_arangodb.chains import ArangoDBChain - from langchain.llms import OpenAI - - # Initialize the chain - chain = ArangoDBChain( - llm=OpenAI(), - arango_url="http://localhost:8529", - username="root", - password="", - database="langchain", - collection_name="chain_data" - ) - - # Run the chain - result = chain.run("What is the capital of France?") - -Features --------- - -- Chain execution with ArangoDB storage -- Integration with LangChain's chain interfaces -- Support for various chain types -- Persistent storage of chain data -- Configurable chain parameters - -Configuration Options --------------------- - -The chain implementation can be configured with various options: - -- ``llm``: The language model to use -- ``arango_url``: URL of the ArangoDB instance -- ``username``: ArangoDB username -- ``password``: ArangoDB password -- ``database``: Database name -- ``collection_name``: Collection name for storing chain data -- ``chain_type``: Type of chain to use -- ``chain_kwargs``: Additional chain parameters \ No newline at end of file diff --git a/docs/chat_message_histories.rst b/docs/chat_message_histories.rst deleted file mode 100644 index a135047..0000000 --- a/docs/chat_message_histories.rst +++ /dev/null @@ -1,53 +0,0 @@ -Chat Message Histories -==================== - -LangChain ArangoDB provides chat message history implementations that allow you to store and retrieve chat messages using ArangoDB. - -ArangoDBChatMessageHistory -------------------------- - -The main chat message history implementation that uses ArangoDB for storing and retrieving chat messages. - -.. code-block:: python - - from langchain_arangodb.chat_message_histories import ArangoDBChatMessageHistory - from langchain.schema import HumanMessage, AIMessage - - # Initialize the chat message history - history = ArangoDBChatMessageHistory( - arango_url="http://localhost:8529", - username="root", - password="", - database="langchain", - collection_name="chat_history", - session_id="user123" - ) - - # Add messages - history.add_user_message("Hello!") - history.add_ai_message("Hi there!") - - # Get all messages - messages = history.messages - -Features --------- - -- Persistent storage of chat messages -- Session-based message organization -- Support for different message types -- Efficient message retrieval -- Integration with LangChain's chat interfaces - -Configuration Options --------------------- - -The chat message history can be configured with various options: - -- ``arango_url``: URL of the ArangoDB instance -- ``username``: ArangoDB username -- ``password``: ArangoDB password -- ``database``: Database name -- ``collection_name``: Collection name for storing messages -- ``session_id``: Unique identifier for the chat session -- ``ttl``: Time-to-live for messages (optional) \ No newline at end of file diff --git a/docs/graphs.rst b/docs/graphs.rst deleted file mode 100644 index 9946367..0000000 --- a/docs/graphs.rst +++ /dev/null @@ -1,52 +0,0 @@ -Graphs -====== - -LangChain ArangoDB provides graph implementations that allow you to work with graph data in ArangoDB. - -ArangoDBGraph ------------- - -The main graph implementation that uses ArangoDB for storing and querying graph data. - -.. 
code-block:: python - - from langchain_arangodb.graphs import ArangoDBGraph - - # Initialize the graph - graph = ArangoDBGraph( - arango_url="http://localhost:8529", - username="root", - password="", - database="langchain", - graph_name="knowledge_graph" - ) - - # Add nodes and edges - graph.add_node("person", {"name": "John", "age": 30}) - graph.add_node("person", {"name": "Alice", "age": 25}) - graph.add_edge("knows", "person/John", "person/Alice") - - # Query the graph - results = graph.query("FOR v IN person RETURN v") - -Features --------- - -- Graph data modeling -- Node and edge management -- AQL query support -- Graph traversal capabilities -- Integration with LangChain's graph interfaces - -Configuration Options --------------------- - -The graph implementation can be configured with various options: - -- ``arango_url``: URL of the ArangoDB instance -- ``username``: ArangoDB username -- ``password``: ArangoDB password -- ``database``: Database name -- ``graph_name``: Name of the graph -- ``edge_definitions``: Edge collection definitions -- ``orphan_collections``: Collections that can contain orphan vertices \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 5a3b537..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,23 +0,0 @@ -Welcome to LangChain ArangoDB's documentation! -========================================= - -LangChain ArangoDB is a Python package that provides ArangoDB integrations for LangChain, enabling vector storage, graph operations, and chat message history management. - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - installation - vectorstores - chat_message_histories - graphs - chains - query_constructors - api_reference - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` \ No newline at end of file diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index 0add556..0000000 --- a/docs/installation.rst +++ /dev/null @@ -1,32 +0,0 @@ -Installation -============ - -You can install LangChain ArangoDB using pip: - -.. code-block:: bash - - pip install langchain-arangodb - -Or using poetry: - -.. code-block:: bash - - poetry add langchain-arangodb - -Requirements ------------ - -- Python 3.8+ -- ArangoDB 3.9+ -- LangChain - -Configuration ------------- - -To use LangChain ArangoDB, you'll need to have an ArangoDB instance running. You can either: - -1. Use a local ArangoDB instance -2. Use ArangoDB Oasis (cloud service) -3. Use a self-hosted ArangoDB instance - -The connection details will be required when initializing the various components. \ No newline at end of file diff --git a/docs/query_constructors.rst b/docs/query_constructors.rst deleted file mode 100644 index fb782fe..0000000 --- a/docs/query_constructors.rst +++ /dev/null @@ -1,53 +0,0 @@ -Query Constructors -================ - -LangChain ArangoDB provides query constructor implementations that help build AQL queries for ArangoDB. - -ArangoDBQueryConstructor ------------------------ - -The main query constructor implementation that helps build AQL queries. - -.. 
code-block:: python - - from langchain_arangodb.query_constructors import ArangoDBQueryConstructor - - # Initialize the query constructor - constructor = ArangoDBQueryConstructor( - collection_name="documents", - filter_fields=["category", "tags"], - sort_fields=["created_at", "updated_at"] - ) - - # Build a query - query = constructor.construct_query( - filter_criteria={ - "category": "news", - "tags": ["important", "urgent"] - }, - sort_by="created_at", - sort_order="DESC", - limit=10 - ) - -Features --------- - -- AQL query construction -- Support for filtering -- Support for sorting -- Support for pagination -- Support for aggregation -- Integration with LangChain's query interfaces - -Configuration Options --------------------- - -The query constructor can be configured with various options: - -- ``collection_name``: Name of the collection to query -- ``filter_fields``: Fields that can be used for filtering -- ``sort_fields``: Fields that can be used for sorting -- ``default_limit``: Default number of results to return -- ``default_sort_field``: Default field to sort by -- ``default_sort_order``: Default sort order (ASC/DESC) \ No newline at end of file diff --git a/docs/vectorstores.rst b/docs/vectorstores.rst deleted file mode 100644 index 3d7989f..0000000 --- a/docs/vectorstores.rst +++ /dev/null @@ -1,55 +0,0 @@ -Vector Stores -============ - -LangChain ArangoDB provides vector store implementations that allow you to store and retrieve embeddings using ArangoDB. - -ArangoDBVectorStore ------------------- - -The main vector store implementation that uses ArangoDB for storing and retrieving vector embeddings. - -.. code-block:: python - - from langchain_arangodb.vectorstores import ArangoDBVectorStore - from langchain.embeddings import OpenAIEmbeddings - - # Initialize the vector store - vectorstore = ArangoDBVectorStore( - embedding=OpenAIEmbeddings(), - arango_url="http://localhost:8529", - username="root", - password="", - database="langchain", - collection_name="vectors" - ) - - # Add texts to the vector store - texts = ["Hello world", "How are you"] - vectorstore.add_texts(texts) - - # Search for similar texts - results = vectorstore.similarity_search("Hello", k=2) - -Features --------- - -- Efficient vector similarity search -- Support for metadata filtering -- Batch operations for adding texts -- Configurable collection settings -- Integration with LangChain's embedding interfaces - -Configuration Options --------------------- - -The vector store can be configured with various options: - -- ``embedding``: The embedding model to use -- ``arango_url``: URL of the ArangoDB instance -- ``username``: ArangoDB username -- ``password``: ArangoDB password -- ``database``: Database name -- ``collection_name``: Collection name for storing vectors -- ``index_name``: Name of the vector index (default: "vector_index") -- ``index_type``: Type of vector index to use -- ``index_fields``: Fields to include in the index \ No newline at end of file diff --git a/libs/arangodb/doc/api_reference.rst b/libs/arangodb/doc/api_reference.rst index a02b288..b0d4218 100644 --- a/libs/arangodb/doc/api_reference.rst +++ b/libs/arangodb/doc/api_reference.rst @@ -1,45 +1,53 @@ API Reference -============ +============= This section provides detailed API documentation for all modules in LangChain ArangoDB. Vector Stores ------------- +------------- .. 
automodule:: langchain_arangodb.vectorstores.arangodb_vector - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: Chat Message Histories --------------------- +---------------------- .. automodule:: langchain_arangodb.chat_message_histories.arangodb - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -Graphs ------- +Graph Stores +------------ .. automodule:: langchain_arangodb.graphs.arangodb_graph - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: Chains ------ .. automodule:: langchain_arangodb.chains.graph_qa.arangodb - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: Query Constructors ----------------- +------------------ + +.. automodule:: langchain_arangodb.chains.graph_qa.prompts + :members: + :undoc-members: + :show-inheritance: + +Utilities +--------- -.. automodule:: langchain_arangodb.query_constructors.arangodb - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: langchain_arangodb.vectorstores.utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/libs/arangodb/doc/arangoqachain.rst b/libs/arangodb/doc/arangoqachain.rst index ac0da13..8221546 100644 --- a/libs/arangodb/doc/arangoqachain.rst +++ b/libs/arangodb/doc/arangoqachain.rst @@ -42,7 +42,7 @@ The simplest way to use the chain is with a direct query: .. code-block:: python - response = chain.invoke({"query": "Who starred in Pulp Fiction?"}) + response = chain.invoke("Who starred in Pulp Fiction?") print(response["result"]) 2. Using Custom Input/Output Keys @@ -60,7 +60,7 @@ You can customize the input and output keys: output_key="answer" ) - response = chain.invoke({"question": "Who directed Inception?"}) + response = chain.invoke("Who directed Inception?") print(response["answer"]) 3. Limiting Results @@ -93,7 +93,7 @@ Get query explanation without execution: execute_aql_query=False # Only explain, don't execute ) - explanation = chain.invoke({"query": "Find all movies released after 2020"}) + explanation = chain.invoke("Find all movies released after 2020") print(explanation["aql_result"]) # Contains query plan 5. Read-Only Mode @@ -149,7 +149,7 @@ Get more detailed output including AQL query and results: return_aql_result=True ) - response = chain.invoke({"query": "Who acted in The Matrix?"}) + response = chain.invoke("Who acted in The Matrix?") print("Query:", response["aql_query"]) print("Raw Results:", response["aql_result"]) print("Final Answer:", response["result"]) @@ -215,7 +215,7 @@ Here's a complete workflow showing how to use multiple features together: for query in queries: print(f"\nProcessing query: {query}") - response = chain.invoke({"query": query}) + response = chain.invoke(query) print("AQL Query:", response["aql_query"]) print("Raw Results:", response["aql_result"]) @@ -239,7 +239,7 @@ The chain includes built-in error handling: .. 
code-block:: python try: - response = chain.invoke({"query": "Find all movies"}) + response = chain.invoke("Find all movies") except ValueError as e: if "Maximum amount of AQL Query Generation attempts" in str(e): print("Failed to generate valid AQL after multiple attempts") diff --git a/libs/arangodb/doc/chat_message_histories.rst b/libs/arangodb/doc/chat_message_histories.rst index b391966..8042937 100644 --- a/libs/arangodb/doc/chat_message_histories.rst +++ b/libs/arangodb/doc/chat_message_histories.rst @@ -113,6 +113,7 @@ Conversation Buffer Memory from langchain.memory import ConversationBufferMemory from langchain_openai import ChatOpenAI + from langchain_arangodb.chat_message_histories import ArangoChatMessageHistory # Create chat history chat_history = ArangoChatMessageHistory( @@ -494,14 +495,6 @@ Example: Complete Chat Application # Clear a session when done app.clear_session("user_bob") -API Reference -------------- - -.. automodule:: langchain_arangodb.chat_message_histories.arangodb - :members: - :undoc-members: - :show-inheritance: - Troubleshooting --------------- diff --git a/libs/arangodb/doc/graph.rst b/libs/arangodb/doc/graph.rst index e7d49db..cffd278 100644 --- a/libs/arangodb/doc/graph.rst +++ b/libs/arangodb/doc/graph.rst @@ -57,7 +57,7 @@ Creates a connection to ArangoDB. db = get_arangodb_client() from_db_credentials -~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~ Alternative constructor that creates an ArangoGraph instance directly from credentials. @@ -111,6 +111,7 @@ Adds graph documents to the database. update_graph_definition_if_exists=True, capitalization_strategy="lower" ) + Example: Using LLMGraphTransformer .. code-block:: python @@ -256,12 +257,12 @@ Gets the schema in a structured format. structured_schema = graph.get_structured_schema Internal Utility Methods ------------------------ +------------------------ These methods are used internally but may be useful for advanced use cases: _sanitize_collection_name -~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~ Sanitizes collection names to be valid in ArangoDB. @@ -517,7 +518,6 @@ Error Handling print(f"Database error: {e}") --------------- diff --git a/libs/arangodb/doc/index.rst b/libs/arangodb/doc/index.rst index 7c458fa..acbb4b8 100644 --- a/libs/arangodb/doc/index.rst +++ b/libs/arangodb/doc/index.rst @@ -7,7 +7,8 @@ LangChain ArangoDB is a Python package that provides ArangoDB integrations for L
-            LangChain
+
+            LangChain ArangoDB
 
@@ -45,9 +46,9 @@ LangChain ArangoDB provides comprehensive integrations for building AI applications
 Requirements
 ------------
 
-- Python 3.10+
+- Python 3.9+
 - LangChain
-- ArangoDB 3.10+
+- ArangoDB
 - python-arango
 
 Installation
@@ -95,9 +96,9 @@ Documentation Contents
    :caption: User Guide:
 
    quickstart
-   vectorstores
-   chat_message_histories
    graph
+   chat_message_histories
+   vectorstores
    arangoqachain
 
 .. toctree::
diff --git a/libs/arangodb/doc/langchain-arangodb.ipynb b/libs/arangodb/doc/langchain-arangodb.ipynb
deleted file mode 100644
index e69de29..0000000
diff --git a/libs/arangodb/doc/quickstart.rst b/libs/arangodb/doc/quickstart.rst
index d695aac..e48c783 100644
--- a/libs/arangodb/doc/quickstart.rst
+++ b/libs/arangodb/doc/quickstart.rst
@@ -133,31 +133,46 @@ Create and work with knowledge graphs using ArangoDB:
 .. code-block:: python
 
     from langchain_arangodb.graphs import ArangoGraph
-
-    # Initialize the graph
-    graph = ArangoGraph(
-        database=db,
-        vertex_collections=["Person", "Company", "Technology"],
-        edge_collections=["WorksAt", "Uses", "DeveloperOf"]
+    from langchain_arangodb.graphs.graph_document import GraphDocument, Node, Relationship
+    from langchain_core.documents import Document
+
+    # Initialize the graph (collections are created automatically)
+    graph = ArangoGraph(db)
+
+    # Create graph documents using proper Node and Relationship objects
+    graph_doc = GraphDocument(
+        nodes=[
+            Node(id="person1", type="Person", properties={"name": "Alice", "role": "Developer"}),
+            Node(id="company1", type="Company", properties={"name": "TechCorp", "industry": "Software"}),
+            Node(id="tech1", type="Technology", properties={"name": "ArangoDB", "category": "Database"})
+        ],
+        relationships=[
+            Relationship(
+                source=Node(id="person1", type="Person"),
+                target=Node(id="company1", type="Company"),
+                type="WorksAt",
+                properties={"since": "2023"}
+            ),
+            Relationship(
+                source=Node(id="company1", type="Company"),
+                target=Node(id="tech1", type="Technology"),
+                type="Uses",
+                properties={"purpose": "Data storage"}
+            )
+        ],
+        source=Document(page_content="Graph data about people and companies")
     )
 
-    # Add nodes and relationships
-    graph.add_graph_documents([
-        {
-            "nodes": [
-                {"id": "person1", "type": "Person", "properties": {"name": "Alice", "role": "Developer"}},
-                {"id": "company1", "type": "Company", "properties": {"name": "TechCorp", "industry": "Software"}},
-                {"id": "tech1", "type": "Technology", "properties": {"name": "ArangoDB", "category": "Database"}}
-            ],
-            "relationships": [
-                {"source": "person1", "target": "company1", "type": "WorksAt", "properties": {"since": "2023"}},
-                {"source": "company1", "target": "tech1", "type": "Uses", "properties": {"purpose": "Data storage"}}
-            ]
-        }
-    ])
-
-    # Query the graph
-    query_result = graph.query("FOR v, e, p IN 1..2 OUTBOUND 'Person/person1' GRAPH 'knowledge_graph' RETURN {vertex: v, edge: e}")
+    # Add the graph document to the database, using one collection per
+    # node/edge type so that the Person and WorksAt collections exist
+    graph.add_graph_documents([graph_doc], use_one_entity_collection=False)
+
+    # Query the graph using AQL
+    query_result = graph.query("""
+        FOR person IN Person
+            FILTER person.name == 'Alice'
+            FOR company IN 1..1 OUTBOUND person._id WorksAt
+                RETURN {person: person.name, company: company.name}
+    """)
     print(query_result)
 
 **Schema Management**
 
 .. 
code-block:: python
 
     # Get current schema
-    schema = graph.get_schema
+    schema = graph.schema
     print("Graph Schema:", schema)
 
     # Refresh schema after changes
     graph.refresh_schema()
 
+    # Get schema as JSON or YAML
+    schema_json = graph.schema_json
+    schema_yaml = graph.schema_yaml
+
 5. Instantiate an ArangoDB Graph QA Chain
 ------------------------------------------
 
@@ -188,25 +207,25 @@ Create a question-answering system that leverages your graph data:
     qa_chain = ArangoGraphQAChain.from_llm(
         llm=llm,
         graph=graph,
-        verbose=True
+        allow_dangerous_requests=True
     )
 
     # Ask questions about your graph
-    response = qa_chain.run("Who works at TechCorp and what technologies do they use?")
-    print(response)
+    response = qa_chain.invoke("Who works at TechCorp and what technologies do they use?")
+    print(response["result"])
 
     # Ask about relationships
-    response = qa_chain.run("What is the relationship between Alice and ArangoDB?")
-    print(response)
+    response = qa_chain.invoke("What is the relationship between Alice and ArangoDB?")
+    print(response["result"])
 
 **Advanced: Custom Prompts**
 
 .. code-block:: python
 
-    from langchain_arangodb.chains.graph_qa import CYPHER_GENERATION_PROMPT
+    from langchain_arangodb.chains.graph_qa.prompts import AQL_GENERATION_PROMPT
 
     # Customize the prompt for better AQL generation
-    custom_prompt = CYPHER_GENERATION_PROMPT.partial(
+    custom_prompt = AQL_GENERATION_PROMPT.partial(
         schema=graph.get_schema,
         examples="Example: To find all people working at companies that use ArangoDB:\n"
                 "FOR person IN Person\n"
@@ -219,8 +238,9 @@
     qa_chain_custom = ArangoGraphQAChain.from_llm(
         llm=llm,
         graph=graph,
-        cypher_prompt=custom_prompt,
-        verbose=True
+        aql_generation_prompt=custom_prompt,
+        verbose=True,
+        allow_dangerous_requests=True
     )
 
 **Chat Message History Integration**
 
@@ -251,12 +271,15 @@
         llm=llm,
         graph=graph,
         memory=memory,
-        verbose=True
+        verbose=True,
+        allow_dangerous_requests=True
     )
 
     # Now your conversations are persisted
-    response1 = qa_chain_with_memory.run("Tell me about the people in our database")
-    response2 = qa_chain_with_memory.run("What companies do they work for?")
+    response1 = qa_chain_with_memory.invoke("Tell me about the people in our database")
+    response2 = qa_chain_with_memory.invoke("What companies do they work for?")
+    print(response1["result"])
+    print(response2["result"])
 
 Complete Example: RAG with Graph and Vector Search
 --------------------------------------------------
@@ -296,14 +319,14 @@ Combine all components for a powerful RAG application:
     )
 
     # Graph for structured knowledge
-    graph = ArangoGraph(
-        database=db,
-        vertex_collections=["Entity", "Concept"],
-        edge_collections=["RelatedTo", "PartOf"]
-    )
+    graph = ArangoGraph(db)
 
     # QA chain with graph reasoning
-    qa_chain = ArangoGraphQAChain.from_llm(llm=llm, graph=graph)
+    qa_chain = ArangoGraphQAChain.from_llm(
+        llm=llm,
+        graph=graph,
+        allow_dangerous_requests=True
+    )
 
     # Chat history for context
     chat_history = ArangoChatMessageHistory(
diff --git a/libs/arangodb/doc/requirements.txt b/libs/arangodb/doc/requirements.txt
index d4f677a..d82cef4 100644
--- a/libs/arangodb/doc/requirements.txt
+++ b/libs/arangodb/doc/requirements.txt
@@ -1,3 +1,6 @@
 sphinx
 sphinx_rtd_theme
+langchain
+langchain-core
+python-arango
 langchain-arangodb
\ No newline at end of file
diff --git a/libs/arangodb/doc/vectorstores.rst b/libs/arangodb/doc/vectorstores.rst
index 
bf7b571..6ff5f90 100644 --- a/libs/arangodb/doc/vectorstores.rst +++ b/libs/arangodb/doc/vectorstores.rst @@ -11,9 +11,9 @@ The ``ArangoVector`` class is the main vector store implementation that integrat - Efficient vector similarity search with cosine and Euclidean distance metrics - Approximate and exact nearest neighbor search - Maximal marginal relevance (MMR) search for diverse results +- Hybrid search combining vector and keyword-based retrieval (RRF) - Batch operations for adding and managing documents - Configurable vector indexing with customizable parameters -- Integration with ArangoDB's distributed architecture Quick Start ----------- @@ -50,6 +50,37 @@ Quick Start for doc in results: print(doc.page_content) +Creating from Existing Collections +---------------------------------- + +You can create a vector store from an existing ArangoDB collection by embedding specific text properties: + +.. code-block:: python + + from langchain_arangodb.vectorstores import ArangoVector + + # Create vector store from existing collection + vectorstore = ArangoVector.from_existing_collection( + collection_name="existing_docs", + text_properties_to_embed=["title", "description", "content"], + embedding=embeddings, + database=db, + embedding_field="text_embedding", + text_field="combined_text", + batch_size=1000, + insert_text=True, # Store concatenated text for hybrid search + skip_existing_embeddings=False, # Re-embed all documents + search_type=SearchType.HYBRID + ) + +**Key Parameters:** + +- ``text_properties_to_embed``: List of document properties to concatenate and embed +- ``batch_size``: Number of documents to process at once (default: 1000) +- ``insert_text``: Whether to store concatenated text (required for hybrid search) +- ``skip_existing_embeddings``: Skip documents that already have embeddings +- ``aql_return_text_query``: Custom AQL query for text extraction (optional) + Configuration ------------- @@ -62,13 +93,17 @@ Constructor Parameters :param embedding_dimension: The dimension of the embedding vectors (must match your embedding model) :param database: ArangoDB database instance from python-arango :param collection_name: Name of the collection to store documents (default: "documents") - :param search_type: Type of search - currently only "vector" is supported + :param search_type: Type of search - supports "vector" and "hybrid" search modes :param embedding_field: Field name for storing embedding vectors (default: "embedding") :param text_field: Field name for storing text content (default: "text") - :param index_name: Name of the vector index (default: "vector_index") + :param vector_index_name: Name of the vector index (default: "vector_index") :param distance_strategy: Distance metric for similarity calculation (default: "COSINE") :param num_centroids: Number of centroids for vector index clustering (default: 1) :param relevance_score_fn: Custom function to normalize relevance scores (optional) + :param keyword_index_name: Name of the keyword search index (default: "keyword_index") + :param keyword_analyzer: Text analyzer for keyword search (default: "text_en") + :param rrf_constant: Constant for Reciprocal Rank Fusion in hybrid search (default: 60) + :param rrf_search_limit: Maximum results for RRF scoring (default: 100) Distance Strategies ~~~~~~~~~~~~~~~~~~ @@ -95,6 +130,37 @@ The vector store supports multiple distance metrics: distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE ) +Keyword Analyzers +~~~~~~~~~~~~~~~~~ + +For hybrid search, ArangoDB supports multiple 
text analyzers for different languages: + +**Supported Analyzers:** + +- ``text_en``: English (default) +- ``text_de``: German +- ``text_es``: Spanish +- ``text_fi``: Finnish +- ``text_fr``: French +- ``text_it``: Italian +- ``text_nl``: Dutch +- ``text_no``: Norwegian +- ``text_pt``: Portuguese +- ``text_ru``: Russian +- ``text_sv``: Swedish +- ``text_zh``: Chinese + +.. code-block:: python + + # Using German analyzer for hybrid search + vectorstore = ArangoVector( + embedding=embeddings, + embedding_dimension=1536, + database=db, + search_type=SearchType.HYBRID, + keyword_analyzer="text_de" + ) + Search Methods -------------- @@ -364,6 +430,7 @@ Custom Collection Setup embedding_field="custom_embedding", text_field="custom_text", ids=["custom_id_1"], # Custom document IDs + insert_text=True, # Store text content (required for hybrid search) ) Custom Relevance Scoring @@ -481,19 +548,6 @@ Example: Complete Workflow for doc in diverse_results: print(f"- {doc.page_content}") -API Reference -------------- - -.. automodule:: langchain_arangodb.vectorstores.arangodb_vector - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: langchain_arangodb.vectorstores.utils - :members: - :undoc-members: - :show-inheritance: - Future Enhancements ------------------- @@ -533,4 +587,47 @@ Support for multi-modal embeddings and cross-modal search capabilities: # query="text query", # image_query=image_embedding, # modality_weights={"text": 0.7, "image": 0.3} - # ) \ No newline at end of file + # ) + +Vector and Keyword Search +~~~~~~~~~~~~~~~~~~~~~~~~~ + +For direct control over hybrid search with both vector and keyword components: + +.. code-block:: python + + # Hybrid search with pre-computed embedding + query_embedding = embeddings.embed_query("machine learning") + + # Search combining vector similarity and keyword matching + results = vectorstore.similarity_search_by_vector_and_keyword( + query="artificial intelligence", + embedding=query_embedding, + k=5, + vector_weight=1.0, + keyword_weight=1.0, + keyword_search_clause="SEARCH doc.text IN TOKENS(@query, 'text_en')" + ) + + # With scores + docs_and_scores = vectorstore.similarity_search_by_vector_and_keyword_with_score( + query="deep learning", + embedding=query_embedding, + k=3, + vector_weight=2.0, # Favor vector similarity + keyword_weight=1.0 + ) + +**Important Notes:** + +- **insert_text requirement**: When using hybrid search (``SearchType.HYBRID``), the ``insert_text`` parameter must be set to ``True`` in ``add_texts``, ``add_embeddings``, and ``from_texts`` methods. This ensures text content is stored for keyword search. +- **Hybrid search prerequisites**: Both vector and keyword indexes must be created before performing hybrid search +- **RRF scoring**: Hybrid search uses Reciprocal Rank Fusion to combine vector and keyword search results + +.. 
code-block:: python + + # Correct usage for hybrid search + vectorstore.add_texts( + texts=["New document"], + insert_text=True # Required for hybrid search + ) \ No newline at end of file diff --git a/libs/arangodb/langchain_arangodb/chat_message_histories/arangodb.py b/libs/arangodb/langchain_arangodb/chat_message_histories/arangodb.py index 0c61c9e..49a7057 100644 --- a/libs/arangodb/langchain_arangodb/chat_message_histories/arangodb.py +++ b/libs/arangodb/langchain_arangodb/chat_message_histories/arangodb.py @@ -6,7 +6,54 @@ class ArangoChatMessageHistory(BaseChatMessageHistory): - """Chat message history stored in an ArangoDB database.""" + """Chat message history stored in an ArangoDB database. + + This class provides persistent storage for chat message histories using ArangoDB + as the backend. It supports session-based message storage with automatic + collection creation and indexing. + + :param session_id: Unique identifier for the chat session. + :type session_id: Union[str, int] + :param db: ArangoDB database instance for storing chat messages. + :type db: arango.database.StandardDatabase + :param collection_name: Name of the ArangoDB collection to store messages. + Defaults to "ChatHistory". + :type collection_name: str + :param window: Maximum number of messages to keep in memory (currently unused). + Defaults to 3. + :type window: int + :param args: Additional positional arguments passed to BaseChatMessageHistory. + :type args: Any + :param kwargs: Additional keyword arguments passed to BaseChatMessageHistory. + :type kwargs: Any + + .. code-block:: python + + from arango import ArangoClient + from langchain_arangodb.chat_message_histories import ArangoChatMessageHistory + + # Connect to ArangoDB + client = ArangoClient("http://localhost:8529") + db = client.db("test", username="root", password="openSesame") + + # Create chat message history + history = ArangoChatMessageHistory( + session_id="user_123", + db=db, + collection_name="chat_sessions" + ) + + # Add messages + history.add_user_message("Hello! How are you?") + history.add_ai_message("I'm doing well, thank you!") + + # Retrieve messages + messages = history.messages + print(f"Found {len(messages)} messages") + + # Clear session + history.clear() + """ def __init__( self, @@ -44,7 +91,28 @@ def __init__( @property def messages(self) -> List[BaseMessage]: - """Retrieve the messages from ArangoDB""" + """Retrieve the messages from ArangoDB. + + Retrieves all messages for the current session from the ArangoDB collection, + sorted by timestamp in descending order (most recent first). + + :return: List of chat messages for the current session, + sorted in reverse chronological order (most recent first). + :rtype: List[BaseMessage] + + .. code-block:: python + + # Get all messages for the session + messages = history.messages + for msg in messages: + print(f"{msg.type}: {msg.content}") + + # Check if session has any messages + if history.messages: + print(f"Session has {len(history.messages)} messages") + else: + print("No messages in this session") + """ query = """ FOR doc IN @@col FILTER doc.session_id == @session_id @@ -71,7 +139,31 @@ def messages(self, messages: List[BaseMessage]) -> None: ) def add_message(self, message: BaseMessage) -> None: - """Append the message to the record in ArangoDB""" + """Append the message to the record in ArangoDB. + + Stores a single chat message in the ArangoDB collection associated with + the current session. The message is stored with its type, content, and + session identifier. 
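+
+        As a rough sketch, the document persisted for each message looks like
+        the following (field names inferred from the surrounding queries and
+        examples; not authoritative):
+
+        .. code-block:: python
+
+            {
+                "type": "human",           # message.type
+                "content": "Hello!",       # message.content
+                "session_id": "user_123",  # session the message belongs to
+            }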
+ + :param message: The chat message to add to the history. + :type message: BaseMessage + + .. code-block:: python + + from langchain_core.messages import HumanMessage, AIMessage + + # Add user message + user_msg = HumanMessage(content="What is the weather like?") + history.add_message(user_msg) + + # Add AI response + ai_msg = AIMessage(content="I don't have access to current weather data.") + history.add_message(ai_msg) + + # Or use convenience methods + history.add_user_message("Hello!") + history.add_ai_message("Hi there!") + """ self._db.collection(self._collection_name).insert( { @@ -82,7 +174,26 @@ def add_message(self, message: BaseMessage) -> None: ) def clear(self) -> None: - """Clear session memory from ArangoDB""" + """Clear session memory from ArangoDB. + + Removes all messages associated with the current session from the ArangoDB + collection. The collection itself is preserved for future use. + + .. code-block:: python + + # Add some messages + history.add_user_message("Hello") + history.add_ai_message("Hi!") + print(f"Messages before clear: {len(history.messages)}") # Output: 2 + + # Clear all messages for this session + history.clear() + print(f"Messages after clear: {len(history.messages)}") # Output: 0 + + # Collection still exists for future messages + history.add_user_message("Starting fresh conversation") + print(f"New message count: {len(history.messages)}") # Output: 1 + """ query = """ FOR doc IN @@col FILTER doc.session_id == @session_id diff --git a/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py b/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py index 17be379..5d3a226 100644 --- a/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py +++ b/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py @@ -334,7 +334,6 @@ def query( """ Execute an AQL query and return the results. - Parameters: :param query: The AQL query to execute. :type query: str :param params: Additional arguments piped to the function. diff --git a/libs/arangodb/langchain_arangodb/vectorstores/arangodb_vector.py b/libs/arangodb/langchain_arangodb/vectorstores/arangodb_vector.py index 3998410..24909fa 100644 --- a/libs/arangodb/langchain_arangodb/vectorstores/arangodb_vector.py +++ b/libs/arangodb/langchain_arangodb/vectorstores/arangodb_vector.py @@ -61,74 +61,57 @@ class ArangoVector(VectorStore): It supports both vector similarity search and hybrid search (vector + keyword) capabilities. - Args: - embedding: The embedding function to use for converting text to vectors. - Must implement the `langchain.embeddings.base.Embeddings` interface. - embedding_dimension: The dimensionality of the embedding vectors. - Must match the output dimension of the embedding function. - database: The ArangoDB database instance to use for storage and retrieval. - collection_name: The name of the ArangoDB collection to store - documents. Defaults to "documents". - search_type: The type of search to perform. Can be either "vector" for pure - vector similarity search or "hybrid" for combining vector and - keyword search. Defaults to "vector". - embedding_field: The field name in the document to store the embedding vector. - Defaults to "embedding". - text_field: The field name in the document to store the text content. - Defaults to "text". - vector_index_name: The name of the vector index to create in ArangoDB. - This index enables efficient vector similarity search. - Defaults to "vector_index". - distance_strategy: The distance metric to use for vector similarity. 
- Can be either "COSINE" or "EUCLIDEAN_DISTANCE". - Defaults to "COSINE". - num_centroids: The number of centroids to use for the vector index. - Higher values can improve search accuracy but increase memory usage. - Defaults to 1. - relevance_score_fn: Optional function to normalize the relevance score. - If not provided, uses the default normalization for the distance strategy. - keyword_index_name: The name of the ArangoDB View created to enable - Full-Text-Search capabilities. Only used if search_type is set - to "hybrid". Defaults to "keyword_index". - keyword_analyzer: The text analyzer to use for keyword search. - Must be one of the supported analyzers in ArangoDB. - Defaults to "text_en". - rrf_constant: The constant used in Reciprocal Rank Fusion (RRF) for hybrid - search. Higher values give more weight to lower-ranked results. - Defaults to 60. - rrf_search_limit: The maximum number of results to consider in RRF scoring. - Defaults to 100. - - Example: - .. code-block:: python - - from arango import ArangoClient - from langchain_community.embeddings.openai import OpenAIEmbeddings - from langchain_community.vectorstores.arangodb_vector import ArangoVector - - # Initialize ArangoDB connection - db = ArangoClient("http://localhost:8529").db( - "test", - username="root", - password="openSesame" - ) - - # Create embedding function - embedding = OpenAIEmbeddings( - model="text-embedding-3-small", - dimensions=dimension - ) - - # Create vector store - vector_store = ArangoVector.from_texts( - texts=["hello world", "hello langchain", "hello arangodb"], - embedding=embedding, - database=db, - collection_name="Documents" - ) - - # Perform similarity search - print(vector_store.similarity_search("arangodb", k=1)) + :param embedding: The embedding function to use for converting text to vectors. + Must implement the `langchain.embeddings.base.Embeddings` interface. + :type embedding: langchain.embeddings.base.Embeddings + :param embedding_dimension: The dimensionality of the embedding vectors. + Must match the output dimension of the embedding function. + :type embedding_dimension: int + :param database: The ArangoDB database instance to use for storage and retrieval. + :type database: arango.database.StandardDatabase + :param collection_name: The name of the ArangoDB collection to store + documents. Defaults to "documents". + :type collection_name: str + :param search_type: The type of search to perform. Can be either SearchType.VECTOR + for pure vector similarity search or SearchType.HYBRID for combining vector and + keyword search. Defaults to SearchType.VECTOR. + :type search_type: SearchType + :param embedding_field: The field name in the document to store the embedding vector + Defaults to "embedding". + :type embedding_field: str + :param text_field: The field name in the document to store the text content. + Defaults to "text". + :type text_field: str + :param vector_index_name: The name of the vector index to create in ArangoDB. + This index enables efficient vector similarity search. + Defaults to "vector_index". + :type vector_index_name: str + :param distance_strategy: The distance metric to use for vector similarity. + Can be either DistanceStrategy.COSINE or DistanceStrategy.EUCLIDEAN_DISTANCE. + Defaults to DistanceStrategy.COSINE. + :type distance_strategy: DistanceStrategy + :param num_centroids: The number of centroids to use for the vector index. + Higher values can improve search accuracy but increase memory usage. + Defaults to 1. 
+ :type num_centroids: int + :param relevance_score_fn: Optional function to normalize the relevance score. + If not provided, uses the default normalization for the distance strategy. + :type relevance_score_fn: Optional[Callable[[float], float]] + :param keyword_index_name: The name of the ArangoDB View created to enable + Full-Text-Search capabilities. Only used if search_type is set + to SearchType.HYBRID. Defaults to "keyword_index". + :type keyword_index_name: str + :param keyword_analyzer: The text analyzer to use for keyword search. + Must be one of the supported analyzers in ArangoDB. + Defaults to "text_en". + :type keyword_analyzer: str + :param rrf_constant: The constant used in Reciprocal Rank Fusion (RRF) for hybrid + search. Higher values give more weight to lower-ranked results. + Defaults to 60. + :type rrf_constant: int + :param rrf_search_limit: The maximum number of results to consider in RRF scoring. + Defaults to 100. + :type rrf_search_limit: int """ def __init__( @@ -317,17 +300,39 @@ def add_texts( This method embeds the provided texts using the embedding function and stores them in ArangoDB along with their embeddings and metadata. - Args: - texts: An iterable of text strings to add to the vector store. - metadatas: Optional list of metadata dictionaries to associate with each - text. Each dictionary can contain arbitrary key-value pairs that - will be stored alongside the text and embedding. - ids: Optional list of unique identifiers for each text. If not provided, - IDs will be generated using a hash of the text content. - **kwargs: Additional keyword arguments passed to add_embeddings. - - Returns: - List of document IDs that were added to the vector store. + :param texts: An iterable of text strings to add to the vector store. + :type texts: Iterable[str] + :param metadatas: Optional list of metadata dictionaries to associate with each + text. Each dictionary can contain arbitrary key-value pairs that + will be stored alongside the text and embedding. + :type metadatas: Optional[List[dict]] + :param ids: Optional list of unique identifiers for each text. If not provided, + IDs will be generated using a hash of the text content. + :type ids: Optional[List[str]] + :param kwargs: Additional keyword arguments passed to add_embeddings. + :type kwargs: Any + :return: List of document IDs that were added to the vector store. + :rtype: List[str] + + .. code-block:: python + + # Add simple texts + texts = ["hello world", "hello arango", "test document"] + ids = vector_store.add_texts(texts) + print(f"Added {len(ids)} documents") + + # Add texts with metadata + texts = ["Machine learning tutorial", "Python programming guide"] + metadatas = [ + {"category": "AI", "difficulty": "beginner"}, + {"category": "Programming", "difficulty": "intermediate"} + ] + ids = vector_store.add_texts(texts, metadatas=metadatas) + + # Add texts with custom IDs + texts = ["Document 1", "Document 2"] + custom_ids = ["doc_001", "doc_002"] + ids = vector_store.add_texts(texts, ids=custom_ids) """ embeddings = self.embedding.embed_documents(list(texts)) @@ -356,33 +361,68 @@ def similarity_search( or a hybrid approach combining vector and keyword search. The search type can be overridden for individual queries. - Args: - query: The text query to search for. - k: The number of most similar documents to return. Defaults to 4. - return_fields: Set of additional document fields to return in results. - The _key and text fields are always returned. 
- use_approx: Whether to use approximate nearest neighbor search. - Enables faster but potentially less accurate results. - Defaults to True. - embedding: Optional pre-computed embedding for the query. - If not provided, the query will be embedded using the embedding - function. - filter_clause: Optional AQL filter clause to apply to the search. - Can be used to filter results based on document properties. - search_type: Override the default search type for this query. - Can be either "vector" or "hybrid". - vector_weight: Weight to apply to vector similarity scores in hybrid search. - Only used when search_type is "hybrid". Defaults to 1.0. - keyword_weight: Weight to apply to keyword search scores in hybrid search. - Only used when search_type is "hybrid". Defaults to 1.0. - keyword_search_clause: Optional AQL filter clause to apply Full Text Search. - If empty, a default search clause will be used. - metadata_clause: Optional AQL clause to return additional metadata once - the top k results are retrieved. If specified, the metadata will be - added to the Document.metadata field. - - Returns: - List of Document objects most similar to the query. + :param query: The text query to search for. + :type query: str + :param k: The number of most similar documents to return. Defaults to 4. + :type k: int + :param return_fields: Set of additional document fields to return in results. + The _key and text fields are always returned. + :type return_fields: set[str] + :param use_approx: Whether to use approximate nearest neighbor search. + Enables faster but potentially less accurate results. + Defaults to True. + :type use_approx: bool + :param embedding: Optional pre-computed embedding for the query. + If not provided, the query will be embedded using the embedding + function. + :type embedding: Optional[List[float]] + :param filter_clause: Optional AQL filter clause to apply to the search. + Can be used to filter results based on document properties. + :type filter_clause: str + :param search_type: Override the default search type for this query. + Can be either SearchType.VECTOR or SearchType.HYBRID. + :type search_type: Optional[SearchType] + :param vector_weight: Weight to apply to vector similarity + scores in hybrid search. Only used when search_type is SearchType.HYBRID. + Defaults to 1.0. + :type vector_weight: float + :param keyword_weight: Weight to apply to keyword search scores in hybrid + search. Only used when search_type is SearchType.HYBRID. Defaults to 1.0. + :type keyword_weight: float + :param keyword_search_clause: Optional AQL filter clause to apply + Full Text Search. If empty, a default search clause will be used. + :type keyword_search_clause: str + :param metadata_clause: Optional AQL clause to return additional metadata once + the top k results are retrieved. If specified, the metadata will be + added to the Document.metadata field. + :type metadata_clause: str + :param kwargs: Additional keyword arguments. + :type kwargs: Any + :return: List of Document objects most similar to the query. + :rtype: List[Document] + + .. 
code-block:: python + + # Simple vector search + results = vector_store.similarity_search("hello", k=1) + print(results[0].page_content) + + # Search with metadata filtering + results = vector_store.similarity_search( + "machine learning", + k=2, + filter_clause="doc.category == 'AI'", + return_fields={"category", "difficulty"} + ) + + # Hybrid search with custom weights + results = vector_store.similarity_search( + "neural networks", + k=3, + search_type=SearchType.HYBRID, + vector_weight=0.8, + keyword_weight=0.2 + ) """ search_type = search_type or self.search_type embedding = embedding or self.embedding.embed_query(query) @@ -430,30 +470,41 @@ def similarity_search_with_score( Similar to similarity_search but returns a tuple of (Document, score) for each result. The score represents the similarity between the query and the document. - Args: - query: The text query to search for. - k: The number of most similar documents to return. Defaults to 4. - return_fields: Set of additional document fields to return in results. - The _key and text fields are always returned. - use_approx: Whether to use approximate nearest neighbor search. - Enables faster but potentially less accurate results. - Defaults to True. - embedding: Optional pre-computed embedding for the query. - If not provided, the query will be embedded using the embedding - function. - filter_clause: Optional AQL filter clause to apply to the search. - Can be used to filter results based on document properties. - search_type: Override the default search type for this query. - Can be either "vector" or "hybrid". - vector_weight: Weight to apply to vector similarity scores in hybrid search. - Only used when search_type is "hybrid". Defaults to 1.0. - keyword_weight: Weight to apply to keyword search scores in hybrid search. - Only used when search_type is "hybrid". Defaults to 1.0. - keyword_search_clause: Optional AQL filter clause to apply Full Text Search. - If empty, a default search clause will be used. - - Returns: - List of tuples containing (Document, score) pairs, sorted by score. + :param query: The text query to search for. + :type query: str + :param k: The number of most similar documents to return. Defaults to 4. + :type k: int + :param return_fields: Set of additional document fields to return in results. + The _key and text fields are always returned. + :type return_fields: set[str] + :param use_approx: Whether to use approximate nearest neighbor search. + Enables faster but potentially less accurate results. Defaults to True. + :type use_approx: bool + :param embedding: Optional pre-computed embedding for the query. + If not provided, the query will be embedded using the embedding function. + :type embedding: Optional[List[float]] + :param filter_clause: Optional AQL filter clause to apply to the search. + Can be used to filter results based on document properties. + :type filter_clause: str + :param search_type: Override the default search type for this query. + Can be either SearchType.VECTOR or SearchType.HYBRID. + :type search_type: Optional[SearchType] + :param vector_weight: Weight to apply to vector similarity scores in + hybrid search. Only used when search_type is SearchType.HYBRID. + Defaults to 1.0. + :type vector_weight: float + :param keyword_weight: Weight to apply to keyword search scores + in hybrid search. Only used when search_type is SearchType.HYBRID. + Defaults to 1.0. + :type keyword_weight: float + :param keyword_search_clause: Optional AQL filter clause to apply + Full Text Search. 
If empty, a default search clause will be used. + :type keyword_search_clause: str + :param metadata_clause: Optional AQL clause to return additional metadata once + the top k results are retrieved. + :type metadata_clause: str + :return: List of tuples containing (Document, score) pairs, sorted by score. + :rtype: List[tuple[Document, float]] """ search_type = search_type or self.search_type embedding = embedding or self.embedding.embed_query(query) @@ -494,21 +545,27 @@ def similarity_search_by_vector( ) -> List[Document]: """Return docs most similar to embedding vector. - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - return_fields: Fields to return in the result. For example, - {"foo", "bar"} will return the "foo" and "bar" fields of the document, - in addition to the _key & text field. Defaults to an empty set. - use_approx: Whether to use approximate vector search via ANN. - Defaults to True. If False, exact vector search will be used. - filter_clause: Filter clause to apply to the query. - metadata_clause: Optional AQL clause to return additional metadata once - the top k results are retrieved. If specified, the metadata will be - added to the Document.metadata field. - - Returns: - List of Documents most similar to the query vector. + :param embedding: Embedding to look up documents similar to. + :type embedding: List[float] + :param k: Number of Documents to return. Defaults to 4. + :type k: int + :param return_fields: Fields to return in the result. For example, + {"foo", "bar"} will return the "foo" and "bar" fields of the document, + in addition to the _key & text field. Defaults to an empty set. + :type return_fields: set[str] + :param use_approx: Whether to use approximate vector search via ANN. + Defaults to True. If False, exact vector search will be used. + :type use_approx: bool + :param filter_clause: Filter clause to apply to the query. + :type filter_clause: str + :param metadata_clause: Optional AQL clause to return additional metadata once + the top k results are retrieved. If specified, the metadata will be + added to the Document.metadata field. + :type metadata_clause: str + :param kwargs: Additional keyword arguments. + :type kwargs: Any + :return: List of Documents most similar to the query vector. + :rtype: List[Document] """ results = self.similarity_search_by_vector_with_score( embedding=embedding, @@ -558,24 +615,28 @@ def similarity_search_by_vector_with_score( filter_clause: str = "", metadata_clause: str = "", ) -> List[tuple[Document, float]]: - """Return docs most similar to embedding vector. - - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - return_fields: Fields to return in the result. For example, - {"foo", "bar"} will return the "foo" and "bar" fields of the document, - in addition to the _key & text field. Defaults to an empty set. - use_approx: Whether to use approximate vector search via ANN. - Defaults to True. If False, exact vector search will be used. - filter_clause: Filter clause to apply to the query. - metadata_clause: Optional AQL clause to return additional metadata once - the top k results are retrieved. If specified, the metadata will be - added to the Document.metadata field. - **kwargs: Additional keyword arguments passed to the query execution. - - Returns: - List of Documents most similar to the query vector. + """Return docs most similar to embedding vector with scores. 
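+
+        Unlike :meth:`similarity_search_by_vector`, each result is returned
+        together with its raw similarity score. A minimal usage sketch,
+        assuming an existing ``vector_store`` and an ``embeddings`` model
+        (hypothetical names):
+
+        .. code-block:: python
+
+            # Embed the query manually, then search by vector
+            query_embedding = embeddings.embed_query("graph databases")
+            docs_and_scores = vector_store.similarity_search_by_vector_with_score(
+                embedding=query_embedding,
+                k=4,
+                use_approx=True,  # ANN search; set to False for exact search
+            )
+            for doc, score in docs_and_scores:
+                print(score, doc.page_content)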
+ + :param embedding: Embedding to look up documents similar to. + :type embedding: List[float] + :param k: Number of Documents to return. Defaults to 4. + :type k: int + :param return_fields: Fields to return in the result. For example, + {"foo", "bar"} will return the "foo" and "bar" fields of the document, + in addition to the _key & text field. Defaults to an empty set. + :type return_fields: set[str] + :param use_approx: Whether to use approximate vector search via ANN. + Defaults to True. If False, exact vector search will be used. + :type use_approx: bool + :param filter_clause: Filter clause to apply to the query. + :type filter_clause: str + :param metadata_clause: Optional AQL clause to return additional metadata once + the top k results are retrieved. If specified, the metadata will be + added to the Document.metadata field. + :type metadata_clause: str + :return: List of tuples containing (Document, score) + pairs most similar to the query vector. + :rtype: List[tuple[Document, float]] """ aql_query, bind_vars = self._build_vector_search_query( embedding=embedding, @@ -605,29 +666,41 @@ def similarity_search_by_vector_and_keyword_with_score( keyword_search_clause: str = "", metadata_clause: str = "", ) -> List[tuple[Document, float]]: - """Run similarity search with ArangoDB. - - Args: - query (str): Query text to search for. - k (int): Number of results to return. Defaults to 4. - return_fields: Fields to return in the result. For example, - {"foo", "bar"} will return the "foo" and "bar" fields of the document, - in addition to the _key & text field. Defaults to an empty set. - use_approx: Whether to use approximate vector search via ANN. - Defaults to True. If False, exact vector search will be used. - filter_clause: Filter clause to apply to the query. - vector_weight: Weight to apply to vector similarity scores in hybrid search. - Only used when search_type is "hybrid". Defaults to 1.0. - keyword_weight: Weight to apply to keyword search scores in hybrid search. - Only used when search_type is "hybrid". Defaults to 1.0. - keyword_search_clause: Optional AQL filter clause to apply Full Text Search. - If empty, a default search clause will be used. - metadata_clause: Optional AQL clause to return additional metadata once - the top k results are retrieved. If specified, the metadata will be - added to the Document.metadata field. - - Returns: - List of Documents most similar to the query. + """Run hybrid similarity search combining vector and keyword search with scores. + + :param query: Query text to search for. + :type query: str + :param embedding: Embedding vector for the query. + :type embedding: List[float] + :param k: Number of results to return. Defaults to 4. + :type k: int + :param return_fields: Fields to return in the result. For example, + {"foo", "bar"} will return the "foo" and "bar" fields of the document, + in addition to the _key & text field. Defaults to an empty set. + :type return_fields: set[str] + :param use_approx: Whether to use approximate vector search via ANN. + Defaults to True. If False, exact vector search will be used. + :type use_approx: bool + :param filter_clause: Filter clause to apply to the query. + :type filter_clause: str + :param vector_weight: Weight to apply to vector similarity scores + in hybrid search. Only used when search_type is SearchType.HYBRID. + Defaults to 1.0. + :type vector_weight: float + :param keyword_weight: Weight to apply to keyword search scores in + hybrid search. Only used when search_type is SearchType.HYBRID. 
+ Defaults to 1.0. + :type keyword_weight: float + :param keyword_search_clause: Optional AQL filter clause to apply + Full Text Search. If empty, a default search clause will be used. + :type keyword_search_clause: str + :param metadata_clause: Optional AQL clause to return additional metadata once + the top k results are retrieved. If specified, the metadata will be + added to the Document.metadata field. + :type metadata_clause: str + :return: List of tuples containing (Document, score) + pairs most similar to the query. + :rtype: List[tuple[Document, float]] """ aql_query, bind_vars = self._build_hybrid_search_query( @@ -652,13 +725,13 @@ def similarity_search_by_vector_and_keyword_with_score( def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: """Delete by vector ID or other criteria. - Args: - ids: List of ids to delete. - **kwargs: Other keyword arguments that can be used to delete vectors. - - Returns: - Optional[bool]: True if deletion is successful, - None if no ids are provided, or raises an exception if an error occurs. + :param ids: List of ids to delete. + :type ids: Optional[List[str]] + :param kwargs: Other keyword arguments that can be used to delete vectors. + :type kwargs: Any + :return: True if deletion is successful, or None if no ids are provided. + An exception is raised if an error occurs. + :rtype: Optional[bool] """ if not ids: return None @@ -672,11 +745,10 @@ def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[boo def get_by_ids(self, ids: Sequence[str], /) -> list[Document]: """Get documents by their IDs. - Args: - ids: List of ids to get. - - Returns: - List of Documents with the given ids. + :param ids: List of ids to get. + :type ids: Sequence[str] + :return: List of Documents with the given ids. + :rtype: list[Document] """ docs = [] doc: dict[str, Any] @@ -703,31 +775,61 @@ def max_marginal_relevance_search( """Search for documents using Maximal Marginal Relevance (MMR). MMR optimizes for both similarity to the query and diversity among the results. - It helps avoid returning redundant or very similar documents. - - Args: - query: The text query to search for. - k: The number of documents to return. Defaults to 4. - fetch_k: The number of documents to fetch for MMR selection. - Should be larger than k to allow for diversity selection. - Defaults to 20. - lambda_mult: Controls the diversity vs relevance tradeoff. - Values between 0 and 1, where: - - 0: Maximum diversity - - 1: Maximum relevance - Defaults to 0.5. - return_fields: Set of additional document fields to return in results. - The _key and text fields are always returned. - use_approx: Whether to use approximate nearest neighbor search. - Enables faster but potentially less accurate results. - Defaults to True. - embedding: Optional pre-computed embedding for the query. - If not provided, the query will be embedded using the embedding - function. - **kwargs: Additional keyword arguments passed to the search methods. - - Returns: - List of Document objects selected by MMR algorithm. + It helps avoid returning redundant or very similar documents by balancing + relevance and diversity in the selection process. + + :param query: The text query to search for. + :type query: str + :param k: The number of documents to return. Defaults to 4. + :type k: int + :param fetch_k: The number of documents to fetch for MMR selection. + Should be larger than k to allow for diversity selection. + Defaults to 20.
+ :type fetch_k: int + :param lambda_mult: Controls the diversity vs relevance tradeoff. + Values between 0 and 1, where 0 = maximum diversity, 1 = maximum relevance. + Defaults to 0.5. + :type lambda_mult: float + :param return_fields: Set of additional document fields to return in results. + The _key and text fields are always returned. + :type return_fields: set[str] + :param use_approx: Whether to use approximate nearest neighbor search. + Enables faster but potentially less accurate results. + Defaults to True. + :type use_approx: bool + :param embedding: Optional pre-computed embedding for the query. + If not provided, the query will be embedded using the embedding + function. + :type embedding: Optional[List[float]] + :param kwargs: Additional keyword arguments passed to the search methods. + :type kwargs: Any + :return: List of Document objects selected by MMR algorithm. + :rtype: List[Document] + + .. code-block:: python + + # Search with balanced diversity and relevance + results = vector_store.max_marginal_relevance_search( + "machine learning", + k=3, + fetch_k=10 + ) + + # Emphasize diversity over relevance + diverse_results = vector_store.max_marginal_relevance_search( + "neural networks", + k=5, + fetch_k=20, + lambda_mult=0.1 # More diverse results + ) + + # Emphasize relevance over diversity + relevant_results = vector_store.max_marginal_relevance_search( + "deep learning", + k=3, + fetch_k=15, + lambda_mult=0.9 # More relevant results + ) """ return_fields.add(self.embedding_field) @@ -801,36 +903,89 @@ def from_texts( This is a convenience method that creates a new ArangoVector instance, embeds the provided texts, and stores them in ArangoDB. - Args: - texts: List of text strings to add to the vector store. - embedding: The embedding function to use for converting text to vectors. - metadatas: Optional list of metadata dictionaries to associate with each - text. - database: The ArangoDB database instance to use. - collection_name: The name of the ArangoDB collection to use. - Defaults to "documents". - search_type: The type of search to perform. Can be either "vector" or - "hybrid". Defaults to "vector". - embedding_field: The field name to store embeddings. Defaults to - "embedding". - text_field: The field name to store text content. Defaults to "text". - index_name: The name of the vector index. Defaults to "vector_index". - distance_strategy: The distance metric to use. Defaults to "COSINE". - num_centroids: Number of centroids for vector index. Defaults to 1. - ids: Optional list of unique identifiers for each text. - overwrite_index: Whether to delete and recreate existing indexes. - Defaults to False. - insert_text: Whether to store the text content in the database. - Required for hybrid search. Defaults to True. - keyword_index_name: Name of the keyword search index. Defaults to - "keyword_index". - keyword_analyzer: Text analyzer for keyword search. Defaults to "text_en". - rrf_constant: Constant for RRF scoring in hybrid search. Defaults to 60. - rrf_search_limit: Maximum results for RRF scoring. Defaults to 100. - **kwargs: Additional keyword arguments passed to the constructor. - - Returns: - A new ArangoVector instance with the texts embedded and stored. + :param texts: List of text strings to add to the vector store. + :type texts: List[str] + :param embedding: The embedding function to use for converting text to vectors. 
+ :type embedding: langchain.embeddings.base.Embeddings + :param metadatas: Optional list of metadata dictionaries to associate with each + text. + :type metadatas: Optional[List[dict]] + :param database: The ArangoDB database instance to use. + :type database: Optional[arango.database.StandardDatabase] + :param collection_name: The name of the ArangoDB collection to use. + Defaults to "documents". + :type collection_name: str + :param search_type: The type of search to perform. + Can be either SearchType.VECTOR or SearchType.HYBRID. + Defaults to SearchType.VECTOR. + :type search_type: SearchType + :param embedding_field: The field name to store embeddings. Defaults to + "embedding". + :type embedding_field: str + :param text_field: The field name to store text content. Defaults to "text". + :type text_field: str + :param index_name: The name of the vector index. Defaults to "vector_index". + :type index_name: str + :param distance_strategy: The distance metric to use. Can be + DistanceStrategy.COSINE or DistanceStrategy.EUCLIDEAN_DISTANCE. + Defaults to DistanceStrategy.COSINE. + :type distance_strategy: DistanceStrategy + :param num_centroids: Number of centroids for vector index. Defaults to 1. + :type num_centroids: int + :param ids: Optional list of unique identifiers for each text. + :type ids: Optional[List[str]] + :param overwrite_index: Whether to delete and recreate existing indexes. + Defaults to False. + :type overwrite_index: bool + :param insert_text: Whether to store the text content in the database. + Required for hybrid search. Defaults to True. + :type insert_text: bool + :param keyword_index_name: Name of the keyword search index. Defaults to + "keyword_index". + :type keyword_index_name: str + :param keyword_analyzer: Text analyzer for keyword search. + Defaults to "text_en". + :type keyword_analyzer: str + :param rrf_constant: Constant for RRF scoring in hybrid search. Defaults to 60. + :type rrf_constant: int + :param rrf_search_limit: Maximum results for RRF scoring. Defaults to 100. + :type rrf_search_limit: int + :param kwargs: Additional keyword arguments passed to the constructor. + :type kwargs: Any + :return: A new ArangoVector instance with the texts embedded and stored. + :rtype: ArangoVector + + .. code-block:: python + + from arango import ArangoClient + from langchain_arangodb.vectorstores import ArangoVector + from langchain_community.embeddings import OpenAIEmbeddings + + # Connect to ArangoDB + client = ArangoClient("http://localhost:8529") + db = client.db("test", username="root", password="openSesame") + + # Create vector store from texts + texts = ["hello world", "hello arango", "test document"] + metadatas = [{"source": "doc1"}, {"source": "doc2"}, {"source": "doc3"}] + + vector_store = ArangoVector.from_texts( + texts=texts, + embedding=OpenAIEmbeddings(), + metadatas=metadatas, + database=db, + collection_name="test_collection" + ) + + # Create hybrid search store + hybrid_store = ArangoVector.from_texts( + texts=["Machine learning algorithms", "Deep neural networks"], + embedding=OpenAIEmbeddings(), + database=db, + search_type=SearchType.HYBRID, + collection_name="hybrid_docs", + overwrite_index=True # Clean start + ) """ if not database: raise ValueError("Database must be provided.") @@ -897,34 +1052,52 @@ def from_existing_collection( This method reads documents from an existing collection, extracts specified text properties, embeds them, and creates a new vector store. - Args: - collection_name: Name of the existing ArangoDB collection. 
- text_properties_to_embed: List of document properties containing text to - embed. These properties will be concatenated to create the - text for embedding. - embedding: The embedding function to use for converting text to vectors. - database: The ArangoDB database instance to use. - embedding_field: The field name to store embeddings. Defaults to - "embedding". - text_field: The field name to store text content. Defaults to "text". - batch_size: Number of documents to process in each batch. Defaults to 1000. - aql_return_text_query: Custom AQL query to extract text from properties. - Defaults to "RETURN doc[p]". - insert_text: Whether to store the concatenated text in the database. - Required for hybrid search. Defaults to False. - skip_existing_embeddings: Whether to skip documents that already have - embeddings. Defaults to False. - search_type: The type of search to perform. Can be either "vector" or - "hybrid". Defaults to "vector". - keyword_index_name: Name of the keyword search index. Defaults to - "keyword_index". - keyword_analyzer: Text analyzer for keyword search. Defaults to "text_en". - rrf_constant: Constant for RRF scoring in hybrid search. Defaults to 60. - rrf_search_limit: Maximum results for RRF scoring. Defaults to 100. - **kwargs: Additional keyword arguments passed to the constructor. - - Returns: - A new ArangoVector instance with embeddings created from the collection. + :param collection_name: Name of the existing ArangoDB collection. + :type collection_name: str + :param text_properties_to_embed: List of document properties containing text to + embed. These properties will be concatenated to create + the text for embedding. + :type text_properties_to_embed: List[str] + :param embedding: The embedding function to use for converting text to vectors. + :type embedding: Embeddings + :param database: The ArangoDB database instance to use. + :type database: StandardDatabase + :param embedding_field: The field name to store embeddings. + Defaults to "embedding". + :type embedding_field: str + :param text_field: The field name to store text content. Defaults to "text". + :type text_field: str + :param batch_size: Number of documents to process in each batch. + Defaults to 1000. + :type batch_size: int + :param aql_return_text_query: Custom AQL query to extract text from properties. + Defaults to "RETURN doc[p]". + :type aql_return_text_query: str + :param insert_text: Whether to store the concatenated text in the database. + Required for hybrid search. Defaults to False. + :type insert_text: bool + :param skip_existing_embeddings: Whether to skip documents that already have + embeddings. Defaults to False. + :type skip_existing_embeddings: bool + :param search_type: The type of search to perform. + Can be either SearchType.VECTOR or SearchType.HYBRID. + Defaults to SearchType.VECTOR. + :type search_type: SearchType + :param keyword_index_name: Name of the keyword search index. + Defaults to "keyword_index". + :type keyword_index_name: str + :param keyword_analyzer: Text analyzer for keyword search. + Defaults to "text_en". + :type keyword_analyzer: str + :param rrf_constant: Constant for RRF scoring in hybrid search. Defaults to 60. + :type rrf_constant: int + :param rrf_search_limit: Maximum results for RRF scoring. Defaults to 100. + :type rrf_search_limit: int + :param kwargs: Additional keyword arguments passed to the constructor. + :type kwargs: Any + :return: A new ArangoVector instance with embeddings created from the + collection. 
+ :rtype: ArangoVector """ if not text_properties_to_embed: m = "Parameter `text_properties_to_embed` must not be an empty list" @@ -1070,7 +1243,7 @@ def _build_vector_search_query( LIMIT {k} {filter_clause if use_approx else ""} LET data = KEEP(doc, {return_fields_list}) - LET metadata = {f'({metadata_clause})' if metadata_clause else '{}'} + LET metadata = {f"({metadata_clause})" if metadata_clause else "{}"} RETURN {{data, score, metadata}} """ @@ -1164,7 +1337,7 @@ def _build_hybrid_search_query( LIMIT 1 RETURN KEEP(doc, {return_fields_list}) ) - LET metadata = {f'({metadata_clause})' if metadata_clause else '{}'} + LET metadata = {f"({metadata_clause})" if metadata_clause else "{}"} RETURN {{ data, score, metadata }} """
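For completeness, a minimal end-to-end sketch of the hybrid search flow documented in this patch. It assumes an ArangoDB database handle ``db`` and an ``embeddings`` model already exist, and that ``SearchType`` is importable from ``langchain_arangodb.vectorstores.utils`` (an assumed path; adjust to the package's actual module layout):

.. code-block:: python

    from langchain_arangodb.vectorstores import ArangoVector
    from langchain_arangodb.vectorstores.utils import SearchType  # assumed path

    # Build a hybrid-capable store; insert_text=True is required so the
    # keyword (full-text) index has text content to search over.
    store = ArangoVector.from_texts(
        texts=[
            "ArangoDB is a multi-model database",
            "LangChain provides building blocks for LLM applications",
        ],
        embedding=embeddings,
        database=db,
        search_type=SearchType.HYBRID,
        insert_text=True,
    )

    # RRF fuses the vector and keyword rankings; the weights can be tuned.
    results = store.similarity_search(
        "multi-model database",
        k=2,
        search_type=SearchType.HYBRID,
        vector_weight=1.0,
        keyword_weight=1.0,
    )
    for doc in results:
        print(doc.page_content)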