diff --git a/libs/arangodb/doc/api_reference.rst b/libs/arangodb/doc/api_reference.rst index 063d103..a02b288 100644 --- a/libs/arangodb/doc/api_reference.rst +++ b/libs/arangodb/doc/api_reference.rst @@ -27,6 +27,7 @@ Graphs :undoc-members: :show-inheritance: + Chains ------ diff --git a/libs/arangodb/doc/arangoqachain.rst b/libs/arangodb/doc/arangoqachain.rst new file mode 100644 index 0000000..ac0da13 --- /dev/null +++ b/libs/arangodb/doc/arangoqachain.rst @@ -0,0 +1,252 @@ +ArangoGraphQAChain +======================== + +This guide demonstrates how to use the ArangoGraphQAChain for question-answering against an ArangoDB graph database. + +Basic Setup +----------- + +First, let's set up the necessary imports and create a basic instance: + +.. code-block:: python + + from langchain_arangodb.chains.graph_qa.arangodb import ArangoGraphQAChain + from langchain_arangodb.graphs.arangodb_graph import ArangoGraph + from langchain.chat_models import ChatOpenAI + from arango import ArangoClient + + # Initialize ArangoDB connection + client = ArangoClient() + db = client.db("your_database", username="user", password="pass") + + # Create graph instance + graph = ArangoGraph(db) + + # Initialize LLM + llm = ChatOpenAI(temperature=0) + + # Create the chain + chain = ArangoGraphQAChain.from_llm( + llm=llm, + graph=graph, + allow_dangerous_requests=True # Be cautious with this setting + ) + +Individual Method Usage +----------------------- + +1. Basic Query Execution +~~~~~~~~~~~~~~~~~~~~~~~~ + +The simplest way to use the chain is with a direct query: + +.. code-block:: python + + response = chain.invoke({"query": "Who starred in Pulp Fiction?"}) + print(response["result"]) + +2. Using Custom Input/Output Keys +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can customize the input and output keys: + +.. 
code-block:: python + + chain = ArangoGraphQAChain.from_llm( + llm=llm, + graph=graph, + allow_dangerous_requests=True, + input_key="question", + output_key="answer" + ) + + response = chain.invoke({"question": "Who directed Inception?"}) + print(response["answer"]) + +3. Limiting Results +~~~~~~~~~~~~~~~~~~~ + +Control the number of results returned: + +.. code-block:: python + + chain = ArangoGraphQAChain.from_llm( + llm=llm, + graph=graph, + allow_dangerous_requests=True, + top_k=5, # Return only top 5 results + output_list_limit=16, # Limit list length in response + output_string_limit=128 # Limit string length in response + ) + +4. Query Explanation Mode +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Get query explanation without execution: + +.. code-block:: python + + chain = ArangoGraphQAChain.from_llm( + llm=llm, + graph=graph, + allow_dangerous_requests=True, + execute_aql_query=False # Only explain, don't execute + ) + + explanation = chain.invoke({"query": "Find all movies released after 2020"}) + print(explanation["aql_result"]) # Contains query plan + +5. Read-Only Mode +~~~~~~~~~~~~~~~~~ + +Enforce read-only operations: + +.. code-block:: python + + chain = ArangoGraphQAChain.from_llm( + llm=llm, + graph=graph, + allow_dangerous_requests=True, + force_read_only_query=True # Prevents write operations + ) + +6. Custom AQL Examples +~~~~~~~~~~~~~~~~~~~~~~ + +Provide example AQL queries for better generation: + +.. code-block:: python + + example_queries = """ + FOR m IN Movies + FILTER m.year > 2020 + RETURN m.title + + FOR a IN Actors + FILTER a.awards > 0 + RETURN a.name + """ + + chain = ArangoGraphQAChain.from_llm( + llm=llm, + graph=graph, + allow_dangerous_requests=True, + aql_examples=example_queries + ) + +7. Detailed Output +~~~~~~~~~~~~~~~~~~ + +Get more detailed output including AQL query and results: + +.. 
code-block:: python + + chain = ArangoGraphQAChain.from_llm( + llm=llm, + graph=graph, + allow_dangerous_requests=True, + return_aql_query=True, + return_aql_result=True + ) + + response = chain.invoke({"query": "Who acted in The Matrix?"}) + print("Query:", response["aql_query"]) + print("Raw Results:", response["aql_result"]) + print("Final Answer:", response["result"]) + +Complete Workflow Example +------------------------- + +Here's a complete workflow showing how to use multiple features together: + +.. code-block:: python + + from langchain_arangodb.chains.graph_qa.arangodb import ArangoGraphQAChain + from langchain_arangodb.graphs.arangodb_graph import ArangoGraph + from langchain.chat_models import ChatOpenAI + from arango import ArangoClient + + # 1. Setup Database Connection + client = ArangoClient() + db = client.db("movies_db", username="user", password="pass") + + # 2. Initialize Graph + graph = ArangoGraph(db) + + # 3. Create Collections and Sample Data + if not db.has_collection("Movies"): + movies = db.create_collection("Movies") + movies.insert({"_key": "matrix", "title": "The Matrix", "year": 1999}) + + if not db.has_collection("Actors"): + actors = db.create_collection("Actors") + actors.insert({"_key": "keanu", "name": "Keanu Reeves"}) + + if not db.has_collection("ActedIn"): + acted_in = db.create_collection("ActedIn", edge=True) + acted_in.insert({ + "_from": "Actors/keanu", + "_to": "Movies/matrix" + }) + + # 4. Refresh Schema + graph.refresh_schema() + + # 5. Initialize Chain with Advanced Features + llm = ChatOpenAI(temperature=0) + chain = ArangoGraphQAChain.from_llm( + llm=llm, + graph=graph, + allow_dangerous_requests=True, + top_k=5, + force_read_only_query=True, + return_aql_query=True, + return_aql_result=True, + output_list_limit=20, + output_string_limit=200 + ) + + # 6. 
Run Multiple Queries + queries = [ + "Who acted in The Matrix?", + "What movies were released in 1999?", + "List all actors in the database" + ] + + for query in queries: + print(f"\nProcessing query: {query}") + response = chain.invoke({"query": query}) + + print("AQL Query:", response["aql_query"]) + print("Raw Results:", response["aql_result"]) + print("Final Answer:", response["result"]) + print("-" * 50) + +Security Considerations +----------------------- + +1. Always use appropriate database credentials with minimal required permissions +2. Be cautious with ``allow_dangerous_requests=True`` +3. Use ``force_read_only_query=True`` when only read operations are needed +4. Monitor and log query execution in production environments +5. Regularly review and update AQL examples to prevent injection risks + +Error Handling +-------------- + +The chain includes built-in error handling: + +.. code-block:: python + + try: + response = chain.invoke({"query": "Find all movies"}) + except ValueError as e: + if "Maximum amount of AQL Query Generation attempts" in str(e): + print("Failed to generate valid AQL after multiple attempts") + elif "Write operations are not allowed" in str(e): + print("Attempted write operation in read-only mode") + else: + print(f"Other error: {e}") + +The chain will automatically attempt to fix invalid AQL queries up to +``max_aql_generation_attempts`` times (default: 3) before raising an error. 
\ No newline at end of file diff --git a/libs/arangodb/doc/chat_message_histories.rst b/libs/arangodb/doc/chat_message_histories.rst index f2c5208..b391966 100644 --- a/libs/arangodb/doc/chat_message_histories.rst +++ b/libs/arangodb/doc/chat_message_histories.rst @@ -317,11 +317,11 @@ Messages are stored in ArangoDB with the following structure: - ``time``: Timestamp for message ordering (automatically added by ArangoDB) Indexing Strategy -~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~ The class automatically creates a persistent index on ``session_id`` to ensure efficient retrieval: -.. code-block:: aql +.. code-block:: python // Automatic index creation CREATE INDEX session_idx ON ChatHistory (session_id) OPTIONS {type: "persistent", unique: false} @@ -332,7 +332,7 @@ Best Practices -------------- Session ID Management -~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~ 1. **Use descriptive session IDs**: Include user context or conversation type 2. **Avoid special characters**: Stick to alphanumeric characters and underscores @@ -346,7 +346,7 @@ Session ID Management session_id = f"training_{model_version}_{session_counter}" Memory Management -~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~ 1. **Choose appropriate memory types** based on conversation length 2. **Implement session cleanup** for privacy or storage management @@ -372,7 +372,7 @@ Memory Management db.aql.execute(query, bind_vars=bind_vars) Error Handling -~~~~~~~~~~~~~ +~~~~~~~~~~~~~~ .. code-block:: python @@ -396,7 +396,7 @@ Error Handling print(f"Unexpected error: {e}") Performance Considerations -------------------------- +-------------------------- 1. **Session ID indexing**: Automatic indexing ensures O(log n) lookup performance 2. **Message ordering**: Uses ArangoDB's built-in sorting capabilities @@ -404,7 +404,7 @@ Performance Considerations 4. **Collection sizing**: Monitor and archive old conversations as needed Example: Complete Chat Application ---------------------------------- +---------------------------------- .. 
code-block:: python @@ -506,7 +506,7 @@ Troubleshooting --------------- Common Issues -~~~~~~~~~~~~ +~~~~~~~~~~~~~ **ValueError: Please ensure that the session_id parameter is provided** - Ensure session_id is not None, empty string, or 0 diff --git a/libs/arangodb/doc/conf.py b/libs/arangodb/doc/conf.py index a2849d3..175ad18 100644 --- a/libs/arangodb/doc/conf.py +++ b/libs/arangodb/doc/conf.py @@ -11,9 +11,9 @@ sys.path.insert(0, os.path.abspath("..")) -project = 'langchain-arangodb' -copyright = '2025, ArangoDB' -author = 'ArangoDB' +project = "langchain-arangodb" +copyright = "2025, ArangoDB" +author = "ArangoDB" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration @@ -25,15 +25,15 @@ "sphinx.ext.autosummary", "sphinx.ext.inheritance_diagram", ] -templates_path = ['_templates'] -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'sphinx_rtd_theme' -html_static_path = [] # ['_static'] +html_theme = "sphinx_rtd_theme" +html_static_path = [] # type: ignore autodoc_member_order = "bysource" autodoc_inherit_docstrings = True autosummary_generate = True diff --git a/libs/arangodb/doc/graph.rst b/libs/arangodb/doc/graph.rst new file mode 100644 index 0000000..e7d49db --- /dev/null +++ b/libs/arangodb/doc/graph.rst @@ -0,0 +1,524 @@ +ArangoGraph +=========== + +The ``ArangoGraph`` class provides an interface to interact with ArangoDB for graph operations in LangChain. + +Installation +------------ + +.. code-block:: bash + + pip install langchain-arangodb + +Basic Usage +----------- + +.. 
code-block:: python + + from langchain_arangodb.graphs.arangodb_graph import ArangoGraph, get_arangodb_client + + # Connect to ArangoDB + db = get_arangodb_client( + url="http://localhost:8529", + dbname="_system", + username="root", + password="password" + ) + + # Initialize ArangoGraph + graph = ArangoGraph(db) + + +Factory Methods +--------------- + +get_arangodb_client +~~~~~~~~~~~~~~~~~~~~ + +Creates a connection to ArangoDB. + +.. code-block:: python + + from langchain_arangodb.graphs.arangodb_graph import get_arangodb_client + + # Using direct credentials + db = get_arangodb_client( + url="http://localhost:8529", + dbname="_system", + username="root", + password="password" + ) + + # Using environment variables + # ARANGODB_URL + # ARANGODB_DBNAME + # ARANGODB_USERNAME + # ARANGODB_PASSWORD + db = get_arangodb_client() + +from_db_credentials +~~~~~~~~~~~~~~~~~~~ + +Alternative constructor that creates an ArangoGraph instance directly from credentials. + +.. code-block:: python + + graph = ArangoGraph.from_db_credentials( + url="http://localhost:8529", + dbname="_system", + username="root", + password="password" + ) + +Core Methods +------------ + +add_graph_documents +~~~~~~~~~~~~~~~~~~~ + +Adds graph documents to the database. + +.. 
code-block:: python + + from langchain_core.documents import Document + from langchain_arangodb.graphs.graph_document import GraphDocument, Node, Relationship + + # Create nodes and relationships + nodes = [ + Node(id="1", type="Person", properties={"name": "Alice"}), + Node(id="2", type="Company", properties={"name": "Acme"}) + ] + + relationship = Relationship( + source=nodes[0], + target=nodes[1], + type="WORKS_AT", + properties={"since": 2020} + ) + + # Create graph document + doc = GraphDocument( + nodes=nodes, + relationships=[relationship], + source=Document(page_content="Employee record") + ) + + # Add to database + graph.add_graph_documents( + graph_documents=[doc], + include_source=True, + graph_name="EmployeeGraph", + update_graph_definition_if_exists=True, + capitalization_strategy="lower" + ) +Example: Using LLMGraphTransformer + +.. code-block:: python + + from langchain_experimental.graph_transformers import LLMGraphTransformer + from langchain_core.documents import Document + from langchain_openai import ChatOpenAI, OpenAIEmbeddings + + # Text to transform into a graph + text = "Bob knows Alice, John knows Bob." + + # Initialize transformer with ChatOpenAI + transformer = LLMGraphTransformer( + llm=ChatOpenAI(temperature=0) + ) + + # Create graph documents from text + graph_docs = transformer.convert_to_graph_documents([Document(page_content=text)]) + + # Add to ArangoDB with embeddings + graph.add_graph_documents( + graph_docs, + graph_name="people_graph", + use_one_entity_collection=False, # Creates 'Person' node collection and 'KNOWS' edge collection + update_graph_definition_if_exists=True, + include_source=True, + embeddings=OpenAIEmbeddings(), + embed_nodes=True # Embeds 'Alice' and 'Bob' nodes + ) + +query +~~~~~ + +Executes AQL queries against the database. + +.. 
code-block:: python + + # Simple query + result = graph.query("FOR doc IN users RETURN doc") + + # Query with parameters + result = graph.query( + "FOR u IN users FILTER u.age > @min_age RETURN u", + params={"min_age": 21} + ) + + + +explain +~~~~~~~ + +Gets the query execution plan. + +.. code-block:: python + + plan = graph.explain( + "FOR doc IN users RETURN doc" + ) + +Schema Management +----------------- + +refresh_schema +~~~~~~~~~~~~~~ + +Updates the internal schema representation. + +.. code-block:: python + + graph.refresh_schema( + sample_ratio=0.1, # Sample 10% of documents + graph_name="MyGraph", + include_examples=True + ) + +generate_schema +~~~~~~~~~~~~~~~ + +Generates a schema representation of the database. + +.. code-block:: python + + schema = graph.generate_schema( + sample_ratio=0.1, + graph_name="MyGraph", + include_examples=True, + list_limit=32 + ) + +set_schema +~~~~~~~~~~ + +Sets a custom schema. + +.. code-block:: python + + custom_schema = { + "collections": { + "users": {"fields": ["name", "age"]}, + "products": {"fields": ["name", "price"]} + } + } + + graph.set_schema(custom_schema) + +Schema Properties +----------------- + +schema +~~~~~~ + +Gets the current schema as a dictionary. + +.. code-block:: python + + current_schema = graph.schema + +schema_json +~~~~~~~~~~~~ + +Gets the schema as a JSON string. + +.. code-block:: python + + schema_json = graph.schema_json + +schema_yaml +~~~~~~~~~~~ + +Gets the schema as a YAML string. + +.. code-block:: python + + schema_yaml = graph.schema_yaml + +get_structured_schema +~~~~~~~~~~~~~~~~~~~~~ + +Gets the schema in a structured format. + +.. code-block:: python + + structured_schema = graph.get_structured_schema + +Internal Utility Methods +------------------------ + +These methods are used internally but may be useful for advanced use cases: + +_sanitize_collection_name +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Sanitizes collection names to be valid in ArangoDB. + +.. 
code-block:: python + + safe_name = graph._sanitize_collection_name("My Collection!") + # Returns: "My_Collection_" + +_sanitize_input +~~~~~~~~~~~~~~~~ + +Sanitizes input data by truncating long strings and lists. + +.. code-block:: python + + sanitized = graph._sanitize_input( + {"list": [1,2,3,4,5,6]}, + list_limit=5, + string_limit=100 + ) + +_hash +~~~~~ + +Generates a hash string for a value. + +.. code-block:: python + + hash_str = graph._hash("some value") + +_process_source +~~~~~~~~~~~~~~~~ + +Processes a source document for storage. + +.. code-block:: python + + from langchain_core.documents import Document + + source = Document( + page_content="test content", + metadata={"author": "Alice"} + ) + + source_id = graph._process_source( + source=source, + source_collection_name="sources", + source_embedding=[0.1, 0.2, 0.3], + embedding_field="embedding", + insertion_db=db + ) + +_import_data +~~~~~~~~~~~~~ + +Bulk imports data into collections. + +.. code-block:: python + + data = { + "users": [ + {"_key": "1", "name": "Alice"}, + {"_key": "2", "name": "Bob"} + ] + } + + graph._import_data(db, data, is_edge=False) + + +Example Workflow +---------------- + +Here's a complete example demonstrating a typical workflow using ArangoGraph to create a knowledge graph from documents: + +.. code-block:: python + + from langchain_core.documents import Document + from langchain_core.embeddings import Embeddings + from langchain_arangodb.graphs.arangodb_graph import ArangoGraph, get_arangodb_client + from langchain_arangodb.graphs.graph_document import GraphDocument, Node, Relationship + + # 1. Setup embeddings (example using OpenAI - you can use any embeddings model) + from langchain_openai import OpenAIEmbeddings + embeddings = OpenAIEmbeddings() + # 2. Connect to ArangoDB and initialize graph + db = get_arangodb_client( + url="http://localhost:8529", + dbname="knowledge_base", + username="root", + password="password" + ) + graph = ArangoGraph(db) + + # 3. 
Create sample documents with relationships + documents = [ + Document( + page_content="Alice is a software engineer at Acme Corp.", + metadata={"source": "employee_records", "date": "2024-01-01"} + ), + Document( + page_content="Bob is a project manager working with Alice on Project X.", + metadata={"source": "project_docs", "date": "2024-01-02"} + ) + ] + + # 4. Create nodes and relationships for each document + graph_documents = [] + for doc in documents: + # Extract entities and relationships (simplified example) + if "Alice" in doc.page_content: + alice_node = Node(id="alice", type="Person", properties={"name": "Alice", "role": "Software Engineer"}) + company_node = Node(id="acme", type="Company", properties={"name": "Acme Corp"}) + works_at_rel = Relationship( + source=alice_node, + target=company_node, + type="WORKS_AT" + ) + graph_doc = GraphDocument( + nodes=[alice_node, company_node], + relationships=[works_at_rel], + source=doc + ) + graph_documents.append(graph_doc) + + if "Bob" in doc.page_content: + bob_node = Node(id="bob", type="Person", properties={"name": "Bob", "role": "Project Manager"}) + project_node = Node(id="project_x", type="Project", properties={"name": "Project X"}) + manages_rel = Relationship( + source=bob_node, + target=project_node, + type="MANAGES" + ) + works_with_rel = Relationship( + source=bob_node, + target=alice_node, + type="WORKS_WITH" + ) + graph_doc = GraphDocument( + nodes=[bob_node, project_node], + relationships=[manages_rel, works_with_rel], + source=doc + ) + graph_documents.append(graph_doc) + + # 5. 
Add documents to the graph with embeddings + graph.add_graph_documents( + graph_documents=graph_documents, + include_source=True, # Store original documents + graph_name="CompanyGraph", + update_graph_definition_if_exists=True, + embed_source=True, # Generate embeddings for documents + embed_nodes=True, # Generate embeddings for nodes + embed_relationships=True, # Generate embeddings for relationships + embeddings=embeddings, + batch_size=100, + capitalization_strategy="lower" + ) + + # 6. Query the graph + # Find all people who work at Acme Corp + employees = graph.query(""" + FOR v, e IN 1..1 OUTBOUND + (FOR c IN ENTITY FILTER c.type == 'Company' AND c.name == 'Acme Corp' RETURN c)._id + ENTITY_EDGE + RETURN { + name: v.name, + role: v.role, + company: 'Acme Corp' + } + """) + + # Find all projects and their managers + projects = graph.query(""" + FOR v, e IN 1..1 INBOUND + (FOR p IN ENTITY FILTER p.type == 'Project' RETURN p)._id + ENTITY_EDGE + FILTER e.type == 'MANAGES' + RETURN { + project: v.name, + manager: e._from + } + """) + + # 7. Generate and inspect schema + schema = graph.generate_schema( + sample_ratio=1.0, # Use all documents for schema + graph_name="CompanyGraph", + include_examples=True + ) + + print("Schema:", schema) + + # 8. Error handling for queries + try: + # Complex query with potential for errors + result = graph.query(""" + FOR v, e, p IN 1..3 OUTBOUND + (FOR p IN ENTITY FILTER p.name == 'Alice' RETURN p)._id + ENTITY_EDGE + RETURN p + """) + except ArangoServerError as e: + print(f"Query error: {e}") + +This workflow demonstrates: + +1. Setting up the environment with embeddings +2. Connecting to ArangoDB +3. Creating documents with structured relationships +4. Adding documents to the graph with embeddings +5. Querying the graph using AQL +6. Schema management +7. 
Error handling + +The example creates a simple company knowledge graph with: + +- People (employees) +- Companies +- Projects +- Various relationships (WORKS_AT, MANAGES, WORKS_WITH) +- Document sources with embeddings + +Key Features Used: + +- Document embedding +- Node and relationship embedding +- Source document storage +- Graph schema management +- AQL queries +- Error handling +- Batch processing + + +Best Practices +-------------- + +1. Always use appropriate capitalization strategy for consistency +2. Use batch operations for large data imports +3. Consider using embeddings for semantic search capabilities +4. Implement proper error handling for database operations +5. Use schema management for better data organization + +Error Handling +-------------- + +.. code-block:: python + + from arango.exceptions import ArangoServerError + + try: + result = graph.query("FOR doc IN nonexistent RETURN doc") + except ArangoServerError as e: + print(f"Database error: {e}") + + +-------------- + + + + diff --git a/libs/arangodb/doc/index.rst b/libs/arangodb/doc/index.rst index b134c58..7c458fa 100644 --- a/libs/arangodb/doc/index.rst +++ b/libs/arangodb/doc/index.rst @@ -97,15 +97,12 @@ Documentation Contents quickstart vectorstores chat_message_histories - -.. toctree:: - :maxdepth: 2 - :caption: Advanced: - - mydirectory/index + graph + arangoqachain .. toctree:: :maxdepth: 2 :caption: API Reference: - api_reference \ No newline at end of file + api_reference + diff --git a/libs/arangodb/doc/mydirectory/index.rst b/libs/arangodb/doc/mydirectory/index.rst deleted file mode 100644 index 7e344ae..0000000 --- a/libs/arangodb/doc/mydirectory/index.rst +++ /dev/null @@ -1,4 +0,0 @@ -.. 
_mydirectory: - -Hello World -============ \ No newline at end of file diff --git a/libs/arangodb/langchain_arangodb/chains/graph_qa/arangodb.py b/libs/arangodb/langchain_arangodb/chains/graph_qa/arangodb.py index fefa6be..f2050af 100644 --- a/libs/arangodb/langchain_arangodb/chains/graph_qa/arangodb.py +++ b/libs/arangodb/langchain_arangodb/chains/graph_qa/arangodb.py @@ -105,14 +105,17 @@ def __init__(self, **kwargs: Any) -> None: @property def input_keys(self) -> List[str]: + """Get the input keys for the chain.""" return [self.input_key] @property def output_keys(self) -> List[str]: + """Get the output keys for the chain.""" return [self.output_key] @property def _chain_type(self) -> str: + """Get the chain type.""" return "graph_aql_chain" @classmethod @@ -120,12 +123,34 @@ def from_llm( cls, llm: BaseLanguageModel, *, - qa_prompt: BasePromptTemplate = AQL_QA_PROMPT, - aql_generation_prompt: BasePromptTemplate = AQL_GENERATION_PROMPT, - aql_fix_prompt: BasePromptTemplate = AQL_FIX_PROMPT, + qa_prompt: Optional[BasePromptTemplate] = None, + aql_generation_prompt: Optional[BasePromptTemplate] = None, + aql_fix_prompt: Optional[BasePromptTemplate] = None, **kwargs: Any, ) -> ArangoGraphQAChain: - """Initialize from LLM.""" + """Initialize from LLM. + + :param llm: The language model to use. + :type llm: BaseLanguageModel + :param qa_prompt: The prompt to use for the QA chain. + :type qa_prompt: BasePromptTemplate + :param aql_generation_prompt: The prompt to use for the AQL generation chain. + :type aql_generation_prompt: BasePromptTemplate + :param aql_fix_prompt: The prompt to use for the AQL fix chain. + :type aql_fix_prompt: BasePromptTemplate + :param kwargs: Additional keyword arguments. + :type kwargs: Any + :return: The initialized ArangoGraphQAChain. + :rtype: ArangoGraphQAChain + :raises ValueError: If the LLM is not provided. 
+ """ + if qa_prompt is None: + qa_prompt = AQL_QA_PROMPT + if aql_generation_prompt is None: + aql_generation_prompt = AQL_GENERATION_PROMPT + if aql_fix_prompt is None: + aql_fix_prompt = AQL_FIX_PROMPT + qa_chain = qa_prompt | llm aql_generation_chain = aql_generation_prompt | llm aql_fix_chain = aql_fix_prompt | llm @@ -149,37 +174,37 @@ def _call( Users can modify the following ArangoGraphQAChain Class Variables: - :var top_k: The maximum number of AQL Query Results to return + :param top_k: The maximum number of AQL Query Results to return :type top_k: int - :var aql_examples: A set of AQL Query Examples that are passed to + :param aql_examples: A set of AQL Query Examples that are passed to the AQL Generation Prompt Template to promote few-shot-learning. Defaults to an empty string. :type aql_examples: str - :var return_aql_query: Whether to return the AQL Query in the + :param return_aql_query: Whether to return the AQL Query in the output dictionary. Defaults to False. :type return_aql_query: bool - :var return_aql_result: Whether to return the AQL Query in the + :param return_aql_result: Whether to return the AQL Result in the output dictionary. Defaults to False :type return_aql_result: bool - :var max_aql_generation_attempts: The maximum amount of AQL + :param max_aql_generation_attempts: The maximum amount of AQL Generation attempts to be made prior to raising the last AQL Query Execution Error. Defaults to 3. :type max_aql_generation_attempts: int - :var execute_aql_query: If False, the AQL Query is only + :param execute_aql_query: If False, the AQL Query is only explained & returned, not executed. Defaults to True. :type execute_aql_query: bool - :var output_list_limit: The maximum list length to display + :param output_list_limit: The maximum list length to display in the output. If the list is longer, it will be truncated. Defaults to 32. 
:type output_list_limit: int - :var output_string_limit: The maximum string length to display + :param output_string_limit: The maximum string length to display in the output. If the string is longer, it will be truncated. Defaults to 256. :type output_string_limit: int @@ -348,11 +373,11 @@ def _call( def _is_read_only_query(self, aql_query: str) -> Tuple[bool, Optional[str]]: """Check if the AQL query is read-only. - Args: - aql_query: The AQL query to check. + :param aql_query: The AQL query to check. + :type aql_query: str - Returns: - bool: True if the query is read-only, False otherwise. + :return: True if the query is read-only, False otherwise. + :rtype: Tuple[bool, Optional[str]] """ normalized_query = aql_query.upper() diff --git a/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py b/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py index f128242..17be379 100644 --- a/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py +++ b/libs/arangodb/langchain_arangodb/graphs/arangodb_graph.py @@ -29,18 +29,24 @@ def get_arangodb_client( ) -> Any: """Get the Arango DB client from credentials. - Args: - url: Arango DB url. Can be passed in as named arg or set as environment - var ``ARANGODB_URL``. Defaults to "http://localhost:8529". - dbname: Arango DB name. Can be passed in as named arg or set as - environment var ``ARANGODB_DBNAME``. Defaults to "_system". - username: Can be passed in as named arg or set as environment var - ``ARANGODB_USERNAME``. Defaults to "root". - password: Can be passed ni as named arg or set as environment var - ``ARANGODB_PASSWORD``. Defaults to "". - - Returns: - An arango.database.StandardDatabase. + :param url: Arango DB url. Can be passed in as named arg or set as environment + var ``ARANGODB_URL``. Defaults to "http://localhost:8529". + :type url: str + :param dbname: Arango DB name. Can be passed in as named arg or set as + environment var ``ARANGODB_DBNAME``. Defaults to "_system". 
+ :type dbname: str + :param username: Can be passed in as named arg or set as environment var + ``ARANGODB_USERNAME``. Defaults to "root". + :type username: str + :param password: Can be passed in as named arg or set as environment var + ``ARANGODB_PASSWORD``. Defaults to "". + :type password: str + + :return: An arango.database.StandardDatabase. + :rtype: Any + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. """ _url: str = url or str(os.environ.get("ARANGODB_URL", "http://localhost:8529")) _dbname: str = dbname or str(os.environ.get("ARANGODB_DBNAME", "_system")) @@ -53,24 +59,39 @@ def get_arangodb_client( class ArangoGraph(GraphStore): """ArangoDB wrapper for graph operations. - Parameters: - - db (arango.database.StandardDatabase): ArangoDB database instance. - - generate_schema_on_init (bool): Whether to generate the graph schema + :param db: The ArangoDB database instance. + :type db: StandardDatabase + :param generate_schema_on_init: Whether to generate the graph schema on initialization. Defaults to True. - - schema_sample_ratio (float): A float (0 to 1) to determine the - ratio of documents/edges sampled in relation to the Collection size - to generate each Collection Schema. If 0, one document/edge + :type generate_schema_on_init: bool + :param schema_sample_ratio: The ratio of documents/edges to sample in relation to + the Collection size to generate each Collection Schema. If 0, one document/edge is used per Collection. Defaults to 0. - - schema_graph_name (str): The name of an existing ArangoDB Graph to specifically + :type schema_sample_ratio: float + :param schema_graph_name: The name of an existing ArangoDB Graph to specifically use to generate the schema. If None, the entire database will be used. Defaults to None. 
- - schema_include_examples (bool): Whether to include example values fetched from + :type schema_graph_name: Optional[str] + :param schema_include_examples: Whether to include example values fetched from a sample documents as part of the schema. Defaults to True. Lists of size higher than **schema_list_limit** will be excluded from the schema, even if **schema_include_examples** is set to True. Defaults to True. - - schema_list_limit (int): The maximum list size the schema will include as part + :type schema_include_examples: bool + :param schema_list_limit: The maximum list size the schema will include as part of the example values. If the list is longer than this limit, a string describing the list will be used in the schema instead. Default is 32. + :type schema_list_limit: int + :param schema_string_limit: The maximum number of characters to include + in a string. If the string is longer than this limit, a string + describing the string will be used in the schema instead. Default is 256. + :type schema_string_limit: int + + :return: None + :rtype: None + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + *Security note*: Make sure that the database connection uses credentials that are narrowly-scoped to only include necessary permissions. @@ -82,6 +103,8 @@ class ArangoGraph(GraphStore): limit the permissions granted to the credentials used with this tool. See https://python.langchain.com/docs/security for more information. + + """ def __init__( @@ -94,6 +117,10 @@ def __init__( schema_list_limit: int = 32, schema_string_limit: int = 256, ) -> None: + """ + Initializes the ArangoGraph instance. 
+ + """ self.__db: StandardDatabase = db self.__async_db = db.begin_async_execution() @@ -123,16 +150,30 @@ def get_structured_schema(self) -> Dict[str, Any]: @property def schema_json(self) -> str: - """Returns the schema of the Graph Database as a JSON string""" + """Returns the schema of the Graph Database as a JSON string + + :return: The schema of the Graph Database as a JSON string + :rtype: str + """ return json.dumps(self.__schema) @property def schema_yaml(self) -> str: - """Returns the schema of the Graph Database as a YAML string""" + """Returns the schema of the Graph Database as a YAML string + + :return: The schema of the Graph Database as a YAML string + :rtype: str + """ return yaml.dump(self.__schema, sort_keys=False) def set_schema(self, schema: Dict[str, Any]) -> None: - """Sets a custom schema for the ArangoDB Database.""" + """Sets a custom schema for the ArangoDB Database. + + :param schema: The schema to set. + :type schema: Dict[str, Any] + :return: None + :rtype: None + """ self.__schema = schema def refresh_schema( @@ -146,20 +187,31 @@ def refresh_schema( Refresh the graph schema information. Parameters: - - sample_ratio (float): A float (0 to 1) to determine the + + :param sample_ratio: A float (0 to 1) to determine the ratio of documents/edges sampled in relation to the Collection size to generate each Collection Schema. If 0, one document/edge is used per Collection. Defaults to 0. - - graph_name (str): The name of an existing ArangoDB Graph to specifically + :type sample_ratio: float + :param graph_name: The name of an existing ArangoDB Graph to specifically use to generate the schema. If None, the entire database will be used. Defaults to None. - - include_examples (bool): Whether to include example values fetched from + :type graph_name: Optional[str] + :param include_examples: Whether to include example values fetched from a sample documents as part of the schema. Defaults to True. 
Lists of size higher than **list_limit** will be excluded from the schema, even if **schema_include_examples** is set to True. Defaults to True. - - list_limit (int): The maximum list size the schema will include as part + :type include_examples: bool + :param list_limit: The maximum list size the schema will include as part of the example values. If the list is longer than this limit, a string describing the list will be used in the schema instead. Default is 32. + :type list_limit: int + + :return: None + :rtype: None + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. """ self.__schema = self.generate_schema( sample_ratio, graph_name, include_examples, list_limit @@ -176,21 +228,31 @@ def generate_schema( """ Generates the schema of the ArangoDB Database and returns it - Parameters: - - sample_ratio (float): A ratio (0 to 1) to determine the - ratio of documents/edges used (in relation to the Collection size) - to render each Collection Schema. If 0, one document/edge - is used per Collection. - - graph_name (str): The name of the graph to use to generate the schema. If + :param sample_ratio: A ratio (0 to 1) to determine the + ratio of documents/edges used (in relation to the Collection size) + to render each Collection Schema. If 0, one document/edge + is used per Collection. + :type sample_ratio: float + :param graph_name: The name of the graph to use to generate the schema. If None, the entire database will be used. - - include_examples (bool): A flag whether to scan the database for + :type graph_name: Optional[str] + :param include_examples: A flag whether to scan the database for example values and use them in the graph schema. Default is True. - - list_limit (int): The maximum number of elements to include in a list. 
+ :type include_examples: bool + :param list_limit: The maximum number of elements to include in a list. If the list is longer than this limit, a string describing the list will be used in the schema instead. Default is 32. - - schema_string_limit (int): The maximum number of characters to include + :type list_limit: int + :param schema_string_limit: The maximum number of characters to include in a string. If the string is longer than this limit, a string describing the string will be used in the schema instead. Default is 128. + :type schema_string_limit: int + :return: A dictionary containing the graph schema and collection schema. + :rtype: Dict[str, List[Dict[str, Any]]] + :raises ValueError: If the sample ratio is not between 0 and 1. + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. """ if not 0 <= sample_ratio <= 1: raise ValueError("**sample_ratio** value must be in between 0 to 1") @@ -273,18 +335,26 @@ def query( Execute an AQL query and return the results. Parameters: - - query (str): The AQL query to execute. - - params (dict): Additional arguments piped to the function. - - top_k: Number of results to process from the AQL cursor. - Defaults to None. - - list_limit: Removes lists above **list_limit** size - that have been returned from the AQL query. - - string_limit: Removes strings above **string_limit** size - that have been returned from the AQL query. - - Remaining params are passed to the AQL query execution. - - Returns: - - A list of dictionaries containing the query results. + :param query: The AQL query to execute. + :type query: str + :param params: Additional arguments piped to the function. + Defaults to None. + :type params: dict + :param list_limit: Removes lists above **list_limit** size + that have been returned from the AQL query. 
+ :type list_limit: Optional[int] + :param string_limit: Removes strings above **string_limit** size + that have been returned from the AQL query. + :type string_limit: Optional[int] + :param remaining_params: Remaining params are passed to the AQL query execution. + Defaults to None. + :type remaining_params: Optional[dict] + + :return: A list of dictionaries containing the query results. + :rtype: List[Any] + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. """ top_k = params.pop("top_k", None) list_limit = params.pop("list_limit", 32) @@ -308,11 +378,16 @@ def explain(self, query: str, params: dict = {}) -> List[Dict[str, Any]]: """ Explain an AQL query without executing it. - Parameters: - - query (str): The AQL query to explain. - - Returns: - - A list of dictionaries containing the query explanation. + :param query: The AQL query to explain. + :type query: str + :param params: Additional arguments; not currently forwarded to the explain call. + Defaults to an empty dict. + :type params: dict + :return: A list of dictionaries containing the query explanation. + :rtype: List[Dict[str, Any]] + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. """ return self.__db.aql.explain(query) # type: ignore @@ -340,50 +415,51 @@ def add_graph_documents( Constructs nodes & relationships in the graph based on the provided GraphDocument objects. - Parameters: - - graph_documents (List[GraphDocument]): A list of GraphDocument objects - that contain the nodes and relationships to be added to the graph. Each - GraphDocument should encapsulate the structure of part of the graph, - including nodes, relationships, and the source document information.
- - include_source (bool, optional): If True, stores the source document - and links it to nodes in the graph using the HAS_SOURCE relationship. - This is useful for tracing back the origin of data. Merges source - documents based on the `id` property from the source document if available, - otherwise it calculates the Farmhash hash of `page_content` - for merging process. Defaults to False. - - graph_name (str): The name of the ArangoDB General Graph to create. If None, - no graph will be created. - - update_graph_definition_if_exists (bool): If True, updates the graph - Edge Definitions - if it already exists. Defaults to False. Not used if `graph_name` is None. It is - recommended to set this to True if `use_one_entity_collection` is set to False. - - batch_size (int): The number of nodes/edges to insert in a single batch. - - use_one_entity_collection (bool): If True, all nodes are stored in a single - entity collection. If False, nodes are stored in separate collections based - on their type. Defaults to True. - - insert_async (bool): If True, inserts data asynchronously. Defaults to False. - - source_collection_name (str): The name of the collection to store the source - documents. Defaults to "SOURCE". - - source_edge_collection_name (str): The name of the edge collection to store - the relationships between source documents and nodes. Defaults to "HAS_SOURCE". - - entity_collection_name (str): The name of the collection to store the nodes. - Defaults to "ENTITY". Only used if `use_one_entity_collection` is True. - - entity_edge_collection_name (str): The name of the edge collection to store - the relationships between nodes. Defaults to "LINKS_TO". Only used if - `use_one_entity_collection` is True. - - embeddings (Embeddings): An Embeddings object to use for embedding the source, - nodes and relationships. Defaults to None. - - embedding_field (set[str]): The field name to store the embedding. Defaults - to "embedding". 
Only used if `embedding` is not None, and `embed_source`, - `embed_nodes`, or `embed_relationships` is True. - - embed_source (bool): If True, embeds the source document. Defaults to False. - - embed_nodes (bool): If True, embeds the nodes. Defaults to False. - - embed_relationships (bool): If True, embeds the relationships. - Defaults to False. - - capitalization_strategy (str): The capitalization strategy applied on the - node and edge keys. Can be "lower", "upper", or "none". Defaults to "none". - Useful as a basic Entity Resolution technique to avoid duplicates based - on capitalization. + :param graph_documents: The GraphDocument objects to add to the graph. + :type graph_documents: List[GraphDocument] + :param include_source: Whether to include the source document in the graph. + :type include_source: bool + :param graph_name: The name of the graph to add the documents to. + :type graph_name: Optional[str] + :param update_graph_definition_if_exists: Whether to update the graph definition + if it already exists. + :type update_graph_definition_if_exists: bool + :param batch_size: The number of documents to process in each batch. + :type batch_size: int + :param use_one_entity_collection: Whether to use one entity collection + for all nodes. + :type use_one_entity_collection: bool + :param insert_async: Whether to insert the documents asynchronously. + :type insert_async: bool + :param source_collection_name: The name of the source collection. + :type source_collection_name: Union[str, None] + :param source_edge_collection_name: The name of the source edge collection. + :type source_edge_collection_name: Union[str, None] + :param entity_collection_name: The name of the entity collection. + :type entity_collection_name: Union[str, None] + :param entity_edge_collection_name: The name of the entity edge collection. + :type entity_edge_collection_name: Union[str, None] + :param embeddings: The embeddings model to use. 
+ :type embeddings: Union[Embeddings, None] + :param embedding_field: The field to use for the embedding. + :type embedding_field: str + :param embed_source: Whether to embed the source document. + :type embed_source: bool + :param embed_nodes: Whether to embed the nodes. + :type embed_nodes: bool + :param embed_relationships: Whether to embed the relationships. + :type embed_relationships: bool + :param capitalization_strategy: The capitalization strategy to use. + :type capitalization_strategy: str + + :return: None + :rtype: None + :raises ValueError: If the capitalization strategy is not 'lower', + 'upper', or 'none'. + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + """ if not graph_documents: return @@ -624,18 +700,24 @@ def from_db_credentials( ) -> Any: """Convenience constructor that builds Arango DB from credentials. - Args: - url: Arango DB url. Can be passed in as named arg or set as environment + :param url: Arango DB url. Can be passed in as named arg or set as environment var ``ARANGODB_URL``. Defaults to "http://localhost:8529". - dbname: Arango DB name. Can be passed in as named arg or set as + :type url: str + :param dbname: Arango DB name. Can be passed in as named arg or set as environment var ``ARANGODB_DBNAME``. Defaults to "_system". - username: Can be passed in as named arg or set as environment var + :type dbname: str + :param username: Can be passed in as named arg or set as environment var ``ARANGODB_USERNAME``. Defaults to "root". - password: Can be passed ni as named arg or set as environment var - ``ARANGODB_PASSWORD``. Defaults to "". + :type username: str + :param password: Can be passed in as named arg or set as environment var + ``ARANGODB_PASSWORD``. Defaults to "". + :type password: str + + :return: An arango.database.StandardDatabase.
+ :rtype: Any + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. - Returns: - An arango.database.StandardDatabase. """ db = get_arangodb_client( url=url, dbname=dbname, username=username, password=password @@ -648,7 +730,21 @@ def _import_data( data: Dict[str, List[Dict[str, Any]]], is_edge: bool, ) -> None: - """Imports data into the ArangoDB database in bulk.""" + """Imports data into the ArangoDB database in bulk. + + :param db: The ArangoDB database instance. + :type db: Database + :param data: The data to import. + :type data: Dict[str, List[Dict[str, Any]]] + :param is_edge: Whether the data is an edge. + :type is_edge: bool + + :return: None + :rtype: None + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + """ for collection, batch in data.items(): self._create_collection(collection, is_edge) db.collection(collection).import_bulk(batch, on_duplicate="update") @@ -658,7 +754,19 @@ def _import_data( def _create_collection( self, collection_name: str, is_edge: bool = False, **kwargs: Any ) -> None: - """Creates a collection in the ArangoDB database if it does not exist.""" + """Creates a collection in the ArangoDB database if it does not exist. + + :param collection_name: The name of the collection to create. + :type collection_name: str + :param is_edge: Whether the collection is an edge. + :type is_edge: bool + + :return: None + :rtype: None + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. 
+ """ if not self.db.has_collection(collection_name): self.db.create_collection(collection_name, edge=is_edge, **kwargs) @@ -669,7 +777,20 @@ def _process_node_as_entity( nodes: DefaultDict[str, list], entity_collection_name: str, ) -> str: - """Processes a Graph Document Node into ArangoDB as a unanimous Entity.""" + """Processes a Graph Document Node into ArangoDB as a generic Entity. + + :param node_key: The key of the node. + :type node_key: str + :param node: The node to process. + :type node: Node + :param nodes: The nodes to process. + :type nodes: DefaultDict[str, list] + :param entity_collection_name: The name of the entity collection. + :type entity_collection_name: str + + :return: The name of the entity collection. + :rtype: str + """ nodes[entity_collection_name].append( { "_key": node_key, @@ -683,7 +804,20 @@ def _process_node_as_entity( def _process_node_as_type( self, node_key: str, node: Node, nodes: DefaultDict[str, list], _: str ) -> str: - """Processes a Graph Document Node into ArangoDB based on its Node Type.""" + """Processes a Graph Document Node into ArangoDB based on its Node Type. + + :param node_key: The key of the node. + :type node_key: str + :param node: The node to process. + :type node: Node + :param nodes: The nodes to process. + :type nodes: DefaultDict[str, list] + :param _: Unused placeholder; the node type is derived from ``node.type``. + :type _: str + + :return: The sanitized name of the node type. + :rtype: str + """ node_type = self._sanitize_collection_name(node.type) nodes[node_type].append({"_key": node_key, "text": node.id, **node.properties}) return node_type @@ -700,7 +834,34 @@ def _process_edge_as_entity( entity_edge_collection_name: str, _: DefaultDict[str, DefaultDict[str, set[str]]], ) -> None: - """Processes a Graph Document Edge into ArangoDB as a unanimous Entity.""" + """Processes a Graph Document Edge into ArangoDB as a generic Entity. + + :param edge: The edge to process.
+ :type edge: Relationship + :param edge_str: The string representation of the edge. + :type edge_str: str + :param edge_key: The key of the edge. + :type edge_key: str + :param source_key: The key of the source node. + :type source_key: str + :param target_key: The key of the target node. + :type target_key: str + :param edges: The edges to process. + :type edges: DefaultDict[str, list] + :param entity_collection_name: The name of the entity collection. + :type entity_collection_name: str + :param entity_edge_collection_name: The name of the entity edge collection. + :type entity_edge_collection_name: str + :param _: Placeholder for the edge definitions dictionary; presumably unused here. + :type _: DefaultDict[str, DefaultDict[str, set[str]]] + + :return: None + :rtype: None + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + + """ edges[entity_edge_collection_name].append( { "_key": edge_key, @@ -724,7 +885,29 @@ def _process_edge_as_type( _2: str, edge_definitions_dict: DefaultDict[str, DefaultDict[str, set[str]]], ) -> None: - """Processes a Graph Document Edge into ArangoDB based on its Edge Type.""" + """Processes a Graph Document Edge into ArangoDB based on its Edge Type. + + :param edge: The edge to process. + :type edge: Relationship + :param edge_str: The string representation of the edge. + :type edge_str: str + :param edge_key: The key of the edge. + :type edge_key: str + :param source_key: The key of the source node. + :type source_key: str + :param target_key: The key of the target node. + :type target_key: str + :param edges: The edges to process. + :type edges: DefaultDict[str, list] + :param edge_definitions_dict: The edge definitions dictionary. + :type edge_definitions_dict: DefaultDict[str, DefaultDict[str, set[str]]] + + :return: None + :rtype: None + :raises ArangoClientError: If the ArangoDB client cannot be created.
+ :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + """ source: Node = edge.source target: Node = edge.target @@ -753,7 +936,25 @@ def _get_node_key( entity_collection_name: str, process_node_fn: Any, ) -> str: - """Gets the key of a node and processes it if it doesn't exist.""" + """Gets the key of a node and processes it if it doesn't exist. + + :param node: The node to process. + :type node: Node + :param nodes: The nodes to process. + :type nodes: DefaultDict[str, list] + :param node_key_map: The node key map. + :type node_key_map: Dict[str, str] + :param entity_collection_name: The name of the entity collection. + :type entity_collection_name: str + :param process_node_fn: The function to process the node. + :type process_node_fn: Any + + :return: The key of the node. + :rtype: str + :raises ArangoClientError: If the ArangoDB client cannot be created. + :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + """ node.id = str(node.id) if node.id in node_key_map: return node_key_map[node.id] @@ -772,7 +973,25 @@ def _process_source( embedding_field: str, insertion_db: Database, ) -> str: - """Processes a Graph Document Source into ArangoDB.""" + """Processes a Graph Document Source into ArangoDB. + + :param source: The source to process. + :type source: Document + :param source_collection_name: The name of the source collection. + :type source_collection_name: str + :param source_embedding: The embedding of the source. + :type source_embedding: Union[list[float], None] + :param embedding_field: The field name to store the embedding. + :type embedding_field: str + :param insertion_db: The database to insert the source into. + :type insertion_db: Database + + :return: The key of the source. + :rtype: str + :raises ArangoClientError: If the ArangoDB client cannot be created. 
+ :raises ArangoServerError: If the ArangoDB server cannot be reached. + :raises ArangoCollectionError: If the collection cannot be created. + """ source_id = self._hash( source.id if source.id else source.page_content.encode("utf-8") ) @@ -792,7 +1011,16 @@ def _process_source( return source_id def _hash(self, value: Any) -> str: - """Applies the Farmhash hash function to a value.""" + """Applies the Farmhash hash function to a value. + + :param value: The value to hash. + :type value: Any + + :return: The hashed value. + :rtype: str + :raises ValueError: If the value is not a string or has no + string representation. + """ try: value_str = str(value) except Exception: @@ -807,6 +1035,13 @@ def _sanitize_collection_name(self, name: str) -> str: - Trims the name to 256 characters if it's too long. - Replaces invalid characters with underscores (_). - Ensures the name starts with a letter (prepends 'a' if needed). + + :param name: The name to sanitize. + :type name: str + + :return: The sanitized name. + :rtype: str + :raises ValueError: If the collection name is empty. """ if not name: raise ValueError("Collection name cannot be empty.") @@ -831,13 +1066,19 @@ def _sanitize_input(self, d: Any, list_limit: int, string_limit: int) -> Any: results, can occupy significant context space and detract from the LLM's performance by introducing unnecessary noise and cost. - Args: - d (Any): The input dictionary or list to sanitize. - list_limit (int): The limit for the number of elements in a list. - string_limit (int): The limit for the number of characters in a string. + :param d: The input dictionary or list to sanitize. + :type d: Any + :param list_limit: The limit for the number of elements in a list. + :type list_limit: int + :param string_limit: The limit for the number of characters in a string. + :type string_limit: int + + :return: The sanitized dictionary or list. + :rtype: Any + :raises ValueError: If the input is not a dictionary or list. 
+ :raises ValueError: If the list limit is less than 0. + :raises ValueError: If the string limit is less than 0. - Returns: - Any: The sanitized dictionary or list. """ if isinstance(d, dict):