From b6b5ceb382578cd6f3bfb72abbd9ad20f2fc5fa9 Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Mon, 4 Dec 2023 11:58:52 -0800
Subject: [PATCH] Harrison/embass (#14242)

Co-authored-by: Julius Lipp
---
 .../document_loaders/embaas.ipynb | 167 ------------
 .../langchain/document_loaders/__init__.py | 3 -
 .../langchain/document_loaders/embaas.py | 244 ------------------
 .../document_loaders/test_embaas.py | 59 -----
 .../document_loaders/test_imports.py | 2 -
 5 files changed, 475 deletions(-)
 delete mode 100644 docs/docs/integrations/document_loaders/embaas.ipynb
 delete mode 100644 libs/langchain/langchain/document_loaders/embaas.py
 delete mode 100644 libs/langchain/tests/integration_tests/document_loaders/test_embaas.py

diff --git a/docs/docs/integrations/document_loaders/embaas.ipynb b/docs/docs/integrations/document_loaders/embaas.ipynb
deleted file mode 100644
index 26129822674ced..00000000000000
--- a/docs/docs/integrations/document_loaders/embaas.ipynb
+++ /dev/null
@@ -1,167 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
-   "source": [
-    "# Embaas\n",
-    "[embaas](https://embaas.io) is a fully managed NLP API service that offers features like embedding generation, document text extraction, document to embeddings and more. You can choose a [variety of pre-trained models](https://embaas.io/docs/models/embeddings).\n",
-    "\n",
-    "### Prerequisites\n",
-    "Create a free embaas account at [https://embaas.io/register](https://embaas.io/register) and generate an [API key](https://embaas.io/dashboard/api-keys)\n",
-    "\n",
-    "### Document Text Extraction API\n",
-    "The document text extraction API allows you to extract the text from a given document. The API supports a variety of document formats, including PDF, mp3, mp4 and more. For a full list of supported formats, check out the API docs (link below)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "# Set API key\n",
-    "embaas_api_key = \"YOUR_API_KEY\"\n",
-    "# or set environment variable\n",
-    "os.environ[\"EMBAAS_API_KEY\"] = \"YOUR_API_KEY\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
-   "source": [
-    "#### Using a blob (bytes)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders.blob_loaders import Blob\n",
-    "from langchain.document_loaders.embaas import EmbaasBlobLoader"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "blob_loader = EmbaasBlobLoader()\n",
-    "blob = Blob.from_path(\"example.pdf\")\n",
-    "documents = blob_loader.load(blob)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-06-12T22:19:48.380467Z",
-     "start_time": "2023-06-12T22:19:48.366886Z"
-    },
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "# You can also directly create embeddings with your preferred embeddings model\n",
-    "blob_loader = EmbaasBlobLoader(params={\"model\": \"e5-large-v2\", \"should_embed\": True})\n",
-    "blob = Blob.from_path(\"example.pdf\")\n",
-    "documents = blob_loader.load(blob)\n",
-    "\n",
-    "print(documents[0][\"metadata\"][\"embedding\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
-   "source": [
-    "#### Using a file"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders.embaas import EmbaasLoader"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "file_loader = EmbaasLoader(file_path=\"example.pdf\")\n",
-    "documents = file_loader.load()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-06-12T22:24:31.894665Z",
-     "start_time": "2023-06-12T22:24:31.880857Z"
-    },
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "# Disable automatic text splitting\n",
-    "file_loader = EmbaasLoader(file_path=\"example.mp3\", params={\"should_chunk\": False})\n",
-    "documents = file_loader.load()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
-   "source": [
-    "For more detailed information about the embaas document text extraction API, please refer to [the official embaas API documentation](https://embaas.io/api-reference)."
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 2
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py
index 119496f9c66e59..d52d3b955b072a 100644
--- a/libs/langchain/langchain/document_loaders/__init__.py
+++ b/libs/langchain/langchain/document_loaders/__init__.py
@@ -77,7 +77,6 @@
     OutlookMessageLoader,
     UnstructuredEmailLoader,
 )
-from langchain.document_loaders.embaas import EmbaasBlobLoader, EmbaasLoader
 from langchain.document_loaders.epub import UnstructuredEPubLoader
 from langchain.document_loaders.etherscan import EtherscanLoader
 from langchain.document_loaders.evernote import EverNoteLoader
@@ -259,8 +258,6 @@
     "Docx2txtLoader",
     "DropboxLoader",
     "DuckDBLoader",
-    "EmbaasBlobLoader",
-    "EmbaasLoader",
     "EtherscanLoader",
     "EverNoteLoader",
     "FacebookChatLoader",
diff --git a/libs/langchain/langchain/document_loaders/embaas.py b/libs/langchain/langchain/document_loaders/embaas.py
deleted file mode 100644
index 6f4fc800a12661..00000000000000
--- a/libs/langchain/langchain/document_loaders/embaas.py
+++ /dev/null
@@ -1,244 +0,0 @@
-import base64
-import warnings
-from typing import Any, Dict, Iterator, List, Optional
-
-import requests
-from langchain_core.documents import Document
-from langchain_core.pydantic_v1 import BaseModel, root_validator, validator
-from typing_extensions import NotRequired, TypedDict
-
-from langchain.document_loaders.base import BaseBlobParser, BaseLoader
-from langchain.document_loaders.blob_loaders import Blob
-from langchain.text_splitter import TextSplitter
-from langchain.utils import get_from_dict_or_env
-
-EMBAAS_DOC_API_URL = "https://api.embaas.io/v1/document/extract-text/bytes/"
-
-
-class EmbaasDocumentExtractionParameters(TypedDict):
-    """Parameters for the embaas document extraction API."""
-
-    mime_type: NotRequired[str]
-    """The mime type of the document."""
-    file_extension: NotRequired[str]
-    """The file extension of the document."""
-    file_name: NotRequired[str]
-    """The file name of the document."""
-
-    should_chunk: NotRequired[bool]
-    """Whether to chunk the document into pages."""
-    chunk_size: NotRequired[int]
-    """The maximum size of the text chunks."""
-    chunk_overlap: NotRequired[int]
-    """The maximum overlap allowed between chunks."""
-    chunk_splitter: NotRequired[str]
-    """The text splitter class name for creating chunks."""
-    separators: NotRequired[List[str]]
-    """The separators for chunks."""
-
-    should_embed: NotRequired[bool]
-    """Whether to create embeddings for the document in the response."""
-    model: NotRequired[str]
-    """The model to pass to the Embaas document extraction API."""
-    instruction: NotRequired[str]
-    """The instruction to pass to the Embaas document extraction API."""
-
-
-class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):
-    """Payload for the Embaas document extraction API."""
-
-    bytes: str
-    """The base64 encoded bytes of the document to extract text from."""
-
-
-class BaseEmbaasLoader(BaseModel):
-    """Base loader for `Embaas` document extraction API."""
-
-    embaas_api_key: Optional[str] = None
-    """The API key for the Embaas document extraction API."""
-    api_url: str = EMBAAS_DOC_API_URL
-    """The URL of the Embaas document extraction API."""
-    params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
-    """Additional parameters to pass to the Embaas document extraction API."""
-
-    @root_validator(pre=True)
-    def validate_environment(cls, values: Dict) -> Dict:
-        """Validate that api key and python package exists in environment."""
-        embaas_api_key = get_from_dict_or_env(
-            values, "embaas_api_key", "EMBAAS_API_KEY"
-        )
-        values["embaas_api_key"] = embaas_api_key
-        return values
-
-
-class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
-    """Load `Embaas` blob.
-
-    To use, you should have the
-    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
-    it as a named parameter to the constructor.
-
-    Example:
-        .. code-block:: python
-
-            # Default parsing
-            from langchain.document_loaders.embaas import EmbaasBlobLoader
-            loader = EmbaasBlobLoader()
-            blob = Blob.from_path(path="example.mp3")
-            documents = loader.parse(blob=blob)
-
-            # Custom api parameters (create embeddings automatically)
-            from langchain.document_loaders.embaas import EmbaasBlobLoader
-            loader = EmbaasBlobLoader(
-                params={
-                    "should_embed": True,
-                    "model": "e5-large-v2",
-                    "chunk_size": 256,
-                    "chunk_splitter": "CharacterTextSplitter"
-                }
-            )
-            blob = Blob.from_path(path="example.pdf")
-            documents = loader.parse(blob=blob)
-    """
-
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
-        """Parses the blob lazily.
-
-        Args:
-            blob: The blob to parse.
-        """
-        yield from self._get_documents(blob=blob)
-
-    @staticmethod
-    def _api_response_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
-        """Convert the API response to a list of documents."""
-        docs = []
-        for chunk in chunks:
-            metadata = chunk["metadata"]
-            if chunk.get("embedding", None) is not None:
-                metadata["embedding"] = chunk["embedding"]
-            doc = Document(page_content=chunk["text"], metadata=metadata)
-            docs.append(doc)
-
-        return docs
-
-    def _generate_payload(self, blob: Blob) -> EmbaasDocumentExtractionPayload:
-        """Generates payload for the API request."""
-        base64_byte_str = base64.b64encode(blob.as_bytes()).decode()
-        payload: EmbaasDocumentExtractionPayload = EmbaasDocumentExtractionPayload(
-            bytes=base64_byte_str,
-            # Workaround for mypy issue: https://github.com/python/mypy/issues/9408
-            # type: ignore
-            **self.params,
-        )
-
-        if blob.mimetype is not None and payload.get("mime_type", None) is None:
-            payload["mime_type"] = blob.mimetype
-
-        return payload
-
-    def _handle_request(
-        self, payload: EmbaasDocumentExtractionPayload
-    ) -> List[Document]:
-        """Sends a request to the embaas API and handles the response."""
-        headers = {
-            "Authorization": f"Bearer {self.embaas_api_key}",
-            "Content-Type": "application/json",
-        }
-
-        response = requests.post(self.api_url, headers=headers, json=payload)
-        response.raise_for_status()
-
-        parsed_response = response.json()
-        return EmbaasBlobLoader._api_response_to_documents(
-            chunks=parsed_response["data"]["chunks"]
-        )
-
-    def _get_documents(self, blob: Blob) -> Iterator[Document]:
-        """Get the documents from the blob."""
-        payload = self._generate_payload(blob=blob)
-
-        try:
-            documents = self._handle_request(payload=payload)
-        except requests.exceptions.RequestException as e:
-            if e.response is None or not e.response.text:
-                raise ValueError(
-                    f"Error raised by Embaas document text extraction API: {e}"
-                )
-
-            parsed_response = e.response.json()
-            if "message" in parsed_response:
-                raise ValueError(
-                    f"Validation Error raised by Embaas document text extraction API:"
-                    f" {parsed_response['message']}"
-                )
-            raise
-
-        yield from documents
-
-
-class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
-    """Load from `Embaas`.
-
-    To use, you should have the
-    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
-    it as a named parameter to the constructor.
-
-    Example:
-        .. code-block:: python

-            # Default parsing
-            from langchain.document_loaders.embaas import EmbaasLoader
-            loader = EmbaasLoader(file_path="example.mp3")
-            documents = loader.load()
-
-            # Custom api parameters (create embeddings automatically)
-            from langchain.document_loaders.embaas import EmbaasBlobLoader
-            loader = EmbaasBlobLoader(
-                file_path="example.pdf",
-                params={
-                    "should_embed": True,
-                    "model": "e5-large-v2",
-                    "chunk_size": 256,
-                    "chunk_splitter": "CharacterTextSplitter"
-                }
-            )
-            documents = loader.load()
-    """
-
-    file_path: str
-    """The path to the file to load."""
-    blob_loader: Optional[EmbaasBlobLoader]
-    """The blob loader to use. If not provided, a default one will be created."""
-
-    @validator("blob_loader", always=True)
-    def validate_blob_loader(
-        cls, v: EmbaasBlobLoader, values: Dict
-    ) -> EmbaasBlobLoader:
-        return v or EmbaasBlobLoader(
-            embaas_api_key=values["embaas_api_key"],
-            api_url=values["api_url"],
-            params=values["params"],
-        )
-
-    def lazy_load(self) -> Iterator[Document]:
-        """Load the documents from the file path lazily."""
-        blob = Blob.from_path(path=self.file_path)
-
-        assert self.blob_loader is not None
-        # Should never be None, but mypy doesn't know that.
-        yield from self.blob_loader.lazy_parse(blob=blob)
-
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
-    def load_and_split(
-        self, text_splitter: Optional[TextSplitter] = None
-    ) -> List[Document]:
-        if self.params.get("should_embed", False):
-            warnings.warn(
-                "Embeddings are not supported with load_and_split."
-                " Use the API splitter to properly generate embeddings."
-                " For more information see embaas.io docs."
-            )
-        return super().load_and_split(text_splitter=text_splitter)
diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_embaas.py b/libs/langchain/tests/integration_tests/document_loaders/test_embaas.py
deleted file mode 100644
index 2170a143c66acd..00000000000000
--- a/libs/langchain/tests/integration_tests/document_loaders/test_embaas.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from typing import Any
-from unittest.mock import MagicMock, patch
-
-import responses
-
-from langchain.document_loaders import EmbaasBlobLoader, EmbaasLoader
-from langchain.document_loaders.blob_loaders import Blob
-from langchain.document_loaders.embaas import EMBAAS_DOC_API_URL
-
-
-@responses.activate
-def test_handle_request() -> None:
-    responses.add(
-        responses.POST,
-        EMBAAS_DOC_API_URL,
-        json={
-            "data": {
-                "chunks": [
-                    {
-                        "text": "Hello",
-                        "metadata": {"start_page": 1, "end_page": 2},
-                        "embeddings": [0.0],
-                    }
-                ]
-            }
-        },
-        status=200,
-    )
-
-    loader = EmbaasBlobLoader(embaas_api_key="api_key", params={"should_embed": True})
-    documents = loader.parse(blob=Blob.from_data(data="Hello"))
-    assert len(documents) == 1
-    assert documents[0].page_content == "Hello"
-    assert documents[0].metadata["start_page"] == 1
-    assert documents[0].metadata["end_page"] == 2
-    assert documents[0].metadata["embeddings"] == [0.0]
-
-
-@responses.activate
-def test_handle_request_exception() -> None:
-    responses.add(
-        responses.POST,
-        EMBAAS_DOC_API_URL,
-        json={"message": "Invalid request"},
-        status=400,
-    )
-    loader = EmbaasBlobLoader(embaas_api_key="api_key")
-    try:
-        loader.parse(blob=Blob.from_data(data="Hello"))
-    except Exception as e:
-        assert "Invalid request" in str(e)
-
-
-@patch.object(EmbaasBlobLoader, "_handle_request")
-def test_load(mock_handle_request: Any) -> None:
-    mock_handle_request.return_value = [MagicMock()]
-    loader = EmbaasLoader(file_path="test_embaas.py", embaas_api_key="api_key")
-    documents = loader.load()
-    assert len(documents) == 1
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_imports.py b/libs/langchain/tests/unit_tests/document_loaders/test_imports.py
index db754275234bab..5da2e800ab87bd 100644
--- a/libs/langchain/tests/unit_tests/document_loaders/test_imports.py
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_imports.py
@@ -52,8 +52,6 @@
     "Docx2txtLoader",
     "DropboxLoader",
     "DuckDBLoader",
-    "EmbaasBlobLoader",
-    "EmbaasLoader",
    "EtherscanLoader",
     "EverNoteLoader",
     "FacebookChatLoader",