From 13faf650bc0c68521c1faf273901e00ef875f5a7 Mon Sep 17 00:00:00 2001
From: Kirill <58888049+KirillKukharev@users.noreply.github.com>
Date: Tue, 18 Jun 2024 09:11:17 +0300
Subject: [PATCH] Add YandexGPT Encoder (#1)

* add yandexgpt encoder

* add unit tests for YandexGPT

* Change encoder type name
---
 docs/encoders/yandex.ipynb           | 167 +++++++++++++++++++++++++++
 semantic_router/encoders/__init__.py |   4 +
 semantic_router/encoders/yandex.py   | 137 ++++++++++++++++++++++
 semantic_router/schema.py            |   1 +
 semantic_router/utils/defaults.py    |   5 +
 tests/unit/encoders/test_yandex.py   |  69 +++++++++++
 6 files changed, 383 insertions(+)
 create mode 100644 docs/encoders/yandex.ipynb
 create mode 100644 semantic_router/encoders/yandex.py
 create mode 100644 tests/unit/encoders/test_yandex.py

diff --git a/docs/encoders/yandex.ipynb b/docs/encoders/yandex.ipynb
new file mode 100644
index 00000000..d0ec9150
--- /dev/null
+++ b/docs/encoders/yandex.ipynb
@@ -0,0 +1,167 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# Using YandexGPTEncoder",
+   "id": "35d3b3544b0b2bf5"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Getting Started",
+   "id": "8a04e30ad27664cb"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "We start by installing semantic-router.",
+   "id": "e15f40cfbd181277"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "!pip install -qU semantic-router",
+   "id": "a22753e184585d66"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "We start by defining a dictionary mapping routes to example phrases that should trigger those routes.",
+   "id": "c6ab1caebff2d748"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "from semantic_router import Route\n",
+    "\n",
+    "politics = Route(\n",
+    "    name=\"politics\",\n",
+    "    utterances=[\n",
+    "        \"isn't politics the best thing ever\",\n",
+    "        \"why don't you tell me about your political opinions\",\n",
+    "        \"don't you just love the president\",\n",
+    "        \"don't you just hate the president\",\n",
+    "        \"they're going to destroy this country!\",\n",
+    "        \"they will save the country!\",\n",
+    "    ],\n",
+    ")"
+   ],
+   "id": "1387c6a6b2399cbb"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "Let's define another for good measure:",
+   "id": "d14c31bb9ba0a2cf"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "chitchat = Route(\n",
+    "    name=\"chitchat\",\n",
+    "    utterances=[\n",
+    "        \"how's the weather today?\",\n",
+    "        \"how are things going?\",\n",
+    "        \"lovely weather today\",\n",
+    "        \"the weather is horrendous\",\n",
+    "        \"let's go to the chippy\",\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "routes = [politics, chitchat]"
+   ],
+   "id": "9433a9a4d8420d4a"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "Now we initialize our embedding model.",
+   "id": "ebb87de5d9181b90"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "from semantic_router.encoders import YandexGPTEncoder\n",
+    "\n",
+    "key = \"your-api-key\"\n",
+    "catalog_id = \"your-catalog-id\"\n",
+    "\n",
+    "encoder = YandexGPTEncoder(api_key=key, catalog_id=catalog_id)"
+   ],
+   "id": "954563c1102f8f5d"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "Now we define the RouteLayer. When called, the route layer will consume text (a query) and output the category (Route) it belongs to — to initialize a RouteLayer we need our encoder model and a list of routes.",
+   "id": "580ba91ad0dce419"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "from semantic_router.layer import RouteLayer\n",
+    "\n",
+    "rl = RouteLayer(encoder=encoder, routes=routes)"
+   ],
+   "id": "7db9e2ea9afdf0ec"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "Now we can test it:",
+   "id": "6b456a5153ec37e7"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "rl(\"don't you love politics?\")",
+   "id": "c552767d54a45455"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "rl(\"how's the weather today?\")",
+   "id": "b5e95b8cd6b009c3"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/semantic_router/encoders/__init__.py b/semantic_router/encoders/__init__.py
index a1026240..c1b6934b 100644
--- a/semantic_router/encoders/__init__.py
+++ b/semantic_router/encoders/__init__.py
@@ -14,6 +14,7 @@
 from semantic_router.encoders.tfidf import TfidfEncoder
 from semantic_router.encoders.vit import VitEncoder
 from semantic_router.encoders.zure import AzureOpenAIEncoder
+from semantic_router.encoders.yandex import YandexGPTEncoder
 from semantic_router.schema import EncoderType
 
 __all__ = [
@@ -31,6 +32,7 @@
     "CLIPEncoder",
     "GoogleEncoder",
     "BedrockEncoder",
+    "YandexGPTEncoder",
 ]
 
 
@@ -71,6 +73,8 @@ def __init__(self, type: str, name: Optional[str]):
             self.model = GoogleEncoder(name=name)
         elif self.type == EncoderType.BEDROCK:
             self.model = BedrockEncoder(name=name)  # type: ignore
+        elif self.type == EncoderType.YANDEX:
+            self.model = YandexGPTEncoder(name=name)
         else:
             raise ValueError(f"Encoder type '{type}' not supported")
 
diff --git a/semantic_router/encoders/yandex.py b/semantic_router/encoders/yandex.py
new file mode 100644
index 00000000..48a59439
--- /dev/null
+++ b/semantic_router/encoders/yandex.py
@@ -0,0 +1,137 @@
+"""
+This module provides the YandexGPTEncoder class for generating embeddings using YandexGPT.
+
+The YandexGPTEncoder class is a subclass of BaseEncoder and calls the YandexGPT text embedding
+REST API to generate embeddings for given documents. It requires a YandexGPT API key and a catalog ID.
+
+Example usage:
+
+    from semantic_router.encoders import YandexGPTEncoder
+
+    encoder = YandexGPTEncoder(api_key="your-api-key", catalog_id="your-catalog-id")
+    embeddings = encoder(["document1", "document2"])
+
+Classes:
+    YandexGPTEncoder: A class for generating embeddings using YandexGPT.
+"""
+
+import os
+from time import sleep
+from typing import Any, List, Optional
+
+import requests
+from semantic_router.encoders import BaseEncoder
+from semantic_router.utils.defaults import EncoderDefault
+
+
+class YandexGPTEncoder(BaseEncoder):
+    """YandexGPTEncoder class for generating embeddings using YandexGPT.
+
+    Attributes:
+        client: A dict holding the API key and the embedding model URI.
+        type: The type of the encoder, which matches EncoderType.YANDEX.
+    """
+    client: Optional[Any] = None
+    type: str = "yandex"
+
+    def __init__(
+        self,
+        name: Optional[str] = None,
+        api_key: Optional[str] = None,
+        catalog_id: Optional[str] = None,
+        score_threshold: float = 0.75):
+        """Initializes the YandexGPTEncoder.
+
+        Args:
+            name: The encoder name stored on the instance. If not provided, the
+                default from EncoderDefault is used. The request model URI is
+                built from catalog_id only and does not depend on it.
+            api_key: The YandexGPT API key.
+                If not provided, it will be retrieved from the YANDEX_GPT_KEY
+                environment variable.
+            catalog_id: The catalog ID; falls back to the YANDEX_CATALOG_ID variable.
+
+        Raises:
+            ValueError: If the YandexGPT API key or catalog ID is not provided.
+        """
+        if name is None:
+            name = EncoderDefault.YANDEX.value["embedding_model"]
+
+        super().__init__(name=name, score_threshold=score_threshold)
+
+        self.client = self._initialize_client(api_key, catalog_id)
+
+    def _initialize_client(self, api_key, catalog_id):
+        """Builds the request configuration used as the YandexGPT "client".
+
+        Args:
+            api_key: The YandexGPT API key.
+            catalog_id: The catalog ID the embedding model URI is built from.
+
+        Returns:
+            A dict with the keys "api_key" and "model_Uri".
+
+        Raises:
+            ValueError: If the YandexGPT API key or catalog ID is missing from
+                both the arguments and the environment.
+        """
+
+        api_key = api_key or os.getenv("YANDEX_GPT_KEY")
+        catalog_id = catalog_id or os.getenv("YANDEX_CATALOG_ID")
+        if not api_key:
+            raise ValueError("YandexGPT API key cannot be 'None'.")
+        if not catalog_id:
+            raise ValueError("YandexGPT catalog ID cannot be 'None'.")
+        # The "client" is only request configuration: the API is called over
+        # plain HTTPS in __call__, so there is no SDK object to construct here.
+        return {
+            "api_key": api_key,
+            "model_Uri": f"emb://{catalog_id}/text-search-doc/latest",
+        }
+
+    def _get_headers(self):
+        """Returns the headers for the YandexGPT API request."""
+        return {
+            "Content-Type": "application/json",
+            "Authorization": f"Api-Key {self.client['api_key']}",
+            "x-data-logging-enabled": "false"
+        }
+
+    def __call__(self, docs: List[str]) -> List[List[float]]:
+        """Generates embeddings for the given documents.
+
+        Args:
+            docs: A list of strings representing the documents to embed.
+
+        Returns:
+            A list of lists, where each inner list contains the embedding values for a
+            document.
+
+        Raises:
+            ValueError: If the YandexGPT client is not initialized or if the
+                API call fails.
+        """
+        if self.client is None:
+            raise ValueError("YandexGPT client is not initialized.")
+
+        url = "https://llm.api.cloud.yandex.net/foundationModels/v1/textEmbedding"
+        headers = self._get_headers()  # identical for every document
+        embeddings = []
+        for doc in docs:
+            data = {"modelUri": self.client["model_Uri"], "text": doc}
+            try:
+                sleep(0.2)  # Ensure compliance with rate limits
+                # A timeout keeps a stalled connection from hanging the caller.
+                response = requests.post(
+                    url, json=data, headers=headers, timeout=30
+                )
+                if response.status_code == 200:
+                    embeddings.append(response.json()["embedding"])
+                else:
+                    raise ValueError(f"Failed to get embedding for document: {doc}")
+            except Exception as e:
+                raise ValueError(f"YandexGPT API call failed. Error: {e}") from e
+
+        return embeddings
+
+
diff --git a/semantic_router/schema.py b/semantic_router/schema.py
index b444c988..cd1669ef 100644
--- a/semantic_router/schema.py
+++ b/semantic_router/schema.py
@@ -16,6 +16,7 @@ class EncoderType(Enum):
     CLIP = "clip"
     GOOGLE = "google"
     BEDROCK = "bedrock"
+    YANDEX = "yandex"
 
 
 class EncoderInfo(BaseModel):
diff --git a/semantic_router/utils/defaults.py b/semantic_router/utils/defaults.py
index 75331c06..f21ad1e7 100644
--- a/semantic_router/utils/defaults.py
+++ b/semantic_router/utils/defaults.py
@@ -36,3 +36,8 @@ class EncoderDefault(Enum):
             "BEDROCK_EMBEDDING_MODEL", "amazon.titan-embed-image-v1"
         )
     }
+    YANDEX = {
+        "embedding_model": os.getenv(
+            "YANDEX_EMBEDDING_MODEL", "general:embedding"
+        )
+    }
diff --git a/tests/unit/encoders/test_yandex.py b/tests/unit/encoders/test_yandex.py
new file mode 100644
index 00000000..0409afdf
--- /dev/null
+++ b/tests/unit/encoders/test_yandex.py
@@ -0,0 +1,69 @@
+import pytest
+from unittest.mock import patch, MagicMock, Mock
+from semantic_router.encoders import YandexGPTEncoder
+
+@pytest.fixture
+def yandexgpt_encoder(mocker):
+    mocker.patch("requests.post")
+    mocker.patch("semantic_router.encoders.yandex.sleep")  # skip the rate-limit pause
+    return YandexGPTEncoder(api_key="test_api_key", catalog_id="test_catalog_id")
+
+class TestYandexGPTEncoder:
+
+    def test_yandex_encoder_init_with_all_params(self):
+        encoder = YandexGPTEncoder(api_key="api-key", catalog_id="catalog-id")
+        assert encoder.client is not None
+        assert encoder.client["api_key"] == "api-key"
+        assert encoder.client["model_Uri"] == "emb://catalog-id/text-search-doc/latest"
+
+    def test_yandex_encoder_init_no_api_key(self, mocker):
+        mocker.patch("os.getenv", return_value=None)
+        with pytest.raises(ValueError) as _:
+            YandexGPTEncoder(catalog_id="test_catalog_id")
+
+    def test_yandex_encoder_init_missing_catalog_id(self, mocker):
+        mocker.patch("os.getenv", return_value=None)  # ignore ambient YANDEX_CATALOG_ID
+        with pytest.raises(ValueError, match="YandexGPT catalog ID cannot be 'None'."):
+            YandexGPTEncoder(api_key="api-key", catalog_id=None)
+
+    def test_yandex_encoder_call_uninitialized_client(self, yandexgpt_encoder):
+        yandexgpt_encoder.client = None
+        with pytest.raises(ValueError) as e:
+            yandexgpt_encoder(["test document"])
+        assert "YandexGPT client is not initialized." in str(e.value)
+
+    def test_yandex_encoder_call_success(self, yandexgpt_encoder, mocker):
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {"embedding": [0.1, 0.2]}
+        mocker.patch("requests.post", return_value=mock_response)
+        embeddings = yandexgpt_encoder(["test document"])
+        assert embeddings == [[0.1, 0.2]]
+
+    def test_yandex_encoder_call_failure(self, yandexgpt_encoder, mocker):
+        mock_response = Mock()
+        mock_response.status_code = 500
+        mocker.patch("requests.post", return_value=mock_response)
+        with pytest.raises(ValueError) as e:
+            yandexgpt_encoder(["test document"])
+        assert "Failed to get embedding for document: test document" in str(e.value)
+
+    def test_yandex_encoder_call_exception(self, yandexgpt_encoder, mocker):
+        mocker.patch("requests.post", side_effect=Exception("API call error"))
+        with pytest.raises(ValueError) as e:
+            yandexgpt_encoder(["test document"])
+        assert "YandexGPT API call failed. Error: API call error" in str(e.value)
+
+    @patch('requests.post')
+    def test_embedding_generation_success(self, mock_post, yandexgpt_encoder):
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {"embedding": [0.1, 0.2, 0.3]}
+        mock_post.return_value = mock_response
+
+        docs = ["document1", "document2"]
+        embeddings = yandexgpt_encoder(docs)
+
+        assert len(embeddings) == 2
+        assert embeddings[0] == [0.1, 0.2, 0.3]
+        assert embeddings[1] == [0.1, 0.2, 0.3]