airbytehq · topefolorunso · Oct 12, 2023 · Oct 26, 2023 · Oct 28, 2023 · Oct 30, 2023
diff --git a/airbyte-integrations/connectors/destination-vectara/.dockerignore b/airbyte-integrations/connectors/destination-vectara/.dockerignore
@@ -0,0 +1,5 @@
+# Exclude everything from the Docker build context by default...
+*
+# ...then re-include the Dockerfile and the paths it copies into the image.
+!Dockerfile
+!main.py
+!destination_vectara
+!setup.py
diff --git a/airbyte-integrations/connectors/destination-vectara/Dockerfile b/airbyte-integrations/connectors/destination-vectara/Dockerfile
@@ -0,0 +1,38 @@
+FROM python:3.9.11-alpine3.15 AS base
+
+# build and load all requirements
+FROM base AS builder
+WORKDIR /airbyte/integration_code
+
+# upgrade system packages and pip, then add timezone data and the build-base
+# toolchain; this stage is discarded, only /install and the UTC zoneinfo file
+# are copied out of it below
+RUN apk --no-cache upgrade \
+    && pip install --no-cache-dir --upgrade pip \
+    && apk --no-cache add tzdata build-base
+
+
+COPY setup.py ./
+# install necessary packages to a temporary folder
+RUN pip install --no-cache-dir --prefix=/install .
+
+# build a clean environment
+FROM base
+WORKDIR /airbyte/integration_code
+
+# copy all loaded and built libraries to a pure basic image
+COPY --from=builder /install /usr/local
+# add default timezone settings
+COPY --from=builder /usr/share/zoneinfo/Etc/UTC /etc/localtime
+RUN echo "Etc/UTC" > /etc/timezone
+
+# bash is installed for more convenient debugging.
+RUN apk --no-cache add bash
+
+# copy payload code only
+COPY main.py ./
+COPY destination_vectara ./destination_vectara
+
+# key=value form: the space-separated `ENV key value` syntax is deprecated
+ENV AIRBYTE_ENTRYPOINT="python /airbyte/integration_code/main.py"
+ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
+
+LABEL io.airbyte.version=0.1.0
+LABEL io.airbyte.name=airbyte/destination-vectara
diff --git a/airbyte-integrations/connectors/destination-vectara/README.md b/airbyte-integrations/connectors/destination-vectara/README.md
@@ -0,0 +1,123 @@
+# Vectara Destination
+
+This is the repository for the Vectara destination connector, written in Python.
+For information about how to use this connector within Airbyte, see [the documentation](https://docs.airbyte.com/integrations/destinations/vectara).
+
+## Local development
+
+### Prerequisites
+**To iterate on this connector, make sure to complete this prerequisites section.**
+
+#### Minimum Python version required `= 3.7.0`
+
+#### Build & Activate Virtual Environment and install dependencies
+From this connector directory, create a virtual environment:
+```
+python -m venv .venv
+```
+
+This will generate a virtualenv for this module in `.venv/`. Make sure this venv is active in your
+development environment of choice. To activate it from the terminal, run:
+```
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+If you are in an IDE, follow your IDE's instructions to activate the virtualenv.
+
+Note that while we are installing dependencies from `requirements.txt`, you should only edit `setup.py` for your dependencies. `requirements.txt` is
+used for editable installs (`pip install -e`) to pull in Python dependencies from the monorepo and will call `setup.py`.
+If this is mumbo jumbo to you, don't worry about it, just put your deps in `setup.py` but install using `pip install -r requirements.txt` and everything
+should work as you expect.
+
+#### Building via Gradle
+From the Airbyte repository root, run:
+```
+./gradlew :airbyte-integrations:connectors:destination-vectara:build
+```
+
+#### Create credentials
+**If you are a community contributor**, follow the instructions in the [documentation](https://docs.airbyte.com/integrations/destinations/vectara)
+to generate the necessary credentials. Then create a file `secrets/config.json` conforming to the `destination_vectara/spec.json` file.
+Note that the `secrets` directory is gitignored by default, so there is no danger of accidentally checking in sensitive information.
+See `integration_tests/sample_config.json` for a sample config file.
+
+**If you are an Airbyte core member**, copy the credentials in Lastpass under the secret name `destination vectara test creds`
+and place them into `secrets/config.json`.
+
+### Locally running the connector
+```
+python main.py spec
+python main.py check --config secrets/config.json
+# messages.jsonl is a file containing line-separated JSON representing AirbyteMessages
+cat messages.jsonl | python main.py write --config secrets/config.json --catalog integration_tests/configured_catalog.json
+```
+
+### Locally running the connector docker image
+
+#### Build
+First, make sure you build the latest Docker image:
+```
+docker build . -t airbyte/destination-vectara:dev
+```
+
+You can also build the connector image via Gradle:
+```
+./gradlew :airbyte-integrations:connectors:destination-vectara:airbyteDocker
+```
+When building via Gradle, the docker image name and tag, respectively, are the values of the `io.airbyte.name` and `io.airbyte.version` `LABEL`s in
+the Dockerfile.
+
+#### Run
+Then run any of the connector commands as follows:
+```
+docker run --rm airbyte/destination-vectara:dev spec
+docker run --rm -v $(pwd)/secrets:/secrets airbyte/destination-vectara:dev check --config /secrets/config.json
+# messages.jsonl is a file containing line-separated JSON representing AirbyteMessages
+cat messages.jsonl | docker run --rm -v $(pwd)/secrets:/secrets -v $(pwd)/integration_tests:/integration_tests airbyte/destination-vectara:dev write --config /secrets/config.json --catalog /integration_tests/configured_catalog.json
+```
+## Testing
+Make sure to familiarize yourself with [pytest test discovery](https://docs.pytest.org/en/latest/goodpractices.html#test-discovery) to know how your test files and methods should be named.
+First install test dependencies into your virtual environment:
+```
+pip install .[tests]
+```
+### Unit Tests
+To run unit tests locally, from the connector directory run:
+```
+python -m pytest unit_tests
+```
+
+### Integration Tests
+There are two types of integration tests: Acceptance Tests (Airbyte's test suite for all destination connectors) and custom integration tests (which are specific to this connector).
+#### Custom Integration tests
+Place custom tests inside `integration_tests/` folder, then, from the connector root, run
+```
+python -m pytest integration_tests
+```
+#### Acceptance Tests
+Coming soon.
+
+### Using gradle to run tests
+All commands should be run from airbyte project root.
+To run unit tests:
+```
+./gradlew :airbyte-integrations:connectors:destination-vectara:unitTest
+```
+To run acceptance and custom integration tests:
+```
+./gradlew :airbyte-integrations:connectors:destination-vectara:integrationTest
+```
+
+## Dependency Management
+All of your dependencies should go in `setup.py`, NOT `requirements.txt`. The requirements file is only used to connect internal Airbyte dependencies in the monorepo for local development.
+We split dependencies between two groups, dependencies that are:
+* required for your connector to work need to go to the `MAIN_REQUIREMENTS` list.
+* required for testing need to go to the `TEST_REQUIREMENTS` list.
+
+### Publishing a new version of the connector
+You've checked out the repo, implemented a million dollar feature, and you're ready to share your changes with the world. Now what?
+1. Make sure your changes are passing unit and integration tests.
+1. Bump the connector version in `Dockerfile` -- just increment the value of the `LABEL io.airbyte.version` appropriately (we use [SemVer](https://semver.org/)).
+1. Create a Pull Request.
+1. Pat yourself on the back for being an awesome contributor.
+1. Someone from Airbyte will take a look at your PR and iterate with you to merge it into master.
diff --git a/airbyte-integrations/connectors/destination-vectara/destination_vectara/__init__.py b/airbyte-integrations/connectors/destination-vectara/destination_vectara/__init__.py
@@ -0,0 +1,8 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+
+# Re-export the destination class so it can be imported from the package root.
+from .destination import DestinationVectara
+
+__all__ = ["DestinationVectara"]
diff --git a/airbyte-integrations/connectors/destination-vectara/destination_vectara/client.py b/airbyte-integrations/connectors/destination-vectara/destination_vectara/client.py
@@ -0,0 +1,185 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import datetime
+import json
+import requests
+import traceback
+import uuid
+
+from typing import Any, Mapping
+
+from destination_vectara.config import VectaraConfig
+
+
+
+# Name of the document-level metadata attribute that holds the stream name of
+# each indexed document; it is also registered as a corpus filter attribute.
+METADATA_STREAM_FIELD = "_ab_stream"
+# METADATA_RECORD_ID_FIELD = "_ab_record_id"
+
+class VectaraClient:
+
+    BASE_URL = "https://api.vectara.io/v1"
+
+    def __init__(self, config: VectaraConfig):
+        self.customer_id = config.customer_id
+        self.corpus_name = config.corpus_name
+        self.client_id = config.oauth2.client_id
+        self.client_secret = config.oauth2.client_secret
+        # self.corpus_id = config.corpus_id
+
+    def check(self):
+        try:
+            jwt_token = self._get_jwt_token()
+            if not jwt_token:
+                return "Unable to get JWT Token. Confirm your Client ID and Client Secret."
+
+            list_corpora_response = self._request(
+                endpoint="list-corpora",
+                data={
+                    "numResults": 100, 
+                    "filter": self.corpus_name
+                    }
+                )
+            possible_corpora_ids_names_map = {corpus.get("id"): corpus.get("name") for corpus in list_corpora_response.get("corpus") if corpus.get("name") == self.corpus_name}
+            if len(possible_corpora_ids_names_map) > 1:
+                return f"Multiple Corpora exist with name {self.corpus_name}"
+            if len(possible_corpora_ids_names_map) == 1:
+                self.corpus_id = list(possible_corpora_ids_names_map.keys())[0]
+            else:
+                create_corpus_response = self._request(
+                    endpoint="create-corpus",
+                    data={
+                        "corpus": {
+                            "name": self.corpus_name,
+                            "filterAttributes": [
+                                    {
+                                        "name": METADATA_STREAM_FIELD,
+                                        "indexed": True,
+                                        "type": "FILTER_ATTRIBUTE_TYPE__TEXT",
+                                        "level": "FILTER_ATTRIBUTE_LEVEL__DOCUMENT"
+                                    },
+                                    # {
+                                    #     "name": METADATA_RECORD_ID_FIELD,
+                                    #     "indexed": True,
+                                    #     "type": "FILTER_ATTRIBUTE_TYPE__TEXT",
+                                    #     "level": "FILTER_ATTRIBUTE_LEVEL__DOCUMENT"
+                                    # }
+                                ]
+                            }
+                        }
+                    )
+                self.corpus_id = create_corpus_response.get("corpusId")
+
+        except Exception as e:
+            return str(e) + "\n" + "".join(traceback.TracebackException.from_exception(e).format())
+
+    def _get_jwt_token(self):
+        """Connect to the server and get a JWT token."""
+        token_endpoint = f"https://vectara-prod-{self.customer_id}.auth.us-west-2.amazoncognito.com/oauth2/token"
+        headers = {
+            "Content-Type": "application/x-www-form-urlencoded",
+            }
+        data = {
+            "grant_type": "client_credentials",
+            "client_id": self.client_id,
+            "client_secret": self.client_secret
+        }
+
+        request_time = datetime.datetime.now().timestamp()
+        response = requests.request(method="POST", url=token_endpoint, headers=headers, data=data)
+        response_json = response.json()
+
+        self.jwt_token = response_json.get("access_token")
+        self.jwt_token_expires_ts = request_time + response_json.get("expires_in")
+        return self.jwt_token
+
+    def _request(
+        self, endpoint: str, http_method: str = "POST", params: Mapping[str, Any] = None, data: Mapping[str, Any] = None
+        ):
+
+        url = f"{self.BASE_URL}/{endpoint}"
+
+        current_ts = datetime.datetime.now().timestamp()
+        if self.jwt_token_expires_ts - current_ts <= 60:
+            self._get_jwt_token()
+
+        headers = {
+            "Content-Type": "application/json",
+            "Accept": "application/json", 
+            "Authorization": f"Bearer {self.jwt_token}",
+            "customer-id": self.customer_id
+            }
+
+        response = requests.request(method=http_method, url=url, headers=headers, params=params, data=json.dumps(data))
+        response.raise_for_status()
+        return response.json()
+
+    def _delete_doc_by_metadata(self, metadata_field_name, metadata_field_values):
+        document_ids = []
+        for value in metadata_field_values:
+            query_documents_response = self._request(
+                endpoint="query",
+                data= {
+                    "query": [
+                            {
+                                "query": "", 
+                                "numResults": 100,
+                                "corpusKey": [
+                                    {
+                                    "customerId": self.customer_id,
+                                    "corpusId": self.corpus_id,
+                                    "metadataFilter": f"doc.{metadata_field_name} = '{value}'"
+                                    }
+                                ]
+                            }
+                        ]
+                    }
+                )
+            document_ids.extend([document.get("id") for document in query_documents_response.get("responseSet").get("document")])
+        documents_not_deleted = []
+        for document_id in document_ids:
+            delete_document_response = self._request(
+                endpoint="delete-doc",
+                data={
+                    "customerId": self.customer_id, 
+                    "corpusId": self.corpus_id,
+                    "documentId": document_id
+                    }
+                )
+            # TODO whether this is needed?
+            if delete_document_response:
+                documents_not_deleted.append(document_id)
+        return documents_not_deleted
+
+    def _index_documents(self, documents):
+        for stream_name, document_content in documents:            
+            document_metadata = self._normalize({METADATA_STREAM_FIELD: stream_name})
+            index_document_response = self._request(
+                endpoint="index",
+                data={
+                        "customerId": self.customer_id, 
+                        "corpusId": self.corpus_id,
+                        "document": {
+                            "documentId": uuid.uuid4().int,
+                            "metadataJson": json.dumps(document_metadata),
+                            "section": [
+                                {
+                                    "title": "Content",
+                                    "text": document_content,
+                                }
+                            ]
+                        }
+                    }
+                )
+            assert index_document_response.get("status").get("code") == "OK", index_document_response.get("status").get("statusDetail")
+
+    def _normalize(self, metadata: dict) -> dict:
+        result = {}
+        for key, value in metadata.items():
+            if isinstance(value, (str, int, float, bool)):
+                result[key] = value
+            else:
+                # JSON encode all other types
+                result[key] = json.dumps(value)
+        return result