Skip to content

Commit

Permalink
python-connector-base: add CDK system dependencies (#31929)
Browse files Browse the repository at this point in the history
  • Loading branch information
alafanechere committed Oct 31, 2023
1 parent 64a756d commit deef5ee
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 6 deletions.
8 changes: 6 additions & 2 deletions airbyte-ci/connectors/base_images/README.md
Expand Up @@ -2,11 +2,12 @@

This Python package contains the base images used by Airbyte connectors.
It is intended to be used as a Python library.
Our connector build pipeline ([`airbyte-ci`](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md#L1)) uses the base image declared in this package.
Our connector build pipeline ([`airbyte-ci`](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md#L1)) uses this library to build the connector images.
Our base images are declared in code, using the [Dagger Python SDK](https://dagger-io.readthedocs.io/en/sdk-python-v0.6.4/).

- [Python base image code declaration](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/base_images/base_images/python/bases.py)
- ~Java base image code declaration~ TODO
- ~Java base image code declaration~ *TODO*


## Where are the Dockerfiles?
Our base images are not declared using Dockerfiles.
Expand All @@ -26,6 +27,8 @@ ENV POETRY_VIRTUALENVS_IN_PROJECT=false
ENV POETRY_NO_INTERACTION=1
RUN pip install poetry==1.6.1
RUN sh -c apt update && apt-get install -y socat=1.7.4.4-2
RUN sh -c apt-get update && apt-get install -y tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1
RUN mkdir /usr/share/nltk_data
```


Expand All @@ -37,6 +40,7 @@ RUN sh -c apt update && apt-get install -y socat=1.7.4.4-2

| Version | Published | Docker Image Address | Changelog |
|---------|-----------|--------------|-----------|
| 1.2.0 || docker.io/airbyte/python-connector-base:1.2.0@sha256:c22a9d97464b69d6ef01898edf3f8612dc11614f05a84984451dde195f337db9 | Add CDK system dependencies: nltk data, tesseract, poppler. |
| 1.1.0 || docker.io/airbyte/python-connector-base:1.1.0@sha256:bd98f6505c6764b1b5f99d3aedc23dfc9e9af631a62533f60eb32b1d3dbab20c | Install socat |
| 1.0.0 || docker.io/airbyte/python-connector-base:1.0.0@sha256:dd17e347fbda94f7c3abff539be298a65af2d7fc27a307d89297df1081a45c27 | Initial release: based on Python 3.9.18, on slim-bookworm system, with pip==23.2.1 and poetry==1.6.1 |

Expand Down
62 changes: 59 additions & 3 deletions airbyte-ci/connectors/base_images/base_images/python/bases.py
Expand Up @@ -4,7 +4,7 @@

from __future__ import annotations

from typing import Final
from typing import Callable, Final

import dagger
from base_images import bases, published_image
Expand All @@ -17,8 +17,62 @@ class AirbytePythonConnectorBaseImage(bases.AirbyteConnectorBaseImage):

root_image: Final[published_image.PublishedImage] = PYTHON_3_9_18
repository: Final[str] = "airbyte/python-connector-base"

# Name of the pip cache.
# NOTE(review): not referenced in the visible part of this file; presumably a dagger cache volume name, confirm against callers.
pip_cache_name: Final[str] = "pip-cache"
# Directory of the image under which the nltk data is stored.
nltk_data_path: Final[str] = "/usr/share/nltk_data"
# nltk data archives to download, keyed by the sub-folder of nltk_data_path they are unzipped into.
# Each URL embeds a nltk_data commit hash, presumably to keep the downloaded content stable between builds.
# NOTE(review): the attribute is spelled "ntlk" (not "nltk"); install_cdk_system_dependencies reads it under this spelling.
ntlk_data = {
"tokenizers": {"https://github.com/nltk/nltk_data/raw/5db857e6f7df11eabb5e5665836db9ec8df07e28/packages/tokenizers/punkt.zip"},
"taggers": {
"https://github.com/nltk/nltk_data/raw/5db857e6f7df11eabb5e5665836db9ec8df07e28/packages/taggers/averaged_perceptron_tagger.zip"
},
}

def install_cdk_system_dependencies(self) -> Callable:
    """Build the container transformation that installs the CDK system dependencies.

    Nothing is executed when this method is called: it only returns a function that can be
    handed to `dagger.Container.with_`.

    Returns:
        Callable: A function taking a dagger.Container and returning it with tesseract-ocr,
            poppler-utils and the nltk data added.
    """

    def get_nltk_data_dir() -> dagger.Directory:
        """Download and unzip every archive declared in `self.ntlk_data`.

        Returns:
            dagger.Directory: The `self.nltk_data_path` directory of the helper container, holding the extracted archives.
        """
        archive_path = "/tmp/data.zip"
        # NOTE(review): the helper image tag is not pinned ("latest").
        scratch = self.dagger_client.container().from_("bash:latest")

        for subfolder, urls in self.ntlk_data.items():
            target_dir = f"{self.nltk_data_path}/{subfolder}"
            for url in urls:
                archive = self.dagger_client.http(url)
                # Every archive goes through the same temporary path: mount it, extract it, remove it.
                scratch = scratch.with_file(archive_path, archive)
                scratch = scratch.with_exec(["mkdir", "-p", target_dir], skip_entrypoint=True)
                scratch = scratch.with_exec(["unzip", "-o", archive_path, "-d", target_dir], skip_entrypoint=True)
                scratch = scratch.with_exec(["rm", archive_path], skip_entrypoint=True)

        return scratch.directory(self.nltk_data_path)

    def with_tesseract_and_poppler(container: dagger.Container) -> dagger.Container:
        """Install pinned versions of Tesseract-OCR and Poppler-utils with apt-get.

        Tesseract is used for OCR (Optical Character Recognition), Poppler for working with PDFs.
        """
        apt_install = "apt-get update && apt-get install -y tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1"
        return container.with_exec(["sh", "-c", apt_install], skip_entrypoint=True)

    def with_file_based_connector_dependencies(container: dagger.Container) -> dagger.Container:
        """Install everything file-based connectors rely on.

        This includes:
            - tesseract-ocr
            - poppler-utils
            - nltk data
        """
        with_ocr_tools = with_tesseract_and_poppler(container)
        with_data_dir = with_ocr_tools.with_exec(["mkdir", self.nltk_data_path], skip_entrypoint=True)
        return with_data_dir.with_directory(self.nltk_data_path, get_nltk_data_dir())

    return with_file_based_connector_dependencies

def get_container(self, platform: dagger.Platform) -> dagger.Container:
"""Returns the container used to build the base image.
Expand Down Expand Up @@ -48,6 +102,8 @@ def get_container(self, platform: dagger.Platform) -> dagger.Container:
.with_exec(["pip", "install", "poetry==1.6.1"], skip_entrypoint=True)
# Install socat 1.7.4.4
.with_exec(["sh", "-c", "apt update && apt-get install -y socat=1.7.4.4-2"])
# Install CDK system dependencies
.with_(self.install_cdk_system_dependencies())
)

async def run_sanity_checks(self, platform: dagger.Platform):
Expand All @@ -59,7 +115,6 @@ async def run_sanity_checks(self, platform: dagger.Platform):
platform (dagger.Platform): The platform on which the sanity checks should run.
"""
container = self.get_container(platform)

await base_sanity_checks.check_timezone_is_utc(container)
await base_sanity_checks.check_a_command_is_available_using_version_option(container, "bash")
await python_sanity_checks.check_python_version(container, "3.9.18")
Expand All @@ -68,3 +123,4 @@ async def run_sanity_checks(self, platform: dagger.Platform):
await python_sanity_checks.check_python_image_has_expected_env_vars(container)
await base_sanity_checks.check_a_command_is_available_using_version_option(container, "socat", "-V")
await base_sanity_checks.check_socat_version(container, "1.7.4.4")
await python_sanity_checks.check_cdk_system_dependencies(container)
Expand Up @@ -85,3 +85,69 @@ async def check_python_image_has_expected_env_vars(python_image_container: dagge
# It's not suboptimal to call printenv multiple times because the printenv output is cached.
for expected_env_var in expected_env_vars:
await base_sanity_checks.check_env_var_with_printenv(python_image_container, expected_env_var)


async def check_nltk_data(python_image_container: dagger.Container):
    """Install nltk and check that the required data is available.

    As of today the required data is:
        - taggers/averaged_perceptron_tagger
        - tokenizers/punkt

    Args:
        python_image_container (dagger.Container): The container on which the sanity checks should run.

    Raises:
        errors.SanityCheckError: Raised if nltk could not be installed or if the nltk data is not available.
    """
    find_required_data = 'import nltk;nltk.data.find("taggers/averaged_perceptron_tagger");nltk.data.find("tokenizers/punkt")'
    try:
        # The install step is awaited inside the try block so that a failing pip install is
        # reported as a sanity check error too, like every other command run by the checks.
        with_nltk = await python_image_container.with_exec(["pip", "install", "nltk==3.8.1"], skip_entrypoint=True)
        # nltk.data.find raises a LookupError when a resource is missing, which makes the python command fail.
        await with_nltk.with_exec(["python", "-c", find_required_data], skip_entrypoint=True)
    except dagger.ExecError as e:
        # Chain the original error so the failing command stays visible in the traceback.
        raise errors.SanityCheckError(e) from e


async def check_tesseract_version(python_image_container: dagger.Container, tesseract_version: str):
    """Check that the tesseract version is the expected one.

    Args:
        python_image_container (dagger.Container): The container on which the sanity checks should run.
        tesseract_version (str): The expected tesseract version.

    Raises:
        errors.SanityCheckError: Raised if the tesseract --version command could not be executed or if the outputted version is not the expected one.
    """
    try:
        tesseract_version_output = await python_image_container.with_exec(["tesseract", "--version"], skip_entrypoint=True).stdout()
    except dagger.ExecError as e:
        # Chain the original error so the failing command stays visible in the traceback.
        raise errors.SanityCheckError(e) from e

    # The version is expected at the very beginning of the command output.
    if not tesseract_version_output.startswith(f"tesseract {tesseract_version}"):
        raise errors.SanityCheckError(f"unexpected tesseract version: {tesseract_version_output}")


async def check_poppler_utils_version(python_image_container: dagger.Container, poppler_version: str):
    """Check that the poppler version is the expected one.

    The poppler version can be checked by running a pdftotext -v command.

    Args:
        python_image_container (dagger.Container): The container on which the sanity checks should run.
        poppler_version (str): The expected poppler version.

    Raises:
        errors.SanityCheckError: Raised if the pdftotext -v command could not be executed or if the outputted version is not the expected one.
    """
    try:
        # The version banner is read from stderr, not stdout.
        pdf_to_text_version_output = await python_image_container.with_exec(["pdftotext", "-v"], skip_entrypoint=True).stderr()
    except dagger.ExecError as e:
        # Chain the original error so the failing command stays visible in the traceback.
        raise errors.SanityCheckError(e) from e

    if f"pdftotext version {poppler_version}" not in pdf_to_text_version_output:
        raise errors.SanityCheckError(f"unexpected poppler version: {pdf_to_text_version_output}")


async def check_cdk_system_dependencies(python_image_container: dagger.Container):
    """Run every sanity check related to the CDK system dependencies.

    The checks run one after the other: nltk data, then tesseract, then poppler-utils.

    Args:
        python_image_container (dagger.Container): The container on which the sanity checks should run.

    Raises:
        errors.SanityCheckError: Raised by the first check that fails.
    """
    expected_tesseract_version = "5.3.0"
    expected_poppler_version = "22.12.0"

    await check_nltk_data(python_image_container)
    await check_tesseract_version(python_image_container, expected_tesseract_version)
    await check_poppler_utils_version(python_image_container, expected_poppler_version)
Expand Up @@ -2,9 +2,11 @@

This Python package contains the base images used by Airbyte connectors.
It is intended to be used as a Python library.
Our connector build pipeline ([`airbyte-ci`](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md#L1)) **will** use this library to build the connector images.
Our connector build pipeline ([`airbyte-ci`](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md#L1)) uses this library to build the connector images.
Our base images are declared in code, using the [Dagger Python SDK](https://dagger-io.readthedocs.io/en/sdk-python-v0.6.4/).

- [Python base image code declaration](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/base_images/base_images/python/bases.py)
- ~Java base image code declaration~ *TODO*


## Where are the Dockerfiles?
Expand Down
@@ -1,4 +1,9 @@
[
{
"version": "1.2.0",
"changelog_entry": "Add CDK system dependencies: nltk data, tesseract, poppler.",
"dockerfile_example": "FROM docker.io/python:3.9.18-slim-bookworm@sha256:44b7f161ed03f85e96d423b9916cdc8cb0509fb970fd643bdbc9896d49e1cad0\nRUN ln -snf /usr/share/zoneinfo/Etc/UTC /etc/localtime\nRUN pip install --upgrade pip==23.2.1\nENV POETRY_VIRTUALENVS_CREATE=false\nENV POETRY_VIRTUALENVS_IN_PROJECT=false\nENV POETRY_NO_INTERACTION=1\nRUN pip install poetry==1.6.1\nRUN sh -c apt update && apt-get install -y socat=1.7.4.4-2\nRUN sh -c apt-get update && apt-get install -y tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1\nRUN mkdir /usr/share/nltk_data"
},
{
"version": "1.1.0",
"changelog_entry": "Install socat",
Expand Down

0 comments on commit deef5ee

Please sign in to comment.