New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
python-connector-base: add CDK system dependencies #31929
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
|
||
from __future__ import annotations | ||
|
||
from typing import Final | ||
from typing import Callable, Final | ||
|
||
import dagger | ||
from base_images import bases, published_image | ||
|
@@ -17,8 +17,62 @@ class AirbytePythonConnectorBaseImage(bases.AirbyteConnectorBaseImage): | |
|
||
root_image: Final[published_image.PublishedImage] = PYTHON_3_9_18 | ||
repository: Final[str] = "airbyte/python-connector-base" | ||
|
||
pip_cache_name: Final[str] = "pip-cache" | ||
nltk_data_path: Final[str] = "/usr/share/nltk_data" | ||
ntlk_data = { | ||
"tokenizers": {"https://github.com/nltk/nltk_data/raw/5db857e6f7df11eabb5e5665836db9ec8df07e28/packages/tokenizers/punkt.zip"}, | ||
"taggers": { | ||
"https://github.com/nltk/nltk_data/raw/5db857e6f7df11eabb5e5665836db9ec8df07e28/packages/taggers/averaged_perceptron_tagger.zip" | ||
}, | ||
Comment on lines
+22
to
+26
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just noting that since these are raw github locations, if these packages move for any reason, all of our connectors will start to fail to build (right?). I guess this is the same for e.g. any pypi package, but it feels a bit more dangerous on github where e.g. a repository rename or something that removes this specific commit could affect this. Don't think there's any action to take, just wanted to flag! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Nope, as the base image is published to DockerHub, once its built successfully all the connectors can use it as a base image with a This official NLTK data is published on GitHub. We keep per commit URL to make sure we get static file content that can't change to boost build repro. In any case: if it breaks it will break when we'll try to cut a new base image version, not a connector version. |
||
} | ||
|
||
def install_cdk_system_dependencies(self) -> Callable: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❓ I guess this callable pattern is what allows us to perform the manipulation of the container as part of the container building pipeline rather than e.g.
If so, nice, and I'm wondering if this can help with the strangeness we have in pipelines where some things go from context -> container and others go from container -> container 🤔 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @erohmensing yes the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it, thanks for the context! |
||
def get_nltk_data_dir() -> dagger.Directory: | ||
"""Returns a dagger directory containing the nltk data. | ||
|
||
Returns: | ||
dagger.Directory: A dagger directory containing the nltk data. | ||
""" | ||
data_container = self.dagger_client.container().from_("bash:latest") | ||
|
||
for nltk_data_subfolder, nltk_data_urls in self.ntlk_data.items(): | ||
full_nltk_data_path = f"{self.nltk_data_path}/{nltk_data_subfolder}" | ||
for nltk_data_url in nltk_data_urls: | ||
zip_file = self.dagger_client.http(nltk_data_url) | ||
data_container = ( | ||
data_container.with_file("/tmp/data.zip", zip_file) | ||
.with_exec(["mkdir", "-p", full_nltk_data_path], skip_entrypoint=True) | ||
.with_exec(["unzip", "-o", "/tmp/data.zip", "-d", full_nltk_data_path], skip_entrypoint=True) | ||
.with_exec(["rm", "/tmp/data.zip"], skip_entrypoint=True) | ||
) | ||
return data_container.directory(self.nltk_data_path) | ||
|
||
def with_tesseract_and_poppler(container: dagger.Container) -> dagger.Container: | ||
""" | ||
Installs Tesseract-OCR and Poppler-utils in the base image. | ||
These tools are necessary for OCR (Optical Character Recognition) processes and working with PDFs, respectively. | ||
""" | ||
|
||
container = container.with_exec( | ||
["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1"], skip_entrypoint=True | ||
Comment on lines
+56
to
+57
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seeing multiple places where the |
||
) | ||
|
||
return container | ||
|
||
def with_file_based_connector_dependencies(container: dagger.Container) -> dagger.Container: | ||
""" | ||
Installs the dependencies for file-based connectors. This includes: | ||
- tesseract-ocr | ||
- poppler-utils | ||
- nltk data | ||
""" | ||
container = with_tesseract_and_poppler(container) | ||
container = container.with_exec(["mkdir", self.nltk_data_path], skip_entrypoint=True).with_directory( | ||
self.nltk_data_path, get_nltk_data_dir() | ||
) | ||
return container | ||
|
||
return with_file_based_connector_dependencies | ||
|
||
def get_container(self, platform: dagger.Platform) -> dagger.Container: | ||
"""Returns the container used to build the base image. | ||
|
@@ -48,6 +102,8 @@ def get_container(self, platform: dagger.Platform) -> dagger.Container: | |
.with_exec(["pip", "install", "poetry==1.6.1"], skip_entrypoint=True) | ||
# Install socat 1.7.4.4 | ||
.with_exec(["sh", "-c", "apt update && apt-get install -y socat=1.7.4.4-2"]) | ||
# Install CDK system dependencies | ||
.with_(self.install_cdk_system_dependencies()) | ||
Comment on lines
+105
to
+106
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Love this - helps us figure out where things go if there are more of them, and what things to pull out if we ever move them somewhere else |
||
) | ||
|
||
async def run_sanity_checks(self, platform: dagger.Platform): | ||
|
@@ -59,7 +115,6 @@ async def run_sanity_checks(self, platform: dagger.Platform): | |
platform (dagger.Platform): The platform on which the sanity checks should run. | ||
""" | ||
container = self.get_container(platform) | ||
|
||
await base_sanity_checks.check_timezone_is_utc(container) | ||
await base_sanity_checks.check_a_command_is_available_using_version_option(container, "bash") | ||
await python_sanity_checks.check_python_version(container, "3.9.18") | ||
|
@@ -68,3 +123,4 @@ async def run_sanity_checks(self, platform: dagger.Platform): | |
await python_sanity_checks.check_python_image_has_expected_env_vars(container) | ||
await base_sanity_checks.check_a_command_is_available_using_version_option(container, "socat", "-V") | ||
await base_sanity_checks.check_socat_version(container, "1.7.4.4") | ||
await python_sanity_checks.check_cdk_system_dependencies(container) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -85,3 +85,69 @@ async def check_python_image_has_expected_env_vars(python_image_container: dagge | |
# It's not suboptimal to call printenv multiple times because the printenv output is cached. | ||
for expected_env_var in expected_env_vars: | ||
await base_sanity_checks.check_env_var_with_printenv(python_image_container, expected_env_var) | ||
|
||
|
||
async def check_nltk_data(python_image_container: dagger.Container): | ||
"""Install nltk and check that the required data is available. | ||
As of today the required data is: | ||
- taggers/averaged_perceptron_tagger | ||
- tokenizers/punkt | ||
|
||
Args: | ||
python_image_container (dagger.Container): The container on which the sanity checks should run. | ||
|
||
Raises: | ||
errors.SanityCheckError: Raised if the nltk data is not available. | ||
""" | ||
with_nltk = await python_image_container.with_exec(["pip", "install", "nltk==3.8.1"], skip_entrypoint=True) | ||
try: | ||
await with_nltk.with_exec( | ||
["python", "-c", 'import nltk;nltk.data.find("taggers/averaged_perceptron_tagger");nltk.data.find("tokenizers/punkt")'], | ||
skip_entrypoint=True, | ||
) | ||
except dagger.ExecError as e: | ||
raise errors.SanityCheckError(e) | ||
Comment on lines
+103
to
+109
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't have to be here, but I think we would benefit from making some custom assertion mechanism for this test file - we are doing quite a lot of
|
||
|
||
|
||
async def check_tesseract_version(python_image_container: dagger.Container, tesseract_version: str): | ||
"""Check that the tesseract version is the expected one. | ||
|
||
Args: | ||
python_image_container (dagger.Container): The container on which the sanity checks should run. | ||
tesseract_version (str): The expected tesseract version. | ||
|
||
Raises: | ||
errors.SanityCheckError: Raised if the tesseract --version command could not be executed or if the outputted version is not the expected one. | ||
""" | ||
try: | ||
tesseract_version_output = await python_image_container.with_exec(["tesseract", "--version"], skip_entrypoint=True).stdout() | ||
except dagger.ExecError as e: | ||
raise errors.SanityCheckError(e) | ||
if not tesseract_version_output.startswith(f"tesseract {tesseract_version}"): | ||
raise errors.SanityCheckError(f"unexpected tesseract version: {tesseract_version_output}") | ||
|
||
|
||
async def check_poppler_utils_version(python_image_container: dagger.Container, poppler_version: str): | ||
"""Check that the poppler version is the expected one. | ||
The poppler version can be checked by running a pdftotext -v command. | ||
|
||
Args: | ||
python_image_container (dagger.Container): The container on which the sanity checks should run. | ||
poppler_version (str): The expected poppler version. | ||
|
||
Raises: | ||
errors.SanityCheckError: Raised if the pdftotext -v command could not be executed or if the outputted version is not the expected one. | ||
""" | ||
try: | ||
pdf_to_text_version_output = await python_image_container.with_exec(["pdftotext", "-v"], skip_entrypoint=True).stderr() | ||
except dagger.ExecError as e: | ||
raise errors.SanityCheckError(e) | ||
|
||
if f"pdftotext version {poppler_version}" not in pdf_to_text_version_output: | ||
raise errors.SanityCheckError(f"unexpected poppler version: {pdf_to_text_version_output}") | ||
|
||
|
||
async def check_cdk_system_dependencies(python_image_container: dagger.Container): | ||
await check_nltk_data(python_image_container) | ||
await check_tesseract_version(python_image_container, "5.3.0") | ||
await check_poppler_utils_version(python_image_container, "22.12.0") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This new version should also come with an artificially generated Dockerfile, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I actually can't find those in the repo anymore but I swear they were there when we added this functionality...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah! Maybe it just updates the readme according to this, and we just need to change the
However, we do artificially generate Dockerfiles for debugging and documentation purposes.