From 25aceaf11dc3450c630cdf6c418e524f74e23960 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 27 Oct 2023 10:05:07 +0200 Subject: [PATCH 01/10] remove local unstructured implementation --- .../source-s3/source_s3/v4/config.py | 33 +--- .../source_s3/v4/unstructured_parser.py | 123 -------------- .../unit_tests/v4/test_unstructured_parser.py | 150 ------------------ 3 files changed, 2 insertions(+), 304 deletions(-) delete mode 100644 airbyte-integrations/connectors/source-s3/source_s3/v4/unstructured_parser.py delete mode 100644 airbyte-integrations/connectors/source-s3/unit_tests/v4/test_unstructured_parser.py diff --git a/airbyte-integrations/connectors/source-s3/source_s3/v4/config.py b/airbyte-integrations/connectors/source-s3/source_s3/v4/config.py index f3299f5819136..16a9af413b254 100644 --- a/airbyte-integrations/connectors/source-s3/source_s3/v4/config.py +++ b/airbyte-integrations/connectors/source-s3/source_s3/v4/config.py @@ -2,33 +2,10 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # -from typing import List, Optional, Union +from typing import Optional from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec -from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat -from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat -from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig -from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat -from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat -from pydantic import AnyUrl, BaseModel, Field, ValidationError, root_validator - - -class UnstructuredFormat(BaseModel): - class Config: - title = "Markdown/PDF/Docx Format (Experimental)" - schema_extra = {"description": "Extract text from document formats and emit as one record per file."} - - filetype: str = Field( - "unstructured", - const=True, - ) - - -class S3FileBasedStreamConfig(FileBasedStreamConfig): - format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat] = Field( - title="Format", - description="The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.", - ) +from pydantic import AnyUrl, Field, ValidationError, root_validator class Config(AbstractFileBasedSpec): @@ -65,12 +42,6 @@ def documentation_url(cls) -> AnyUrl: "", title="Endpoint", description="Endpoint to an S3 compatible service. Leave empty to use AWS.", order=4 ) - streams: List[S3FileBasedStreamConfig] = Field( - title="The list of streams to sync", - description='Each instance of this configuration defines a stream. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.', - order=10, - ) - @root_validator def validate_optional_args(cls, values): aws_access_key_id = values.get("aws_access_key_id") diff --git a/airbyte-integrations/connectors/source-s3/source_s3/v4/unstructured_parser.py b/airbyte-integrations/connectors/source-s3/source_s3/v4/unstructured_parser.py deleted file mode 100644 index 16fe633052627..0000000000000 --- a/airbyte-integrations/connectors/source-s3/source_s3/v4/unstructured_parser.py +++ /dev/null @@ -1,123 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
-# -import logging -from io import BytesIO, IOBase -from typing import Any, Dict, Iterable, List, Mapping, Optional - -from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig -from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError -from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode -from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser -from airbyte_cdk.sources.file_based.remote_file import RemoteFile -from airbyte_cdk.sources.file_based.schema_helpers import SchemaType -from source_s3.v4.config import S3FileBasedStreamConfig - - -class UnstructuredParser(FileTypeParser): - @property - def parser_max_n_files_for_schema_inference(self) -> Optional[int]: - """ - Just check one file as the schema is static - """ - return 1 - - @property - def parser_max_n_files_for_parsability(self) -> Optional[int]: - """ - Do not check any files for parsability because it might be an expensive operation and doesn't give much confidence whether the sync will succeed. - """ - return 0 - - async def infer_schema( - self, - config: S3FileBasedStreamConfig, - file: RemoteFile, - stream_reader: AbstractFileBasedStreamReader, - logger: logging.Logger, - ) -> SchemaType: - with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle: - filetype = self._get_filetype(file_handle) - - if filetype not in self._supported_file_types(): - raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri) - - return { - "content": {"type": "string"}, - "document_key": {"type": "string"}, - } - - def parse_records( - self, - config: FileBasedStreamConfig, - file: RemoteFile, - stream_reader: AbstractFileBasedStreamReader, - logger: logging.Logger, - discovered_schema: Optional[Mapping[str, SchemaType]], - ) -> Iterable[Dict[str, Any]]: - with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle: - markdown = self.read_file(file_handle, logger) - yield { - "content": markdown, - "document_key": file.uri, - } - - def read_file(self, file_handle: IOBase, logger: logging.Logger) -> str: - from unstructured.file_utils.filetype import FileType - from unstructured.partition.auto import partition - from unstructured.partition.md import optional_decode - - file_name = file_handle.name - filetype = self._get_filetype(file_handle) - - if filetype == FileType.MD: - return optional_decode(file_handle.read()) - if filetype not in self._supported_file_types(): - raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file_name) - - if filetype is FileType.PDF: - # for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects - file_handle.seek(0) - file = BytesIO(file_handle.read()) - file_handle.seek(0) - else: - file = file_handle - - elements = partition(file=file, metadata_filename=file_name) - return self._render_markdown(elements) - - def _get_filetype(self, file: IOBase): - from unstructured.file_utils.filetype import detect_filetype - - # set name to none, otherwise unstructured will try to get the modified date from the local file system - file_name = file.name - file.name = None - return detect_filetype( - file=file, - file_filename=file_name, - ) - - def _supported_file_types(self): - from unstructured.file_utils.filetype import FileType - - return [FileType.MD, 
FileType.PDF, FileType.DOCX] - - def _render_markdown(self, elements: List[Any]) -> str: - return "\n\n".join((self._convert_to_markdown(el) for el in elements)) - - def _convert_to_markdown(self, el: Any) -> str: - from unstructured.documents.elements import Formula, ListItem, Title - - if type(el) == Title: - heading_str = "#" * (el.metadata.category_depth or 1) - return f"{heading_str} {el.text}" - elif type(el) == ListItem: - return f"- {el.text}" - elif type(el) == Formula: - return f"```\n{el.text}\n```" - else: - return el.text if hasattr(el, "text") else "" - - @property - def file_read_mode(self) -> FileReadMode: - return FileReadMode.READ_BINARY diff --git a/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_unstructured_parser.py b/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_unstructured_parser.py deleted file mode 100644 index aebd1593fea24..0000000000000 --- a/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_unstructured_parser.py +++ /dev/null @@ -1,150 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -# - -import asyncio -from unittest.mock import MagicMock, mock_open, patch - -import pytest -from airbyte_cdk.sources.file_based.exceptions import RecordParseError -from source_s3.v4.unstructured_parser import UnstructuredParser -from unstructured.documents.elements import ElementMetadata, Formula, ListItem, Text, Title -from unstructured.file_utils.filetype import FileType - -FILE_URI = "path/to/file.xyz" - - -@pytest.mark.parametrize( - "filetype, raises", - [ - pytest.param( - FileType.MD, - False, - id="markdown file", - ), - pytest.param( - FileType.CSV, - True, - id="wrong file format", - ), - pytest.param( - FileType.PDF, - False, - id="pdf file", - ), - pytest.param( - FileType.DOCX, - False, - id="docx file", - ), - ], -) -@patch("unstructured.file_utils.filetype.detect_filetype") -def test_infer_schema(mock_detect_filetype, filetype, raises): - stream_reader = MagicMock() - mock_open(stream_reader.open_file) - fake_file = MagicMock() - fake_file.uri = FILE_URI - logger = MagicMock() - mock_detect_filetype.return_value = filetype - if raises: - with pytest.raises(RecordParseError): - asyncio.run(UnstructuredParser().infer_schema(MagicMock(), fake_file, stream_reader, logger)) - else: - schema = asyncio.run(UnstructuredParser().infer_schema(MagicMock(), MagicMock(), MagicMock(), MagicMock())) - assert schema == { - "content": {"type": "string"}, - "document_key": {"type": "string"}, - } - - -@pytest.mark.parametrize( - "filetype, parse_result, raises, expected_records", - [ - pytest.param( - FileType.MD, - "test", - False, - [ - { - "content": "test", - "document_key": FILE_URI, - } - ], - id="markdown file", - ), - pytest.param( - FileType.CSV, - "test", - True, - None, - id="wrong file format", - ), - pytest.param( - FileType.PDF, - [ - Title("heading"), - Text("This is the text"), - ListItem("This is a list item"), - Formula("This is a formula"), - ], - False, - [ - { - "content": "# heading\n\nThis is the text\n\n- This is a list item\n\n```\nThis is a formula\n```", - "document_key": FILE_URI, - } - ], - id="pdf file", - ), - pytest.param( - FileType.PDF, - [ - Title("first level heading", metadata=ElementMetadata(category_depth=1)), - Title("second level heading", metadata=ElementMetadata(category_depth=2)), - ], - False, - [ - { - "content": "# first level heading\n\n## second level heading", - "document_key": FILE_URI, - } - ], - id="multi-level headings", - ), - pytest.param( - FileType.DOCX, - [ - 
Title("heading"), - Text("This is the text"), - ListItem("This is a list item"), - Formula("This is a formula"), - ], - False, - [ - { - "content": "# heading\n\nThis is the text\n\n- This is a list item\n\n```\nThis is a formula\n```", - "document_key": FILE_URI, - } - ], - id="docx file", - ), - ], -) -@patch("unstructured.partition.auto.partition") -@patch("unstructured.partition.md.optional_decode") -@patch("unstructured.file_utils.filetype.detect_filetype") -def test_parse_records(mock_detect_filetype, mock_optional_decode, mock_partition, filetype, parse_result, raises, expected_records): - stream_reader = MagicMock() - mock_open(stream_reader.open_file, read_data=bytes(str(parse_result), "utf-8")) - fake_file = MagicMock() - fake_file.uri = FILE_URI - logger = MagicMock() - mock_detect_filetype.return_value = filetype - mock_partition.return_value = parse_result - mock_optional_decode.side_effect = lambda x: x.decode("utf-8") - if raises: - with pytest.raises(RecordParseError): - list(UnstructuredParser().parse_records(MagicMock(), fake_file, stream_reader, logger, MagicMock())) - else: - assert list(UnstructuredParser().parse_records(MagicMock(), fake_file, stream_reader, logger, MagicMock())) == expected_records From 1cc2edc25735f3d41bfd28fb44ec066de9b2f9da Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 27 Oct 2023 10:28:48 +0200 Subject: [PATCH 02/10] update file cdk --- .../build_customization.py | 61 +++++++++++++++++++ .../integration_tests/spec.json | 13 ++++ .../source-azure-blob-storage/metadata.yaml | 2 +- .../source-azure-blob-storage/setup.py | 15 ++++- .../source-s3/integration_tests/spec.json | 4 +- .../connectors/source-s3/main.py | 7 +-- .../connectors/source-s3/metadata.yaml | 2 +- .../sources/azure-blob-storage.md | 14 +++++ docs/integrations/sources/s3.md | 2 +- 9 files changed, 108 insertions(+), 12 deletions(-) create mode 100644 airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py new file mode 100644 index 0000000000000..5373d64a833c1 --- /dev/null +++ b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from __future__ import annotations + +import textwrap +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from dagger import Container + + +def setup_nltk(connector_container: Container) -> Container: + """ + Seeds the connector with nltk data at build time. This is because the nltk data + is large and takes a long time to download. It runs a python script that downloads + the data following connector installation. + """ + + nltk_python_script = textwrap.dedent( + """ + import nltk + nltk.download('punkt') + nltk.download('averaged_perceptron_tagger') + """ + ) + connector_container = ( + connector_container.with_new_file("/tmp/nltk_python_script.py", nltk_python_script) + .with_exec(["python", "/tmp/nltk_python_script.py"], skip_entrypoint=True) + .with_exec(["rm", "/tmp/nltk_python_script.py"], skip_entrypoint=True) + ) + + return connector_container + + +def install_tesseract_and_poppler(connector_container: Container) -> Container: + """ + Installs Tesseract-OCR and Poppler-utils in the container. These tools are necessary for + OCR (Optical Character Recognition) processes and working with PDFs, respectively. 
+ """ + + connector_container = connector_container.with_exec( + ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr poppler-utils"], skip_entrypoint=True + ) + + return connector_container + + +async def post_connector_install(connector_container: Container) -> Container: + """ + Handles post-installation setup for the connector by setting up nltk and + installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. + """ + + # Setup nltk in the container + connector_container = setup_nltk(connector_container) + + # Install Tesseract and Poppler + connector_container = install_tesseract_and_poppler(connector_container) + + return connector_container diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json b/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json index 156729699c45a..c8cef33ff4d90 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json @@ -268,6 +268,19 @@ "type": "boolean" } } + }, + { + "title": "Document File Type Format (Experimental)", + "type": "object", + "properties": { + "filetype": { + "title": "Filetype", + "default": "unstructured", + "const": "unstructured", + "type": "string" + } + }, + "description": "Extract text from document formats (.pdf, .docx, .md) and emit as one record per file." } ] }, diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml b/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml index c83e47e9405bd..a28d503c492c3 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml +++ b/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml @@ -7,7 +7,7 @@ data: connectorSubtype: file connectorType: source definitionId: fdaaba68-4875-4ed9-8fcd-4ae1e0a25093 - dockerImageTag: 0.2.1 + dockerImageTag: 0.2.2 dockerRepository: airbyte/source-azure-blob-storage documentationUrl: https://docs.airbyte.com/integrations/sources/azure-blob-storage githubIssueLabel: source-azure-blob-storage diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/setup.py b/airbyte-integrations/connectors/source-azure-blob-storage/setup.py index cfcb2ebfb9e1c..5c77e10176d11 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/setup.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/setup.py @@ -5,7 +5,20 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = ["airbyte-cdk>=0.51.17", "smart_open[azure]", "pytz", "fastavro==1.4.11", "pyarrow"] +MAIN_REQUIREMENTS = [ + "airbyte-cdk>=0.51.17", + "smart_open[azure]", + "pytz", + "fastavro==1.4.11", + "pyarrow", + "unstructured==0.10.19", + "pdf2image==1.16.3", + "pdfminer.six==20221105", + "unstructured[docx]==0.10.19", + "unstructured.pytesseract>=0.3.12", + "pytesseract==0.3.10", + "markdown", +] TEST_REQUIREMENTS = ["requests-mock~=1.9.3", "pytest-mock~=3.6.1", "pytest~=6.2"] diff --git a/airbyte-integrations/connectors/source-s3/integration_tests/spec.json b/airbyte-integrations/connectors/source-s3/integration_tests/spec.json index f65f9b36c156a..4558142323925 100644 --- a/airbyte-integrations/connectors/source-s3/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-s3/integration_tests/spec.json @@ -270,7 +270,7 @@ } }, { - "title": "Markdown/PDF/Docx Format (Experimental)", + "title": "Document File Type Format 
(Experimental)", "type": "object", "properties": { "filetype": { @@ -280,7 +280,7 @@ "type": "string" } }, - "description": "Extract text from document formats and emit as one record per file." + "description": "Extract text from document formats (.pdf, .docx, .md) and emit as one record per file." } ] }, diff --git a/airbyte-integrations/connectors/source-s3/main.py b/airbyte-integrations/connectors/source-s3/main.py index 6813aeb82437b..c3b6b0bc32ede 100644 --- a/airbyte-integrations/connectors/source-s3/main.py +++ b/airbyte-integrations/connectors/source-s3/main.py @@ -10,18 +10,13 @@ from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteMessage, AirbyteTraceMessage, TraceType, Type -from airbyte_cdk.sources.file_based.file_types import default_parsers from source_s3.v4 import Config, Cursor, SourceS3, SourceS3StreamReader -from source_s3.v4.config import UnstructuredFormat -from source_s3.v4.unstructured_parser import UnstructuredParser - -parsers = {**default_parsers, UnstructuredFormat: UnstructuredParser()} def get_source(args: List[str]): catalog_path = AirbyteEntrypoint.extract_catalog(args) try: - return SourceS3(SourceS3StreamReader(), Config, catalog_path, cursor_cls=Cursor, parsers=parsers) + return SourceS3(SourceS3StreamReader(), Config, catalog_path, cursor_cls=Cursor) except Exception: print( AirbyteMessage( diff --git a/airbyte-integrations/connectors/source-s3/metadata.yaml b/airbyte-integrations/connectors/source-s3/metadata.yaml index fd74b31153a12..d00af9105a1c4 100644 --- a/airbyte-integrations/connectors/source-s3/metadata.yaml +++ b/airbyte-integrations/connectors/source-s3/metadata.yaml @@ -10,7 +10,7 @@ data: connectorSubtype: file connectorType: source definitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2 - dockerImageTag: 4.1.3 + dockerImageTag: 4.1.4 dockerRepository: airbyte/source-s3 documentationUrl: https://docs.airbyte.com/integrations/sources/s3 githubIssueLabel: source-s3 diff --git a/docs/integrations/sources/azure-blob-storage.md b/docs/integrations/sources/azure-blob-storage.md index dc9cb3bcba2ee..28b6cef44ad72 100644 --- a/docs/integrations/sources/azure-blob-storage.md +++ b/docs/integrations/sources/azure-blob-storage.md @@ -174,6 +174,20 @@ The Avro parser uses the [Fastavro library](https://fastavro.readthedocs.io/en/l There are currently no options for JSONL parsing. + +### Markdown/PDF/Docx Format (Experimental) + +:::warning +The Markdown/PDF/Docx format is currently an experimental feature and not subject to SLAs. Use at your own risk. +::: + +The Markdown/PDF/Docx format is a special format that allows you to extract text from Markdown, PDF, and Word documents. If selected, the connector will extract text from the documents and output it as a single field named `content`. The `document_key` field will hold a unique identifier for the processed file which can be used as a primary key. The content of the document will contain markdown formatting converted from the original file format. Each file matching the defined glob pattern needs to either be a markdown (`md`), PDF (`pdf`) or Docx (`docx`) file. + +One record will be emitted for each document. Keep in mind that large files can emit large records that might not fit into every destination as each destination has different limitations for string fields. + +To perform the text extraction from PDF and Docx files, the connector uses the [Unstructured](https://pypi.org/project/unstructured/) Python library. 
+ + ## Changelog | Version | Date | Pull Request | Subject | diff --git a/docs/integrations/sources/s3.md b/docs/integrations/sources/s3.md index a369cd493a1a9..d1059c9f90e0b 100644 --- a/docs/integrations/sources/s3.md +++ b/docs/integrations/sources/s3.md @@ -247,9 +247,9 @@ The Markdown/PDF/Docx format is currently an experimental feature and not subjec The Markdown/PDF/Docx format is a special format that allows you to extract text from Markdown, PDF, and Word documents. If selected, the connector will extract text from the documents and output it as a single field named `content`. The `document_key` field will hold a unique identifier for the processed file which can be used as a primary key. The content of the document will contain markdown formatting converted from the original file format. Each file matching the defined glob pattern needs to either be a markdown (`md`), PDF (`pdf`) or Docx (`docx`) file. One record will be emitted for each document. Keep in mind that large files can emit large records that might not fit into every destination as each destination has different limitations for string fields. - To perform the text extraction from PDF and Docx files, the connector uses the [Unstructured](https://pypi.org/project/unstructured/) Python library. + ## Changelog From 27fdffe5f4613939a277b62742166cd03b9d19fa Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 27 Oct 2023 10:33:04 +0200 Subject: [PATCH 03/10] prepare release --- docs/integrations/sources/azure-blob-storage.md | 1 + docs/integrations/sources/s3.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/integrations/sources/azure-blob-storage.md b/docs/integrations/sources/azure-blob-storage.md index 28b6cef44ad72..2c563d05cee1e 100644 --- a/docs/integrations/sources/azure-blob-storage.md +++ b/docs/integrations/sources/azure-blob-storage.md @@ -192,6 +192,7 @@ To perform the text extraction from PDF and Docx files, the connector uses the [ | Version | Date | Pull Request | Subject | |:--------|:-----------|:------------------------------------------------|:------------------------------------------------------------------------| +| 0.2.2 | 2023-10-30 | [31904](https://github.com/airbytehq/airbyte/pull/31904) | Update CDK to support document file types | | 0.2.1 | 2023-10-18 | [31543](https://github.com/airbytehq/airbyte/pull/31543) | Base image migration: remove Dockerfile and use the python-connector-base image | | 0.2.0 | 2023-10-10 | https://github.com/airbytehq/airbyte/pull/31336 | Migrate to File-based CDK. 
Add support of CSV, Parquet and Avro files | | 0.1.0 | 2023-02-17 | https://github.com/airbytehq/airbyte/pull/23222 | Initial release with full-refresh and incremental sync with JSONL files | \ No newline at end of file diff --git a/docs/integrations/sources/s3.md b/docs/integrations/sources/s3.md index d1059c9f90e0b..3fc955691b3c7 100644 --- a/docs/integrations/sources/s3.md +++ b/docs/integrations/sources/s3.md @@ -255,8 +255,8 @@ To perform the text extraction from PDF and Docx files, the connector uses the [ | Version | Date | Pull Request | Subject | | :------ | :--------- | :-------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------- | +| 4.1.4 | 2023-10-30 | [31904](https://github.com/airbytehq/airbyte/pull/31904) | Update CDK | | 4.1.3 | 2023-10-25 | [31654](https://github.com/airbytehq/airbyte/pull/31654) | Reduce image size | -|:--------|:-----------| :-------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------- | | 4.1.2 | 2023-10-23 | [31383](https://github.com/airbytehq/airbyte/pull/31383) | Add handling NoSuchBucket error | | 4.1.1 | 2023-10-19 | [31601](https://github.com/airbytehq/airbyte/pull/31601) | Base image migration: remove Dockerfile and use the python-connector-base image | | 4.1.0 | 2023-10-17 | [31340](https://github.com/airbytehq/airbyte/pull/31340) | Add reading files inside zip archive | From 74dd1eb1871efd07e05dd22afe3d9a2484822227 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 27 Oct 2023 11:18:23 +0200 Subject: [PATCH 04/10] review comments --- airbyte-cdk/python/README.md | 6 ++ .../python/file_based_build_customization.py | 63 +++++++++++++++++++ .../build_customization.py | 62 +----------------- .../source-azure-blob-storage/setup.py | 2 +- .../source-s3/build_customization.py | 62 +----------------- .../source-s3/integration_tests/spec.json | 2 +- .../connectors/source-s3/setup.py | 2 +- 7 files changed, 74 insertions(+), 125 deletions(-) create mode 100644 airbyte-cdk/python/file_based_build_customization.py mode change 100644 => 120000 airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py mode change 100644 => 120000 airbyte-integrations/connectors/source-s3/build_customization.py diff --git a/airbyte-cdk/python/README.md b/airbyte-cdk/python/README.md index c3ac3221b6222..983005ce31c1d 100644 --- a/airbyte-cdk/python/README.md +++ b/airbyte-cdk/python/README.md @@ -150,6 +150,12 @@ HTTP requests to `localhost:8113/data` should now return the body defined in the 1. Open a PR 2. Once it is approved and **merged**, an Airbyte member must run the `Publish CDK Manually` workflow from master using `release-type=major|manor|patch` and setting the changelog message. +#### File-based CDK + +A subset of the CDK is dedicated to sources that have the notion of files. It's located in `airbyte-cdk/sources/file_based`. When using this part of the CDK, install the CDK using the `file-based` extra: `pip install airbyte-cdk[file-based]`. + +As the `unstructured` parser of the file based CDK requires some native dependencies to be installed, link the `file_based_build_customization.py` file in the connector as `build_customization.py`. 
+ ## Coming Soon * Full OAuth 2.0 support \(including refresh token issuing flow via UI or CLI\) diff --git a/airbyte-cdk/python/file_based_build_customization.py b/airbyte-cdk/python/file_based_build_customization.py new file mode 100644 index 0000000000000..78c5d059738ab --- /dev/null +++ b/airbyte-cdk/python/file_based_build_customization.py @@ -0,0 +1,63 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from __future__ import annotations + +import textwrap +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from dagger import Container + + +def setup_nltk(connector_container: Container) -> Container: + """ + Seeds the connector with nltk data at build time. This is because the nltk data + is large and takes a long time to download. It runs a python script that downloads + the data following connector installation. + """ + + nltk_python_script = textwrap.dedent( + """ + import nltk + nltk.download('punkt') + nltk.download('averaged_perceptron_tagger') + """ + ) + connector_container = ( + connector_container.with_new_file("/tmp/nltk_python_script.py", nltk_python_script) + .with_exec(["python", "/tmp/nltk_python_script.py"], skip_entrypoint=True) + .with_exec(["rm", "/tmp/nltk_python_script.py"], skip_entrypoint=True) + ) + + return connector_container + + +def install_tesseract_and_poppler(connector_container: Container) -> Container: + """ + Installs Tesseract-OCR and Poppler-utils in the container. These tools are necessary for + OCR (Optical Character Recognition) processes and working with PDFs, respectively. + """ + + connector_container = connector_container.with_exec( + ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr poppler-utils"], skip_entrypoint=True + ) + + return connector_container + + +async def post_connector_install(connector_container: Container) -> Container: + """ + Handles post-installation setup for the connector by setting up nltk and + installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. + + These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. + """ + + # Setup nltk in the container + connector_container = setup_nltk(connector_container) + + # Install Tesseract and Poppler + connector_container = install_tesseract_and_poppler(connector_container) + + return connector_container diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py deleted file mode 100644 index 5373d64a833c1..0000000000000 --- a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -# -from __future__ import annotations - -import textwrap -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from dagger import Container - - -def setup_nltk(connector_container: Container) -> Container: - """ - Seeds the connector with nltk data at build time. This is because the nltk data - is large and takes a long time to download. It runs a python script that downloads - the data following connector installation. 
- """ - - nltk_python_script = textwrap.dedent( - """ - import nltk - nltk.download('punkt') - nltk.download('averaged_perceptron_tagger') - """ - ) - connector_container = ( - connector_container.with_new_file("/tmp/nltk_python_script.py", nltk_python_script) - .with_exec(["python", "/tmp/nltk_python_script.py"], skip_entrypoint=True) - .with_exec(["rm", "/tmp/nltk_python_script.py"], skip_entrypoint=True) - ) - - return connector_container - - -def install_tesseract_and_poppler(connector_container: Container) -> Container: - """ - Installs Tesseract-OCR and Poppler-utils in the container. These tools are necessary for - OCR (Optical Character Recognition) processes and working with PDFs, respectively. - """ - - connector_container = connector_container.with_exec( - ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr poppler-utils"], skip_entrypoint=True - ) - - return connector_container - - -async def post_connector_install(connector_container: Container) -> Container: - """ - Handles post-installation setup for the connector by setting up nltk and - installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. - """ - - # Setup nltk in the container - connector_container = setup_nltk(connector_container) - - # Install Tesseract and Poppler - connector_container = install_tesseract_and_poppler(connector_container) - - return connector_container diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py new file mode 120000 index 0000000000000..6c492f9a9347c --- /dev/null +++ b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py @@ -0,0 +1 @@ +../../../airbyte-cdk/python/file_based_build_customization.py \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/setup.py b/airbyte-integrations/connectors/source-azure-blob-storage/setup.py index 5c77e10176d11..644c91229c8d4 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/setup.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk>=0.51.17", + "airbyte-cdk>=0.52.5", "smart_open[azure]", "pytz", "fastavro==1.4.11", diff --git a/airbyte-integrations/connectors/source-s3/build_customization.py b/airbyte-integrations/connectors/source-s3/build_customization.py deleted file mode 100644 index 5373d64a833c1..0000000000000 --- a/airbyte-integrations/connectors/source-s3/build_customization.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -# -from __future__ import annotations - -import textwrap -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from dagger import Container - - -def setup_nltk(connector_container: Container) -> Container: - """ - Seeds the connector with nltk data at build time. This is because the nltk data - is large and takes a long time to download. It runs a python script that downloads - the data following connector installation. 
- """ - - nltk_python_script = textwrap.dedent( - """ - import nltk - nltk.download('punkt') - nltk.download('averaged_perceptron_tagger') - """ - ) - connector_container = ( - connector_container.with_new_file("/tmp/nltk_python_script.py", nltk_python_script) - .with_exec(["python", "/tmp/nltk_python_script.py"], skip_entrypoint=True) - .with_exec(["rm", "/tmp/nltk_python_script.py"], skip_entrypoint=True) - ) - - return connector_container - - -def install_tesseract_and_poppler(connector_container: Container) -> Container: - """ - Installs Tesseract-OCR and Poppler-utils in the container. These tools are necessary for - OCR (Optical Character Recognition) processes and working with PDFs, respectively. - """ - - connector_container = connector_container.with_exec( - ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr poppler-utils"], skip_entrypoint=True - ) - - return connector_container - - -async def post_connector_install(connector_container: Container) -> Container: - """ - Handles post-installation setup for the connector by setting up nltk and - installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. - """ - - # Setup nltk in the container - connector_container = setup_nltk(connector_container) - - # Install Tesseract and Poppler - connector_container = install_tesseract_and_poppler(connector_container) - - return connector_container diff --git a/airbyte-integrations/connectors/source-s3/build_customization.py b/airbyte-integrations/connectors/source-s3/build_customization.py new file mode 120000 index 0000000000000..6c492f9a9347c --- /dev/null +++ b/airbyte-integrations/connectors/source-s3/build_customization.py @@ -0,0 +1 @@ +../../../airbyte-cdk/python/file_based_build_customization.py \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-s3/integration_tests/spec.json b/airbyte-integrations/connectors/source-s3/integration_tests/spec.json index 4558142323925..47d59b8f26681 100644 --- a/airbyte-integrations/connectors/source-s3/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-s3/integration_tests/spec.json @@ -21,7 +21,7 @@ "order": 10, "type": "array", "items": { - "title": "S3FileBasedStreamConfig", + "title": "BasedStreamConfig", "type": "object", "properties": { "name": { diff --git a/airbyte-integrations/connectors/source-s3/setup.py b/airbyte-integrations/connectors/source-s3/setup.py index a1ca53469dcf1..8cf826ee3a32a 100644 --- a/airbyte-integrations/connectors/source-s3/setup.py +++ b/airbyte-integrations/connectors/source-s3/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk>=0.52.0", + "airbyte-cdk>=0.52.5", "pyarrow==12.0.1", "smart-open[s3]==5.1.0", "wcmatch==8.4", From 5b1587f003ac2e626a5c93fc9ec34d4891db0825 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 27 Oct 2023 15:46:41 +0200 Subject: [PATCH 05/10] clean up dependencies --- airbyte-cdk/python/README.md | 2 +- .../python/file_based_build_customization.py | 27 +++++-- .../build_customization.py | 77 ++++++++++++++++++- .../source-azure-blob-storage/setup.py | 11 +-- .../source-s3/build_customization.py | 77 ++++++++++++++++++- .../source-s3/integration_tests/spec.json | 2 +- .../connectors/source-s3/setup.py | 11 +-- 7 files changed, 176 insertions(+), 31 deletions(-) mode change 120000 => 100644 airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py mode change 120000 => 100644 
airbyte-integrations/connectors/source-s3/build_customization.py diff --git a/airbyte-cdk/python/README.md b/airbyte-cdk/python/README.md index 983005ce31c1d..a69f44bf9a850 100644 --- a/airbyte-cdk/python/README.md +++ b/airbyte-cdk/python/README.md @@ -154,7 +154,7 @@ HTTP requests to `localhost:8113/data` should now return the body defined in the A subset of the CDK is dedicated to sources that have the notion of files. It's located in `airbyte-cdk/sources/file_based`. When using this part of the CDK, install the CDK using the `file-based` extra: `pip install airbyte-cdk[file-based]`. -As the `unstructured` parser of the file based CDK requires some native dependencies to be installed, link the `file_based_build_customization.py` file in the connector as `build_customization.py`. +As the `unstructured` parser of the file based CDK requires some native dependencies to be installed, copy the `file_based_build_customization.py` file into the connector as `build_customization.py`. ## Coming Soon diff --git a/airbyte-cdk/python/file_based_build_customization.py b/airbyte-cdk/python/file_based_build_customization.py index 78c5d059738ab..c524f9f9bfc2c 100644 --- a/airbyte-cdk/python/file_based_build_customization.py +++ b/airbyte-cdk/python/file_based_build_customization.py @@ -13,13 +13,18 @@ def setup_nltk(connector_container: Container) -> Container: """ Seeds the connector with nltk data at build time. This is because the nltk data - is large and takes a long time to download. It runs a python script that downloads + is large and takes some time to download. It runs a python script that downloads the data following connector installation. + + The data is cached to the images /root/nltk_data directory. """ nltk_python_script = textwrap.dedent( """ import nltk + + # inline the index url to make the build reproduceable by pinning the exact version of the nltk packages that are downloaded + downloader = nltk.downloader.Downloader(server_index_url="data:text/xml;charset=utf-8,%3C%3Fxml%20version%3D%221.0%22%3F%3E%0D%0A%3C%3Fxml-stylesheet%20href%3D%22index.xsl%22%20type%3D%22text%2Fxsl%22%3F%3E%0D%0A%3Cnltk_data%3E%0D%0A%20%20%3Cpackages%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22punkt%22%20name%3D%22Punkt%20Tokenizer%20Models%22%20author%3D%22Jan%20Strunk%22%20languages%3D%22Czech%2C%20Danish%2C%20Dutch%2C%20English%2C%20Estonian%2C%20Finnish%2C%20French%2C%20German%2C%20Greek%2C%20Italian%2C%20Malayalam%2C%20Norwegian%2C%20Polish%2C%20Portuguese%2C%20Russian%2C%20Slovene%2C%20Spanish%2C%20Swedish%2C%20Turkish%22%20unzip%3D%221%22%20unzipped_size%3D%2237245719%22%20size%3D%2213905355%22%20checksum%3D%228dd1d8760a0976f96e5c262decd75165%22%20subdir%3D%22tokenizers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftokenizers%2Fpunkt.zip%22%20%2F%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22averaged_perceptron_tagger%22%20name%3D%22Averaged%20Perceptron%20Tagger%22%20languages%3D%22English%22%20unzip%3D%221%22%20unzipped_size%3D%226138625%22%20size%3D%222526731%22%20checksum%3D%2205c91d607ee1043181233365b3f76978%22%20subdir%3D%22taggers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftaggers%2Faveraged_perceptron_tagger.zip%22%20%2F%3E%0D%0A%20%20%3C%2Fpackages%3E%0D%0A%20%20%3Ccollections%3E%0D%0A%20%20%3C%2Fcollections%3E%0D%0A%3C%2Fnltk_data%3E") nltk.download('punkt') nltk.download('averaged_perceptron_tagger') """ @@ -40,16 +45,27 @@ def 
install_tesseract_and_poppler(connector_container: Container) -> Container: """ connector_container = connector_container.with_exec( - ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr poppler-utils"], skip_entrypoint=True + ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1"], skip_entrypoint=True ) return connector_container +async def pre_connector_install(connector_container: Container) -> Container: + """ + Handles pre-installation setup for the connector by installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. + + These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. + """ + + # Install Tesseract and Poppler + connector_container = install_tesseract_and_poppler(connector_container) + + return connector_container + async def post_connector_install(connector_container: Container) -> Container: """ - Handles post-installation setup for the connector by setting up nltk and - installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. + Handles post-installation setup for the connector by setting up nltk. These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. """ @@ -57,7 +73,4 @@ async def post_connector_install(connector_container: Container) -> Container: # Setup nltk in the container connector_container = setup_nltk(connector_container) - # Install Tesseract and Poppler - connector_container = install_tesseract_and_poppler(connector_container) - return connector_container diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py deleted file mode 120000 index 6c492f9a9347c..0000000000000 --- a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py +++ /dev/null @@ -1 +0,0 @@ -../../../airbyte-cdk/python/file_based_build_customization.py \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py new file mode 100644 index 0000000000000..c524f9f9bfc2c --- /dev/null +++ b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py @@ -0,0 +1,76 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from __future__ import annotations + +import textwrap +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from dagger import Container + + +def setup_nltk(connector_container: Container) -> Container: + """ + Seeds the connector with nltk data at build time. This is because the nltk data + is large and takes some time to download. It runs a python script that downloads + the data following connector installation. + + The data is cached to the images /root/nltk_data directory. 
+ """ + + nltk_python_script = textwrap.dedent( + """ + import nltk + + # inline the index url to make the build reproduceable by pinning the exact version of the nltk packages that are downloaded + downloader = nltk.downloader.Downloader(server_index_url="data:text/xml;charset=utf-8,%3C%3Fxml%20version%3D%221.0%22%3F%3E%0D%0A%3C%3Fxml-stylesheet%20href%3D%22index.xsl%22%20type%3D%22text%2Fxsl%22%3F%3E%0D%0A%3Cnltk_data%3E%0D%0A%20%20%3Cpackages%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22punkt%22%20name%3D%22Punkt%20Tokenizer%20Models%22%20author%3D%22Jan%20Strunk%22%20languages%3D%22Czech%2C%20Danish%2C%20Dutch%2C%20English%2C%20Estonian%2C%20Finnish%2C%20French%2C%20German%2C%20Greek%2C%20Italian%2C%20Malayalam%2C%20Norwegian%2C%20Polish%2C%20Portuguese%2C%20Russian%2C%20Slovene%2C%20Spanish%2C%20Swedish%2C%20Turkish%22%20unzip%3D%221%22%20unzipped_size%3D%2237245719%22%20size%3D%2213905355%22%20checksum%3D%228dd1d8760a0976f96e5c262decd75165%22%20subdir%3D%22tokenizers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftokenizers%2Fpunkt.zip%22%20%2F%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22averaged_perceptron_tagger%22%20name%3D%22Averaged%20Perceptron%20Tagger%22%20languages%3D%22English%22%20unzip%3D%221%22%20unzipped_size%3D%226138625%22%20size%3D%222526731%22%20checksum%3D%2205c91d607ee1043181233365b3f76978%22%20subdir%3D%22taggers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftaggers%2Faveraged_perceptron_tagger.zip%22%20%2F%3E%0D%0A%20%20%3C%2Fpackages%3E%0D%0A%20%20%3Ccollections%3E%0D%0A%20%20%3C%2Fcollections%3E%0D%0A%3C%2Fnltk_data%3E") + nltk.download('punkt') + nltk.download('averaged_perceptron_tagger') + """ + ) + connector_container = ( + connector_container.with_new_file("/tmp/nltk_python_script.py", nltk_python_script) + .with_exec(["python", "/tmp/nltk_python_script.py"], skip_entrypoint=True) + .with_exec(["rm", "/tmp/nltk_python_script.py"], skip_entrypoint=True) + ) + + return connector_container + + +def install_tesseract_and_poppler(connector_container: Container) -> Container: + """ + Installs Tesseract-OCR and Poppler-utils in the container. These tools are necessary for + OCR (Optical Character Recognition) processes and working with PDFs, respectively. + """ + + connector_container = connector_container.with_exec( + ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1"], skip_entrypoint=True + ) + + return connector_container + +async def pre_connector_install(connector_container: Container) -> Container: + """ + Handles pre-installation setup for the connector by installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. + + These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. + """ + + # Install Tesseract and Poppler + connector_container = install_tesseract_and_poppler(connector_container) + + return connector_container + + +async def post_connector_install(connector_container: Container) -> Container: + """ + Handles post-installation setup for the connector by setting up nltk. + + These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. 
+ """ + + # Setup nltk in the container + connector_container = setup_nltk(connector_container) + + return connector_container diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/setup.py b/airbyte-integrations/connectors/source-azure-blob-storage/setup.py index 644c91229c8d4..ed2ef81f9194c 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/setup.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/setup.py @@ -6,18 +6,9 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk>=0.52.5", + "airbyte-cdk[file-based]>=0.52.5", "smart_open[azure]", "pytz", - "fastavro==1.4.11", - "pyarrow", - "unstructured==0.10.19", - "pdf2image==1.16.3", - "pdfminer.six==20221105", - "unstructured[docx]==0.10.19", - "unstructured.pytesseract>=0.3.12", - "pytesseract==0.3.10", - "markdown", ] TEST_REQUIREMENTS = ["requests-mock~=1.9.3", "pytest-mock~=3.6.1", "pytest~=6.2"] diff --git a/airbyte-integrations/connectors/source-s3/build_customization.py b/airbyte-integrations/connectors/source-s3/build_customization.py deleted file mode 120000 index 6c492f9a9347c..0000000000000 --- a/airbyte-integrations/connectors/source-s3/build_customization.py +++ /dev/null @@ -1 +0,0 @@ -../../../airbyte-cdk/python/file_based_build_customization.py \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-s3/build_customization.py b/airbyte-integrations/connectors/source-s3/build_customization.py new file mode 100644 index 0000000000000..c524f9f9bfc2c --- /dev/null +++ b/airbyte-integrations/connectors/source-s3/build_customization.py @@ -0,0 +1,76 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +from __future__ import annotations + +import textwrap +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from dagger import Container + + +def setup_nltk(connector_container: Container) -> Container: + """ + Seeds the connector with nltk data at build time. This is because the nltk data + is large and takes some time to download. It runs a python script that downloads + the data following connector installation. + + The data is cached to the images /root/nltk_data directory. 
+ """ + + nltk_python_script = textwrap.dedent( + """ + import nltk + + # inline the index url to make the build reproduceable by pinning the exact version of the nltk packages that are downloaded + downloader = nltk.downloader.Downloader(server_index_url="data:text/xml;charset=utf-8,%3C%3Fxml%20version%3D%221.0%22%3F%3E%0D%0A%3C%3Fxml-stylesheet%20href%3D%22index.xsl%22%20type%3D%22text%2Fxsl%22%3F%3E%0D%0A%3Cnltk_data%3E%0D%0A%20%20%3Cpackages%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22punkt%22%20name%3D%22Punkt%20Tokenizer%20Models%22%20author%3D%22Jan%20Strunk%22%20languages%3D%22Czech%2C%20Danish%2C%20Dutch%2C%20English%2C%20Estonian%2C%20Finnish%2C%20French%2C%20German%2C%20Greek%2C%20Italian%2C%20Malayalam%2C%20Norwegian%2C%20Polish%2C%20Portuguese%2C%20Russian%2C%20Slovene%2C%20Spanish%2C%20Swedish%2C%20Turkish%22%20unzip%3D%221%22%20unzipped_size%3D%2237245719%22%20size%3D%2213905355%22%20checksum%3D%228dd1d8760a0976f96e5c262decd75165%22%20subdir%3D%22tokenizers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftokenizers%2Fpunkt.zip%22%20%2F%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22averaged_perceptron_tagger%22%20name%3D%22Averaged%20Perceptron%20Tagger%22%20languages%3D%22English%22%20unzip%3D%221%22%20unzipped_size%3D%226138625%22%20size%3D%222526731%22%20checksum%3D%2205c91d607ee1043181233365b3f76978%22%20subdir%3D%22taggers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftaggers%2Faveraged_perceptron_tagger.zip%22%20%2F%3E%0D%0A%20%20%3C%2Fpackages%3E%0D%0A%20%20%3Ccollections%3E%0D%0A%20%20%3C%2Fcollections%3E%0D%0A%3C%2Fnltk_data%3E") + nltk.download('punkt') + nltk.download('averaged_perceptron_tagger') + """ + ) + connector_container = ( + connector_container.with_new_file("/tmp/nltk_python_script.py", nltk_python_script) + .with_exec(["python", "/tmp/nltk_python_script.py"], skip_entrypoint=True) + .with_exec(["rm", "/tmp/nltk_python_script.py"], skip_entrypoint=True) + ) + + return connector_container + + +def install_tesseract_and_poppler(connector_container: Container) -> Container: + """ + Installs Tesseract-OCR and Poppler-utils in the container. These tools are necessary for + OCR (Optical Character Recognition) processes and working with PDFs, respectively. + """ + + connector_container = connector_container.with_exec( + ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1"], skip_entrypoint=True + ) + + return connector_container + +async def pre_connector_install(connector_container: Container) -> Container: + """ + Handles pre-installation setup for the connector by installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. + + These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. + """ + + # Install Tesseract and Poppler + connector_container = install_tesseract_and_poppler(connector_container) + + return connector_container + + +async def post_connector_install(connector_container: Container) -> Container: + """ + Handles post-installation setup for the connector by setting up nltk. + + These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. 
+ """ + + # Setup nltk in the container + connector_container = setup_nltk(connector_container) + + return connector_container diff --git a/airbyte-integrations/connectors/source-s3/integration_tests/spec.json b/airbyte-integrations/connectors/source-s3/integration_tests/spec.json index 47d59b8f26681..0ef7e9a3f5aaa 100644 --- a/airbyte-integrations/connectors/source-s3/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-s3/integration_tests/spec.json @@ -21,7 +21,7 @@ "order": 10, "type": "array", "items": { - "title": "BasedStreamConfig", + "title": "FileBasedStreamConfig", "type": "object", "properties": { "name": { diff --git a/airbyte-integrations/connectors/source-s3/setup.py b/airbyte-integrations/connectors/source-s3/setup.py index 8cf826ee3a32a..8b1a07359a396 100644 --- a/airbyte-integrations/connectors/source-s3/setup.py +++ b/airbyte-integrations/connectors/source-s3/setup.py @@ -6,21 +6,12 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk>=0.52.5", - "pyarrow==12.0.1", + "airbyte-cdk[file-based]>=0.52.5", "smart-open[s3]==5.1.0", "wcmatch==8.4", "dill==0.3.4", "pytz", - "fastavro==1.4.11", "python-snappy==0.6.1", - "unstructured==0.10.19", - "pdf2image==1.16.3", - "pdfminer.six==20221105", - "unstructured[docx]==0.10.19", - "unstructured.pytesseract>=0.3.12", - "pytesseract==0.3.10", - "markdown", ] TEST_REQUIREMENTS = [ From 287c5403882f3a8cd2811bcc1e537c03639c6f34 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 27 Oct 2023 15:56:23 +0200 Subject: [PATCH 06/10] fix tests --- airbyte-cdk/python/file_based_build_customization.py | 1 + .../connectors/source-azure-blob-storage/build_customization.py | 1 + airbyte-integrations/connectors/source-s3/build_customization.py | 1 + airbyte-integrations/connectors/source-s3/setup.py | 1 - 4 files changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte-cdk/python/file_based_build_customization.py b/airbyte-cdk/python/file_based_build_customization.py index c524f9f9bfc2c..84c0df037e455 100644 --- a/airbyte-cdk/python/file_based_build_customization.py +++ b/airbyte-cdk/python/file_based_build_customization.py @@ -50,6 +50,7 @@ def install_tesseract_and_poppler(connector_container: Container) -> Container: return connector_container + async def pre_connector_install(connector_container: Container) -> Container: """ Handles pre-installation setup for the connector by installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py index c524f9f9bfc2c..84c0df037e455 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py @@ -50,6 +50,7 @@ def install_tesseract_and_poppler(connector_container: Container) -> Container: return connector_container + async def pre_connector_install(connector_container: Container) -> Container: """ Handles pre-installation setup for the connector by installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. 
diff --git a/airbyte-integrations/connectors/source-s3/build_customization.py b/airbyte-integrations/connectors/source-s3/build_customization.py index c524f9f9bfc2c..84c0df037e455 100644 --- a/airbyte-integrations/connectors/source-s3/build_customization.py +++ b/airbyte-integrations/connectors/source-s3/build_customization.py @@ -50,6 +50,7 @@ def install_tesseract_and_poppler(connector_container: Container) -> Container: return connector_container + async def pre_connector_install(connector_container: Container) -> Container: """ Handles pre-installation setup for the connector by installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. diff --git a/airbyte-integrations/connectors/source-s3/setup.py b/airbyte-integrations/connectors/source-s3/setup.py index 8b1a07359a396..b7afc2e99448c 100644 --- a/airbyte-integrations/connectors/source-s3/setup.py +++ b/airbyte-integrations/connectors/source-s3/setup.py @@ -23,7 +23,6 @@ "pytest-order", "netifaces~=0.11.0", "docker", - "avro==1.11.0", ] setup( From 6f8ca09bed894217a5145433c5f5047033f8bde2 Mon Sep 17 00:00:00 2001 From: alafanechere Date: Mon, 30 Oct 2023 11:55:15 +0100 Subject: [PATCH 07/10] rm build_customization.py and use python-connector-base-image:1.1.0 --- .../python/file_based_build_customization.py | 77 ------------------- .../build_customization.py | 77 ------------------- .../source-azure-blob-storage/metadata.yaml | 2 +- .../source-s3/build_customization.py | 77 ------------------- .../connectors/source-s3/metadata.yaml | 2 +- 5 files changed, 2 insertions(+), 233 deletions(-) delete mode 100644 airbyte-cdk/python/file_based_build_customization.py delete mode 100644 airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py delete mode 100644 airbyte-integrations/connectors/source-s3/build_customization.py diff --git a/airbyte-cdk/python/file_based_build_customization.py b/airbyte-cdk/python/file_based_build_customization.py deleted file mode 100644 index 84c0df037e455..0000000000000 --- a/airbyte-cdk/python/file_based_build_customization.py +++ /dev/null @@ -1,77 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -# -from __future__ import annotations - -import textwrap -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from dagger import Container - - -def setup_nltk(connector_container: Container) -> Container: - """ - Seeds the connector with nltk data at build time. This is because the nltk data - is large and takes some time to download. It runs a python script that downloads - the data following connector installation. - - The data is cached to the images /root/nltk_data directory. 
- """ - - nltk_python_script = textwrap.dedent( - """ - import nltk - - # inline the index url to make the build reproduceable by pinning the exact version of the nltk packages that are downloaded - downloader = nltk.downloader.Downloader(server_index_url="data:text/xml;charset=utf-8,%3C%3Fxml%20version%3D%221.0%22%3F%3E%0D%0A%3C%3Fxml-stylesheet%20href%3D%22index.xsl%22%20type%3D%22text%2Fxsl%22%3F%3E%0D%0A%3Cnltk_data%3E%0D%0A%20%20%3Cpackages%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22punkt%22%20name%3D%22Punkt%20Tokenizer%20Models%22%20author%3D%22Jan%20Strunk%22%20languages%3D%22Czech%2C%20Danish%2C%20Dutch%2C%20English%2C%20Estonian%2C%20Finnish%2C%20French%2C%20German%2C%20Greek%2C%20Italian%2C%20Malayalam%2C%20Norwegian%2C%20Polish%2C%20Portuguese%2C%20Russian%2C%20Slovene%2C%20Spanish%2C%20Swedish%2C%20Turkish%22%20unzip%3D%221%22%20unzipped_size%3D%2237245719%22%20size%3D%2213905355%22%20checksum%3D%228dd1d8760a0976f96e5c262decd75165%22%20subdir%3D%22tokenizers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftokenizers%2Fpunkt.zip%22%20%2F%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22averaged_perceptron_tagger%22%20name%3D%22Averaged%20Perceptron%20Tagger%22%20languages%3D%22English%22%20unzip%3D%221%22%20unzipped_size%3D%226138625%22%20size%3D%222526731%22%20checksum%3D%2205c91d607ee1043181233365b3f76978%22%20subdir%3D%22taggers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftaggers%2Faveraged_perceptron_tagger.zip%22%20%2F%3E%0D%0A%20%20%3C%2Fpackages%3E%0D%0A%20%20%3Ccollections%3E%0D%0A%20%20%3C%2Fcollections%3E%0D%0A%3C%2Fnltk_data%3E") - nltk.download('punkt') - nltk.download('averaged_perceptron_tagger') - """ - ) - connector_container = ( - connector_container.with_new_file("/tmp/nltk_python_script.py", nltk_python_script) - .with_exec(["python", "/tmp/nltk_python_script.py"], skip_entrypoint=True) - .with_exec(["rm", "/tmp/nltk_python_script.py"], skip_entrypoint=True) - ) - - return connector_container - - -def install_tesseract_and_poppler(connector_container: Container) -> Container: - """ - Installs Tesseract-OCR and Poppler-utils in the container. These tools are necessary for - OCR (Optical Character Recognition) processes and working with PDFs, respectively. - """ - - connector_container = connector_container.with_exec( - ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1"], skip_entrypoint=True - ) - - return connector_container - - -async def pre_connector_install(connector_container: Container) -> Container: - """ - Handles pre-installation setup for the connector by installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. - - These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. - """ - - # Install Tesseract and Poppler - connector_container = install_tesseract_and_poppler(connector_container) - - return connector_container - - -async def post_connector_install(connector_container: Container) -> Container: - """ - Handles post-installation setup for the connector by setting up nltk. - - These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. 
- """ - - # Setup nltk in the container - connector_container = setup_nltk(connector_container) - - return connector_container diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py b/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py deleted file mode 100644 index 84c0df037e455..0000000000000 --- a/airbyte-integrations/connectors/source-azure-blob-storage/build_customization.py +++ /dev/null @@ -1,77 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -# -from __future__ import annotations - -import textwrap -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from dagger import Container - - -def setup_nltk(connector_container: Container) -> Container: - """ - Seeds the connector with nltk data at build time. This is because the nltk data - is large and takes some time to download. It runs a python script that downloads - the data following connector installation. - - The data is cached to the images /root/nltk_data directory. - """ - - nltk_python_script = textwrap.dedent( - """ - import nltk - - # inline the index url to make the build reproduceable by pinning the exact version of the nltk packages that are downloaded - downloader = nltk.downloader.Downloader(server_index_url="data:text/xml;charset=utf-8,%3C%3Fxml%20version%3D%221.0%22%3F%3E%0D%0A%3C%3Fxml-stylesheet%20href%3D%22index.xsl%22%20type%3D%22text%2Fxsl%22%3F%3E%0D%0A%3Cnltk_data%3E%0D%0A%20%20%3Cpackages%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22punkt%22%20name%3D%22Punkt%20Tokenizer%20Models%22%20author%3D%22Jan%20Strunk%22%20languages%3D%22Czech%2C%20Danish%2C%20Dutch%2C%20English%2C%20Estonian%2C%20Finnish%2C%20French%2C%20German%2C%20Greek%2C%20Italian%2C%20Malayalam%2C%20Norwegian%2C%20Polish%2C%20Portuguese%2C%20Russian%2C%20Slovene%2C%20Spanish%2C%20Swedish%2C%20Turkish%22%20unzip%3D%221%22%20unzipped_size%3D%2237245719%22%20size%3D%2213905355%22%20checksum%3D%228dd1d8760a0976f96e5c262decd75165%22%20subdir%3D%22tokenizers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftokenizers%2Fpunkt.zip%22%20%2F%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22averaged_perceptron_tagger%22%20name%3D%22Averaged%20Perceptron%20Tagger%22%20languages%3D%22English%22%20unzip%3D%221%22%20unzipped_size%3D%226138625%22%20size%3D%222526731%22%20checksum%3D%2205c91d607ee1043181233365b3f76978%22%20subdir%3D%22taggers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftaggers%2Faveraged_perceptron_tagger.zip%22%20%2F%3E%0D%0A%20%20%3C%2Fpackages%3E%0D%0A%20%20%3Ccollections%3E%0D%0A%20%20%3C%2Fcollections%3E%0D%0A%3C%2Fnltk_data%3E") - nltk.download('punkt') - nltk.download('averaged_perceptron_tagger') - """ - ) - connector_container = ( - connector_container.with_new_file("/tmp/nltk_python_script.py", nltk_python_script) - .with_exec(["python", "/tmp/nltk_python_script.py"], skip_entrypoint=True) - .with_exec(["rm", "/tmp/nltk_python_script.py"], skip_entrypoint=True) - ) - - return connector_container - - -def install_tesseract_and_poppler(connector_container: Container) -> Container: - """ - Installs Tesseract-OCR and Poppler-utils in the container. These tools are necessary for - OCR (Optical Character Recognition) processes and working with PDFs, respectively. 
- """ - - connector_container = connector_container.with_exec( - ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1"], skip_entrypoint=True - ) - - return connector_container - - -async def pre_connector_install(connector_container: Container) -> Container: - """ - Handles pre-installation setup for the connector by installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. - - These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. - """ - - # Install Tesseract and Poppler - connector_container = install_tesseract_and_poppler(connector_container) - - return connector_container - - -async def post_connector_install(connector_container: Container) -> Container: - """ - Handles post-installation setup for the connector by setting up nltk. - - These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. - """ - - # Setup nltk in the container - connector_container = setup_nltk(connector_container) - - return connector_container diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml b/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml index a28d503c492c3..2a60ff3eb9dc5 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml +++ b/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml @@ -3,7 +3,7 @@ data: ql: 100 sl: 100 connectorBuildOptions: - baseImage: docker.io/airbyte/python-connector-base:1.1.0@sha256:bd98f6505c6764b1b5f99d3aedc23dfc9e9af631a62533f60eb32b1d3dbab20c + baseImage: docker.io/airbyte/python-connector-base:1.2.0@sha256:c22a9d97464b69d6ef01898edf3f8612dc11614f05a84984451dde195f337db9 connectorSubtype: file connectorType: source definitionId: fdaaba68-4875-4ed9-8fcd-4ae1e0a25093 diff --git a/airbyte-integrations/connectors/source-s3/build_customization.py b/airbyte-integrations/connectors/source-s3/build_customization.py deleted file mode 100644 index 84c0df037e455..0000000000000 --- a/airbyte-integrations/connectors/source-s3/build_customization.py +++ /dev/null @@ -1,77 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -# -from __future__ import annotations - -import textwrap -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from dagger import Container - - -def setup_nltk(connector_container: Container) -> Container: - """ - Seeds the connector with nltk data at build time. This is because the nltk data - is large and takes some time to download. It runs a python script that downloads - the data following connector installation. - - The data is cached to the images /root/nltk_data directory. 
- """ - - nltk_python_script = textwrap.dedent( - """ - import nltk - - # inline the index url to make the build reproduceable by pinning the exact version of the nltk packages that are downloaded - downloader = nltk.downloader.Downloader(server_index_url="data:text/xml;charset=utf-8,%3C%3Fxml%20version%3D%221.0%22%3F%3E%0D%0A%3C%3Fxml-stylesheet%20href%3D%22index.xsl%22%20type%3D%22text%2Fxsl%22%3F%3E%0D%0A%3Cnltk_data%3E%0D%0A%20%20%3Cpackages%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22punkt%22%20name%3D%22Punkt%20Tokenizer%20Models%22%20author%3D%22Jan%20Strunk%22%20languages%3D%22Czech%2C%20Danish%2C%20Dutch%2C%20English%2C%20Estonian%2C%20Finnish%2C%20French%2C%20German%2C%20Greek%2C%20Italian%2C%20Malayalam%2C%20Norwegian%2C%20Polish%2C%20Portuguese%2C%20Russian%2C%20Slovene%2C%20Spanish%2C%20Swedish%2C%20Turkish%22%20unzip%3D%221%22%20unzipped_size%3D%2237245719%22%20size%3D%2213905355%22%20checksum%3D%228dd1d8760a0976f96e5c262decd75165%22%20subdir%3D%22tokenizers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftokenizers%2Fpunkt.zip%22%20%2F%3E%0D%0A%20%20%20%20%3Cpackage%20id%3D%22averaged_perceptron_tagger%22%20name%3D%22Averaged%20Perceptron%20Tagger%22%20languages%3D%22English%22%20unzip%3D%221%22%20unzipped_size%3D%226138625%22%20size%3D%222526731%22%20checksum%3D%2205c91d607ee1043181233365b3f76978%22%20subdir%3D%22taggers%22%20url%3D%22https%3A%2F%2Fgithub.com%2Fnltk%2Fnltk_data%2Fraw%2F5db857e6f7df11eabb5e5665836db9ec8df07e28%2Fpackages%2Ftaggers%2Faveraged_perceptron_tagger.zip%22%20%2F%3E%0D%0A%20%20%3C%2Fpackages%3E%0D%0A%20%20%3Ccollections%3E%0D%0A%20%20%3C%2Fcollections%3E%0D%0A%3C%2Fnltk_data%3E") - nltk.download('punkt') - nltk.download('averaged_perceptron_tagger') - """ - ) - connector_container = ( - connector_container.with_new_file("/tmp/nltk_python_script.py", nltk_python_script) - .with_exec(["python", "/tmp/nltk_python_script.py"], skip_entrypoint=True) - .with_exec(["rm", "/tmp/nltk_python_script.py"], skip_entrypoint=True) - ) - - return connector_container - - -def install_tesseract_and_poppler(connector_container: Container) -> Container: - """ - Installs Tesseract-OCR and Poppler-utils in the container. These tools are necessary for - OCR (Optical Character Recognition) processes and working with PDFs, respectively. - """ - - connector_container = connector_container.with_exec( - ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1"], skip_entrypoint=True - ) - - return connector_container - - -async def pre_connector_install(connector_container: Container) -> Container: - """ - Handles pre-installation setup for the connector by installing necessary system dependencies such as Tesseract-OCR and Poppler-utils. - - These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. - """ - - # Install Tesseract and Poppler - connector_container = install_tesseract_and_poppler(connector_container) - - return connector_container - - -async def post_connector_install(connector_container: Container) -> Container: - """ - Handles post-installation setup for the connector by setting up nltk. - - These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector. 
- """ - - # Setup nltk in the container - connector_container = setup_nltk(connector_container) - - return connector_container diff --git a/airbyte-integrations/connectors/source-s3/metadata.yaml b/airbyte-integrations/connectors/source-s3/metadata.yaml index d00af9105a1c4..1ce5e178792f2 100644 --- a/airbyte-integrations/connectors/source-s3/metadata.yaml +++ b/airbyte-integrations/connectors/source-s3/metadata.yaml @@ -6,7 +6,7 @@ data: hosts: - "*.s3.amazonaws.com" connectorBuildOptions: - baseImage: docker.io/airbyte/python-connector-base:1.1.0@sha256:bd98f6505c6764b1b5f99d3aedc23dfc9e9af631a62533f60eb32b1d3dbab20c + baseImage: docker.io/airbyte/python-connector-base:1.2.0@sha256:c22a9d97464b69d6ef01898edf3f8612dc11614f05a84984451dde195f337db9 connectorSubtype: file connectorType: source definitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2 From 57635c995a719870c7ca970ae7e37d3d614575b8 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Mon, 30 Oct 2023 12:45:29 +0100 Subject: [PATCH 08/10] add acceptance test for unstructured in azure blob storage --- .../source-azure-blob-storage/acceptance-test-config.yml | 5 +++++ .../integration_tests/expected_records/unstructured.jsonl | 2 ++ 2 files changed, 7 insertions(+) create mode 100644 airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/expected_records/unstructured.jsonl diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/acceptance-test-config.yml b/airbyte-integrations/connectors/source-azure-blob-storage/acceptance-test-config.yml index 8a06b3818f0fa..e15f5f60be541 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/acceptance-test-config.yml +++ b/airbyte-integrations/connectors/source-azure-blob-storage/acceptance-test-config.yml @@ -53,6 +53,11 @@ acceptance_tests: expect_records: path: integration_tests/expected_records/jsonl_newlines.jsonl exact_order: true + - config_path: secrets/unstructured_config.json + expect_records: + path: integration_tests/expected_records/unstructured.jsonl + exact_order: true + timeout_seconds: 1800 connection: tests: - config_path: secrets/config.json diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/expected_records/unstructured.jsonl b/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/expected_records/unstructured.jsonl new file mode 100644 index 0000000000000..29dcaa565ccc8 --- /dev/null +++ b/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/expected_records/unstructured.jsonl @@ -0,0 +1,2 @@ +{"stream": "airbyte-source-azure-blob-storage-test", "data": {"content": "# Heading\n\nThis is the content which is not just a single word", "document_key": "Testdoc.pdf", "_ab_source_file_last_modified": "2023-10-30T11:38:48.000000Z", "_ab_source_file_url": "Testdoc.pdf"}, "emitted_at": 1698666216334} +{"stream": "airbyte-source-azure-blob-storage-test", "data": {"content": "This is a test", "document_key": "Testdoc_OCR.pdf", "_ab_source_file_last_modified": "2023-10-30T11:38:48.000000Z", "_ab_source_file_url": "Testdoc_OCR.pdf"}, "emitted_at": 1698666218048} \ No newline at end of file From be560160dc7bc9067bafc9ac26ab13b53ce58bb6 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Mon, 30 Oct 2023 12:46:32 +0100 Subject: [PATCH 09/10] revert readme change --- airbyte-cdk/python/README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/airbyte-cdk/python/README.md b/airbyte-cdk/python/README.md index a69f44bf9a850..c3ac3221b6222 100644 --- 
a/airbyte-cdk/python/README.md +++ b/airbyte-cdk/python/README.md @@ -150,12 +150,6 @@ HTTP requests to `localhost:8113/data` should now return the body defined in the 1. Open a PR 2. Once it is approved and **merged**, an Airbyte member must run the `Publish CDK Manually` workflow from master using `release-type=major|manor|patch` and setting the changelog message. -#### File-based CDK - -A subset of the CDK is dedicated to sources that have the notion of files. It's located in `airbyte-cdk/sources/file_based`. When using this part of the CDK, install the CDK using the `file-based` extra: `pip install airbyte-cdk[file-based]`. - -As the `unstructured` parser of the file based CDK requires some native dependencies to be installed, copy the `file_based_build_customization.py` file into the connector as `build_customization.py`. - ## Coming Soon * Full OAuth 2.0 support \(including refresh token issuing flow via UI or CLI\) From 9bc621a60b8f1d44661f8ae3c24986712285ef9d Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Mon, 30 Oct 2023 15:35:38 +0100 Subject: [PATCH 10/10] add pptx support --- .../source-azure-blob-storage/integration_tests/spec.json | 2 +- .../connectors/source-azure-blob-storage/setup.py | 2 +- .../connectors/source-s3/integration_tests/spec.json | 2 +- airbyte-integrations/connectors/source-s3/setup.py | 2 +- docs/integrations/sources/azure-blob-storage.md | 6 +++--- docs/integrations/sources/s3.md | 6 +++--- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json b/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json index c8cef33ff4d90..1ecff3aa29edb 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json @@ -280,7 +280,7 @@ "type": "string" } }, - "description": "Extract text from document formats (.pdf, .docx, .md) and emit as one record per file." + "description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file." } ] }, diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/setup.py b/airbyte-integrations/connectors/source-azure-blob-storage/setup.py index ed2ef81f9194c..b2211beb81bb7 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/setup.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk[file-based]>=0.52.5", + "airbyte-cdk[file-based]>=0.52.7", "smart_open[azure]", "pytz", ] diff --git a/airbyte-integrations/connectors/source-s3/integration_tests/spec.json b/airbyte-integrations/connectors/source-s3/integration_tests/spec.json index 0ef7e9a3f5aaa..c97aa81955f18 100644 --- a/airbyte-integrations/connectors/source-s3/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-s3/integration_tests/spec.json @@ -280,7 +280,7 @@ "type": "string" } }, - "description": "Extract text from document formats (.pdf, .docx, .md) and emit as one record per file." + "description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file." 
} ] }, diff --git a/airbyte-integrations/connectors/source-s3/setup.py index b7afc2e99448c..5cf8dd246356b 100644 --- a/airbyte-integrations/connectors/source-s3/setup.py +++ b/airbyte-integrations/connectors/source-s3/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk[file-based]>=0.52.5", + "airbyte-cdk[file-based]>=0.52.7", "smart-open[s3]==5.1.0", "wcmatch==8.4", "dill==0.3.4", diff --git a/docs/integrations/sources/azure-blob-storage.md b/docs/integrations/sources/azure-blob-storage.md index 2c563d05cee1e..782eeef1e5625 100644 --- a/docs/integrations/sources/azure-blob-storage.md +++ b/docs/integrations/sources/azure-blob-storage.md @@ -175,13 +175,13 @@ The Avro parser uses the [Fastavro library](https://fastavro.readthedocs.io/en/l There are currently no options for JSONL parsing. -### Markdown/PDF/Docx Format (Experimental) +### Document File Type Format (Experimental) :::warning -The Markdown/PDF/Docx format is currently an experimental feature and not subject to SLAs. Use at your own risk. +The Document File Type Format is currently an experimental feature and not subject to SLAs. Use at your own risk. ::: -The Markdown/PDF/Docx format is a special format that allows you to extract text from Markdown, PDF, and Word documents. If selected, the connector will extract text from the documents and output it as a single field named `content`. The `document_key` field will hold a unique identifier for the processed file which can be used as a primary key. The content of the document will contain markdown formatting converted from the original file format. Each file matching the defined glob pattern needs to either be a markdown (`md`), PDF (`pdf`) or Docx (`docx`) file. +The Document File Type Format is a special format that allows you to extract text from Markdown, PDF, Word, and PowerPoint documents. If selected, the connector will extract text from the documents and output it as a single field named `content`. The `document_key` field will hold a unique identifier for the processed file, which can be used as a primary key. The content of the document will contain markdown formatting converted from the original file format. Each file matching the defined glob pattern needs to be either a Markdown (`md`), PDF (`pdf`), Word (`docx`), or PowerPoint (`pptx`) file. One record will be emitted for each document. Keep in mind that large files can emit large records that might not fit into every destination, as each destination has different limitations for string fields. diff --git a/docs/integrations/sources/s3.md b/docs/integrations/sources/s3.md index 3fc955691b3c7..f48e22cc557dc 100644 --- a/docs/integrations/sources/s3.md +++ b/docs/integrations/sources/s3.md @@ -238,13 +238,13 @@ The Avro parser uses the [Fastavro library](https://fastavro.readthedocs.io/en/l There are currently no options for JSONL parsing. -### Markdown/PDF/Docx Format (Experimental) +### Document File Type Format (Experimental) :::warning -The Markdown/PDF/Docx format is currently an experimental feature and not subject to SLAs. Use at your own risk. +The Document File Type Format is currently an experimental feature and not subject to SLAs. Use at your own risk. ::: -The Markdown/PDF/Docx format is a special format that allows you to extract text from Markdown, PDF, and Word documents. If selected, the connector will extract text from the documents and output it as a single field named `content`.
The `document_key` field will hold a unique identifier for the processed file which can be used as a primary key. The content of the document will contain markdown formatting converted from the original file format. Each file matching the defined glob pattern needs to either be a markdown (`md`), PDF (`pdf`) or Docx (`docx`) file. +The Document File Type Format is a special format that allows you to extract text from Markdown, PDF, Word, and PowerPoint documents. If selected, the connector will extract text from the documents and output it as a single field named `content`. The `document_key` field will hold a unique identifier for the processed file, which can be used as a primary key. The content of the document will contain markdown formatting converted from the original file format. Each file matching the defined glob pattern needs to be either a Markdown (`md`), PDF (`pdf`), Word (`docx`), or PowerPoint (`pptx`) file. One record will be emitted for each document. Keep in mind that large files can emit large records that might not fit into every destination, as each destination has different limitations for string fields.
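To make the record shape concrete, below is a small illustrative sketch of one record emitted by the Document File Type Format, combining the `content` and `document_key` fields described in the docs above with the standard file-based metadata columns that appear in the acceptance-test expected records. The file name, extracted text, and timestamp are invented placeholder values, not output from a real sync:

```json
{
  "content": "# Example Heading\n\nText extracted from the source document, rendered as markdown.",
  "document_key": "reports/example.pdf",
  "_ab_source_file_last_modified": "2023-10-30T00:00:00.000000Z",
  "_ab_source_file_url": "reports/example.pdf"
}
```

Each file matching the glob pattern yields exactly one such record, so a very large source document produces a correspondingly large `content` string.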