S3 and Azure Blob Storage: Update File CDK to support document file types #31904

Merged (13 commits, Oct 31, 2023)
6 changes: 6 additions & 0 deletions airbyte-cdk/python/README.md
@@ -150,6 +150,12 @@ HTTP requests to `localhost:8113/data` should now return the body defined in the
1. Open a PR
2. Once it is approved and **merged**, an Airbyte member must run the `Publish CDK Manually` workflow from master using `release-type=major|minor|patch` and setting the changelog message.

#### File-based CDK

A subset of the CDK is dedicated to sources that have the notion of files. It's located in `airbyte-cdk/sources/file_based`. When using this part of the CDK, install the CDK using the `file-based` extra: `pip install airbyte-cdk[file-based]`.

Because the `unstructured` parser of the file-based CDK requires some native dependencies to be installed, link the `file_based_build_customization.py` file into the connector as `build_customization.py`.

## Coming Soon

* Full OAuth 2.0 support \(including refresh token issuing flow via UI or CLI\)
63 changes: 63 additions & 0 deletions airbyte-cdk/python/file_based_build_customization.py
Contributor:

A question about the symlink and git:
Do you see the connector's build_customization.py (a symlink) in the git diff when you modify the root file_based_build_customization.py?

As of today we detect connector changes based on changes in their folder with a git diff command. It conditionally triggers our CI and connector tests. I think we would like to keep this behavior.

In other words:
When you change file_based_build_customization.py, do you want to release a new connector version for all the connectors using it, or do you want to decouple the hook changes from the connector release?

Contributor Author:

@alafanechere CDK changes are currently decoupled from connector changes - if a connector wants to use changed CDK stuff, the CDK needs to be published and a separate PR bumping the version needs to be opened (like what I'm doing here). It seems correct to me to extend this to the build customization script as well.

So no, this shouldn't automatically trigger a connector release. Seems like this makes the symlink approach less attractive.

@@ -0,0 +1,63 @@
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
from __future__ import annotations

import textwrap
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from dagger import Container


def setup_nltk(connector_container: Container) -> Container:
    """
    Seeds the connector with nltk data at build time. This is because the nltk data
    is large and takes a long time to download. It runs a python script that downloads
    the data following connector installation.
    """

    nltk_python_script = textwrap.dedent(
        """
        import nltk
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        """
    )
    connector_container = (
        connector_container.with_new_file("/tmp/nltk_python_script.py", nltk_python_script)
        .with_exec(["python", "/tmp/nltk_python_script.py"], skip_entrypoint=True)
        .with_exec(["rm", "/tmp/nltk_python_script.py"], skip_entrypoint=True)
    )
Contributor:

Could you comment on which system path the nltk data will be written?

Contributor Author:

Done


    return connector_container


def install_tesseract_and_poppler(connector_container: Container) -> Container:
    """
    Installs Tesseract-OCR and Poppler-utils in the container. These tools are necessary for
    OCR (Optical Character Recognition) processes and working with PDFs, respectively.
    """

    connector_container = connector_container.with_exec(
        ["sh", "-c", "apt-get update && apt-get install -y tesseract-ocr poppler-utils"], skip_entrypoint=True
Contributor:

Can we install specific versions of these utils to maximize build reproducibility?

    )

    return connector_container


async def post_connector_install(connector_container: Container) -> Container:
    """
    Handles post-installation setup for the connector by setting up nltk and
    installing necessary system dependencies such as Tesseract-OCR and Poppler-utils.

    These steps are necessary if the unstructured parser from the file based CDK is exposed in the connector.
    """

    # Setup nltk in the container
    connector_container = setup_nltk(connector_container)
Contributor:

Something interesting to note here, which argues against bundling the nltk data in the base image:

  • this execution would depend on nltk being installed in the base image
  • the result of this execution (the file download) is dynamic; we don't control exactly what's being downloaded

This is why it makes sense for it to be a post_connector_install step.
If we downloaded this data in the base image, we'd have to make sure the nltk version we download it with is the same as the one declared in the CDK...

I'm also a bit concerned that this post_connector_install can't guarantee reproducible results, as we rely on nltk to decide which data gets downloaded.
@flash1293 could we host snapshots of this data in one of our public buckets and make this hook download that data and mount it into the container?

What I mean by "reproducible build" is that we always get the same image when running the build command on the same commit. If nltk's data download is dynamic, we don't have reproducibility.

Contributor Author:

Inlined the nltk data index in the customization script so it points to a raw GitHub URL at that specific commit - I'm fairly confident this is always reproducible (short of the maintainers deleting the repo, and then we have problems anyway).
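
A minimal sketch of that pinning idea, assuming nltk's `Downloader` is pointed at a commit-pinned raw GitHub URL of the nltk_data index (the commit SHA and package list below are placeholders, not necessarily what the PR ships):

```python
from nltk.downloader import Downloader

# Placeholder commit SHA; the real script pins a specific nltk_data commit.
PINNED_INDEX_URL = "https://raw.githubusercontent.com/nltk/nltk_data/<commit-sha>/index.xml"

downloader = Downloader(server_index_url=PINNED_INDEX_URL)
downloader.download("punkt")
downloader.download("averaged_perceptron_tagger")
```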


    # Install Tesseract and Poppler
    connector_container = install_tesseract_and_poppler(connector_container)
Contributor:

Can this be called in a pre_install_hook?
These are system-dependency installations that do not depend on the python package installation, if I'm not mistaken. This would speed up the build, as the apt-get install layers would be cached; with the current implementation we'll re-download and install these tools on any code change.

Contributor Author:

Moved the install to pre_connector_install and pinned the versions
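
A rough sketch of what that change might look like; the hook name follows the pre/post convention discussed in this thread, but the Debian package versions below are illustrative placeholders rather than the pins actually used:

```python
async def pre_connector_install(connector_container: Container) -> Container:
    # Install the OCR/PDF system dependencies before the python package install,
    # so the apt layers stay cached across connector code changes.
    # Version pins are placeholders for illustration only.
    return connector_container.with_exec(
        [
            "sh",
            "-c",
            "apt-get update && apt-get install -y "
            "tesseract-ocr=5.3.0-2 poppler-utils=22.12.0-2+b1",
        ],
        skip_entrypoint=True,
    )
```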


    return connector_container
Contributor:

This build_customization.py is the same as the S3 one, right?

I'd like to think a bit with my team about the best option to avoid code duplication:

A. Install these dependencies in our python-connector-base image. Advantage: simple incremental change. Drawback: all our python connector images will grow in size.
B. Put this build_customization.py file in the file CDK and create symlinks to it in the S3 and Azure connectors. Advantage: code reuse without a base image change. Drawback: it's another edge case for our team to keep in mind.
C. Create a specific base image for file/LLM connectors based on python-connector-base. Advantage: a new, clean, centralized artifact. Drawback: a change in this package means building multiple images for the same connector language.

I'm more inclined toward C, but would recommend B so as not to block you right now.

Contributor Author:

I agree with all of this (B for now, C in a follow-up). We could re-build the base image as part of the CDK publish action (it would still require manually bumping the affected connectors, but I think this would still be helpful).

Contributor:

After thinking a bit more about it, I'm not sure that C is our best long-term bet. I believe it adds complexity, and we could end up in a situation similar to the one we're in today with strict-encrypt connectors. Managing variants is feasible but likely to be cumbersome. The base image would become a dependency of the file base image, and compatibility and version-pinning problems, etc., would come with it.
As of today, I think option A - installing your new system dependencies in the base image - is the one with the lighter long-term maintenance burden, but as I said here, I find it risky to download the nltk data through a python script execution, as we don't have any reproducibility guarantee and would have to maintain nltk version equivalence between the base image and the CDK...

Contributor Author:

Not sure about the best solution, but the symlink thing seems wrong as well. For now I duplicated the file, but happy to go for a better solution

Contributor:

Since build_customization.py is just a py file that can import things, would it be possible for all of the helper methods to live in the file-based CDK and be imported from build_customization.py?

Then pre_connector_install and post_connector_install might be duplicated, but they would be small and declarative: "before installing, `install_tesseract_and_poppler`".
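
A hypothetical sketch of that layout; the CDK module path used in the import is illustrative and does not exist in this PR:

```python
# build_customization.py in the connector: small and declarative,
# delegating to helpers that would ship with the file-based CDK.
from __future__ import annotations

from typing import TYPE_CHECKING

# Illustrative import path; no such module is added by this PR.
from airbyte_cdk.sources.file_based.build_customization import (
    install_tesseract_and_poppler,
    setup_nltk,
)

if TYPE_CHECKING:
    from dagger import Container


async def pre_connector_install(connector_container: Container) -> Container:
    return install_tesseract_and_poppler(connector_container)


async def post_connector_install(connector_container: Container) -> Container:
    return setup_nltk(connector_container)
```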

Contributor:

So here's what we realized could be the best approach, together with @flash1293 (a rough sketch of the compatibility check follows the list):

  • @flash1293's changes mean the CDK now has system dependencies, which is definitely a new thing.
  • These system dependencies can change along with the CDK version.
  • We should bundle the CDK system dependencies in the python connector base image.
  • As system dependencies can change on a CDK version change, we now have a coupling between the CDK version used by a connector and the base image it can use.
  • We should hardcode somewhere a mapping between CDK versions and compatible base image versions.
  • On connector build we should compare the connector's CDK version to the base image version and fail the build if they're not compatible according to our mapping.
  • Connector developers will then have to update the baseImage metadata according to this failure.
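
A minimal sketch of such a check, assuming a hardcoded mapping maintained in airbyte-ci (all version numbers below are illustrative placeholders):

```python
# Illustrative mapping: minimum python-connector-base version required by a CDK version.
CDK_TO_MIN_BASE_IMAGE = {
    "0.52.5": "1.1.0",  # placeholder values
}


def check_base_image_compatibility(cdk_version: str, base_image_version: str) -> None:
    """Fail the connector build if its CDK version requires a newer base image."""
    required = CDK_TO_MIN_BASE_IMAGE.get(cdk_version)
    if required is None:
        return  # no known system-dependency requirement for this CDK version

    def as_tuple(version: str) -> tuple:
        return tuple(int(part) for part in version.split("."))

    if as_tuple(base_image_version) < as_tuple(required):
        raise ValueError(
            f"CDK {cdk_version} requires python-connector-base >= {required}, "
            f"but the connector declares {base_image_version}; update the baseImage in metadata.yaml."
        )
```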

Contributor Author (@flash1293, Oct 27, 2023):

I think @erohmensing's suggestion is interesting. It means we would need to install the CDK, at the version specified in the connector's setup.py, in the python environment that is building the dagger pipeline. If that's possible it might work, although it feels a little crazy.

Contributor:

> Since the build_customization.py is just a py file that can import things, would it be possible for all of the helper methods to live in the file based CDK, imported from build_customization.py?

The build_customization.py is imported at runtime by the build process here. @erohmensing it would mean that airbyte-ci would depend on the CDK if we imported helpers from there. As these helpers can change with the CDK version, we could end up building a connector image that depends on an old version of the CDK but uses helpers from the latest version.

@@ -268,6 +268,19 @@
"type": "boolean"
}
}
},
{
"title": "Document File Type Format (Experimental)",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"default": "unstructured",
"const": "unstructured",
"type": "string"
}
},
"description": "Extract text from document formats (.pdf, .docx, .md) and emit as one record per file."
}
]
},
@@ -7,7 +7,7 @@ data:
connectorSubtype: file
connectorType: source
definitionId: fdaaba68-4875-4ed9-8fcd-4ae1e0a25093
dockerImageTag: 0.2.1
dockerImageTag: 0.2.2
dockerRepository: airbyte/source-azure-blob-storage
documentationUrl: https://docs.airbyte.com/integrations/sources/azure-blob-storage
githubIssueLabel: source-azure-blob-storage
@@ -5,7 +5,20 @@

from setuptools import find_packages, setup

MAIN_REQUIREMENTS = ["airbyte-cdk>=0.51.17", "smart_open[azure]", "pytz", "fastavro==1.4.11", "pyarrow"]
MAIN_REQUIREMENTS = [
    "airbyte-cdk>=0.52.5",
    "smart_open[azure]",
    "pytz",
    "fastavro==1.4.11",
    "pyarrow",
    "unstructured==0.10.19",
Contributor Author (@flash1293, Oct 27, 2023):

This is also included in airbyte-cdk[file-based], but then there would be a version mismatch with fastavro 1.4.11, which is listed explicitly here (it's fastavro~=1.8.0 in the file-based extra). Is this a leftover, or is there a strong reason not to rely on the CDK's "standard" dependencies for file-based sources?

Same question applies to S3 (which makes me think that it wasn't a conscious choice).

Do you know where this came from @clnoll ?

Contributor:

I read this comment after I submitted my first review.

Contributor:

Based on your PR title, I would assume that these two connectors would depend on the file-based CDK and that the unstructured lib would be bundled in it. It'd be great if connectors and the CDK could agree on the fastavro version, indeed.

Contributor:

I'm not aware of any reason not to use the file-based CDK's dependencies.

Contributor Author:

Switched it to use the extra

"pdf2image==1.16.3",
"pdfminer.six==20221105",
"unstructured[docx]==0.10.19",
"unstructured.pytesseract>=0.3.12",
"pytesseract==0.3.10",
"markdown",
]

TEST_REQUIREMENTS = ["requests-mock~=1.9.3", "pytest-mock~=3.6.1", "pytest~=6.2"]
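
Following the author's note above about switching to the extra, a hypothetical sketch of what the consolidated requirements could look like (the exact pins that landed are not shown in this hunk):

```python
# Hypothetical: let the CDK's file-based extra own the document-parsing
# dependencies (unstructured, fastavro, pyarrow, ...) instead of pinning
# them individually in the connector.
MAIN_REQUIREMENTS = [
    "airbyte-cdk[file-based]>=0.52.5",
    "smart_open[azure]",
    "pytz",
]
```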


This file was deleted.

@@ -21,7 +21,7 @@
"order": 10,
"type": "array",
"items": {
"title": "S3FileBasedStreamConfig",
"title": "BasedStreamConfig",
Contributor:

Suggested change
"title": "BasedStreamConfig",
"title": "FileBasedStreamConfig",

"type": "object",
"properties": {
"name": {
@@ -270,7 +270,7 @@
        }
      },
      {
        "title": "Markdown/PDF/Docx Format (Experimental)",
        "title": "Document File Type Format (Experimental)",
        "type": "object",
        "properties": {
          "filetype": {
@@ -280,7 +280,7 @@
"type": "string"
}
},
"description": "Extract text from document formats and emit as one record per file."
"description": "Extract text from document formats (.pdf, .docx, .md) and emit as one record per file."
}
]
},
7 changes: 1 addition & 6 deletions airbyte-integrations/connectors/source-s3/main.py
@@ -10,18 +10,13 @@

from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch
from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteMessage, AirbyteTraceMessage, TraceType, Type
from airbyte_cdk.sources.file_based.file_types import default_parsers
from source_s3.v4 import Config, Cursor, SourceS3, SourceS3StreamReader
from source_s3.v4.config import UnstructuredFormat
from source_s3.v4.unstructured_parser import UnstructuredParser

parsers = {**default_parsers, UnstructuredFormat: UnstructuredParser()}


def get_source(args: List[str]):
    catalog_path = AirbyteEntrypoint.extract_catalog(args)
    try:
        return SourceS3(SourceS3StreamReader(), Config, catalog_path, cursor_cls=Cursor, parsers=parsers)
        return SourceS3(SourceS3StreamReader(), Config, catalog_path, cursor_cls=Cursor)
    except Exception:
        print(
            AirbyteMessage(
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-s3/metadata.yaml
@@ -10,7 +10,7 @@ data:
connectorSubtype: file
connectorType: source
definitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2
dockerImageTag: 4.1.3
dockerImageTag: 4.1.4
dockerRepository: airbyte/source-s3
documentationUrl: https://docs.airbyte.com/integrations/sources/s3
githubIssueLabel: source-s3
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-s3/setup.py
@@ -6,7 +6,7 @@
from setuptools import find_packages, setup

MAIN_REQUIREMENTS = [
    "airbyte-cdk>=0.52.0",
    "airbyte-cdk>=0.52.5",
    "pyarrow==12.0.1",
    "smart-open[s3]==5.1.0",
    "wcmatch==8.4",
33 changes: 2 additions & 31 deletions airbyte-integrations/connectors/source-s3/source_s3/v4/config.py
@@ -2,33 +2,10 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from typing import List, Optional, Union
from typing import Optional

from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
from pydantic import AnyUrl, BaseModel, Field, ValidationError, root_validator


class UnstructuredFormat(BaseModel):
    class Config:
        title = "Markdown/PDF/Docx Format (Experimental)"
        schema_extra = {"description": "Extract text from document formats and emit as one record per file."}

    filetype: str = Field(
        "unstructured",
        const=True,
    )


class S3FileBasedStreamConfig(FileBasedStreamConfig):
    format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat] = Field(
        title="Format",
        description="The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.",
    )
from pydantic import AnyUrl, Field, ValidationError, root_validator


class Config(AbstractFileBasedSpec):
@@ -65,12 +42,6 @@ def documentation_url(cls) -> AnyUrl:
"", title="Endpoint", description="Endpoint to an S3 compatible service. Leave empty to use AWS.", order=4
)

    streams: List[S3FileBasedStreamConfig] = Field(
        title="The list of streams to sync",
        description='Each instance of this configuration defines a <a href="https://docs.airbyte.com/cloud/core-concepts#stream">stream</a>. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.',
        order=10,
    )

    @root_validator
    def validate_optional_args(cls, values):
        aws_access_key_id = values.get("aws_access_key_id")