AirbyteLib: Fix column count mismatch bug (#34783)
aaronsteers authored and xiaohansong committed Feb 13, 2024
1 parent 430258e commit b6804df
Showing 11 changed files with 229 additions and 89 deletions.
4 changes: 2 additions & 2 deletions airbyte-lib/airbyte_lib/_file_writers/base.py
@@ -51,7 +51,7 @@ def _write_batch(
self,
stream_name: str,
batch_id: str,
record_batch: pa.Table | pa.RecordBatch,
record_batch: pa.Table,
) -> FileWriterBatchHandle:
"""Process a record batch.
@@ -64,7 +64,7 @@ def write_batch(
self,
stream_name: str,
batch_id: str,
record_batch: pa.Table | pa.RecordBatch,
record_batch: pa.Table,
) -> FileWriterBatchHandle:
"""Write a batch of records to the cache.
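The two signature changes above narrow record_batch from pa.Table | pa.RecordBatch to plain pa.Table. A caller still holding a pa.RecordBatch can convert it losslessly before calling. A minimal sketch (the writer instance and argument values are hypothetical, not from this commit):

import pyarrow as pa

batch = pa.record_batch({"id": [1, 2, 3]})   # a pa.RecordBatch
table = pa.Table.from_batches([batch])       # lossless RecordBatch -> Table
# writer.write_batch("users", "batch-0001", table)  # hypothetical call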
32 changes: 28 additions & 4 deletions airbyte-lib/airbyte_lib/_file_writers/parquet.py
@@ -11,7 +11,12 @@
from overrides import overrides
from pyarrow import parquet

from .base import FileWriterBase, FileWriterBatchHandle, FileWriterConfigBase
from airbyte_lib import exceptions as exc
from airbyte_lib._file_writers.base import (
FileWriterBase,
FileWriterBatchHandle,
FileWriterConfigBase,
)


class ParquetWriterConfig(FileWriterConfigBase):
@@ -37,12 +42,24 @@ def get_new_cache_file_path(
target_dir.mkdir(parents=True, exist_ok=True)
return target_dir / f"{stream_name}_{batch_id}.parquet"

def _get_missing_columns(
self,
stream_name: str,
record_batch: pa.Table,
) -> list[str]:
"""Return a list of columns that are missing in the batch."""
if not self._catalog_manager:
raise exc.AirbyteLibInternalError(message="Catalog manager should exist but does not.")
stream = self._catalog_manager.get_stream_config(stream_name)
stream_property_names = stream.stream.json_schema["properties"].keys()
return [col for col in stream_property_names if col not in record_batch.schema.names]

@overrides
def _write_batch(
self,
stream_name: str,
batch_id: str,
record_batch: pa.Table | pa.RecordBatch,
record_batch: pa.Table,
) -> FileWriterBatchHandle:
"""Process a record batch.
@@ -51,8 +68,15 @@ def _write_batch(
_ = batch_id # unused
output_file_path = self.get_new_cache_file_path(stream_name)

with parquet.ParquetWriter(output_file_path, record_batch.schema) as writer:
writer.write_table(cast(pa.Table, record_batch))
missing_columns = self._get_missing_columns(stream_name, record_batch)
if missing_columns:
# We need to append columns with the missing column name(s) and a null type
null_array = cast(pa.Array, pa.array([None] * len(record_batch), type=pa.null()))
for col in missing_columns:
record_batch = record_batch.append_column(col, null_array)

with parquet.ParquetWriter(output_file_path, schema=record_batch.schema) as writer:
writer.write_table(record_batch)

batch_handle = FileWriterBatchHandle()
batch_handle.files.append(output_file_path)
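The null-padding block added above is the core of the column-count fix: any column present in the stream's JSON schema but absent from the batch is appended as an all-null column, so the Parquet file's schema matches the catalog. A standalone sketch of the same logic, with a hypothetical expected-column list standing in for the catalog lookup:

import pyarrow as pa

record_batch = pa.table({"id": [1, 2], "name": ["a", "b"]})
expected_columns = ["id", "name", "email"]  # hypothetical stream properties

missing = [c for c in expected_columns if c not in record_batch.schema.names]
null_array = pa.array([None] * len(record_batch), type=pa.null())
for col in missing:
    record_batch = record_batch.append_column(col, null_array)

assert record_batch.schema.names == ["id", "name", "email"]

Appending leaves the existing columns untouched; only the missing names gain null placeholders, which is what keeps the writer's column count in step with the catalog.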
51 changes: 39 additions & 12 deletions airbyte-lib/airbyte_lib/_processors.py
@@ -27,16 +27,19 @@
AirbyteStateType,
AirbyteStreamState,
ConfiguredAirbyteCatalog,
ConfiguredAirbyteStream,
Type,
)

from airbyte_lib import exceptions as exc
from airbyte_lib._util import protocol_util # Internal utility functions
from airbyte_lib.progress import progress


if TYPE_CHECKING:
from collections.abc import Generator, Iterable, Iterator

from airbyte_lib.caches._catalog_manager import CatalogManager
from airbyte_lib.config import CacheConfigBase


@@ -60,6 +63,8 @@ class RecordProcessor(abc.ABC):
def __init__(
self,
config: CacheConfigBase | dict | None,
*,
catalog_manager: CatalogManager | None = None,
) -> None:
if isinstance(config, dict):
config = self.config_class(**config)
@@ -72,8 +77,6 @@ def __init__(
)
raise TypeError(err_msg)

self.source_catalog: ConfiguredAirbyteCatalog | None = None

self._pending_batches: dict[str, dict[str, Any]] = defaultdict(lambda: {}, {})
self._finalized_batches: dict[str, dict[str, Any]] = defaultdict(lambda: {}, {})

@@ -83,22 +86,25 @@ def __init__(
list[AirbyteStateMessage],
] = defaultdict(list, {})

self._catalog_manager: CatalogManager | None = catalog_manager
self._setup()

def register_source(
self,
source_name: str,
incoming_source_catalog: ConfiguredAirbyteCatalog,
stream_names: set[str],
) -> None:
"""Register the source name and catalog.
For now, only one source at a time is supported.
If this method is called multiple times, the last call will overwrite the previous one.
TODO: Expand this to handle multiple sources.
"""
_ = source_name
self.source_catalog = incoming_source_catalog
"""Register the source name and catalog."""
if not self._catalog_manager:
raise exc.AirbyteLibInternalError(
message="Catalog manager should exist but does not.",
)
self._catalog_manager.register_source(
source_name,
incoming_source_catalog=incoming_source_catalog,
incoming_stream_names=stream_names,
)

@property
def _streams_with_data(self) -> set[str]:
@@ -215,7 +221,7 @@ def _write_batch(
self,
stream_name: str,
batch_id: str,
record_batch: pa.Table | pa.RecordBatch,
record_batch: pa.Table,
) -> BatchHandle:
"""Process a single batch.
@@ -319,3 +325,24 @@ def _teardown(self) -> None:
def __del__(self) -> None:
"""Teardown temporary resources when instance is unloaded from memory."""
self._teardown()

@final
def _get_stream_config(
self,
stream_name: str,
) -> ConfiguredAirbyteStream:
"""Return the column definitions for the given stream."""
if not self._catalog_manager:
raise exc.AirbyteLibInternalError(
message="Catalog manager should exist but does not.",
)

return self._catalog_manager.get_stream_config(stream_name)

@final
def _get_stream_json_schema(
self,
stream_name: str,
) -> dict[str, Any]:
"""Return the column definitions for the given stream."""
return self._get_stream_config(stream_name).stream.json_schema
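The new _get_stream_config() and _get_stream_json_schema() helpers give processors one guarded path to the cached catalog. A sketch of how a caller might derive expected column names from such a schema (the schema below is a made-up example, not taken from the repository):

json_schema = {
    "type": "object",
    "properties": {
        "id": {"type": "integer"},
        "name": {"type": "string"},
        "email": {"type": ["null", "string"]},
    },
}
column_names = list(json_schema["properties"].keys())
# ['id', 'name', 'email'] -- e.g. compared against record_batch.schema.names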
108 changes: 83 additions & 25 deletions airbyte-lib/airbyte_lib/caches/_catalog_manager.py
@@ -46,8 +46,23 @@ def __init__(
) -> None:
self._engine: Engine = engine
self._table_name_resolver = table_name_resolver
self.source_catalog: ConfiguredAirbyteCatalog | None = None
self._source_catalog: ConfiguredAirbyteCatalog | None = None
self._load_catalog_from_internal_table()
assert self._source_catalog is not None

@property
def source_catalog(self) -> ConfiguredAirbyteCatalog:
"""Return the source catalog.
Raises:
AirbyteLibInternalError: If the source catalog is not set.
"""
if not self._source_catalog:
raise exc.AirbyteLibInternalError(
message="Source catalog should be initialized but is not.",
)

return self._source_catalog

def _ensure_internal_tables(self) -> None:
engine = self._engine
@@ -57,34 +72,70 @@ def register_source(
self,
source_name: str,
incoming_source_catalog: ConfiguredAirbyteCatalog,
incoming_stream_names: set[str],
) -> None:
if not self.source_catalog:
self.source_catalog = incoming_source_catalog
else:
# merge in the new streams, keyed by name
new_streams = {stream.stream.name: stream for stream in incoming_source_catalog.streams}
for stream in self.source_catalog.streams:
if stream.stream.name not in new_streams:
new_streams[stream.stream.name] = stream
self.source_catalog = ConfiguredAirbyteCatalog(
streams=list(new_streams.values()),
"""Register a source and its streams in the cache."""
self._update_catalog(
incoming_source_catalog=incoming_source_catalog,
incoming_stream_names=incoming_stream_names,
)
self._save_catalog_to_internal_table(
source_name=source_name,
incoming_source_catalog=incoming_source_catalog,
incoming_stream_names=incoming_stream_names,
)

def _update_catalog(
self,
incoming_source_catalog: ConfiguredAirbyteCatalog,
incoming_stream_names: set[str],
) -> None:
if not self._source_catalog:
self._source_catalog = ConfiguredAirbyteCatalog(
streams=[
stream
for stream in incoming_source_catalog.streams
if stream.stream.name in incoming_stream_names
],
)
assert len(self._source_catalog.streams) == len(incoming_stream_names)
return

# Keep existing streams untouched if not incoming
unchanged_streams: list[ConfiguredAirbyteStream] = [
stream
for stream in self._source_catalog.streams
if stream.stream.name not in incoming_stream_names
]
new_streams: list[ConfiguredAirbyteStream] = [
stream
for stream in incoming_source_catalog.streams
if stream.stream.name in incoming_stream_names
]
self._source_catalog = ConfiguredAirbyteCatalog(streams=unchanged_streams + new_streams)
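_update_catalog() merges per stream name: streams outside the incoming set are kept as-is, and streams inside it are replaced wholesale. The same semantics with plain dicts as illustrative stand-ins for ConfiguredAirbyteStream objects:

existing = {"users": {"version": 1}, "orders": {"version": 1}}
incoming = {"users": {"version": 2}, "events": {"version": 2}}
incoming_stream_names = {"users", "events"}

unchanged = {n: s for n, s in existing.items() if n not in incoming_stream_names}
replaced = {n: s for n, s in incoming.items() if n in incoming_stream_names}
merged = {**unchanged, **replaced}
# {'orders': {'version': 1}, 'users': {'version': 2}, 'events': {'version': 2}}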

def _save_catalog_to_internal_table(
self,
source_name: str,
incoming_source_catalog: ConfiguredAirbyteCatalog,
incoming_stream_names: set[str],
) -> None:
self._ensure_internal_tables()
engine = self._engine
with Session(engine) as session:
# delete all existing streams from the db
session.query(CachedStream).filter(
CachedStream.table_name.in_(
[
self._table_name_resolver(stream.stream.name)
for stream in self.source_catalog.streams
]
)
).delete()
# Delete and replace existing stream entries from the catalog cache
table_name_entries_to_delete = [
self._table_name_resolver(incoming_stream_name)
for incoming_stream_name in incoming_stream_names
]
result = (
session.query(CachedStream)
.filter(CachedStream.table_name.in_(table_name_entries_to_delete))
.delete()
)
_ = result
session.commit()
# add the new ones
streams = [
insert_streams = [
CachedStream(
source_name=source_name,
stream_name=stream.stream.name,
@@ -93,8 +144,7 @@ def register_source(
)
for stream in incoming_source_catalog.streams
]
session.add_all(streams)

session.add_all(insert_streams)
session.commit()
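_save_catalog_to_internal_table() persists the catalog with a delete-then-insert pattern: rows for the incoming table names are removed, then fresh entries are added, with a commit between the two phases. A self-contained sketch of that pattern against an in-memory SQLite database; the Stream model below is a simplified stand-in for CachedStream:

from sqlalchemy import Column, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Stream(Base):  # simplified stand-in for CachedStream
    __tablename__ = "streams"
    table_name = Column(String, primary_key=True)
    source_name = Column(String)

engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Stream(table_name="users", source_name="old-source"))
    session.commit()

    incoming = ["users", "events"]
    # Phase 1: drop any cached rows for the incoming table names.
    session.query(Stream).filter(Stream.table_name.in_(incoming)).delete()
    session.commit()
    # Phase 2: insert the fresh entries.
    session.add_all([Stream(table_name=n, source_name="new-source") for n in incoming])
    session.commit()

Committing between the delete and the insert mirrors the committed code and avoids primary-key collisions when a table name is re-registered.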

def get_stream_config(
@@ -113,6 +163,11 @@ def get_stream_config(
if not matching_streams:
raise exc.AirbyteStreamNotFoundError(
stream_name=stream_name,
context={
"available_streams": [
stream.stream.name for stream in self.source_catalog.streams
],
},
)

if len(matching_streams) > 1:
@@ -133,10 +188,13 @@ def _load_catalog_from_internal_table(self) -> None:
streams: list[CachedStream] = session.query(CachedStream).all()
if not streams:
# no streams means the cache is pristine
if not self._source_catalog:
self._source_catalog = ConfiguredAirbyteCatalog(streams=[])

return

# load the catalog
self.source_catalog = ConfiguredAirbyteCatalog(
self._source_catalog = ConfiguredAirbyteCatalog(
streams=[
ConfiguredAirbyteStream(
stream=AirbyteStream(