airbyte.datasets
from __future__ import annotations

from airbyte.datasets._base import DatasetBase
from airbyte.datasets._lazy import LazyDataset
from airbyte.datasets._map import DatasetMap
from airbyte.datasets._sql import CachedDataset, SQLDataset


__all__ = [
    "CachedDataset",
    "DatasetBase",
    "DatasetMap",
    "LazyDataset",
    "SQLDataset",
]
class CachedDataset(SQLDataset):
    """A dataset backed by a SQL table cache.

    Because this dataset includes all records from the underlying table, we also expose the
    underlying table as a SQLAlchemy Table object.
    """

    def __init__(
        self,
        cache: CacheBase,
        stream_name: str,
    ) -> None:
        """We construct the query statement by selecting all columns from the table.

        This prevents the need to scan the table schema to construct the query statement.
        """
        table_name = cache.processor.get_sql_table_name(stream_name)
        schema_name = cache.schema_name
        query = select("*").select_from(text(f"{schema_name}.{table_name}"))
        super().__init__(
            cache=cache,
            stream_name=stream_name,
            query_statement=query,
        )

    @overrides
    def to_pandas(self) -> DataFrame:
        """Return the underlying dataset data as a pandas DataFrame."""
        return self._cache.processor.get_pandas_dataframe(self._stream_name)

    def to_sql_table(self) -> Table:
        """Return the underlying SQL table as a SQLAlchemy Table object."""
        return self._cache.processor.get_sql_table(self.stream_name)

    def __eq__(self, value: object) -> bool:
        """Return True if the value is a CachedDataset with the same cache and stream name.

        In the case of CachedDataset objects, we can simply compare the cache and stream name.

        Note that this equality check is only supported on CachedDataset objects and not for
        the base SQLDataset implementation. This is because of the complexity and computational
        cost of comparing two arbitrary SQL queries that could be bound to different variables,
        as well as the chance that two queries can be syntactically equivalent without being
        text-wise equivalent.
        """
        if not isinstance(value, SQLDataset):
            return False

        if self._cache is not value._cache:
            return False

        if self._stream_name != value._stream_name:
            return False

        return True
A dataset backed by a SQL table cache.
Because this dataset includes all records from the underlying table, we also expose the underlying table as a SQLAlchemy Table object.
def __init__(
    self,
    cache: CacheBase,
    stream_name: str,
) -> None:
    """We construct the query statement by selecting all columns from the table.

    This prevents the need to scan the table schema to construct the query statement.
    """
    table_name = cache.processor.get_sql_table_name(stream_name)
    schema_name = cache.schema_name
    query = select("*").select_from(text(f"{schema_name}.{table_name}"))
    super().__init__(
        cache=cache,
        stream_name=stream_name,
        query_statement=query,
    )
We construct the query statement by selecting all columns from the table.
This prevents the need to scan the table schema to construct the query statement.
@overrides
def to_pandas(self) -> DataFrame:
    """Return the underlying dataset data as a pandas DataFrame."""
    return self._cache.processor.get_pandas_dataframe(self._stream_name)
Return the underlying dataset data as a pandas DataFrame.
def to_sql_table(self) -> Table:
    """Return the underlying SQL table as a SQLAlchemy Table object."""
    return self._cache.processor.get_sql_table(self.stream_name)
Return the underlying SQL table as a SQLAlchemy Table object.
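For illustration, here is a hedged sketch of how a CachedDataset is typically obtained and used. The source-faker connector, its config, and the users stream are assumptions for the example, not part of this module:

import airbyte as ab

# Read a source into the default cache; connector, config, and stream
# names here are illustrative assumptions.
source = ab.get_source("source-faker", config={"count": 100})
source.select_all_streams()
result = source.read()

dataset = result["users"]       # a CachedDataset
df = dataset.to_pandas()        # the full table as a pandas DataFrame
table = dataset.to_sql_table()  # a SQLAlchemy Table for custom queries
print(len(df), table.name)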
class DatasetBase(ABC):
    """Base implementation for all datasets."""

    def __init__(self, stream_metadata: ConfiguredAirbyteStream) -> None:
        self._stream_metadata = stream_metadata

    @abstractmethod
    def __iter__(self) -> Iterator[dict[str, Any]]:
        """Return the iterator of records."""
        raise NotImplementedError

    def to_pandas(self) -> DataFrame:
        """Return a pandas DataFrame representation of the dataset.

        The base implementation simply passes the record iterator to the pandas
        DataFrame constructor.
        """
        # Technically, we return an iterator of Mapping objects. However, pandas
        # expects an iterator of dict objects. This cast is safe because we know
        # duck typing is correct for this use case.
        return DataFrame(cast(Iterator[dict[str, Any]], self))

    def to_documents(
        self,
        title_property: str | None = None,
        content_properties: list[str] | None = None,
        metadata_properties: list[str] | None = None,
        *,
        render_metadata: bool = False,
    ) -> Iterable[Document]:
        """Return the iterator of documents.

        If metadata_properties is not set, all properties that are not content will be added to
        the metadata.

        If render_metadata is True, metadata will be rendered in the document, as well as the
        main content. Otherwise, metadata will be attached to the document but not rendered.
        """
        renderer = DocumentRenderer(
            title_property=title_property,
            content_properties=content_properties,
            metadata_properties=metadata_properties,
            render_metadata=render_metadata,
        )
        yield from renderer.render_documents(self)
Base implementation for all datasets.
def to_pandas(self) -> DataFrame:
    """Return a pandas DataFrame representation of the dataset.

    The base implementation simply passes the record iterator to the pandas
    DataFrame constructor.
    """
    # Technically, we return an iterator of Mapping objects. However, pandas
    # expects an iterator of dict objects. This cast is safe because we know
    # duck typing is correct for this use case.
    return DataFrame(cast(Iterator[dict[str, Any]], self))
Return a pandas DataFrame representation of the dataset.
The base implementation simply passes the record iterator to the pandas DataFrame constructor.
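As a minimal sketch of this contract, a hypothetical subclass only needs to implement __iter__; the inherited to_pandas() then works unchanged. The InMemoryDataset name below is an illustrative assumption, not part of the library:

from collections.abc import Iterator
from typing import Any

from airbyte.datasets import DatasetBase
from airbyte_protocol.models import ConfiguredAirbyteStream


class InMemoryDataset(DatasetBase):
    """Hypothetical dataset over a list of records, for illustration only."""

    def __init__(
        self,
        records: list[dict[str, Any]],
        stream_metadata: ConfiguredAirbyteStream,
    ) -> None:
        super().__init__(stream_metadata=stream_metadata)
        self._records = records

    def __iter__(self) -> Iterator[dict[str, Any]]:
        return iter(self._records)


# The inherited to_pandas() feeds the iterator straight to DataFrame():
# InMemoryDataset([{"id": 1}, {"id": 2}], stream_metadata=...).to_pandas()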
def to_documents(
    self,
    title_property: str | None = None,
    content_properties: list[str] | None = None,
    metadata_properties: list[str] | None = None,
    *,
    render_metadata: bool = False,
) -> Iterable[Document]:
    """Return the iterator of documents.

    If metadata_properties is not set, all properties that are not content will be added to
    the metadata.

    If render_metadata is True, metadata will be rendered in the document, as well as the
    main content. Otherwise, metadata will be attached to the document but not rendered.
    """
    renderer = DocumentRenderer(
        title_property=title_property,
        content_properties=content_properties,
        metadata_properties=metadata_properties,
        render_metadata=render_metadata,
    )
    yield from renderer.render_documents(self)
Return the iterator of documents.
If metadata_properties is not set, all properties that are not content will be added to the metadata.
If render_metadata is True, metadata will be rendered in the document, as well as the main content. Otherwise, metadata will be attached to the document but not rendered.
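A hedged example of rendering records as documents, e.g. for LLM or vector-store ingestion. The "title" and "body" property names are assumptions about the stream's schema:

# `dataset` is any DatasetBase instance; "title" and "body" are assumed
# to exist in the stream's schema for this illustration.
docs = dataset.to_documents(
    title_property="title",
    content_properties=["body"],
    render_metadata=True,  # remaining properties become rendered metadata
)
for doc in docs:
    print(doc)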
class DatasetMap(Mapping):
    """A generic interface for a set of streams or datasets."""

    def __init__(self) -> None:
        self._datasets: dict[str, DatasetBase] = {}

    def __getitem__(self, key: str) -> DatasetBase:
        return self._datasets[key]

    def __iter__(self) -> Iterator[str]:
        return iter(self._datasets)

    def __len__(self) -> int:
        return len(self._datasets)
A generic interface for a set of streams or datasets.
Inherited Members
- collections.abc.Mapping
  - get
  - keys
  - items
  - values
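Because DatasetMap implements the Mapping protocol, it supports the usual read-only dict idioms. A short sketch, where datasets is a DatasetMap and the stream names are assumptions:

for stream_name in datasets:      # __iter__ yields stream names
    print(stream_name)

print(len(datasets))              # __len__ counts datasets
users = datasets["users"]         # __getitem__; assumed stream name
orders = datasets.get("orders")   # inherited from Mapping; None if absent
print(list(datasets.keys()), list(datasets.values()))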
class LazyDataset(DatasetBase):
    """A dataset that is loaded incrementally from a source or a SQL query."""

    def __init__(
        self,
        iterator: Iterator[dict[str, Any]],
        stream_metadata: ConfiguredAirbyteStream,
    ) -> None:
        self._iterator: Iterator[dict[str, Any]] = iterator
        super().__init__(
            stream_metadata=stream_metadata,
        )

    @overrides
    def __iter__(self) -> Iterator[dict[str, Any]]:
        return self._iterator

    def __next__(self) -> Mapping[str, Any]:
        return next(self._iterator)
A dataset that is loaded incrementally from a source or a SQL query.
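A hedged sketch of working with a LazyDataset. In PyAirbyte, Source.get_records() returns one; because a single iterator backs the dataset, records can only be consumed once. Connector and stream names are assumptions:

import airbyte as ab

source = ab.get_source("source-faker", config={"count": 100})

records = source.get_records("users")  # a LazyDataset; name is an assumption
first = next(iter(records))            # pull one record
for record in records:                 # resumes the same shared iterator
    print(record)

# A second pass would yield nothing: the underlying iterator is exhausted.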
class SQLDataset(DatasetBase):
    """A dataset that is loaded incrementally from a SQL query.

    The CachedDataset class is a subclass of this class; it simply passes a SELECT over the
    full table as the query statement.
    """

    def __init__(
        self,
        cache: CacheBase,
        stream_name: str,
        query_statement: Selectable,
        stream_configuration: ConfiguredAirbyteStream | None | Literal[False] = None,
    ) -> None:
        """Initialize the dataset with a cache, stream name, and query statement.

        This class is not intended to be created directly. Instead, you can retrieve
        datasets from caches or Cloud connection objects, etc.

        The query statement should be a SQLAlchemy Selectable object that can be executed to
        retrieve records from the dataset.

        If stream_configuration is not provided, we attempt to retrieve the stream configuration
        from the cache processor. This is useful when constructing a dataset from a CachedDataset
        object, which already has the stream configuration.

        If stream_configuration is set to False, we skip the stream configuration retrieval.
        """
        self._length: int | None = None
        self._cache: CacheBase = cache
        self._stream_name: str = stream_name
        self._query_statement: Selectable = query_statement
        if stream_configuration is None:
            try:
                stream_configuration = cache.processor._get_stream_config(  # noqa: SLF001  # Member is private until we have a public API for it.
                    stream_name=stream_name
                )
            except Exception as ex:
                # Warn, but don't fail, if the stream configuration can't be retrieved.
                warnings.warn(
                    f"Failed to get stream configuration for {stream_name}: {ex}",
                    stacklevel=2,
                )

        # Coalesce False to None
        stream_configuration = stream_configuration or None

        super().__init__(stream_metadata=stream_configuration)

    @property
    def stream_name(self) -> str:
        return self._stream_name

    def __iter__(self) -> Iterator[dict[str, Any]]:
        with self._cache.processor.get_sql_connection() as conn:
            for row in conn.execute(self._query_statement):
                # Access to private member required because SQLAlchemy doesn't expose a public API.
                # https://pydoc.dev/sqlalchemy/latest/sqlalchemy.engine.row.RowMapping.html
                yield cast(dict[str, Any], row._mapping)  # noqa: SLF001

    def __len__(self) -> int:
        """Return the number of records in the dataset.

        This method caches the length of the dataset after the first call.
        """
        if self._length is None:
            count_query = select(func.count()).select_from(self._query_statement.alias())
            with self._cache.processor.get_sql_connection() as conn:
                self._length = conn.execute(count_query).scalar()

        return self._length

    def to_pandas(self) -> DataFrame:
        return self._cache.processor.get_pandas_dataframe(self._stream_name)

    def with_filter(self, *filter_expressions: ClauseElement | str) -> SQLDataset:
        """Filter the dataset by a set of filter expressions.

        Filters can be specified as either a string or a SQLAlchemy expression.

        Filters are lazily applied to the dataset, so they can be chained together. For example:

            dataset.with_filter("id > 5").with_filter("id < 10")

        is equivalent to:

            dataset.with_filter("id > 5", "id < 10")
        """
        # Convert all strings to TextClause objects.
        filters: list[ClauseElement] = [
            text(expression) if isinstance(expression, str) else expression
            for expression in filter_expressions
        ]
        filtered_select = self._query_statement.where(and_(*filters))
        return SQLDataset(
            cache=self._cache,
            stream_name=self._stream_name,
            query_statement=filtered_select,
        )
A dataset that is loaded incrementally from a SQL query.
The CachedDataset class is a subclass of this class; it simply passes a SELECT over the full table as the query statement.
def __init__(
    self,
    cache: CacheBase,
    stream_name: str,
    query_statement: Selectable,
    stream_configuration: ConfiguredAirbyteStream | None | Literal[False] = None,
) -> None:
    """Initialize the dataset with a cache, stream name, and query statement.

    This class is not intended to be created directly. Instead, you can retrieve
    datasets from caches or Cloud connection objects, etc.

    The query statement should be a SQLAlchemy Selectable object that can be executed to
    retrieve records from the dataset.

    If stream_configuration is not provided, we attempt to retrieve the stream configuration
    from the cache processor. This is useful when constructing a dataset from a CachedDataset
    object, which already has the stream configuration.

    If stream_configuration is set to False, we skip the stream configuration retrieval.
    """
    self._length: int | None = None
    self._cache: CacheBase = cache
    self._stream_name: str = stream_name
    self._query_statement: Selectable = query_statement
    if stream_configuration is None:
        try:
            stream_configuration = cache.processor._get_stream_config(  # noqa: SLF001  # Member is private until we have a public API for it.
                stream_name=stream_name
            )
        except Exception as ex:
            # Warn, but don't fail, if the stream configuration can't be retrieved.
            warnings.warn(
                f"Failed to get stream configuration for {stream_name}: {ex}",
                stacklevel=2,
            )

    # Coalesce False to None
    stream_configuration = stream_configuration or None

    super().__init__(stream_metadata=stream_configuration)
Initialize the dataset with a cache, stream name, and query statement.
This class is not intended to be created directly. Instead, you can retrieve datasets from caches or Cloud connection objects, etc.
The query statement should be a SQLAlchemy Selectable object that can be executed to retrieve records from the dataset.
If stream_configuration is not provided, we attempt to retrieve the stream configuration from the cache processor. This is useful when constructing a dataset from a CachedDataset object, which already has the stream configuration.
If stream_configuration is set to False, we skip the stream configuration retrieval.
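For illustration, a hedged sketch of constructing a SQLDataset over a custom query. As the docstring notes, you would normally obtain datasets from a cache rather than building one directly; the schema, table, and column names here are assumptions:

from sqlalchemy import select, text

from airbyte.datasets import SQLDataset

# `cache` is an existing CacheBase, e.g. from ab.get_default_cache().
query = (
    select("*")
    .select_from(text("airbyte_raw.users"))  # assumed schema-qualified table
    .limit(100)
)
dataset = SQLDataset(
    cache=cache,
    stream_name="users",
    query_statement=query,
)
for record in dataset:
    print(record["id"])  # assumes an "id" column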
def to_pandas(self) -> DataFrame:
    return self._cache.processor.get_pandas_dataframe(self._stream_name)
Return a pandas DataFrame representation of the dataset.
This implementation delegates to the cache processor's get_pandas_dataframe() for the stream, rather than passing the record iterator to the pandas DataFrame constructor as the base implementation does.
def with_filter(self, *filter_expressions: ClauseElement | str) -> SQLDataset:
    """Filter the dataset by a set of filter expressions.

    Filters can be specified as either a string or a SQLAlchemy expression.

    Filters are lazily applied to the dataset, so they can be chained together. For example:

        dataset.with_filter("id > 5").with_filter("id < 10")

    is equivalent to:

        dataset.with_filter("id > 5", "id < 10")
    """
    # Convert all strings to TextClause objects.
    filters: list[ClauseElement] = [
        text(expression) if isinstance(expression, str) else expression
        for expression in filter_expressions
    ]
    filtered_select = self._query_statement.where(and_(*filters))
    return SQLDataset(
        cache=self._cache,
        stream_name=self._stream_name,
        query_statement=filtered_select,
    )
Filter the dataset by a set of filter expressions.
Filters can be specified as either a string or a SQLAlchemy expression.
Filters are lazily applied to the dataset, so they can be chained together. For example:
dataset.with_filter("id > 5").with_filter("id < 10")
is equivalent to:
dataset.with_filter("id > 5", "id < 10")
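A short sketch combining string filters with SQLAlchemy expressions; the column names are assumptions:

from sqlalchemy import text

adults = dataset.with_filter("age >= 18")                     # plain string
active_adults = adults.with_filter(text("is_active = true"))  # ClauseElement

# Filters are lazy: no SQL runs until the dataset is iterated or counted.
print(len(active_adults))  # issues a COUNT over the filtered query
for record in active_adults:
    print(record)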