airbyte.caches

Base module for all caches.

# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
"""Base module for all caches."""
from __future__ import annotations

from airbyte.caches import bigquery, duckdb, motherduck, postgres, snowflake, util
from airbyte.caches.base import CacheBase
from airbyte.caches.bigquery import BigQueryCache
from airbyte.caches.duckdb import DuckDBCache
from airbyte.caches.motherduck import MotherDuckCache
from airbyte.caches.postgres import PostgresCache
from airbyte.caches.snowflake import SnowflakeCache
from airbyte.caches.util import get_default_cache, new_local_cache


# We export these classes for easy access: `airbyte.caches...`
__all__ = [
    # Factories
    "get_default_cache",
    "new_local_cache",
    # Classes
    "BigQueryCache",
    "CacheBase",
    "DuckDBCache",
    "MotherDuckCache",
    "PostgresCache",
    "SnowflakeCache",
    # Submodules
    "util",
    "base",
    "bigquery",
    "duckdb",
    "motherduck",
    "postgres",
    "snowflake",
]
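
As a quick sketch, the factories and cache classes listed above can be imported directly from the package:

# Minimal example of the exports above:
from airbyte.caches import DuckDBCache, get_default_cache

cache = get_default_cache()  # a DuckDBCache stored under ./.cache/default_cache
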
def get_default_cache() -> DuckDBCache:
def get_default_cache() -> DuckDBCache:
    """Get a local cache for storing data, using the default database path.

    Cache files are stored in the `.cache` directory, relative to the current
    working directory.
    """
    cache_dir = Path("./.cache/default_cache")
    return DuckDBCache(
        db_path=cache_dir / "default_cache.duckdb",
        cache_dir=cache_dir,
    )

Get a local cache for storing data, using the default database path.

Cache files are stored in the .cache directory, relative to the current working directory.
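
A minimal usage sketch (the connector name and its config are placeholders; any configured source works the same way):

import airbyte as ab

source = ab.get_source("source-faker", config={"count": 100})  # hypothetical source/config
source.select_all_streams()

cache = ab.get_default_cache()
result = source.read(cache=cache)  # records land in ./.cache/default_cache/default_cache.duckdb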

def new_local_cache( cache_name: str | None = None, cache_dir: str | pathlib.Path | None = None, *, cleanup: bool = True) -> DuckDBCache:
def new_local_cache(
    cache_name: str | None = None,
    cache_dir: str | Path | None = None,
    *,
    cleanup: bool = True,
) -> DuckDBCache:
    """Get a local cache for storing data, using a name string to seed the path.

    Args:
        cache_name: Name to use for the cache. Defaults to None.
        cache_dir: Root directory to store the cache in. Defaults to None.
        cleanup: Whether to clean up temporary files. Defaults to True.

    Cache files are stored in the `.cache` directory, relative to the current
    working directory.
    """
    if cache_name:
        if " " in cache_name:
            raise exc.PyAirbyteInputError(
                message="Cache name cannot contain spaces.",
                input_value=cache_name,
            )

        if not cache_name.replace("_", "").isalnum():
            raise exc.PyAirbyteInputError(
                message="Cache name can only contain alphanumeric characters and underscores.",
                input_value=cache_name,
            )

    cache_name = cache_name or str(ulid.ULID())
    cache_dir = cache_dir or Path(f"./.cache/{cache_name}")
    if not isinstance(cache_dir, Path):
        cache_dir = Path(cache_dir)

    return DuckDBCache(
        db_path=cache_dir / f"db_{cache_name}.duckdb",
        cache_dir=cache_dir,
        cleanup=cleanup,
    )

Get a local cache for storing data, using a name string to seed the path.

Arguments:
  • cache_name: Name to use for the cache. Defaults to None.
  • cache_dir: Root directory to store the cache in. Defaults to None.
  • cleanup: Whether to clean up temporary files. Defaults to True.

Cache files are stored in the .cache directory, relative to the current working directory.
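
For instance (the cache name is a placeholder; per the validation above, it may contain only alphanumeric characters and underscores):

import airbyte as ab

cache = ab.new_local_cache(cache_name="my_pipeline")
# Files are stored under ./.cache/my_pipeline/db_my_pipeline.duckdb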

class BigQueryCache(airbyte.caches.CacheBase):
class BigQueryCache(CacheBase):
    """The BigQuery cache implementation."""

    project_name: str
    """The name of the project to use. In BigQuery, this is equivalent to the database name."""

    dataset_name: str = "airbyte_raw"
    """The name of the dataset to use. In BigQuery, this is equivalent to the schema name."""

    credentials_path: Optional[str] = None
    """The path to the credentials file to use.
    If not passed, falls back to the default inferred from the environment."""

    _sql_processor_class: type[BigQuerySqlProcessor] = BigQuerySqlProcessor

    @root_validator(pre=True)
    @classmethod
    def set_schema_name(cls, values: dict[str, Any]) -> dict[str, Any]:
        dataset_name = values.get("dataset_name")
        if dataset_name is None:
            raise ValueError("dataset_name must be defined")  # noqa: TRY003
        values["schema_name"] = dataset_name
        return values

    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database. For BigQuery, this is the project name."""
        return self.project_name

    @overrides
    def get_sql_alchemy_url(self) -> str:
        """Return the SQLAlchemy URL to use."""
        url: URL = make_url(f"bigquery://{self.project_name!s}")
        if self.credentials_path:
            url = url.update_query_dict({"credentials_path": self.credentials_path})

        return str(url)

The BigQuery cache implementation.

project_name: str

The name of the project to use. In BigQuery, this is equivalent to the database name.

dataset_name: str

The name of the dataset to use. In BigQuery, this is equivalent to the schema name.

credentials_path: Optional[str]

The path to the credentials file to use. If not passed, falls back to the default inferred from the environment.
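
A minimal instantiation sketch (project, dataset, and credentials values are placeholders):

from airbyte.caches import BigQueryCache

cache = BigQueryCache(
    project_name="my-gcp-project",  # placeholder GCP project ID
    dataset_name="airbyte_raw",     # defaults to "airbyte_raw" if omitted
    credentials_path="/path/to/service_account.json",  # optional; falls back to the environment
)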

@root_validator(pre=True)
@classmethod
def set_schema_name(cls, values: dict[str, typing.Any]) -> dict[str, typing.Any]:
    @root_validator(pre=True)
    @classmethod
    def set_schema_name(cls, values: dict[str, Any]) -> dict[str, Any]:
        dataset_name = values.get("dataset_name")
        if dataset_name is None:
            raise ValueError("dataset_name must be defined")  # noqa: TRY003
        values["schema_name"] = dataset_name
        return values
@overrides
def get_database_name(self) -> str:
    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database. For BigQuery, this is the project name."""
        return self.project_name

Return the name of the database. For BigQuery, this is the project name.

@overrides
def get_sql_alchemy_url(self) -> str:
    @overrides
    def get_sql_alchemy_url(self) -> str:
        """Return the SQLAlchemy URL to use."""
        url: URL = make_url(f"bigquery://{self.project_name!s}")
        if self.credentials_path:
            url = url.update_query_dict({"credentials_path": self.credentials_path})

        return str(url)

Return the SQLAlchemy URL to use.

class CacheBase(pydantic.main.BaseModel):
class CacheBase(BaseModel):
    """Base configuration for a cache."""

    cache_dir: Path = Path(".cache")
    """The directory to store the cache in."""

    cleanup: bool = True
    """Whether to clean up the cache after use."""

    schema_name: str = "airbyte_raw"
    """The name of the schema to write to."""

    table_prefix: Optional[str] = None
    """A prefix to add to all table names.
    If 'None', a prefix will be created based on the source name.
    """

    table_suffix: str = ""
    """A suffix to add to all table names."""

    _deployed_api_root: Optional[str] = PrivateAttr(default=None)
    _deployed_workspace_id: Optional[str] = PrivateAttr(default=None)
    _deployed_destination_id: Optional[str] = PrivateAttr(default=None)
    _deployed_connection_id: Optional[str] = PrivateAttr(default=None)

    _sql_processor_class: type[SqlProcessorBase] = PrivateAttr()
    _sql_processor: Optional[SqlProcessorBase] = PrivateAttr(default=None)

    @final
    @property
    def processor(self) -> SqlProcessorBase:
        """Return the SQL processor instance."""
        if self._sql_processor is None:
            self._sql_processor = self._sql_processor_class(cache=self)
        return self._sql_processor

    @final
    def get_sql_engine(self) -> Engine:
        """Return a new SQL engine to use."""
        return self.processor.get_sql_engine()

    @abc.abstractmethod
    def get_sql_alchemy_url(self) -> str:
        """Returns a SQL Alchemy URL."""
        ...

    @abc.abstractmethod
    def get_database_name(self) -> str:
        """Return the name of the database."""
        ...

    @final
    @property
    def streams(
        self,
    ) -> dict[str, CachedDataset]:
        """Return a dictionary of cached datasets, keyed by stream name."""
        result = {}
        stream_names = self.processor.expected_streams
        if self._has_catalog_manager:
            stream_names |= set(self._catalog_manager.stream_names)
        for stream_name in stream_names:
            result[stream_name] = CachedDataset(self, stream_name)

        return result

    def _get_state(
        self,
        source_name: str,
        streams: list[str] | None,
    ) -> list[dict[str, Any]] | None:
        return self._catalog_manager.get_state(
            source_name=source_name,
            streams=streams,
        )

    @property
    def _has_catalog_manager(
        self,
    ) -> bool:
        """Return whether the cache has a catalog manager."""
        # Member is private until we have a public API for it.
        return self.processor._catalog_manager is not None  # noqa: SLF001

    @property
    def _catalog_manager(
        self,
    ) -> CatalogManager:
        if not self._has_catalog_manager:
            raise exc.PyAirbyteInternalError(
                message="Catalog manager should exist but does not.",
            )

        # Member is private until we have a public API for it.
        return cast(CatalogManager, self.processor._catalog_manager)  # noqa: SLF001

    def __getitem__(self, stream: str) -> DatasetBase:
        return self.streams[stream]

    def __contains__(self, stream: str) -> bool:
        return stream in (self.processor.expected_streams)

    def __iter__(self) -> Generator[tuple[str, Any], None, None]:
        return ((name, dataset) for name, dataset in self.streams.items())

Base configuration for a cache.

cache_dir: pathlib.Path

The directory to store the cache in.

cleanup: bool

Whether to clean up the cache after use.

schema_name: str

The name of the schema to write to.

table_prefix: Optional[str]

A prefix to add to all table names. If 'None', a prefix will be created based on the source name.

table_suffix: str

A suffix to add to all table names.

processor: airbyte._processors.sql.base.SqlProcessorBase
    @final
    @property
    def processor(self) -> SqlProcessorBase:
        """Return the SQL processor instance."""
        if self._sql_processor is None:
            self._sql_processor = self._sql_processor_class(cache=self)
        return self._sql_processor

Return the SQL processor instance.

@final
def get_sql_engine(self) -> sqlalchemy.engine.base.Engine:
    @final
    def get_sql_engine(self) -> Engine:
        """Return a new SQL engine to use."""
        return self.processor.get_sql_engine()

Return a new SQL engine to use.
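
This makes ad-hoc SQL access straightforward. A sketch, assuming `cache` is a concrete cache instance and a `users` stream has already been synced into it (both names are placeholders):

from sqlalchemy import text

engine = cache.get_sql_engine()
with engine.connect() as conn:
    row_count = conn.execute(text("SELECT COUNT(*) FROM users")).scalar()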

@abc.abstractmethod
def get_sql_alchemy_url(self) -> str:
    @abc.abstractmethod
    def get_sql_alchemy_url(self) -> str:
        """Returns a SQL Alchemy URL."""
        ...

Returns a SQL Alchemy URL.

@abc.abstractmethod
def get_database_name(self) -> str:
    @abc.abstractmethod
    def get_database_name(self) -> str:
        """Return the name of the database."""
        ...

Return the name of the database.

streams: dict[str, airbyte.datasets._sql.CachedDataset]
    @final
    @property
    def streams(
        self,
    ) -> dict[str, CachedDataset]:
        """Return a dictionary of cached datasets, keyed by stream name."""
        result = {}
        stream_names = self.processor.expected_streams
        if self._has_catalog_manager:
            stream_names |= set(self._catalog_manager.stream_names)
        for stream_name in stream_names:
            result[stream_name] = CachedDataset(self, stream_name)

        return result

Return a dictionary of cached datasets, keyed by stream name.
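
Since the class also defines `__getitem__` and `__iter__`, cached streams can be accessed like a mapping. A sketch, assuming `cache` has already been populated by a sync and `users` is one of its streams (placeholder names):

users_dataset = cache["users"]  # same as cache.streams["users"]
for name, dataset in cache:     # iterates (stream_name, CachedDataset) pairs
    print(name)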

class DuckDBCache(airbyte.caches.CacheBase):
class DuckDBCache(CacheBase):
    """A DuckDB cache."""

    db_path: Union[Path, str]
    """Normally db_path is a Path object.

    The database name will be inferred from the file name. For example, given a `db_path` of
    `/path/to/my/my_db.duckdb`, the database name is `my_db`.
    """

    schema_name: str = "main"
    """The name of the schema to write to. Defaults to "main"."""

    _sql_processor_class = DuckDBSqlProcessor

    @overrides
    def get_sql_alchemy_url(self) -> str:
        """Return the SQLAlchemy URL to use."""
        # return f"duckdb:///{self.db_path}?schema={self.schema_name}"
        return f"duckdb:///{self.db_path!s}"

    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        if self.db_path == ":memory:":
            return "memory"

        # Split the path on the appropriate separator ("/" or "\")
        split_on: Literal["/", "\\"] = "\\" if "\\" in str(self.db_path) else "/"

        # Return the file name without the extension
        return str(self.db_path).split(sep=split_on)[-1].split(".")[0]

A DuckDB cache.

db_path: Union[pathlib.Path, str]

Normally db_path is a Path object.

The database name will be inferred from the file name. For example, given a db_path of /path/to/my/my_db.duckdb, the database name is my_db.

schema_name: str

The name of the schema to write to. Defaults to "main".
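
A minimal instantiation sketch (the path is a placeholder):

from pathlib import Path
from airbyte.caches import DuckDBCache

cache = DuckDBCache(db_path=Path("/tmp/analytics.duckdb"))
# get_database_name() infers "analytics" from the file name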

@overrides
def get_sql_alchemy_url(self) -> str:
    @overrides
    def get_sql_alchemy_url(self) -> str:
        """Return the SQLAlchemy URL to use."""
        # return f"duckdb:///{self.db_path}?schema={self.schema_name}"
        return f"duckdb:///{self.db_path!s}"

Return the SQLAlchemy URL to use.

@overrides
def get_database_name(self) -> str:
    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        if self.db_path == ":memory:":
            return "memory"

        # Split the path on the appropriate separator ("/" or "\")
        split_on: Literal["/", "\\"] = "\\" if "\\" in str(self.db_path) else "/"

        # Return the file name without the extension
        return str(self.db_path).split(sep=split_on)[-1].split(".")[0]

Return the name of the database.

class MotherDuckCache(airbyte.caches.DuckDBCache):
class MotherDuckCache(DuckDBCache):
    """Cache that uses MotherDuck for external persistent storage."""

    db_path: str = Field(default="md:")
    database: str
    api_key: SecretString

    _sql_processor_class = MotherDuckSqlProcessor

    @overrides
    def get_sql_alchemy_url(self) -> SecretString:
        """Return the SQLAlchemy URL to use."""
        return SecretString(
            f"duckdb:///md:{self.database}?motherduck_token={self.api_key}"
            # f"&schema={self.schema_name}"  # TODO: Debug why this doesn't work
        )

    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        return self.database

Cache that uses MotherDuck for external persistent storage.

db_path: str

Normally db_path is a Path object.

The database name will be inferred from the file name. For example, given a db_path of /path/to/my/my_db.duckdb, the database name is my_db.

database: str
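
A minimal instantiation sketch (database name and secret name are placeholders):

import airbyte as ab
from airbyte.caches import MotherDuckCache

cache = MotherDuckCache(
    database="my_db",                             # placeholder MotherDuck database
    api_key=ab.get_secret("MOTHERDUCK_API_KEY"),  # resolved from the environment
)
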
@overrides
def get_sql_alchemy_url(self) -> airbyte.secrets.base.SecretString:
    @overrides
    def get_sql_alchemy_url(self) -> SecretString:
        """Return the SQLAlchemy URL to use."""
        return SecretString(
            f"duckdb:///md:{self.database}?motherduck_token={self.api_key}"
            # f"&schema={self.schema_name}"  # TODO: Debug why this doesn't work
        )

Return the SQLAlchemy URL to use.

@overrides
def get_database_name(self) -> str:
    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        return self.database

Return the name of the database.

class PostgresCache(airbyte.caches.CacheBase):
class PostgresCache(CacheBase):
    """Configuration for the Postgres cache.

    Also inherits config from the JsonlWriter, which is responsible for writing files to disk.
    """

    host: str
    port: int
    username: str
    password: SecretString
    database: str

    _sql_processor_class = PostgresSqlProcessor

    @overrides
    def get_sql_alchemy_url(self) -> SecretString:
        """Return the SQLAlchemy URL to use."""
        return SecretString(
            f"postgresql+psycopg2://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}"
        )

    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        return self.database

Configuration for the Postgres cache.

Also inherits config from the JsonlWriter, which is responsible for writing files to disk.

host: str
port: int
username: str
database: str
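
A minimal instantiation sketch (connection values and secret name are placeholders):

import airbyte as ab
from airbyte.caches import PostgresCache

cache = PostgresCache(
    host="localhost",
    port=5432,
    username="postgres",
    password=ab.get_secret("POSTGRES_PASSWORD"),
    database="airbyte_cache",
)
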
@overrides
def get_sql_alchemy_url(self) -> airbyte.secrets.base.SecretString:
    @overrides
    def get_sql_alchemy_url(self) -> SecretString:
        """Return the SQLAlchemy URL to use."""
        return SecretString(
            f"postgresql+psycopg2://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}"
        )

Return the SQLAlchemy URL to use.

@overrides
def get_database_name(self) -> str:
    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        return self.database

Return the name of the database.

class SnowflakeCache(airbyte.caches.CacheBase):
class SnowflakeCache(CacheBase):
    """Configuration for the Snowflake cache."""

    account: str
    username: str
    password: SecretString
    warehouse: str
    database: str
    role: str

    dedupe_mode = RecordDedupeMode.APPEND

    _sql_processor_class = SnowflakeSqlProcessor

    # Already defined in base class:
    # schema_name: str

    @overrides
    def get_sql_alchemy_url(self) -> SecretString:
        """Return the SQLAlchemy URL to use."""
        return SecretString(
            URL(
                account=self.account,
                user=self.username,
                password=self.password,
                database=self.database,
                warehouse=self.warehouse,
                schema=self.schema_name,
                role=self.role,
            )
        )

    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        return self.database

Configuration for the Snowflake cache.

account: str
username: str
warehouse: str
database: str
role: str
dedupe_mode
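
A minimal instantiation sketch (account, warehouse, role, and secret name are placeholders):

import airbyte as ab
from airbyte.caches import SnowflakeCache

cache = SnowflakeCache(
    account="my-account",
    username="my_user",
    password=ab.get_secret("SNOWFLAKE_PASSWORD"),
    warehouse="COMPUTE_WH",
    database="AIRBYTE_DB",
    role="AIRBYTE_ROLE",
)
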
@overrides
def get_sql_alchemy_url(self) -> airbyte.secrets.base.SecretString:
    @overrides
    def get_sql_alchemy_url(self) -> SecretString:
        """Return the SQLAlchemy URL to use."""
        return SecretString(
            URL(
                account=self.account,
                user=self.username,
                password=self.password,
                database=self.database,
                warehouse=self.warehouse,
                schema=self.schema_name,
                role=self.role,
            )
        )

Return the SQLAlchemy URL to use.

@overrides
def get_database_name(self) -> str:
    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        return self.database

Return the name of the database.
