airbyte.caches
Base module for all caches.
```python
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
"""Base module for all caches."""
from __future__ import annotations

from airbyte.caches import bigquery, duckdb, motherduck, postgres, snowflake, util
from airbyte.caches.base import CacheBase
from airbyte.caches.bigquery import BigQueryCache
from airbyte.caches.duckdb import DuckDBCache
from airbyte.caches.motherduck import MotherDuckCache
from airbyte.caches.postgres import PostgresCache
from airbyte.caches.snowflake import SnowflakeCache
from airbyte.caches.util import get_default_cache, new_local_cache


# We export these classes for easy access: `airbyte.caches...`
__all__ = [
    # Factories
    "get_default_cache",
    "new_local_cache",
    # Classes
    "BigQueryCache",
    "CacheBase",
    "DuckDBCache",
    "MotherDuckCache",
    "PostgresCache",
    "SnowflakeCache",
    # Submodules
    "util",
    "base",
    "bigquery",
    "duckdb",
    "motherduck",
    "postgres",
    "snowflake",
]
```
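Everything in `__all__` can be imported directly from `airbyte.caches`. A minimal usage sketch (the cache path is illustrative):

```python
from airbyte.caches import DuckDBCache, get_default_cache

# The default cache: a DuckDB file under ./.cache/default_cache/
cache = get_default_cache()

# Or an explicitly configured cache (path is illustrative):
custom_cache = DuckDBCache(db_path="./my_cache.duckdb")
```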
```python
def get_default_cache() -> DuckDBCache:
    """Get a local cache for storing data, using the default database path.

    Cache files are stored in the `.cache` directory, relative to the current
    working directory.
    """
    cache_dir = Path("./.cache/default_cache")
    return DuckDBCache(
        db_path=cache_dir / "default_cache.duckdb",
        cache_dir=cache_dir,
    )
```
Get a local cache for storing data, using the default database path.
Cache files are stored in the `.cache` directory, relative to the current working directory.
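A usage sketch; the printed values follow from the defaults shown above:

```python
from airbyte.caches import get_default_cache

cache = get_default_cache()
print(cache.cache_dir)               # .cache/default_cache
print(cache.get_sql_alchemy_url())   # duckdb:///.cache/default_cache/default_cache.duckdb
```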
```python
def new_local_cache(
    cache_name: str | None = None,
    cache_dir: str | Path | None = None,
    *,
    cleanup: bool = True,
) -> DuckDBCache:
    """Get a local cache for storing data, using a name string to seed the path.

    Args:
        cache_name: Name to use for the cache. Defaults to None.
        cache_dir: Root directory to store the cache in. Defaults to None.
        cleanup: Whether to clean up temporary files. Defaults to True.

    Cache files are stored in the `.cache` directory, relative to the current
    working directory.
    """
    if cache_name:
        if " " in cache_name:
            raise exc.PyAirbyteInputError(
                message="Cache name cannot contain spaces.",
                input_value=cache_name,
            )

        if not cache_name.replace("_", "").isalnum():
            raise exc.PyAirbyteInputError(
                message="Cache name can only contain alphanumeric characters and underscores.",
                input_value=cache_name,
            )

    cache_name = cache_name or str(ulid.ULID())
    cache_dir = cache_dir or Path(f"./.cache/{cache_name}")
    if not isinstance(cache_dir, Path):
        cache_dir = Path(cache_dir)

    return DuckDBCache(
        db_path=cache_dir / f"db_{cache_name}.duckdb",
        cache_dir=cache_dir,
        cleanup=cleanup,
    )
```
Get a local cache for storing data, using a name string to seed the path.
Arguments:
- cache_name: Name to use for the cache. Defaults to None.
- cache_dir: Root directory to store the cache in. Defaults to None.
- cleanup: Whether to clean up temporary files. Defaults to True.
Cache files are stored in the `.cache` directory, relative to the current working directory.
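A usage sketch showing both a named and an anonymous cache:

```python
from airbyte.caches import new_local_cache

# Named cache; files land in ./.cache/my_pipeline/db_my_pipeline.duckdb
cache = new_local_cache("my_pipeline", cleanup=False)

# Anonymous cache; the name is a generated ULID, and temporary files
# are removed after use because cleanup defaults to True.
temp_cache = new_local_cache()

# Invalid names raise PyAirbyteInputError:
# new_local_cache("not valid")   # spaces are rejected
```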
```python
class BigQueryCache(CacheBase):
    """The BigQuery cache implementation."""

    project_name: str
    """The name of the project to use. In BigQuery, this is equivalent to the database name."""

    dataset_name: str = "airbyte_raw"
    """The name of the dataset to use. In BigQuery, this is equivalent to the schema name."""

    credentials_path: Optional[str] = None
    """The path to the credentials file to use.
    If not passed, falls back to the default inferred from the environment."""

    _sql_processor_class: type[BigQuerySqlProcessor] = BigQuerySqlProcessor

    @root_validator(pre=True)
    @classmethod
    def set_schema_name(cls, values: dict[str, Any]) -> dict[str, Any]:
        dataset_name = values.get("dataset_name")
        if dataset_name is None:
            raise ValueError("dataset_name must be defined")  # noqa: TRY003
        values["schema_name"] = dataset_name
        return values

    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database. For BigQuery, this is the project name."""
        return self.project_name

    @overrides
    def get_sql_alchemy_url(self) -> str:
        """Return the SQLAlchemy URL to use."""
        url: URL = make_url(f"bigquery://{self.project_name!s}")
        if self.credentials_path:
            url = url.update_query_dict({"credentials_path": self.credentials_path})

        return str(url)
```
The BigQuery cache implementation.
The name of the project to use. In BigQuery, this is equivalent to the database name.
The name of the dataset to use. In BigQuery, this is equivalent to the schema name.
The path to the credentials file to use. If not passed, falls back to the default inferred from the environment.
```python
@root_validator(pre=True)
@classmethod
def set_schema_name(cls, values: dict[str, Any]) -> dict[str, Any]:
    dataset_name = values.get("dataset_name")
    if dataset_name is None:
        raise ValueError("dataset_name must be defined")  # noqa: TRY003
    values["schema_name"] = dataset_name
    return values
```
```python
@overrides
def get_database_name(self) -> str:
    """Return the name of the database. For BigQuery, this is the project name."""
    return self.project_name
```
Return the name of the database. For BigQuery, this is the project name.
```python
@overrides
def get_sql_alchemy_url(self) -> str:
    """Return the SQLAlchemy URL to use."""
    url: URL = make_url(f"bigquery://{self.project_name!s}")
    if self.credentials_path:
        url = url.update_query_dict({"credentials_path": self.credentials_path})

    return str(url)
```
Return the SQLAlchemy URL to use.
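A configuration sketch; the project, dataset, and key path are placeholders:

```python
from airbyte.caches import BigQueryCache

cache = BigQueryCache(
    project_name="my-gcp-project",      # placeholder GCP project ID
    dataset_name="pyairbyte_demo",      # copied to schema_name by the validator
    credentials_path="/path/to/service_account.json",  # omit to use ambient credentials
)
print(cache.get_database_name())    # my-gcp-project
print(cache.get_sql_alchemy_url())  # bigquery://my-gcp-project?credentials_path=...
```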
```python
class CacheBase(BaseModel):
    """Base configuration for a cache."""

    cache_dir: Path = Path(".cache")
    """The directory to store the cache in."""

    cleanup: bool = True
    """Whether to clean up the cache after use."""

    schema_name: str = "airbyte_raw"
    """The name of the schema to write to."""

    table_prefix: Optional[str] = None
    """A prefix to add to all table names.
    If 'None', a prefix will be created based on the source name.
    """

    table_suffix: str = ""
    """A suffix to add to all table names."""

    _deployed_api_root: Optional[str] = PrivateAttr(default=None)
    _deployed_workspace_id: Optional[str] = PrivateAttr(default=None)
    _deployed_destination_id: Optional[str] = PrivateAttr(default=None)
    _deployed_connection_id: Optional[str] = PrivateAttr(default=None)

    _sql_processor_class: type[SqlProcessorBase] = PrivateAttr()
    _sql_processor: Optional[SqlProcessorBase] = PrivateAttr(default=None)

    @final
    @property
    def processor(self) -> SqlProcessorBase:
        """Return the SQL processor instance."""
        if self._sql_processor is None:
            self._sql_processor = self._sql_processor_class(cache=self)
        return self._sql_processor

    @final
    def get_sql_engine(self) -> Engine:
        """Return a new SQL engine to use."""
        return self.processor.get_sql_engine()

    @abc.abstractmethod
    def get_sql_alchemy_url(self) -> str:
        """Returns a SQL Alchemy URL."""
        ...

    @abc.abstractmethod
    def get_database_name(self) -> str:
        """Return the name of the database."""
        ...

    @final
    @property
    def streams(
        self,
    ) -> dict[str, CachedDataset]:
        """Return a dictionary of all streams in the cache, keyed by stream name."""
        result = {}
        stream_names = self.processor.expected_streams
        if self._has_catalog_manager:
            stream_names |= set(self._catalog_manager.stream_names)
        for stream_name in stream_names:
            result[stream_name] = CachedDataset(self, stream_name)

        return result

    def _get_state(
        self,
        source_name: str,
        streams: list[str] | None,
    ) -> list[dict[str, Any]] | None:
        return self._catalog_manager.get_state(
            source_name=source_name,
            streams=streams,
        )

    @property
    def _has_catalog_manager(
        self,
    ) -> bool:
        """Return whether the cache has a catalog manager."""
        # Member is private until we have a public API for it.
        return self.processor._catalog_manager is not None  # noqa: SLF001

    @property
    def _catalog_manager(
        self,
    ) -> CatalogManager:
        if not self._has_catalog_manager:
            raise exc.PyAirbyteInternalError(
                message="Catalog manager should exist but does not.",
            )

        # Member is private until we have a public API for it.
        return cast(CatalogManager, self.processor._catalog_manager)  # noqa: SLF001

    def __getitem__(self, stream: str) -> DatasetBase:
        return self.streams[stream]

    def __contains__(self, stream: str) -> bool:
        return stream in (self.processor.expected_streams)

    def __iter__(self) -> Generator[tuple[str, Any], None, None]:
        return ((name, dataset) for name, dataset in self.streams.items())
```
Base configuration for a cache.
A prefix to add to all table names. If 'None', a prefix will be created based on the source name.
```python
@final
@property
def processor(self) -> SqlProcessorBase:
    """Return the SQL processor instance."""
    if self._sql_processor is None:
        self._sql_processor = self._sql_processor_class(cache=self)
    return self._sql_processor
```
Return the SQL processor instance.
```python
@final
def get_sql_engine(self) -> Engine:
    """Return a new SQL engine to use."""
    return self.processor.get_sql_engine()
```
Return a new SQL engine to use.
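Since this is a standard SQLAlchemy `Engine`, the cache contents can be queried directly. A sketch, assuming a cache that already contains a `users` table (the table name is a placeholder):

```python
import pandas as pd

from airbyte.caches import get_default_cache

cache = get_default_cache()
engine = cache.get_sql_engine()

# "users" is a placeholder table written by a previous sync.
df = pd.read_sql_table("users", con=engine, schema=cache.schema_name)
```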
```python
@abc.abstractmethod
def get_sql_alchemy_url(self) -> str:
    """Returns a SQL Alchemy URL."""
    ...
```
Returns a SQL Alchemy URL.
```python
@abc.abstractmethod
def get_database_name(self) -> str:
    """Return the name of the database."""
    ...
```
Return the name of the database.
```python
@final
@property
def streams(
    self,
) -> dict[str, CachedDataset]:
    """Return a dictionary of all streams in the cache, keyed by stream name."""
    result = {}
    stream_names = self.processor.expected_streams
    if self._has_catalog_manager:
        stream_names |= set(self._catalog_manager.stream_names)
    for stream_name in stream_names:
        result[stream_name] = CachedDataset(self, stream_name)

    return result
```
Return a dictionary of all streams in the cache, keyed by stream name.
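The mapping interface (`__getitem__`, `__contains__`, `__iter__`) is routed through this property. A sketch, assuming a cache previously populated with a `users` stream (both names are placeholders):

```python
from airbyte.caches import new_local_cache

cache = new_local_cache("demo")

for name, dataset in cache:          # __iter__ yields (name, dataset) pairs
    print(name, dataset)

if "users" in cache:                 # __contains__ checks the expected streams
    users_df = cache["users"].to_pandas()   # __getitem__ returns a CachedDataset
```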
```python
class DuckDBCache(CacheBase):
    """A DuckDB cache."""

    db_path: Union[Path, str]
    """Normally db_path is a Path object.

    The database name will be inferred from the file name. For example, given a `db_path` of
    `/path/to/my/my_db.duckdb`, the database name is `my_db`.
    """

    schema_name: str = "main"
    """The name of the schema to write to. Defaults to "main"."""

    _sql_processor_class = DuckDBSqlProcessor

    @overrides
    def get_sql_alchemy_url(self) -> str:
        """Return the SQLAlchemy URL to use."""
        # return f"duckdb:///{self.db_path}?schema={self.schema_name}"
        return f"duckdb:///{self.db_path!s}"

    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        if self.db_path == ":memory:":
            return "memory"

        # Split the path on the appropriate separator ("/" or "\")
        split_on: Literal["/", "\\"] = "\\" if "\\" in str(self.db_path) else "/"

        # Return the file name without the extension
        return str(self.db_path).split(sep=split_on)[-1].split(".")[0]
```
A DuckDB cache.
Normally db_path is a Path object.
The database name will be inferred from the file name. For example, given a `db_path` of `/path/to/my/my_db.duckdb`, the database name is `my_db`.
```python
@overrides
def get_sql_alchemy_url(self) -> str:
    """Return the SQLAlchemy URL to use."""
    # return f"duckdb:///{self.db_path}?schema={self.schema_name}"
    return f"duckdb:///{self.db_path!s}"
```
Return the SQLAlchemy URL to use.
```python
@overrides
def get_database_name(self) -> str:
    """Return the name of the database."""
    if self.db_path == ":memory:":
        return "memory"

    # Split the path on the appropriate separator ("/" or "\")
    split_on: Literal["/", "\\"] = "\\" if "\\" in str(self.db_path) else "/"

    # Return the file name without the extension
    return str(self.db_path).split(sep=split_on)[-1].split(".")[0]
```
Return the name of the database.
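A configuration sketch; the path is illustrative:

```python
from airbyte.caches import DuckDBCache

cache = DuckDBCache(db_path="/tmp/analysis/my_db.duckdb")
print(cache.get_database_name())    # my_db
print(cache.get_sql_alchemy_url())  # duckdb:////tmp/analysis/my_db.duckdb
```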
```python
class MotherDuckCache(DuckDBCache):
    """Cache that uses MotherDuck for external persistent storage."""

    db_path: str = Field(default="md:")
    database: str
    api_key: SecretString

    _sql_processor_class = MotherDuckSqlProcessor

    @overrides
    def get_sql_alchemy_url(self) -> SecretString:
        """Return the SQLAlchemy URL to use."""
        return SecretString(
            f"duckdb:///md:{self.database}?motherduck_token={self.api_key}"
            # f"&schema={self.schema_name}"  # TODO: Debug why this doesn't work
        )

    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        return self.database
```
Cache that uses MotherDuck for external persistent storage.
Normally db_path is a Path object.
The database name will be inferred from the file name. For example, given a `db_path` of `/path/to/my/my_db.duckdb`, the database name is `my_db`.
```python
@overrides
def get_sql_alchemy_url(self) -> SecretString:
    """Return the SQLAlchemy URL to use."""
    return SecretString(
        f"duckdb:///md:{self.database}?motherduck_token={self.api_key}"
        # f"&schema={self.schema_name}"  # TODO: Debug why this doesn't work
    )
```
Return the SQLAlchemy URL to use.
```python
@overrides
def get_database_name(self) -> str:
    """Return the name of the database."""
    return self.database
```
Return the name of the database.
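A configuration sketch; the database name and token are placeholders:

```python
from airbyte.caches import MotherDuckCache

cache = MotherDuckCache(
    database="my_md_database",   # placeholder MotherDuck database name
    api_key="md_token_...",      # placeholder token; treat as a secret
)
print(cache.get_database_name())  # my_md_database
# Note: the SQLAlchemy URL embeds the token, so avoid logging it.
```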
```python
class PostgresCache(CacheBase):
    """Configuration for the Postgres cache.

    Also inherits config from the JsonlWriter, which is responsible for writing files to disk.
    """

    host: str
    port: int
    username: str
    password: SecretString
    database: str

    _sql_processor_class = PostgresSqlProcessor

    @overrides
    def get_sql_alchemy_url(self) -> SecretString:
        """Return the SQLAlchemy URL to use."""
        return SecretString(
            f"postgresql+psycopg2://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}"
        )

    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        return self.database
```
Configuration for the Postgres cache.
Also inherits config from the JsonlWriter, which is responsible for writing files to disk.
```python
@overrides
def get_sql_alchemy_url(self) -> SecretString:
    """Return the SQLAlchemy URL to use."""
    return SecretString(
        f"postgresql+psycopg2://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}"
    )
```
Return the SQLAlchemy URL to use.
```python
@overrides
def get_database_name(self) -> str:
    """Return the name of the database."""
    return self.database
```
Return the name of the database.
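A configuration sketch; all connection details are placeholders:

```python
from airbyte.caches import PostgresCache

cache = PostgresCache(
    host="localhost",            # placeholder connection details
    port=5432,
    username="pyairbyte",
    password="********",
    database="pyairbyte_cache",
)
# Resolves to: postgresql+psycopg2://pyairbyte:********@localhost:5432/pyairbyte_cache
```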
```python
class SnowflakeCache(CacheBase):
    """Configuration for the Snowflake cache."""

    account: str
    username: str
    password: SecretString
    warehouse: str
    database: str
    role: str

    dedupe_mode = RecordDedupeMode.APPEND

    _sql_processor_class = SnowflakeSqlProcessor

    # Already defined in base class:
    # schema_name: str

    @overrides
    def get_sql_alchemy_url(self) -> SecretString:
        """Return the SQLAlchemy URL to use."""
        return SecretString(
            URL(
                account=self.account,
                user=self.username,
                password=self.password,
                database=self.database,
                warehouse=self.warehouse,
                schema=self.schema_name,
                role=self.role,
            )
        )

    @overrides
    def get_database_name(self) -> str:
        """Return the name of the database."""
        return self.database
```
Configuration for the Snowflake cache.
```python
@overrides
def get_sql_alchemy_url(self) -> SecretString:
    """Return the SQLAlchemy URL to use."""
    return SecretString(
        URL(
            account=self.account,
            user=self.username,
            password=self.password,
            database=self.database,
            warehouse=self.warehouse,
            schema=self.schema_name,
            role=self.role,
        )
    )
```
Return the SQLAlchemy URL to use.
```python
@overrides
def get_database_name(self) -> str:
    """Return the name of the database."""
    return self.database
```
Return the name of the database.
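A configuration sketch; all identifiers and credentials are placeholders:

```python
from airbyte.caches import SnowflakeCache

cache = SnowflakeCache(
    account="myorg-myaccount",   # placeholder account identifier
    username="PYAIRBYTE_USER",
    password="********",
    warehouse="COMPUTE_WH",
    database="PYAIRBYTE_DB",
    role="PYAIRBYTE_ROLE",
)
print(cache.get_database_name())  # PYAIRBYTE_DB
```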