airbyte.cloud.sync_results
Sync results for Airbyte Cloud workspaces.
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
"""Sync results for Airbyte Cloud workspaces."""

from __future__ import annotations

import time
from collections.abc import Iterator, Mapping
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, final

from airbyte_api.models.shared import ConnectionResponse, JobStatusEnum

from airbyte._util import api_util
from airbyte.cloud._destination_util import create_cache_from_destination_config
from airbyte.datasets import CachedDataset
from airbyte.exceptions import AirbyteConnectionSyncError, AirbyteConnectionSyncTimeoutError


DEFAULT_SYNC_TIMEOUT_SECONDS = 30 * 60  # 30 minutes


if TYPE_CHECKING:
    import sqlalchemy

    from airbyte.caches.base import CacheBase
    from airbyte.cloud.connections import CloudConnection
    from airbyte.cloud.workspaces import CloudWorkspace


FINAL_STATUSES = {
    JobStatusEnum.SUCCEEDED,
    JobStatusEnum.FAILED,
    JobStatusEnum.CANCELLED,
}
FAILED_STATUSES = {
    JobStatusEnum.FAILED,
    JobStatusEnum.CANCELLED,
}


@dataclass
class SyncResult:
    """The result of a sync operation."""

    workspace: CloudWorkspace
    connection: CloudConnection
    job_id: str
    table_name_prefix: str = ""
    table_name_suffix: str = ""
    _latest_status: JobStatusEnum | None = None
    _connection_response: ConnectionResponse | None = None
    _cache: CacheBase | None = None

    @property
    def job_url(self) -> str:
        """Return the URL of the sync job."""
        return f"{self.connection.job_history_url}/{self.job_id}"

    def _get_connection_info(self, *, force_refresh: bool = False) -> ConnectionResponse:
        """Return connection info for the sync job."""
        if self._connection_response and not force_refresh:
            return self._connection_response

        self._connection_response = api_util.get_connection(
            workspace_id=self.workspace.workspace_id,
            api_root=self.workspace.api_root,
            api_key=self.workspace.api_key,
            connection_id=self.connection.connection_id,
        )
        return self._connection_response

    def _get_destination_configuration(self, *, force_refresh: bool = False) -> dict[str, Any]:
        """Return the destination configuration for the sync job."""
        connection_info: ConnectionResponse = self._get_connection_info(force_refresh=force_refresh)
        destination_response = api_util.get_destination(
            destination_id=connection_info.destination_id,
            api_root=self.workspace.api_root,
            api_key=self.workspace.api_key,
        )
        return destination_response.configuration

    def is_job_complete(self) -> bool:
        """Check if the sync job is complete."""
        return self.get_job_status() in FINAL_STATUSES

    def get_job_status(self) -> JobStatusEnum:
        """Return the latest status of the sync job."""
        if self._latest_status and self._latest_status in FINAL_STATUSES:
            return self._latest_status

        job_info = api_util.get_job_info(
            job_id=self.job_id,
            api_root=self.workspace.api_root,
            api_key=self.workspace.api_key,
        )
        self._latest_status = job_info.status

        return job_info.status

    def raise_failure_status(
        self,
        *,
        refresh_status: bool = False,
    ) -> None:
        """Raise an exception if the sync job failed.

        By default, this method uses the latest status available. If you want to refresh the
        status before checking for failure, set `refresh_status=True`. If the job has failed,
        this method raises an `AirbyteConnectionSyncError`.

        Otherwise, do nothing.
        """
        latest_status = self._latest_status
        if refresh_status:
            latest_status = self.get_job_status()

        if latest_status in FAILED_STATUSES:
            raise AirbyteConnectionSyncError(
                workspace=self.workspace,
                connection_id=self.connection.connection_id,
                job_id=self.job_id,
                job_status=self._latest_status,
            )

    def wait_for_completion(
        self,
        *,
        wait_timeout: int = DEFAULT_SYNC_TIMEOUT_SECONDS,
        raise_timeout: bool = True,
        raise_failure: bool = False,
    ) -> JobStatusEnum:
        """Wait for a job to finish running."""
        start_time = time.time()
        while True:
            latest_status = self.get_job_status()
            if latest_status in FINAL_STATUSES:
                if raise_failure:
                    # No-op if the job succeeded or is still running:
                    self.raise_failure_status()

                return latest_status

            if time.time() - start_time > wait_timeout:
                if raise_timeout:
                    raise AirbyteConnectionSyncTimeoutError(
                        workspace=self.workspace,
                        connection_id=self.connection.connection_id,
                        job_id=self.job_id,
                        job_status=latest_status,
                        timeout=wait_timeout,
                    )

                return latest_status  # This will be a non-final status

            time.sleep(api_util.JOB_WAIT_INTERVAL_SECS)

    def get_sql_cache(self) -> CacheBase:
        """Return a SQL cache object for working with the data in a SQL-based destination."""
        if self._cache:
            return self._cache

        destination_configuration: dict[str, Any] = self._get_destination_configuration()
        self._cache = create_cache_from_destination_config(
            destination_configuration=destination_configuration
        )
        return self._cache

    def get_sql_engine(self) -> sqlalchemy.engine.Engine:
        """Return a SQL Engine for querying a SQL-based destination."""
        return self.get_sql_cache().get_sql_engine()

    def get_sql_table_name(self, stream_name: str) -> str:
        """Return the SQL table name of the named stream."""
        return self.get_sql_cache().processor.get_sql_table_name(stream_name=stream_name)

    def get_sql_table(
        self,
        stream_name: str,
    ) -> sqlalchemy.Table:
        """Return a SQLAlchemy table object for the named stream."""
        return self.get_sql_cache().processor.get_sql_table(stream_name)

    def get_dataset(self, stream_name: str) -> CachedDataset:
        """Return a cached dataset for the named stream."""
        return CachedDataset(self.get_sql_cache(), stream_name=stream_name)

    def get_sql_database_name(self) -> str:
        """Return the SQL database name."""
        cache = self.get_sql_cache()
        return cache.get_database_name()

    def get_sql_schema_name(self) -> str:
        """Return the SQL schema name."""
        cache = self.get_sql_cache()
        return cache.schema_name

    @property
    def stream_names(self) -> set[str]:
        """Return the set of stream names."""
        return self.get_sql_cache().processor.expected_streams

    @final
    @property
    def streams(
        self,
    ) -> SyncResultStreams:
        """Return a mapping of stream names to cached datasets."""
        return self.SyncResultStreams(self)

    class SyncResultStreams(Mapping[str, CachedDataset]):
        """A mapping of stream names to cached datasets."""

        def __init__(
            self,
            parent: SyncResult,
            /,
        ) -> None:
            self.parent: SyncResult = parent

        def __getitem__(self, key: str) -> CachedDataset:
            return self.parent.get_dataset(stream_name=key)

        def __iter__(self) -> Iterator[str]:
            """Iterate over the available stream names."""
            return iter(self.parent.stream_names)

        def __len__(self) -> int:
            return len(self.parent.stream_names)
@dataclass
class SyncResult:
The result of a sync operation.
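A minimal end-to-end sketch of how a SyncResult is typically obtained. The workspace and connection IDs are placeholders, and the `get_connection()` / `run_sync()` entry points from `airbyte.cloud` are assumptions here; treat the exact calls as illustrative rather than authoritative.

from airbyte.cloud import CloudWorkspace

# Placeholders: substitute real IDs and credentials.
workspace = CloudWorkspace(
    workspace_id="my-workspace-id",
    api_key="my-api-key",
)
connection = workspace.get_connection(connection_id="my-connection-id")

# Trigger a sync; the returned SyncResult tracks the job:
sync_result = connection.run_sync()
print(sync_result.job_id)
print(sync_result.job_url)  # Deep link to the job in the Airbyte Cloud UI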
    @property
    def job_url(self) -> str:
        """Return the URL of the sync job."""
        return f"{self.connection.job_history_url}/{self.job_id}"
Return the URL of the sync job.
    def is_job_complete(self) -> bool:
        """Check if the sync job is complete."""
        return self.get_job_status() in FINAL_STATUSES
Check if the sync job is complete.
    def get_job_status(self) -> JobStatusEnum:
        """Return the latest status of the sync job."""
        if self._latest_status and self._latest_status in FINAL_STATUSES:
            return self._latest_status

        job_info = api_util.get_job_info(
            job_id=self.job_id,
            api_root=self.workspace.api_root,
            api_key=self.workspace.api_key,
        )
        self._latest_status = job_info.status

        return job_info.status
Return the latest status of the sync job.
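Continuing from the sync_result above, a short polling sketch that uses only the methods documented here. Note that once a final status has been observed, it is cached and subsequent calls make no further API requests.

status = sync_result.get_job_status()
if sync_result.is_job_complete():
    print(f"Job {sync_result.job_id} finished with status: {status}")
else:
    print(f"Job {sync_result.job_id} is still running (status: {status})")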
    def raise_failure_status(
        self,
        *,
        refresh_status: bool = False,
    ) -> None:
        """Raise an exception if the sync job failed.

        By default, this method uses the latest status available. If you want to refresh the
        status before checking for failure, set `refresh_status=True`. If the job has failed,
        this method raises an `AirbyteConnectionSyncError`.

        Otherwise, do nothing.
        """
        latest_status = self._latest_status
        if refresh_status:
            latest_status = self.get_job_status()

        if latest_status in FAILED_STATUSES:
            raise AirbyteConnectionSyncError(
                workspace=self.workspace,
                connection_id=self.connection.connection_id,
                job_id=self.job_id,
                job_status=self._latest_status,
            )
Raise an exception if the sync job failed.

By default, this method uses the latest status available. If you want to refresh the status before checking for failure, set `refresh_status=True`. If the job has failed, this method raises an `AirbyteConnectionSyncError`. Otherwise, it does nothing.
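For example, to convert a failed or cancelled job into an exception (a sketch, continuing from the sync_result above):

from airbyte.exceptions import AirbyteConnectionSyncError

try:
    # Refresh the status first so a recent failure is not missed:
    sync_result.raise_failure_status(refresh_status=True)
except AirbyteConnectionSyncError as ex:
    print(f"Sync job {sync_result.job_id} failed: {ex}")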
    def wait_for_completion(
        self,
        *,
        wait_timeout: int = DEFAULT_SYNC_TIMEOUT_SECONDS,
        raise_timeout: bool = True,
        raise_failure: bool = False,
    ) -> JobStatusEnum:
        """Wait for a job to finish running."""
        start_time = time.time()
        while True:
            latest_status = self.get_job_status()
            if latest_status in FINAL_STATUSES:
                if raise_failure:
                    # No-op if the job succeeded or is still running:
                    self.raise_failure_status()

                return latest_status

            if time.time() - start_time > wait_timeout:
                if raise_timeout:
                    raise AirbyteConnectionSyncTimeoutError(
                        workspace=self.workspace,
                        connection_id=self.connection.connection_id,
                        job_id=self.job_id,
                        job_status=latest_status,
                        timeout=wait_timeout,
                    )

                return latest_status  # This will be a non-final status

            time.sleep(api_util.JOB_WAIT_INTERVAL_SECS)
Wait for a job to finish running.
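A sketch of the two error-handling knobs, continuing from the sync_result above. With `raise_timeout=False`, the method returns the latest (possibly non-final) status instead of raising `AirbyteConnectionSyncTimeoutError`; with `raise_failure=True`, a failed job raises `AirbyteConnectionSyncError`.

from airbyte_api.models.shared import JobStatusEnum

# Wait up to 15 minutes; raise on failure, but return quietly on timeout:
final_status = sync_result.wait_for_completion(
    wait_timeout=15 * 60,
    raise_timeout=False,
    raise_failure=True,
)
if final_status != JobStatusEnum.SUCCEEDED:
    print(f"Job has not succeeded yet: {final_status}")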
    def get_sql_cache(self) -> CacheBase:
        """Return a SQL cache object for working with the data in a SQL-based destination."""
        if self._cache:
            return self._cache

        destination_configuration: dict[str, Any] = self._get_destination_configuration()
        self._cache = create_cache_from_destination_config(
            destination_configuration=destination_configuration
        )
        return self._cache
Return a SQL cache object for working with the data in a SQL-based destination.
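The concrete cache class depends on the connection's destination. A small sketch, continuing from the sync_result above; the printed class name is illustrative only:

cache = sync_result.get_sql_cache()
print(type(cache).__name__)  # e.g. "SnowflakeCache", depending on the destination
print(cache.get_database_name(), cache.schema_name)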
    def get_sql_engine(self) -> sqlalchemy.engine.Engine:
        """Return a SQL Engine for querying a SQL-based destination."""
        return self.get_sql_cache().get_sql_engine()
Return a SQL Engine for querying a SQL-based destination.
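A sketch of querying the destination directly over SQLAlchemy, combining get_sql_engine() with get_sql_schema_name() and get_sql_table_name() to build a fully qualified table reference; the "users" stream name is hypothetical.

import sqlalchemy

engine = sync_result.get_sql_engine()
qualified_name = (
    f"{sync_result.get_sql_schema_name()}."
    f"{sync_result.get_sql_table_name('users')}"  # 'users' is a hypothetical stream
)
with engine.connect() as conn:
    row_count = conn.execute(
        sqlalchemy.text(f"SELECT COUNT(*) FROM {qualified_name}")
    ).scalar_one()
print(f"{qualified_name} contains {row_count} rows")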
    def get_sql_table_name(self, stream_name: str) -> str:
        """Return the SQL table name of the named stream."""
        return self.get_sql_cache().processor.get_sql_table_name(stream_name=stream_name)
Return the SQL table name of the named stream.
    def get_sql_table(
        self,
        stream_name: str,
    ) -> sqlalchemy.Table:
        """Return a SQLAlchemy table object for the named stream."""
        return self.get_sql_cache().processor.get_sql_table(stream_name)
Return a SQLAlchemy table object for the named stream.
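The returned table object works with SQLAlchemy Core expressions; a sketch, again with a hypothetical "users" stream:

import sqlalchemy

users_table = sync_result.get_sql_table("users")  # hypothetical stream
stmt = sqlalchemy.select(users_table).limit(10)
with sync_result.get_sql_engine().connect() as conn:
    for row in conn.execute(stmt):
        print(row)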
    def get_dataset(self, stream_name: str) -> CachedDataset:
        """Return a cached dataset for the named stream."""
        return CachedDataset(self.get_sql_cache(), stream_name=stream_name)
Return a cached dataset for the named stream.
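A CachedDataset can be iterated record by record or converted to a pandas DataFrame via to_pandas() (part of the PyAirbyte dataset interface); a sketch with a hypothetical "users" stream:

dataset = sync_result.get_dataset("users")  # hypothetical stream

# Records can be iterated lazily:
for record in dataset:
    print(record)
    break

# Or materialized into a pandas DataFrame:
df = dataset.to_pandas()
print(df.head())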
    def get_sql_database_name(self) -> str:
        """Return the SQL database name."""
        cache = self.get_sql_cache()
        return cache.get_database_name()
Return the SQL database name.
    def get_sql_schema_name(self) -> str:
        """Return the SQL schema name."""
        cache = self.get_sql_cache()
        return cache.schema_name
Return the SQL schema name.
    class SyncResultStreams(Mapping[str, CachedDataset]):
        """A mapping of stream names to cached datasets."""

        def __init__(
            self,
            parent: SyncResult,
            /,
        ) -> None:
            self.parent: SyncResult = parent

        def __getitem__(self, key: str) -> CachedDataset:
            return self.parent.get_dataset(stream_name=key)

        def __iter__(self) -> Iterator[str]:
            """Iterate over the available stream names."""
            return iter(self.parent.stream_names)

        def __len__(self) -> int:
            return len(self.parent.stream_names)
A mapping of stream names to cached datasets.
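Because this class implements Mapping, the streams property behaves like a read-only dict of stream name to CachedDataset; a sketch, continuing from the sync_result above:

# Iterate over all synced streams and inspect each dataset:
for stream_name, dataset in sync_result.streams.items():
    print(f"{stream_name}: {dataset.to_pandas().shape[0]} records")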
Inherited Members
- collections.abc.Mapping
- get
- keys
- items
- values