airbyte.cloud.sync_results

Sync results for Airbyte Cloud workspaces.

  1# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
  2"""Sync results for Airbyte Cloud workspaces."""
  3
  4from __future__ import annotations
  5
  6import time
  7from collections.abc import Iterator, Mapping
  8from dataclasses import dataclass
  9from typing import TYPE_CHECKING, Any, final
 10
 11from airbyte_api.models.shared import ConnectionResponse, JobStatusEnum
 12
 13from airbyte._util import api_util
 14from airbyte.cloud._destination_util import create_cache_from_destination_config
 15from airbyte.datasets import CachedDataset
 16from airbyte.exceptions import AirbyteConnectionSyncError, AirbyteConnectionSyncTimeoutError
 17
 18
 19DEFAULT_SYNC_TIMEOUT_SECONDS = 30 * 60  # 30 minutes
 20
 21
 22if TYPE_CHECKING:
 23    import sqlalchemy
 24
 25    from airbyte.caches.base import CacheBase
 26    from airbyte.cloud.connections import CloudConnection
 27    from airbyte.cloud.workspaces import CloudWorkspace
 28
 29
 30FINAL_STATUSES = {
 31    JobStatusEnum.SUCCEEDED,
 32    JobStatusEnum.FAILED,
 33    JobStatusEnum.CANCELLED,
 34}
 35FAILED_STATUSES = {
 36    JobStatusEnum.FAILED,
 37    JobStatusEnum.CANCELLED,
 38}
 39
 40
 41@dataclass
 42class SyncResult:
 43    """The result of a sync operation."""
 44
 45    workspace: CloudWorkspace
 46    connection: CloudConnection
 47    job_id: str
 48    table_name_prefix: str = ""
 49    table_name_suffix: str = ""
 50    _latest_status: JobStatusEnum | None = None
 51    _connection_response: ConnectionResponse | None = None
 52    _cache: CacheBase | None = None
 53
 54    @property
 55    def job_url(self) -> str:
 56        """Return the URL of the sync job."""
 57        return f"{self.connection.job_history_url}/{self.job_id}"
 58
 59    def _get_connection_info(self, *, force_refresh: bool = False) -> ConnectionResponse:
 60        """Return connection info for the sync job."""
 61        if self._connection_response and not force_refresh:
 62            return self._connection_response
 63
 64        self._connection_response = api_util.get_connection(
 65            workspace_id=self.workspace.workspace_id,
 66            api_root=self.workspace.api_root,
 67            api_key=self.workspace.api_key,
 68            connection_id=self.connection.connection_id,
 69        )
 70        return self._connection_response
 71
 72    def _get_destination_configuration(self, *, force_refresh: bool = False) -> dict[str, Any]:
 73        """Return the destination configuration for the sync job."""
 74        connection_info: ConnectionResponse = self._get_connection_info(force_refresh=force_refresh)
 75        destination_response = api_util.get_destination(
 76            destination_id=connection_info.destination_id,
 77            api_root=self.workspace.api_root,
 78            api_key=self.workspace.api_key,
 79        )
 80        return destination_response.configuration
 81
 82    def is_job_complete(self) -> bool:
 83        """Check if the sync job is complete."""
 84        return self.get_job_status() in FINAL_STATUSES
 85
 86    def get_job_status(self) -> JobStatusEnum:
 87        """Check if the sync job is still running."""
 88        if self._latest_status and self._latest_status in FINAL_STATUSES:
 89            return self._latest_status
 90
 91        job_info = api_util.get_job_info(
 92            job_id=self.job_id,
 93            api_root=self.workspace.api_root,
 94            api_key=self.workspace.api_key,
 95        )
 96        self._latest_status = job_info.status
 97
 98        return job_info.status
 99
100    def raise_failure_status(
101        self,
102        *,
103        refresh_status: bool = False,
104    ) -> None:
105        """Raise an exception if the sync job failed.
106
107        By default, this method will use the latest status available. If you want to refresh the
108        status before checking for failure, set `refresh_status=True`. If the job has failed, this
109        method will raise a `AirbyteConnectionSyncError`.
110
111        Otherwise, do nothing.
112        """
113        latest_status = self._latest_status
114        if refresh_status:
115            latest_status = self.get_job_status()
116
117        if latest_status in FAILED_STATUSES:
118            raise AirbyteConnectionSyncError(
119                workspace=self.workspace,
120                connection_id=self.connection.connection_id,
121                job_id=self.job_id,
122                job_status=self._latest_status,
123            )
124
125    def wait_for_completion(
126        self,
127        *,
128        wait_timeout: int = DEFAULT_SYNC_TIMEOUT_SECONDS,
129        raise_timeout: bool = True,
130        raise_failure: bool = False,
131    ) -> JobStatusEnum:
132        """Wait for a job to finish running."""
133        start_time = time.time()
134        while True:
135            latest_status = self.get_job_status()
136            if latest_status in FINAL_STATUSES:
137                if raise_failure:
138                    # No-op if the job succeeded or is still running:
139                    self.raise_failure_status()
140
141                return latest_status
142
143            if time.time() - start_time > wait_timeout:
144                if raise_timeout:
145                    raise AirbyteConnectionSyncTimeoutError(
146                        workspace=self.workspace,
147                        connection_id=self.connection.connection_id,
148                        job_id=self.job_id,
149                        job_status=latest_status,
150                        timeout=wait_timeout,
151                    )
152
153                return latest_status  # This will be a non-final status
154
155            time.sleep(api_util.JOB_WAIT_INTERVAL_SECS)
156
157    def get_sql_cache(self) -> CacheBase:
158        """Return a SQL Cache object for working with the data in a SQL-based destination's."""
159        if self._cache:
160            return self._cache
161
162        destination_configuration: dict[str, Any] = self._get_destination_configuration()
163        self._cache = create_cache_from_destination_config(
164            destination_configuration=destination_configuration
165        )
166        return self._cache
167
168    def get_sql_engine(self) -> sqlalchemy.engine.Engine:
169        """Return a SQL Engine for querying a SQL-based destination."""
170        self.get_sql_cache().get_sql_engine()
171
172    def get_sql_table_name(self, stream_name: str) -> str:
173        """Return the SQL table name of the named stream."""
174        return self.get_sql_cache().processor.get_sql_table_name(stream_name=stream_name)
175
176    def get_sql_table(
177        self,
178        stream_name: str,
179    ) -> sqlalchemy.Table:
180        """Return a SQLAlchemy table object for the named stream."""
181        self.get_sql_cache().processor.get_sql_table(stream_name)
182
183    def get_dataset(self, stream_name: str) -> CachedDataset:
184        """Return cached dataset."""
185        return CachedDataset(self.get_sql_cache(), stream_name=stream_name)
186
187    def get_sql_database_name(self) -> str:
188        """Return the SQL database name."""
189        cache = self.get_sql_cache()
190        return cache.get_database_name()
191
192    def get_sql_schema_name(self) -> str:
193        """Return the SQL schema name."""
194        cache = self.get_sql_cache()
195        return cache.schema_name
196
197    @property
198    def stream_names(self) -> set[str]:
199        """Return the set of stream names."""
200        return self.get_sql_cache().processor.expected_streams
201
202    @final
203    @property
204    def streams(
205        self,
206    ) -> SyncResultStreams:
207        """Return a temporary table name."""
208        return self.SyncResultStreams(self)
209
210    class SyncResultStreams(Mapping[str, CachedDataset]):
211        """A mapping of stream names to cached datasets."""
212
213        def __init__(
214            self,
215            parent: SyncResult,
216            /,
217        ) -> None:
218            self.parent: SyncResult = parent
219
220        def __getitem__(self, key: str) -> CachedDataset:
221            return self.parent.get_dataset(stream_name=key)
222
223        def __iter__(self) -> Iterator[str]:
224            """TODO"""
225            return iter(self.parent.stream_names)
226
227        def __len__(self) -> int:
228            return len(self.parent.stream_names)
DEFAULT_SYNC_TIMEOUT_SECONDS = 1800
FINAL_STATUSES = {<JobStatusEnum.CANCELLED: 'cancelled'>, <JobStatusEnum.SUCCEEDED: 'succeeded'>, <JobStatusEnum.FAILED: 'failed'>}
FAILED_STATUSES = {<JobStatusEnum.CANCELLED: 'cancelled'>, <JobStatusEnum.FAILED: 'failed'>}
@dataclass
class SyncResult:
 42@dataclass
 43class SyncResult:
 44    """The result of a sync operation."""
 45
 46    workspace: CloudWorkspace
 47    connection: CloudConnection
 48    job_id: str
 49    table_name_prefix: str = ""
 50    table_name_suffix: str = ""
 51    _latest_status: JobStatusEnum | None = None
 52    _connection_response: ConnectionResponse | None = None
 53    _cache: CacheBase | None = None
 54
 55    @property
 56    def job_url(self) -> str:
 57        """Return the URL of the sync job."""
 58        return f"{self.connection.job_history_url}/{self.job_id}"
 59
 60    def _get_connection_info(self, *, force_refresh: bool = False) -> ConnectionResponse:
 61        """Return connection info for the sync job."""
 62        if self._connection_response and not force_refresh:
 63            return self._connection_response
 64
 65        self._connection_response = api_util.get_connection(
 66            workspace_id=self.workspace.workspace_id,
 67            api_root=self.workspace.api_root,
 68            api_key=self.workspace.api_key,
 69            connection_id=self.connection.connection_id,
 70        )
 71        return self._connection_response
 72
 73    def _get_destination_configuration(self, *, force_refresh: bool = False) -> dict[str, Any]:
 74        """Return the destination configuration for the sync job."""
 75        connection_info: ConnectionResponse = self._get_connection_info(force_refresh=force_refresh)
 76        destination_response = api_util.get_destination(
 77            destination_id=connection_info.destination_id,
 78            api_root=self.workspace.api_root,
 79            api_key=self.workspace.api_key,
 80        )
 81        return destination_response.configuration
 82
 83    def is_job_complete(self) -> bool:
 84        """Check if the sync job is complete."""
 85        return self.get_job_status() in FINAL_STATUSES
 86
 87    def get_job_status(self) -> JobStatusEnum:
 88        """Check if the sync job is still running."""
 89        if self._latest_status and self._latest_status in FINAL_STATUSES:
 90            return self._latest_status
 91
 92        job_info = api_util.get_job_info(
 93            job_id=self.job_id,
 94            api_root=self.workspace.api_root,
 95            api_key=self.workspace.api_key,
 96        )
 97        self._latest_status = job_info.status
 98
 99        return job_info.status
100
101    def raise_failure_status(
102        self,
103        *,
104        refresh_status: bool = False,
105    ) -> None:
106        """Raise an exception if the sync job failed.
107
108        By default, this method will use the latest status available. If you want to refresh the
109        status before checking for failure, set `refresh_status=True`. If the job has failed, this
110        method will raise a `AirbyteConnectionSyncError`.
111
112        Otherwise, do nothing.
113        """
114        latest_status = self._latest_status
115        if refresh_status:
116            latest_status = self.get_job_status()
117
118        if latest_status in FAILED_STATUSES:
119            raise AirbyteConnectionSyncError(
120                workspace=self.workspace,
121                connection_id=self.connection.connection_id,
122                job_id=self.job_id,
123                job_status=self._latest_status,
124            )
125
126    def wait_for_completion(
127        self,
128        *,
129        wait_timeout: int = DEFAULT_SYNC_TIMEOUT_SECONDS,
130        raise_timeout: bool = True,
131        raise_failure: bool = False,
132    ) -> JobStatusEnum:
133        """Wait for a job to finish running."""
134        start_time = time.time()
135        while True:
136            latest_status = self.get_job_status()
137            if latest_status in FINAL_STATUSES:
138                if raise_failure:
139                    # No-op if the job succeeded or is still running:
140                    self.raise_failure_status()
141
142                return latest_status
143
144            if time.time() - start_time > wait_timeout:
145                if raise_timeout:
146                    raise AirbyteConnectionSyncTimeoutError(
147                        workspace=self.workspace,
148                        connection_id=self.connection.connection_id,
149                        job_id=self.job_id,
150                        job_status=latest_status,
151                        timeout=wait_timeout,
152                    )
153
154                return latest_status  # This will be a non-final status
155
156            time.sleep(api_util.JOB_WAIT_INTERVAL_SECS)
157
158    def get_sql_cache(self) -> CacheBase:
159        """Return a SQL Cache object for working with the data in a SQL-based destination."""
160        if self._cache:
161            return self._cache
162
163        destination_configuration: dict[str, Any] = self._get_destination_configuration()
164        self._cache = create_cache_from_destination_config(
165            destination_configuration=destination_configuration
166        )
167        return self._cache
168
169    def get_sql_engine(self) -> sqlalchemy.engine.Engine:
170        """Return a SQL Engine for querying a SQL-based destination."""
171        self.get_sql_cache().get_sql_engine()
172
173    def get_sql_table_name(self, stream_name: str) -> str:
174        """Return the SQL table name of the named stream."""
175        return self.get_sql_cache().processor.get_sql_table_name(stream_name=stream_name)
176
177    def get_sql_table(
178        self,
179        stream_name: str,
180    ) -> sqlalchemy.Table:
181        """Return a SQLAlchemy table object for the named stream."""
182        self.get_sql_cache().processor.get_sql_table(stream_name)
183
184    def get_dataset(self, stream_name: str) -> CachedDataset:
185        """Return cached dataset."""
186        return CachedDataset(self.get_sql_cache(), stream_name=stream_name)
187
188    def get_sql_database_name(self) -> str:
189        """Return the SQL database name."""
190        cache = self.get_sql_cache()
191        return cache.get_database_name()
192
193    def get_sql_schema_name(self) -> str:
194        """Return the SQL schema name."""
195        cache = self.get_sql_cache()
196        return cache.schema_name
197
198    @property
199    def stream_names(self) -> set[str]:
200        """Return the set of stream names."""
201        return self.get_sql_cache().processor.expected_streams
202
203    @final
204    @property
205    def streams(
206        self,
207    ) -> SyncResultStreams:
208        """Return a mapping of stream names to cached datasets."""
209        return self.SyncResultStreams(self)
210
211    class SyncResultStreams(Mapping[str, CachedDataset]):
212        """A mapping of stream names to cached datasets."""
213
214        def __init__(
215            self,
216            parent: SyncResult,
217            /,
218        ) -> None:
219            self.parent: SyncResult = parent
220
221        def __getitem__(self, key: str) -> CachedDataset:
222            return self.parent.get_dataset(stream_name=key)
223
224        def __iter__(self) -> Iterator[str]:
225            """Iterate over the available stream names."""
226            return iter(self.parent.stream_names)
227
228        def __len__(self) -> int:
229            return len(self.parent.stream_names)

The result of a sync operation.

SyncResult( workspace: airbyte.cloud.workspaces.CloudWorkspace, connection: airbyte.cloud.connections.CloudConnection, job_id: str, table_name_prefix: str = '', table_name_suffix: str = '', _latest_status: airbyte_api.models.shared.jobstatusenum.JobStatusEnum | None = None, _connection_response: airbyte_api.models.shared.connectionresponse.ConnectionResponse | None = None, _cache: airbyte.caches.base.CacheBase | None = None)
job_id: str
table_name_prefix: str = ''
table_name_suffix: str = ''
job_url: str
55    @property
56    def job_url(self) -> str:
57        """Return the URL of the sync job."""
58        return f"{self.connection.job_history_url}/{self.job_id}"

Return the URL of the sync job.

def is_job_complete(self) -> bool:
83    def is_job_complete(self) -> bool:
84        """Check if the sync job is complete."""
85        return self.get_job_status() in FINAL_STATUSES

Check if the sync job is complete.

def get_job_status(self) -> airbyte_api.models.shared.jobstatusenum.JobStatusEnum:
87    def get_job_status(self) -> JobStatusEnum:
88        """Check if the sync job is still running."""
89        if self._latest_status and self._latest_status in FINAL_STATUSES:
90            return self._latest_status
91
92        job_info = api_util.get_job_info(
93            job_id=self.job_id,
94            api_root=self.workspace.api_root,
95            api_key=self.workspace.api_key,
96        )
97        self._latest_status = job_info.status
98
99        return job_info.status

Check if the sync job is still running.

def raise_failure_status(self, *, refresh_status: bool = False) -> None:
101    def raise_failure_status(
102        self,
103        *,
104        refresh_status: bool = False,
105    ) -> None:
106        """Raise an exception if the sync job failed.
107
108        By default, this method will use the latest status available. If you want to refresh the
109        status before checking for failure, set `refresh_status=True`. If the job has failed, this
110        method will raise a `AirbyteConnectionSyncError`.
111
112        Otherwise, do nothing.
113        """
114        latest_status = self._latest_status
115        if refresh_status:
116            latest_status = self.get_job_status()
117
118        if latest_status in FAILED_STATUSES:
119            raise AirbyteConnectionSyncError(
120                workspace=self.workspace,
121                connection_id=self.connection.connection_id,
122                job_id=self.job_id,
123                job_status=self._latest_status,
124            )

Raise an exception if the sync job failed.

By default, this method will use the latest status available. If you want to refresh the status before checking for failure, set refresh_status=True. If the job has failed, this method will raise a AirbyteConnectionSyncError.

Otherwise, do nothing.

def wait_for_completion( self, *, wait_timeout: int = 1800, raise_timeout: bool = True, raise_failure: bool = False) -> airbyte_api.models.shared.jobstatusenum.JobStatusEnum:
126    def wait_for_completion(
127        self,
128        *,
129        wait_timeout: int = DEFAULT_SYNC_TIMEOUT_SECONDS,
130        raise_timeout: bool = True,
131        raise_failure: bool = False,
132    ) -> JobStatusEnum:
133        """Wait for a job to finish running."""
134        start_time = time.time()
135        while True:
136            latest_status = self.get_job_status()
137            if latest_status in FINAL_STATUSES:
138                if raise_failure:
139                    # No-op if the job succeeded or is still running:
140                    self.raise_failure_status()
141
142                return latest_status
143
144            if time.time() - start_time > wait_timeout:
145                if raise_timeout:
146                    raise AirbyteConnectionSyncTimeoutError(
147                        workspace=self.workspace,
148                        connection_id=self.connection.connection_id,
149                        job_id=self.job_id,
150                        job_status=latest_status,
151                        timeout=wait_timeout,
152                    )
153
154                return latest_status  # This will be a non-final status
155
156            time.sleep(api_util.JOB_WAIT_INTERVAL_SECS)

Wait for a job to finish running.

def get_sql_cache(self) -> airbyte.caches.base.CacheBase:
158    def get_sql_cache(self) -> CacheBase:
159        """Return a SQL Cache object for working with the data in a SQL-based destination."""
160        if self._cache:
161            return self._cache
162
163        destination_configuration: dict[str, Any] = self._get_destination_configuration()
164        self._cache = create_cache_from_destination_config(
165            destination_configuration=destination_configuration
166        )
167        return self._cache

Return a SQL Cache object for working with the data in a SQL-based destination.

def get_sql_engine(self) -> sqlalchemy.engine.base.Engine:
169    def get_sql_engine(self) -> sqlalchemy.engine.Engine:
170        """Return a SQL Engine for querying a SQL-based destination."""
171        self.get_sql_cache().get_sql_engine()

Return a SQL Engine for querying a SQL-based destination.

def get_sql_table_name(self, stream_name: str) -> str:
173    def get_sql_table_name(self, stream_name: str) -> str:
174        """Return the SQL table name of the named stream."""
175        return self.get_sql_cache().processor.get_sql_table_name(stream_name=stream_name)

Return the SQL table name of the named stream.

def get_sql_table(self, stream_name: str) -> sqlalchemy.sql.schema.Table:
177    def get_sql_table(
178        self,
179        stream_name: str,
180    ) -> sqlalchemy.Table:
181        """Return a SQLAlchemy table object for the named stream."""
182        self.get_sql_cache().processor.get_sql_table(stream_name)

Return a SQLAlchemy table object for the named stream.

def get_dataset(self, stream_name: str) -> airbyte.datasets._sql.CachedDataset:
184    def get_dataset(self, stream_name: str) -> CachedDataset:
185        """Return cached dataset."""
186        return CachedDataset(self.get_sql_cache(), stream_name=stream_name)

Return cached dataset.

def get_sql_database_name(self) -> str:
188    def get_sql_database_name(self) -> str:
189        """Return the SQL database name."""
190        cache = self.get_sql_cache()
191        return cache.get_database_name()

Return the SQL database name.

def get_sql_schema_name(self) -> str:
193    def get_sql_schema_name(self) -> str:
194        """Return the SQL schema name."""
195        cache = self.get_sql_cache()
196        return cache.schema_name

Return the SQL schema name.

stream_names: set[str]
198    @property
199    def stream_names(self) -> set[str]:
200        """Return the set of stream names."""
201        return self.get_sql_cache().processor.expected_streams

Return the set of stream names.

streams: SyncResult.SyncResultStreams
203    @final
204    @property
205    def streams(
206        self,
207    ) -> SyncResultStreams:
208        """Return a mapping of stream names to cached datasets."""
209        return self.SyncResultStreams(self)

Return a mapping of stream names to cached datasets.

class SyncResult.SyncResultStreams(collections.abc.Mapping[str, airbyte.datasets._sql.CachedDataset]):
211    class SyncResultStreams(Mapping[str, CachedDataset]):
212        """A mapping of stream names to cached datasets."""
213
214        def __init__(
215            self,
216            parent: SyncResult,
217            /,
218        ) -> None:
219            self.parent: SyncResult = parent
220
221        def __getitem__(self, key: str) -> CachedDataset:
222            return self.parent.get_dataset(stream_name=key)
223
224        def __iter__(self) -> Iterator[str]:
225            """Iterate over the available stream names."""
226            return iter(self.parent.stream_names)
227
228        def __len__(self) -> int:
229            return len(self.parent.stream_names)

A mapping of stream names to cached datasets.

SyncResult.SyncResultStreams(parent: SyncResult, /)
214        def __init__(
215            self,
216            parent: SyncResult,
217            /,
218        ) -> None:
219            self.parent: SyncResult = parent
parent: SyncResult
Inherited Members
collections.abc.Mapping
get
keys
items
values