airbyte.sources.base
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
from __future__ import annotations

import json
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast

import jsonschema
import pendulum
import yaml
from rich import print
from rich.syntax import Syntax
from typing_extensions import Literal

from airbyte_protocol.models import (
    AirbyteCatalog,
    AirbyteMessage,
    AirbyteStateMessage,
    ConfiguredAirbyteCatalog,
    ConfiguredAirbyteStream,
    ConnectorSpecification,
    DestinationSyncMode,
    Status,
    SyncMode,
    TraceType,
    Type,
)

from airbyte import exceptions as exc
from airbyte._util.telemetry import (
    EventState,
    EventType,
    log_config_validation_result,
    log_source_check_result,
    send_telemetry,
)
from airbyte._util.temp_files import as_temp_files
from airbyte.caches.util import get_default_cache
from airbyte.datasets._lazy import LazyDataset
from airbyte.progress import progress
from airbyte.records import StreamRecord
from airbyte.results import ReadResult
from airbyte.strategies import WriteStrategy
from airbyte.warnings import PyAirbyteDataLossWarning


if TYPE_CHECKING:
    from collections.abc import Generator, Iterable, Iterator

    from airbyte_protocol.models.airbyte_protocol import AirbyteStream

    from airbyte._executor import Executor
    from airbyte.caches import CacheBase
    from airbyte.documents import Document


class Source:
    """A class representing a source that can be called."""

    def __init__(
        self,
        executor: Executor,
        name: str,
        config: dict[str, Any] | None = None,
        streams: str | list[str] | None = None,
        *,
        validate: bool = False,
    ) -> None:
        """Initialize the source.

        If config is provided, it will be validated against the spec if validate is True.
        """
        self.executor = executor
        self.name = name
        self._processed_records = 0
        self._config_dict: dict[str, Any] | None = None
        self._last_log_messages: list[str] = []
        self._discovered_catalog: AirbyteCatalog | None = None
        self._spec: ConnectorSpecification | None = None
        self._selected_stream_names: list[str] = []
        if config is not None:
            self.set_config(config, validate=validate)
        if streams is not None:
            self.select_streams(streams)

        self._deployed_api_root: str | None = None
        self._deployed_workspace_id: str | None = None
        self._deployed_source_id: str | None = None
        self._deployed_connection_id: str | None = None

    def set_streams(self, streams: list[str]) -> None:
        """Deprecated. See select_streams()."""
        warnings.warn(
            "The 'set_streams' method is deprecated and will be removed in a future version. "
            "Please use the 'select_streams' method instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        self.select_streams(streams)

    def select_all_streams(self) -> None:
        """Select all streams.

        This is a more streamlined equivalent to:
        > source.select_streams(source.get_available_streams()).
        """
        self._selected_stream_names = self.get_available_streams()

    def select_streams(self, streams: str | list[str]) -> None:
        """Select the stream names that should be read from the connector.

        Args:
        - streams: A list of stream names to select. If set to "*", all streams will be selected.

        Currently, if this is not set, all streams will be read.
        """
        if streams == "*":
            self.select_all_streams()
            return

        if isinstance(streams, str):
            # If a single stream is provided, convert it to a one-item list
            streams = [streams]

        available_streams = self.get_available_streams()
        for stream in streams:
            if stream not in available_streams:
                raise exc.AirbyteStreamNotFoundError(
                    stream_name=stream,
                    connector_name=self.name,
                    available_streams=available_streams,
                )
        self._selected_stream_names = streams

    def get_selected_streams(self) -> list[str]:
        """Get the selected streams.

        If no streams are selected, return an empty list.
        """
        return self._selected_stream_names

    def set_config(
        self,
        config: dict[str, Any],
        *,
        validate: bool = True,
    ) -> None:
        """Set the config for the connector.

        If validate is True, raise an exception if the config fails validation.

        If validate is False, validation will be deferred until check() or validate_config()
        is called.
        """
        if validate:
            self.validate_config(config)

        self._config_dict = config

    def get_config(self) -> dict[str, Any]:
        """Get the config for the connector."""
        return self._config

    @property
    def _config(self) -> dict[str, Any]:
        if self._config_dict is None:
            raise exc.AirbyteConnectorConfigurationMissingError(
                guidance="Provide via get_source() or set_config()"
            )
        return self._config_dict

    def _discover(self) -> AirbyteCatalog:
        """Call discover on the connector.

        This involves the following steps:
        * Write the config to a temporary file
        * Execute the connector with discover --config <config_file>
        * Listen to the messages and return the first AirbyteCatalog that comes along.
        * Make sure the subprocess is killed when the function returns.
        """
        with as_temp_files([self._config]) as [config_file]:
            for msg in self._execute(["discover", "--config", config_file]):
                if msg.type == Type.CATALOG and msg.catalog:
                    return msg.catalog
            raise exc.AirbyteConnectorMissingCatalogError(
                log_text=self._last_log_messages,
            )

    def validate_config(self, config: dict[str, Any] | None = None) -> None:
        """Validate the config against the spec.

        If config is not provided, the already-set config will be validated.
        """
        spec = self._get_spec(force_refresh=False)
        config = self._config if config is None else config
        try:
            jsonschema.validate(config, spec.connectionSpecification)
            log_config_validation_result(
                name=self.name,
                state=EventState.SUCCEEDED,
            )
        except jsonschema.ValidationError as ex:
            validation_ex = exc.AirbyteConnectorValidationFailedError(
                message="The provided config is not valid.",
                context={
                    "error_message": ex.message,
                    "error_path": ex.path,
                    "error_instance": ex.instance,
                    "error_schema": ex.schema,
                },
            )
            log_config_validation_result(
                name=self.name,
                state=EventState.FAILED,
                exception=validation_ex,
            )
            raise validation_ex from ex

    def get_available_streams(self) -> list[str]:
        """Get the available streams from the discovered catalog."""
        return [s.name for s in self.discovered_catalog.streams]

    def _get_spec(self, *, force_refresh: bool = False) -> ConnectorSpecification:
        """Call spec on the connector.

        This involves the following steps:
        * Execute the connector with spec
        * Listen to the messages and return the first ConnectorSpecification that comes along.
        * Make sure the subprocess is killed when the function returns.
        """
        if force_refresh or self._spec is None:
            for msg in self._execute(["spec"]):
                if msg.type == Type.SPEC and msg.spec:
                    self._spec = msg.spec
                    break

        if self._spec:
            return self._spec

        raise exc.AirbyteConnectorMissingSpecError(
            log_text=self._last_log_messages,
        )

    @property
    def config_spec(self) -> dict[str, Any]:
        """Generate a configuration spec for this connector, as a JSON Schema definition.

        This returns a JSON Schema dictionary describing the configuration options for the
        current connector.

        Returns:
            dict: The JSON Schema configuration spec as a dictionary.
        """
        return self._get_spec(force_refresh=True).connectionSpecification

    def print_config_spec(
        self,
        format: Literal["yaml", "json"] = "yaml",  # noqa: A002
        *,
        output_file: Path | str | None = None,
    ) -> None:
        """Print the configuration spec for this connector.

        Args:
        - format: The format to print the spec in. Must be "yaml" or "json".
        - output_file: Optional. If set, the spec will be written to the given file path.
          Otherwise, it will be printed to the console.
        """
        if format not in ["yaml", "json"]:
            raise exc.PyAirbyteInputError(
                message="Invalid format. Expected 'yaml' or 'json'",
                input_value=format,
            )
        if isinstance(output_file, str):
            output_file = Path(output_file)

        if format == "yaml":
            content = yaml.dump(self.config_spec, indent=2)
        elif format == "json":
            content = json.dumps(self.config_spec, indent=2)

        if output_file:
            output_file.write_text(content)
            return

        syntax_highlighted = Syntax(content, format)
        print(syntax_highlighted)

    @property
    def _yaml_spec(self) -> str:
        """Get the spec as a yaml string.

        For now, the primary use case is for writing and debugging a valid config for a source.

        This is private for now because we probably want better polish before exposing this
        as a stable interface. This will also get easier when we have docs links with this info
        for each connector.
        """
        spec_obj: ConnectorSpecification = self._get_spec()
        spec_dict = spec_obj.dict(exclude_unset=True)
        # convert to a yaml string
        return yaml.dump(spec_dict)

    @property
    def docs_url(self) -> str:
        """Get the URL to the connector's documentation."""
        # TODO: Replace with docs URL from metadata when available
        return "https://docs.airbyte.com/integrations/sources/" + self.name.lower().replace(
            "source-", ""
        )

    @property
    def discovered_catalog(self) -> AirbyteCatalog:
        """Get the raw catalog for the given streams.

        If the catalog is not yet known, we call discover to get it.
        """
        if self._discovered_catalog is None:
            self._discovered_catalog = self._discover()

        return self._discovered_catalog

    @property
    def configured_catalog(self) -> ConfiguredAirbyteCatalog:
        """Get the configured catalog for the given streams.

        If the raw catalog is not yet known, we call discover to get it.

        If no specific streams are selected, we return a catalog that syncs all available streams.

        TODO: We should consider disabling by default the streams that the connector would
        disable by default. (For instance, streams that require a premium license are sometimes
        disabled by default within the connector.)
        """
        # Ensure discovered catalog is cached before we start
        _ = self.discovered_catalog

        # Filter for selected streams if set, otherwise use all available streams:
        streams_filter: list[str] = self._selected_stream_names or self.get_available_streams()

        return ConfiguredAirbyteCatalog(
            streams=[
                ConfiguredAirbyteStream(
                    stream=stream,
                    destination_sync_mode=DestinationSyncMode.overwrite,
                    primary_key=stream.source_defined_primary_key,
                    # TODO: The below assumes all sources can coalesce from incremental sync to
                    # full_table as needed. CDK supports this, so it might be safe:
                    sync_mode=SyncMode.incremental,
                )
                for stream in self.discovered_catalog.streams
                if stream.name in streams_filter
            ],
        )

    def get_stream_json_schema(self, stream_name: str) -> dict[str, Any]:
        """Return the JSON Schema spec for the specified stream name."""
        catalog: AirbyteCatalog = self.discovered_catalog
        found: list[AirbyteStream] = [
            stream for stream in catalog.streams if stream.name == stream_name
        ]

        if len(found) == 0:
            raise exc.PyAirbyteInputError(
                message="Stream name does not exist in catalog.",
                input_value=stream_name,
            )

        if len(found) > 1:
            raise exc.PyAirbyteInternalError(
                message="Duplicate streams found with the same name.",
                context={
                    "found_streams": found,
                },
            )

        return found[0].json_schema

    def get_records(self, stream: str) -> LazyDataset:
        """Read a stream from the connector.

        This involves the following steps:
        * Call discover to get the catalog
        * Generate a configured catalog that syncs the given stream in full_refresh mode
        * Write the configured catalog and the config to a temporary file
        * Execute the connector with read --config <config_file> --catalog <catalog_file>
        * Listen to the messages and return the AirbyteRecordMessages that come along.
        * Make sure the subprocess is killed when the function returns.
        """
        discovered_catalog: AirbyteCatalog = self.discovered_catalog
        configured_catalog = ConfiguredAirbyteCatalog(
            streams=[
                ConfiguredAirbyteStream(
                    stream=s,
                    sync_mode=SyncMode.full_refresh,
                    destination_sync_mode=DestinationSyncMode.overwrite,
                )
                for s in discovered_catalog.streams
                if s.name == stream
            ],
        )
        if len(configured_catalog.streams) == 0:
            raise exc.PyAirbyteInputError(
                message="Requested stream does not exist.",
                context={
                    "stream": stream,
                    "available_streams": self.get_available_streams(),
                    "connector_name": self.name,
                },
            ) from KeyError(stream)

        configured_stream = configured_catalog.streams[0]
        all_properties = cast(
            list[str], list(configured_stream.stream.json_schema["properties"].keys())
        )

        def _with_logging(records: Iterable[dict[str, Any]]) -> Iterator[dict[str, Any]]:
            self._log_sync_start(cache=None)
            yield from records
            self._log_sync_success(cache=None)

        iterator: Iterator[dict[str, Any]] = _with_logging(
            records=(  # Generator comprehension yields StreamRecord objects for each record
                StreamRecord.from_record_message(
                    record_message=record.record,
                    expected_keys=all_properties,
                    prune_extra_fields=True,
                )
                for record in self._read_with_catalog(configured_catalog)
                if record.record
            )
        )
        return LazyDataset(
            iterator,
            stream_metadata=configured_stream,
        )

    def get_documents(
        self,
        stream: str,
        title_property: str | None = None,
        content_properties: list[str] | None = None,
        metadata_properties: list[str] | None = None,
        *,
        render_metadata: bool = False,
    ) -> Iterable[Document]:
        """Read a stream from the connector and return the records as documents.

        If metadata_properties is not set, all properties that are not content will be added to
        the metadata.

        If render_metadata is True, metadata will be rendered in the document, as well as the
        main content.
        """
        return self.get_records(stream).to_documents(
            title_property=title_property,
            content_properties=content_properties,
            metadata_properties=metadata_properties,
            render_metadata=render_metadata,
        )

    def check(self) -> None:
        """Call check on the connector.

        This involves the following steps:
        * Write the config to a temporary file
        * Execute the connector with check --config <config_file>
        * Listen to the messages and raise if the connection status reports a failure.
        * Make sure the subprocess is killed when the function returns.
        """
        with as_temp_files([self._config]) as [config_file]:
            try:
                for msg in self._execute(["check", "--config", config_file]):
                    if msg.type == Type.CONNECTION_STATUS and msg.connectionStatus:
                        if msg.connectionStatus.status != Status.FAILED:
                            print(f"Connection check succeeded for `{self.name}`.")
                            log_source_check_result(
                                name=self.name,
                                state=EventState.SUCCEEDED,
                            )
                            return

                        log_source_check_result(
                            name=self.name,
                            state=EventState.FAILED,
                        )
                        raise exc.AirbyteConnectorCheckFailedError(
                            help_url=self.docs_url,
                            context={
                                "failure_reason": msg.connectionStatus.message,
                            },
                        )
                raise exc.AirbyteConnectorCheckFailedError(log_text=self._last_log_messages)
            except exc.AirbyteConnectorReadError as ex:
                raise exc.AirbyteConnectorCheckFailedError(
                    message="The connector failed to check the connection.",
                    log_text=ex.log_text,
                ) from ex

    def install(self) -> None:
        """Install the connector if it is not yet installed."""
        self.executor.install()
        print("For configuration instructions, see: \n" f"{self.docs_url}#reference\n")

    def uninstall(self) -> None:
        """Uninstall the connector if it is installed.

        This only works if the use_local_install flag wasn't used and installation is managed by
        PyAirbyte.
        """
        self.executor.uninstall()

    def _read_with_catalog(
        self,
        catalog: ConfiguredAirbyteCatalog,
        state: list[AirbyteStateMessage] | None = None,
    ) -> Iterator[AirbyteMessage]:
        """Call read on the connector.

        This involves the following steps:
        * Write the config to a temporary file
        * Execute the connector with read --config <config_file> --catalog <catalog_file>
        * Listen to the messages and return the AirbyteRecordMessages that come along.
        * Send out telemetry on the performed sync (with information about which source was used
          and the type of the cache)
        """
        self._processed_records = 0  # Reset the counter before we start
        with as_temp_files(
            [
                self._config,
                catalog.json(),
                json.dumps(state) if state else "[]",
            ]
        ) as [
            config_file,
            catalog_file,
            state_file,
        ]:
            yield from self._tally_records(
                self._execute(
                    [
                        "read",
                        "--config",
                        config_file,
                        "--catalog",
                        catalog_file,
                        "--state",
                        state_file,
                    ],
                )
            )

    def _add_to_logs(self, message: str) -> None:
        self._last_log_messages.append(message)
        self._last_log_messages = self._last_log_messages[-10:]

    def _execute(self, args: list[str]) -> Iterator[AirbyteMessage]:
        """Execute the connector with the given arguments.

        This involves the following steps:
        * Locate the right venv. It is called ".venv-<connector_name>"
        * Spawn a subprocess with .venv-<connector_name>/bin/<connector-name> <args>
        * Read the subprocess output line by line and deserialize each line into an
          AirbyteMessage object. Drop lines that are not valid messages.
        """
        # Fail early if the connector is not installed.
        self.executor.ensure_installation(auto_fix=False)

        try:
            self._last_log_messages = []
            for line in self.executor.execute(args):
                try:
                    message = AirbyteMessage.parse_raw(line)
                    if message.type is Type.RECORD:
                        self._processed_records += 1
                    if message.type == Type.LOG:
                        self._add_to_logs(message.log.message)
                    if message.type == Type.TRACE and message.trace.type == TraceType.ERROR:
                        self._add_to_logs(message.trace.error.message)
                    yield message
                except Exception:
                    self._add_to_logs(line)
        except Exception as e:
            raise exc.AirbyteConnectorReadError(
                log_text=self._last_log_messages,
            ) from e

    def _tally_records(
        self,
        messages: Iterable[AirbyteMessage],
    ) -> Generator[AirbyteMessage, Any, None]:
        """Tally the number of records processed and yield the messages unchanged."""
        self._processed_records = 0  # Reset the counter before we start
        progress.reset(len(self._selected_stream_names or []))

        for message in messages:
            yield message
            progress.log_records_read(new_total_count=self._processed_records)

    def _log_sync_start(
        self,
        *,
        cache: CacheBase | None,
    ) -> None:
        """Log the start of a sync operation."""
        print(f"Started `{self.name}` read operation at {pendulum.now().format('HH:mm:ss')}...")
        send_telemetry(
            source=self,
            cache=cache,
            state=EventState.STARTED,
            event_type=EventType.SYNC,
        )

    def _log_sync_success(
        self,
        *,
        cache: CacheBase | None,
    ) -> None:
        """Log the success of a sync operation."""
        print(f"Completed `{self.name}` read operation at {pendulum.now().format('HH:mm:ss')}.")
        send_telemetry(
            source=self,
            cache=cache,
            state=EventState.SUCCEEDED,
            number_of_records=self._processed_records,
            event_type=EventType.SYNC,
        )

    def _log_sync_failure(
        self,
        *,
        cache: CacheBase | None,
        exception: Exception,
    ) -> None:
        """Log the failure of a sync operation."""
        print(f"Failed `{self.name}` read operation at {pendulum.now().format('HH:mm:ss')}.")
        send_telemetry(
            state=EventState.FAILED,
            source=self,
            cache=cache,
            number_of_records=self._processed_records,
            exception=exception,
            event_type=EventType.SYNC,
        )

    def read(
        self,
        cache: CacheBase | None = None,
        *,
        streams: str | list[str] | None = None,
        write_strategy: str | WriteStrategy = WriteStrategy.AUTO,
        force_full_refresh: bool = False,
        skip_validation: bool = False,
    ) -> ReadResult:
        """Read from the connector and write to the cache.

        Args:
            cache: The cache to write to. If None, a default cache will be used.
            write_strategy: The strategy to use when writing to the cache. If a string, it must
                be one of "append", "upsert", "replace", or "auto". If a WriteStrategy, it must
                be one of WriteStrategy.APPEND, WriteStrategy.UPSERT, WriteStrategy.REPLACE, or
                WriteStrategy.AUTO.
            streams: Optional if already set. A list of stream names to select for reading. If
                set to "*", all streams will be selected.
            force_full_refresh: If True, the source will operate in full refresh mode. Otherwise,
                streams will be read in incremental mode if supported by the connector. This
                option must be True when using the "replace" strategy.
        """
        if write_strategy == WriteStrategy.REPLACE and not force_full_refresh:
            warnings.warn(
                message=(
                    "Using `REPLACE` strategy without also setting `force_full_refresh=True` "
                    "could result in data loss. "
                    "To silence this warning, use the following: "
                    'warnings.filterwarnings("ignore", '
                    "category=airbyte.warnings.PyAirbyteDataLossWarning)"
                ),
                category=PyAirbyteDataLossWarning,
                stacklevel=1,
            )
        if cache is None:
            cache = get_default_cache()

        if isinstance(write_strategy, str):
            try:
                write_strategy = WriteStrategy(write_strategy)
            except ValueError:
                raise exc.PyAirbyteInputError(
                    message="Invalid strategy",
                    context={
                        "write_strategy": write_strategy,
                        "available_strategies": [s.value for s in WriteStrategy],
                    },
                ) from None

        if streams:
            self.select_streams(streams)

        if not self._selected_stream_names:
            raise exc.PyAirbyteNoStreamsSelectedError(
                connector_name=self.name,
                available_streams=self.get_available_streams(),
            )

        cache.processor.register_source(
            source_name=self.name,
            incoming_source_catalog=self.configured_catalog,
            stream_names=set(self._selected_stream_names),
        )

        state = (
            cache._get_state(  # noqa: SLF001  # Private method until we have a public API for it.
                source_name=self.name,
                streams=self._selected_stream_names,
            )
            if not force_full_refresh
            else None
        )
        if not skip_validation:
            self.validate_config()

        self._log_sync_start(cache=cache)
        try:
            cache.processor.process_airbyte_messages(
                self._read_with_catalog(
                    catalog=self.configured_catalog,
                    state=state,
                ),
                write_strategy=write_strategy,
            )
        except Exception as ex:
            self._log_sync_failure(cache=cache, exception=ex)
            raise exc.AirbyteConnectorFailedError(
                log_text=self._last_log_messages,
            ) from ex

        self._log_sync_success(cache=cache)
        return ReadResult(
            processed_records=self._processed_records,
            cache=cache,
            processed_streams=[stream.stream.name for stream in self.configured_catalog.streams],
        )


__all__ = [
    "Source",
]
class Source
A class representing a source that can be called.
def __init__(
    self,
    executor: Executor,
    name: str,
    config: dict[str, Any] | None = None,
    streams: str | list[str] | None = None,
    *,
    validate: bool = False,
) -> None
Initialize the source.
If config is provided, it will be validated against the spec if validate is True.
def set_streams(self, streams: list[str]) -> None
Deprecated. See select_streams().
def select_all_streams(self) -> None
Select all streams.
This is a more streamlined equivalent to:
source.select_streams(source.get_available_streams()).
def select_streams(self, streams: str | list[str]) -> None
Select the stream names that should be read from the connector.
Args:
- streams: A list of stream names to select. If set to "*", all streams will be selected.
Currently, if this is not set, all streams will be read.
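A short sketch, continuing the example above (stream names are illustrative):

source.select_streams(["users", "purchases"])  # explicit subset
print(source.get_selected_streams())           # ['users', 'purchases']

source.select_streams("*")                     # shorthand for select_all_streams()

# An unknown stream name raises AirbyteStreamNotFoundError immediately,
# with the list of available streams attached for reference.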
def get_selected_streams(self) -> list[str]
Get the selected streams.
If no streams are selected, return an empty list.
def set_config(
    self,
    config: dict[str, Any],
    *,
    validate: bool = True,
) -> None
Set the config for the connector.
If validate is True, raise an exception if the config fails validation.
If validate is False, validation will be deferred until check() or validate_config() is called.
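A sketch of the two validation modes; the config key is an illustrative placeholder:

source.set_config({"api_key": "..."})                  # validates now; raises on a bad config
source.set_config({"api_key": "..."}, validate=False)  # defers validation to check()/validate_config()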
def get_config(self) -> dict[str, Any]
Get the config for the connector.
def validate_config(self, config: dict[str, Any] | None = None) -> None
Validate the config against the spec.
If config is not provided, the already-set config will be validated.
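On failure this raises AirbyteConnectorValidationFailedError, carrying the jsonschema error message, path, instance, and schema in its context. A sketch of the error-handling pattern:

from airbyte import exceptions as exc

try:
    source.validate_config()
except exc.AirbyteConnectorValidationFailedError as ex:
    # The exception context includes the failing field path and schema details.
    print(f"Config invalid: {ex}")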
def get_available_streams(self) -> list[str]
Get the available streams from the discovered catalog.
@property
def config_spec(self) -> dict[str, Any]
Generate a configuration spec for this connector, as a JSON Schema definition.
This returns a JSON Schema dictionary describing the configuration options for the current connector.
Returns:
dict: The JSON Schema configuration spec as a dictionary.
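Since the result is plain JSON Schema, standard dictionary access works. For example, to list a connector's config keys (a sketch):

spec = source.config_spec
print(spec.get("required", []))           # top-level required config keys
print(list(spec.get("properties", {})))   # all documented config keys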
def print_config_spec(
    self,
    format: Literal["yaml", "json"] = "yaml",
    *,
    output_file: Path | str | None = None,
) -> None
Print the configuration spec for this connector.
Args:
- format: The format to print the spec in. Must be "yaml" or "json".
- output_file: Optional. If set, the spec will be written to the given file path. Otherwise, it will be printed to the console.
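For example, to write the spec to a file instead of the console (the path is illustrative):

source.print_config_spec("json", output_file="/tmp/spec.json")
source.print_config_spec()  # defaults to syntax-highlighted YAML on the console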
@property
def docs_url(self) -> str
Get the URL to the connector's documentation.
@property
def discovered_catalog(self) -> AirbyteCatalog
Get the raw catalog for the given streams.
If the catalog is not yet known, we call discover to get it.
@property
def configured_catalog(self) -> ConfiguredAirbyteCatalog
Get the configured catalog for the given streams.
If the raw catalog is not yet known, we call discover to get it.
If no specific streams are selected, we return a catalog that syncs all available streams.
TODO: We should consider disabling by default the streams that the connector would disable by default. (For instance, streams that require a premium license are sometimes disabled by default within the connector.)
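The returned object is a standard ConfiguredAirbyteCatalog from the Airbyte protocol models, so it can be inspected directly (a sketch, continuing the example above):

catalog = source.configured_catalog
for configured_stream in catalog.streams:
    # Each entry pairs the discovered stream with its configured sync mode.
    print(configured_stream.stream.name, configured_stream.sync_mode)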
def get_stream_json_schema(self, stream_name: str) -> dict[str, Any]
Return the JSON Schema spec for the specified stream name.
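For example, to list the declared top-level fields of a stream (the stream name is illustrative):

schema = source.get_stream_json_schema("users")
print(list(schema["properties"]))  # declared column names for the stream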
def get_records(self, stream: str) -> LazyDataset
Read a stream from the connector.
This involves the following steps:
- Call discover to get the catalog
- Generate a configured catalog that syncs the given stream in full_refresh mode
- Write the configured catalog and the config to a temporary file
- execute the connector with read --config <config_file> --catalog <catalog_file>
- Listen to the messages and return the first AirbyteRecordMessages that come along.
- Make sure the subprocess is killed when the function returns.
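Because the result is a LazyDataset, records stream from the connector only as they are consumed. A minimal sketch (connector name, config, and field names are illustrative):

    import airbyte as ab

    source = ab.get_source("source-faker", config={"count": 100}, install_if_missing=True)

    # Iteration drives the underlying subprocess; nothing is read until consumed.
    for record in source.get_records("users"):
        print(record["id"])  # StreamRecord supports dict-style access; "id" is illustrative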
440 def get_documents( 441 self, 442 stream: str, 443 title_property: str | None = None, 444 content_properties: list[str] | None = None, 445 metadata_properties: list[str] | None = None, 446 *, 447 render_metadata: bool = False, 448 ) -> Iterable[Document]: 449 """Read a stream from the connector and return the records as documents. 450 451 If metadata_properties is not set, all properties that are not content will be added to 452 the metadata. 453 454 If render_metadata is True, metadata will be rendered in the document, as well as the 455 the main content. 456 """ 457 return self.get_records(stream).to_documents( 458 title_property=title_property, 459 content_properties=content_properties, 460 metadata_properties=metadata_properties, 461 render_metadata=render_metadata, 462 )
Read a stream from the connector and return the records as documents.
If metadata_properties is not set, all properties that are not content will be added to the metadata.
If render_metadata is True, metadata will be rendered in the document, as well as the main content.
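A sketch of reading a stream as documents, e.g. for feeding an LLM pipeline, reusing a `source` object like the one above; the stream and property names are hypothetical:

    docs = source.get_documents(
        stream="users",
        title_property="name",        # hypothetical title field
        content_properties=["bio"],   # hypothetical content field(s)
        render_metadata=True,         # include metadata in the rendered document
    )
    for doc in docs:
        print(doc)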
464     def check(self) -> None:
465         """Call check on the connector.
466 
467         This involves the following steps:
468         * Write the config to a temporary file
469         * execute the connector with check --config <config_file>
470         * Listen to the messages and raise an error if the connection check fails.
471         * Make sure the subprocess is killed when the function returns.
472         """
473         with as_temp_files([self._config]) as [config_file]:
474             try:
475                 for msg in self._execute(["check", "--config", config_file]):
476                     if msg.type == Type.CONNECTION_STATUS and msg.connectionStatus:
477                         if msg.connectionStatus.status != Status.FAILED:
478                             print(f"Connection check succeeded for `{self.name}`.")
479                             log_source_check_result(
480                                 name=self.name,
481                                 state=EventState.SUCCEEDED,
482                             )
483                             return
484 
485                         log_source_check_result(
486                             name=self.name,
487                             state=EventState.FAILED,
488                         )
489                         raise exc.AirbyteConnectorCheckFailedError(
490                             help_url=self.docs_url,
491                             context={
492                                 "failure_reason": msg.connectionStatus.message,
493                             },
494                         )
495                 raise exc.AirbyteConnectorCheckFailedError(log_text=self._last_log_messages)
496             except exc.AirbyteConnectorReadError as ex:
497                 raise exc.AirbyteConnectorCheckFailedError(
498                     message="The connector failed to check the connection.",
499                     log_text=ex.log_text,
500                 ) from ex
Call check on the connector.
This involves the following steps:
- Write the config to a temporary file
- execute the connector with check --config <config_file>
- Listen to the messages and raise an error if the connection check fails.
- Make sure the subprocess is killed when the function returns.
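Since check() raises on failure rather than returning a status, callers typically wrap it in a try/except. A minimal sketch, with an illustrative connector and config:

    import airbyte as ab
    from airbyte import exceptions as exc

    source = ab.get_source("source-faker", config={"count": 100}, install_if_missing=True)
    try:
        source.check()  # prints a success message when the connection is valid
    except exc.AirbyteConnectorCheckFailedError as err:
        print(f"Connection check failed: {err}")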
502 def install(self) -> None: 503 """Install the connector if it is not yet installed.""" 504 self.executor.install() 505 print("For configuration instructions, see: \n" f"{self.docs_url}#reference\n")
Install the connector if it is not yet installed.
507 def uninstall(self) -> None: 508 """Uninstall the connector if it is installed. 509 510 This only works if the use_local_install flag wasn't used and installation is managed by 511 PyAirbyte. 512 """ 513 self.executor.uninstall()
Uninstall the connector if it is installed.
This only works if the use_local_install flag wasn't used and installation is managed by PyAirbyte.
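Together, install() and uninstall() let callers manage the connector lifecycle explicitly. A sketch, assuming get_source() supports deferring installation via install_if_missing=False (connector name and config are illustrative):

    import airbyte as ab

    source = ab.get_source("source-faker", config={"count": 100}, install_if_missing=False)
    source.install()    # no-op if the connector is already installed
    source.check()
    source.uninstall()  # only removes installations managed by PyAirbyte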
648     def read(
649         self,
650         cache: CacheBase | None = None,
651         *,
652         streams: str | list[str] | None = None,
653         write_strategy: str | WriteStrategy = WriteStrategy.AUTO,
654         force_full_refresh: bool = False,
655         skip_validation: bool = False,
656     ) -> ReadResult:
657         """Read from the connector and write to the cache.
658 
659         Args:
660             cache: The cache to write to. If None, a default cache will be used.
661             write_strategy: The strategy to use when writing to the cache. If a string, it must be
662                 one of "append", "upsert", "replace", or "auto". If a WriteStrategy, it must be one
663                 of WriteStrategy.APPEND, WriteStrategy.UPSERT, WriteStrategy.REPLACE, or
664                 WriteStrategy.AUTO.
665             streams: Optional if already set. A list of stream names to select for reading. If set
666                 to "*", all streams will be selected.
667             force_full_refresh: If True, the source will operate in full refresh mode. Otherwise,
668                 streams will be read in incremental mode if supported by the connector. This option
669                 must be True when using the "replace" strategy.
670         """
671         if write_strategy == WriteStrategy.REPLACE and not force_full_refresh:
672             warnings.warn(
673                 message=(
674                     "Using `REPLACE` strategy without also setting `force_full_refresh=True` "
675                     "could result in data loss. "
676                     "To silence this warning, use the following: "
677                     'warnings.filterwarnings("ignore", '
678                     'category="airbyte.warnings.PyAirbyteDataLossWarning")'
679                 ),
680                 category=PyAirbyteDataLossWarning,
681                 stacklevel=1,
682             )
683         if cache is None:
684             cache = get_default_cache()
685 
686         if isinstance(write_strategy, str):
687             try:
688                 write_strategy = WriteStrategy(write_strategy)
689             except ValueError:
690                 raise exc.PyAirbyteInputError(
691                     message="Invalid strategy",
692                     context={
693                         "write_strategy": write_strategy,
694                         "available_strategies": [s.value for s in WriteStrategy],
695                     },
696                 ) from None
697 
698         if streams:
699             self.select_streams(streams)
700 
701         if not self._selected_stream_names:
702             raise exc.PyAirbyteNoStreamsSelectedError(
703                 connector_name=self.name,
704                 available_streams=self.get_available_streams(),
705             )
706 
707         cache.processor.register_source(
708             source_name=self.name,
709             incoming_source_catalog=self.configured_catalog,
710             stream_names=set(self._selected_stream_names),
711         )
712 
713         state = (
714             cache._get_state(  # noqa: SLF001 # Private method until we have a public API for it.
715                 source_name=self.name,
716                 streams=self._selected_stream_names,
717             )
718             if not force_full_refresh
719             else None
720         )
721         if not skip_validation:
722             self.validate_config()
723 
724         self._log_sync_start(cache=cache)
725         try:
726             cache.processor.process_airbyte_messages(
727                 self._read_with_catalog(
728                     catalog=self.configured_catalog,
729                     state=state,
730                 ),
731                 write_strategy=write_strategy,
732             )
733         except Exception as ex:
734             self._log_sync_failure(cache=cache, exception=ex)
735             raise exc.AirbyteConnectorFailedError(
736                 log_text=self._last_log_messages,
737             ) from ex
738 
739         self._log_sync_success(cache=cache)
740         return ReadResult(
741             processed_records=self._processed_records,
742             cache=cache,
743             processed_streams=[stream.stream.name for stream in self.configured_catalog.streams],
744         )
Read from the connector and write to the cache.
Arguments:
- cache: The cache to write to. If None, a default cache will be used.
- write_strategy: The strategy to use when writing to the cache. If a string, it must be one of "append", "upsert", "replace", or "auto". If a WriteStrategy, it must be one of WriteStrategy.APPEND, WriteStrategy.UPSERT, WriteStrategy.REPLACE, or WriteStrategy.AUTO.
- streams: Optional if already set. A list of stream names to select for reading. If set to "*", all streams will be selected.
- force_full_refresh: If True, the source will operate in full refresh mode. Otherwise, streams will be read in incremental mode if supported by the connector. This option must be True when using the "replace" strategy.
- skip_validation: If True, skip validating the config against the connector spec before reading.
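Putting it together, a typical read writes all selected streams into the default local cache. A minimal sketch (connector name, config, and stream name are illustrative):

    import airbyte as ab

    source = ab.get_source("source-faker", config={"count": 1000}, install_if_missing=True)
    source.select_streams("*")

    # With cache=None, records land in the default local DuckDB cache.
    result = source.read(write_strategy="auto")
    print(f"Processed {result.processed_records} records")
    print(f"'users' rows: {len(list(result['users']))}")  # hypothetical stream name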