airbyte.sources.base

  1# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
  2from __future__ import annotations
  3
  4import json
  5import warnings
  6from pathlib import Path
  7from typing import TYPE_CHECKING, Any, cast
  8
  9import jsonschema
 10import pendulum
 11import yaml
 12from rich import print
 13from rich.syntax import Syntax
 14from typing_extensions import Literal
 15
 16from airbyte_protocol.models import (
 17    AirbyteCatalog,
 18    AirbyteMessage,
 19    AirbyteStateMessage,
 20    ConfiguredAirbyteCatalog,
 21    ConfiguredAirbyteStream,
 22    ConnectorSpecification,
 23    DestinationSyncMode,
 24    Status,
 25    SyncMode,
 26    TraceType,
 27    Type,
 28)
 29
 30from airbyte import exceptions as exc
 31from airbyte._util.telemetry import (
 32    EventState,
 33    EventType,
 34    log_config_validation_result,
 35    log_source_check_result,
 36    send_telemetry,
 37)
 38from airbyte._util.temp_files import as_temp_files
 39from airbyte.caches.util import get_default_cache
 40from airbyte.datasets._lazy import LazyDataset
 41from airbyte.progress import progress
 42from airbyte.records import StreamRecord
 43from airbyte.results import ReadResult
 44from airbyte.strategies import WriteStrategy
 45from airbyte.warnings import PyAirbyteDataLossWarning
 46
 47
 48if TYPE_CHECKING:
 49    from collections.abc import Generator, Iterable, Iterator
 50
 51    from airbyte_protocol.models.airbyte_protocol import AirbyteStream
 52
 53    from airbyte._executor import Executor
 54    from airbyte.caches import CacheBase
 55    from airbyte.documents import Document
 56
 57
 58class Source:
 59    """A class representing a source that can be called."""
 60
 61    def __init__(
 62        self,
 63        executor: Executor,
 64        name: str,
 65        config: dict[str, Any] | None = None,
 66        streams: str | list[str] | None = None,
 67        *,
 68        validate: bool = False,
 69    ) -> None:
 70        """Initialize the source.
 71
 72        If config is provided, it will be validated against the spec if validate is True.
 73        """
 74        self.executor = executor
 75        self.name = name
 76        self._processed_records = 0
 77        self._config_dict: dict[str, Any] | None = None
 78        self._last_log_messages: list[str] = []
 79        self._discovered_catalog: AirbyteCatalog | None = None
 80        self._spec: ConnectorSpecification | None = None
 81        self._selected_stream_names: list[str] = []
 82        if config is not None:
 83            self.set_config(config, validate=validate)
 84        if streams is not None:
 85            self.select_streams(streams)
 86
 87        self._deployed_api_root: str | None = None
 88        self._deployed_workspace_id: str | None = None
 89        self._deployed_source_id: str | None = None
 90        self._deployed_connection_id: str | None = None
 91
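    # Usage sketch (hedged): a Source is normally obtained through the documented
    # `airbyte.get_source()` helper rather than by constructing this class with an
    # Executor directly. The connector name and config below are illustrative:
    #
    #     import airbyte as ab
    #
    #     source = ab.get_source(
    #         "source-faker",
    #         config={"count": 1000},
    #         install_if_missing=True,
    #     )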
 92    def set_streams(self, streams: list[str]) -> None:
 93        """Deprecated. See select_streams()."""
 94        warnings.warn(
 95            "The 'set_streams' method is deprecated and will be removed in a future version. "
 96            "Please use the 'select_streams' method instead.",
 97            DeprecationWarning,
 98            stacklevel=2,
 99        )
100        self.select_streams(streams)
101
102    def select_all_streams(self) -> None:
103        """Select all streams.
104
105        This is a more streamlined equivalent to:
106        > source.select_streams(source.get_available_streams()).
107        """
108        self._selected_stream_names = self.get_available_streams()
109
110    def select_streams(self, streams: str | list[str]) -> None:
111        """Select the stream names that should be read from the connector.
112
113        Args:
114        - streams: A stream name, a list of stream names, or "*" to select all streams.
115
116        Currently, if this is not set, all streams will be read.
117        """
118        if streams == "*":
119            self.select_all_streams()
120            return
121
122        if isinstance(streams, str):
123            # If a single stream is provided, convert it to a one-item list
124            streams = [streams]
125
126        available_streams = self.get_available_streams()
127        for stream in streams:
128            if stream not in available_streams:
129                raise exc.AirbyteStreamNotFoundError(
130                    stream_name=stream,
131                    connector_name=self.name,
132                    available_streams=available_streams,
133                )
134        self._selected_stream_names = streams
135
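    # Example (sketch): stream selection accepts a single name, a list, or "*".
    # The stream names here are illustrative and depend on the connector's catalog:
    #
    #     source.select_streams("*")                     # everything
    #     source.select_streams("users")                 # a single stream
    #     source.select_streams(["users", "purchases"])  # a subset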
136    def get_selected_streams(self) -> list[str]:
137        """Get the selected streams.
138
139        If no streams are selected, return an empty list.
140        """
141        return self._selected_stream_names
142
143    def set_config(
144        self,
145        config: dict[str, Any],
146        *,
147        validate: bool = True,
148    ) -> None:
149        """Set the config for the connector.
150
151        If validate is True, raise an exception if the config fails validation.
152
153        If validate is False, validation will be deferred until check() or validate_config()
154        is called.
155        """
156        if validate:
157            self.validate_config(config)
158
159        self._config_dict = config
160
161    def get_config(self) -> dict[str, Any]:
162        """Get the config for the connector."""
163        return self._config
164
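    # Example (sketch): the config can be swapped after creation; passing
    # validate=False defers schema checks until check() or validate_config():
    #
    #     source.set_config({"count": 500}, validate=False)
    #     assert source.get_config()["count"] == 500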
165    @property
166    def _config(self) -> dict[str, Any]:
167        if self._config_dict is None:
168            raise exc.AirbyteConnectorConfigurationMissingError(
169                guidance="Provide via get_source() or set_config()"
170            )
171        return self._config_dict
172
173    def _discover(self) -> AirbyteCatalog:
174        """Call discover on the connector.
175
176        This involves the following steps:
177        * Write the config to a temporary file
178        * execute the connector with discover --config <config_file>
179        * Listen to the messages and return the first AirbyteCatalog that comes along.
180        * Make sure the subprocess is killed when the function returns.
181        """
182        with as_temp_files([self._config]) as [config_file]:
183            for msg in self._execute(["discover", "--config", config_file]):
184                if msg.type == Type.CATALOG and msg.catalog:
185                    return msg.catalog
186            raise exc.AirbyteConnectorMissingCatalogError(
187                log_text=self._last_log_messages,
188            )
189
190    def validate_config(self, config: dict[str, Any] | None = None) -> None:
191        """Validate the config against the spec.
192
193        If config is not provided, the already-set config will be validated.
194        """
195        spec = self._get_spec(force_refresh=False)
196        config = self._config if config is None else config
197        try:
198            jsonschema.validate(config, spec.connectionSpecification)
199            log_config_validation_result(
200                name=self.name,
201                state=EventState.SUCCEEDED,
202            )
203        except jsonschema.ValidationError as ex:
204            validation_ex = exc.AirbyteConnectorValidationFailedError(
205                message="The provided config is not valid.",
206                context={
207                    "error_message": ex.message,
208                    "error_path": ex.path,
209                    "error_instance": ex.instance,
210                    "error_schema": ex.schema,
211                },
212            )
213            log_config_validation_result(
214                name=self.name,
215                state=EventState.FAILED,
216                exception=validation_ex,
217            )
218            raise validation_ex from ex
219
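    # Example (sketch): catching a failed validation. The raised exception is
    # constructed with the jsonschema error details in its context, as shown above:
    #
    #     from airbyte import exceptions as exc
    #
    #     try:
    #         source.validate_config()
    #     except exc.AirbyteConnectorValidationFailedError as err:
    #         print(err.context["error_message"])
    #         raise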
220    def get_available_streams(self) -> list[str]:
221        """Get the available streams from the discovered catalog."""
222        return [s.name for s in self.discovered_catalog.streams]
223
224    def _get_spec(self, *, force_refresh: bool = False) -> ConnectorSpecification:
225        """Call spec on the connector.
226
227        This involves the following steps:
228        * execute the connector with spec
229        * Listen to the messages and return the first ConnectorSpecification that comes along.
230        * Make sure the subprocess is killed when the function returns.
231        """
232        if force_refresh or self._spec is None:
233            for msg in self._execute(["spec"]):
234                if msg.type == Type.SPEC and msg.spec:
235                    self._spec = msg.spec
236                    break
237
238        if self._spec:
239            return self._spec
240
241        raise exc.AirbyteConnectorMissingSpecError(
242            log_text=self._last_log_messages,
243        )
244
245    @property
246    def config_spec(self) -> dict[str, Any]:
247        """Generate a configuration spec for this connector, as a JSON Schema definition.
248
249        This function returns the connector's configuration spec as a JSON Schema
250        dictionary.
251
252        Returns:
253            dict: The JSON Schema configuration spec as a dictionary.
254        """
255        return self._get_spec(force_refresh=True).connectionSpecification
256
257    def print_config_spec(
258        self,
259        format: Literal["yaml", "json"] = "yaml",  # noqa: A002
260        *,
261        output_file: Path | str | None = None,
262    ) -> None:
263        """Print the configuration spec for this connector.
264
265        Args:
266        - format: The format to print the spec in. Must be "yaml" or "json".
267        - output_file: Optional. If set, the spec will be written to the given file path. Otherwise,
268          it will be printed to the console.
269        """
270        if format not in ["yaml", "json"]:
271            raise exc.PyAirbyteInputError(
272                message="Invalid format. Expected 'yaml' or 'json'",
273                input_value=format,
274            )
275        if isinstance(output_file, str):
276            output_file = Path(output_file)
277
278        if format == "yaml":
279            content = yaml.dump(self.config_spec, indent=2)
280        elif format == "json":
281            content = json.dumps(self.config_spec, indent=2)
282
283        if output_file:
284            output_file.write_text(content)
285            return
286
287        syntax_highlighted = Syntax(content, format)
288        print(syntax_highlighted)
289
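    # Example (sketch): print the spec as JSON, or write YAML to a file path of
    # your choosing (the path below is illustrative):
    #
    #     source.print_config_spec(format="json")
    #     source.print_config_spec(output_file="/tmp/source_spec.yaml")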
290    @property
291    def _yaml_spec(self) -> str:
292        """Get the spec as a yaml string.
293
294        For now, the primary use case is for writing and debugging a valid config for a source.
295
296        This is private for now because we probably want better polish before exposing this
297        as a stable interface. This will also get easier when we have docs links with this info
298        for each connector.
299        """
300        spec_obj: ConnectorSpecification = self._get_spec()
301        spec_dict = spec_obj.dict(exclude_unset=True)
302        # convert to a yaml string
303        return yaml.dump(spec_dict)
304
305    @property
306    def docs_url(self) -> str:
307        """Get the URL to the connector's documentation."""
308        # TODO: Replace with docs URL from metadata when available
309        return "https://docs.airbyte.com/integrations/sources/" + self.name.lower().replace(
310            "source-", ""
311        )
312
313    @property
314    def discovered_catalog(self) -> AirbyteCatalog:
315        """Get the raw catalog, as reported by the connector's discover operation.
316
317        If the catalog is not yet known, we call discover to get it.
318        """
319        if self._discovered_catalog is None:
320            self._discovered_catalog = self._discover()
321
322        return self._discovered_catalog
323
324    @property
325    def configured_catalog(self) -> ConfiguredAirbyteCatalog:
326        """Get the configured catalog for the given streams.
327
328        If the raw catalog is not yet known, we call discover to get it.
329
330        If no specific streams are selected, we return a catalog that syncs all available streams.
331
332        TODO: We should consider disabling by default the streams that the connector would
333        disable by default. (For instance, streams that require a premium license are sometimes
334        disabled by default within the connector.)
335        """
336        # Ensure discovered catalog is cached before we start
337        _ = self.discovered_catalog
338
339        # Filter for selected streams if set, otherwise use all available streams:
340        streams_filter: list[str] = self._selected_stream_names or self.get_available_streams()
341
342        return ConfiguredAirbyteCatalog(
343            streams=[
344                ConfiguredAirbyteStream(
345                    stream=stream,
346                    destination_sync_mode=DestinationSyncMode.overwrite,
347                    primary_key=stream.source_defined_primary_key,
348                    # TODO: The below assumes all sources can coalesce from incremental sync to
349                    # full_table as needed. CDK supports this, so it might be safe:
350                    sync_mode=SyncMode.incremental,
351                )
352                for stream in self.discovered_catalog.streams
353                if stream.name in streams_filter
354            ],
355        )
356
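    # Example (sketch): the configured catalog mirrors the current stream
    # selection, so narrowing the selection shrinks the catalog accordingly:
    #
    #     source.select_streams(["users"])
    #     names = [s.stream.name for s in source.configured_catalog.streams]
    #     assert names == ["users"]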
357    def get_stream_json_schema(self, stream_name: str) -> dict[str, Any]:
358        """Return the JSON Schema spec for the specified stream name."""
359        catalog: AirbyteCatalog = self.discovered_catalog
360        found: list[AirbyteStream] = [
361            stream for stream in catalog.streams if stream.name == stream_name
362        ]
363
364        if len(found) == 0:
365            raise exc.PyAirbyteInputError(
366                message="Stream name does not exist in catalog.",
367                input_value=stream_name,
368            )
369
370        if len(found) > 1:
371            raise exc.PyAirbyteInternalError(
372                message="Duplicate streams found with the same name.",
373                context={
374                    "found_streams": found,
375                },
376            )
377
378        return found[0].json_schema
379
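    # Example (sketch): inspect the declared properties of a stream; the stream
    # name is illustrative:
    #
    #     schema = source.get_stream_json_schema("users")
    #     print(sorted(schema["properties"]))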
380    def get_records(self, stream: str) -> LazyDataset:
381        """Read a stream from the connector.
382
383        This involves the following steps:
384        * Call discover to get the catalog
385        * Generate a configured catalog that syncs the given stream in full_refresh mode
386        * Write the configured catalog and the config to a temporary file
387        * execute the connector with read --config <config_file> --catalog <catalog_file>
388        * Listen to the messages and yield the AirbyteRecordMessages as they arrive.
389        * Make sure the subprocess is killed when the function returns.
390        """
391        discovered_catalog: AirbyteCatalog = self.discovered_catalog
392        configured_catalog = ConfiguredAirbyteCatalog(
393            streams=[
394                ConfiguredAirbyteStream(
395                    stream=s,
396                    sync_mode=SyncMode.full_refresh,
397                    destination_sync_mode=DestinationSyncMode.overwrite,
398                )
399                for s in discovered_catalog.streams
400                if s.name == stream
401            ],
402        )
403        if len(configured_catalog.streams) == 0:
404            raise exc.PyAirbyteInputError(
405                message="Requested stream does not exist.",
406                context={
407                    "stream": stream,
408                    "available_streams": self.get_available_streams(),
409                    "connector_name": self.name,
410                },
411            ) from KeyError(stream)
412
413        configured_stream = configured_catalog.streams[0]
414        all_properties = cast(
415            list[str], list(configured_stream.stream.json_schema["properties"].keys())
416        )
417
418        def _with_logging(records: Iterable[dict[str, Any]]) -> Iterator[dict[str, Any]]:
419            self._log_sync_start(cache=None)
420            yield from records
421            self._log_sync_success(cache=None)
422
423        iterator: Iterator[dict[str, Any]] = _with_logging(
424            records=(  # Generator expression yields a StreamRecord for each record message
425                StreamRecord.from_record_message(
426                    record_message=record.record,
427                    expected_keys=all_properties,
428                    prune_extra_fields=True,
429                )
430                for record in self._read_with_catalog(configured_catalog)
431                if record.record
432            )
433        )
434        return LazyDataset(
435            iterator,
436            stream_metadata=configured_stream,
437        )
438
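    # Example (sketch): the returned LazyDataset is an iterator, so records are
    # pulled from the connector only as they are consumed. The field name "id"
    # is illustrative:
    #
    #     for record in source.get_records("users"):
    #         print(record["id"])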
439    def get_documents(
440        self,
441        stream: str,
442        title_property: str | None = None,
443        content_properties: list[str] | None = None,
444        metadata_properties: list[str] | None = None,
445        *,
446        render_metadata: bool = False,
447    ) -> Iterable[Document]:
448        """Read a stream from the connector and return the records as documents.
449
450        If metadata_properties is not set, all properties that are not content will be added to
451        the metadata.
452
453        If render_metadata is True, metadata will be rendered in the document, as well as
454        the main content.
455        """
456        return self.get_records(stream).to_documents(
457            title_property=title_property,
458            content_properties=content_properties,
459            metadata_properties=metadata_properties,
460            render_metadata=render_metadata,
461        )
462
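    # Example (sketch, e.g. for LLM workflows): the property names below are
    # illustrative and depend on the stream's JSON schema:
    #
    #     docs = source.get_documents(
    #         "users",
    #         title_property="name",
    #         metadata_properties=["id", "created_at"],
    #         render_metadata=True,
    #     )
    #     for doc in docs:
    #         print(doc)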
463    def check(self) -> None:
464        """Call check on the connector.
465
466        This involves the following steps:
467        * Write the config to a temporary file
468        * execute the connector with check --config <config_file>
469        * Listen to the messages and return the first connection status that comes along.
470        * Make sure the subprocess is killed when the function returns.
471        """
472        with as_temp_files([self._config]) as [config_file]:
473            try:
474                for msg in self._execute(["check", "--config", config_file]):
475                    if msg.type == Type.CONNECTION_STATUS and msg.connectionStatus:
476                        if msg.connectionStatus.status != Status.FAILED:
477                            print(f"Connection check succeeded for `{self.name}`.")
478                            log_source_check_result(
479                                name=self.name,
480                                state=EventState.SUCCEEDED,
481                            )
482                            return
483
484                        log_source_check_result(
485                            name=self.name,
486                            state=EventState.FAILED,
487                        )
488                        raise exc.AirbyteConnectorCheckFailedError(
489                            help_url=self.docs_url,
490                            context={
491                                "failure_reason": msg.connectionStatus.message,
492                            },
493                        )
494                raise exc.AirbyteConnectorCheckFailedError(log_text=self._last_log_messages)
495            except exc.AirbyteConnectorReadError as ex:
496                raise exc.AirbyteConnectorCheckFailedError(
497                    message="The connector failed to check the connection.",
498                    log_text=ex.log_text,
499                ) from ex
500
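    # Example (sketch): check() prints a success message or raises, so a bare
    # call doubles as an assertion that the config and credentials work:
    #
    #     source.check()  # raises AirbyteConnectorCheckFailedError on failure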
501    def install(self) -> None:
502        """Install the connector if it is not yet installed."""
503        self.executor.install()
504        print("For configuration instructions, see: \n" f"{self.docs_url}#reference\n")
505
506    def uninstall(self) -> None:
507        """Uninstall the connector if it is installed.
508
509        This only works if the use_local_install flag wasn't used and installation is managed by
510        PyAirbyte.
511        """
512        self.executor.uninstall()
513
514    def _read_with_catalog(
515        self,
516        catalog: ConfiguredAirbyteCatalog,
517        state: list[AirbyteStateMessage] | None = None,
518    ) -> Iterator[AirbyteMessage]:
519        """Call read on the connector.
520
521        This involves the following steps:
522        * Write the config to a temporary file
523        * execute the connector with read --config <config_file> --catalog <catalog_file>
524        * Listen to the messages and return the AirbyteRecordMessages that come along.
525        * Send out telemetry on the performed sync (with information about which source was used and
526          the type of the cache)
527        """
528        self._processed_records = 0  # Reset the counter before we start
529        with as_temp_files(
530            [
531                self._config,
532                catalog.json(),
533                "[" + ", ".join(msg.json() for msg in state) + "]" if state else "[]",  # serialize state message models
534            ]
535        ) as [
536            config_file,
537            catalog_file,
538            state_file,
539        ]:
540            yield from self._tally_records(
541                self._execute(
542                    [
543                        "read",
544                        "--config",
545                        config_file,
546                        "--catalog",
547                        catalog_file,
548                        "--state",
549                        state_file,
550                    ],
551                )
552            )
553
554    def _add_to_logs(self, message: str) -> None:
555        self._last_log_messages.append(message)
556        self._last_log_messages = self._last_log_messages[-10:]
557
558    def _execute(self, args: list[str]) -> Iterator[AirbyteMessage]:
559        """Execute the connector with the given arguments.
560
561        This involves the following steps:
562        * Locate the right venv. It is called ".venv-<connector_name>"
563        * Spawn a subprocess with .venv-<connector_name>/bin/<connector-name> <args>
564        * Read the subprocess output line by line and deserialize each line into an
565          AirbyteMessage object. Lines that fail to parse are kept as raw log text.
566        """
567        # Fail early if the connector is not installed.
568        self.executor.ensure_installation(auto_fix=False)
569
570        try:
571            self._last_log_messages = []
572            for line in self.executor.execute(args):
573                try:
574                    message = AirbyteMessage.parse_raw(line)
575                    if message.type is Type.RECORD:
576                        self._processed_records += 1
577                    if message.type == Type.LOG:
578                        self._add_to_logs(message.log.message)
579                    if message.type == Type.TRACE and message.trace.type == TraceType.ERROR:
580                        self._add_to_logs(message.trace.error.message)
581                    yield message
582                except Exception:
583                    self._add_to_logs(line)
584        except Exception as e:
585            raise exc.AirbyteConnectorReadError(
586                log_text=self._last_log_messages,
587            ) from e
588
589    def _tally_records(
590        self,
591        messages: Iterable[AirbyteMessage],
592    ) -> Generator[AirbyteMessage, Any, None]:
593        """Tally the number of records processed and pass the messages through."""
594        self._processed_records = 0  # Reset the counter before we start
595        progress.reset(len(self._selected_stream_names or []))
596
597        for message in messages:
598            yield message
599            progress.log_records_read(new_total_count=self._processed_records)
600
601    def _log_sync_start(
602        self,
603        *,
604        cache: CacheBase | None,
605    ) -> None:
606        """Log the start of a sync operation."""
607        print(f"Started `{self.name}` read operation at {pendulum.now().format('HH:mm:ss')}...")
608        send_telemetry(
609            source=self,
610            cache=cache,
611            state=EventState.STARTED,
612            event_type=EventType.SYNC,
613        )
614
615    def _log_sync_success(
616        self,
617        *,
618        cache: CacheBase | None,
619    ) -> None:
620        """Log the success of a sync operation."""
621        print(f"Completed `{self.name}` read operation at {pendulum.now().format('HH:mm:ss')}.")
622        send_telemetry(
623            source=self,
624            cache=cache,
625            state=EventState.SUCCEEDED,
626            number_of_records=self._processed_records,
627            event_type=EventType.SYNC,
628        )
629
630    def _log_sync_failure(
631        self,
632        *,
633        cache: CacheBase | None,
634        exception: Exception,
635    ) -> None:
636        """Log the failure of a sync operation."""
637        print(f"Failed `{self.name}` read operation at {pendulum.now().format('HH:mm:ss')}.")
638        send_telemetry(
639            state=EventState.FAILED,
640            source=self,
641            cache=cache,
642            number_of_records=self._processed_records,
643            exception=exception,
644            event_type=EventType.SYNC,
645        )
646
647    def read(
648        self,
649        cache: CacheBase | None = None,
650        *,
651        streams: str | list[str] | None = None,
652        write_strategy: str | WriteStrategy = WriteStrategy.AUTO,
653        force_full_refresh: bool = False,
654        skip_validation: bool = False,
655    ) -> ReadResult:
656        """Read from the connector and write to the cache.
657
658        Args:
659            cache: The cache to write to. If None, a default cache will be used.
660            write_strategy: The strategy to use when writing to the cache. If a string, it must be
661                one of "append", "upsert", "replace", or "auto". If a WriteStrategy, it must be one
662                of WriteStrategy.APPEND, WriteStrategy.UPSERT, WriteStrategy.REPLACE, or
663                WriteStrategy.AUTO.
664            streams: Optional if already set. A list of stream names to select for reading. If set
665                to "*", all streams will be selected.
666            force_full_refresh: If True, the source will operate in full refresh mode. Otherwise,
667                streams will be read in incremental mode if supported by the connector. This option
668                must be True when using the "replace" strategy.
            skip_validation: If True, skip validating the config against the connector's
                spec before reading.
669        """
670        if write_strategy == WriteStrategy.REPLACE and not force_full_refresh:
671            warnings.warn(
672                message=(
673                    "Using `REPLACE` strategy without also setting `force_full_refresh=True` "
674                    "could result in data loss. "
675                    "To silence this warning, use the following: "
676                    "warnings.filterwarnings('ignore', "
677                    "category=PyAirbyteDataLossWarning)"
678                ),
679                category=PyAirbyteDataLossWarning,
680                stacklevel=1,
681            )
682        if cache is None:
683            cache = get_default_cache()
684
685        if isinstance(write_strategy, str):
686            try:
687                write_strategy = WriteStrategy(write_strategy)
688            except ValueError:
689                raise exc.PyAirbyteInputError(
690                    message="Invalid strategy",
691                    context={
692                        "write_strategy": write_strategy,
693                        "available_strategies": [s.value for s in WriteStrategy],
694                    },
695                ) from None
696
697        if streams:
698            self.select_streams(streams)
699
700        if not self._selected_stream_names:
701            raise exc.PyAirbyteNoStreamsSelectedError(
702                connector_name=self.name,
703                available_streams=self.get_available_streams(),
704            )
705
706        cache.processor.register_source(
707            source_name=self.name,
708            incoming_source_catalog=self.configured_catalog,
709            stream_names=set(self._selected_stream_names),
710        )
711
712        state = (
713            cache._get_state(  # noqa: SLF001  # Private method until we have a public API for it.
714                source_name=self.name,
715                streams=self._selected_stream_names,
716            )
717            if not force_full_refresh
718            else None
719        )
720        if not skip_validation:
721            self.validate_config()
722
723        self._log_sync_start(cache=cache)
724        try:
725            cache.processor.process_airbyte_messages(
726                self._read_with_catalog(
727                    catalog=self.configured_catalog,
728                    state=state,
729                ),
730                write_strategy=write_strategy,
731            )
732        except Exception as ex:
733            self._log_sync_failure(cache=cache, exception=ex)
734            raise exc.AirbyteConnectorFailedError(
735                log_text=self._last_log_messages,
736            ) from ex
737
738        self._log_sync_success(cache=cache)
739        return ReadResult(
740            processed_records=self._processed_records,
741            cache=cache,
742            processed_streams=[stream.stream.name for stream in self.configured_catalog.streams],
743        )
744
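# End-to-end sketch (hedged): read into the default local cache and pull one
# stream as a pandas DataFrame. The stream name and the `to_pandas()` accessor
# follow PyAirbyte's documented ReadResult/dataset interface:
#
#     result = source.read()  # default cache, incremental where supported
#     df = result["users"].to_pandas()
#
#     # Force a full refresh with an explicit replace strategy:
#     result = source.read(write_strategy="replace", force_full_refresh=True)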
745
746__all__ = [
747    "Source",
748]
 97            "Please use the 'select_streams' method instead.",
 98            DeprecationWarning,
 99            stacklevel=2,
100        )
101        self.select_streams(streams)

Deprecated. See select_streams().

def select_all_streams(self) -> None:
103    def select_all_streams(self) -> None:
104        """Select all streams.
105
106        This is a more streamlined equivalent to:
107        > source.select_streams(source.get_available_streams()).
108        """
109        self._selected_stream_names = self.get_available_streams()

Select all streams.

This is a more streamlined equivalent to:

source.select_streams(source.get_available_streams()).

def select_streams(self, streams: str | list[str]) -> None:
111    def select_streams(self, streams: str | list[str]) -> None:
112        """Select the stream names that should be read from the connector.
113
114        Args:
115        - streams: A list of stream names to select. If set to "*", all streams will be selected.
116
117        Currently, if this is not set, all streams will be read.
118        """
119        if streams == "*":
120            self.select_all_streams()
121            return
122
123        if isinstance(streams, str):
124            # If a single stream is provided, convert it to a one-item list
125            streams = [streams]
126
127        available_streams = self.get_available_streams()
128        for stream in streams:
129            if stream not in available_streams:
130                raise exc.AirbyteStreamNotFoundError(
131                    stream_name=stream,
132                    connector_name=self.name,
133                    available_streams=available_streams,
134                )
135        self._selected_stream_names = streams

Select the stream names that should be read from the connector.

Args:

  • streams: A list of stream names to select. If set to "*", all streams will be selected.

Currently, if this is not set, all streams will be read.
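
A minimal usage sketch, assuming a source obtained via airbyte.get_source (the connector name "source-faker" and the stream names are illustrative):

    import airbyte as ab

    source = ab.get_source("source-faker", config={"count": 100})
    # Select a subset of the available streams by name:
    source.select_streams(["users", "products"])
    # Or pass "*" to select every available stream:
    source.select_streams("*")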

def get_selected_streams(self) -> list[str]:
137    def get_selected_streams(self) -> list[str]:
138        """Get the selected streams.
139
140        If no streams are selected, return an empty list.
141        """
142        return self._selected_stream_names

Get the selected streams.

If no streams are selected, return an empty list.

def set_config(self, config: dict[str, typing.Any], *, validate: bool = True) -> None:
144    def set_config(
145        self,
146        config: dict[str, Any],
147        *,
148        validate: bool = True,
149    ) -> None:
150        """Set the config for the connector.
151
152        If validate is True, raise an exception if the config fails validation.
153
154        If validate is False, validation will be deferred until check() or validate_config()
155        is called.
156        """
157        if validate:
158            self.validate_config(config)
159
160        self._config_dict = config

Set the config for the connector.

If validate is True, raise an exception if the config fails validation.

If validate is False, validation will be deferred until check() or validate_config() is called.

def get_config(self) -> dict[str, typing.Any]:
162    def get_config(self) -> dict[str, Any]:
163        """Get the config for the connector."""
164        return self._config

Get the config for the connector.

def validate_config(self, config: dict[str, typing.Any] | None = None) -> None:
191    def validate_config(self, config: dict[str, Any] | None = None) -> None:
192        """Validate the config against the spec.
193
194        If config is not provided, the already-set config will be validated.
195        """
196        spec = self._get_spec(force_refresh=False)
197        config = self._config if config is None else config
198        try:
199            jsonschema.validate(config, spec.connectionSpecification)
200            log_config_validation_result(
201                name=self.name,
202                state=EventState.SUCCEEDED,
203            )
204        except jsonschema.ValidationError as ex:
205            validation_ex = exc.AirbyteConnectorValidationFailedError(
206                message="The provided config is not valid.",
207                context={
208                    "error_message": ex.message,
209                    "error_path": ex.path,
210                    "error_instance": ex.instance,
211                    "error_schema": ex.schema,
212                },
213            )
214            log_config_validation_result(
215                name=self.name,
216                state=EventState.FAILED,
217                exception=validation_ex,
218            )
219            raise validation_ex from ex

Validate the config against the spec.

If config is not provided, the already-set config will be validated.
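
A short sketch of deferred validation via set_config and validate_config (the config key shown is illustrative):

    # Defer validation when setting the config:
    source.set_config({"api_key": "..."}, validate=False)

    # Validate explicitly later; this raises
    # AirbyteConnectorValidationFailedError if the config does not
    # conform to the connector's JSON Schema spec:
    source.validate_config()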

def get_available_streams(self) -> list[str]:
221    def get_available_streams(self) -> list[str]:
222        """Get the available streams from the spec."""
223        return [s.name for s in self.discovered_catalog.streams]

Get the available streams from the spec.

config_spec: dict[str, typing.Any]
246    @property
247    def config_spec(self) -> dict[str, Any]:
248        """Generate a configuration spec for this connector, as a JSON Schema definition.
249
250        This function generates the JSON Schema configuration spec for the
251        current connector, as a dictionary.
252
253        Returns:
254            dict: The JSON Schema configuration spec as a dictionary.
255        """
256        return self._get_spec(force_refresh=True).connectionSpecification

Generate a configuration spec for this connector, as a JSON Schema definition.

This function generates the JSON Schema configuration spec for the current connector, as a dictionary.

Returns:

dict: The JSON Schema configuration spec as a dictionary.

def print_config_spec( self, format: Literal['yaml', 'json'] = 'yaml', *, output_file: pathlib.Path | str | None = None) -> None:
258    def print_config_spec(
259        self,
260        format: Literal["yaml", "json"] = "yaml",  # noqa: A002
261        *,
262        output_file: Path | str | None = None,
263    ) -> None:
264        """Print the configuration spec for this connector.
265
266        Args:
267        - format: The format to print the spec in. Must be "yaml" or "json".
268        - output_file: Optional. If set, the spec will be written to the given file path. Otherwise,
269          it will be printed to the console.
270        """
271        if format not in ["yaml", "json"]:
272            raise exc.PyAirbyteInputError(
273                message="Invalid format. Expected 'yaml' or 'json'",
274                input_value=format,
275            )
276        if isinstance(output_file, str):
277            output_file = Path(output_file)
278
279        if format == "yaml":
280            content = yaml.dump(self.config_spec, indent=2)
281        elif format == "json":
282            content = json.dumps(self.config_spec, indent=2)
283
284        if output_file:
285            output_file.write_text(content)
286            return
287
288        syntax_highlighted = Syntax(content, format)
289        print(syntax_highlighted)

Print the configuration spec for this connector.

Args:

  • format: The format to print the spec in. Must be "yaml" or "json".
  • output_file: Optional. If set, the spec will be written to the given file path. Otherwise, it will be printed to the console.
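
For example, to inspect the spec in the console or write it to a file (the file path is illustrative):

    # Pretty-print the spec as YAML (the default format):
    source.print_config_spec()

    # Or write it as JSON to a file instead of printing:
    source.print_config_spec(format="json", output_file="spec.json")
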
docs_url: str
306    @property
307    def docs_url(self) -> str:
308        """Get the URL to the connector's documentation."""
309        # TODO: Replace with docs URL from metadata when available
310        return "https://docs.airbyte.com/integrations/sources/" + self.name.lower().replace(
311            "source-", ""
312        )

Get the URL to the connector's documentation.

discovered_catalog: airbyte_protocol.models.airbyte_protocol.AirbyteCatalog
314    @property
315    def discovered_catalog(self) -> AirbyteCatalog:
316        """Get the raw catalog for the given streams.
317
318        If the catalog is not yet known, we call discover to get it.
319        """
320        if self._discovered_catalog is None:
321            self._discovered_catalog = self._discover()
322
323        return self._discovered_catalog

Get the raw catalog for the given streams.

If the catalog is not yet known, we call discover to get it.

configured_catalog: airbyte_protocol.models.airbyte_protocol.ConfiguredAirbyteCatalog
325    @property
326    def configured_catalog(self) -> ConfiguredAirbyteCatalog:
327        """Get the configured catalog for the given streams.
328
329        If the raw catalog is not yet known, we call discover to get it.
330
331        If no specific streams are selected, we return a catalog that syncs all available streams.
332
333        TODO: We should consider disabling by default the streams that the connector would
334        disable by default. (For instance, streams that require a premium license are sometimes
335        disabled by default within the connector.)
336        """
337        # Ensure discovered catalog is cached before we start
338        _ = self.discovered_catalog
339
340        # Filter for selected streams if set, otherwise use all available streams:
341        streams_filter: list[str] = self._selected_stream_names or self.get_available_streams()
342
343        return ConfiguredAirbyteCatalog(
344            streams=[
345                ConfiguredAirbyteStream(
346                    stream=stream,
347                    destination_sync_mode=DestinationSyncMode.overwrite,
348                    primary_key=stream.source_defined_primary_key,
349                    # TODO: The below assumes all sources can coalesce from incremental sync to
350                    # full_table as needed. CDK supports this, so it might be safe:
351                    sync_mode=SyncMode.incremental,
352                )
353                for stream in self.discovered_catalog.streams
354                if stream.name in streams_filter
355            ],
356        )

Get the configured catalog for the given streams.

If the raw catalog is not yet known, we call discover to get it.

If no specific streams are selected, we return a catalog that syncs all available streams.

TODO: We should consider disabling by default the streams that the connector would disable by default. (For instance, streams that require a premium license are sometimes disabled by default within the connector.)
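
A small sketch of inspecting the resulting catalog, assuming streams have already been selected:

    catalog = source.configured_catalog
    for configured_stream in catalog.streams:
        # Each entry pairs a discovered stream with its sync settings:
        print(configured_stream.stream.name, configured_stream.sync_mode)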

def get_stream_json_schema(self, stream_name: str) -> dict[str, typing.Any]:
358    def get_stream_json_schema(self, stream_name: str) -> dict[str, Any]:
359        """Return the JSON Schema spec for the specified stream name."""
360        catalog: AirbyteCatalog = self.discovered_catalog
361        found: list[AirbyteStream] = [
362            stream for stream in catalog.streams if stream.name == stream_name
363        ]
364
365        if len(found) == 0:
366            raise exc.PyAirbyteInputError(
367                message="Stream name does not exist in catalog.",
368                input_value=stream_name,
369            )
370
371        if len(found) > 1:
372            raise exc.PyAirbyteInternalError(
373                message="Duplicate streams found with the same name.",
374                context={
375                    "found_streams": found,
376                },
377            )
378
379        return found[0].json_schema

Return the JSON Schema spec for the specified stream name.
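
For example, to list a stream's field names from its JSON Schema (the stream name is illustrative; top-level fields typically live under "properties"):

    schema = source.get_stream_json_schema("users")
    print(list(schema["properties"].keys()))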

def get_records(self, stream: str) -> airbyte.datasets._lazy.LazyDataset:
381    def get_records(self, stream: str) -> LazyDataset:
382        """Read a stream from the connector.
383
384        This involves the following steps:
385        * Call discover to get the catalog
386        * Generate a configured catalog that syncs the given stream in full_refresh mode
387        * Write the configured catalog and the config to a temporary file
388        * execute the connector with read --config <config_file> --catalog <catalog_file>
389        * Listen to the messages and return the first AirbyteRecordMessages that come along.
390        * Make sure the subprocess is killed when the function returns.
391        """
392        discovered_catalog: AirbyteCatalog = self.discovered_catalog
393        configured_catalog = ConfiguredAirbyteCatalog(
394            streams=[
395                ConfiguredAirbyteStream(
396                    stream=s,
397                    sync_mode=SyncMode.full_refresh,
398                    destination_sync_mode=DestinationSyncMode.overwrite,
399                )
400                for s in discovered_catalog.streams
401                if s.name == stream
402            ],
403        )
404        if len(configured_catalog.streams) == 0:
405            raise exc.PyAirbyteInputError(
406                message="Requested stream does not exist.",
407                context={
408                    "stream": stream,
409                    "available_streams": self.get_available_streams(),
410                    "connector_name": self.name,
411                },
412            ) from KeyError(stream)
413
414        configured_stream = configured_catalog.streams[0]
415        all_properties = cast(
416            list[str], list(configured_stream.stream.json_schema["properties"].keys())
417        )
418
419        def _with_logging(records: Iterable[dict[str, Any]]) -> Iterator[dict[str, Any]]:
420            self._log_sync_start(cache=None)
421            yield from records
422            self._log_sync_success(cache=None)
423
424        iterator: Iterator[dict[str, Any]] = _with_logging(
425            records=(  # Generator comprehension yields StreamRecord objects for each record
426                StreamRecord.from_record_message(
427                    record_message=record.record,
428                    expected_keys=all_properties,
429                    prune_extra_fields=True,
430                )
431                for record in self._read_with_catalog(configured_catalog)
432                if record.record
433            )
434        )
435        return LazyDataset(
436            iterator,
437            stream_metadata=configured_stream,
438        )

Read a stream from the connector.

This involves the following steps:

  • Call discover to get the catalog
  • Generate a configured catalog that syncs the given stream in full_refresh mode
  • Write the configured catalog and the config to a temporary file
  • execute the connector with read --config <config_file> --catalog <catalog_file>
  • Listen to the messages and return the first AirbyteRecordMessages that come along.
  • Make sure the subprocess is killed when the function returns.
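
A minimal iteration sketch (the stream and field names are illustrative). Because the result is a LazyDataset, records are pulled from the connector only as you iterate:

    dataset = source.get_records("users")
    for record in dataset:
        # Each record is a StreamRecord, a dict-like object keyed by
        # the stream's declared properties:
        print(record["id"])
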
def get_documents( self, stream: str, title_property: str | None = None, content_properties: list[str] | None = None, metadata_properties: list[str] | None = None, *, render_metadata: bool = False) -> collections.abc.Iterable[airbyte.documents.Document]:
440    def get_documents(
441        self,
442        stream: str,
443        title_property: str | None = None,
444        content_properties: list[str] | None = None,
445        metadata_properties: list[str] | None = None,
446        *,
447        render_metadata: bool = False,
448    ) -> Iterable[Document]:
449        """Read a stream from the connector and return the records as documents.
450
451        If metadata_properties is not set, all properties that are not content will be added to
452        the metadata.
453
454        If render_metadata is True, metadata will be rendered in the document, as well as
455        the main content.
456        """
457        return self.get_records(stream).to_documents(
458            title_property=title_property,
459            content_properties=content_properties,
460            metadata_properties=metadata_properties,
461            render_metadata=render_metadata,
462        )

Read a stream from the connector and return the records as documents.

If metadata_properties is not set, all properties that are not content will be added to the metadata.

If render_metadata is True, metadata will be rendered in the document, as well as the main content.
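
A sketch of converting a stream to documents, e.g. for feeding an LLM pipeline (the stream and property names are illustrative):

    docs = source.get_documents(
        stream="products",
        title_property="name",
        content_properties=["description"],
        render_metadata=True,
    )
    for doc in docs:
        print(doc)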

def check(self) -> None:
464    def check(self) -> None:
465        """Call check on the connector.
466
467        This involves the following steps:
468        * Write the config to a temporary file
469        * execute the connector with check --config <config_file>
470        * Listen to the messages and return the first AirbyteCatalog that comes along.
471        * Make sure the subprocess is killed when the function returns.
472        """
473        with as_temp_files([self._config]) as [config_file]:
474            try:
475                for msg in self._execute(["check", "--config", config_file]):
476                    if msg.type == Type.CONNECTION_STATUS and msg.connectionStatus:
477                        if msg.connectionStatus.status != Status.FAILED:
478                            print(f"Connection check succeeded for `{self.name}`.")
479                            log_source_check_result(
480                                name=self.name,
481                                state=EventState.SUCCEEDED,
482                            )
483                            return
484
485                        log_source_check_result(
486                            name=self.name,
487                            state=EventState.FAILED,
488                        )
489                        raise exc.AirbyteConnectorCheckFailedError(
490                            help_url=self.docs_url,
491                            context={
492                                "failure_reason": msg.connectionStatus.message,
493                            },
494                        )
495                raise exc.AirbyteConnectorCheckFailedError(log_text=self._last_log_messages)
496            except exc.AirbyteConnectorReadError as ex:
497                raise exc.AirbyteConnectorCheckFailedError(
498                    message="The connector failed to check the connection.",
499                    log_text=ex.log_text,
500                ) from ex

Call check on the connector.

This involves the following steps:

  • Write the config to a temporary file
  • execute the connector with check --config <config_file>
  • Listen to the messages and return the first AirbyteCatalog that comes along.
  • Make sure the subprocess is killed when the function returns.
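
A usage sketch; note that check() raises on failure rather than returning a status:

    from airbyte import exceptions as exc

    try:
        source.check()
    except exc.AirbyteConnectorCheckFailedError as ex:
        # The exception carries the failure reason and a docs link:
        print(ex)
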
def install(self) -> None:
502    def install(self) -> None:
503        """Install the connector if it is not yet installed."""
504        self.executor.install()
505        print("For configuration instructions, see: \n" f"{self.docs_url}#reference\n")

Install the connector if it is not yet installed.

def uninstall(self) -> None:
507    def uninstall(self) -> None:
508        """Uninstall the connector if it is installed.
509
510        This only works if the use_local_install flag wasn't used and installation is managed by
511        PyAirbyte.
512        """
513        self.executor.uninstall()

Uninstall the connector if it is installed.

This only works if the use_local_install flag wasn't used and installation is managed by PyAirbyte.

def read( self, cache: airbyte.caches.base.CacheBase | None = None, *, streams: str | list[str] | None = None, write_strategy: str | airbyte.strategies.WriteStrategy = <WriteStrategy.AUTO: 'auto'>, force_full_refresh: bool = False, skip_validation: bool = False) -> airbyte.results.ReadResult:
648    def read(
649        self,
650        cache: CacheBase | None = None,
651        *,
652        streams: str | list[str] | None = None,
653        write_strategy: str | WriteStrategy = WriteStrategy.AUTO,
654        force_full_refresh: bool = False,
655        skip_validation: bool = False,
656    ) -> ReadResult:
657        """Read from the connector and write to the cache.
658
659        Args:
660            cache: The cache to write to. If None, a default cache will be used.
661            write_strategy: The strategy to use when writing to the cache. If a string, it must be
662                one of "append", "upsert", "replace", or "auto". If a WriteStrategy, it must be one
663                of WriteStrategy.APPEND, WriteStrategy.UPSERT, WriteStrategy.REPLACE, or
664                WriteStrategy.AUTO.
665            streams: Optional if already set. A list of stream names to select for reading. If set
666                to "*", all streams will be selected.
667            force_full_refresh: If True, the source will operate in full refresh mode. Otherwise,
668                streams will be read in incremental mode if supported by the connector. This option
669                must be True when using the "replace" strategy.
670        """
671        if write_strategy == WriteStrategy.REPLACE and not force_full_refresh:
672            warnings.warn(
673                message=(
674                    "Using `REPLACE` strategy without also setting `force_full_refresh=True` "
675                    "could result in data loss. "
676                    "To silence this warning, use the following: "
677                    'warnings.filterwarnings("ignore", '
678                    "category=airbyte.warnings.PyAirbyteDataLossWarning)"
679                ),
680                category=PyAirbyteDataLossWarning,
681                stacklevel=1,
682            )
683        if cache is None:
684            cache = get_default_cache()
685
686        if isinstance(write_strategy, str):
687            try:
688                write_strategy = WriteStrategy(write_strategy)
689            except ValueError:
690                raise exc.PyAirbyteInputError(
691                    message="Invalid strategy",
692                    context={
693                        "write_strategy": write_strategy,
694                        "available_strategies": [s.value for s in WriteStrategy],
695                    },
696                ) from None
697
698        if streams:
699            self.select_streams(streams)
700
701        if not self._selected_stream_names:
702            raise exc.PyAirbyteNoStreamsSelectedError(
703                connector_name=self.name,
704                available_streams=self.get_available_streams(),
705            )
706
707        cache.processor.register_source(
708            source_name=self.name,
709            incoming_source_catalog=self.configured_catalog,
710            stream_names=set(self._selected_stream_names),
711        )
712
713        state = (
714            cache._get_state(  # noqa: SLF001  # Private method until we have a public API for it.
715                source_name=self.name,
716                streams=self._selected_stream_names,
717            )
718            if not force_full_refresh
719            else None
720        )
721        if not skip_validation:
722            self.validate_config()
723
724        self._log_sync_start(cache=cache)
725        try:
726            cache.processor.process_airbyte_messages(
727                self._read_with_catalog(
728                    catalog=self.configured_catalog,
729                    state=state,
730                ),
731                write_strategy=write_strategy,
732            )
733        except Exception as ex:
734            self._log_sync_failure(cache=cache, exception=ex)
735            raise exc.AirbyteConnectorFailedError(
736                log_text=self._last_log_messages,
737            ) from ex
738
739        self._log_sync_success(cache=cache)
740        return ReadResult(
741            processed_records=self._processed_records,
742            cache=cache,
743            processed_streams=[stream.stream.name for stream in self.configured_catalog.streams],
744        )

Read from the connector and write to the cache.

Arguments:
  • cache: The cache to write to. If None, a default cache will be used.
  • write_strategy: The strategy to use when writing to the cache. If a string, it must be one of "append", "upsert", "replace", or "auto". If a WriteStrategy, it must be one of WriteStrategy.APPEND, WriteStrategy.UPSERT, WriteStrategy.REPLACE, or WriteStrategy.AUTO.
  • streams: Optional if already set. A list of stream names to select for reading. If set to "*", all streams will be selected.
  • force_full_refresh: If True, the source will operate in full refresh mode. Otherwise, streams will be read in incremental mode if supported by the connector. This option must be True when using the "replace" strategy.
  • skip_validation: If True, skip validating the config against the connector spec before reading. By default, the config is validated before the read begins.
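
Putting it together, a minimal end-to-end sketch (the connector name and config are illustrative; with no cache argument, a default local cache is used):

    import airbyte as ab

    source = ab.get_source("source-faker", config={"count": 100})
    source.check()
    source.select_streams("*")

    result = source.read(write_strategy="auto")
    print(f"Processed {result.processed_records} records.")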