Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AirbyteLib: More robust error handling, installation improvements #34572

Merged
merged 30 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
abbb256
new exception type: AirbyteConnectorNotRegisteredError
aaronsteers Jan 26, 2024
3845f5c
make constructors more resilient
aaronsteers Jan 26, 2024
9fccace
print stderr in exception text, cleanup failed install, remove editab…
aaronsteers Jan 26, 2024
a217a6e
move auto-install out of venv constructor, for easier debugging
aaronsteers Jan 26, 2024
6aa85d6
add test to assert that install failure includes pip log text
aaronsteers Jan 26, 2024
dddbc78
update docs
aaronsteers Jan 26, 2024
b1d966b
auto-format
aaronsteers Jan 26, 2024
f61152a
update docs
aaronsteers Jan 26, 2024
d665088
refactor version handling, control for side effects
aaronsteers Jan 26, 2024
809918b
fix exception handling in _get_installed_version()
aaronsteers Jan 26, 2024
4a41ffb
fix tests
aaronsteers Jan 26, 2024
bab5e06
improve thread safety
aaronsteers Jan 27, 2024
10ce077
handle quoted spaces in pip_url
aaronsteers Jan 27, 2024
063bba3
fix import sorts
aaronsteers Jan 27, 2024
ab75be4
standalone validate_config() method
aaronsteers Jan 27, 2024
8880b0b
add Source.yaml_spec property
aaronsteers Jan 27, 2024
3773149
make _yaml_spec a protected member
aaronsteers Jan 27, 2024
90918c8
fix too-limited json package_data glob
aaronsteers Jan 27, 2024
9197728
fix missing copyright str
aaronsteers Jan 28, 2024
f73f288
docstring
aaronsteers Jan 28, 2024
dd9ac99
update docs
aaronsteers Jan 28, 2024
a2bed01
revert source-github change
aaronsteers Jan 28, 2024
2e49154
updated comment
aaronsteers Jan 28, 2024
f975282
remove redundant strings
aaronsteers Jan 28, 2024
ace7208
Merge remote-tracking branch 'origin/master' into aj/airbyte-lib/inst…
aaronsteers Jan 28, 2024
8775c1b
update docs (removes empty cloud page)
aaronsteers Jan 28, 2024
f24226d
remove unused lock
aaronsteers Jan 30, 2024
7370d83
rename AirbyteConnectoNotFoundError to AirbyteConnectorExecutableNotF…
aaronsteers Jan 30, 2024
cfafccc
Merge branch 'master' into aj/airbyte-lib/install-failure-handling
aaronsteers Jan 30, 2024
8446838
allow prereleases in version check
aaronsteers Jan 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
308 changes: 234 additions & 74 deletions airbyte-lib/airbyte_lib/_executor.py

Large diffs are not rendered by default.

76 changes: 51 additions & 25 deletions airbyte-lib/airbyte_lib/_factories/connector_factories.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
from __future__ import annotations

import shutil
from pathlib import Path
from typing import Any

from airbyte_lib._executor import Executor, PathExecutor, VenvExecutor
from airbyte_lib.exceptions import AirbyteLibInputError
from airbyte_lib.registry import get_connector_metadata
from airbyte_lib import exceptions as exc
from airbyte_lib._executor import PathExecutor, VenvExecutor
from airbyte_lib.registry import ConnectorMetadata, get_connector_metadata
from airbyte_lib.source import Source


Expand All @@ -15,7 +17,7 @@ def get_connector(
pip_url: str | None = None,
config: dict[str, Any] | None = None,
*,
use_local_install: bool = False,
local_executable: Path | str | None = None,
install_if_missing: bool = True,
) -> Source:
"""Get a connector by name and version.
Expand All @@ -29,34 +31,58 @@ def get_connector(
connector name.
config: connector config - if not provided, you need to set it later via the set_config
method.
use_local_install: whether to use a virtual environment to run the connector. If True, the
connector is expected to be available on the path (e.g. installed via pip). If False,
the connector will be installed automatically in a virtual environment.
install_if_missing: whether to install the connector if it is not available locally. This
parameter is ignored if use_local_install is True.
local_executable: If set, the connector will be assumed to already be installed and will be
executed using this path or executable name. Otherwise, the connector will be installed
automatically in a virtual environment.
install_if_missing: Whether to install the connector if it is not available locally. This
parameter is ignored when local_executable is set.
"""
metadata = get_connector_metadata(name)
if use_local_install:
if local_executable:
if pip_url:
raise AirbyteLibInputError(
message="Param 'pip_url' is not supported when 'use_local_install' is True."
raise exc.AirbyteLibInputError(
message="Param 'pip_url' is not supported when 'local_executable' is set."
)
if version:
raise AirbyteLibInputError(
message="Param 'version' is not supported when 'use_local_install' is True."
raise exc.AirbyteLibInputError(
message="Param 'version' is not supported when 'local_executable' is set."
)
executor: Executor = PathExecutor(
metadata=metadata,
target_version=version,
)

else:
executor = VenvExecutor(
metadata=metadata,
target_version=version,
install_if_missing=install_if_missing,
pip_url=pip_url,
if isinstance(local_executable, str):
if "/" in local_executable or "\\" in local_executable:
# Assume this is a path
local_executable = Path(local_executable).absolute()
else:
which_executable = shutil.which(local_executable)
if which_executable is None:
raise FileNotFoundError(local_executable)
local_executable = Path(which_executable).absolute()

return Source(
name=name,
config=config,
executor=PathExecutor(
name=name,
path=local_executable,
),
)

metadata: ConnectorMetadata | None = None
try:
metadata = get_connector_metadata(name)
except exc.AirbyteConnectorNotRegisteredError:
if not pip_url:
# We don't have a pip url or registry entry, so we can't install the connector
raise

executor = VenvExecutor(
name=name,
metadata=metadata,
target_version=version,
pip_url=pip_url,
)
if install_if_missing:
executor.ensure_installation()

return Source(
executor=executor,
name=name,
Expand Down
10 changes: 9 additions & 1 deletion airbyte-lib/airbyte_lib/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,14 @@ class AirbyteConnectorRegistryError(AirbyteError):
"""Error when accessing the connector registry."""


@dataclass
class AirbyteConnectorNotRegisteredError(AirbyteConnectorRegistryError):
    """Connector not found in registry.

    Raised when a connector name is looked up in the connector registry and no
    entry with that name exists.
    """

    # Name of the connector that was looked up, when known.
    connector_name: str | None = None
    # NOTE(review): `guidance` is assigned without a type annotation, so
    # `@dataclass` does not register it as a field on this class. If a base
    # class declares `guidance` as a dataclass field, the generated `__init__`
    # would presumably still apply the base default to instances - confirm that
    # this class-level override is what users actually see.
    guidance = "Please double check the connector name."


# Connector Errors


Expand All @@ -185,7 +193,7 @@ class AirbyteConnectorError(AirbyteError):


class AirbyteConnectorNotFoundError(AirbyteConnectorError):
"""Connector not found."""
"""Connector name not found in registry."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the code, by my understanding AirbyteConnectorNotFoundError means the connector is not found locally, and AirbyteConnectorNotRegisteredError means the connector is not found in the registry, but this docstring indicates otherwise.

Could you clarify?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! This was a mistake. I've renamed the error to AirbyteConnectorExecutableNotFoundError so it is more explicit.



class AirbyteConnectorInstallationError(AirbyteConnectorError):
Expand Down
74 changes: 55 additions & 19 deletions airbyte-lib/airbyte_lib/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import json
import os
import threading
from copy import copy
from dataclasses import dataclass
from pathlib import Path

Expand All @@ -12,47 +14,81 @@
from airbyte_lib.version import get_version


__cache: dict[str, ConnectorMetadata] | None = None
_cache_lock = threading.Lock()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like this lock is never actually acquired? Am I missing something?

Copy link
Collaborator Author

@aaronsteers aaronsteers Jan 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. I had it in an earlier version when debugging a race condition, then refactored so it wasn't needed.

(Now removed.)


REGISTRY_ENV_VAR = "AIRBYTE_LOCAL_REGISTRY"
REGISTRY_URL = "https://connectors.airbyte.com/files/registries/v0/oss_registry.json"


@dataclass
class ConnectorMetadata:
    """Registry information about a single connector."""

    # Connector name; the registry loader in this module derives it from the
    # entry's `dockerRepository` with "airbyte/" removed.
    name: str
    # Version string; the registry loader fills it from the entry's `dockerImageTag`.
    latest_available_version: str


_cache: dict[str, ConnectorMetadata] | None = None
def _get_registry_url() -> str:
if REGISTRY_ENV_VAR in os.environ:
return str(os.environ.get(REGISTRY_ENV_VAR))

REGISTRY_URL = "https://connectors.airbyte.com/files/registries/v0/oss_registry.json"
return REGISTRY_URL


def _update_cache() -> None:
global _cache
if os.environ.get("AIRBYTE_LOCAL_REGISTRY"):
with Path(str(os.environ.get("AIRBYTE_LOCAL_REGISTRY"))).open() as f:
data = json.load(f)
else:
def _get_registry_cache(*, force_refresh: bool = False) -> dict[str, ConnectorMetadata]:
"""Return the registry cache."""
global __cache
if __cache and not force_refresh:
return __cache

registry_url = _get_registry_url()
if registry_url.startswith("http"):
response = requests.get(
REGISTRY_URL, headers={"User-Agent": f"airbyte-lib-{get_version()}"}
registry_url, headers={"User-Agent": f"airbyte-lib-{get_version()}"}
)
response.raise_for_status()
data = response.json()
_cache = {}
else:
# Assume local file
with Path(registry_url).open() as f:
data = json.load(f)

new_cache: dict[str, ConnectorMetadata] = {}

for connector in data["sources"]:
name = connector["dockerRepository"].replace("airbyte/", "")
_cache[name] = ConnectorMetadata(name, connector["dockerImageTag"])
new_cache[name] = ConnectorMetadata(name, connector["dockerImageTag"])

if len(new_cache) == 0:
raise exc.AirbyteLibInternalError(
message="Connector registry is empty.",
context={
"registry_url": _get_registry_url(),
},
)

__cache = new_cache
return __cache


def get_connector_metadata(name: str) -> ConnectorMetadata:
"""Check the cache for the connector.

If the cache is empty, it is populated by calling _get_registry_cache().
"""
if not _cache:
_update_cache()
if not _cache or name not in _cache:
raise exc.AirbyteLibInputError(
message="Connector name not found in registry.",
guidance="Please double check the connector name.",
cache = copy(_get_registry_cache())
if not cache:
raise exc.AirbyteLibInternalError(
message="Connector registry could not be loaded.",
context={
"registry_url": _get_registry_url(),
},
)
if name not in cache:
raise exc.AirbyteConnectorNotRegisteredError(
connector_name=name,
context={
"connector_name": name,
"registry_url": _get_registry_url(),
"available_connectors": sorted(cache.keys()),
},
)
return _cache[name]
return cache[name]
81 changes: 64 additions & 17 deletions airbyte-lib/airbyte_lib/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import TYPE_CHECKING, Any

import jsonschema
import yaml

from airbyte_protocol.models import (
AirbyteCatalog,
Expand Down Expand Up @@ -68,7 +69,13 @@ def __init__(
name: str,
config: dict[str, Any] | None = None,
streams: list[str] | None = None,
*,
validate: bool = False,
) -> None:
"""Initialize the source.

If config is provided, it will be validated against the spec if validate is True.
"""
self._processed_records = 0
self.executor = executor
self.name = name
Expand All @@ -79,7 +86,7 @@ def __init__(
self._spec: ConnectorSpecification | None = None
self._selected_stream_names: list[str] | None = None
if config is not None:
self.set_config(config)
self.set_config(config, validate=validate)
if streams is not None:
self.set_streams(streams)

Expand All @@ -102,8 +109,22 @@ def set_streams(self, streams: list[str]) -> None:
)
self._selected_stream_names = streams

def set_config(self, config: dict[str, Any]) -> None:
self._validate_config(config)
def set_config(
self,
config: dict[str, Any],
*,
validate: bool = False,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the advantage of deferring the check? I thought validating up front was quite nice, as it tells you as early as possible if your config won't work, instead of waiting until the connector is actually invoked.

What's the workflow you had in mind here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed in DM.

) -> None:
"""Set the config for the connector.

If validate is True, raise an exception if the config fails validation.

If validate is False, validation will be deferred until check() or validate_config()
is called.
"""
if validate:
self.validate_config(config)

self._config_dict = config

@property
Expand Down Expand Up @@ -131,9 +152,13 @@ def _discover(self) -> AirbyteCatalog:
log_text=self._last_log_messages,
)

def _validate_config(self, config: dict[str, Any]) -> None:
"""Validate the config against the spec."""
def validate_config(self, config: dict[str, Any] | None = None) -> None:
    """Check a config against the connector's spec.

    When `config` is omitted (None), the config already stored on this source is
    the one that gets validated. Validation is delegated to `jsonschema.validate`,
    using the spec's `connectionSpecification` as the schema.
    """
    connector_spec = self._get_spec(force_refresh=False)
    if config is None:
        # Fall back to the config previously set on this source.
        config = self._config
    jsonschema.validate(instance=config, schema=connector_spec.connectionSpecification)

def get_available_streams(self) -> list[str]:
Expand Down Expand Up @@ -161,6 +186,21 @@ def _get_spec(self, *, force_refresh: bool = False) -> ConnectorSpecification:
log_text=self._last_log_messages,
)

@property
def _yaml_spec(self) -> str:
    """Return the connector spec rendered as a YAML string.

    The main use today is helping users write and debug a valid config for a source.

    Kept protected for now: it likely needs more polish before it becomes a stable
    interface, and docs links carrying this information per connector should make
    that easier later.
    """
    spec = self._get_spec()
    # Only fields that were explicitly set on the spec are included in the dump.
    return yaml.dump(spec.dict(exclude_unset=True))

@property
def discovered_catalog(self) -> AirbyteCatalog:
"""Get the raw catalog for the given streams.
Expand Down Expand Up @@ -248,17 +288,23 @@ def check(self) -> None:
* Make sure the subprocess is killed when the function returns.
"""
with as_temp_files([self._config]) as [config_file]:
for msg in self._execute(["check", "--config", config_file]):
if msg.type == Type.CONNECTION_STATUS and msg.connectionStatus:
if msg.connectionStatus.status != Status.FAILED:
return # Success!

raise exc.AirbyteConnectorCheckFailedError(
context={
"message": msg.connectionStatus.message,
}
)
raise exc.AirbyteConnectorCheckFailedError(log_text=self._last_log_messages)
try:
for msg in self._execute(["check", "--config", config_file]):
if msg.type == Type.CONNECTION_STATUS and msg.connectionStatus:
if msg.connectionStatus.status != Status.FAILED:
return # Success!

raise exc.AirbyteConnectorCheckFailedError(
context={
"message": msg.connectionStatus.message,
}
)
raise exc.AirbyteConnectorCheckFailedError(log_text=self._last_log_messages)
except exc.AirbyteConnectorReadError as ex:
raise exc.AirbyteConnectorCheckFailedError(
message="The connector failed to check the connection.",
log_text=ex.log_text,
) from ex

def install(self) -> None:
"""Install the connector if it is not yet installed."""
Expand Down Expand Up @@ -338,7 +384,8 @@ def _execute(self, args: list[str]) -> Iterator[AirbyteMessage]:
* Read the output line by line of the subprocess and serialize them AirbyteMessage objects.
Drop if not valid.
"""
self.executor.ensure_installation()
# Fail early if the connector is not installed.
self.executor.ensure_installation(auto_fix=False)

try:
self._last_log_messages = []
Expand Down
Loading
Loading