✨ Source S3: Add HTTPS validation for S3 endpoint #32109

Merged
merged 12 commits on Nov 10, 2023
6 changes: 6 additions & 0 deletions airbyte-integrations/connectors/source-s3/.coveragerc
@@ -0,0 +1,6 @@
[run]
omit =
source_s3/exceptions.py
source_s3/stream.py
source_s3/utils.py
source_s3/source_files_abstract/source.py
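
The new `.coveragerc` excludes the listed legacy modules from coverage measurement, so the coverage gate scores only the maintained code paths. As a rough illustration (not part of the PR) of how the `[run]` omit list is honored, using the coverage.py API rather than the usual `coverage run`/`pytest --cov` CLI:

```python
import coverage

# Loads the [run] section above; files matching the omit patterns
# (e.g. source_s3/utils.py) are excluded from measurement and reporting.
cov = coverage.Coverage(config_file=".coveragerc")
cov.start()
# ... import and exercise the code under test here ...
cov.stop()
cov.report()
```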
@@ -286,5 +286,7 @@ acceptance_tests:
- spec_path: integration_tests/spec.json
backward_compatibility_tests_config:
disable_for_version: "4.0.3" # removing the `streams.*.file_type` field which was redundant with `streams.*.format`
- spec_path: integration_tests/cloud_spec.json
deployment_mode: "cloud"
connector_image: airbyte/source-s3:dev
test_strictness_level: high

Large diffs are not rendered by default.

@@ -318,6 +318,7 @@
"title": "Endpoint",
"description": "Endpoint to an S3 compatible service. Leave empty to use AWS.",
"default": "",
"examples": ["my-s3-endpoint.com", "https://my-s3-endpoint.com"],
"order": 4,
"type": "string"
},
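
One nuance with the added `examples`: in JSON Schema this keyword is purely annotational — UIs use it to hint at accepted shapes, but it never constrains what validates. A quick standalone check of that behavior (using the `jsonschema` package; not part of this PR):

```python
from jsonschema import validate

endpoint_schema = {
    "type": "string",
    "examples": ["my-s3-endpoint.com", "https://my-s3-endpoint.com"],
}

# Passes: `examples` documents sample values but does not restrict input.
# The actual HTTPS enforcement lands in v4/config.py and the Cloud-only
# `pattern` added in v4/source.py below.
validate("http://still-accepted-by-this-schema.com", endpoint_schema)
```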
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-s3/metadata.yaml
@@ -10,7 +10,7 @@ data:
connectorSubtype: file
connectorType: source
definitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2
dockerImageTag: 4.1.4
dockerImageTag: 4.2.0
dockerRepository: airbyte/source-s3
documentationUrl: https://docs.airbyte.com/integrations/sources/s3
githubIssueLabel: source-s3
14 changes: 13 additions & 1 deletion airbyte-integrations/connectors/source-s3/source_s3/v4/config.py
@@ -5,6 +5,7 @@
from typing import Optional

from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
from airbyte_cdk.utils import is_cloud_environment
from pydantic import AnyUrl, Field, ValidationError, root_validator


@@ -39,7 +40,11 @@ def documentation_url(cls) -> AnyUrl:
)

endpoint: Optional[str] = Field(
"", title="Endpoint", description="Endpoint to an S3 compatible service. Leave empty to use AWS.", order=4
default="",
title="Endpoint",
description="Endpoint to an S3 compatible service. Leave empty to use AWS.",
examples=["my-s3-endpoint.com", "https://my-s3-endpoint.com"],
order=4,
)

@root_validator
@@ -50,4 +55,11 @@ def validate_optional_args(cls, values):
raise ValidationError(
"`aws_access_key_id` and `aws_secret_access_key` are both required to authenticate with AWS.", model=Config
)

if is_cloud_environment():
endpoint = values.get("endpoint")
if endpoint:
if endpoint.startswith("http://"): # ignore-https-check
raise ValidationError("The endpoint must be a secure HTTPS endpoint.", model=Config)

return values
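
Two notes on the validator above. First, `is_cloud_environment()` comes from the CDK and gates the check so the HTTPS requirement applies only to Cloud deployments (it reportedly keys off the `DEPLOYMENT_MODE` environment variable, so OSS users with plain-HTTP endpoints are unaffected). Second, in pydantic v1 the idiomatic way to fail a validator is to raise `ValueError`, which pydantic wraps into a `ValidationError` for the caller; either way, constructing the model with an insecure endpoint raises `ValidationError`, which is what the tests below assert. A minimal self-contained sketch of the same idea (class and field names are illustrative, not the connector's):

```python
from pydantic import BaseModel, ValidationError, root_validator


class EndpointConfig(BaseModel):
    # Illustrative stand-in for the connector's Config model.
    endpoint: str = ""

    @root_validator
    def require_https(cls, values):
        endpoint = values.get("endpoint") or ""
        if endpoint.startswith("http://"):
            # pydantic v1 wraps this ValueError into a ValidationError.
            raise ValueError("The endpoint must be a secure HTTPS endpoint.")
        return values


try:
    EndpointConfig(endpoint="http://insecure.example.com")
except ValidationError as exc:
    print(exc)  # 1 validation error for EndpointConfig ...
```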
10 changes: 10 additions & 0 deletions airbyte-integrations/connectors/source-s3/source_s3/v4/source.py
@@ -7,6 +7,7 @@
from airbyte_cdk.config_observation import emit_configuration_as_airbyte_control_message
from airbyte_cdk.models import ConnectorSpecification
from airbyte_cdk.sources.file_based.file_based_source import FileBasedSource
from airbyte_cdk.utils import is_cloud_environment
from source_s3.source import SourceS3Spec
from source_s3.v4.legacy_config_transformer import LegacyConfigTransformer

@@ -51,6 +52,15 @@ def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
)
self._clean_required_fields(s4_spec["properties"][v3_property_key])

if is_cloud_environment():
s4_spec["properties"]["endpoint"].update(
{
"description": "Endpoint to an S3 compatible service. Leave empty to use AWS. "
"The custom endpoint must be secure, but the 'https' prefix is not required.",
"pattern": "^(?!http://).*$", # ignore-https-check
}
)

return ConnectorSpecification(
documentationUrl=self.spec_class.documentation_url(),
connectionSpecification=s4_spec,
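The Cloud-only `pattern` rejects plain-HTTP values at the schema level while still permitting bare hostnames, matching the relaxed description ("the 'https' prefix is not required"). A quick standalone sanity check of the negative-lookahead regex (not part of the PR):

```python
import re

pattern = re.compile(r"^(?!http://).*$")  # same pattern the Cloud spec injects

assert pattern.match("my-s3-endpoint.com")                  # bare host: allowed
assert pattern.match("https://my-s3-endpoint.com")          # explicit HTTPS: allowed
assert pattern.match("")                                    # empty (use AWS): allowed
assert pattern.match("http://my-s3-endpoint.com") is None   # plain HTTP: rejected
```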
@@ -13,30 +13,36 @@


@pytest.mark.parametrize(
"kwargs,expected_error",
"kwargs, is_cloud, expected_error",
[
pytest.param({"bucket": "test", "streams": []}, None, id="required-fields"),
pytest.param({"bucket": "test", "streams": []}, False, None, id="required-fields"),
pytest.param(
{"bucket": "test", "streams": [], "aws_access_key_id": "access_key", "aws_secret_access_key": "secret_access_key"},
True,
None,
id="config-created-with-aws-info",
),
pytest.param({"bucket": "test", "streams": [], "endpoint": "http://test.com"}, None, id="config-created-with-endpoint"),
pytest.param({"bucket": "test", "streams": [], "endpoint": "https://test.com"}, False, None, id="config-created-with-endpoint"),
pytest.param({"bucket": "test", "streams": [], "endpoint": "http://test.com"}, True, ValidationError, id="http-endpoint-error"),
pytest.param({"bucket": "test", "streams": [], "endpoint": "http://test.com"}, False, None, id="http-endpoint-error"),
pytest.param(
{
"bucket": "test",
"streams": [],
"aws_access_key_id": "access_key",
"aws_secret_access_key": "secret_access_key",
"endpoint": "http://test.com",
"endpoint": "https://test.com",
},
True,
None,
id="config-created-with-endpoint-and-aws-info",
),
pytest.param({"streams": []}, ValidationError, id="missing-bucket"),
pytest.param({"streams": []}, False, ValidationError, id="missing-bucket"),
],
)
def test_config(kwargs, expected_error):
def test_config(mocker, kwargs, is_cloud, expected_error):
mocker.patch("source_s3.v4.config.is_cloud_environment", lambda: is_cloud)

if expected_error:
with pytest.raises(expected_error):
Config(**kwargs)
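A detail worth calling out in the reworked test: `mocker` is the pytest-mock fixture, and the patch targets `source_s3.v4.config.is_cloud_environment`, i.e. the name as imported into the config module rather than its definition site in `airbyte_cdk.utils`; patching the latter would leave the already-bound reference untouched. A standalone sketch of the same pattern (test name is illustrative):

```python
import pytest
from pydantic import ValidationError

from source_s3.v4.config import Config


def test_http_endpoint_rejected_on_cloud(mocker):
    # Patch where the name is looked up (the config module), not where it is defined.
    mocker.patch("source_s3.v4.config.is_cloud_environment", lambda: True)
    with pytest.raises(ValidationError):
        Config(bucket="test", streams=[], endpoint="http://test.com")
```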
@@ -22,7 +22,7 @@

logger = logging.Logger("")

endpoint_values = ["http://fake.com", None]
endpoint_values = ["https://fake.com", None]
_get_matching_files_cases = [
pytest.param([], [], False, set(), id="no-files-match-if-no-globs"),
pytest.param(
8 changes: 5 additions & 3 deletions docs/integrations/sources/s3.md
@@ -238,6 +238,7 @@ The Avro parser uses the [Fastavro library](https://fastavro.readthedocs.io/en/l
There are currently no options for JSONL parsing.

<FieldAnchor field="streams.0.format[unstructured],streams.1.format[unstructured],streams.2.format[unstructured]">

### Document File Type Format (Experimental)

:::warning
@@ -254,9 +255,10 @@ To perform the text extraction from PDF and Docx files, the connector uses the [
## Changelog

| Version | Date | Pull Request | Subject |
| :------ | :--------- | :-------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------- |
| 4.1.4 | 2023-10-30 | [31904](https://github.com/airbytehq/airbyte/pull/31904) | Update CDK |
| 4.1.3 | 2023-10-25 | [31654](https://github.com/airbytehq/airbyte/pull/31654) | Reduce image size |
|:--------|:-----------|:----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------|
| 4.2.0 | 2023-11-02 | [32109](https://github.com/airbytehq/airbyte/pull/32109) | Fix docs; add HTTPS validation for S3 endpoint; fix coverage |
| 4.1.4 | 2023-10-30 | [31904](https://github.com/airbytehq/airbyte/pull/31904) | Update CDK |
| 4.1.3 | 2023-10-25 | [31654](https://github.com/airbytehq/airbyte/pull/31654) | Reduce image size |
| 4.1.2 | 2023-10-23 | [31383](https://github.com/airbytehq/airbyte/pull/31383) | Add handling NoSuchBucket error |
| 4.1.1 | 2023-10-19 | [31601](https://github.com/airbytehq/airbyte/pull/31601) | Base image migration: remove Dockerfile and use the python-connector-base image |
| 4.1.0 | 2023-10-17 | [31340](https://github.com/airbytehq/airbyte/pull/31340) | Add reading files inside zip archive |