-
Notifications
You must be signed in to change notification settings - Fork 3.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CAT: add validation for stream statuses #34675
Changes from 13 commits
aaa6e58
6bf54af
2b49825
fa03b42
e1d73f7
46f30c0
3cb3ade
f7b7bd0
229236d
208e6bd
6c9d88d
7605a2c
b7208b7
80304f3
0236b60
9153f5a
085ab2b
266b309
b155b51
afbb79e
13331f9
8b55494
fe49f10
9fa583c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,8 @@ | |
from airbyte_protocol.models import ( | ||
AirbyteRecordMessage, | ||
AirbyteStream, | ||
AirbyteStreamStatus, | ||
AirbyteStreamStatusTraceMessage, | ||
AirbyteTraceMessage, | ||
ConfiguredAirbyteCatalog, | ||
ConfiguredAirbyteStream, | ||
|
@@ -975,6 +977,13 @@ def should_validate_schema_fixture(self, inputs: BasicReadTestConfig, test_stric | |
else: | ||
return inputs.validate_schema | ||
|
||
@pytest.fixture(name="should_validate_stream_statuses") | ||
def should_validate_stream_statuses_fixture(self, inputs: BasicReadTestConfig, test_strictness_level: Config.TestStrictnessLevel): | ||
if not inputs.validate_stream_statuses and test_strictness_level is Config.TestStrictnessLevel.high: | ||
pytest.fail("High strictness level error: validate_stream_statuses must be set to true in the basic read test configuration.") | ||
else: | ||
return inputs.validate_stream_statuses | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a grooming note that mentions that we should not apply this by default for community connectors. Right now, the default value for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed, this test will not run if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With the new change, does that mean that if we want to enable this for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in this case, do we need to set it to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we set it to "None" so that we have three states:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point! |
||
|
||
@pytest.fixture(name="should_fail_on_extra_columns") | ||
def should_fail_on_extra_columns_fixture(self, inputs: BasicReadTestConfig): | ||
# TODO (Ella): enforce this param once all connectors are passing | ||
|
@@ -1026,6 +1035,7 @@ async def test_read( | |
expect_records_config: ExpectedRecordsConfig, | ||
should_validate_schema: Boolean, | ||
should_validate_data_points: Boolean, | ||
should_validate_stream_statuses: Boolean, | ||
should_fail_on_extra_columns: Boolean, | ||
empty_streams: Set[EmptyStreamConfiguration], | ||
ignored_fields: Optional[Mapping[str, List[IgnoredFieldsConfiguration]]], | ||
|
@@ -1035,6 +1045,7 @@ async def test_read( | |
certified_file_based_connector: bool, | ||
): | ||
output = await docker_runner.call_read(connector_config, configured_catalog) | ||
|
||
records = [message.record for message in filter_output(output, Type.RECORD)] | ||
|
||
if certified_file_based_connector: | ||
|
@@ -1067,6 +1078,14 @@ async def test_read( | |
detailed_logger=detailed_logger, | ||
) | ||
|
||
if should_validate_stream_statuses: | ||
all_statuses = [ | ||
message.trace.stream_status | ||
for message in filter_output(output, Type.TRACE) | ||
if message.trace.type == TraceType.STREAM_STATUS | ||
] | ||
self._validate_stream_statuses(configured_catalog=configured_catalog, statuses=all_statuses) | ||
|
||
async def test_airbyte_trace_message_on_failure(self, connector_config, inputs: BasicReadTestConfig, docker_runner: ConnectorRunner): | ||
if not inputs.expect_trace_message_on_failure: | ||
pytest.skip("Skipping `test_airbyte_trace_message_on_failure` because `inputs.expect_trace_message_on_failure=False`") | ||
|
@@ -1233,6 +1252,22 @@ async def test_all_supported_file_types_present(self, certified_file_based_conne | |
"or add them to the `file_types -> unsupported_types` list in config." | ||
) | ||
|
||
@staticmethod | ||
def _validate_stream_statuses(configured_catalog: ConfiguredAirbyteCatalog, statuses: List[AirbyteStreamStatusTraceMessage]): | ||
"""Validate all statuses for all streams in the catalogs were emitted in correct order""" | ||
stream_statuses = defaultdict(list) | ||
for status in statuses: | ||
stream_statuses[status.stream_descriptor.name].append(status.status) | ||
maxi297 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
assert set(x.stream.name for x in configured_catalog.streams) == set(stream_statuses), "All stream must emit status" | ||
|
||
for stream_name, status_list in stream_statuses.items(): | ||
assert status_list == [ | ||
AirbyteStreamStatus.STARTED, | ||
AirbyteStreamStatus.RUNNING, | ||
maxi297 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
AirbyteStreamStatus.COMPLETE, | ||
], f"Stream `{stream_name}` statuses should be emitted in the next order: `STARTED`, `RUNNING`, `COMPLETE`" | ||
|
||
|
||
@pytest.mark.default_timeout(TEN_MINUTES) | ||
class TestConnectorAttributes(BaseTest): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,40 +90,35 @@ def test_verify_records_schema(configured_catalog: ConfiguredAirbyteCatalog): | |
@pytest.mark.parametrize( | ||
"json_schema, record, should_fail", | ||
[ | ||
( | ||
{"type": "object", "properties": {"a": {"type": "string"}}}, | ||
{"a": "str", "b": "extra_string"}, | ||
True | ||
), | ||
({"type": "object", "properties": {"a": {"type": "string"}}}, {"a": "str", "b": "extra_string"}, True), | ||
( | ||
{"type": "object", "properties": {"a": {"type": "string"}, "some_obj": {"type": ["null", "object"]}}}, | ||
{"a": "str", "some_obj": {"b": "extra_string"}}, | ||
False | ||
False, | ||
), | ||
( | ||
{ | ||
"type": "object", | ||
"properties": {"a": {"type": "string"}, "some_obj": {"type": ["null", "object"], "properties": {"a": {"type": "string"}}}}, | ||
}, | ||
{"a": "str", "some_obj": {"a": "str", "b": "extra_string"}}, | ||
True | ||
True, | ||
), | ||
|
||
( | ||
{"type": "object", "properties": {"a": {"type": "string"}, "b": {"type": "array", "items": {"type": "object"}}}}, | ||
{"a": "str", "b": [{"a": "extra_string"}]}, | ||
False | ||
False, | ||
), | ||
( | ||
{ | ||
"type": "object", | ||
"properties": { | ||
"a": {"type": "string"}, | ||
"b": {"type": "array", "items": {"type": "object", "properties": {"a": {"type": "string"}}}}, | ||
} | ||
}, | ||
}, | ||
{"a": "str", "b": [{"a": "string", "b": "extra_string"}]}, | ||
True | ||
True, | ||
), | ||
], | ||
ids=[ | ||
|
@@ -136,7 +131,7 @@ def test_verify_records_schema(configured_catalog: ConfiguredAirbyteCatalog): | |
) | ||
def test_verify_records_schema_with_fail_on_extra_columns(configured_catalog: ConfiguredAirbyteCatalog, json_schema, record, should_fail): | ||
"""Test that fail_on_extra_columns works correctly with nested objects, array of objects""" | ||
configured_catalog.streams[0].stream.json_schema =json_schema | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Out of curiosity, what formatter do you personally use, and what formatter does most of the team have? @alafanechere works on setting up requirements on new PRs including type checks with mypy, and I'm considering a formatter as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Mainly There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @natikgadzhi we already have a repo wide formatter ( |
||
configured_catalog.streams[0].stream.json_schema = json_schema | ||
records = [AirbyteRecordMessage(stream="my_stream", data=record, emitted_at=0)] | ||
streams_with_errors = verify_records_schema(records, configured_catalog, fail_on_extra_columns=True) | ||
errors = [error.message for error in streams_with_errors["my_stream"].values()] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This means that it is a breaking change. What is the plan for updating the sources so that CATs pass after this is merged?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We expect that all certified connectors will pass this test.
Our connector-health engineer will take care of it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you be more explicit about this? I want to understand if CATs will be red for a while. We have a goal for Q1 which is to have CATs be green for certified connectors. This includes having a process which ensures that connectors are passing CATs