From b512fa462817b7ffaac9361a81039ec1b374f981 Mon Sep 17 00:00:00 2001 From: Alexandre Girard Date: Mon, 14 Aug 2023 12:51:27 -0700 Subject: [PATCH] file-based CDK: Configurable strings_can_be_null (#29298) * [ISSUE #28893] infer csv schema * [ISSUE #28893] align with pyarrow * Automated Commit - Formatting Changes * [ISSUE #28893] legacy inference and infer only when needed * [ISSUE #28893] fix scenario tests * [ISSUE #28893] using discovered schema as part of read * [ISSUE #28893] self-review + cleanup * [ISSUE #28893] fix test * [ISSUE #28893] code review part #1 * [ISSUE #28893] code review part #2 * Fix test * formatcdk * first pass * [ISSUE #28893] code review * fix mypy issues * comment * rename for clarity * Add a scenario test case * this isn't optional anymore * FIX test log level * Re-adding failing tests * [ISSUE #28893] improve inferrence to consider multiple types per value * Automated Commit - Formatting Changes * [ISSUE #28893] remove InferenceType.PRIMITIVE_AND_COMPLEX_TYPES * Code review * Automated Commit - Formatting Changes * fix unit tests --------- Co-authored-by: maxi297 Co-authored-by: maxi297 --- .../sources/file_based/config/csv_format.py | 5 + .../file_based/file_types/csv_parser.py | 72 +- .../file_based/file_types/test_csv_parser.py | 278 ++++-- .../file_based/scenarios/csv_scenarios.py | 808 +++++++++--------- .../sources/file_based/test_scenarios.py | 2 + 5 files changed, 652 insertions(+), 513 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py index 28d03cee8bf61..c88923134f8d7 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/csv_format.py @@ -63,6 +63,11 @@ class Config: default=[], description="A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.", ) + strings_can_be_null: bool = Field( + title="Strings Can Be Null", + default=True, + description="Whether strings can be interpreted as null values. If true, strings that match the null_values set will be interpreted as null. If false, strings that match the null_values set will be interpreted as the string itself.", + ) skip_rows_before_header: int = Field( title="Skip Rows Before Header", default=0, diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py index b6240d079d254..68f097a9057d8 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py @@ -160,10 +160,15 @@ def parse_records( discovered_schema: Optional[Mapping[str, SchemaType]], ) -> Iterable[Dict[str, Any]]: config_format = _extract_format(config) - cast_fn = CsvParser._get_cast_function(discovered_schema, config_format, logger) + if discovered_schema: + property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()} # type: ignore # discovered_schema["properties"] is known to be a mapping + deduped_property_types = CsvParser._pre_propcess_property_types(property_types) + else: + deduped_property_types = {} + cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger) data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode) for row in data_generator: - yield CsvParser._to_nullable(cast_fn(row), config_format.null_values) + yield CsvParser._to_nullable(cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null) data_generator.close() @property @@ -172,24 +177,62 @@ def file_read_mode(self) -> FileReadMode: @staticmethod def _get_cast_function( - schema: Optional[Mapping[str, SchemaType]], config_format: CsvFormat, logger: logging.Logger + deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger ) -> Callable[[Mapping[str, str]], Mapping[str, str]]: # Only cast values if the schema is provided - if schema: - property_types = {col: prop["type"] for col, prop in schema["properties"].items()} - return partial(CsvParser._cast_types, property_types=property_types, config_format=config_format, logger=logger) + if deduped_property_types: + return partial(CsvParser._cast_types, deduped_property_types=deduped_property_types, config_format=config_format, logger=logger) else: # If no schema is provided, yield the rows as they are return _no_cast @staticmethod - def _to_nullable(row: Mapping[str, str], null_values: Set[str]) -> Dict[str, Optional[str]]: - nullable = row | {k: None if v in null_values else v for k, v in row.items()} + def _to_nullable( + row: Mapping[str, str], deduped_property_types: Mapping[str, str], null_values: Set[str], strings_can_be_null: bool + ) -> Dict[str, Optional[str]]: + nullable = row | { + k: None if CsvParser._value_is_none(v, deduped_property_types.get(k), null_values, strings_can_be_null) else v + for k, v in row.items() + } return nullable + @staticmethod + def _value_is_none(value: Any, deduped_property_type: Optional[str], null_values: Set[str], strings_can_be_null: bool) -> bool: + return value in null_values and (strings_can_be_null or deduped_property_type != "string") + + @staticmethod + def _pre_propcess_property_types(property_types: Dict[str, Any]) -> Mapping[str, str]: + """ + Transform the property types to be non-nullable and remove duplicate types if any. + Sample input: + { + "col1": ["string", "null"], + "col2": ["string", "string", "null"], + "col3": "integer" + } + + Sample output: + { + "col1": "string", + "col2": "string", + "col3": "integer", + } + """ + output = {} + for prop, prop_type in property_types.items(): + if isinstance(prop_type, list): + prop_type_distinct = set(prop_type) + prop_type_distinct.remove("null") + if len(prop_type_distinct) != 1: + raise ValueError(f"Could not get non nullable type from {prop_type}") + output[prop] = next(iter(prop_type_distinct)) + else: + output[prop] = prop_type + return output + @staticmethod def _cast_types( - row: Dict[str, str], property_types: Dict[str, Any], config_format: CsvFormat, logger: logging.Logger + row: Dict[str, str], deduped_property_types: Dict[str, str], config_format: CsvFormat, logger: logging.Logger ) -> Dict[str, Any]: """ Casts the values in the input 'row' dictionary according to the types defined in the JSON schema. @@ -202,17 +245,10 @@ def _cast_types( result = {} for key, value in row.items(): - prop_type = property_types.get(key) + prop_type = deduped_property_types.get(key) cast_value: Any = value - if isinstance(prop_type, list): - prop_type_distinct = set(prop_type) - prop_type_distinct.remove("null") - if len(prop_type_distinct) != 1: - raise ValueError(f"Could not get non nullable type from {prop_type}") - (prop_type,) = prop_type_distinct - - if prop_type in TYPE_PYTHON_MAPPING: + if prop_type in TYPE_PYTHON_MAPPING and prop_type is not None: _, python_type = TYPE_PYTHON_MAPPING[prop_type] if python_type is None: diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py index 64832bc8d39ba..7690fe3f3b1ef 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py @@ -30,7 +30,7 @@ "col7": "array", "col8": "array", "col9": "array", - "col10": ["null", "string"], + "col10": "string", } logger = logging.getLogger() @@ -47,10 +47,10 @@ "col4": "1.1", "col5": "asdf", "col6": '{"a": "b"}', - "col7": '[1, 2]', + "col7": "[1, 2]", "col8": '["1", "2"]', "col9": '[{"a": "b"}, {"a": "c"}]', - "col10": 'asdf', + "col10": "asdf", }, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, @@ -64,30 +64,86 @@ "col7": [1, 2], "col8": ["1", "2"], "col9": [{"a": "b"}, {"a": "c"}], - "col10": 'asdf', - }, id="cast-all-cols"), + "col10": "asdf", + }, + id="cast-all-cols", + ), pytest.param({"col1": "1"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col1": "1"}, id="cannot-cast-to-null"), pytest.param({"col2": "1"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": True}, id="cast-1-to-bool"), pytest.param({"col2": "0"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": False}, id="cast-0-to-bool"), pytest.param({"col2": "yes"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": True}, id="cast-yes-to-bool"), - pytest.param({"col2": "this_is_a_true_value"}, ["this_is_a_true_value"], DEFAULT_FALSE_VALUES, {"col2": True}, id="cast-custom-true-value-to-bool"), - pytest.param({"col2": "this_is_a_false_value"}, DEFAULT_TRUE_VALUES, ["this_is_a_false_value"], {"col2": False}, id="cast-custom-false-value-to-bool"), + pytest.param( + {"col2": "this_is_a_true_value"}, + ["this_is_a_true_value"], + DEFAULT_FALSE_VALUES, + {"col2": True}, + id="cast-custom-true-value-to-bool", + ), + pytest.param( + {"col2": "this_is_a_false_value"}, + DEFAULT_TRUE_VALUES, + ["this_is_a_false_value"], + {"col2": False}, + id="cast-custom-false-value-to-bool", + ), pytest.param({"col2": "no"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": False}, id="cast-no-to-bool"), pytest.param({"col2": "10"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col2": "10"}, id="cannot-cast-to-bool"), pytest.param({"col3": "1.1"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col3": "1.1"}, id="cannot-cast-to-int"), pytest.param({"col4": "asdf"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col4": "asdf"}, id="cannot-cast-to-float"), pytest.param({"col6": "{'a': 'b'}"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col6": "{'a': 'b'}"}, id="cannot-cast-to-dict"), - pytest.param({"col7": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col7": "['a', 'b']"}, id="cannot-cast-to-list-of-ints"), - pytest.param({"col8": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col8": "['a', 'b']"}, id="cannot-cast-to-list-of-strings"), - pytest.param({"col9": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col9": "['a', 'b']"}, id="cannot-cast-to-list-of-objects"), + pytest.param( + {"col7": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col7": "['a', 'b']"}, id="cannot-cast-to-list-of-ints" + ), + pytest.param( + {"col8": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col8": "['a', 'b']"}, id="cannot-cast-to-list-of-strings" + ), + pytest.param( + {"col9": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col9": "['a', 'b']"}, id="cannot-cast-to-list-of-objects" + ), pytest.param({"col11": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col11": "x"}, id="item-not-in-props-doesn't-error"), - ] + ], ) def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_values: Set[str], expected_output: Dict[str, Any]) -> None: csv_format = CsvFormat(true_values=true_values, false_values=false_values) assert CsvParser._cast_types(row, PROPERTY_TYPES, csv_format, logger) == expected_output +@pytest.mark.parametrize( + "row, strings_can_be_null, expected_output", + [ + pytest.param( + {"id": "1", "name": "bob", "age": 10, "is_cool": False}, + False, + {"id": "1", "name": "bob", "age": 10, "is_cool": False}, + id="test-no-values-are-null", + ), + pytest.param( + {"id": "1", "name": "bob", "age": "null", "is_cool": "null"}, + False, + {"id": "1", "name": "bob", "age": None, "is_cool": None}, + id="test-non-string-values-are-none-if-in-null-values", + ), + pytest.param( + {"id": "1", "name": "null", "age": 10, "is_cool": False}, + False, + {"id": "1", "name": "null", "age": 10, "is_cool": False}, + id="test-string-values-are-not-none-if-strings-cannot-be-null", + ), + pytest.param( + {"id": "1", "name": "null", "age": 10, "is_cool": False}, + True, + {"id": "1", "name": None, "age": 10, "is_cool": False}, + id="test-string-values-none-if-strings-can-be-null", + ), + ], +) +def test_to_nullable(row, strings_can_be_null, expected_output): + property_types = {"id": "string", "name": "string", "age": "integer", "is_cool": "boolean"} + null_values = {"null"} + nulled_row = CsvParser._to_nullable(row, property_types, null_values, strings_can_be_null) + assert nulled_row == expected_output + + _DEFAULT_TRUE_VALUES = {"1", "yes", "yeah", "right"} _DEFAULT_FALSE_VALUES = {"0", "no", "nop", "wrong"} @@ -176,11 +232,11 @@ def __init__(self) -> None: self._prefixed_rows: List[str] = [] self._data: List[str] = [] - def with_prefixed_rows(self, rows: List[str]) -> 'CsvFileBuilder': + def with_prefixed_rows(self, rows: List[str]) -> "CsvFileBuilder": self._prefixed_rows = rows return self - def with_data(self, data: List[str]) -> 'CsvFileBuilder': + def with_data(self, data: List[str]) -> "CsvFileBuilder": self._data = data return self @@ -204,11 +260,18 @@ def setUp(self) -> None: def test_given_skip_rows_when_read_data_then_do_not_considered_prefixed_rows(self) -> None: self._config_format.skip_rows_before_header = 2 - self._stream_reader.open_file.return_value = CsvFileBuilder().with_prefixed_rows(["first line", "second line"]).with_data([ - "header", - "a value", - "another value", - ]).build() + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_prefixed_rows(["first line", "second line"]) + .with_data( + [ + "header", + "a value", + "another value", + ] + ) + .build() + ) data_generator = self._read_data() @@ -216,9 +279,7 @@ def test_given_skip_rows_when_read_data_then_do_not_considered_prefixed_rows(sel def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None: self._config_format.autogenerate_column_names = True - self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ - '0,1,2,3,4,5,6' - ]).build() + self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3,4,5,6"]).build() data_generator = self._read_data() @@ -226,26 +287,38 @@ def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_f def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None: self._config_format.skip_rows_after_header = 1 - self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ - "header1,header2", - "skipped row: important that the is no comma in this string to test if columns do not match in skipped rows", - "a value 1,a value 2", - "another value 1,another value 2" - ]).build() + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + "skipped row: important that the is no comma in this string to test if columns do not match in skipped rows", + "a value 1,a value 2", + "another value 1,another value 2", + ] + ) + .build() + ) data_generator = self._read_data() assert list(data_generator) == [ {"header1": "a value 1", "header2": "a value 2"}, - {"header1": "another value 1", "header2": "another value 2"} + {"header1": "another value 1", "header2": "another value 2"}, ] def test_given_quote_delimiter_when_read_data_then_parse_properly(self) -> None: self._config_format.delimiter = "|" - self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ - "header1|header2", - "a value 1|a value 2", - ]).build() + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1|header2", + "a value 1|a value 2", + ] + ) + .build() + ) data_generator = self._read_data() @@ -253,10 +326,16 @@ def test_given_quote_delimiter_when_read_data_then_parse_properly(self) -> None: def test_given_quote_char_when_read_data_then_parse_properly(self) -> None: self._config_format.quote_char = "|" - self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ - "header1,header2", - "|a,value,1|,|a,value,2|", - ]).build() + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + "|a,value,1|,|a,value,2|", + ] + ) + .build() + ) data_generator = self._read_data() @@ -264,10 +343,16 @@ def test_given_quote_char_when_read_data_then_parse_properly(self) -> None: def test_given_escape_char_when_read_data_then_parse_properly(self) -> None: self._config_format.escape_char = "|" - self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ - "header1,header2", - '"a |"value|", 1",a value 2', - ]).build() + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + '"a |"value|", 1",a value 2', + ] + ) + .build() + ) data_generator = self._read_data() @@ -275,10 +360,16 @@ def test_given_escape_char_when_read_data_then_parse_properly(self) -> None: def test_given_double_quote_on_when_read_data_then_parse_properly(self) -> None: self._config_format.double_quote = True - self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ - "header1,header2", - '1,"Text with doublequote: ""This is a text."""', - ]).build() + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + '1,"Text with doublequote: ""This is a text."""', + ] + ) + .build() + ) data_generator = self._read_data() @@ -286,21 +377,33 @@ def test_given_double_quote_on_when_read_data_then_parse_properly(self) -> None: def test_given_double_quote_off_when_read_data_then_parse_properly(self) -> None: self._config_format.double_quote = False - self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ - "header1,header2", - '1,"Text with doublequote: ""This is a text."""', - ]).build() + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2", + '1,"Text with doublequote: ""This is a text."""', + ] + ) + .build() + ) data_generator = self._read_data() assert list(data_generator) == [{"header1": "1", "header2": 'Text with doublequote: "This is a text."""'}] def test_given_generator_closed_when_read_data_then_unregister_dialect(self) -> None: - self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ - "header", - "a value", - "another value", - ]).build() + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header", + "a value", + "another value", + ] + ) + .build() + ) data_generator = self._read_data() next(data_generator) @@ -309,11 +412,17 @@ def test_given_generator_closed_when_read_data_then_unregister_dialect(self) -> assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() def test_given_too_many_values_for_columns_when_read_data_then_raise_exception_and_unregister_dialect(self) -> None: - self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ - "header", - "a value", - "too many values,value,value,value", - ]).build() + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header", + "a value", + "too many values,value,value,value", + ] + ) + .build() + ) data_generator = self._read_data() next(data_generator) @@ -324,11 +433,17 @@ def test_given_too_many_values_for_columns_when_read_data_then_raise_exception_a assert f"{self._CONFIG_NAME}_config_dialect" not in csv.list_dialects() def test_given_too_few_values_for_columns_when_read_data_then_raise_exception_and_unregister_dialect(self) -> None: - self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([ - "header1,header2,header3", - "value1,value2,value3", - "a value", - ]).build() + self._stream_reader.open_file.return_value = ( + CsvFileBuilder() + .with_data( + [ + "header1,header2,header3", + "value1,value2,value3", + "a value", + ] + ) + .build() + ) data_generator = self._read_data() next(data_generator) @@ -357,28 +472,27 @@ def test_encoding_is_passed_to_stream_reader() -> None: mock_obj.__enter__ = Mock(return_value=io.StringIO("c1,c2\nv1,v2")) mock_obj.__exit__ = Mock(return_value=None) file = RemoteFile(uri="s3://bucket/key.csv", last_modified=datetime.now()) - config = FileBasedStreamConfig( - name="test", - validation_policy="Emit Record", - file_type="csv", - format=CsvFormat(encoding=encoding) - ) + config = FileBasedStreamConfig(name="test", validation_policy="Emit Record", file_type="csv", format=CsvFormat(encoding=encoding)) list(parser.parse_records(config, file, stream_reader, logger, {"properties": {"c1": {"type": "string"}, "c2": {"type": "string"}}})) - stream_reader.open_file.assert_has_calls([ - mock.call(file, FileReadMode.READ, encoding, logger), - mock.call().__enter__(), - mock.call().__exit__(None, None, None), - ]) + stream_reader.open_file.assert_has_calls( + [ + mock.call(file, FileReadMode.READ, encoding, logger), + mock.call().__enter__(), + mock.call().__exit__(None, None, None), + ] + ) mock_obj.__enter__ = Mock(return_value=io.StringIO("c1,c2\nv1,v2")) loop = asyncio.get_event_loop() loop.run_until_complete(parser.infer_schema(config, file, stream_reader, logger)) stream_reader.open_file.assert_called_with(file, FileReadMode.READ, encoding, logger) - stream_reader.open_file.assert_has_calls([ - mock.call(file, FileReadMode.READ, encoding, logger), - mock.call().__enter__(), - mock.call().__exit__(None, None, None), - mock.call(file, FileReadMode.READ, encoding, logger), - mock.call().__enter__(), - mock.call().__exit__(None, None, None), - ]) + stream_reader.open_file.assert_has_calls( + [ + mock.call(file, FileReadMode.READ, encoding, logger), + mock.call().__enter__(), + mock.call().__exit__(None, None, None), + mock.call(file, FileReadMode.READ, encoding, logger), + mock.call().__enter__(), + mock.call().__exit__(None, None, None), + ] + ) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 4548b6ca5f04e..947c79c6da508 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -43,58 +43,58 @@ "description": "Used during spec; allows the developer to configure the cloud provider specific options\nthat are needed when users configure a file-based source.", "type": "object", "properties": { + "start_date": { + "title": "Start Date", + "description": "UTC date and time in the format 2017-01-25T00:00:00.000000Z. Any file modified before this date will not be replicated.", + "examples": ["2021-01-01T00:00:00.000000Z"], + "format": "date-time", + "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$", + "pattern_descriptor": "YYYY-MM-DDTHH:mm:ss.SSSSSSZ", + "order": 1, + "type": "string", + }, "streams": { "title": "The list of streams to sync", - "description": "Each instance of this configuration defines a stream. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.", + "description": 'Each instance of this configuration defines a stream. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.', "order": 10, "type": "array", "items": { "title": "FileBasedStreamConfig", "type": "object", "properties": { - "name": { - "title": "Name", - "description": "The name of the stream.", - "type": "string" - }, + "name": {"title": "Name", "description": "The name of the stream.", "type": "string"}, "file_type": { "title": "File Type", "description": "The data file type that is being extracted for a stream.", - "type": "string" + "type": "string", }, "globs": { "title": "Globs", - "description": "The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look here.", + "description": 'The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look here.', "type": "array", - "items": { - "type": "string" - } + "items": {"type": "string"}, }, "validation_policy": { "title": "Validation Policy", "description": "The name of the validation policy that dictates sync behavior when a record does not adhere to the stream schema.", "default": "Emit Record", - "enum": [ - "Emit Record", - "Skip Record", - "Wait for Discover", - ], + "enum": ["Emit Record", "Skip Record", "Wait for Discover"], }, "input_schema": { "title": "Input Schema", "description": "The schema that will be used to validate records extracted from the file. This will override the stream schema that is auto-detected from incoming files.", - "type": "string" + "type": "string", }, "primary_key": { "title": "Primary Key", "description": "The column or columns (for a composite key) that serves as the unique identifier of a record.", - "type": "string" + "type": "string", }, "days_to_sync_if_history_is_full": { "title": "Days To Sync If History Is Full", "description": "When the state history of the file store is full, syncs will only read files that were last modified in the provided day range.", "default": 3, - "type": "integer" + "type": "integer", }, "format": { "title": "Format", @@ -105,160 +105,118 @@ "title": "Avro Format", "type": "object", "properties": { - "filetype": { - "title": "Filetype", - "default": "avro", - "enum": [ - "avro" - ], - "type": "string" - }, + "filetype": {"title": "Filetype", "default": "avro", "enum": ["avro"], "type": "string"}, "double_as_string": { "title": "Convert Double Fields to Strings", "description": "Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss precision when handling floating point numbers.", "default": False, - "type": "boolean" - } - } + "type": "boolean", + }, + }, }, { "title": "CSV Format", "type": "object", "properties": { - "filetype": { - "title": "Filetype", - "default": "csv", - "enum": [ - "csv" - ], - "type": "string" - }, - "inference_type": { - "default": "None", - "description": "How to infer the types of the columns. If none, inference default to strings.", - "title": "Inference Type", - "enum": [ - "None", - "Primitive Types Only", - ] - }, + "filetype": {"title": "Filetype", "default": "csv", "enum": ["csv"], "type": "string"}, "delimiter": { "title": "Delimiter", "description": "The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.", "default": ",", - "type": "string" + "type": "string", }, "quote_char": { "title": "Quote Character", "description": "The character used for quoting CSV values. To disallow quoting, make this field blank.", - "default": "\"", - "type": "string" + "default": '"', + "type": "string", }, "escape_char": { "title": "Escape Character", "description": "The character used for escaping special characters. To disallow escaping, leave this field blank.", - "type": "string" + "type": "string", }, "encoding": { "title": "Encoding", - "description": "The character encoding of the CSV data. Leave blank to default to UTF8. See list of python encodings for allowable options.", + "description": 'The character encoding of the CSV data. Leave blank to default to UTF8. See list of python encodings for allowable options.', "default": "utf8", - "type": "string" + "type": "string", }, "double_quote": { "title": "Double Quote", "description": "Whether two quotes in a quoted CSV value denote a single quote in the data.", "default": True, - "type": "boolean" + "type": "boolean", }, "quoting_behavior": { "title": "Quoting Behavior", "description": "The quoting behavior determines when a value in a row should have quote marks added around it. For example, if Quote Non-numeric is specified, while reading, quotes are expected for row values that do not contain numbers. Or for Quote All, every row value will be expecting quotes.", "default": "Quote Special Characters", - "enum": [ - "Quote All", - "Quote Special Characters", - "Quote Non-numeric", - "Quote None" - ] + "enum": ["Quote All", "Quote Special Characters", "Quote Non-numeric", "Quote None"], }, "null_values": { "title": "Null Values", "description": "A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.", "default": [], "type": "array", - "items": { - "type": "string" - }, - "uniqueItems": True + "items": {"type": "string"}, + "uniqueItems": True, + }, + "strings_can_be_null": { + "title": "Strings Can Be Null", + "description": "Whether strings can be interpreted as null values. If true, strings that match the null_values set will be interpreted as null. If false, strings that match the null_values set will be interpreted as the string itself.", + "default": True, + "type": "boolean", }, "skip_rows_before_header": { "title": "Skip Rows Before Header", "description": "The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.", "default": 0, - "type": "integer" + "type": "integer", }, "skip_rows_after_header": { "title": "Skip Rows After Header", "description": "The number of rows to skip after the header row.", "default": 0, - "type": "integer" + "type": "integer", }, "autogenerate_column_names": { "title": "Autogenerate Column Names", "description": "Whether to autogenerate column names if column_names is empty. If true, column names will be of the form \u201cf0\u201d, \u201cf1\u201d\u2026 If false, column names will be read from the first CSV row after skip_rows_before_header.", "default": False, - "type": "boolean" + "type": "boolean", }, "true_values": { "title": "True Values", "description": "A set of case-sensitive strings that should be interpreted as true values.", - "default": [ - "y", - "yes", - "t", - "true", - "on", - "1" - ], + "default": ["y", "yes", "t", "true", "on", "1"], "type": "array", - "items": { - "type": "string" - }, - "uniqueItems": True + "items": {"type": "string"}, + "uniqueItems": True, }, "false_values": { "title": "False Values", "description": "A set of case-sensitive strings that should be interpreted as false values.", - "default": [ - "n", - "no", - "f", - "false", - "off", - "0" - ], + "default": ["n", "no", "f", "false", "off", "0"], "type": "array", - "items": { - "type": "string" - }, - "uniqueItems": True + "items": {"type": "string"}, + "uniqueItems": True, }, - } + "inference_type": { + "title": "Inference Type", + "description": "How to infer the types of the columns. If none, inference default to strings.", + "default": "None", + "airbyte_hidden": True, + "enum": ["None", "Primitive Types Only"], + }, + }, }, { "title": "Jsonl Format", "type": "object", "properties": { - "filetype": { - "title": "Filetype", - "default": "jsonl", - "enum": [ - "jsonl" - ], - "type": "string" - } - } + "filetype": {"title": "Filetype", "default": "jsonl", "enum": ["jsonl"], "type": "string"} + }, }, { "title": "Parquet Format", @@ -267,51 +225,32 @@ "filetype": { "title": "Filetype", "default": "parquet", - "enum": [ - "parquet" - ], - "type": "string" + "enum": ["parquet"], + "type": "string", }, "decimal_as_float": { "title": "Convert Decimal Fields to Floats", "description": "Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.", "default": False, - "type": "boolean" - } - } - } - ] + "type": "boolean", + }, + }, + }, + ], }, "schemaless": { "title": "Schemaless", "description": "When enabled, syncs will not validate or structure records against the stream's schema.", "default": False, - "type": "boolean" - } + "type": "boolean", + }, }, - "required": [ - "name", - "file_type" - ] - } + "required": ["name", "file_type"], + }, }, - "start_date": { - "title": "Start Date", - "description": "UTC date and time in the format 2017-01-25T00:00:00.000000Z. Any file modified before this date will not be replicated.", - "examples": [ - "2021-01-01T00:00:00.000000Z" - ], - "format": "date-time", - "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$", - "pattern_descriptor": "YYYY-MM-DDTHH:mm:ss.SSSSSSZ", - "order": 1, - "type": "string" - } }, - "required": [ - "streams" - ] - } + "required": ["streams"], + }, } ) .set_expected_catalog( @@ -944,13 +883,7 @@ "file_type": "csv", "globs": ["*.csv"], "validation_policy": "Emit Record", - "format": { - "filetype": "csv", - "delimiter": "#", - "escape_char": "!", - "double_quote": True, - "newlines_in_values": False - }, + "format": {"filetype": "csv", "delimiter": "#", "escape_char": "!", "double_quote": True, "newlines_in_values": False}, }, { "name": "stream2", @@ -1055,7 +988,11 @@ "stream": "stream1", }, { - "data": {"col3": "val @@@@ 13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, + "data": { + "col3": "val @@@@ 13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, "stream": "stream1", }, { @@ -1063,7 +1000,11 @@ "stream": "stream1", }, { - "data": {"col3": "val @@ 13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, + "data": { + "col3": "val @@ 13b", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "b.csv", + }, "stream": "stream2", }, { @@ -1525,10 +1466,10 @@ "format": { "filetype": "csv", "null_values": ["null"], - } + }, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( @@ -1551,18 +1492,80 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": "string" - }, - "col2": { - "type": "string" - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": "string"}, + "col2": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, + }, + }, + "name": "stream1", + "source_defined_cursor": True, + "supported_sync_modes": ["full_refresh", "incremental"], + } + ] + } + ) + .set_expected_records( + [ + { + "data": { + "col1": "2", + "col2": None, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, + ] + ) +).build() + +csv_string_are_not_null_if_strings_can_be_null_is_false_scenario = ( + TestScenarioBuilder() + .set_name("csv_string_are_not_null_if_strings_can_be_null_is_false") + .set_config( + { + "streams": [ + { + "name": "stream1", + "file_type": "csv", + "globs": ["*"], + "validation_policy": "Emit Record", + "input_schema": '{"col1": "string", "col2": "string"}', + "format": { + "filetype": "csv", + "null_values": ["null"], + "strings_can_be_null": False, + }, + } + ], + "start_date": "2023-06-04T03:54:07.000000Z", + } + ) + .set_files( + { + "a.csv": { + "contents": [ + ("col1", "col2"), + ("2", "null"), + ], + "last_modified": "2023-06-05T03:54:07.000000Z", + } + } + ) + .set_file_type("csv") + .set_expected_catalog( + { + "streams": [ + { + "default_cursor_field": ["_ab_source_file_last_modified"], + "json_schema": { + "type": "object", + "properties": { + "col1": {"type": "string"}, + "col2": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -1574,8 +1577,15 @@ ) .set_expected_records( [ - {"data": {"col1": "2", "col2": None, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": "2", + "col2": "null", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -1593,10 +1603,10 @@ "validation_policy": "Emit Record", "format": { "filetype": "csv", - } + }, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( @@ -1619,18 +1629,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -1642,8 +1644,15 @@ ) .set_expected_records( [ - {"data": {"col1": "2", "col2": "null", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": "2", + "col2": "null", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -1659,13 +1668,10 @@ "file_type": "csv", "globs": ["*"], "validation_policy": "Emit Record", - "format": { - "filetype": "csv", - "null_values": ["null"] - } + "format": {"filetype": "csv", "null_values": ["null"]}, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( @@ -1688,18 +1694,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -1711,8 +1709,15 @@ ) .set_expected_records( [ - {"data": {"col1": "2", "col2": None, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": "2", + "col2": None, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -1728,13 +1733,10 @@ "file_type": "csv", "globs": ["*"], "validation_policy": "Emit Record", - "format": { - "filetype": "csv", - "quoting_behavior": "Quote All" - } + "format": {"filetype": "csv", "quoting_behavior": "Quote All"}, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( @@ -1757,18 +1759,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -1780,8 +1774,15 @@ ) .set_expected_records( [ - {"data": {"col1": "2", "col2": 'val\n2', "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": "2", + "col2": "val\n2", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -1799,18 +1800,18 @@ "validation_policy": "Emit Record", "format": { "filetype": "csv", - } + }, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( { "a.csv": { "contents": [ - '''col1,col2''', - '''2,val\n2''', + """col1,col2""", + """2,val\n2""", ], "last_modified": "2023-06-05T03:54:07.000Z", } @@ -1825,18 +1826,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -1849,16 +1842,27 @@ .set_expected_records( [ # Note that the value for col2 is truncated to "val" because the newline is not escaped - {"data": {"col1": "2", "col2": 'val', "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": "2", + "col2": "val", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) - .set_expected_logs({"read": [ + .set_expected_logs( { - "level": "ERROR", - "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a.csv line_no=2 n_skipped=0", + "read": [ + { + "level": "ERROR", + "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a.csv line_no=2 n_skipped=0", + } + ] } - ]}) + ) .set_expected_discover_error(SchemaInferenceError, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) ).build() @@ -1880,17 +1884,17 @@ "delimiter": ",", "escape_char": "\\", "quoting_behavior": "Quote All", - } + }, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( { "a.csv": { "contents": [ - '''col1,col2''', + """col1,col2""", '''val11,"val\\"2"''', ], "last_modified": "2023-06-05T03:54:07.000Z", @@ -1906,18 +1910,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -1929,8 +1925,15 @@ ) .set_expected_records( [ - {"data": {"col1": 'val11', "col2": 'val"2', "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": "val11", + "col2": 'val"2', + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -1953,17 +1956,17 @@ "quote_char": '"', "delimiter": ",", "quoting_behavior": "Quote All", - } + }, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( { "a.csv": { "contents": [ - '''col1,col2''', + """col1,col2""", '''val11,"val""2"''', ], "last_modified": "2023-06-05T03:54:07.000Z", @@ -1979,18 +1982,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -2002,8 +1997,15 @@ ) .set_expected_records( [ - {"data": {"col1": 'val11', "col2": 'val"2', "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": "val11", + "col2": 'val"2', + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -2020,24 +2022,18 @@ "file_type": "csv", "globs": ["*"], "validation_policy": "Emit Record", - "format": { - "filetype": "csv", - "double_quotes": True, - "quote_char": '@', - "delimiter": "|", - "escape_char": "+" - } + "format": {"filetype": "csv", "double_quotes": True, "quote_char": "@", "delimiter": "|", "escape_char": "+"}, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( { "a.csv": { "contents": [ - '''col1|col2''', - '''val"1,1|val+|2''', + """col1|col2""", + """val"1,1|val+|2""", ], "last_modified": "2023-06-05T03:54:07.000Z", } @@ -2052,18 +2048,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -2075,8 +2063,15 @@ ) .set_expected_records( [ - {"data": {"col1": 'val"1,1', "col2": 'val|2', "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": 'val"1,1', + "col2": "val|2", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -2096,20 +2091,20 @@ "format": { "filetype": "csv", "double_quotes": True, - "quote_char": '@', + "quote_char": "@", "delimiter": "|", - } + }, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( { "a.csv": { "contents": [ - '''col1|col2''', - '''val"1,1|@val|2@''', + """col1|col2""", + """val"1,1|@val|2@""", ], "last_modified": "2023-06-05T03:54:07.000Z", } @@ -2124,18 +2119,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -2147,8 +2134,15 @@ ) .set_expected_records( [ - {"data": {"col1": 'val"1,1', "col2": 'val|2', "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": 'val"1,1', + "col2": "val|2", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -2165,13 +2159,10 @@ "file_type": "csv", "globs": ["*"], "validation_policy": "Emit Record", - "format": { - "filetype": "csv", - "skip_rows_before_header": 2 - } + "format": {"filetype": "csv", "skip_rows_before_header": 2}, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( @@ -2196,18 +2187,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -2219,8 +2202,15 @@ ) .set_expected_records( [ - {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -2236,13 +2226,10 @@ "file_type": "csv", "globs": ["*"], "validation_policy": "Emit Record", - "format": { - "filetype": "csv", - "skip_rows_after_header": 2 - } + "format": {"filetype": "csv", "skip_rows_after_header": 2}, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( @@ -2267,18 +2254,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -2290,8 +2269,15 @@ ) .set_expected_records( [ - {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -2312,10 +2298,10 @@ "filetype": "csv", "skip_rows_before_header": 1, "skip_rows_after_header": 1, - } + }, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( @@ -2340,18 +2326,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": ["null", "string"] - }, - "col2": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": ["null", "string"]}, + "col2": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -2363,8 +2341,15 @@ ) .set_expected_records( [ - {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": "val11", + "col2": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -2383,10 +2368,10 @@ "format": { "filetype": "csv", "autogenerate_column_names": True, - } + }, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( @@ -2408,18 +2393,10 @@ "json_schema": { "type": "object", "properties": { - "f0": { - "type": ["null", "string"] - }, - "f1": { - "type": ["null", "string"] - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "f0": {"type": ["null", "string"]}, + "f1": {"type": ["null", "string"]}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -2431,8 +2408,15 @@ ) .set_expected_records( [ - {"data": {"f0": "val11", "f1": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "f0": "val11", + "f1": "val12", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -2453,10 +2437,10 @@ "filetype": "csv", "true_values": ["this_is_true"], "false_values": ["this_is_false"], - } + }, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( @@ -2479,18 +2463,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": "boolean" - }, - "col2": { - "type": "boolean" - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": "boolean"}, + "col2": {"type": "boolean"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -2502,8 +2478,15 @@ ) .set_expected_records( [ - {"data": {"col1": True, "col2": False, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": True, + "col2": False, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() @@ -2523,10 +2506,10 @@ "format": { "filetype": "csv", "null_values": ["null"], - } + }, } ], - "start_date": "2023-06-04T03:54:07.000000Z" + "start_date": "2023-06-04T03:54:07.000000Z", } ) .set_files( @@ -2549,18 +2532,10 @@ "json_schema": { "type": "object", "properties": { - "col1": { - "type": "boolean" - }, - "col2": { - "type": "string" - }, - "_ab_source_file_last_modified": { - "type": "string" - }, - "_ab_source_file_url": { - "type": "string" - }, + "col1": {"type": "boolean"}, + "col2": {"type": "string"}, + "_ab_source_file_last_modified": {"type": "string"}, + "_ab_source_file_url": {"type": "string"}, }, }, "name": "stream1", @@ -2572,8 +2547,15 @@ ) .set_expected_records( [ - {"data": {"col1": None, "col2": "na", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", - "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + { + "data": { + "col1": None, + "col2": "na", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", + "_ab_source_file_url": "a.csv", + }, + "stream": "stream1", + }, ] ) ).build() diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py index b059ba5822c7c..0df75ff3c50f8 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py @@ -50,6 +50,7 @@ csv_skip_after_header_scenario, csv_skip_before_and_after_header_scenario, csv_skip_before_header_scenario, + csv_string_are_not_null_if_strings_can_be_null_is_false_scenario, csv_string_can_be_null_with_input_schemas_scenario, csv_string_not_null_if_no_null_values_scenario, csv_strings_can_be_null_not_quoted_scenario, @@ -177,6 +178,7 @@ schemaless_jsonl_scenario, schemaless_jsonl_multi_stream_scenario, csv_string_can_be_null_with_input_schemas_scenario, + csv_string_are_not_null_if_strings_can_be_null_is_false_scenario, csv_string_not_null_if_no_null_values_scenario, csv_strings_can_be_null_not_quoted_scenario, csv_newline_in_values_quoted_value_scenario,