diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index f6aa846fb5605..8ac9c4e90549e 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -271,7 +271,7 @@ - name: File sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77 dockerRepository: airbyte/source-file - dockerImageTag: 0.2.10 + dockerImageTag: 0.2.11 documentationUrl: https://docs.airbyte.io/integrations/sources/file icon: file.svg sourceType: file diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml index 007d03d30f313..8804b33e1ede2 100644 --- a/airbyte-config/init/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_specs.yaml @@ -2261,7 +2261,7 @@ supportsNormalization: false supportsDBT: false supported_destination_sync_modes: [] -- dockerImage: "airbyte/source-file:0.2.10" +- dockerImage: "airbyte/source-file:0.2.11" spec: documentationUrl: "https://docs.airbyte.io/integrations/sources/file" connectionSpecification: @@ -2289,6 +2289,7 @@ - "excel" - "feather" - "parquet" + - "yaml" default: "csv" title: "File Format" description: "The Format of the file which should be replicated (Warning:\ @@ -2300,7 +2301,7 @@ \ chosen file format to provide additional options and tune its behavior." 
examples: - "{}" - - "{'sep': ' '}" + - "{\"sep\": \" \"}" url: type: "string" title: "URL" diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile index 417a749ec53a7..7ebaa1da95040 100644 --- a/airbyte-integrations/connectors/source-file/Dockerfile +++ b/airbyte-integrations/connectors/source-file/Dockerfile @@ -17,5 +17,5 @@ COPY source_file ./source_file ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.10 +LABEL io.airbyte.version=0.2.11 LABEL io.airbyte.name=airbyte/source-file diff --git a/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py b/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py index c5a9b399df0d7..a80808c533c5d 100644 --- a/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py +++ b/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py @@ -31,6 +31,7 @@ def check_read(config, expected_columns=10, expected_rows=42): ("excel", "xlsx", 8, 50, "demo"), ("feather", "feather", 9, 3, "demo"), ("parquet", "parquet", 9, 3, "demo"), + ("yaml", "yaml", 8, 3, "demo"), ], ) def test_local_file_read(file_format, extension, expected_columns, expected_rows, filename): diff --git a/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/yaml/configured_catalog.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/yaml/configured_catalog.json new file mode 100644 index 0000000000000..47fe23c74679b --- /dev/null +++ b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/yaml/configured_catalog.json @@ -0,0 +1,29 @@ +{ + "streams": [ + { + "stream": { + "name": "test", + "json_schema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + 
"$schema": "http://json-schema.org/schema#", + "type": "object", + "properties": { + "name": {"type": "string"}, + "sourceDefinitionId": {"type": "string"}, + "dockerRepository": {"type": "string"}, + "dockerImageTag": {"type": "string"}, + "documentationUrl": {"type": "string"}, + "icon": {"type": "string"}, + "sourceType": {"type": "string"}, + "releaseStage": {"type": "string"} + } + } + } + }, + "sync_mode": "full_refresh", + "destination_sync_mode": "overwrite" + } + ] +} diff --git a/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/yaml/demo.yaml b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/yaml/demo.yaml new file mode 100644 index 0000000000000..1a2c3c23d144c --- /dev/null +++ b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/yaml/demo.yaml @@ -0,0 +1,23 @@ +- name: Facebook Pages + sourceDefinitionId: 010eb12f-837b-4685-892d-0a39f76a98f5 + dockerRepository: airbyte/source-facebook-pages + dockerImageTag: 0.1.6 + documentationUrl: https://docs.airbyte.com/integrations/sources/facebook-pages + icon: facebook.svg + sourceType: api + releaseStage: alpha +- name: Faker + sourceDefinitionId: dfd88b22-b603-4c3d-aad7-3701784586b1 + dockerRepository: airbyte/source-faker + dockerImageTag: 0.1.5 + documentationUrl: https://docs.airbyte.com/integrations/source-faker + sourceType: api + releaseStage: alpha +- name: File + sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77 + dockerRepository: airbyte/source-file + dockerImageTag: 0.2.10 + documentationUrl: https://docs.airbyte.io/integrations/sources/file + icon: file.svg + sourceType: file + releaseStage: alpha \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-file/setup.py b/airbyte-integrations/connectors/source-file/setup.py index 68057573e763f..4693685af987b 100644 --- a/airbyte-integrations/connectors/source-file/setup.py +++ 
b/airbyte-integrations/connectors/source-file/setup.py @@ -10,14 +10,14 @@ "gcsfs==0.7.1", "genson==1.2.2", "google-cloud-storage==1.35.0", - "pandas==1.2.0", + "pandas==1.4.3", "paramiko==2.7.2", "s3fs==0.4.2", "smart-open[all]==4.1.2", "lxml==4.6.5", "html5lib==1.1", "beautifulsoup4==4.9.3", - "pyarrow==3.0.0", + "pyarrow==8.0.0", "xlrd==2.0.1", "openpyxl==3.0.6", "pyxlsb==1.0.8", diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index f82e2937b92c6..481b3f3efb193 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -19,6 +19,7 @@ from genson import SchemaBuilder from google.cloud.storage import Client as GCSClient from google.oauth2 import service_account +from yaml import safe_load class ConfigurationError(Exception): @@ -265,6 +266,10 @@ def load_nested_json(self, fp) -> list: result = [result] return result + def load_yaml(self, fp): + if self._reader_format == "yaml": + return pd.DataFrame(safe_load(fp)) + def load_dataframes(self, fp, skip_data=False) -> Iterable: """load and return the appropriate pandas dataframe. 
@@ -334,6 +339,12 @@ def read(self, fields: Iterable = None) -> Iterable[dict]: with self.reader.open(binary=self.binary_source) as fp: if self._reader_format == "json" or self._reader_format == "jsonl": yield from self.load_nested_json(fp) + elif self._reader_format == "yaml": + fields = set(fields) if fields else None + df = self.load_yaml(fp) + columns = fields.intersection(set(df.columns)) if fields else df.columns + df = df.where(pd.notnull(df), None) + yield from df[columns].to_dict(orient="records") else: fields = set(fields) if fields else None for df in self.load_dataframes(fp): @@ -345,8 +356,10 @@ def _stream_properties(self): with self.reader.open(binary=self.binary_source) as fp: if self._reader_format == "json" or self._reader_format == "jsonl": return self.load_nested_json_schema(fp) - - df_list = self.load_dataframes(fp, skip_data=False) + elif self._reader_format == "yaml": + df_list = [self.load_yaml(fp)] + else: + df_list = self.load_dataframes(fp, skip_data=False) fields = {} for df in df_list: for col in df.columns: diff --git a/airbyte-integrations/connectors/source-file/source_file/spec.json b/airbyte-integrations/connectors/source-file/source_file/spec.json index 834323a659685..777001fe80351 100644 --- a/airbyte-integrations/connectors/source-file/source_file/spec.json +++ b/airbyte-integrations/connectors/source-file/source_file/spec.json @@ -15,7 +15,7 @@ }, "format": { "type": "string", - "enum": ["csv", "json", "jsonl", "excel", "feather", "parquet"], + "enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"], "default": "csv", "title": "File Format", "description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)." 
diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index d6c98e2465b1a..ce4f856d2c14f 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -47,6 +47,7 @@ This source produces a single table for the target file as it replicates only on | Feather | Yes | | Parquet | Yes | | Pickle | No | +| YAML | Yes | **This connector does not support syncing unstructured data files such as raw text, audio, or videos.** @@ -126,6 +127,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | | ------- | ---------- | ------------------------------------------------------ | ------------------------------------------------- | +| 0.2.11 | 2022-07-12 | [14588](https://github.com/airbytehq/airbyte/pull/14588) | Add support for YAML format | | 0.2.9 | 2022-02-01 | [9974](https://github.com/airbytehq/airbyte/pull/9974) | Update airbyte-cdk 0.1.47 | | 0.2.8 | 2021-12-06 | [8524](https://github.com/airbytehq/airbyte/pull/8524) | Update connector fields title/description | | 0.2.7 | 2021-10-28 | [7387](https://github.com/airbytehq/airbyte/pull/7387) | Migrate source to CDK structure, add SAT testing. |