Skip to content

Commit

Permalink
🎉 Add YAML format to source-file reader (#14588)
Browse files Browse the repository at this point in the history
* Add yaml reader

* Update docs

* Bumpversion of connector

* bump docs

* Update pyarrow dependency

* Upgrade pandas dependency

* auto-bump connector version

Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
  • Loading branch information
2 people authored and girarda committed Jul 12, 2022
1 parent f3fc604 commit 2d2ef71
Show file tree
Hide file tree
Showing 10 changed files with 78 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@
- name: File
sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
dockerRepository: airbyte/source-file
dockerImageTag: 0.2.10
dockerImageTag: 0.2.11
documentationUrl: https://docs.airbyte.io/integrations/sources/file
icon: file.svg
sourceType: file
Expand Down
5 changes: 3 additions & 2 deletions airbyte-config/init/src/main/resources/seed/source_specs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2261,7 +2261,7 @@
supportsNormalization: false
supportsDBT: false
supported_destination_sync_modes: []
- dockerImage: "airbyte/source-file:0.2.10"
- dockerImage: "airbyte/source-file:0.2.11"
spec:
documentationUrl: "https://docs.airbyte.io/integrations/sources/file"
connectionSpecification:
Expand Down Expand Up @@ -2289,6 +2289,7 @@
- "excel"
- "feather"
- "parquet"
- "yaml"
default: "csv"
title: "File Format"
description: "The Format of the file which should be replicated (Warning:\
Expand All @@ -2300,7 +2301,7 @@
\ chosen file format to provide additional options and tune its behavior."
examples:
- "{}"
- "{'sep': ' '}"
- "{\"sep\": \" \"}"
url:
type: "string"
title: "URL"
Expand Down
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-file/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ COPY source_file ./source_file
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.2.10
LABEL io.airbyte.version=0.2.11
LABEL io.airbyte.name=airbyte/source-file
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def check_read(config, expected_columns=10, expected_rows=42):
("excel", "xlsx", 8, 50, "demo"),
("feather", "feather", 9, 3, "demo"),
("parquet", "parquet", 9, 3, "demo"),
("yaml", "yaml", 8, 3, "demo"),
],
)
def test_local_file_read(file_format, extension, expected_columns, expected_rows, filename):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"streams": [
{
"stream": {
"name": "test",
"json_schema": {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"$schema": "http://json-schema.org/schema#",
"type": "object",
"properties": {
"name": {"type": "string"},
"sourceDefinitionId": {"type": "string"},
"dockerRepository": {"type": "string"},
"dockerImageTag": {"type": "string"},
"documentationUrl": {"type": "string"},
"icon": {"type": "string"},
"sourceType": {"type": "string"},
"releaseStage": {"type": "string"}
}
}
}
},
"sync_mode": "full_refresh",
"destination_sync_mode": "overwrite"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
- name: Facebook Pages
sourceDefinitionId: 010eb12f-837b-4685-892d-0a39f76a98f5
dockerRepository: airbyte/source-facebook-pages
dockerImageTag: 0.1.6
documentationUrl: https://docs.airbyte.com/integrations/sources/facebook-pages
icon: facebook.svg
sourceType: api
releaseStage: alpha
- name: Faker
sourceDefinitionId: dfd88b22-b603-4c3d-aad7-3701784586b1
dockerRepository: airbyte/source-faker
dockerImageTag: 0.1.5
documentationUrl: https://docs.airbyte.com/integrations/source-faker
sourceType: api
releaseStage: alpha
- name: File
sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
dockerRepository: airbyte/source-file
dockerImageTag: 0.2.10
documentationUrl: https://docs.airbyte.io/integrations/sources/file
icon: file.svg
sourceType: file
releaseStage: alpha
4 changes: 2 additions & 2 deletions airbyte-integrations/connectors/source-file/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
"gcsfs==0.7.1",
"genson==1.2.2",
"google-cloud-storage==1.35.0",
"pandas==1.2.0",
"pandas==1.4.3",
"paramiko==2.7.2",
"s3fs==0.4.2",
"smart-open[all]==4.1.2",
"lxml==4.6.5",
"html5lib==1.1",
"beautifulsoup4==4.9.3",
"pyarrow==3.0.0",
"pyarrow==8.0.0",
"xlrd==2.0.1",
"openpyxl==3.0.6",
"pyxlsb==1.0.8",
Expand Down
17 changes: 15 additions & 2 deletions airbyte-integrations/connectors/source-file/source_file/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from genson import SchemaBuilder
from google.cloud.storage import Client as GCSClient
from google.oauth2 import service_account
from yaml import safe_load


class ConfigurationError(Exception):
Expand Down Expand Up @@ -265,6 +266,10 @@ def load_nested_json(self, fp) -> list:
result = [result]
return result

def load_yaml(self, fp):
    """Parse a YAML document from ``fp`` into a pandas DataFrame.

    The stream is parsed with ``safe_load`` and the parsed object is handed
    directly to the ``pd.DataFrame`` constructor.

    :param fp: open file-like object passed to ``safe_load``.
    :return: a DataFrame built from the parsed YAML, or ``None`` when the
        configured reader format is not ``"yaml"``.
    """
    # Guard clause: any other reader format yields no DataFrame.
    if self._reader_format != "yaml":
        return None
    parsed = safe_load(fp)
    return pd.DataFrame(parsed)

def load_dataframes(self, fp, skip_data=False) -> Iterable:
"""load and return the appropriate pandas dataframe.
Expand Down Expand Up @@ -334,6 +339,12 @@ def read(self, fields: Iterable = None) -> Iterable[dict]:
with self.reader.open(binary=self.binary_source) as fp:
if self._reader_format == "json" or self._reader_format == "jsonl":
yield from self.load_nested_json(fp)
elif self._reader_format == "yaml":
fields = set(fields) if fields else None
df = self.load_yaml(fp)
columns = fields.intersection(set(df.columns)) if fields else df.columns
df = df.where(pd.notnull(df), None)
yield from df[columns].to_dict(orient="records")
else:
fields = set(fields) if fields else None
for df in self.load_dataframes(fp):
Expand All @@ -345,8 +356,10 @@ def _stream_properties(self):
with self.reader.open(binary=self.binary_source) as fp:
if self._reader_format == "json" or self._reader_format == "jsonl":
return self.load_nested_json_schema(fp)

df_list = self.load_dataframes(fp, skip_data=False)
elif self._reader_format == "yaml":
df_list = [self.load_yaml(fp)]
else:
df_list = self.load_dataframes(fp, skip_data=False)
fields = {}
for df in df_list:
for col in df.columns:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
},
"format": {
"type": "string",
"enum": ["csv", "json", "jsonl", "excel", "feather", "parquet"],
"enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"],
"default": "csv",
"title": "File Format",
"description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)."
Expand Down
2 changes: 2 additions & 0 deletions docs/integrations/sources/file.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ This source produces a single table for the target file as it replicates only on
| Feather | Yes |
| Parquet | Yes |
| Pickle | No |
| YAML | Yes |

**This connector does not support syncing unstructured data files such as raw text, audio, or videos.**

Expand Down Expand Up @@ -126,6 +127,7 @@ In order to read large files from a remote location, this connector uses the [sm

| Version | Date | Pull Request | Subject |
| ------- | ---------- | ------------------------------------------------------ | ------------------------------------------------- |
| 0.2.11 | 2022-07-12 | [14588](https://github.com/airbytehq/airbyte/pull/14588) | Add support for YAML format |
| 0.2.9 | 2022-02-01 | [9974](https://github.com/airbytehq/airbyte/pull/9974) | Update airbyte-cdk 0.1.47 |
| 0.2.8 | 2021-12-06 | [8524](https://github.com/airbytehq/airbyte/pull/8524) | Update connector fields title/description |
| 0.2.7 | 2021-10-28 | [7387](https://github.com/airbytehq/airbyte/pull/7387) | Migrate source to CDK structure, add SAT testing. |
Expand Down

0 comments on commit 2d2ef71

Please sign in to comment.