-
Notifications
You must be signed in to change notification settings - Fork 4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add SourceFile integration #716
Changes from 22 commits
04acf12
924506e
2dfaa28
e4bc915
9eb53f8
59b1b47
d2f2d6f
e5f5aad
0dfb77d
62c50a5
d761e76
5957ab5
e26189b
863d3c9
c6381b2
1843141
1769521
fbecf0e
53e6b10
99160ed
203d080
c32625d
3c2a60e
68c1f16
068c684
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"sourceId": "778daa7c-feaf-4db6-96f3-70fd645acc77", | ||
"name": "File", | ||
"dockerRepository": "airbyte/source-file", | ||
"dockerImageTag": "0.1.0", | ||
"documentationUrl": "https://hub.docker.com/r/airbyte/source-file" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../bases/base-python-test/base_python_test |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
build | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
FROM airbyte/integration-base-python:dev

# NOTE(review): jq/curl/bash presumably needed by the base image's entrypoint
# scripts — confirm before removing.
RUN apt-get update && apt-get install -y jq curl bash && rm -rf /var/lib/apt/lists/*

# Tell the base image which python package/class implements the connector.
ENV CODE_PATH="source_file"
ENV AIRBYTE_IMPL_MODULE="source_file"
ENV AIRBYTE_IMPL_PATH="SourceFile"

WORKDIR /airbyte/integration_code
COPY $CODE_PATH ./$CODE_PATH
COPY setup.py ./
# Install the connector with its "main" extras only (no test deps in the image).
RUN pip install ".[main]"

LABEL io.airbyte.version=0.1.0
LABEL io.airbyte.name=airbyte/source-file
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
FROM airbyte/base-python-test:dev

RUN apt-get update && rm -rf /var/lib/apt/lists/*

# Entry-point configuration consumed by the base test image: which module and
# class hold the standard test suite (see integration_tests/__init__.py).
ENV CODE_PATH="integration_tests"
ENV AIRBYTE_TEST_MODULE="integration_tests"
ENV AIRBYTE_TEST_PATH="SourceFileStandardTest"
ENV AIRBYTE_TEST_CASE=true

LABEL io.airbyte.version=0.1.0
LABEL io.airbyte.name=airbyte/source-file-standard-test

WORKDIR /airbyte/integration_code
# Copy the connector code plus the tests; secrets and the connector's JSON
# sample/config files are copied INTO the test package so tests can reach
# them by relative path at runtime.
COPY source_file source_file
COPY $CODE_PATH $CODE_PATH
COPY secrets $CODE_PATH
COPY source_file/*.json $CODE_PATH
COPY setup.py ./

RUN pip install ".[integration_tests]"

WORKDIR /airbyte
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Testing Source File | ||
|
||
This integration test suite exercises the File source connector against local files and remote cloud storage (GCS and S3). | ||
|
||
## Necessary Credentials for tests | ||
|
||
In order to run integration tests in this connector, you need to: | ||
1. Testing Google Cloud Service Storage | ||
1. Download and store your Google [Service Account](https://console.cloud.google.com/iam-admin/serviceaccounts) JSON file in `secrets/gcs.json`, it should look something like this: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe i missed it, but how does gcs.json get imported into config.json? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's used in: The content of the JSON is copied into the configuration as a string. Then in the source.py of this connector, we either are able to manipulate the DICT object directly once we parse that string or have to produce a temporary file with the content of the json (depending on the google API we are using) |
||
``` | ||
{ | ||
"type": "service_account", | ||
"project_id": "XXXXXXX", | ||
"private_key_id": "XXXXXXXX", | ||
"private_key": "-----BEGIN PRIVATE KEY-----\nXXXXXXXXXX\n-----END PRIVATE KEY-----\n", | ||
"client_email": "XXXXX@XXXXXX.iam.gserviceaccount.com", | ||
"client_id": "XXXXXXXXX", | ||
"auth_uri": "https://accounts.google.com/o/oauth2/auth", | ||
"token_uri": "https://oauth2.googleapis.com/token", | ||
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", | ||
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/XXXXXXX0XXXXXX.iam.gserviceaccount.com" | ||
} | ||
|
||
``` | ||
1. Your Service Account should have [Storage Admin Rights](https://console.cloud.google.com/iam-admin/iam) (to create Buckets, read and store files in GCS) | ||
|
||
1. Testing Amazon S3 | ||
1. Create a file at `secrets/aws.json` | ||
``` | ||
{ | ||
"aws_access_key_id": "XXXXXXX", | ||
"aws_secret_access_key": "XXXXXXX" | ||
} | ||
``` | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../bases/airbyte-protocol/airbyte_protocol |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../bases/base-python/base_python |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../bases/base-python-test/base_python_test |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
// Gradle wiring for the source-file connector: reuses the shared build logic
// for python packaging, docker image builds and the standard source tests.
project.ext.pyModule = 'source_file'
apply from: rootProject.file('tools/gradle/commons/integrations/python.gradle')
apply from: rootProject.file('tools/gradle/commons/integrations/image.gradle')
apply from: rootProject.file('tools/gradle/commons/integrations/test-image.gradle')
apply from: rootProject.file('tools/gradle/commons/integrations/integration-test.gradle')
apply from: rootProject.file('tools/gradle/commons/integrations/standard-source-test-python.gradle')

// The standard source test runs the connector image against the test image;
// both tags are derived from the Dockerfiles in this directory.
standardSourceTestPython {
    ext {
        imageName = "${extractImageName(project.file('Dockerfile'))}:dev"
        pythonContainerName = "${extractImageName(project.file('Dockerfile.test'))}:dev"
    }
}

// Unit tests are driven by setup.py (see the `tests_require` extras there).
task unitTest(type: PythonTask){
    command = "setup.py test"
}

build.dependsOn(unitTest)
build.dependsOn ':airbyte-integrations:bases:base-python-test:build'
buildImage.dependsOn ':airbyte-integrations:bases:base-python:buildImage'
integrationTest.dependsOn(buildImage)

buildTestImage.dependsOn ':airbyte-integrations:bases:base-python-test:buildImage'
standardSourceTestPython.dependsOn(buildTestImage)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
""" | ||
MIT License | ||
|
||
Copyright (c) 2020 Airbyte | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. | ||
""" | ||
|
||
from .integration_source_test import TestSourceFile | ||
from .standard_source_test import SourceFileStandardTest | ||
|
||
__all__ = ["SourceFileStandardTest", "TestSourceFile"] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
{ | ||
"streams": [ | ||
{ | ||
"name": "my_own_data_sample/my_file.csv", | ||
"json_schema": { | ||
"$schema": "http://json-schema.org/draft-07/schema#", | ||
"type": "object", | ||
"properties": { | ||
"date": { | ||
"type": "string" | ||
}, | ||
"key": { | ||
"type": "string" | ||
}, | ||
"total_confirmed": { | ||
"type": "number" | ||
}, | ||
"total_healed": { | ||
"type": "number" | ||
}, | ||
"total_deceased": { | ||
"type": "number" | ||
}, | ||
"total_recovered": { | ||
"type": "number" | ||
}, | ||
"total_tested": { | ||
"type": "number" | ||
} | ||
} | ||
} | ||
} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"format": "csv", | ||
"reader_options": "{\"sep\": \",\", \"nrows\": 20}", | ||
"storage": "https://", | ||
"url": "storage.googleapis.com/covid19-open-data/v2/latest/epidemiology.csv", | ||
"reader_impl": "gcsfs" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
""" | ||
MIT License | ||
|
||
Copyright (c) 2020 Airbyte | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. | ||
""" | ||
|
||
import json | ||
import os | ||
import tempfile | ||
import uuid | ||
|
||
import boto3 | ||
import pytest | ||
from base_python import AirbyteLogger | ||
from botocore.errorfactory import ClientError | ||
from google.api_core.exceptions import Conflict | ||
from google.cloud import storage | ||
from source_file import SourceFile | ||
|
||
|
||
class TestSourceFile(object):
    """Integration tests for SourceFile against local, GCS and S3 storage.

    Credential files are expected one directory above the test package
    (see Dockerfile.test, which copies `secrets/` next to the test code).
    """

    # Path to the GCS service-account JSON (Storage Admin rights required).
    service_account_file: str = "../secrets/gcs.json"
    # Path to the AWS credentials JSON ({"aws_access_key_id", "aws_secret_access_key"}).
    aws_credentials: str = "../secrets/aws.json"
    # Base name for temporary cloud buckets created by the fixtures.
    cloud_bucket_name: str = "airbytetestbucket"

    @pytest.fixture(scope="class")
    def download_gcs_public_data(self):
        """Download a public CSV dataset over HTTPS into a local temp file.

        Yields the temp file path; the file is removed on teardown.
        """
        print("\nDownload public dataset from gcs to local /tmp")
        config = get_config()
        config["storage"] = "https://"
        config["url"] = "storage.googleapis.com/covid19-open-data/v2/latest/epidemiology.csv"
        df = run_load_dataframes(config)
        # delete=False so the file survives until the explicit teardown below.
        tmp_file = tempfile.NamedTemporaryFile(delete=False)
        df.to_csv(tmp_file.name, index=False)
        yield tmp_file.name
        os.remove(tmp_file.name)
        print(f"\nLocal File {tmp_file.name} is now deleted")

    @pytest.fixture(scope="class")
    def create_gcs_private_data(self, download_gcs_public_data):
        """Upload the downloaded dataset to a fresh private GCS bucket.

        Yields "<bucket>/myfile.csv"; the bucket is deleted on teardown.
        """
        storage_client = storage.Client.from_service_account_json(self.service_account_file)
        bucket_name = create_unique_gcs_bucket(storage_client, self.cloud_bucket_name)
        print(f"\nUpload dataset to private gcs bucket {bucket_name}")
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob("myfile.csv")
        blob.upload_from_filename(download_gcs_public_data)
        yield f"{bucket_name}/myfile.csv"
        bucket.delete(force=True)
        print(f"\nGCS Bucket {bucket_name} is now deleted")

    @pytest.fixture(scope="class")
    def create_aws_private_data(self, download_gcs_public_data):
        """Upload the downloaded dataset to a private S3 bucket.

        Yields "<bucket>/myfile.csv"; on teardown the bucket is emptied (the
        bucket itself is kept, since it may pre-exist — see head_bucket below).
        """
        with open(self.aws_credentials) as json_file:
            aws_config = json.load(json_file)
        region = "eu-west-3"
        location = {"LocationConstraint": region}
        s3_client = boto3.client(
            "s3",
            aws_access_key_id=aws_config["aws_access_key_id"],
            aws_secret_access_key=aws_config["aws_secret_access_key"],
            region_name=region,
        )
        bucket_name = self.cloud_bucket_name
        print(f"\nUpload dataset to private aws bucket {bucket_name}")
        try:
            s3_client.head_bucket(Bucket=bucket_name)
        except ClientError:
            # Bucket does not exist (or is not accessible): create it.
            s3_client.create_bucket(Bucket=bucket_name, CreateBucketConfiguration=location)
        s3_client.upload_file(download_gcs_public_data, bucket_name, "myfile.csv")
        yield f"{bucket_name}/myfile.csv"
        s3 = boto3.resource(
            "s3", aws_access_key_id=aws_config["aws_access_key_id"], aws_secret_access_key=aws_config["aws_secret_access_key"]
        )
        bucket = s3.Bucket(bucket_name)
        bucket.objects.all().delete()
        # Only the objects are removed; the message now reflects that.
        print(f"\nS3 Bucket {bucket_name} is now emptied")

    @pytest.mark.parametrize(
        "reader_impl, storage, url",
        [
            # Fixed typo: was "gcfs"; the backend is "gcsfs" (matches spec default
            # and the gcs parametrization below).
            ("gcsfs", "https://", "storage.googleapis.com/covid19-open-data/v2/latest/epidemiology.csv"),
            ("smart_open", "https://", "storage.googleapis.com/covid19-open-data/v2/latest/epidemiology.csv"),
            ("smart_open", "file://", "local"),
        ],
    )
    def test_local_data(self, download_gcs_public_data, reader_impl, storage, url):
        """Read a CSV over HTTPS or from the local filesystem."""
        config = get_config()
        config["storage"] = storage
        if url != "local":
            config["url"] = url
        else:
            config["url"] = download_gcs_public_data
        config["reader_impl"] = reader_impl
        run_load_dataframes(config)

    @pytest.mark.parametrize("reader_impl", ["gcsfs", "smart_open"])
    def test_remote_gcs_load(self, create_gcs_private_data, reader_impl):
        """Read the CSV from a private GCS bucket with both reader backends."""
        config = get_config()
        config["storage"] = "gs://"
        config["url"] = create_gcs_private_data
        config["reader_impl"] = reader_impl
        # The service-account JSON is passed inline as a string in the config.
        with open(self.service_account_file) as json_file:
            config["service_account_json"] = json.dumps(json.load(json_file))
        run_load_dataframes(config)

    @pytest.mark.parametrize("reader_impl", ["s3fs", "smart_open"])
    def test_remote_aws_load(self, create_aws_private_data, reader_impl):
        """Read the CSV from a private S3 bucket with both reader backends."""
        config = get_config()
        config["storage"] = "s3://"
        config["url"] = create_aws_private_data
        config["reader_impl"] = reader_impl
        with open(self.aws_credentials) as json_file:
            aws_config = json.load(json_file)
        config["aws_access_key_id"] = aws_config["aws_access_key_id"]
        config["aws_secret_access_key"] = aws_config["aws_secret_access_key"]
        run_load_dataframes(config)
|
||
|
||
def run_load_dataframes(config):
    """Load DataFrames through SourceFile and sanity-check the result.

    Expects exactly one DataFrame of 10 columns x 42 rows (the nrows limit
    set by get_config); returns that DataFrame.
    """
    frames = SourceFile.load_dataframes(config=config, logger=AirbyteLogger(), skip_data=False)
    assert len(frames) == 1  # Properly load 1 DataFrame
    result = frames[0]
    assert len(result.columns) == 10  # DataFrame should have 10 columns
    assert len(result.index) == 42  # DataFrame should have 42 rows of data
    return result
|
||
|
||
def get_config():
    """Return the baseline CSV source configuration used by every test.

    reader_options caps reads at 42 rows, which run_load_dataframes asserts.
    """
    reader_options = '{"sep": ",", "nrows": 42}'
    return {"format": "csv", "reader_options": reader_options}
|
||
|
||
def create_unique_gcs_bucket(storage_client, name: str) -> str:
    """
    Make a unique bucket to which we'll upload the file.
    (GCS buckets are part of a single global namespace.)

    :param storage_client: google.cloud.storage client able to create buckets.
    :param name: prefix for the bucket name; a uuid1 suffix makes it unique.
    :return: the name of the newly created bucket.
    :raises RuntimeError: if no unique name could be claimed in 5 attempts.
    """
    for _ in range(5):
        bucket_name = f"{name}-{uuid.uuid1()}"
        try:
            bucket = storage_client.bucket(bucket_name)
            bucket.storage_class = "STANDARD"
            # fixed locations are cheaper...
            storage_client.create_bucket(bucket, location="us-east1")
            print(f"\nNew GCS bucket created {bucket_name}")
            return bucket_name
        except Conflict:
            print(f"\nError: {bucket_name} already exists!")
    # The original fell through here and implicitly returned None, which the
    # caller would then pass to get_bucket(); fail loudly instead.
    raise RuntimeError(f"Could not create a unique GCS bucket with prefix '{name}' after 5 attempts")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't this be in
STANDARD_SOURCE_DEFINITION
and notSTANDARD_SOURCE
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure i understand your comment...
I don't see folders named
STANDARD_SOURCE_DEFINITION
inairbyte-config/init/src/main/resources/config/
?Where is the
STANDARD_SOURCE_DEFINITION
?