From c14d78ab5455a507eab3409259ca2bbb2a5fbc25 Mon Sep 17 00:00:00 2001 From: Vitalii Vdovenko Date: Thu, 28 Jan 2021 17:55:24 +0200 Subject: [PATCH] File source #1392 - merging with best practice --- .../integration_tests/file_formats_test.py | 74 ++++++ .../integration_source_test.py | 231 ------------------ .../formats/csv/configured_catalog_csv.json | 0 .../sample_files/formats/csv/demo.csv | 0 .../excel/configured_catalog_excel_xls.json | 0 .../excel/configured_catalog_excel_xlsx.json | 0 .../sample_files/formats/excel/demo.xls | Bin .../sample_files/formats/excel/demo.xlsx | Bin .../sample_files/formats/excel/demo1.xlsx | Bin .../feather/configured_catalog_feather.json | 0 .../sample_files/formats/feather/demo.feather | Bin .../formats/feather/demo1.feather | Bin .../formats/html/configured_catalog_html.json | 0 .../sample_files/formats/html/demo.html | 0 .../formats/json/configured_catalog_json.json | 0 .../sample_files/formats/json/demo.json | 0 .../formats/orc/configured_catalog_orc.json | 0 .../sample_files/formats/orc/demo.orc | Bin .../sample_files/formats/orc/demo1.orc | Bin .../parquet/configured_catalog_parquet.json | 0 .../sample_files/formats/parquet/demo.parquet | Bin .../formats/parquet/demo1.parquet | Bin .../pickle/configured_catalog_pickle.json | 0 .../sample_files/formats/pickle/demo.pkl | Bin .../sample_files/formats/pickle/demo1.pkl | Bin .../connectors/source-file/setup.py | 12 +- 26 files changed, 80 insertions(+), 237 deletions(-) create mode 100644 airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py delete mode 100644 airbyte-integrations/connectors/source-file/integration_tests/integration_source_test.py rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/csv/configured_catalog_csv.json (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/csv/demo.csv (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/excel/configured_catalog_excel_xls.json (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/excel/configured_catalog_excel_xlsx.json (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/excel/demo.xls (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/excel/demo.xlsx (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/excel/demo1.xlsx (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/feather/configured_catalog_feather.json (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/feather/demo.feather (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/feather/demo1.feather (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/html/configured_catalog_html.json (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/html/demo.html (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/json/configured_catalog_json.json (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/json/demo.json (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/orc/configured_catalog_orc.json (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/orc/demo.orc (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/orc/demo1.orc (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/parquet/configured_catalog_parquet.json (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/parquet/demo.parquet (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/parquet/demo1.parquet (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/pickle/configured_catalog_pickle.json (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/pickle/demo.pkl (100%) rename airbyte-integrations/connectors/source-file/{ => integration_tests}/sample_files/formats/pickle/demo1.pkl (100%) diff --git a/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py b/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py new file mode 100644 index 00000000000000..be0cedddd4ed3d --- /dev/null +++ b/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py @@ -0,0 +1,74 @@ +""" +MIT License + +Copyright (c) 2020 Airbyte + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +from pathlib import Path + +import pytest +from base_python import AirbyteLogger +from source_file import SourceFile +from source_file.client import Client + +SAMPLE_DIRECTORY = Path(__file__).resolve().parent.joinpath("sample_files/formats") + + +def check_read(config, expected_columns=10, expected_rows=42): + client = Client(**config) + rows = list(client.read()) + assert len(rows) == expected_rows + assert len(rows[0]) == expected_columns + + +@pytest.mark.parametrize( + "file_format, extension, expected_columns, expected_rows", + [ + ("csv", "csv", 8, 5000), + ("json", "json", 2, 1), + ("excel", "xls", 8, 50), + ("excel", "xlsx", 8, 50), + ("feather", "feather", 9, 3), + ("parquet", "parquet", 9, 3), + ], +) +def test_local_file_read(file_format, extension, expected_columns, expected_rows): + file_directory = SAMPLE_DIRECTORY.joinpath(file_format) + file_path = str(file_directory.joinpath(f"demo.{extension}")) + configs = {"dataset_name": "test", "format": file_format, "url": file_path, "provider": {"storage": "local"}} + check_read(configs, expected_columns, expected_rows) + + +def run_load_dataframes(config, expected_columns=10, expected_rows=42): + df_list = SourceFile.load_dataframes(config=config, logger=AirbyteLogger(), skip_data=False) + assert len(df_list) == 1 # Properly load 1 DataFrame + df = df_list[0] + assert len(df.columns) == expected_columns # DataFrame should have 10 columns + assert len(df.index) == expected_rows # DataFrame should have 42 rows of data + return df + + +def run_load_nested_json_schema(config, expected_columns=10, expected_rows=42): + data_list = SourceFile.load_nested_json(config, logger=AirbyteLogger()) + assert len(data_list) == 1 # Properly load data + df = data_list[0] + assert len(df) == expected_rows # DataFrame should have 42 items + return df diff --git a/airbyte-integrations/connectors/source-file/integration_tests/integration_source_test.py b/airbyte-integrations/connectors/source-file/integration_tests/integration_source_test.py deleted file mode 100644 index 41461e0eb6855e..00000000000000 --- a/airbyte-integrations/connectors/source-file/integration_tests/integration_source_test.py +++ /dev/null @@ -1,231 +0,0 @@ -""" -MIT License - -Copyright (c) 2020 Airbyte - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" - -import json -import os -import tempfile -import uuid -from pathlib import Path - -import boto3 -import pytest -from base_python import AirbyteLogger -from botocore.errorfactory import ClientError -from google.api_core.exceptions import Conflict -from google.cloud import storage -from source_file import SourceFile - - -class TestSourceFile(object): - config_directory: str = Path(__file__).resolve().parent.parent.joinpath("secrets") - service_account_file: str = config_directory.joinpath("gcs.json") - aws_credentials: str = config_directory.joinpath("aws.json") - cloud_bucket_name: str = "airbytetestbucket" - local_files_directory: str = Path(__file__).resolve().parent.parent.joinpath("sample_files/formats") - - @pytest.fixture(scope="class") - def download_gcs_public_data(self): - print("\nDownload public dataset from gcs to local /tmp") - config = get_config(0) - config["provider"]["storage"] = "HTTPS" - config["url"] = "https://storage.googleapis.com/covid19-open-data/v2/latest/epidemiology.csv" - df = run_load_dataframes(config) - tmp_file = tempfile.NamedTemporaryFile(delete=False) - df.to_csv(tmp_file.name, index=False) - yield tmp_file.name - os.remove(tmp_file.name) - print(f"\nLocal File {tmp_file.name} is now deleted") - - @pytest.fixture(scope="class") - def create_gcs_private_data(self, download_gcs_public_data): - storage_client = storage.Client.from_service_account_json(self.service_account_file) - bucket_name = create_unique_gcs_bucket(storage_client, self.cloud_bucket_name) - print(f"\nUpload dataset to private gcs bucket {bucket_name}") - bucket = storage_client.get_bucket(bucket_name) - blob = bucket.blob("myfile.csv") - blob.upload_from_filename(download_gcs_public_data) - yield f"{bucket_name}/myfile.csv" - bucket.delete(force=True) - print(f"\nGCS Bucket {bucket_name} is now deleted") - - @pytest.fixture(scope="class") - def create_aws_private_data(self, download_gcs_public_data): - with open(self.aws_credentials) as json_file: - aws_config = json.load(json_file) - region = "eu-west-3" - location = {"LocationConstraint": region} - s3_client = boto3.client( - "s3", - aws_access_key_id=aws_config["aws_access_key_id"], - aws_secret_access_key=aws_config["aws_secret_access_key"], - region_name=region, - ) - bucket_name = self.cloud_bucket_name - print(f"\nUpload dataset to private aws bucket {bucket_name}") - try: - s3_client.head_bucket(Bucket=bucket_name) - except ClientError: - s3_client.create_bucket(Bucket=bucket_name, CreateBucketConfiguration=location) - s3_client.upload_file(download_gcs_public_data, bucket_name, "myfile.csv") - yield f"{bucket_name}/myfile.csv" - s3 = boto3.resource( - "s3", aws_access_key_id=aws_config["aws_access_key_id"], aws_secret_access_key=aws_config["aws_secret_access_key"] - ) - bucket = s3.Bucket(bucket_name) - bucket.objects.all().delete() - print(f"\nS3 Bucket {bucket_name} is now deleted") - - @pytest.mark.parametrize( - "reader_impl, storage_provider, url, columns_nb, config_index", - [ - # epidemiology csv - ("gcsfs", "HTTPS", "https://storage.googleapis.com/covid19-open-data/v2/latest/epidemiology.csv", 10, 0), - ("smart_open", "HTTPS", "storage.googleapis.com/covid19-open-data/v2/latest/epidemiology.csv", 10, 0), - ("smart_open", "local", "injected by tests", 10, 0), - # landsat compressed csv - ("gcsfs", "GCS", "gs://gcp-public-data-landsat/index.csv.gz", 18, 1), - ("smart_open", "GCS", "gs://gcp-public-data-landsat/index.csv.gz", 18, 0), - # GDELT csv - ("s3fs", "S3", "s3://gdelt-open-data/events/20190914.export.csv", 58, 2), - ("smart_open", "S3", "s3://gdelt-open-data/events/20190914.export.csv", 58, 2), - ], - ) - def test_public_and_local_data(self, download_gcs_public_data, reader_impl, storage_provider, url, columns_nb, config_index): - config = get_config(config_index) - config["provider"]["storage"] = storage_provider - if storage_provider != "local": - config["url"] = url - else: - # inject temp file path that was downloaded by the test as URL - config["url"] = download_gcs_public_data - config["provider"]["reader_impl"] = reader_impl - run_load_dataframes(config, expected_columns=columns_nb) - - @pytest.mark.parametrize("reader_impl", ["gcsfs", "smart_open"]) - def test_private_gcs_load(self, create_gcs_private_data, reader_impl): - config = get_config(0) - config["provider"]["storage"] = "GCS" - config["url"] = create_gcs_private_data - config["provider"]["reader_impl"] = reader_impl - with open(self.service_account_file) as json_file: - config["provider"]["service_account_json"] = json.dumps(json.load(json_file)) - run_load_dataframes(config) - - @pytest.mark.parametrize("reader_impl", ["s3fs", "smart_open"]) - def test_private_aws_load(self, create_aws_private_data, reader_impl): - config = get_config(0) - config["provider"]["storage"] = "S3" - config["url"] = create_aws_private_data - config["provider"]["reader_impl"] = reader_impl - with open(self.aws_credentials) as json_file: - aws_config = json.load(json_file) - config["provider"]["aws_access_key_id"] = aws_config["aws_access_key_id"] - config["provider"]["aws_secret_access_key"] = aws_config["aws_secret_access_key"] - run_load_dataframes(config) - - @pytest.mark.parametrize( - "storage_provider, url, user, password, host, columns_nb, rows_nb, config_index", - [ - ("SFTP", "/pub/example/readme.txt", "demo", "password", "test.rebex.net", 1, 6, 3), - ("SSH", "readme.txt", "demo", "password", "test.rebex.net", 1, 6, 3), - ], - ) - def test_private_provider(self, storage_provider, url, user, password, host, columns_nb, rows_nb, config_index): - config = get_config(config_index) - config["provider"]["storage"] = storage_provider - config["url"] = url - config["provider"]["user"] = user - config["provider"]["password"] = password - config["provider"]["host"] = host - run_load_dataframes(config, columns_nb, rows_nb) - - @pytest.mark.parametrize( - "file_format, extension, columns_nb, rows_nb", - [ - ("csv", "csv", 8, 5000), - ("json", "json", 0, 2), - ("html", "html", 3, 2), - # ("excel", "xls", 8, 50), - # ("excel", "xlsx", 8, 50), - # ("feather", "feather", 9, 3), - # ("parquet", "parquet", 9, 3), - ], - ) - def test_local_file_read( - self, - file_format, - extension, - columns_nb, - rows_nb, - ): - file_directory = self.local_files_directory.joinpath(file_format) - load_method = run_load_nested_json_schema if file_format == "json" else run_load_dataframes - file_path = str(file_directory.joinpath(f"demo.{extension}")) - configs = {"dataset_name": "test", "format": file_format, "url": file_path, "provider": {"storage": "local"}} - load_method(configs, columns_nb, rows_nb) - - -def run_load_dataframes(config, expected_columns=10, expected_rows=42): - df_list = SourceFile.load_dataframes(config=config, logger=AirbyteLogger(), skip_data=False) - assert len(df_list) == 1 # Properly load 1 DataFrame - df = df_list[0] - assert len(df.columns) == expected_columns # DataFrame should have 10 columns - assert len(df.index) == expected_rows # DataFrame should have 42 rows of data - return df - - -def run_load_nested_json_schema(config, expected_columns=10, expected_rows=42): - data_list = SourceFile.load_nested_json(config, logger=AirbyteLogger()) - assert len(data_list) == 1 # Properly load data - df = data_list[0] - assert len(df) == expected_rows # DataFrame should have 42 items - return df - - -def get_config(index: int) -> dict: - configs = [ - {"format": "csv", "reader_options": '{"sep": ",", "nrows": 42}', "provider": {}}, - {"format": "csv", "reader_options": '{"sep": ",", "nrows": 42, "compression": "gzip"}', "provider": {}}, - {"format": "csv", "reader_options": '{"sep": "\\t", "nrows": 42, "header": null}', "provider": {}}, - {"format": "csv", "reader_options": '{"sep": "\\r\\n", "names": ["text"], "header": null, "engine": "python"}', "provider": {}}, - ] - return configs[index] - - -def create_unique_gcs_bucket(storage_client, name: str) -> str: - """ - Make a unique bucket to which we'll upload the file. - (GCS buckets are part of a single global namespace.) - """ - for i in range(0, 5): - bucket_name = f"{name}-{uuid.uuid1()}" - try: - bucket = storage_client.bucket(bucket_name) - bucket.storage_class = "STANDARD" - # fixed locations are cheaper... - storage_client.create_bucket(bucket, location="us-east1") - print(f"\nNew GCS bucket created {bucket_name}") - return bucket_name - except Conflict: - print(f"\nError: {bucket_name} already exists!") diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/csv/configured_catalog_csv.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/csv/configured_catalog_csv.json similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/csv/configured_catalog_csv.json rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/csv/configured_catalog_csv.json diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/csv/demo.csv b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/csv/demo.csv similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/csv/demo.csv rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/csv/demo.csv diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/excel/configured_catalog_excel_xls.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/excel/configured_catalog_excel_xls.json similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/excel/configured_catalog_excel_xls.json rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/excel/configured_catalog_excel_xls.json diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/excel/configured_catalog_excel_xlsx.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/excel/configured_catalog_excel_xlsx.json similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/excel/configured_catalog_excel_xlsx.json rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/excel/configured_catalog_excel_xlsx.json diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/excel/demo.xls b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/excel/demo.xls similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/excel/demo.xls rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/excel/demo.xls diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/excel/demo.xlsx b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/excel/demo.xlsx similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/excel/demo.xlsx rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/excel/demo.xlsx diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/excel/demo1.xlsx b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/excel/demo1.xlsx similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/excel/demo1.xlsx rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/excel/demo1.xlsx diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/feather/configured_catalog_feather.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/feather/configured_catalog_feather.json similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/feather/configured_catalog_feather.json rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/feather/configured_catalog_feather.json diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/feather/demo.feather b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/feather/demo.feather similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/feather/demo.feather rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/feather/demo.feather diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/feather/demo1.feather b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/feather/demo1.feather similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/feather/demo1.feather rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/feather/demo1.feather diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/html/configured_catalog_html.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/html/configured_catalog_html.json similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/html/configured_catalog_html.json rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/html/configured_catalog_html.json diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/html/demo.html b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/html/demo.html similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/html/demo.html rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/html/demo.html diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/json/configured_catalog_json.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/json/configured_catalog_json.json similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/json/configured_catalog_json.json rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/json/configured_catalog_json.json diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/json/demo.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/json/demo.json similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/json/demo.json rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/json/demo.json diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/orc/configured_catalog_orc.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/orc/configured_catalog_orc.json similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/orc/configured_catalog_orc.json rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/orc/configured_catalog_orc.json diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/orc/demo.orc b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/orc/demo.orc similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/orc/demo.orc rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/orc/demo.orc diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/orc/demo1.orc b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/orc/demo1.orc similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/orc/demo1.orc rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/orc/demo1.orc diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/parquet/configured_catalog_parquet.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/parquet/configured_catalog_parquet.json similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/parquet/configured_catalog_parquet.json rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/parquet/configured_catalog_parquet.json diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/parquet/demo.parquet b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/parquet/demo.parquet similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/parquet/demo.parquet rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/parquet/demo.parquet diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/parquet/demo1.parquet b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/parquet/demo1.parquet similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/parquet/demo1.parquet rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/parquet/demo1.parquet diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/pickle/configured_catalog_pickle.json b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/pickle/configured_catalog_pickle.json similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/pickle/configured_catalog_pickle.json rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/pickle/configured_catalog_pickle.json diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/pickle/demo.pkl b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/pickle/demo.pkl similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/pickle/demo.pkl rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/pickle/demo.pkl diff --git a/airbyte-integrations/connectors/source-file/sample_files/formats/pickle/demo1.pkl b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/pickle/demo1.pkl similarity index 100% rename from airbyte-integrations/connectors/source-file/sample_files/formats/pickle/demo1.pkl rename to airbyte-integrations/connectors/source-file/integration_tests/sample_files/formats/pickle/demo1.pkl diff --git a/airbyte-integrations/connectors/source-file/setup.py b/airbyte-integrations/connectors/source-file/setup.py index a0fe6eda2ebd12..9a46fe442bdcb8 100644 --- a/airbyte-integrations/connectors/source-file/setup.py +++ b/airbyte-integrations/connectors/source-file/setup.py @@ -34,12 +34,12 @@ "paramiko==2.7.2", "s3fs==0.5.2", "smart-open[all]==4.1.2", - "lxml", - "html5lib", - "BeautifulSoup4", - "pyarrow", - "xlrd", - "openpyxl", + "lxml==4.6.2", + "html5lib==1.1", + "beautifulsoup4==4.9.3", + "pyarrow==2.0.0", + "xlrd==2.0.1", + "openpyxl==3.0.6", ] TEST_REQUIREMENTS = [