Skip to content

Commit

Permalink
Test file formats #1392 - creating samples and test case
Browse files Browse the repository at this point in the history
  • Loading branch information
vitaliizazmic committed Jan 21, 2021
1 parent 031edbb commit 79d2d42
Show file tree
Hide file tree
Showing 24 changed files with 6,452 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import os
import tempfile
import uuid
from pathlib import Path

import boto3
import pytest
Expand All @@ -40,6 +41,7 @@ class TestSourceFile(object):
service_account_file: str = "../secrets/gcs.json"
aws_credentials: str = "../secrets/aws.json"
cloud_bucket_name: str = "airbytetestbucket"
local_files_directory = Path(__file__).resolve().parent.parent.joinpath("sample_files/formats")

@pytest.fixture(scope="class")
def download_gcs_public_data(self):
Expand Down Expand Up @@ -157,6 +159,31 @@ def test_private_provider(self, storage_provider, url, user, password, host, col
config["provider"]["host"] = host
run_load_dataframes(config, columns_nb, rows_nb)

@pytest.mark.parametrize(
    "file_format, extension, columns_nb, rows_nb",
    [
        ("csv", "csv", 8, 5000),
        ("json", "json", 0, 2),
        ("html", "html", 3, 2),
        ("excel", "xls", 8, 50),
        ("excel", "xlsx", 8, 50),
        ("feather", "feather", 9, 3),
        ("parquet", "parquet", 9, 3),
    ],
)
def test_local_file_read(self, file_format, extension, columns_nb, rows_nb):
    """Read the bundled ``demo.<extension>`` sample for one format from local storage.

    The sample is looked up under ``local_files_directory/<file_format>/``.
    JSON samples go through ``run_load_nested_json_schema``; every other
    format goes through ``run_load_dataframes``. Both helpers receive the
    expected column and row counts from the parametrization.

    NOTE: the JSON helper accepts ``columns_nb`` but never checks it, so the
    ``0`` in the JSON case is effectively a placeholder.
    """
    # Build the full path to the sample file for this format/extension pair.
    sample_path = self.local_files_directory / file_format / f"demo.{extension}"

    # Pick the loader/validator matching the format under test.
    if file_format == "json":
        loader = run_load_nested_json_schema
    else:
        loader = run_load_dataframes

    config = {
        "dataset_name": "test",
        "format": file_format,
        "url": str(sample_path),
        "provider": {"storage": "local"},
    }
    loader(config, columns_nb, rows_nb)


def run_load_dataframes(config, expected_columns=10, expected_rows=42):
df_list = SourceFile.load_dataframes(config=config, logger=AirbyteLogger(), skip_data=False)
Expand All @@ -167,6 +194,14 @@ def run_load_dataframes(config, expected_columns=10, expected_rows=42):
return df


def run_load_nested_json_schema(config, expected_columns=10, expected_rows=42):
    """Load nested JSON through ``SourceFile.load_nested_json`` and check its size.

    Args:
        config: Source configuration passed straight to ``load_nested_json``.
        expected_columns: Accepted so the signature mirrors
            ``run_load_dataframes``; it is not checked here.
        expected_rows: Number of items the single loaded entry must contain.

    Returns:
        The only entry of the list returned by ``load_nested_json``.

    Raises:
        AssertionError: If the loader does not return exactly one entry, or
            that entry's length differs from ``expected_rows``.
    """
    loaded = SourceFile.load_nested_json(config, logger=AirbyteLogger())
    # Exactly one entry is expected back from the loader.
    assert len(loaded) == 1
    records = loaded[0]
    # The entry must hold the expected number of items.
    assert len(records) == expected_rows
    return records


def get_config(index: int) -> dict:
configs = [
{"format": "csv", "reader_options": '{"sep": ",", "nrows": 42}', "provider": {}},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"streams": [
{
"stream": {
"name": "test",
"json_schema": {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"Unnamed: 0": {
"type": "number"
},
"First Name": {
"type": "string"
},
"Last Name": {
"type": "string"
},
"Gender": {
"type": "string"
},
"Country": {
"type": "string"
},
"Age": {
"type": "number"
},
"Date": {
"type": "string"
},
"Id": {
"type": "number"
}
}
}
}
}
]
}
5,001 changes: 5,001 additions & 0 deletions airbyte-integrations/connectors/source-file/sample_files/formats/csv/demo.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"streams": [
{
"stream": {
"name": "test",
"json_schema": {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"0": {
"type": "string"
},
"1": {
"type": "string"
}
}
}
}
}
]
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"streams": [
{
"stream": {}
}
]
}
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"streams": [
{
"stream": {
"name": "test",
"json_schema": {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"0": {
"type": "string"
},
"1": {
"type": "string"
}
}
}
}
}
]
}
Loading

0 comments on commit 79d2d42

Please sign in to comment.