Skip to content
Permalink
Browse files
[ASTERIXDB-3005][EXT]: Ignore byte order mark when reading ext data
Change-Id: Ic7a863097ec4a6adad018785011f0d26d540f2a5
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/14785
Tested-by: Hussain Towaileb <hussainht@gmail.com>
Integration-Tests: Hussain Towaileb <hussainht@gmail.com>
Reviewed-by: Hussain Towaileb <hussainht@gmail.com>
Reviewed-by: Michael Blow <mblow@apache.org>
  • Loading branch information
htowaileb committed Jan 14, 2022
1 parent 564be36 commit c503ef8028d5786fee8031c4728d18ee081dcbd3
Show file tree
Hide file tree
Showing 21 changed files with 348 additions and 3 deletions.
@@ -19,6 +19,7 @@
package org.apache.asterix.test.external_dataset;

import static org.apache.asterix.test.external_dataset.BinaryFileConverterUtil.BINARY_GEN_BASEDIR;
import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.BOM_FILE_CONTAINER;
import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.FIXED_DATA_CONTAINER;

import java.io.BufferedWriter;
@@ -62,6 +63,7 @@ public class ExternalDatasetTestUtils {
private static Uploader playgroundDataLoader;
private static Uploader fixedDataLoader;
private static Uploader mixedDataLoader;
private static Uploader bomFileLoader;

protected TestCaseContext tcCtx;

@@ -95,10 +97,12 @@ public static void setDataPaths(String jsonDataPath, String csvDataPath, String
TSV_DATA_PATH = tsvDataPath;
}

public static void setUploaders(Uploader playgroundDataLoader, Uploader fixedDataLoader, Uploader mixedDataLoader) {
/**
 * Registers the uploader callbacks used by the container preparation methods of this class.
 *
 * @param playgroundDataLoader uploader stored for the playground data
 * @param fixedDataLoader uploader stored for the fixed data
 * @param mixedDataLoader uploader stored for the mixed data
 * @param bomFileLoader uploader stored for the byte order mark (BOM) files
 */
public static void setUploaders(Uploader playgroundDataLoader, Uploader fixedDataLoader, Uploader mixedDataLoader,
        Uploader bomFileLoader) {
    // The parameters shadow the static fields of the same name, hence the class-qualified targets.
    ExternalDatasetTestUtils.bomFileLoader = bomFileLoader;
    ExternalDatasetTestUtils.mixedDataLoader = mixedDataLoader;
    ExternalDatasetTestUtils.fixedDataLoader = fixedDataLoader;
    ExternalDatasetTestUtils.playgroundDataLoader = playgroundDataLoader;
}

/**
@@ -148,6 +152,30 @@ public static void prepareFixedDataContainer() {
fixedDataLoader.upload("lvl1/lvl2/5.json", path, true, false);
}

/**
 * Fills the BOM bucket with five JSON, five CSV and five TSV files. The content of every
 * file starts with the byte order mark (BOM) character U+FEFF.
 */
public static void prepareBomFileContainer() {
    LOGGER.info("Loading bom files data to " + BOM_FILE_CONTAINER);

    final String bom = "\uFEFF";
    final int lastFileNumber = 5;

    // <n>.json: a single object whose id and age both equal n
    for (int n = 1; n <= lastFileNumber; n++) {
        String jsonRecord = bom + "{\"id\": " + n + ", \"age\": " + n + "}";
        bomFileLoader.upload(n + ".json", jsonRecord, false, false);
    }

    // <n>.csv: a single comma-separated row "n,n"
    for (int n = 1; n <= lastFileNumber; n++) {
        String csvRow = bom + n + "," + n;
        bomFileLoader.upload(n + ".csv", csvRow, false, false);
    }

    // <n>.tsv: a single tab-separated row
    for (int n = 1; n <= lastFileNumber; n++) {
        String tsvRow = bom + n + "\t" + n;
        bomFileLoader.upload(n + ".tsv", tsvRow, false, false);
    }
}

public static void loadJsonFiles() {
String dataBasePath = JSON_DATA_PATH;
String definition = JSON_DEFINITION;
@@ -46,6 +46,7 @@ public static Collection<Object[]> tests() throws Exception {
PREPARE_BUCKET = AwsS3ExternalDatasetOnePartitionTest::prepareS3Bucket;
PREPARE_FIXED_DATA_BUCKET = AwsS3ExternalDatasetOnePartitionTest::prepareFixedDataBucket;
PREPARE_MIXED_DATA_BUCKET = AwsS3ExternalDatasetOnePartitionTest::prepareMixedDataBucket;
PREPARE_BOM_FILE_BUCKET = AwsS3ExternalDatasetOnePartitionTest::prepareBomDataBucket;
return LangExecutionUtil.tests(ONLY_TESTS, SUITE_TESTS);
}

@@ -57,4 +58,7 @@ private static void prepareFixedDataBucket() {

// Intentionally empty: this class registers the method as PREPARE_MIXED_DATA_BUCKET, so running that
// hook uploads nothing. NOTE(review): presumably the one-partition suite does not need the mixed data -- confirm.
private static void prepareMixedDataBucket() {
}

// Intentionally empty: this class registers the method as PREPARE_BOM_FILE_BUCKET, so running that
// hook uploads nothing. NOTE(review): presumably the one-partition suite does not need the BOM files -- confirm.
private static void prepareBomDataBucket() {
}
}
@@ -89,6 +89,7 @@ public class AwsS3ExternalDatasetTest {
static Runnable PREPARE_BUCKET;
static Runnable PREPARE_FIXED_DATA_BUCKET;
static Runnable PREPARE_MIXED_DATA_BUCKET;
static Runnable PREPARE_BOM_FILE_BUCKET;

// Base directory paths for data files
private static final String JSON_DATA_PATH = joinPath("data", "json");
@@ -115,12 +116,15 @@ public class AwsS3ExternalDatasetTest {
public static final String PLAYGROUND_CONTAINER = "playground";
public static final String FIXED_DATA_CONTAINER = "fixed-data"; // Do not use, has fixed data
public static final String INCLUDE_EXCLUDE_CONTAINER = "include-exclude";
public static final String BOM_FILE_CONTAINER = "bom-file-container";
public static final PutObjectRequest.Builder playgroundBuilder =
PutObjectRequest.builder().bucket(PLAYGROUND_CONTAINER);
public static final PutObjectRequest.Builder fixedDataBuilder =
PutObjectRequest.builder().bucket(FIXED_DATA_CONTAINER);
public static final PutObjectRequest.Builder includeExcludeBuilder =
PutObjectRequest.builder().bucket(INCLUDE_EXCLUDE_CONTAINER);
public static final PutObjectRequest.Builder bomFileContainerBuilder =
PutObjectRequest.builder().bucket(BOM_FILE_CONTAINER);

public AwsS3ExternalDatasetTest(TestCaseContext tcCtx) {
this.tcCtx = tcCtx;
@@ -158,6 +162,8 @@ public static Collection<Object[]> tests() throws Exception {
PREPARE_BUCKET = ExternalDatasetTestUtils::preparePlaygroundContainer;
PREPARE_FIXED_DATA_BUCKET = ExternalDatasetTestUtils::prepareFixedDataContainer;
PREPARE_MIXED_DATA_BUCKET = ExternalDatasetTestUtils::prepareMixedDataContainer;
PREPARE_BOM_FILE_BUCKET = ExternalDatasetTestUtils::prepareBomFileContainer;

return LangExecutionUtil.tests(ONLY_TESTS, SUITE_TESTS);
}

@@ -199,15 +205,17 @@ private static void startAwsS3MockServer() {
client.createBucket(CreateBucketRequest.builder().bucket(PLAYGROUND_CONTAINER).build());
client.createBucket(CreateBucketRequest.builder().bucket(FIXED_DATA_CONTAINER).build());
client.createBucket(CreateBucketRequest.builder().bucket(INCLUDE_EXCLUDE_CONTAINER).build());
client.createBucket(CreateBucketRequest.builder().bucket(BOM_FILE_CONTAINER).build());
LOGGER.info("Client created successfully");

// Create the bucket and upload some json files
setDataPaths(JSON_DATA_PATH, CSV_DATA_PATH, TSV_DATA_PATH);
setUploaders(AwsS3ExternalDatasetTest::loadPlaygroundData, AwsS3ExternalDatasetTest::loadFixedData,
AwsS3ExternalDatasetTest::loadMixedData);
AwsS3ExternalDatasetTest::loadMixedData, AwsS3ExternalDatasetTest::loadBomData);
PREPARE_BUCKET.run();
PREPARE_FIXED_DATA_BUCKET.run();
PREPARE_MIXED_DATA_BUCKET.run();
PREPARE_BOM_FILE_BUCKET.run();
}

private static void loadPlaygroundData(String key, String content, boolean fromFile, boolean gzipped) {
@@ -222,6 +230,10 @@ private static void loadMixedData(String key, String content, boolean fromFile,
client.putObject(includeExcludeBuilder.key(key).build(), getRequestBody(content, fromFile, gzipped));
}

/**
 * Puts a single object through the shared {@code bomFileContainerBuilder} request builder.
 *
 * @param key object key to store the content under
 * @param content forwarded to {@code getRequestBody} to build the request body
 * @param fromFile forwarded to {@code getRequestBody}
 * @param gzipped forwarded to {@code getRequestBody}
 */
private static void loadBomData(String key, String content, boolean fromFile, boolean gzipped) {
    // Build the request first, then the body, matching the original argument evaluation order.
    PutObjectRequest putRequest = bomFileContainerBuilder.key(key).build();
    RequestBody requestBody = getRequestBody(content, fromFile, gzipped);
    client.putObject(putRequest, requestBody);
}

private static RequestBody getRequestBody(String content, boolean fromFile, boolean gzipped) {
RequestBody body;
// Content is string
@@ -0,0 +1,35 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/* Start from a fresh "test" dataverse. */
drop dataverse test if exists;
create dataverse test;
use test;

/* Record type with two declared int fields: id and age. */
drop type test if exists;
create type test as { id: int, age: int };

/*
 * External dataset over the objects matching *.csv in the "bom-file-container" container,
 * read as CSV without a header row.
 * NOTE(review): %adapter% and %template% look like placeholders filled in by the test
 * harness, and ("null"="") presumably makes an empty field read as null -- confirm.
 */
drop dataset test1 if exists;
CREATE EXTERNAL DATASET test1(test) USING %adapter% (
%template%,
("container"="bom-file-container"),
("format"="csv"),
("include"="*.csv"),
("header"=False),
("null"="")
);
@@ -0,0 +1,23 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

use test;

/* Return every record of the external dataset test1, sorted by ascending id. */
select value test1 from test1 order by id asc;

@@ -0,0 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/* Cleanup: drop the "test" dataverse if it exists. */
drop dataverse test if exists;
@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/* Start from a fresh "test" dataverse. */
drop dataverse test if exists;
create dataverse test;
use test;

/* Open record type with no declared fields. */
drop type test if exists;
create type test as open {
};

/*
 * External dataset over the objects matching *.json in the "bom-file-container" container,
 * read as JSON.
 * NOTE(review): %adapter% and %template% look like placeholders filled in by the test
 * harness -- confirm.
 */
drop dataset test1 if exists;
CREATE EXTERNAL DATASET test1(test) USING %adapter% (
%template%,
("container"="bom-file-container"),
("format"="json"),
("include"="*.json")
);
@@ -0,0 +1,23 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

use test;

/* Return every record of the external dataset test1, sorted by ascending id. */
select value test1 from test1 order by id asc;

@@ -0,0 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/* Cleanup: drop the "test" dataverse if it exists. */
drop dataverse test if exists;
@@ -0,0 +1,35 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/* Start from a fresh "test" dataverse. */
drop dataverse test if exists;
create dataverse test;
use test;

/* Record type with two declared int fields: id and age. */
drop type test if exists;
create type test as { id: int, age: int };

/*
 * External dataset over the objects matching *.tsv in the "bom-file-container" container,
 * read as TSV without a header row.
 * NOTE(review): %adapter% and %template% look like placeholders filled in by the test
 * harness, and ("null"="") presumably makes an empty field read as null -- confirm.
 */
drop dataset test1 if exists;
CREATE EXTERNAL DATASET test1(test) USING %adapter% (
%template%,
("container"="bom-file-container"),
("format"="tsv"),
("include"="*.tsv"),
("header"=False),
("null"="")
);
@@ -0,0 +1,23 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

use test;

/* Return every record of the external dataset test1, sorted by ascending id. */
select value test1 from test1 order by id asc;

@@ -0,0 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/* Cleanup: drop the "test" dataverse if it exists. */
drop dataverse test if exists;
@@ -0,0 +1,5 @@
{ "id": 1, "age": 1 }
{ "id": 2, "age": 2 }
{ "id": 3, "age": 3 }
{ "id": 4, "age": 4 }
{ "id": 5, "age": 5 }
@@ -0,0 +1,5 @@
{ "id": 1, "age": 1 }
{ "id": 2, "age": 2 }
{ "id": 3, "age": 3 }
{ "id": 4, "age": 4 }
{ "id": 5, "age": 5 }

0 comments on commit c503ef8

Please sign in to comment.