From 04768b02d4f20c66220e6d3aa799c3c233f8c5cf Mon Sep 17 00:00:00 2001 From: Hussain Towaileb Date: Thu, 13 Jan 2022 23:34:51 +0300 Subject: [PATCH] [NO ISSUE]: Ignore byte order mark when reading ext data Change-Id: Iad0c298acdc85c32fb8cc79484b825b9d186c4cb Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/15823 Integration-Tests: Jenkins Tested-by: Jenkins Reviewed-by: Hussain Towaileb Reviewed-by: Michael Blow --- .../ExternalDatasetTestUtils.java | 30 +++++++++++++++- .../AwsS3ExternalDatasetOnePartitionTest.java | 4 +++ .../aws/AwsS3ExternalDatasetTest.java | 14 +++++++- .../byte_order_mark/csv/test.000.ddl.sqlpp | 35 +++++++++++++++++++ .../byte_order_mark/csv/test.001.query.sqlpp | 23 ++++++++++++ .../byte_order_mark/csv/test.099.ddl.sqlpp | 20 +++++++++++ .../byte_order_mark/json/test.000.ddl.sqlpp | 34 ++++++++++++++++++ .../byte_order_mark/json/test.001.query.sqlpp | 23 ++++++++++++ .../byte_order_mark/json/test.099.ddl.sqlpp | 20 +++++++++++ .../byte_order_mark/tsv/test.000.ddl.sqlpp | 35 +++++++++++++++++++ .../byte_order_mark/tsv/test.001.query.sqlpp | 23 ++++++++++++ .../byte_order_mark/tsv/test.099.ddl.sqlpp | 20 +++++++++++ .../common/byte_order_mark/csv/result.001.adm | 5 +++ .../byte_order_mark/json/result.001.adm | 5 +++ .../common/byte_order_mark/tsv/result.001.adm | 5 +++ ...te_external_dataset_azure_blob_storage.xml | 20 +++++++++++ .../testsuite_external_dataset_s3.xml | 20 +++++++++++ .../reader/stream/LineRecordReader.java | 6 ++++ .../reader/stream/QuotedLineRecordReader.java | 5 +++ .../stream/SemiStructuredRecordReader.java | 3 +- .../external/util/ExternalDataConstants.java | 1 + hyracks-fullstack/NOTICE | 2 +- 22 files changed, 349 insertions(+), 4 deletions(-) create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.000.ddl.sqlpp create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.001.query.sqlpp create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.099.ddl.sqlpp create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.000.ddl.sqlpp create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.001.query.sqlpp create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.099.ddl.sqlpp create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.000.ddl.sqlpp create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.001.query.sqlpp create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.099.ddl.sqlpp create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/csv/result.001.adm create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/json/result.001.adm create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/tsv/result.001.adm diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java index c064281d446..bbd882e8d20 100644 --- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java +++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java @@ -18,6 +18,7 @@ */ package org.apache.asterix.test.external_dataset; +import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.BOM_FILE_CONTAINER; import static org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.FIXED_DATA_CONTAINER; import java.io.BufferedWriter; @@ -59,6 +60,7 @@ public class ExternalDatasetTestUtils { private static Uploader playgroundDataLoader; private static Uploader fixedDataLoader; private static Uploader mixedDataLoader; + private static Uploader bomFileLoader; protected TestCaseContext tcCtx; @@ -80,10 +82,12 @@ public static void setDataPaths(String jsonDataPath, String csvDataPath, String TSV_DATA_PATH = tsvDataPath; } - public static void setUploaders(Uploader playgroundDataLoader, Uploader fixedDataLoader, Uploader mixedDataLoader) { + public static void setUploaders(Uploader playgroundDataLoader, Uploader fixedDataLoader, Uploader mixedDataLoader, + Uploader bomFileLoader) { ExternalDatasetTestUtils.playgroundDataLoader = playgroundDataLoader; ExternalDatasetTestUtils.fixedDataLoader = fixedDataLoader; ExternalDatasetTestUtils.mixedDataLoader = mixedDataLoader; + ExternalDatasetTestUtils.bomFileLoader = bomFileLoader; } /** @@ -129,6 +133,30 @@ public static void prepareFixedDataContainer() { fixedDataLoader.upload("lvl1/lvl2/5.json", path, true, false); } + /** + * This bucket contains files that start with byte order mark (BOM): U+FEFF + */ + public static void prepareBomFileContainer() { + LOGGER.info("Loading bom files data to " + BOM_FILE_CONTAINER); + + // Files data + bomFileLoader.upload("1.json", "\uFEFF{\"id\": 1, \"age\": 1}", false, false); + bomFileLoader.upload("2.json", "\uFEFF{\"id\": 2, \"age\": 2}", false, false); + bomFileLoader.upload("3.json", "\uFEFF{\"id\": 3, \"age\": 3}", false, false); + bomFileLoader.upload("4.json", "\uFEFF{\"id\": 4, \"age\": 4}", false, false); + bomFileLoader.upload("5.json", "\uFEFF{\"id\": 5, \"age\": 5}", false, false); + bomFileLoader.upload("1.csv", "\uFEFF1,1", false, false); + bomFileLoader.upload("2.csv", "\uFEFF2,2", false, false); + bomFileLoader.upload("3.csv", "\uFEFF3,3", false, false); + bomFileLoader.upload("4.csv", "\uFEFF4,4", false, false); + bomFileLoader.upload("5.csv", "\uFEFF5,5", false, false); + bomFileLoader.upload("1.tsv", "\uFEFF1\t1", false, false); + bomFileLoader.upload("2.tsv", "\uFEFF2\t2", false, false); + bomFileLoader.upload("3.tsv", "\uFEFF3\t3", false, false); + bomFileLoader.upload("4.tsv", "\uFEFF4\t4", false, false); + bomFileLoader.upload("5.tsv", "\uFEFF5\t5", false, false); + } + public static void loadJsonFiles() { String dataBasePath = JSON_DATA_PATH; String definition = JSON_DEFINITION; diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetOnePartitionTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetOnePartitionTest.java index 8114873da13..6c07fab35ab 100644 --- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetOnePartitionTest.java +++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetOnePartitionTest.java @@ -46,6 +46,7 @@ public static Collection tests() throws Exception { PREPARE_BUCKET = AwsS3ExternalDatasetOnePartitionTest::prepareS3Bucket; PREPARE_FIXED_DATA_BUCKET = AwsS3ExternalDatasetOnePartitionTest::prepareFixedDataBucket; PREPARE_MIXED_DATA_BUCKET = AwsS3ExternalDatasetOnePartitionTest::prepareMixedDataBucket; + PREPARE_BOM_FILE_BUCKET = AwsS3ExternalDatasetOnePartitionTest::prepareBomDataBucket; return LangExecutionUtil.tests(ONLY_TESTS, SUITE_TESTS); } @@ -57,4 +58,7 @@ private static void prepareFixedDataBucket() { private static void prepareMixedDataBucket() { } + + private static void prepareBomDataBucket() { + } } diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java index c3c94f6363e..36e64332cdf 100644 --- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java +++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java @@ -87,6 +87,7 @@ public class AwsS3ExternalDatasetTest { static Runnable PREPARE_BUCKET; static Runnable PREPARE_FIXED_DATA_BUCKET; static Runnable PREPARE_MIXED_DATA_BUCKET; + static Runnable PREPARE_BOM_FILE_BUCKET; // Base directory paths for data files private static final String JSON_DATA_PATH = joinPath("data", "json"); @@ -113,12 +114,15 @@ public class AwsS3ExternalDatasetTest { public static final String PLAYGROUND_CONTAINER = "playground"; public static final String FIXED_DATA_CONTAINER = "fixed-data"; // Do not use, has fixed data public static final String INCLUDE_EXCLUDE_CONTAINER = "include-exclude"; + public static final String BOM_FILE_CONTAINER = "bom-file-container"; public static final PutObjectRequest.Builder playgroundBuilder = PutObjectRequest.builder().bucket(PLAYGROUND_CONTAINER); public static final PutObjectRequest.Builder fixedDataBuilder = PutObjectRequest.builder().bucket(FIXED_DATA_CONTAINER); public static final PutObjectRequest.Builder includeExcludeBuilder = PutObjectRequest.builder().bucket(INCLUDE_EXCLUDE_CONTAINER); + public static final PutObjectRequest.Builder bomFileContainerBuilder = + PutObjectRequest.builder().bucket(BOM_FILE_CONTAINER); public AwsS3ExternalDatasetTest(TestCaseContext tcCtx) { this.tcCtx = tcCtx; @@ -155,6 +159,8 @@ public static Collection tests() throws Exception { PREPARE_BUCKET = ExternalDatasetTestUtils::preparePlaygroundContainer; PREPARE_FIXED_DATA_BUCKET = ExternalDatasetTestUtils::prepareFixedDataContainer; PREPARE_MIXED_DATA_BUCKET = ExternalDatasetTestUtils::prepareMixedDataContainer; + PREPARE_BOM_FILE_BUCKET = ExternalDatasetTestUtils::prepareBomFileContainer; + return LangExecutionUtil.tests(ONLY_TESTS, SUITE_TESTS); } @@ -196,15 +202,17 @@ private static void startAwsS3MockServer() { client.createBucket(CreateBucketRequest.builder().bucket(PLAYGROUND_CONTAINER).build()); client.createBucket(CreateBucketRequest.builder().bucket(FIXED_DATA_CONTAINER).build()); client.createBucket(CreateBucketRequest.builder().bucket(INCLUDE_EXCLUDE_CONTAINER).build()); + client.createBucket(CreateBucketRequest.builder().bucket(BOM_FILE_CONTAINER).build()); LOGGER.info("Client created successfully"); // Create the bucket and upload some json files setDataPaths(JSON_DATA_PATH, CSV_DATA_PATH, TSV_DATA_PATH); setUploaders(AwsS3ExternalDatasetTest::loadPlaygroundData, AwsS3ExternalDatasetTest::loadFixedData, - AwsS3ExternalDatasetTest::loadMixedData); + AwsS3ExternalDatasetTest::loadMixedData, AwsS3ExternalDatasetTest::loadBomData); PREPARE_BUCKET.run(); PREPARE_FIXED_DATA_BUCKET.run(); PREPARE_MIXED_DATA_BUCKET.run(); + PREPARE_BOM_FILE_BUCKET.run(); } private static void loadPlaygroundData(String key, String content, boolean fromFile, boolean gzipped) { @@ -219,6 +227,10 @@ private static void loadMixedData(String key, String content, boolean fromFile, client.putObject(includeExcludeBuilder.key(key).build(), getRequestBody(content, fromFile, gzipped)); } + private static void loadBomData(String key, String content, boolean fromFile, boolean gzipped) { + client.putObject(bomFileContainerBuilder.key(key).build(), getRequestBody(content, fromFile, gzipped)); + } + private static RequestBody getRequestBody(String content, boolean fromFile, boolean gzipped) { RequestBody body; diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.000.ddl.sqlpp new file mode 100644 index 00000000000..69e42c15e0c --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.000.ddl.sqlpp @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +drop dataverse test if exists; +create dataverse test; +use test; + +drop type test if exists; +create type test as { id: int, age: int }; + +drop dataset test1 if exists; +CREATE EXTERNAL DATASET test1(test) USING %adapter% ( +%template%, +("container"="bom-file-container"), +("format"="csv"), +("include"="*.csv"), +("header"=False), +("null"="") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.001.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.001.query.sqlpp new file mode 100644 index 00000000000..5aa55807b35 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.001.query.sqlpp @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use test; + +select value test1 from test1 order by id asc; + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.099.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.099.ddl.sqlpp new file mode 100644 index 00000000000..548e63267e4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.099.ddl.sqlpp @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +drop dataverse test if exists; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.000.ddl.sqlpp new file mode 100644 index 00000000000..ad6513feb82 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.000.ddl.sqlpp @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +drop dataverse test if exists; +create dataverse test; +use test; + +drop type test if exists; +create type test as open { +}; + +drop dataset test1 if exists; +CREATE EXTERNAL DATASET test1(test) USING %adapter% ( +%template%, +("container"="bom-file-container"), +("format"="json"), +("include"="*.json") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.001.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.001.query.sqlpp new file mode 100644 index 00000000000..5aa55807b35 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.001.query.sqlpp @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use test; + +select value test1 from test1 order by id asc; + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.099.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.099.ddl.sqlpp new file mode 100644 index 00000000000..548e63267e4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.099.ddl.sqlpp @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +drop dataverse test if exists; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.000.ddl.sqlpp new file mode 100644 index 00000000000..956e8358fd5 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.000.ddl.sqlpp @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +drop dataverse test if exists; +create dataverse test; +use test; + +drop type test if exists; +create type test as { id: int, age: int }; + +drop dataset test1 if exists; +CREATE EXTERNAL DATASET test1(test) USING %adapter% ( +%template%, +("container"="bom-file-container"), +("format"="tsv"), +("include"="*.tsv"), +("header"=False), +("null"="") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.001.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.001.query.sqlpp new file mode 100644 index 00000000000..5aa55807b35 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.001.query.sqlpp @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use test; + +select value test1 from test1 order by id asc; + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.099.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.099.ddl.sqlpp new file mode 100644 index 00000000000..548e63267e4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.099.ddl.sqlpp @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +drop dataverse test if exists; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/csv/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/csv/result.001.adm new file mode 100644 index 00000000000..19d10f638d4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/csv/result.001.adm @@ -0,0 +1,5 @@ +{ "id": 1, "age": 1 } +{ "id": 2, "age": 2 } +{ "id": 3, "age": 3 } +{ "id": 4, "age": 4 } +{ "id": 5, "age": 5 } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/json/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/json/result.001.adm new file mode 100644 index 00000000000..19d10f638d4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/json/result.001.adm @@ -0,0 +1,5 @@ +{ "id": 1, "age": 1 } +{ "id": 2, "age": 2 } +{ "id": 3, "age": 3 } +{ "id": 4, "age": 4 } +{ "id": 5, "age": 5 } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/tsv/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/tsv/result.001.adm new file mode 100644 index 00000000000..19d10f638d4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/tsv/result.001.adm @@ -0,0 +1,5 @@ +{ "id": 1, "age": 1 } +{ "id": 2, "age": 2 } +{ "id": 3, "age": 3 } +{ "id": 4, "age": 4 } +{ "id": 5, "age": 5 } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_azure_blob_storage.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_azure_blob_storage.xml index 27ba14828ba..b3b9037ea26 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_azure_blob_storage.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_azure_blob_storage.xml @@ -356,4 +356,24 @@ + + + + + common/byte_order_mark/json + + + + + + common/byte_order_mark/csv + + + + + + common/byte_order_mark/tsv + + + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml index ead8e899b91..29adec84c94 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml @@ -313,4 +313,24 @@ + + + + + common/byte_order_mark/json + + + + + + common/byte_order_mark/csv + + + + + + common/byte_order_mark/tsv + + + diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java index 4b861421138..db20d31450f 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java @@ -18,6 +18,8 @@ */ package org.apache.asterix.external.input.record.reader.stream; +import static org.apache.asterix.external.util.ExternalDataConstants.BYTE_ORDER_MARK; + import java.io.IOException; import java.util.Arrays; import java.util.Collections; @@ -121,6 +123,10 @@ public boolean hasNext() throws IOException { } } for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline + if (inputBuffer[bufferPosn] == BYTE_ORDER_MARK) { + startPosn++; + continue; + } if (inputBuffer[bufferPosn] == ExternalDataConstants.LF) { newlineLength = (prevCharCR) ? 2 : 1; ++bufferPosn; // at next invocation proceed from following byte diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java index 4c253bce151..4433b496ed2 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java @@ -18,6 +18,7 @@ */ package org.apache.asterix.external.input.record.reader.stream; +import static org.apache.asterix.external.util.ExternalDataConstants.BYTE_ORDER_MARK; import static org.apache.asterix.external.util.ExternalDataConstants.REC_ENDED_AT_EOF; import java.io.IOException; @@ -119,6 +120,10 @@ public boolean hasNext() throws IOException { boolean maybeInQuote = false; for (; bufferPosn < bufferLength; ++bufferPosn) { char ch = inputBuffer[bufferPosn]; + if (ch == BYTE_ORDER_MARK) { + startPosn++; + continue; + } // count lines here since we need to also count the lines inside quotes if (ch == ExternalDataConstants.LF || prevCharCR) { lineNumber++; diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java index 0e23e46dfd9..2c31a0af4d2 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java @@ -18,6 +18,7 @@ */ package org.apache.asterix.external.input.record.reader.stream; +import static org.apache.asterix.external.util.ExternalDataConstants.BYTE_ORDER_MARK; import static org.apache.asterix.external.util.ExternalDataConstants.CLOSING_BRACKET; import static org.apache.asterix.external.util.ExternalDataConstants.COMMA; import static org.apache.asterix.external.util.ExternalDataConstants.CR; @@ -134,7 +135,7 @@ public boolean hasNext() throws IOException { lineNumber++; } isLastCharCR = c == CR; - if (c == SPACE || c == TAB || c == LF || c == CR) { + if (c == SPACE || c == TAB || c == LF || c == CR || c == BYTE_ORDER_MARK) { continue; } if (c == recordStart && state != State.NESTED_OBJECT) { diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java index a0cf3874eac..1434e375a78 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java @@ -251,6 +251,7 @@ private ExternalDataConstants() { public static final char OPEN_BRACKET = '['; public static final char CLOSING_BRACKET = ']'; public static final char COMMA = ','; + public static final char BYTE_ORDER_MARK = '\uFEFF'; /** * Constant byte characters diff --git a/hyracks-fullstack/NOTICE b/hyracks-fullstack/NOTICE index 57c584396c7..2e33eed8fb8 100644 --- a/hyracks-fullstack/NOTICE +++ b/hyracks-fullstack/NOTICE @@ -1,5 +1,5 @@ Apache Hyracks and Algebricks -Copyright 2015-2021 The Apache Software Foundation +Copyright 2015-2022 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/).