From ad05bb95513233004c3933702ac21b8b8694a585 Mon Sep 17 00:00:00 2001 From: shardul-cr7 Date: Mon, 24 Dec 2018 12:51:16 +0530 Subject: [PATCH 1/5] Added validation for Inverted INdex columns and added a test case in case of varchar --- .../longstring/VarcharDataTypesBasicTestCase.scala | 13 +++++++++++++ .../spark/sql/catalyst/CarbonDDLSqlParser.scala | 10 ++++++++++ 2 files changed, 23 insertions(+) diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala index a96f7dfc1ad..2467d5aa44c 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala @@ -191,6 +191,19 @@ class VarcharDataTypesBasicTestCase extends QueryTest with BeforeAndAfterEach wi assert(exceptionCaught.getMessage.contains("both in no_inverted_index and long_string_columns")) } + test("inverted index columns cannot be present in long_string_cols as they do not support sort_cols") { + val exceptionCaught = intercept[MalformedCarbonCommandException] { + sql( + s""" + | CREATE TABLE if not exists $longStringTable( + | id INT, name STRING, description STRING, address STRING, note STRING + | ) STORED BY 'carbondata' + | TBLPROPERTIES('inverted_index'='note', 'long_string_columns'='note,description') + |""".stripMargin) + } + assert(exceptionCaught.getMessage.contains("should be present in SORT_COLUMN(s)")) + } + private def prepareTable(): Unit = { sql( s""" diff --git a/integration/spark-common/src/main/scala/org/apache/spark/sql/catalyst/CarbonDDLSqlParser.scala b/integration/spark-common/src/main/scala/org/apache/spark/sql/catalyst/CarbonDDLSqlParser.scala index 35bf335c069..a46a8be22e7 100644 --- a/integration/spark-common/src/main/scala/org/apache/spark/sql/catalyst/CarbonDDLSqlParser.scala +++ b/integration/spark-common/src/main/scala/org/apache/spark/sql/catalyst/CarbonDDLSqlParser.scala @@ -374,6 +374,16 @@ abstract class CarbonDDLSqlParser extends AbstractCarbonSparkSQLParser { // get inverted index columns from table properties val invertedIdxCols = extractInvertedIndexColumns(fields, tableProperties) + // Validate if columns present in inverted index are part of sort columns. + if (invertedIdxCols.nonEmpty) { + invertedIdxCols.foreach { column => + if (!sortKeyDims.contains(column)) { + val errMsg = "INVERTED INDEX column: " + column + " should be present in SORT_COLUMN(s)" + throw new MalformedCarbonCommandException(errMsg) + } + } + } + // check for any duplicate columns in inverted and noinverted columns defined in tblproperties if (invertedIdxCols.nonEmpty && noInvertedIdxCols.nonEmpty) { invertedIdxCols.foreach { distCol => From f9a414695e4e16217c93c335196926314ae4c6dd Mon Sep 17 00:00:00 2001 From: shardul-cr7 Date: Mon, 24 Dec 2018 14:22:27 +0530 Subject: [PATCH 2/5] build failure handled --- .../dataload/TestNoInvertedIndexLoadAndQuery.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestNoInvertedIndexLoadAndQuery.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestNoInvertedIndexLoadAndQuery.scala index 13f8adb3ba0..1d7bf770754 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestNoInvertedIndexLoadAndQuery.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestNoInvertedIndexLoadAndQuery.scala @@ -305,7 +305,7 @@ class TestNoInvertedIndexLoadAndQuery extends QueryTest with BeforeAndAfterAll { CREATE TABLE IF NOT EXISTS index1 (id Int, name String, city String) STORED BY 'org.apache.carbondata.format' - TBLPROPERTIES('DICTIONARY_INCLUDE'='id','INVERTED_INDEX'='city,name') + TBLPROPERTIES('DICTIONARY_INCLUDE'='id','INVERTED_INDEX'='city,name', 'SORT_COLUMNS'='city,name') """) sql( s""" @@ -333,7 +333,7 @@ class TestNoInvertedIndexLoadAndQuery extends QueryTest with BeforeAndAfterAll { CREATE TABLE IF NOT EXISTS index1 (id Int, name String, city String) STORED BY 'org.apache.carbondata.format' - TBLPROPERTIES('INVERTED_INDEX'='city,name,id') + TBLPROPERTIES('INVERTED_INDEX'='city,name,id','SORT_COLUMNS'='city,name,id') """) val carbonTable = CarbonMetadata.getInstance().getCarbonTable("default", "index1") assert(carbonTable.getColumnByName("index1", "city").getColumnSchema.getEncodingList @@ -352,7 +352,7 @@ class TestNoInvertedIndexLoadAndQuery extends QueryTest with BeforeAndAfterAll { CREATE TABLE IF NOT EXISTS index1 (id Int, name String, city String) STORED BY 'org.apache.carbondata.format' - TBLPROPERTIES('NO_INVERTED_INDEX'='city','INVERTED_INDEX'='city') + TBLPROPERTIES('NO_INVERTED_INDEX'='city','INVERTED_INDEX'='city','SORT_COLUMNS'='city') """) } assert(exception.getMessage From 564b3cf70091937a8461e8546b038b94d63cd480 Mon Sep 17 00:00:00 2001 From: shardul-cr7 Date: Mon, 24 Dec 2018 15:39:27 +0530 Subject: [PATCH 3/5] build failure handled --- .../testsuite/dataload/TestNoInvertedIndexLoadAndQuery.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestNoInvertedIndexLoadAndQuery.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestNoInvertedIndexLoadAndQuery.scala index 1d7bf770754..f483827c7bd 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestNoInvertedIndexLoadAndQuery.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestNoInvertedIndexLoadAndQuery.scala @@ -340,7 +340,7 @@ class TestNoInvertedIndexLoadAndQuery extends QueryTest with BeforeAndAfterAll { .contains(Encoding.INVERTED_INDEX)) assert(carbonTable.getColumnByName("index1", "name").getColumnSchema.getEncodingList .contains(Encoding.INVERTED_INDEX)) - assert(!carbonTable.getColumnByName("index1", "id").getColumnSchema.getEncodingList + assert(carbonTable.getColumnByName("index1", "id").getColumnSchema.getEncodingList .contains(Encoding.INVERTED_INDEX)) } From e72eee016a664a575b83e1b1ec1e87648195d84c Mon Sep 17 00:00:00 2001 From: shardul-cr7 Date: Thu, 27 Dec 2018 10:58:30 +0530 Subject: [PATCH 4/5] Review comments handled --- docs/ddl-of-carbondata.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/ddl-of-carbondata.md b/docs/ddl-of-carbondata.md index 90153b77c21..6389b3b94f0 100644 --- a/docs/ddl-of-carbondata.md +++ b/docs/ddl-of-carbondata.md @@ -126,9 +126,11 @@ CarbonData DDL statements are documented here,which includes: By default inverted index is disabled as store size will be reduced, it can be enabled by using a table property. It might help to improve compression ratio and query speed, especially for low cardinality columns which are in reward position. Suggested use cases : For high cardinality columns, you can disable the inverted index for improving the data loading performance. + + **NOTE**: Columns specified in INVERTED_INDEX should also be present in SORT_COLUMNS. ``` - TBLPROPERTIES ('NO_INVERTED_INDEX'='column1', 'INVERTED_INDEX'='column2, column3') + TBLPROPERTIES ('SORT_COLUMNS'='column2,column3','NO_INVERTED_INDEX'='column1', 'INVERTED_INDEX'='column2, column3') ``` - ##### Sort Columns Configuration From 5c7645eceb877aad3ec0f29aac9da9375bbd0220 Mon Sep 17 00:00:00 2001 From: shardul-cr7 Date: Thu, 27 Dec 2018 18:56:41 +0530 Subject: [PATCH 5/5] Review Comments handled --- .../testsuite/longstring/VarcharDataTypesBasicTestCase.scala | 2 +- .../org/apache/spark/sql/catalyst/CarbonDDLSqlParser.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala index 2467d5aa44c..3148cac61b4 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala @@ -201,7 +201,7 @@ class VarcharDataTypesBasicTestCase extends QueryTest with BeforeAndAfterEach wi | TBLPROPERTIES('inverted_index'='note', 'long_string_columns'='note,description') |""".stripMargin) } - assert(exceptionCaught.getMessage.contains("should be present in SORT_COLUMN(s)")) + assert(exceptionCaught.getMessage.contains("INVERTED_INDEX column: note should be present in SORT_COLUMNS")) } private def prepareTable(): Unit = { diff --git a/integration/spark-common/src/main/scala/org/apache/spark/sql/catalyst/CarbonDDLSqlParser.scala b/integration/spark-common/src/main/scala/org/apache/spark/sql/catalyst/CarbonDDLSqlParser.scala index a46a8be22e7..7d5c170e5a7 100644 --- a/integration/spark-common/src/main/scala/org/apache/spark/sql/catalyst/CarbonDDLSqlParser.scala +++ b/integration/spark-common/src/main/scala/org/apache/spark/sql/catalyst/CarbonDDLSqlParser.scala @@ -378,7 +378,7 @@ abstract class CarbonDDLSqlParser extends AbstractCarbonSparkSQLParser { if (invertedIdxCols.nonEmpty) { invertedIdxCols.foreach { column => if (!sortKeyDims.contains(column)) { - val errMsg = "INVERTED INDEX column: " + column + " should be present in SORT_COLUMN(s)" + val errMsg = "INVERTED_INDEX column: " + column + " should be present in SORT_COLUMNS" throw new MalformedCarbonCommandException(errMsg) } }