6 changes: 0 additions & 6 deletions common/utils/src/main/resources/error/error-classes.json
@@ -1990,12 +1990,6 @@
},
"sqlState" : "HY000"
},
"INVALID_HIVE_COLUMN_NAME" : {
"message" : [
"Cannot create the table <tableName> having the column <columnName> whose name contains invalid characters <invalidChars> in Hive metastore."
],
"sqlState" : "42K05"
},
"INVALID_IDENTIFIER" : {
"message" : [
"The identifier <ident> is invalid. Please, consider quoting it with back-quotes as `<ident>`."
6 changes: 0 additions & 6 deletions docs/sql-error-conditions.md
@@ -1182,12 +1182,6 @@ The handle `<handle>` is invalid.

For more details see [INVALID_HANDLE](sql-error-conditions-invalid-handle-error-class.html)

### INVALID_HIVE_COLUMN_NAME

[SQLSTATE: 42K05](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)

Cannot create the table `<tableName>` having the column `<columnName>` whose name contains invalid characters `<invalidChars>` in Hive metastore.

### INVALID_IDENTIFIER

[SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -42,13 +42,12 @@ import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.types.DataTypeUtils
import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils}
import org.apache.spark.sql.catalyst.util.TypeUtils.{toSQLId, toSQLValue}
import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.execution.datasources.{PartitioningUtils, SourceOptions}
import org.apache.spark.sql.hive.client.HiveClient
import org.apache.spark.sql.internal.HiveSerDe
import org.apache.spark.sql.internal.StaticSQLConf._
import org.apache.spark.sql.types.{AnsiIntervalType, ArrayType, DataType, MapType, StructType, TimestampNTZType}
import org.apache.spark.sql.types._

/**
* A persistent implementation of the system catalog using Hive.
@@ -150,51 +149,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
}
}

/**
* Checks the validity of data column names. Hive metastore disallows the table to use some
* special characters (',', ':', and ';') in data column names, including nested column names.
* Partition columns do not have such a restriction. Views do not have such a restriction.
*/
private def verifyDataSchema(
tableName: TableIdentifier, tableType: CatalogTableType, dataSchema: StructType): Unit = {
if (tableType != VIEW) {
val invalidChars = Seq(",", ":", ";")
def verifyNestedColumnNames(schema: StructType): Unit = schema.foreach { f =>
f.dataType match {
case st: StructType => verifyNestedColumnNames(st)
case _ if invalidChars.exists(f.name.contains) =>
val invalidCharsString = invalidChars.map(c => s"'$c'").mkString(", ")
throw new AnalysisException(
errorClass = "INVALID_HIVE_COLUMN_NAME",
messageParameters = Map(
"invalidChars" -> invalidCharsString,
"tableName" -> toSQLId(tableName.nameParts),
"columnName" -> toSQLId(f.name)
))
case _ =>
}
}

dataSchema.foreach { f =>
f.dataType match {
// Checks top-level column names
case _ if f.name.contains(",") =>
throw new AnalysisException(
errorClass = "INVALID_HIVE_COLUMN_NAME",
messageParameters = Map(
"invalidChars" -> toSQLValue(","),
"tableName" -> toSQLId(tableName.nameParts),
"columnName" -> toSQLId(f.name)
))
// Checks nested column names
case st: StructType =>
verifyNestedColumnNames(st)
case _ =>
}
}
}
}

// --------------------------------------------------------------------------
// Databases
// --------------------------------------------------------------------------
@@ -260,8 +214,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
val table = tableDefinition.identifier.table
requireDbExists(db)
verifyTableProperties(tableDefinition)
verifyDataSchema(
tableDefinition.identifier, tableDefinition.tableType, tableDefinition.dataSchema)

if (tableExists(db, table) && !ignoreIfExists) {
throw new TableAlreadyExistsException(db = db, table = table)
@@ -711,7 +663,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
newDataSchema: StructType): Unit = withClient {
requireTableExists(db, table)
val oldTable = getTable(db, table)
verifyDataSchema(oldTable.identifier, oldTable.tableType, newDataSchema)
val schemaProps =
tableMetaToTableProps(oldTable, StructType(newDataSchema ++ oldTable.partitionSchema)).toMap

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -34,10 +34,10 @@ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
import org.apache.spark.sql.connector.catalog.CatalogManager
import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME
import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER
import org.apache.spark.sql.errors.DataTypeErrors.toSQLType
import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils}
import org.apache.spark.sql.execution.datasources.orc.OrcCompressionCodec
import org.apache.spark.sql.execution.datasources.parquet.{ParquetCompressionCodec, ParquetFooterReader}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils}
import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET}
import org.apache.spark.sql.hive.orc.OrcFileOperator
@@ -2876,22 +2876,39 @@ class HiveDDLSuite
}
}

test("SPARK-24681 checks if nested column names do not include ',', ':', and ';'") {
Seq("nested,column", "nested:column", "nested;column").foreach { nestedColumnName =>
test("SPARK-47101 checks if nested column names do not include invalid characters") {
// delimiter characters
Seq(",", ":").foreach { c =>
val typ = s"array<struct<`abc${c}xyz`:int>>"
// The regex is from HiveClientImpl.getSparkSQLDataType, please keep them in sync.
val replaced = typ.replaceAll("`", "").replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
Review (Member): Does this replace rule come from Hive? Can we have a link?
Review (Author): OK
Review (Contributor): I feel it's clearer to write the string literal of the replaced value, instead of using this complex regex.
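
To make the exchange concrete, here is a standalone sketch (mine, not from the PR) of what the two-step rewrite produces when the field name contains a ','; the regex is assumed to match the one in HiveClientImpl.getSparkSQLDataType:

```scala
// Standalone sketch: the user's backticks are stripped first, then the regex
// re-quotes whatever sits between "struct<" (or ",") and ":". A ',' inside the
// name splits it, so only the trailing fragment is re-quoted.
object QuotingRegexDemo extends App {
  val typ = "array<struct<`abc,xyz`:int>>"
  val replaced = typ
    .replaceAll("`", "")
    .replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
  println(replaced) // array<struct<abc,`xyz`:int>> -- no longer a valid type string
}
```

The mangled string is what later fails to parse, which is why the test below expects CANNOT_RECOGNIZE_HIVE_TYPE rather than a dedicated column-name error.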

withTable("t") {
checkError(
exception = intercept[SparkException] {
sql(s"CREATE TABLE t (a $typ) USING hive")
Review (Contributor): For parquet tables, do we still have this error?
Review (Author): Still fine.

},
errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE",
Review (Contributor): Just for my education, where do we throw this error?
Review (Author): In HiveClientImpl.getSparkSQLDataType.
Review (Member): Got it. We can reuse the existing error class in this case.
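
For readers following the thread, a rough sketch of that error path (hedged; the real method in HiveClientImpl may differ in detail):

```scala
import org.apache.hadoop.hive.metastore.api.FieldSchema
import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.types.DataType

object HiveTypeParseSketch {
  // Rough sketch, not verbatim Spark source: the type string read back from the
  // metastore is parsed into a Catalyst DataType; if the quoting step produced
  // an unparseable string, the failure surfaces as CANNOT_RECOGNIZE_HIVE_TYPE.
  def getSparkSQLDataType(hc: FieldSchema): DataType = {
    try {
      CatalystSqlParser.parseDataType(hc.getType)
    } catch {
      case e: ParseException =>
        throw QueryExecutionErrors.cannotRecognizeHiveTypeError(e, hc.getType, hc.getName)
    }
  }
}
```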

parameters = Map(
"fieldType" -> toSQLType(replaced),
"fieldName" -> "`a`")
)
}
}
// other special characters
Seq(";", "^", "\\", "/", "%").foreach { c =>
val typ = s"array<struct<`abc${c}xyz`:int>>"
val replaced = typ.replaceAll("`", "")
val msg = s"java.lang.IllegalArgumentException: Error: : expected at the position " +
s"16 of '$replaced' but '$c' is found."
withTable("t") {
checkError(
exception = intercept[AnalysisException] {
spark.range(1)
.select(struct(lit(0).as(nestedColumnName)).as("toplevel"))
.write
.format("hive")
.saveAsTable("t")
sql(s"CREATE TABLE t (a $typ) USING hive")
},
errorClass = "INVALID_HIVE_COLUMN_NAME",
errorClass = "_LEGACY_ERROR_TEMP_3065",
Review (Member): Why does this PR switch from INVALID_HIVE_COLUMN_NAME to _LEGACY_ERROR_TEMP_3065? Can we exclude the deletion of INVALID_HIVE_COLUMN_NAME from this PR?
  • docs/sql-error-conditions.md
  • common/utils/src/main/resources/error/error-classes.json
Review (Author): INVALID_HIVE_COLUMN_NAME is not necessary anymore: 1) the restrictions on column names have been removed in this PR, and 2) nested field names belong to the data type part. For these two reasons, INVALID_HIVE_COLUMN_NAME can be removed. _LEGACY_ERROR_TEMP_3065 is thrown by org.apache.spark.sql.hive.HiveExternalCatalog#withClient; it's hard to distinguish one Hive error from another for metastore API calls.
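
A hedged sketch of that wrapping (mine; the real HiveExternalCatalog.withClient differs in detail):

```scala
// Hedged sketch, not the actual Spark source: metastore calls run inside
// withClient, which converts a HiveException into the generic legacy error,
// keeping only the exception class name and message -- hence the switch to
// _LEGACY_ERROR_TEMP_3065 once the Spark-side check is gone.
private def withClient[T](body: => T): T = {
  try {
    body
  } catch {
    case e: org.apache.hadoop.hive.ql.metadata.HiveException =>
      throw new org.apache.spark.sql.AnalysisException(
        errorClass = "_LEGACY_ERROR_TEMP_3065",
        messageParameters = Map(
          "clazz" -> e.getClass.getName,
          "msg" -> Option(e.getMessage).getOrElse("")))
  }
}
```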

parameters = Map(
"invalidChars" -> "',', ':', ';'",
"tableName" -> "`spark_catalog`.`default`.`t`",
"columnName" -> s"`$nestedColumnName`")
"clazz" -> "org.apache.hadoop.hive.ql.metadata.HiveException",
"msg" -> msg)
)
}
}
@@ -3385,24 +3402,11 @@ class HiveDDLSuite
}
}

test("SPARK-44911: Create the table with invalid column") {
test("SPARK-47101: comma is allowed in column name") {
val tbl = "t1"
withTable(tbl) {
val e = intercept[AnalysisException] {
sql(
s"""
|CREATE TABLE t1
|STORED AS parquet
|SELECT id, DATE'2018-01-01' + MAKE_DT_INTERVAL(0, id) FROM RANGE(0, 10)
""".stripMargin)
}
checkError(e,
errorClass = "INVALID_HIVE_COLUMN_NAME",
parameters = Map(
"invalidChars" -> "','",
"tableName" -> "`spark_catalog`.`default`.`t1`",
"columnName" -> "`DATE '2018-01-01' + make_dt_interval(0, id, 0, 0`.`000000)`")
)
sql("CREATE TABLE t1 STORED AS parquet SELECT id as `a,b` FROM range(1)")
Review (Member): Thank you for adding this simpler version. However, if you don't mind, shall we keep the existing test case, too?
  SELECT id, DATE'2018-01-01' + MAKE_DT_INTERVAL(0, id) FROM RANGE(0, 10)
Review (Author): Hi @dongjoon-hyun, this is changed via a request from @cloud-fan in #45180 (comment).
Review (Member): Ah, got it~

checkAnswer(sql("SELECT * FROM t1"), Row(0))
}
}
}
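
Since the restriction itself is gone, the SPARK-44911 query quoted in the thread above should now also succeed; a hedged sketch of such a test (not part of the PR):

```scala
// Hedged sketch: the CTAS that SPARK-44911 used to reject -- its auto-generated
// column alias contains a comma -- should now create the table successfully.
test("SPARK-47101: auto-generated alias containing a comma is allowed (sketch)") {
  withTable("t2") {
    sql(
      """
        |CREATE TABLE t2
        |STORED AS parquet
        |SELECT id, DATE'2018-01-01' + MAKE_DT_INTERVAL(0, id) FROM RANGE(0, 10)
      """.stripMargin)
    assert(spark.table("t2").count() == 10)
  }
}
```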
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -2141,41 +2141,6 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi
}
}

test("Auto alias construction of get_json_object") {
val df = Seq(("1", """{"f1": "value1", "f5": 5.23}""")).toDF("key", "jstring")

withTable("t") {
val e = intercept[AnalysisException] {
df.select($"key", functions.get_json_object($"jstring", "$.f1"))
.write.format("hive").saveAsTable("t")
}
checkError(e,
errorClass = "INVALID_HIVE_COLUMN_NAME",
parameters = Map(
"invalidChars" -> "','",
"tableName" -> "`spark_catalog`.`default`.`t`",
"columnName" -> "`get_json_object(jstring, $`.`f1)`")
)
}

withTempView("tempView") {
withTable("t") {
df.createTempView("tempView")
val e = intercept[AnalysisException] {
sql("CREATE TABLE t USING hive AS " +
"SELECT key, get_json_object(jstring, '$.f1') FROM tempView")
}
checkError(e,
errorClass = "INVALID_HIVE_COLUMN_NAME",
parameters = Map(
"invalidChars" -> "','",
"tableName" -> "`spark_catalog`.`default`.`t`",
"columnName" -> "`get_json_object(jstring, $`.`f1)`")
)
}
}
}

test("SPARK-19912 String literals should be escaped for Hive metastore partition pruning") {
withTable("spark_19912") {
Seq(