[KYUUBI #3942] adapt Spark's JDBC data types to Hive data type defini…

…tions in KyuubiHiveDialect ### _Why are the changes needed?_ To close #3942. Adapt Spark's JDBC data types to Hive data type definitions. 1. adapt `DoubleType` to "DOUBLE" for compatibility with Hive 2.1.x and below - "DOUBLE PRECISION", as mapped in [`JdbcUtils.getCommonJDBCType`](https://github.com/apache/spark/blob/v3.3.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L145), is a pure alias for DOUBLE and is only available starting with Hive 2.2.0. 2. adapt `IntegerType` to "INT" instead of "INTEGER" for compatibility with Hive 2.1.x and below - "INTEGER" is a synonym for INT only since Hive 2.2.0 3. replace the Spark data type mappings that Hive does not support with the correct Hive data type definitions: - FloatType to "FLOAT" instead of "REAL" - StringType to "STRING" instead of "TEXT" - BooleanType to "BOOLEAN" instead of "BIT(1)" - BinaryType to "BINARY" instead of "BLOB" - ByteType to "TINYINT" instead of "BYTE" For the Hive data type documentation, refer to https://cwiki.apache.org/confluence/display/hive/languagemanual+types . ### _How was this patch tested?_ - [x] Add some test cases that check the changes thoroughly including negative and positive cases if possible - [ ] Add screenshots for manual tests if appropriate - [x] [Run test](https://kyuubi.apache.org/docs/latest/develop_tools/testing.html#running-tests) locally before making a pull request Closes #3943 from bowenliang123/3942-hivetype. 
Closes #3942 10c3dde [liangbowen] Add ut for non adapted data types 5f6ce01 [liangbowen] Add links to doc and issues for Hive data type definition 1ae0fd1 [liangbowen] mapping IntegerType to "INT" instead of "INTEGER" for compatibility with Hive 2.1.x and below 13486bb [liangbowen] update ut cb0a053 [liangbowen] mapping ByteType to "TINYINT" c8a0282 [liangbowen] ut 2586258 [liangbowen] mapping BinaryType to "BINARY" e8b65a0 [liangbowen] adapt to Hive data type definitions 3f5cc75 [liangbowen] add ut 3a818da [liangbowen] comments a471466 [liangbowen] mapping FloatType to "FLOAT", BooleanType to "BOOLEAN" b9506dc [liangbowen] mapping DoubleType to "DOUBLE" and StringType to "STRING" Authored-by: liangbowen <liangbowen@gf.com.cn> Signed-off-by: Cheng Pan <chengpan@apache.org>
apache · Dec 9, 2022 · a07d234 · a07d234
1 parent 8305d80
commit a07d234
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 5 deletions.
diff --git a/...on-spark-jdbc-dialect/src/main/scala/org/apache/spark/sql/dialect/KyuubiHiveDialect.scala b/...on-spark-jdbc-dialect/src/main/scala/org/apache/spark/sql/dialect/KyuubiHiveDialect.scala
@@ -19,7 +19,9 @@ package org.apache.spark.sql.dialect
 
 import java.util.Locale
 
-import org.apache.spark.sql.jdbc.JdbcDialect
+import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
+import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcType}
+import org.apache.spark.sql.types._
 
 object KyuubiHiveDialect extends JdbcDialect {
 
@@ -34,4 +36,27 @@ object KyuubiHiveDialect extends JdbcDialect {
     colName.split('.').map(part => s"`$part`").mkString(".")
   }
 
+  /**
+   * Maps a Spark SQL data type to a JDBC type whose definition follows the Hive data types
+   * in https://cwiki.apache.org/confluence/display/hive/languagemanual+types .
+   * Data types without an explicit case here are delegated to JdbcUtils.getCommonJDBCType.
+   * @param dt DataType in Spark SQL
+   * @return JdbcType with type definition adapted to Hive
+   */
+  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
+    // [HIVE-14950] "INTEGER" is a synonym for "INT" only since Hive 2.2.0;
+    // use "INT" for compatibility with Hive 2.1.x and below
+    case IntegerType => Option(JdbcType("INT", java.sql.Types.INTEGER))
+    // [HIVE-13556] "DOUBLE PRECISION" is an alias for "DOUBLE" only since Hive 2.2.0;
+    // use "DOUBLE" for compatibility with Hive 2.1.x and below
+    case DoubleType => Option(JdbcType("DOUBLE", java.sql.Types.DOUBLE))
+    // Hive type names in place of the common REAL, BYTE, BIT(1), TEXT and BLOB definitions
+    case FloatType => Option(JdbcType("FLOAT", java.sql.Types.FLOAT))
+    case ByteType => Option(JdbcType("TINYINT", java.sql.Types.TINYINT))
+    case BooleanType => Option(JdbcType("BOOLEAN", java.sql.Types.BIT))
+    case StringType => Option(JdbcType("STRING", java.sql.Types.CLOB))
+    case BinaryType => Option(JdbcType("BINARY", java.sql.Types.BLOB))
+    case _ => JdbcUtils.getCommonJDBCType(dt)
+  }
+
 }
diff --git a/...ark-jdbc-dialect/src/test/scala/org/apache/spark/sql/dialect/KyuubiHiveDialectSuite.scala b/...ark-jdbc-dialect/src/test/scala/org/apache/spark/sql/dialect/KyuubiHiveDialectSuite.scala
@@ -17,19 +17,35 @@
 
 package org.apache.spark.sql.dialect
 
+import org.apache.spark.sql.dialect.KyuubiHiveDialect._
+import org.apache.spark.sql.types._
 // scalastyle:off
 import org.scalatest.funsuite.AnyFunSuite
 
 class KyuubiHiveDialectSuite extends AnyFunSuite {
 // scalastyle:on
 
   test("[KYUUBI #3489] Kyuubi Hive dialect: can handle jdbc url") {
-    assert(KyuubiHiveDialect.canHandle("jdbc:hive2://"))
-    assert(KyuubiHiveDialect.canHandle("jdbc:kyuubi://"))
+    assert(canHandle("jdbc:hive2://"))
+    assert(canHandle("jdbc:kyuubi://"))
   }
 
   test("[KYUUBI #3489] Kyuubi Hive dialect: quoteIdentifier") {
-    assertResult("`id`")(KyuubiHiveDialect.quoteIdentifier("id"))
-    assertResult("`table`.`id`")(KyuubiHiveDialect.quoteIdentifier("table.id"))
+    assertResult("`id`")(quoteIdentifier("id"))
+    assertResult("`table`.`id`")(quoteIdentifier("table.id"))
+  }
+
+  test("KYUUBI #3942 adapt to Hive data type definitions") {
+    // Fail with a clear message rather than NoSuchElementException if no mapping exists
+    def getJdbcTypeDefinition(dt: DataType): String =
+      getJDBCType(dt).map(_.databaseTypeDefinition).getOrElse(fail(s"no JDBC type for $dt"))
+    assertResult("INT")(getJdbcTypeDefinition(IntegerType))
+    assertResult("DOUBLE")(getJdbcTypeDefinition(DoubleType))
+    assertResult("FLOAT")(getJdbcTypeDefinition(FloatType))
+    assertResult("TINYINT")(getJdbcTypeDefinition(ByteType))
+    assertResult("BOOLEAN")(getJdbcTypeDefinition(BooleanType))
+    assertResult("STRING")(getJdbcTypeDefinition(StringType))
+    assertResult("BINARY")(getJdbcTypeDefinition(BinaryType))
+    assertResult("DATE")(getJdbcTypeDefinition(DateType)) // not adapted: common mapping
+  }
 }