From dc96f2f8d6e08c4bc30bc11d6b29109d2aeb604b Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Thu, 6 Aug 2020 08:35:59 +0000
Subject: [PATCH] [SPARK-32546][SQL] Get table names directly from Hive tables

### What changes were proposed in this pull request?
Get table names directly from a sequence of Hive tables in `HiveClientImpl.listTablesByType()` by skipping the conversion of Hive tables to Catalog tables.

### Why are the changes needed?
A Hive metastore can be shared across many clients. A client can create tables using a SerDe that is not available to other clients, for instance `ROW FORMAT SERDE "com.ibm.spss.hive.serde2.xml.XmlSerDe"`. In the current implementation, other clients on which `com.ibm.spss.hive.serde2.xml.XmlSerDe` is not available get the following exception while listing views:
```
java.lang.RuntimeException: MetaException(message:java.lang.ClassNotFoundException Class com.ibm.spss.hive.serde2.xml.XmlSerDe not found)
```

### Does this PR introduce _any_ user-facing change?
Yes. For example, `SHOW VIEWS` returns a list of views instead of throwing an exception.

### How was this patch tested?
- By existing test suites like:
```
$ build/sbt -Phive-2.3 "test:testOnly org.apache.spark.sql.hive.client.VersionsSuite"
```
- And manually:

1. Build Spark with Hive 1.2: `./build/sbt package -Phive-1.2 -Phive -Dhadoop.version=2.8.5`
2. Run `spark-shell` with a custom Hive SerDe on the classpath; for instance, download [json-serde-1.3.8-jar-with-dependencies.jar](https://github.com/cdamak/Twitter-Hive/blob/master/json-serde-1.3.8-jar-with-dependencies.jar) from https://github.com/cdamak/Twitter-Hive:
```
$ ./bin/spark-shell --jars ../Downloads/json-serde-1.3.8-jar-with-dependencies.jar
```
3. Create a Hive table using this SerDe:
```scala
scala> :paste
// Entering paste mode (ctrl-D to finish)

sql(s"""
  |CREATE TABLE json_table2(page_id INT NOT NULL)
  |ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
  |""".stripMargin)

// Exiting paste mode, now interpreting.

res0: org.apache.spark.sql.DataFrame = []

scala> sql("SHOW TABLES").show
+--------+-----------+-----------+
|database|  tableName|isTemporary|
+--------+-----------+-----------+
| default|json_table2|      false|
+--------+-----------+-----------+

scala> sql("SHOW VIEWS").show
+---------+--------+-----------+
|namespace|viewName|isTemporary|
+---------+--------+-----------+
+---------+--------+-----------+
```
4. Quit the current `spark-shell` and start it again without the extra jar:
```
$ ./bin/spark-shell
```
5. Show views. Without the fix, the command throws an exception:
```scala
scala> sql("SHOW VIEWS").show
20/08/06 10:53:36 ERROR log: error in initSerDe: java.lang.ClassNotFoundException Class org.openx.data.jsonserde.JsonSerDe not found
java.lang.ClassNotFoundException: Class org.openx.data.jsonserde.JsonSerDe not found
  at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2273)
  at org.apache.hadoop.hive.metastore.MetaStoreUtils.getDeserializer(MetaStoreUtils.java:385)
  at org.apache.hadoop.hive.ql.metadata.Table.getDeserializerFromMetaStore(Table.java:276)
  at org.apache.hadoop.hive.ql.metadata.Table.getDeserializer(Table.java:258)
  at org.apache.hadoop.hive.ql.metadata.Table.getCols(Table.java:605)
```
After the fix:
```scala
scala> sql("SHOW VIEWS").show
+---------+--------+-----------+
|namespace|viewName|isTemporary|
+---------+--------+-----------+
+---------+--------+-----------+
```
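For readers less familiar with this code path, the following is a tiny self-contained Scala model of the idea behind the change, not Spark or Hive code; `RawHiveTable`, its fields, and the failing `columns` method are hypothetical stand-ins for the raw Hive `Table` metadata and its SerDe-loading accessors. It shows why filtering on the raw table type and returning only names never triggers the `ClassNotFoundException`:
```scala
// Minimal sketch (hypothetical types): listing tables of a given type should
// only read raw metadata fields, never the column/SerDe machinery.
object ListViewsSketch {
  // Stand-in for org.apache.hadoop.hive.ql.metadata.Table.
  final case class RawHiveTable(name: String, tableType: String, serdeClass: String) {
    // Stand-in for Table.getCols, which loads the SerDe class and fails
    // when that class is missing from the client's classpath.
    def columns: Seq[String] =
      if (serdeClass.startsWith("org.openx")) {
        throw new RuntimeException(s"ClassNotFoundException Class $serdeClass not found")
      } else {
        Seq("page_id")
      }
  }

  // Fallback in the spirit of this patch: filter by raw table type, return names,
  // and never touch `columns` (and therefore never load any SerDe).
  def listTablesByType(tables: Seq[RawHiveTable], tableType: String): Seq[String] =
    tables.filter(_.tableType == tableType).map(_.name)

  def main(args: Array[String]): Unit = {
    val tables = Seq(
      RawHiveTable("json_table2", "MANAGED_TABLE", "org.openx.data.jsonserde.JsonSerDe"),
      RawHiveTable("v1", "VIRTUAL_VIEW", "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))
    println(listTablesByType(tables, "VIRTUAL_VIEW")) // List(v1), no SerDe loading
  }
}
```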
Closes #29363 from MaxGekk/fix-listTablesByType-for-views.

Authored-by: Max Gekk
Signed-off-by: Wenchen Fan
---
 .../org/apache/spark/sql/hive/client/HiveClientImpl.scala | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 3f70387a3b058..58ad5449b49fd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -759,15 +759,17 @@ private[hive] class HiveClientImpl(
   override def listTablesByType(
       dbName: String,
       pattern: String,
       tableType: CatalogTableType): Seq[String] = withHiveState {
+    val hiveTableType = toHiveTableType(tableType)
     try {
       // Try with Hive API getTablesByType first, it's supported from Hive 2.3+.
-      shim.getTablesByType(client, dbName, pattern, toHiveTableType(tableType))
+      shim.getTablesByType(client, dbName, pattern, hiveTableType)
     } catch {
       case _: UnsupportedOperationException =>
         // Fallback to filter logic if getTablesByType not supported.
         val tableNames = client.getTablesByPattern(dbName, pattern).asScala
-        val tables = getTablesByName(dbName, tableNames.toSeq).filter(_.tableType == tableType)
-        tables.map(_.identifier.table)
+        getRawTablesByName(dbName, tableNames)
+          .filter(_.getTableType == hiveTableType)
+          .map(_.getTableName)
     }
   }
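As a side note, not part of the patch itself: the user-facing check from the manual test steps above can also be driven from a small application instead of `spark-shell`. This is a hedged sketch; the app/class name and the `default` database are placeholders, and it assumes a Spark build with Hive support pointing at the shared metastore:
```scala
import org.apache.spark.sql.SparkSession

// Hypothetical verification app for the SPARK-32546 behavior.
object ShowViewsCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("SPARK-32546 manual check")
      .enableHiveSupport()
      .getOrCreate()
    // With this patch, SHOW VIEWS succeeds even though another table in the same
    // database (e.g. json_table2) uses a SerDe class missing from this classpath.
    spark.sql("SHOW VIEWS IN default").show()
    spark.stop()
  }
}
```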