Skip to content

org.apache.hudi.org.apache.hadoop_hive.metastore.api.NoSuchObjectException: <hivedb.tableName> table not found #954

@gfn9cho

Description

@gfn9cho

I am using hudi-spark-bundle-0.5.1-SNAPSHOT.jar on EMR and getting the exception below during Hive sync.
We are using the AWS Glue Data Catalog as the Hive metastore.
The Hive table itself is created successfully: I can see it in Hive, but it contains no data.

> org.apache.hudi.hive.HoodieHiveSyncException: Failed to sync partitions for table <tableName>
>   at org.apache.hudi.hive.HiveSyncTool.syncPartitions(HiveSyncTool.java:172)
>   at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:107)
>   at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:67)
>   at org.apache.hudi.HoodieSparkSqlWriter$.syncHive(HoodieSparkSqlWriter.scala:235)
>   at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:169)
>   at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:91)
>   at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
>   at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
>   at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
>   at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>   at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
>   at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
>   at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
>   at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
>   at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
>   at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
>   at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
>   at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
>   at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
>   at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:668)
>   at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
>   at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
>   at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
>   ... 69 elided
> Caused by: org.apache.hudi.org.apache.hadoop_hive.metastore.api.NoSuchObjectException: <hiveDB>.<tableName> table not found
>   at org.apache.hudi.org.apache.hadoop_hive.metastore.api.ThriftHiveMetastore$get_partitions_result$get_partitions_resultStandardScheme.read(ThriftHiveMetastore.java)
>   at org.apache.hudi.org.apache.hadoop_hive.metastore.api.ThriftHiveMetastore$get_partitions_result$get_partitions_resultStandardScheme.read(ThriftHiveMetastore.java)
>   at org.apache.hudi.org.apache.hadoop_hive.metastore.api.ThriftHiveMetastore$get_partitions_result.read(ThriftHiveMetastore.java)
>   at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:86)
>   at org.apache.hudi.org.apache.hadoop_hive.metastore.api.ThriftHiveMetastore$Client.recv_get_partitions(ThriftHiveMetastore.java:2377)
>   at org.apache.hudi.org.apache.hadoop_hive.metastore.api.ThriftHiveMetastore$Client.get_partitions(ThriftHiveMetastore.java:2362)
>   at org.apache.hudi.org.apache.hadoop_hive.metastore.HiveMetaStoreClient.listPartitions(HiveMetaStoreClient.java:1162)
>   at org.apache.hudi.hive.HoodieHiveClient.scanTablePartitions(HoodieHiveClient.java:240)
>   at org.apache.hudi.hive.HiveSyncTool.syncPartitions(HiveSyncTool.java:162)
>   ... 95 more
> 

Below is the code:

# Launch spark-shell with the Hudi bundle on YARN.
# Fixes applied to the original command:
#  * spark.executor.extraJavaOptions was passed twice; the second --conf wins,
#    so -XX:MaxPermSize=1024m was silently dropped. Both values are now merged
#    into a single setting.
#  * spark.driver.JavaOptions is not a recognized Spark property (its value
#    was ignored); it is merged into spark.driver.extraJavaOptions, which was
#    also effectively duplicated.
#  * spark.enable.dynamicAllocation is not a valid key — the correct property
#    is spark.dynamicAllocation.enabled — so dynamic allocation was never on.
#  * NOTE(review): spark.executor.instances together with dynamic allocation
#    causes Spark on YARN to ignore dynamic allocation; drop one of the two.
spark-shell --master yarn --deploy-mode client --conf spark.shuffle.spill=true \
 --conf spark.scheduler.mode=FIFO \
 --conf spark.executor.extraJavaOptions="-XX:MaxPermSize=1024m -Dconfig.resource=spark-defaults.conf" \
 --conf spark.sql.planner.externalSort=true --conf spark.shuffle.manager=sort \
 --conf spark.ui.port=8088 --conf spark.executor.memoryOverhead=2g \
 --conf spark.rpc.message.maxSize=1024 --conf spark.file.transferTo=false \
 --conf spark.driver.maxResultSize=3g --conf spark.rdd.compress=true \
 --conf spark.driver.extraJavaOptions="-Dconfig.file=spark-defaults.conf -Dspark.yarn.app.container.log.dir=/mnt/var/log/hadoop" \
 --conf spark.sql.parquet.writeLegacyFormat=true \
 --conf spark.dynamicAllocation.enabled=true \
 --conf spark.dynamicAllocation.maxExecutors=10 \
 --conf spark.dynamicAllocation.minExecutors=1 \
 --conf spark.executor.cores=5 \
 --conf spark.executor.memory=3g --conf spark.driver.memory=2g \
 --conf spark.executor.instances=4 --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
 --name gwpl_staging_load_hudi \
 --files /etc/spark/conf/hive-site.xml \
 --properties-file /usr/lib/spark/conf/spark-defaults.conf \
 --jars /home/hadoop/hudi/hudi-spark-bundle-0.5.1-SNAPSHOT.jar

import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.spark.sql._
import org.apache.spark.sql.SaveMode._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.joda.time.format.DateTimeFormat

// Write an incremental batch as a Hudi table and sync it to the Hive
// metastore. Fixes applied to the original snippet:
//  * harmonizedStageDB was assigned the bare placeholder <hiveDB>, which does
//    not compile in Scala; it must be a quoted String.
//  * HIVE_URL_OPT_KEY was "jdbc:hive2:hiveserver:10000" — a malformed
//    HiveServer2 JDBC URL missing the "//" authority separator, so the sync
//    tool cannot reach HiveServer2. Corrected to "jdbc:hive2://hiveserver:10000".
//  * The hive_sync options now use the DataSourceWriteOptions constants for
//    consistency with the other options (same keys, less typo-prone).
val stagePrefix = "stg_gwpl"
val harmonizedStageDB = "<hiveDB>"   // TODO: replace with the real Hive database name
val harmonizedstagePath = "s3://****/**"
val table = "pc_policy"
// Single source of truth for the Hudi/Hive table name (was repeated inline).
val hudiTableName = stagePrefix + "_hudi_" + table

// Sample incremental batch; replace <hivetable> with the real source table.
val incrementalData = spark.sql("select * from <hivetable> limit 100").cache

incrementalData.write.
format("org.apache.hudi").
option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "ID").
option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "ingestiondt").
option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "UpdateTime").
option(HoodieWriteConfig.TABLE_NAME, hudiTableName).
// HiveServer2 JDBC URLs require "//" after the scheme: jdbc:hive2://<host>:<port>
option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://hiveserver:10000").
option(DataSourceWriteOptions.HIVE_USER_OPT_KEY, "hive").
option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY, "hive").
option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true").
option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, harmonizedStageDB).
option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, hudiTableName).
option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "ingestiondt").
mode(SaveMode.Overwrite).
save(s"${harmonizedstagePath}/hudi/$table")

Please let me know if I can provide more details to it.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions