I am using hudi-spark-bundle-0.5.1-SNAPSHOT.jar on EMR and hitting the exception below during hiveSync.
We are using the AWS Glue catalog as the Hive metastore.
The Hive table itself does get created; I can see it in Hive, but it contains no data.
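For reference, a check like the following from the same spark-shell session (same placeholders as below) confirms the table resolves but holds no rows:

spark.sql("select count(*) from <hiveDB>.<tableName>").show()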
> org.apache.hudi.hive.HoodieHiveSyncException: Failed to sync partitions for table <tableName>
> at org.apache.hudi.hive.HiveSyncTool.syncPartitions(HiveSyncTool.java:172)
> at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:107)
> at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:67)
> at org.apache.hudi.HoodieSparkSqlWriter$.syncHive(HoodieSparkSqlWriter.scala:235)
> at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:169)
> at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:91)
> at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
> at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
> at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
> at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
> at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
> at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
> at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
> at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
> at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
> at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
> at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
> at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
> at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:668)
> at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
> at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
> at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
> ... 69 elided
> Caused by: org.apache.hudi.org.apache.hadoop_hive.metastore.api.NoSuchObjectException: <hiveDB>.<tableName> table not found
> at org.apache.hudi.org.apache.hadoop_hive.metastore.api.ThriftHiveMetastore$get_partitions_result$get_partitions_resultStandardScheme.read(ThriftHiveMetastore.java)
> at org.apache.hudi.org.apache.hadoop_hive.metastore.api.ThriftHiveMetastore$get_partitions_result$get_partitions_resultStandardScheme.read(ThriftHiveMetastore.java)
> at org.apache.hudi.org.apache.hadoop_hive.metastore.api.ThriftHiveMetastore$get_partitions_result.read(ThriftHiveMetastore.java)
> at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:86)
> at org.apache.hudi.org.apache.hadoop_hive.metastore.api.ThriftHiveMetastore$Client.recv_get_partitions(ThriftHiveMetastore.java:2377)
> at org.apache.hudi.org.apache.hadoop_hive.metastore.api.ThriftHiveMetastore$Client.get_partitions(ThriftHiveMetastore.java:2362)
> at org.apache.hudi.org.apache.hadoop_hive.metastore.HiveMetaStoreClient.listPartitions(HiveMetaStoreClient.java:1162)
> at org.apache.hudi.hive.HoodieHiveClient.scanTablePartitions(HoodieHiveClient.java:240)
> at org.apache.hudi.hive.HiveSyncTool.syncPartitions(HiveSyncTool.java:162)
> ... 95 more
>
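From the trace, the CREATE TABLE step evidently succeeded (the sync reaches syncPartitions), but the partition scan in HoodieHiveClient.scanTablePartitions goes through the bundled Thrift HiveMetaStoreClient, and that client cannot find <hiveDB>.<tableName>. A rough way to approximate that lookup from the shell, with the caveat that Spark resolves it through its own catalog (Glue here) rather than through Hudi's bundled client:

spark.sql("show partitions <hiveDB>.<tableName>").show(false)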
Below is the code:
spark-shell --master yarn --deploy-mode client --conf spark.shuffle.spill=true \
--conf spark.scheduler.mode=FIFO \
--conf spark.executor.extraJavaOptions="-XX:MaxPermSize=1024m -Dconfig.resource=spark-defaults.conf" \
--conf spark.sql.planner.externalSort=true --conf spark.shuffle.manager=sort \
--conf spark.ui.port=8088 --conf spark.executor.memoryOverhead=2g \
--conf spark.rpc.message.maxSize=1024 --conf spark.file.transferTo=false \
--conf spark.driver.maxResultSize=3g --conf spark.rdd.compress=true \
--conf spark.executor.extraJavaOptions="-Dconfig.resource=spark-defaults.conf" \
--conf spark.driver.JavaOptions="-Dspark.yarn.app.container.log.dir=/mnt/var/log/hadoop" \
--conf spark.driver.extraJavaOptions="-Dconfig.file=spark-defaults.conf" \
--conf spark.sql.parquet.writeLegacyFormat=true \
--conf spark.dynamicAllocation.enabled=true \
--conf spark.dynamicAllocation.maxExecutors=10 \
--conf spark.dynamicAllocation.minExecutors=1 \
--conf spark.executor.cores=5 \
--conf spark.executor.memory=3g --conf spark.driver.memory=2g \
--conf spark.executor.instances=4 --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--name gwpl_staging_load_hudi \
--files /etc/spark/conf/hive-site.xml \
--properties-file /usr/lib/spark/conf/spark-defaults.conf \
--jars /home/hadoop/hudi/hudi-spark-bundle-0.5.1-SNAPSHOT.jar
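Once the shell is up, a quick diagnostic to confirm which metastore configuration was picked up from the shipped hive-site.xml (on EMR with Glue, hive.metastore.client.factory.class is expected to be com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory):

// Diagnostic only: print the metastore settings this session is using.
val hadoopConf = spark.sparkContext.hadoopConfiguration
println(hadoopConf.get("hive.metastore.uris"))
println(hadoopConf.get("hive.metastore.client.factory.class"))

Then the job itself: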
import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.spark.sql._
import org.apache.spark.sql.SaveMode._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.joda.time.format.DateTimeFormat
val stagePrefix = "stg_gwpl"
val harmonizedStageDB = "<hiveDB>"
val harmonizedstagePath = "s3://****/**"
val table = "pc_policy"
val incrementalData = spark.sql("select * from <hivetable> limit 100").cache
incrementalData.write.
  format("org.apache.hudi").
  option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "ID").
  option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "ingestiondt").
  option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "UpdateTime").
  option(HoodieWriteConfig.TABLE_NAME, stagePrefix + "_hudi_" + table).
  option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://hiveserver:10000").
  option(DataSourceWriteOptions.HIVE_USER_OPT_KEY, "hive").
  option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY, "hive").
  option("hoodie.datasource.hive_sync.enable", true).
  option("hoodie.datasource.hive_sync.database", harmonizedStageDB).
  option("hoodie.datasource.hive_sync.table", stagePrefix + "_hudi_" + table).
  option("hoodie.datasource.hive_sync.partition_fields", "ingestiondt").
  mode(SaveMode.Overwrite).
  save(s"${harmonizedstagePath}/hudi/$table")
Please let me know if I can provide any more details.