-
Notifications
You must be signed in to change notification settings - Fork 2.5k
Description
Describe the problem you faced
When trying to write data to Hudi, my Spark application fails with the following error
java.lang.Exception: Could not sync using the meta sync class org.apache.hudi.hive.HiveSyncTool at com.trepp.zone.ZoneExecutionHelper.upsert(ZoneExecutionHelper.scala:101) at com.trepp.zone.Presentation.$anonfun$writeHudiObject$1(Presentation.scala:92) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at scala.util.Try$.apply(Try.scala:213) at com.trepp.zone.Presentation.writeHudiObject(Presentation.scala:81)
I am reading data from an Amazon S3 bucket and doing some transformation before writing data to hudi.
And I am using hudi-spark-bundle_2.12-0.11.0.jar, available on Maven Central, built for Scala 2.12
Expected behavior
Successfully able to write to the Hudi location
Environment Description
Scala version : 2.12.15
Hudi version : 0.11.0
Spark version : 3.2.1
Hadoop version : 3.2.1
Storage (HDFS/S3/GCS..) : S3
Running on Docker? (yes/no) : no
Running on AWS EMR version: 6.7.0
Additional context
Spark submit command:
- [Failing Script]: spark-submit --deploy-mode client --jars s3a://bucketpath/hudi/etl/hudi-spark-bundle_2.12-0.11.0.jar --driver-memory 6g --executor-memory 6g --executor-cores 4 --class com.trepp.TreppClient --master yarn --conf spark.files.maxPartitionBytes=268435456 --conf spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 --conf spark.dynamicAllocation.enabled=true s3://bucketpath/etl/2.0-SNAPSHOT/etl-2.0-SNAPSHOT-jar-with-dependencies.jar -z presentation -a dataload -b bucketpath -k config/presentationzone/clo/goldenSetHoldings.json -w overwrite
- [Passing Script]: spark-submit --deploy-mode client --jars s3a://bucketpath/hudi/etl/hudi-spark-bundle_2.12-0.11.0.jar --driver-memory 6g --executor-memory 6g --executor-cores 4 --class com.trepp.TreppClient --master yarn --conf spark.files.maxPartitionBytes=268435456 --conf spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 --conf spark.dynamicAllocation.enabled=true s3://bucketpath/etl/2.0-SNAPSHOT/etl-2.0-SNAPSHOT-jar-with-dependencies.jar -z presentation -a dataload -b bucketpath -k config/presentationzone/clo/accountbalances.json -w overwrite
Stacktrace
java.lang.Exception: Could not sync using the meta sync class org.apache.hudi.hive.HiveSyncTool at com.trepp.zone.ZoneExecutionHelper.upsert(ZoneExecutionHelper.scala:122) at com.trepp.zone.Presentation.$anonfun$writeHudiObject$1(Presentation.scala:92) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at scala.util.Try$.apply(Try.scala:213) at com.trepp.zone.Presentation.writeHudiObject(Presentation.scala:81) at com.trepp.process.Executor.$anonfun$writeObject$2(Executor.scala:136) at com.trepp.process.Executor.$anonfun$writeObject$2$adapted(Executor.scala:133) at scala.collection.immutable.List.foreach(List.scala:431) at com.trepp.process.Executor.$anonfun$writeObject$1(Executor.scala:133) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at scala.util.Try$.apply(Try.scala:213) at com.trepp.process.Executor.writeObject(Executor.scala:133) at com.trepp.process.Executor$$anon$2.$anonfun$accept$2(Executor.scala:118) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at scala.util.Try$.apply(Try.scala:213) at com.trepp.process.Executor$$anon$2.accept(Executor.scala:113) at com.trepp.process.Executor$$anon$2.accept(Executor.scala:111) at java.util.TreeMap.forEach(TreeMap.java:1005) at com.trepp.process.Executor.executeQuery(Executor.scala:111) at com.trepp.dataload.EtlImpl.$anonfun$executeProcess$3(EtlImpl.scala:43) at scala.util.Try$.apply(Try.scala:213) at com.trepp.dataload.EtlImpl.$anonfun$executeProcess$1(EtlImpl.scala:37) at com.trepp.dataload.EtlImpl.$anonfun$executeProcess$1$adapted(EtlImpl.scala:23) at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36) at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198) at com.trepp.dataload.EtlImpl.executeProcess(EtlImpl.scala:23) at com.trepp.TreppClient$.$anonfun$main$1(TreppClient.scala:46) at scala.util.Try$.apply(Try.scala:213) at 
com.trepp.TreppClient$.main(TreppClient.scala:40) at com.trepp.TreppClient.main(TreppClient.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52) at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:1000) at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180) at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203) at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90) at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1089) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1098) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Here is the list of all the config properties that get applied:
"classname" -> "org.apache.hudi" "hoodie.metadata.enable" -> "true" "hoodie.datasource.write.streaming.ignore.failed.batch" -> "true" "hoodie.populate.meta.fields" -> "true" "hoodie.table.metadata.partitions" -> "files" "hoodie.datasource.hive_sync.schema_string_length_thresh" -> "4000" "hoodie.datasource.hive_sync.use_jdbc" -> "false" "hoodie.write.lock.provider" -> "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider" "hoodie.meta.sync.metadata_file_listing" -> {Boolean@24191} true "hoodie.cleaner.commits.retained" -> "1" "hoodie.datasource.write.keygenerator.class" -> "org.apache.hudi.keygen.ComplexKeyGenerator" "hoodie.timeline.layout.version" -> "1" "hoodie.datasource.hive_sync.create_managed_table" -> "false" "hoodie.table.checksum" -> "3012481716" "hoodie.datasource.write.precombine.field" -> "dl_change_seq" "hoodie.table.base.file.format" -> "PARQUET" "hoodie.cleaner.policy.failed.writes" -> "EAGER" "hoodie.table.timeline.timezone" -> "LOCAL" "hoodie.datasource.write.recordkey.field" -> "instrumentid,asofperiod" "hoodie.datasource.write.drop.partition.columns" -> "false" "hoodie.datasource.meta.sync.base.path" -> "s3://trepp-developmentservices-lake/presentationZone/clo/clogoldenSetHoldings" "hoodie.datasource.hive_sync.sync_as_datasource" -> "true" "hoodie.datasource.hive_sync.password" -> "hive" "hoodie.datasource.hive_sync.username" -> "hive" "hoodie.clustering.async.enabled" -> "false" "hoodie.payload.ordering.field" -> "dl_change_seq" "hoodie.cleaner.policy" -> "KEEP_LATEST_COMMITS" "hoodie.datasource.write.row.writer.enable" -> "true" "hoodie.archivelog.folder" -> "archived" "hoodie.datasource.hive_sync.base_file_format" -> "PARQUET" "hoodie.write.lock.zookeeper.url" -> "ip-10-73-99-147.ec2.internal" "hoodie.datasource.hive_sync.database" -> "presentation_dev" "hoodie.datasource.hive_sync.table" -> "clogoldenSetHoldings" "hoodie.datasource.write.commitmeta.key.prefix" -> "_" "hoodie.table.version" -> "4" "hoodie.table.type" -> 
"COPY_ON_WRITE" "hoodie.datasource.meta.sync.enable" -> "false" "hoodie.datasource.hive_sync.partition_fields" -> "year,month" "hoodie.table.recordkey.fields" -> "instrumentid,asofperiod" "hoodie.datasource.hive_sync.metastore.uris" -> "thrift://localhost:9083" "hoodie.partition.metafile.use.base.format" -> "false" "hoodie.datasource.write.operation" -> "upsert" "hoodie.datasource.hive_sync.partition_extractor_class" -> "org.apache.hudi.hive.MultiPartKeysValueExtractor" "hoodie.datasource.hive_sync.mode" -> "hms" "hoodie.datasource.write.streaming.retry.interval.ms" -> "2000" "hoodie.write.lock.zookeeper.port" -> "2181" "hoodie.index.type" -> "BLOOM" "hoodie.datasource.write.partitionpath.urlencode" -> "false" "hoodie.datasource.write.table.type" -> "COPY_ON_WRITE" "hoodie.table.partition.fields" -> "year,month" "hoodie.write.concurrency.mode" -> "single_writer" "hoodie.meta_sync.spark.version" -> "3.2.1-amzn-0" "hoodie.database.name" -> "" "hoodie.table.name" -> "clogoldenSetHoldings" "hoodie.datasource.hive_sync.jdbcurl" -> "jdbc:hive2://localhost:10000" "path" -> "s3://trepp-developmentservices-lake/presentationZone/clo/clogoldenSetHoldings" "hoodie.meta.sync.client.tool.class" -> "org.apache.hudi.hive.HiveSyncTool" "hoodie.datasource.write.reconcile.schema" -> "false" "hoodie.clustering.inline" -> "false" "hoodie.datasource.hive_sync.enable" -> "true" "hoodie.datasource.write.streaming.retry.count" -> "3" "hoodie.upsert.shuffle.parallelism" -> "64" "hoodie.table.keygenerator.class" -> "org.apache.hudi.keygen.ComplexKeyGenerator" "hoodie.datasource.compaction.async.enable" -> "true" "hoodie.datasource.write.insert.drop.duplicates" -> "false" "hoodie.write.lock.zookeeper.base_path" -> "/hudi" "hoodie.table.precombine.field" -> "dl_change_seq" "hoodie.datasource.write.partitionpath.field" -> "year,month" "hoodie.datasource.write.payload.class" -> "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload" "hoodie.datasource.write.hive_style_partitioning" -> 
"true" "hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled" -> "false"