Skip to content

[SUPPORT] Hive 3 partitions #2661

@jonashartwig

Description

@jonashartwig

Steps to reproduce the behavior:

  1. start spark shell spark-shell \ --packages org.apache.hudi:hudi-spark-bundle_2.11:0.7.0 \ --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
  2. run spark code and create data:
import org.apache.spark.sql.Row
val rdd = spark.sparkContext.parallelize(List(Row(1, "a")))
import org.apache.spark.sql.types._
val df = spark.createDataFrame(rdd, StructType(List(StructField("int", IntegerType), StructField("string", StringType))))

import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
df.write.format("hudi").partitionBy("int").mode(Overwrite).option(TABLE_NAME, "hudi").option(PRECOMBINE_FIELD_OPT_KEY, "int").option(RECORDKEY_FIELD_OPT_KEY, "int").option(PARTITIONPATH_FIELD_OPT_KEY, "int").save("/data/test/swe/base/hudi/hudi")
  3. create the Hive table:
CREATE EXTERNAL TABLE test_swe_base.t_hudi_hudi (`string` STRING)
PARTITIONED BY (`int` int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hudi.hadoop.HoodieParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION '/data/test/swe/base/hudi/hudi';
  4. recover partitions:
msck repair table test_swe_base.t_hudi_hudi;

Expected behavior
repair table command succeeds and the table can be queried for 1 row

Environment Description

  • Hudi version : 0.7.0

  • Spark version : 2.4.5

  • Hive version : 3

  • Hadoop version : 3

  • Storage (HDFS/S3/GCS..) : HDFS

  • Running on Docker? (yes/no) : no
    Stacktrace
    2021-03-11 10:44:36,243 ERROR org.apache.hadoop.hive.metastore.HiveMetaStoreChecker: [HiveServer2-Background-Pool: Thread-34494]: org.apache.hadoop.hive.metastore.utils.MetastoreException: Invalid partition name hdfs://dl300cdppoc02.ddc.teliasonera.net:8020/data/test/swe/base/hudi/hudi/1
    2021-03-11 10:44:36,244 WARN org.apache.hadoop.hive.metastore.Msck: [HiveServer2-Background-Pool: Thread-34494]: Failed to run metacheck:
    org.apache.hadoop.hive.metastore.utils.MetastoreException: org.apache.hadoop.hive.metastore.utils.MetastoreException: Invalid partition name hdfs://dl300cdppoc02.ddc.teliasonera.net:8020/data/test/swe/base/hudi/hudi/1
    at org.apache.hadoop.hive.metastore.HiveMetaStoreChecker.checkPartitionDirs(HiveMetaStoreChecker.java:568) ~[hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.metastore.HiveMetaStoreChecker.checkPartitionDirs(HiveMetaStoreChecker.java:447) ~[hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.metastore.HiveMetaStoreChecker.findUnknownPartitions(HiveMetaStoreChecker.java:380) ~[hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.metastore.HiveMetaStoreChecker.checkTable(HiveMetaStoreChecker.java:353) ~[hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.metastore.HiveMetaStoreChecker.checkTable(HiveMetaStoreChecker.java:273) ~[hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.metastore.HiveMetaStoreChecker.checkMetastore(HiveMetaStoreChecker.java:139) ~[hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.metastore.Msck.repair(Msck.java:121) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.ddl.misc.msck.MsckOperation.execute(MsckOperation.java:74) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.ddl.DDLTask.execute(DDLTask.java:80) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:213) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:105) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.Executor.launchTask(Executor.java:357) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.Executor.launchTasks(Executor.java:330) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.Executor.runTasks(Executor.java:246) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.Executor.execute(Executor.java:109) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:740) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.Driver.run(Driver.java:495) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.Driver.run(Driver.java:489) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:166) [hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hive.service.cli.operation.SQLOperation.runQuery(SQLOperation.java:225) [hive-service-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    ]
    at org.apache.hive.service.cli.operation.SQLOperation.access$700(SQLOperation.java:87) [hive-service-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hive.service.cli.operation.SQLOperation$BackgroundWork$1.run(SQLOperation.java:322) [hive-service-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at java.security.AccessController.doPrivileged(Native Method) [?:1.8.0_232]
    at javax.security.auth.Subject.doAs(Subject.java:422) [?:1.8.0_232]
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1898) [hadoop-common-3.1.1.7.1.4.0-203.jar:?]
    at org.apache.hive.service.cli.operation.SQLOperation$BackgroundWork.run(SQLOperation.java:340) [hive-service-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [?:1.8.0_232]
    at java.util.concurrent.FutureTask.run(FutureTask.java:266) [?:1.8.0_232]
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [?:1.8.0_232]
    at java.util.concurrent.FutureTask.run(FutureTask.java:266) [?:1.8.0_232]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_232]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_232]
    at java.lang.Thread.run(Thread.java:748) [?:1.8.0_232]
    Caused by: org.apache.hadoop.hive.metastore.utils.MetastoreException: Invalid partition name hdfs://dl300cdppoc02.ddc.teliasonera.net:8020/data/test/swe/base/hudi/hudi/1
    at org.apache.hadoop.hive.metastore.HiveMetaStoreChecker$PathDepthInfoCallable.logOrThrowExceptionWithMsg(HiveMetaStoreChecker.java:519) ~[hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.metastore.HiveMetaStoreChecker$PathDepthInfoCallable.processPathDepthInfo(HiveMetaStoreChecker.java:500) ~[hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.metastore.HiveMetaStoreChecker$PathDepthInfoCallable.call(HiveMetaStoreChecker.java:470) ~[hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    at org.apache.hadoop.hive.metastore.HiveMetaStoreChecker$PathDepthInfoCallable.call(HiveMetaStoreChecker.java:452) ~[hive-exec-3.1.3000.7.1.4.0-203.jar:3.1.3000.7.1.4.0-203]
    ... 4 more

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions