[HUDI-4081][HUDI-4472] Addressing Spark SQL vs Spark DS performance gap #6213
Changes from all commits — diff of HoodieSparkSqlWriter.scala:
```diff
@@ -22,7 +22,7 @@ import org.apache.avro.generic.GenericRecord
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hudi.DataSourceWriteOptions._
-import org.apache.hudi.HoodieConversionUtils.toProperties
+import org.apache.hudi.HoodieConversionUtils.{toProperties, toScalaOption}
 import org.apache.hudi.HoodieWriterUtils._
 import org.apache.hudi.avro.HoodieAvroUtils
 import org.apache.hudi.client.{HoodieWriteResult, SparkRDDWriteClient}
```
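The newly imported `toScalaOption` is used further down to bridge Hudi's Java-style `org.apache.hudi.common.util.Option` into a Scala `Option`. As a minimal sketch of what such a bridge typically looks like (the actual `HoodieConversionUtils` implementation may differ in details):

```scala
import org.apache.hudi.common.util.{Option => HOption}

// Sketch of a Java-to-Scala Option bridge like HoodieConversionUtils.toScalaOption;
// hypothetical rendition, not copied from Hudi's source.
def toScalaOption[T](opt: HOption[T]): Option[T] =
  if (opt.isPresent) Some(opt.get) else None
```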
```diff
@@ -31,7 +31,7 @@ import org.apache.hudi.common.fs.FSUtils
 import org.apache.hudi.common.model._
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline
 import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver}
-import org.apache.hudi.common.util.{CommitUtils, StringUtils}
+import org.apache.hudi.common.util.{CommitUtils, Functions, StringUtils}
 import org.apache.hudi.config.HoodieBootstrapConfig.{BASE_PATH, INDEX_CLASS_NAME}
 import org.apache.hudi.config.{HoodieInternalConfig, HoodieWriteConfig}
 import org.apache.hudi.exception.HoodieException
```
```diff
@@ -72,8 +72,7 @@ object HoodieSparkSqlWriter {
             hoodieTableConfigOpt: Option[HoodieTableConfig] = Option.empty,
             hoodieWriteClient: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] = Option.empty,
             asyncCompactionTriggerFn: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]] => Unit] = Option.empty,
-            asyncClusteringTriggerFn: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]] => Unit] = Option.empty
-           )
+            asyncClusteringTriggerFn: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]] => Unit] = Option.empty)
   : (Boolean, common.util.Option[String], common.util.Option[String], common.util.Option[String],
     SparkRDDWriteClient[HoodieRecordPayload[Nothing]], HoodieTableConfig) = {
```
```diff
@@ -241,39 +240,49 @@ object HoodieSparkSqlWriter {
     sparkContext.getConf.registerKryoClasses(
       Array(classOf[org.apache.avro.generic.GenericData],
         classOf[org.apache.avro.Schema]))
-    var schema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace)
-    val lastestSchema = getLatestTableSchema(fs, basePath, sparkContext, schema)
+    // TODO(HUDI-4472) revisit and simplify schema handling
+    val sourceSchema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace)
+    val latestTableSchema = getLatestTableSchema(fs, basePath, sparkContext).getOrElse(sourceSchema)
+
+    val schemaEvolutionEnabled = parameters.getOrDefault(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key(), "false").toBoolean
     var internalSchemaOpt = getLatestTableInternalSchema(fs, basePath, sparkContext)
-    if (reconcileSchema && parameters.getOrDefault(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key(), "false").toBoolean
-      && internalSchemaOpt.isEmpty) {
-      // force apply full schema evolution.
-      internalSchemaOpt = Some(AvroInternalSchemaConverter.convert(schema))
-    }
-    if (reconcileSchema) {
-      schema = lastestSchema
-    }
-    if (internalSchemaOpt.isDefined) {
-      // Apply schema evolution.
-      val mergedSparkSchema = if (!reconcileSchema) {
-        AvroConversionUtils.convertAvroSchemaToStructType(AvroSchemaEvolutionUtils.canonicalizeColumnNullability(schema, lastestSchema))
-      } else {
-        // Auto merge write schema and read schema.
-        val mergedInternalSchema = AvroSchemaEvolutionUtils.reconcileSchema(schema, internalSchemaOpt.get)
-        AvroConversionUtils.convertAvroSchemaToStructType(AvroInternalSchemaConverter.convert(mergedInternalSchema, lastestSchema.getName))
-      }
+
+    val writerSchema: Schema =
+      if (reconcileSchema) {
+        // In case we need to reconcile the schema and schema evolution is enabled,
+        // we will force-apply schema evolution to the writer's schema
+        if (schemaEvolutionEnabled && internalSchemaOpt.isEmpty) {
+          internalSchemaOpt = Some(AvroInternalSchemaConverter.convert(sourceSchema))
+        }
+
+        if (internalSchemaOpt.isDefined) {
+          // Apply schema evolution, by auto-merging write schema and read schema
+          val mergedInternalSchema = AvroSchemaEvolutionUtils.reconcileSchema(sourceSchema, internalSchemaOpt.get)
+          AvroInternalSchemaConverter.convert(mergedInternalSchema, latestTableSchema.getName)
+        } else if (TableSchemaResolver.isSchemaCompatible(sourceSchema, latestTableSchema)) {
+          // In case schema reconciliation is enabled and source and latest table schemas
+          // are compatible (as defined by [[TableSchemaResolver#isSchemaCompatible]], then we will
+          // pick latest table's schema as the writer's schema
+          latestTableSchema
+        } else {
+          // Otherwise fallback to original source's schema
+          sourceSchema
+        }
+      } else {
+        // In case reconciliation is disabled, we still have to do nullability attributes
+        // (minor) reconciliation, making sure schema of the incoming batch is in-line with
+        // the data already committed in the table
+        AvroSchemaEvolutionUtils.canonicalizeColumnNullability(sourceSchema, latestTableSchema)
+      }
```
Review thread on the `canonicalizeColumnNullability` line above:

xiarixiaoyao: @alexeykudinkin: prior to this patch, when …

alexeykudinkin: That's a good call out @xiarixiaoyao. Do you think it still might be an issue even if all our tests are green (for Spark 2.4)?

Reviewer: @alexeykudinkin, do we have a UT to ensure the namespace stays consistent?

alexeykudinkin: What I meant is that all of our Spark writing is proxied through this code, so if there were issues due to namespacing in Avro 1.8 (Spark 2.4), we would learn about them pretty fast.

Reviewer: OK, that sounds fair. I'm more curious about what different namespaces we might get. By right, Hudi should dominate the namespace with some constant like …

alexeykudinkin: We might get an auto-gen'd namespace when we convert the incoming …

Reviewer: @alexeykudinkin …
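To make the namespace concern above concrete, here is an illustration (not part of the PR) using plain Avro: two record schemas that are field-for-field identical but carry different namespaces have different full names and do not compare equal, which is the kind of drift that can matter under Avro 1.8 / Spark 2.4. The namespaces below are made up for the example:

```scala
import org.apache.avro.SchemaBuilder

// Two structurally identical record schemas with different (hypothetical) namespaces
val tableSchema = SchemaBuilder.record("trip").namespace("hoodie.source")
  .fields().requiredString("uuid").endRecord()
val incomingSchema = SchemaBuilder.record("trip").namespace("spark.autogen")
  .fields().requiredString("uuid").endRecord()

// Same fields, different fullName -- the schemas are not equal, and older
// Avro versions are stricter about matching record names during resolution
assert(tableSchema.getFullName != incomingSchema.getFullName) // hoodie.source.trip vs spark.autogen.trip
assert(tableSchema != incomingSchema)
```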
The hunk continues below the thread:

```diff
-      schema = AvroConversionUtils.convertStructTypeToAvroSchema(mergedSparkSchema, structName, nameSpace)
-    }
-
-    if (reconcileSchema && internalSchemaOpt.isEmpty) {
-      schema = lastestSchema
-    }
-    validateSchemaForHoodieIsDeleted(schema)
-    sparkContext.getConf.registerAvroSchemas(schema)
-    log.info(s"Registered avro schema : ${schema.toString(true)}")
+
+    validateSchemaForHoodieIsDeleted(writerSchema)
+    sparkContext.getConf.registerAvroSchemas(writerSchema)
+    log.info(s"Registered avro schema : ${writerSchema.toString(true)}")
 
     // Convert to RDD[HoodieRecord]
     val genericRecords: RDD[GenericRecord] = HoodieSparkUtils.createRdd(df, structName, nameSpace, reconcileSchema,
-      org.apache.hudi.common.util.Option.of(schema))
+      org.apache.hudi.common.util.Option.of(writerSchema))
     val shouldCombine = parameters(INSERT_DROP_DUPS.key()).toBoolean ||
       operation.equals(WriteOperationType.UPSERT) ||
       parameters.getOrElse(HoodieWriteConfig.COMBINE_BEFORE_INSERT.key(),
```
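For context, the two flags the `writerSchema` branch keys off of are user-facing write options. A hypothetical usage sketch follows; the string keys are my best recollection of `DataSourceWriteOptions.RECONCILE_SCHEMA` and `DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED` and should be verified against the Hudi release in use:

```scala
import org.apache.spark.sql.{DataFrame, SaveMode}

def writeWithReconciliation(df: DataFrame, basePath: String): Unit = {
  df.write.format("hudi")
    .option("hoodie.table.name", "trips") // hypothetical table name
    // reconcileSchema == true branch in the hunk above
    .option("hoodie.datasource.write.reconcile.schema", "true")
    // schemaEvolutionEnabled == true forces internalSchemaOpt to be populated
    .option("hoodie.schema.on.read.enable", "true")
    .mode(SaveMode.Append)
    .save(basePath)
}
```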
```diff
@@ -295,10 +304,10 @@ object HoodieSparkSqlWriter {
             hoodieRecord
           }).toJavaRDD()
 
-          val writeSchema = if (dropPartitionColumns) generateSchemaWithoutPartitionColumns(partitionColumns, schema) else schema
+          val writerDataSchema = if (dropPartitionColumns) generateSchemaWithoutPartitionColumns(partitionColumns, writerSchema) else writerSchema
           // Create a HoodieWriteClient & issue the write.
 
-          val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, writeSchema.toString, path,
+          val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, writerDataSchema.toString, path,
             tblName, mapAsJavaMap(addSchemaEvolutionParameters(parameters, internalSchemaOpt) - HoodieWriteConfig.AUTO_COMMIT_ENABLE.key)
           )).asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]]
```
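`generateSchemaWithoutPartitionColumns` is a Hudi helper; as a rough sketch of what dropping partition columns from an Avro record schema involves (the function name below is hypothetical, it assumes top-level non-nested partition columns and the Avro 1.9+ `Field` constructor taking an `Object` default):

```scala
import org.apache.avro.Schema
import scala.collection.JavaConverters._

// Sketch only: rebuild the record, keeping every field that is not a partition column.
// The real Hudi helper may handle nested columns and other edge cases differently.
def schemaWithoutPartitionColumns(partitionColumns: Seq[String], schema: Schema): Schema = {
  val keptFields = schema.getFields.asScala
    .filterNot(f => partitionColumns.contains(f.name))
    // Field objects can't be reused across schemas, so copy each one
    .map(f => new Schema.Field(f.name, f.schema, f.doc, f.defaultVal()))
    .asJava
  Schema.createRecord(schema.getName, schema.getDoc, schema.getNamespace, schema.isError, keptFields)
}
```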
```diff
@@ -388,14 +397,18 @@ object HoodieSparkSqlWriter {
    * @param schema incoming record's schema.
    * @return Pair of(boolean, table schema), where first entry will be true only if schema conversion is required.
    */
-  def getLatestTableSchema(fs: FileSystem, basePath: Path, sparkContext: SparkContext, schema: Schema): Schema = {
-    var latestSchema: Schema = schema
+  def getLatestTableSchema(fs: FileSystem, basePath: Path, sparkContext: SparkContext): Option[Schema] = {
     if (FSUtils.isTableExists(basePath.toString, fs)) {
-      val tableMetaClient = HoodieTableMetaClient.builder.setConf(sparkContext.hadoopConfiguration).setBasePath(basePath.toString).build()
+      val tableMetaClient = HoodieTableMetaClient.builder
+        .setConf(sparkContext.hadoopConfiguration)
+        .setBasePath(basePath.toString)
+        .build()
       val tableSchemaResolver = new TableSchemaResolver(tableMetaClient)
-      latestSchema = tableSchemaResolver.getLatestSchema(schema, false, null)
+      toScalaOption(tableSchemaResolver.getTableAvroSchemaFromLatestCommit(false))
+    } else {
+      None
     }
-    latestSchema
   }
 
   def registerKryoClassesAndGetGenericRecords(tblName: String, sparkContext: SparkContext, df: Dataset[Row],
```
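The payoff of returning `Option[Schema]` is that the fallback choice moves to the call site. A sketch of consuming the refactored helper, using the surrounding method's `fs`, `basePath`, `sparkContext`, and `sourceSchema` (it mirrors the call site earlier in this diff):

```scala
// The caller, not the helper, decides the fallback when the table does not
// exist yet or has no committed schema
val latestTableSchema: Schema =
  getLatestTableSchema(fs, basePath, sparkContext).getOrElse(sourceSchema)

// Equivalent, if the caller wants to surface the "no table schema" case instead
val maybeTableSchema: Option[Schema] = getLatestTableSchema(fs, basePath, sparkContext)
```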
Review thread on this hunk:

Reviewer: So essentially, this schema reconciliation is not handled well in the schema evolution code path? Maybe you can file a tracking ticket.

Author: There's one linked on top of this code section (see the TODO(HUDI-4472) comment above).