New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-37273][SQL] Support hidden file metadata columns in Spark SQL #34575
Changes from 3 commits
dee06f6
06ac79e
fc043fd
170378b
73593c5
c531300
bd28eb7
e872d1f
60bdbc5
f78fe92
2baccdb
8b8b9fa
d984b50
0f6eccd
f780bf2
a0a538c
3516e4e
00bda90
afa0a83
65e79ab
3b3d635
4400f6a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -438,3 +438,70 @@ object VirtualColumn { | |
val groupingIdName: String = "spark_grouping_id" | ||
val groupingIdAttribute: UnresolvedAttribute = UnresolvedAttribute(groupingIdName) | ||
} | ||
|
||
/**
 * The internal representation of a hidden file metadata column.
 *
 * It extends [[AttributeReference]] so the analyzer can resolve it like any regular
 * attribute, while its concrete class marks it as a metadata column. Every `withXxx`
 * copy method is overridden to return a [[MetadataAttribute]] (instead of widening to a
 * plain [[AttributeReference]]) so that the metadata-column identity survives attribute
 * rewrites performed during analysis and optimization.
 *
 * NOTE(review): consider not extending AttributeReference directly — e.g. introduce
 * something like an AttributeReferenceBase trait instead.
 */
class MetadataAttribute(
    override val name: String,
    override val dataType: DataType,
    override val nullable: Boolean = true,
    override val metadata: Metadata = Metadata.empty)(
    override val exprId: ExprId = NamedExpression.newExprId,
    override val qualifier: Seq[String] = Seq.empty[String])
  extends AttributeReference(name, dataType, nullable, metadata)(exprId, qualifier) {

  // Used to resolve metadata column references written with a different casing.
  override def withName(newName: String): MetadataAttribute =
    if (name == newName) this
    else MetadataAttribute(newName, dataType, nullable, metadata)(exprId, qualifier)

  override def withNullability(newNullability: Boolean): MetadataAttribute =
    if (nullable == newNullability) this
    else MetadataAttribute(name, dataType, newNullability, metadata)(exprId, qualifier)

  override def withQualifier(newQualifier: Seq[String]): MetadataAttribute =
    if (qualifier == newQualifier) this
    else MetadataAttribute(name, dataType, nullable, metadata)(exprId, newQualifier)

  override def withExprId(newExprId: ExprId): MetadataAttribute =
    if (exprId == newExprId) this
    else MetadataAttribute(name, dataType, nullable, metadata)(newExprId, qualifier)

  override def withDataType(newType: DataType): MetadataAttribute =
    MetadataAttribute(name, newType, nullable, metadata)(exprId, qualifier)

  // Unlike the withXxx methods above, always returns a fresh copy (no identity check),
  // mirroring AttributeReference.newInstance semantics.
  override def newInstance(): MetadataAttribute =
    MetadataAttribute(name, dataType, nullable, metadata)(exprId, qualifier)

  override def withMetadata(newMetadata: Metadata): MetadataAttribute =
    MetadataAttribute(name, dataType, nullable, newMetadata)(exprId, qualifier)
}
|
||
/** Factory methods for [[MetadataAttribute]]. */
object MetadataAttribute {

  /** Creates a nullable metadata attribute with empty metadata and a fresh expression id. */
  def apply(name: String, dataType: DataType): MetadataAttribute =
    new MetadataAttribute(name, dataType, true)()

  /** Creates a fully-specified metadata attribute; used by the `withXxx` copy methods. */
  def apply(name: String, dataType: DataType, nullable: Boolean, metadata: Metadata)
      (exprId: ExprId, qualifier: Seq[String]): MetadataAttribute =
    new MetadataAttribute(name, dataType, nullable, metadata)(exprId, qualifier)
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -276,3 +276,10 @@ object LogicalPlanIntegrity { | |
checkIfSameExprIdNotReused(plan) && hasUniqueExprIdsForOutput(plan) | ||
} | ||
} | ||
|
||
/**
 * A logical plan node that can generate (expose) hidden metadata columns, e.g. the
 * file metadata columns of a file-based relation, in addition to its regular output.
 */
trait ExposesMetadataColumns extends LogicalPlan {

  /**
   * Returns a copy of this node whose output includes the metadata columns.
   *
   * NOTE(review): reconsider whether this needs to return `ExposesMetadataColumns`
   * specifically rather than a plain `LogicalPlan`.
   */
  def withMetadataColumns(): ExposesMetadataColumns
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,6 +35,7 @@ import org.apache.spark.sql.execution.datasources._ | |
import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat => ParquetSource} | ||
import org.apache.spark.sql.execution.datasources.v2.PushedDownOperators | ||
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} | ||
import org.apache.spark.sql.execution.vectorized.{OffHeapColumnVector, OnHeapColumnVector} | ||
import org.apache.spark.sql.internal.SQLConf | ||
import org.apache.spark.sql.sources.{BaseRelation, Filter} | ||
import org.apache.spark.sql.types.StructType | ||
|
@@ -194,10 +195,17 @@ case class FileSourceScanExec( | |
disableBucketedScan: Boolean = false) | ||
extends DataSourceScanExec { | ||
|
||
lazy val outputMetadataStruct: Option[MetadataAttribute] = | ||
output.collectFirst { case meta: MetadataAttribute => meta } | ||
|
||
// Note that some vals referring the file-based relation are lazy intentionally | ||
// so that this plan can be canonicalized on executor side too. See SPARK-23731. | ||
override lazy val supportsColumnar: Boolean = { | ||
relation.fileFormat.supportBatch(relation.sparkSession, schema) | ||
// schema without file metadata column | ||
val fileSchema = if (outputMetadataStruct.isEmpty) schema else { | ||
StructType.fromAttributes(output.filterNot(_.isInstanceOf[MetadataAttribute])) | ||
} | ||
relation.fileFormat.supportBatch(relation.sparkSession, fileSchema) | ||
} | ||
|
||
private lazy val needsUnsafeRowConversion: Boolean = { | ||
|
@@ -212,7 +220,16 @@ case class FileSourceScanExec( | |
relation.fileFormat.vectorTypes( | ||
requiredSchema = requiredSchema, | ||
partitionSchema = relation.partitionSchema, | ||
relation.sparkSession.sessionState.conf) | ||
relation.sparkSession.sessionState.conf).map { vectorTypes => | ||
val metadataVectorClz = | ||
if (relation.sparkSession.sessionState.conf.offHeapColumnVectorEnabled) { | ||
classOf[OffHeapColumnVector].getName | ||
} else { | ||
classOf[OnHeapColumnVector].getName | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. since we will change to use a constant vector soon, how about we always use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good idea! thanks! |
||
} | ||
// for column-based file format, append metadata columns' vector type classes if any | ||
vectorTypes ++ (if (outputMetadataStruct.isDefined) Seq(metadataVectorClz) else Seq.empty) | ||
} | ||
|
||
private lazy val driverMetrics: HashMap[String, Long] = HashMap.empty | ||
|
||
|
@@ -355,7 +372,9 @@ case class FileSourceScanExec( | |
@transient | ||
private lazy val pushedDownFilters = { | ||
val supportNestedPredicatePushdown = DataSourceUtils.supportNestedPredicatePushdown(relation) | ||
dataFilters.flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown)) | ||
dataFilters | ||
.filterNot(_.references.exists(_.isInstanceOf[MetadataAttribute])) | ||
.flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown)) | ||
} | ||
|
||
override lazy val metadata: Map[String, String] = { | ||
|
@@ -597,7 +616,8 @@ case class FileSourceScanExec( | |
} | ||
} | ||
|
||
new FileScanRDD(fsRelation.sparkSession, readFile, filePartitions) | ||
new FileScanRDD(fsRelation.sparkSession, readFile, filePartitions, | ||
requiredSchema, outputMetadataStruct) | ||
} | ||
|
||
/** | ||
|
@@ -653,7 +673,8 @@ case class FileSourceScanExec( | |
val partitions = | ||
FilePartition.getFilePartitions(relation.sparkSession, splitFiles, maxSplitBytes) | ||
|
||
new FileScanRDD(fsRelation.sparkSession, readFile, partitions) | ||
new FileScanRDD(fsRelation.sparkSession, readFile, partitions, | ||
requiredSchema, outputMetadataStruct) | ||
} | ||
|
||
// Filters unused DynamicPruningExpression expressions - one which has been replaced | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjectio | |
import org.apache.spark.sql.errors.QueryExecutionErrors | ||
import org.apache.spark.sql.internal.SQLConf | ||
import org.apache.spark.sql.sources.Filter | ||
import org.apache.spark.sql.types.{DataType, StructType} | ||
import org.apache.spark.sql.types.{DataType, LongType, StringType, StructField, StructType} | ||
|
||
|
||
/** | ||
|
@@ -171,6 +171,25 @@ trait FileFormat { | |
def supportFieldName(name: String): Boolean = true | ||
} | ||
|
||
/** Constants describing the hidden `_metadata` column exposed by file-based sources. */
object FileFormat {

  // Field names of the hidden file-metadata struct.
  val FILE_PATH = "file_path"

  // NOTE(review): consider deprecating the existing `input_file_name()` expression in
  // favor of `_metadata.file_name`, since the expression does not work for DSv2 sources.
  val FILE_NAME = "file_name"

  val FILE_SIZE = "file_size"

  val FILE_MODIFICATION_TIME = "file_modification_time"

  // The supported metadata struct column (`_metadata`) for hadoop fs relations.
  val FILE_METADATA_COLUMNS: MetadataAttribute = MetadataAttribute("_metadata",
    new StructType()
      .add(StructField(FILE_PATH, StringType))
      .add(StructField(FILE_NAME, StringType))
      .add(StructField(FILE_SIZE, LongType))
      .add(StructField(FILE_MODIFICATION_TIME, LongType)))
}
|
||
/** | ||
* The base class file format that is based on text file. | ||
*/ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will think about this new class. Maybe have something like AttributeReferenceBase trait.