Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions .asf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,14 @@ github:
- test-common-and-other-modules (scala-2.12, spark3.5, flink1.18)
- test-hudi-hadoop-mr-and-hudi-java-client (scala-2.12, spark3.5, flink1.20)
- test-hudi-trino-plugin
- test-spark-java-tests-part1 (scala-2.12, spark3.3, hudi-spark-datasource/hudi-spark3.3.x)
- test-spark-java-tests-part1 (scala-2.12, spark3.4, hudi-spark-datasource/hudi-spark3.4.x)
- test-spark-java-tests-part1 (scala-2.12, spark3.5, hudi-spark-datasource/hudi-spark3.5.x)
- test-spark-java-tests-part2 (scala-2.12, spark3.3, hudi-spark-datasource/hudi-spark3.3.x)
- test-spark-java-tests-part2 (scala-2.12, spark3.4, hudi-spark-datasource/hudi-spark3.4.x)
- test-spark-java-tests-part2 (scala-2.12, spark3.5, hudi-spark-datasource/hudi-spark3.5.x)
- test-spark-java-tests-part3 (scala-2.12, spark3.3, hudi-spark-datasource/hudi-spark3.3.x)
- test-spark-java-tests-part3 (scala-2.12, spark3.4, hudi-spark-datasource/hudi-spark3.4.x)
- test-spark-java-tests-part3 (scala-2.12, spark3.5, hudi-spark-datasource/hudi-spark3.5.x)
- test-spark-scala-dml-tests (scala-2.12, spark3.3, hudi-spark-datasource/hudi-spark3.3.x)
- test-spark-scala-dml-tests (scala-2.12, spark3.4, hudi-spark-datasource/hudi-spark3.4.x)
- test-spark-scala-dml-tests (scala-2.12, spark3.5, hudi-spark-datasource/hudi-spark3.5.x)
- test-spark-scala-other-tests (scala-2.12, spark3.3, hudi-spark-datasource/hudi-spark3.3.x)
- test-spark-scala-other-tests (scala-2.12, spark3.4, hudi-spark-datasource/hudi-spark3.4.x)
- test-spark-scala-other-tests (scala-2.12, spark3.5, hudi-spark-datasource/hudi-spark3.5.x)
- test-spark-java17-java-tests-part1 (scala-2.13, spark3.5, hudi-spark-datasource/hudi-spark3.5.x)
Expand All @@ -86,11 +81,10 @@ github:
- test-flink-1 (flink2.0, 1.11.4, 1.14.4)
- test-flink-1 (flink2.1, 1.11.4, 1.15.2)
- test-flink-2 (flink1.20, 1.11.4, 1.13.1)
- build-spark-java17 (scala-2.12, spark3.3, hudi-spark-datasource/hudi-spark3.3.x)
- build-spark-java17 (scala-2.12, spark3.4, hudi-spark-datasource/hudi-spark3.4.x)
- build-spark-java17 (scala-2.12, spark3.5, hudi-spark-datasource/hudi-spark3.5.x)
- build-flink-java17 (scala-2.12, flink1.20, 1.11.4, 1.13.1)
- validate-bundles (scala-2.12, flink1.17, 1.11.4, 1.12.3, spark3.3, spark3.3.4)
- validate-bundles (scala-2.12, flink1.17, 1.11.4, 1.12.3, spark3.5, spark3.5.1)
- validate-bundles (scala-2.12, flink1.18, 1.11.4, 1.13.1, spark3.4, spark3.4.3)
- validate-bundles (scala-2.13, flink1.19, 1.11.4, 1.13.1, spark3.5, spark3.5.1)
- validate-bundles (scala-2.13, flink1.20, 1.11.4, 1.13.1, spark3.5, spark3.5.1)
Expand Down
28 changes: 2 additions & 26 deletions .github/workflows/bot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,6 @@ jobs:
strategy:
matrix:
include:
- scalaProfile: "scala-2.12"
sparkProfile: "spark3.3"
sparkModules: "hudi-spark-datasource/hudi-spark3.3.x"

- scalaProfile: "scala-2.12"
sparkProfile: "spark3.4"
sparkModules: "hudi-spark-datasource/hudi-spark3.4.x"
Expand Down Expand Up @@ -318,10 +314,6 @@ jobs:
strategy:
matrix:
include:
- scalaProfile: "scala-2.12"
sparkProfile: "spark3.3"
sparkModules: "hudi-spark-datasource/hudi-spark3.3.x"

- scalaProfile: "scala-2.12"
sparkProfile: "spark3.4"
sparkModules: "hudi-spark-datasource/hudi-spark3.4.x"
Expand Down Expand Up @@ -383,10 +375,6 @@ jobs:
strategy:
matrix:
include:
- scalaProfile: "scala-2.12"
sparkProfile: "spark3.3"
sparkModules: "hudi-spark-datasource/hudi-spark3.3.x"

- scalaProfile: "scala-2.12"
sparkProfile: "spark3.4"
sparkModules: "hudi-spark-datasource/hudi-spark3.4.x"
Expand Down Expand Up @@ -442,10 +430,6 @@ jobs:
strategy:
matrix:
include:
- scalaProfile: "scala-2.12"
sparkProfile: "spark3.3"
sparkModules: "hudi-spark-datasource/hudi-spark3.3.x"

- scalaProfile: "scala-2.12"
sparkProfile: "spark3.4"
sparkModules: "hudi-spark-datasource/hudi-spark3.4.x"
Expand Down Expand Up @@ -501,10 +485,6 @@ jobs:
strategy:
matrix:
include:
- scalaProfile: "scala-2.12"
sparkProfile: "spark3.3"
sparkModules: "hudi-spark-datasource/hudi-spark3.3.x"

- scalaProfile: "scala-2.12"
sparkProfile: "spark3.4"
sparkModules: "hudi-spark-datasource/hudi-spark3.4.x"
Expand Down Expand Up @@ -1048,8 +1028,8 @@ jobs:
flinkProfile: 'flink1.17'
flinkAvroVersion: '1.11.4'
flinkParquetVersion: '1.12.3'
sparkProfile: 'spark3.3'
sparkRuntime: 'spark3.3.4'
sparkProfile: 'spark3.5'
sparkRuntime: 'spark3.5.1'

steps:
- uses: actions/checkout@v5
Expand Down Expand Up @@ -1256,10 +1236,6 @@ jobs:
strategy:
matrix:
include:
- scalaProfile: "scala-2.12"
sparkProfile: "spark3.3"
sparkModules: "hudi-spark-datasource/hudi-spark3.3.x"

- scalaProfile: "scala-2.12"
sparkProfile: "spark3.4"
sparkModules: "hudi-spark-datasource/hudi-spark3.4.x"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/maven_artifact_validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ jobs:
sparkRuntime: 'spark3.4.3'
- scalaProfile: 'scala-2.12'
flinkProfile: 'flink1.17'
sparkProfile: 'spark3.3'
sparkRuntime: 'spark3.3.4'
Comment thread
yihua marked this conversation as resolved.
sparkProfile: 'spark3.5'
sparkRuntime: 'spark3.5.1'
steps:
- uses: actions/checkout@v5
- name: Set up JDK 11
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/release_candidate_validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ jobs:
sparkRuntime: 'spark3.4.3'
- scalaProfile: 'scala-2.12'
flinkProfile: 'flink1.17'
sparkProfile: 'spark3.3'
sparkRuntime: 'spark3.3.4'
Comment thread
yihua marked this conversation as resolved.
sparkProfile: 'spark3.5'
sparkRuntime: 'spark3.5.1'
steps:
- uses: actions/checkout@v5
- name: Set up JDK 11
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ Refer to the table below for building with different Spark and Scala versions.
| Maven build options | Expected Spark bundle jar name | Notes |
|:--------------------------|:---------------------------------------------|:-------------------------------------------------|
| (empty) | hudi-spark3.5-bundle_2.12 | For Spark 3.5.x and Scala 2.12 (default options) |
| `-Dspark3.3` | hudi-spark3.3-bundle_2.12 | For Spark 3.3.2+ and Scala 2.12 |
| `-Dspark3.4` | hudi-spark3.4-bundle_2.12 | For Spark 3.4.x and Scala 2.12 |
| `-Dspark3.5 -Dscala-2.12` | hudi-spark3.5-bundle_2.12 | For Spark 3.5.x and Scala 2.12 (same as default) |
| `-Dspark3.5 -Dscala-2.13` | hudi-spark3.5-bundle_2.13 | For Spark 3.5.x and Scala 2.13 |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,11 @@ private[hudi] trait SparkVersionsSupport {

def isSpark3: Boolean = getSparkVersion.startsWith("3.")
def isSpark4: Boolean = getSparkVersion.startsWith("4.")
def isSpark3_3: Boolean = getSparkVersion.startsWith("3.3")
def isSpark3_4: Boolean = getSparkVersion.startsWith("3.4")
def isSpark3_5: Boolean = getSparkVersion.startsWith("3.5")
def isSpark4_0: Boolean = getSparkVersion.startsWith("4.0")
def isSpark4_1: Boolean = getSparkVersion.startsWith("4.1")

def gteqSpark3_3_2: Boolean = getSparkVersion >= "3.3.2"
def gteqSpark3_4: Boolean = getSparkVersion >= "3.4"
def gteqSpark3_5: Boolean = getSparkVersion >= "3.5"
def gteqSpark4_0: Boolean = getSparkVersion >= "4.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,8 @@ object SparkAdapterSupport {
"org.apache.spark.sql.adapter.Spark4_0Adapter"
} else if (HoodieSparkUtils.isSpark3_5) {
"org.apache.spark.sql.adapter.Spark3_5Adapter"
} else if (HoodieSparkUtils.isSpark3_4) {
"org.apache.spark.sql.adapter.Spark3_4Adapter"
} else {
"org.apache.spark.sql.adapter.Spark3_3Adapter"
"org.apache.spark.sql.adapter.Spark3_4Adapter"
}
getClass.getClassLoader.loadClass(adapterClass)
.newInstance().asInstanceOf[SparkAdapter]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def insertOverwrite(self):
parser = argparse.ArgumentParser(description="Examples of various operations to perform on Hudi with PySpark",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-t", "--table", action="store", required=True, help="the name of the table to create")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("-p", "--package", action="store", help="the name of the hudi-spark-bundle package\n eg. \"org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.0\"")
group.add_argument("-p", "--package", action="store", help="the name of the hudi-spark-bundle package\n eg. \"org.apache.hudi:hudi-spark3.5-bundle_2.12:0.12.0\"")
group.add_argument("-j", "--jar", action="store", help="the full path to hudi-spark-bundle .jar file\n eg. \"[HUDI_BASE_PATH]/packaging/hudi-spark-bundle/target/hudi-spark-bundle[VERSION].jar\"")
args = vars(parser.parse_args())
package = args["package"]
Expand Down
6 changes: 3 additions & 3 deletions hudi-examples/hudi-examples-spark/src/test/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
* See the License for the specific language governing permissions and
-->
# Requirements
Python is required to run this. Pyspark 2.4.7 does not work with the latest versions of python (python 3.8+) so if you want to use a later version (in the example below 3.3) you can build Hudi by using the command:
Python is required to run this. Pyspark 2.4.7 does not work with the latest versions of python (python 3.8+) so if you want to use a later version (in the example below 3.5) you can build Hudi by using the command:
```bash
cd $HUDI_DIR
mvn clean install -DskipTests -Dspark3.3 -Dscala2.12
mvn clean install -DskipTests -Dspark3.5 -Dscala2.12
```
Various python packages may also need to be installed so you should get pip and then use **pip install \<package name\>** to get them
# How to Run
Expand All @@ -32,7 +32,7 @@ export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH
export PYTHONPATH=$SPARK_HOME/python/lib/*.zip:$PYTHONPATH
```
4. Identify the Hudi Spark Bundle .jar or package that you wish to use:
A package will be in the format **org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.0**
A package will be in the format **org.apache.hudi:hudi-spark3.5-bundle_2.12:0.12.0**
A jar will be in the format **\[HUDI_BASE_PATH\]/packaging/hudi-spark-bundle/target/hudi-spark-bundle\[VERSION\].jar**
5. Go to the hudi directory and run the quickstart examples using the commands below, using the -t flag for the table name and the -p flag or -j flag for your package or jar respectively.
```bash
Expand Down
4 changes: 1 addition & 3 deletions hudi-spark-datasource/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ The modules are organized in a layered architecture to maximize code reuse acros
| `hudi-spark-common` | Core Spark integration code shared across all Spark versions. Contains DataSource V1/V2 implementations, file indexing, SQL writers, and incremental read support. |
| `hudi-spark3-common` | Code shared across Spark 3.x versions. Contains Spark 3 adapter interface, DML commands, and partition mapping. |
| `hudi-spark4-common` | Code shared across Spark 4.x versions. Contains Spark 4 adapter interface and 4.x-specific implementations. |
| `hudi-spark3.3.x` | Spark 3.3.x-specific adapter implementation with version-specific SQL parser and file readers. |
| `hudi-spark3.4.x` | Spark 3.4.x-specific adapter implementation. |
| `hudi-spark3.4.x` | Spark 3.4.x-specific adapter implementation with version-specific SQL parser and file readers. |
| `hudi-spark3.5.x` | Spark 3.5.x-specific adapter implementation (default). |
| `hudi-spark4.0.x` | Spark 4.0.x-specific adapter implementation. |
| `hudi-spark4.1.x` | Spark 4.1.x-specific adapter implementation. |
Expand All @@ -42,7 +41,6 @@ The modules are organized in a layered architecture to maximize code reuse acros

| Spark Version | Module | Scala Version | Java Version | Build Profile |
|---------------|--------|---------------|--------------|---------------|
| 3.3.x | `hudi-spark3.3.x` | 2.12 | 11+ | `-Dspark3.3` |
| 3.4.x | `hudi-spark3.4.x` | 2.12 | 11+ | `-Dspark3.4` |
| 3.5.x (default) | `hudi-spark3.5.x` | 2.12, 2.13 | 11+ | `-Dspark3.5` |
| 4.0.x | `hudi-spark4.0.x` | 2.13 | 17+ | `-Dspark4.0` |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,8 @@ object HoodieAnalysis extends SparkAdapterSupport {
"org.apache.spark.sql.hudi.analysis.HoodieSpark40DataSourceV2ToV1Fallback"
} else if (HoodieSparkUtils.isSpark3_5) {
"org.apache.spark.sql.hudi.analysis.HoodieSpark35DataSourceV2ToV1Fallback"
} else if (HoodieSparkUtils.isSpark3_4) {
"org.apache.spark.sql.hudi.analysis.HoodieSpark34DataSourceV2ToV1Fallback"
} else {
// Spark 3.3.x
"org.apache.spark.sql.hudi.analysis.HoodieSpark33DataSourceV2ToV1Fallback"
"org.apache.spark.sql.hudi.analysis.HoodieSpark34DataSourceV2ToV1Fallback"
}
val dataSourceV2ToV1Fallback: RuleBuilder =
session => instantiateKlass(dataSourceV2ToV1FallbackClass, session)
Expand Down Expand Up @@ -101,12 +98,8 @@ object HoodieAnalysis extends SparkAdapterSupport {
"org.apache.spark.sql.hudi.Spark40ResolveHudiAlterTableCommand"
} else if (HoodieSparkUtils.gteqSpark3_5) {
"org.apache.spark.sql.hudi.Spark35ResolveHudiAlterTableCommand"
} else if (HoodieSparkUtils.isSpark3_4) {
"org.apache.spark.sql.hudi.Spark34ResolveHudiAlterTableCommand"
} else if (HoodieSparkUtils.isSpark3_3) {
"org.apache.spark.sql.hudi.Spark33ResolveHudiAlterTableCommand"
} else {
throw new IllegalStateException("Unsupported Spark version")
"org.apache.spark.sql.hudi.Spark34ResolveHudiAlterTableCommand"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are silently falling back to spark 34 here. I think we should still throw the exception

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I cleaned this up to use only if-else branches to be consistent across the board, since we now compile against Spark 3.5 and 3.4. Having throw new IllegalStateException("Unsupported Spark version") is redundant.

}

val resolveAlterTableCommands: RuleBuilder =
Expand Down Expand Up @@ -150,11 +143,8 @@ object HoodieAnalysis extends SparkAdapterSupport {
"org.apache.spark.sql.execution.datasources.Spark40NestedSchemaPruning"
} else if (HoodieSparkUtils.gteqSpark3_5) {
"org.apache.spark.sql.execution.datasources.Spark35NestedSchemaPruning"
} else if (HoodieSparkUtils.gteqSpark3_4) {
"org.apache.spark.sql.execution.datasources.Spark34NestedSchemaPruning"
} else {
// spark 3.3
"org.apache.spark.sql.execution.datasources.Spark33NestedSchemaPruning"
Comment thread
yihua marked this conversation as resolved.
"org.apache.spark.sql.execution.datasources.Spark34NestedSchemaPruning"
}

val nestedSchemaPruningRule = ReflectionUtils.loadClass(nestedSchemaPruningClass).asInstanceOf[Rule[LogicalPlan]]
Expand All @@ -171,12 +161,9 @@ object HoodieAnalysis extends SparkAdapterSupport {
// - Precedes actual [[customEarlyScanPushDownRules]] invocation
val pruneFileSourcePartitionsClass = if (HoodieSparkUtils.gteqSpark4_0) {
"org.apache.spark.sql.hudi.analysis.Spark4HoodiePruneFileSourcePartitions"
} else if (HoodieSparkUtils.gteqSpark3_4) {
} else {
// Spark 3.4 and 3.5: PhysicalOperation and ScanOperation unified (SPARK-39764)
"org.apache.spark.sql.hudi.analysis.Spark3HoodiePruneFileSourcePartitions"
} else {
// Spark 3.3: Use ScanOperation for better compatibility
"org.apache.spark.sql.hudi.analysis.Spark33HoodiePruneFileSourcePartitions"
}
rules += (spark => instantiateKlass(pruneFileSourcePartitionsClass, spark))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,51 +36,27 @@ import scala.collection.JavaConverters
class TestHoodieSparkUtils {

@ParameterizedTest
@ValueSource(strings = Array("3.3.0", "3.3.2", "3.4.0", "3.5.0"))
@ValueSource(strings = Array("3.4.0", "3.5.0"))
def testSparkVersionCheckers(sparkVersion: String): Unit = {
val vsMock = new SparkVersionsSupport {
override def getSparkVersion: String = sparkVersion
}

sparkVersion match {
case "3.3.0" =>
assertTrue(vsMock.isSpark3)
assertTrue(vsMock.isSpark3_3)

assertFalse(vsMock.isSpark3_4)
assertFalse(vsMock.isSpark3_5)
assertFalse(vsMock.gteqSpark3_3_2)
assertFalse(vsMock.gteqSpark3_4)
assertFalse(vsMock.gteqSpark3_5)

case "3.3.2" =>
assertTrue(vsMock.isSpark3)
assertTrue(vsMock.isSpark3_3)
assertTrue(vsMock.gteqSpark3_3_2)


assertFalse(vsMock.isSpark3_4)
assertFalse(vsMock.isSpark3_5)
assertFalse(vsMock.gteqSpark3_4)
assertFalse(vsMock.gteqSpark3_5)

case "3.4.0" =>
assertTrue(vsMock.isSpark3)
assertTrue(vsMock.isSpark3_4)
assertTrue(vsMock.gteqSpark3_3_2)
assertTrue(vsMock.gteqSpark3_4)

assertFalse(vsMock.isSpark3_3)
assertFalse(vsMock.isSpark3_5)
assertFalse(vsMock.gteqSpark3_5)

case "3.5.0" =>
assertTrue(vsMock.isSpark3)
assertTrue(vsMock.isSpark3_5)
assertTrue(vsMock.gteqSpark3_3_2)
assertTrue(vsMock.gteqSpark3_4)
assertTrue(vsMock.gteqSpark3_5)

assertFalse(vsMock.isSpark3_3)
assertFalse(vsMock.isSpark3_4)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

package org.apache.hudi.functional

import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieBaseRelation, HoodieDataSourceHelpers, HoodieFileIndex, HoodieSchemaConversionUtils, HoodieSparkUtils, QuickstartUtils, ScalaAssertionSupport}
import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieBaseRelation, HoodieDataSourceHelpers, HoodieFileIndex, HoodieSchemaConversionUtils, QuickstartUtils, ScalaAssertionSupport}
import org.apache.hudi.DataSourceWriteOptions.{INLINE_CLUSTERING_ENABLE, KEYGENERATOR_CLASS_NAME}
import org.apache.hudi.HoodieConversionUtils.toJavaOption
import org.apache.hudi.QuickstartUtils.{convertToStringList, getQuickstartWriteConfigs}
Expand Down Expand Up @@ -1845,15 +1845,11 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup
@ParameterizedTest
@CsvSource(Array("true, 6", "false, 6", "true, 8", "false, 8", "true, 9", "false, 9"))
def testLogicalTypesReadRepair(vectorizedReadEnabled: Boolean, tableVersion: Int): Unit = {
// Note: for spark 3.3 and 3.4 we should fall back to nonvectorized reader
// Note: for spark 3.4 we should fall back to nonvectorized reader
// if that is not happening then this test will fail
val prevValue = spark.conf.get("spark.sql.parquet.enableVectorizedReader")
val prevTimezone = spark.conf.get("spark.sql.session.timeZone")
val propertyValue: String = System.getProperty("spark.testing")
try {
if (HoodieSparkUtils.isSpark3_3) {
System.setProperty("spark.testing", "true")
}
spark.conf.set("spark.sql.parquet.enableVectorizedReader", vectorizedReadEnabled.toString)
spark.conf.set("spark.sql.session.timeZone", "UTC")
val tableName = "trips_logical_types_json_cow_read_v" + tableVersion
Expand Down Expand Up @@ -1903,13 +1899,6 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup
} finally {
spark.conf.set("spark.sql.parquet.enableVectorizedReader", prevValue)
spark.conf.set("spark.sql.session.timeZone", prevTimezone)
if (HoodieSparkUtils.isSpark3_3) {
if (propertyValue == null) {
System.clearProperty("spark.testing")
} else {
System.setProperty("spark.testing", propertyValue)
}
}
}
}

Expand Down
Loading
Loading