Commit eab8998

Merge branch 'apache:master' into add-compaction-metrics

amrishlal committed Jun 20, 2023
2 parents 155e496 + 6ee6a52
Showing 677 changed files with 33,242 additions and 6,442 deletions.
2 changes: 1 addition & 1 deletion .codecov.yml
@@ -25,7 +25,7 @@ coverage:
  range: "50...100"
  status:
    project:             # settings affecting project coverage
-      enabled: no
+      enabled: yes

    # do not run coverage on patch nor changes
    patch: no
132 changes: 125 additions & 7 deletions .github/workflows/bot.yml
@@ -22,7 +22,7 @@ on:
      - master
      - 'release-*'
env:
-  MVN_ARGS: -e -ntp -B -V -Pwarn-log -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn
+  MVN_ARGS: -e -ntp -B -V -Dgpg.skip -Djacoco.skip -Pwarn-log -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn
  SPARK_COMMON_MODULES: hudi-spark-datasource/hudi-spark,hudi-spark-datasource/hudi-spark-common

jobs:
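The `MVN_ARGS` change adds `-Dgpg.skip` and `-Djacoco.skip`, so every CI invocation now skips GPG signing and JaCoCo coverage instrumentation. A minimal sketch of reproducing the CI build locally with the same flags, assuming the `scala-2.12`/`spark3.2` profile combination:

```
# Sketch: a local build with the flags CI passes via $MVN_ARGS (profile choice assumed)
mvn clean install -Dscala-2.12 -Dspark3.2 -DskipTests=true \
  -e -ntp -B -V -Dgpg.skip -Djacoco.skip -Pwarn-log \
  -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn \
  -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn
```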
@@ -56,8 +56,8 @@ jobs:
          sparkModules: "hudi-spark-datasource/hudi-spark2"

        - scalaProfile: "scala-2.12"
-          sparkProfile: "spark2.4"
-          sparkModules: "hudi-spark-datasource/hudi-spark2"
+          sparkProfile: "spark3.0"
+          sparkModules: "hudi-spark-datasource/hudi-spark3.0.x"

        - scalaProfile: "scala-2.12"
          sparkProfile: "spark3.1"
@@ -71,6 +71,10 @@
          sparkProfile: "spark3.3"
          sparkModules: "hudi-spark-datasource/hudi-spark3.3.x"

+        - scalaProfile: "scala-2.12"
+          sparkProfile: "spark3.4"
+          sparkModules: "hudi-spark-datasource/hudi-spark3.4.x"

steps:
- uses: actions/checkout@v2
- name: Set up JDK 8
@@ -84,7 +88,7 @@ jobs:
        SCALA_PROFILE: ${{ matrix.scalaProfile }}
        SPARK_PROFILE: ${{ matrix.sparkProfile }}
      run:
-        mvn clean install -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl hudi-examples/hudi-examples-spark,hudi-spark-datasource/hudi-spark -am -DskipTests=true $MVN_ARGS
+        mvn clean install -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DskipTests=true $MVN_ARGS
- name: Quickstart Test
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
@@ -96,15 +100,15 @@
        SCALA_PROFILE: ${{ matrix.scalaProfile }}
        SPARK_PROFILE: ${{ matrix.sparkProfile }}
        SPARK_MODULES: ${{ matrix.sparkModules }}
-      if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
+      if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI
      run:
        mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS
    - name: FT - Spark
      env:
        SCALA_PROFILE: ${{ matrix.scalaProfile }}
        SPARK_PROFILE: ${{ matrix.sparkProfile }}
        SPARK_MODULES: ${{ matrix.sparkModules }}
-      if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
+      if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI
      run:
        mvn test -Pfunctional-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS
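Two effects of this hunk: the build step drops `-pl ... -am`, so CI now installs every module rather than only the Spark datasource subtree, and the UT/FT skip condition flips from Spark 2.4 to Spark 3.2, which Azure CI covers instead. As a sketch, the unit-test command above expands roughly as follows for the new `spark3.4` matrix entry (module lists taken from `SPARK_COMMON_MODULES` and the matrix; the expansion itself is an assumption):

```
# Sketch: expanded UT invocation for the spark3.4 matrix entry
mvn test -Punit-tests -Dscala-2.12 -Dspark3.4 \
  -pl "hudi-common,hudi-spark-datasource/hudi-spark,hudi-spark-datasource/hudi-spark-common,hudi-spark-datasource/hudi-spark3.4.x" $MVN_ARGS
```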

@@ -138,12 +142,23 @@ jobs:
          FLINK_PROFILE: ${{ matrix.flinkProfile }}
        run:
          mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$FLINK_PROFILE" -pl hudi-examples/hudi-examples-flink $MVN_ARGS
+      - name: Integration Test
+        env:
+          SCALA_PROFILE: 'scala-2.12'
+          FLINK_PROFILE: ${{ matrix.flinkProfile }}
+        if: ${{ endsWith(env.FLINK_PROFILE, '1.17') }}
+        run: |
+          mvn clean install -Pintegration-tests -D"$SCALA_PROFILE" -D"$FLINK_PROFILE" -pl hudi-flink-datasource/hudi-flink -am -Davro.version=1.10.0 -DskipTests=true $MVN_ARGS
+          mvn verify -Pintegration-tests -D"$SCALA_PROFILE" -D"$FLINK_PROFILE" -pl hudi-flink-datasource/hudi-flink $MVN_ARGS
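The new Flink integration test runs in two phases: an `install` that builds `hudi-flink` and its dependencies with Avro pinned to 1.10.0 and tests skipped, then a `verify` that executes the integration tests proper. A local sketch, with the `flink1.17` profile the `if:` guard selects:

```
# Sketch: run the Flink integration tests locally as the workflow does
mvn clean install -Pintegration-tests -Dscala-2.12 -Dflink1.17 \
  -pl hudi-flink-datasource/hudi-flink -am -Davro.version=1.10.0 -DskipTests=true
mvn verify -Pintegration-tests -Dscala-2.12 -Dflink1.17 -pl hudi-flink-datasource/hudi-flink
```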
  validate-bundles:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
+          - flinkProfile: 'flink1.17'
+            sparkProfile: 'spark3.4'
+            sparkRuntime: 'spark3.4.0'
          - flinkProfile: 'flink1.17'
            sparkProfile: 'spark3.3'
            sparkRuntime: 'spark3.3.2'
@@ -159,6 +174,9 @@
          - flinkProfile: 'flink1.13'
            sparkProfile: 'spark3.1'
            sparkRuntime: 'spark3.1.3'
+          - flinkProfile: 'flink1.14'
+            sparkProfile: 'spark3.0'
+            sparkRuntime: 'spark3.0.2'
steps:
- uses: actions/checkout@v2
- name: Set up JDK 8
@@ -199,7 +217,107 @@ jobs:
          SPARK_PROFILE: ${{ matrix.sparkProfile }}
          SPARK_RUNTIME: ${{ matrix.sparkRuntime }}
          SCALA_PROFILE: 'scala-2.12'
-        if: ${{ endsWith(env.SPARK_PROFILE, '3.3') }} # Only Spark 3.3 supports Java 17 as of now
+        if: ${{ env.SPARK_PROFILE >= 'spark3.3' }} # Only Spark 3.3 and above support Java 17
        run: |
          HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
          ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk17
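`mvn help:evaluate -q -DforceStdout` prints only the resolved expression, which is what lets the step capture the project version without log noise. The same resolution locally (output value illustrative):

```
# Sketch: resolve the project version the way the workflow does
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
echo "$HUDI_VERSION"   # e.g. 0.14.0-SNAPSHOT (illustrative)
./packaging/bundle-validation/ci_run.sh "$HUDI_VERSION" openjdk17
```

One caveat on the new guard: GitHub Actions expressions are not documented to order strings lexicographically, so whether `env.SPARK_PROFILE >= 'spark3.3'` actually matches `spark3.4` as the comment intends is worth verifying.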
+  validate-release-candidate-bundles:
+    if: false
+    runs-on: ubuntu-latest
+    env:
+      HUDI_VERSION: 0.13.1-rcx
+      STAGING_REPO_NUM: 1123
+    strategy:
+      matrix:
+        include:
+          - flinkProfile: 'flink1.16'
+            sparkProfile: 'spark3'
+            sparkRuntime: 'spark3.3.2'
+          - flinkProfile: 'flink1.15'
+            sparkProfile: 'spark3.3'
+            sparkRuntime: 'spark3.3.1'
+          - flinkProfile: 'flink1.14'
+            sparkProfile: 'spark3.2'
+            sparkRuntime: 'spark3.2.3'
+          - flinkProfile: 'flink1.13'
+            sparkProfile: 'spark3.1'
+            sparkRuntime: 'spark3.1.3'
+          - flinkProfile: 'flink1.13'
+            sparkProfile: 'spark'
+            sparkRuntime: 'spark2.4.8'
+          - flinkProfile: 'flink1.13'
+            sparkProfile: 'spark2.4'
+            sparkRuntime: 'spark2.4.8'
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up JDK 8
+        uses: actions/setup-java@v2
+        with:
+          java-version: '8'
+          distribution: 'adopt'
+          architecture: x64
+      - name: IT - Bundle Validation - OpenJDK 8
+        env:
+          FLINK_PROFILE: ${{ matrix.flinkProfile }}
+          SPARK_PROFILE: ${{ matrix.sparkProfile }}
+          SPARK_RUNTIME: ${{ matrix.sparkRuntime }}
+        run: |
+          ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk8 $STAGING_REPO_NUM
+      - name: IT - Bundle Validation - OpenJDK 11
+        env:
+          FLINK_PROFILE: ${{ matrix.flinkProfile }}
+          SPARK_PROFILE: ${{ matrix.sparkProfile }}
+          SPARK_RUNTIME: ${{ matrix.sparkRuntime }}
+        if: ${{ startsWith(env.SPARK_PROFILE, 'spark3') }} # Only Spark 3.x supports Java 11 as of now
+        run: |
+          ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk11 $STAGING_REPO_NUM
+      - name: IT - Bundle Validation - OpenJDK 17
+        env:
+          FLINK_PROFILE: ${{ matrix.flinkProfile }}
+          SPARK_PROFILE: ${{ matrix.sparkProfile }}
+          SPARK_RUNTIME: ${{ matrix.sparkRuntime }}
+        if: ${{ endsWith(env.SPARK_PROFILE, '3.3') }} # Only Spark 3.3 supports Java 17 as of now
+        run: |
+          ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION openjdk17 $STAGING_REPO_NUM
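This job, parked behind `if: false` (presumably to be enabled while a release candidate is under vote), points `ci_run.sh` at a Maven staging repository via `HUDI_VERSION` and `STAGING_REPO_NUM`; the `0.13.1-rcx`/`1123` values above are placeholders. A sketch of a one-off manual run, with hypothetical values:

```
# Sketch: manual RC bundle validation (both values hypothetical)
export HUDI_VERSION=0.13.1-rc1
export STAGING_REPO_NUM=1123
./packaging/bundle-validation/ci_run.sh "$HUDI_VERSION" openjdk8 "$STAGING_REPO_NUM"
```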
+  integration-tests:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - sparkProfile: 'spark2.4'
+            sparkArchive: 'spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz'
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up JDK 8
+        uses: actions/setup-java@v2
+        with:
+          java-version: '8'
+          distribution: 'adopt'
+          architecture: x64
+      - name: Build Project
+        env:
+          SPARK_PROFILE: ${{ matrix.sparkProfile }}
+          SCALA_PROFILE: '-Dscala-2.11 -Dscala.binary.version=2.11'
+        run:
+          mvn clean install $SCALA_PROFILE -D"$SPARK_PROFILE" -Pintegration-tests -DskipTests=true $MVN_ARGS
+      - name: 'UT integ-test'
+        env:
+          SPARK_PROFILE: ${{ matrix.sparkProfile }}
+          SCALA_PROFILE: '-Dscala-2.11 -Dscala.binary.version=2.11'
+        run:
+          mvn test $SCALA_PROFILE -D"$SPARK_PROFILE" -Pintegration-tests -DskipUTs=false -DskipITs=true -pl hudi-integ-test $MVN_ARGS
+      - name: 'IT'
+        env:
+          SPARK_PROFILE: ${{ matrix.sparkProfile }}
+          SPARK_ARCHIVE: ${{ matrix.sparkArchive }}
+          SCALA_PROFILE: '-Dscala-2.11 -Dscala.binary.version=2.11'
+        run: |
+          echo "Downloading $SPARK_ARCHIVE"
+          curl https://archive.apache.org/dist/spark/$SPARK_ARCHIVE --create-dirs -o $GITHUB_WORKSPACE/$SPARK_ARCHIVE
+          tar -xvf $GITHUB_WORKSPACE/$SPARK_ARCHIVE -C $GITHUB_WORKSPACE/
+          mkdir /tmp/spark-events/
+          SPARK_ARCHIVE_BASENAME=$(basename $SPARK_ARCHIVE)
+          export SPARK_HOME=$GITHUB_WORKSPACE/${SPARK_ARCHIVE_BASENAME%.*}
+          mvn verify $SCALA_PROFILE -D"$SPARK_PROFILE" -Pintegration-tests -pl !hudi-flink-datasource/hudi-flink $MVN_ARGS
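The `SPARK_HOME` export relies on two shell idioms: `basename` drops the `spark-2.4.4/` directory prefix, and `${SPARK_ARCHIVE_BASENAME%.*}` strips the shortest trailing `.`-suffix, i.e. `.tgz`. A sketch:

```
# Sketch: how the workflow derives SPARK_HOME from the archive path
SPARK_ARCHIVE='spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz'
SPARK_ARCHIVE_BASENAME=$(basename "$SPARK_ARCHIVE")   # spark-2.4.4-bin-hadoop2.7.tgz
echo "${SPARK_ARCHIVE_BASENAME%.*}"                   # spark-2.4.4-bin-hadoop2.7
```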
52 changes: 31 additions & 21 deletions README.md
@@ -66,9 +66,11 @@ git clone https://github.com/apache/hudi.git && cd hudi
mvn clean package -DskipTests
# Start command
-spark-2.4.4-bin-hadoop2.7/bin/spark-shell \
-  --jars `ls packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-*.*.*-SNAPSHOT.jar` \
+spark-3.2.3-bin-hadoop3.2/bin/spark-shell \
+  --jars `ls packaging/hudi-spark-bundle/target/hudi-spark3.2-bundle_2.12-*.*.*-SNAPSHOT.jar` \
  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+  --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
+  --conf 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog' \
  --conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
```
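The two added `--conf` lines register Hudi's SQL session extension and catalog, which Hudi's Spark SQL support expects on Spark 3.2+. Assuming the same local build, the options should carry over unchanged to `spark-sql` for SQL-first usage (a sketch, paths as above):

```
spark-3.2.3-bin-hadoop3.2/bin/spark-sql \
  --jars `ls packaging/hudi-spark-bundle/target/hudi-spark3.2-bundle_2.12-*.*.*-SNAPSHOT.jar` \
  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
  --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
  --conf 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog' \
  --conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'
```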

@@ -82,29 +84,32 @@ mvn clean javadoc:aggregate -Pjavadocs

### Build with different Spark versions

-The default Spark 2.x version supported is 2.4.4. The default Spark 3.x version, corresponding to `spark3` profile is 3.3.1.
-Refer to the table below for building with different Spark and Scala versions.
+The default Spark 2.x version supported is 2.4.4. The default Spark 3.x version, corresponding to the `spark3` profile, is
+3.4.0. The default Scala version is 2.12. Refer to the table below for building with different Spark and Scala versions.

| Maven build options       | Expected Spark bundle jar name               | Notes                                            |
|:--------------------------|:---------------------------------------------|:-------------------------------------------------|
-| (empty)                   | hudi-spark-bundle_2.11 (legacy bundle name)  | For Spark 2.4.4 and Scala 2.11 (default options) |
-| `-Dspark2.4`              | hudi-spark2.4-bundle_2.11                    | For Spark 2.4.4 and Scala 2.11 (same as default) |
-| `-Dspark3.1 -Dscala-2.12` | hudi-spark3.1-bundle_2.12                    | For Spark 3.1.x and Scala 2.12                   |
-| `-Dspark3.2 -Dscala-2.12` | hudi-spark3.2-bundle_2.12                    | For Spark 3.2.x and Scala 2.12                   |
-| `-Dspark3.3 -Dscala-2.12` | hudi-spark3.3-bundle_2.12                    | For Spark 3.3.x and Scala 2.12                   |
-| `-Dspark3`                | hudi-spark3-bundle_2.12 (legacy bundle name) | For Spark 3.3.x and Scala 2.12                   |
-| `-Dscala-2.12`            | hudi-spark-bundle_2.12 (legacy bundle name)  | For Spark 2.4.4 and Scala 2.12                   |
+| (empty)                   | hudi-spark3.2-bundle_2.12                    | For Spark 3.2.x and Scala 2.12 (default options) |
+| `-Dspark2.4 -Dscala-2.11` | hudi-spark2.4-bundle_2.11                    | For Spark 2.4.4 and Scala 2.11                   |
+| `-Dspark3.0`              | hudi-spark3.0-bundle_2.12                    | For Spark 3.0.x and Scala 2.12                   |
+| `-Dspark3.1`              | hudi-spark3.1-bundle_2.12                    | For Spark 3.1.x and Scala 2.12                   |
+| `-Dspark3.2`              | hudi-spark3.2-bundle_2.12                    | For Spark 3.2.x and Scala 2.12 (same as default) |
+| `-Dspark3.3`              | hudi-spark3.3-bundle_2.12                    | For Spark 3.3.x and Scala 2.12                   |
+| `-Dspark3.4`              | hudi-spark3.4-bundle_2.12                    | For Spark 3.4.x and Scala 2.12                   |
+| `-Dspark2 -Dscala-2.11`   | hudi-spark-bundle_2.11 (legacy bundle name)  | For Spark 2.4.4 and Scala 2.11                   |
+| `-Dspark2 -Dscala-2.12`   | hudi-spark-bundle_2.12 (legacy bundle name)  | For Spark 2.4.4 and Scala 2.12                   |
+| `-Dspark3`                | hudi-spark3-bundle_2.12 (legacy bundle name) | For Spark 3.4.x and Scala 2.12                   |

For example,
```
# Build against Spark 3.2.x
-mvn clean package -DskipTests -Dspark3.2 -Dscala-2.12
+mvn clean package -DskipTests
-# Build against Spark 3.1.x
-mvn clean package -DskipTests -Dspark3.1 -Dscala-2.12
+# Build against Spark 3.4.x
+mvn clean package -DskipTests -Dspark3.4
# Build against Spark 2.4.4 and Scala 2.11
-mvn clean package -DskipTests -Dspark2.4
+mvn clean package -DskipTests -Dspark2.4 -Dscala-2.11
```

#### What about "spark-avro" module?
@@ -123,21 +128,21 @@ Refer to the table below for building with different Flink and Scala versions.
| `-Dflink1.17` | hudi-flink1.17-bundle | For Flink 1.17 (same as default) |
| `-Dflink1.16` | hudi-flink1.16-bundle | For Flink 1.16 |
| `-Dflink1.15` | hudi-flink1.15-bundle | For Flink 1.15 |
-| `-Dflink1.14 -Dscala-2.12` | hudi-flink1.14-bundle | For Flink 1.14 and Scala 2.12    |
-| `-Dflink1.14`              | hudi-flink1.14-bundle | For Flink 1.14 and Scala 2.11    |
-| `-Dflink1.13 -Dscala-2.12` | hudi-flink1.13-bundle | For Flink 1.13 and Scala 2.12    |
-| `-Dflink1.13`              | hudi-flink1.13-bundle | For Flink 1.13 and Scala 2.11    |
+| `-Dflink1.14`              | hudi-flink1.14-bundle | For Flink 1.14 and Scala 2.12    |
+| `-Dflink1.14 -Dscala-2.11` | hudi-flink1.14-bundle | For Flink 1.14 and Scala 2.11    |
+| `-Dflink1.13`              | hudi-flink1.13-bundle | For Flink 1.13 and Scala 2.12    |
+| `-Dflink1.13 -Dscala-2.11` | hudi-flink1.13-bundle | For Flink 1.13 and Scala 2.11    |

For example,
```
# Build against Flink 1.15.x
mvn clean package -DskipTests -Dflink1.15
# Build against Flink 1.14.x and Scala 2.11
-mvn clean package -DskipTests -Dflink1.14
+mvn clean package -DskipTests -Dflink1.14 -Dscala-2.11
# Build against Flink 1.13.x and Scala 2.12
-mvn clean package -DskipTests -Dflink1.13 -Dscala-2.12
+mvn clean package -DskipTests -Dflink1.13
```

## Running Tests
@@ -152,6 +157,11 @@ Functional tests, which are tagged with `@Tag("functional")`, can be run with maven profile `functional-tests`.
mvn -Pfunctional-tests test
```

+Integration tests can be run with maven profile `integration-tests`.
+```
+mvn -Pintegration-tests verify
+```

To run tests with spark event logging enabled, define the Spark event log directory. This allows visualizing test DAG and stages using Spark History Server UI.
```
mvn -Punit-tests test -DSPARK_EVLOG_DIR=/path/for/spark/event/log
51 changes: 7 additions & 44 deletions azure-pipelines-20230430.yml
@@ -46,8 +46,9 @@ parameters:
    default:
      - 'hudi-spark-datasource'
      - 'hudi-spark-datasource/hudi-spark'
-      - 'hudi-spark-datasource/hudi-spark2'
-      - 'hudi-spark-datasource/hudi-spark2-common'
+      - 'hudi-spark-datasource/hudi-spark3.2.x'
+      - 'hudi-spark-datasource/hudi-spark3.2plus-common'
+      - 'hudi-spark-datasource/hudi-spark3-common'
      - 'hudi-spark-datasource/hudi-spark-common'
- name: job4UTModules
type: object
@@ -68,8 +69,9 @@
      - '!hudi-flink-datasource/hudi-flink1.17.x'
      - '!hudi-spark-datasource'
      - '!hudi-spark-datasource/hudi-spark'
-      - '!hudi-spark-datasource/hudi-spark2'
-      - '!hudi-spark-datasource/hudi-spark2-common'
+      - '!hudi-spark-datasource/hudi-spark3.2.x'
+      - '!hudi-spark-datasource/hudi-spark3.2plus-common'
+      - '!hudi-spark-datasource/hudi-spark3-common'
      - '!hudi-spark-datasource/hudi-spark-common'
- name: job4FTModules
type: object
@@ -90,13 +92,10 @@
      - '!hudi-flink-datasource/hudi-flink1.17.x'

variables:
-  BUILD_PROFILES: '-Dscala-2.11 -Dspark2.4 -Dflink1.17'
+  BUILD_PROFILES: '-Dscala-2.12 -Dspark3.2 -Dflink1.17'
  PLUGIN_OPTS: '-Dcheckstyle.skip=true -Drat.skip=true -Djacoco.skip=true -ntp -B -V -Pwarn-log -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn'
  MVN_OPTS_INSTALL: '-Phudi-platform-service -DskipTests $(BUILD_PROFILES) $(PLUGIN_OPTS)'
  MVN_OPTS_TEST: '-fae -Pwarn-log $(BUILD_PROFILES) $(PLUGIN_OPTS)'
-  SPARK_VERSION: '2.4.4'
-  HADOOP_VERSION: '2.7'
-  SPARK_ARCHIVE: spark-$(SPARK_VERSION)-bin-hadoop$(HADOOP_VERSION)
  JOB1_MODULES: ${{ join(',',parameters.job1Modules) }}
  JOB2_MODULES: ${{ join(',',parameters.job2Modules) }}
  JOB3_MODULES: ${{ join(',',parameters.job3UTModules) }}
@@ -220,39 +219,3 @@ stages:
- script: |
grep "testcase" */target/surefire-reports/*.xml */*/target/surefire-reports/*.xml | awk -F'"' ' { print $6,$4,$2 } ' | sort -nr | head -n 100
displayName: Top 100 long-running testcases
-  - job: IT
-    displayName: IT modules
-    timeoutInMinutes: '150'
-    steps:
-      - task: Maven@4
-        displayName: maven install
-        inputs:
-          mavenPomFile: 'pom.xml'
-          goals: 'clean install'
-          options: $(MVN_OPTS_INSTALL) -Pintegration-tests
-          publishJUnitResults: false
-          jdkVersionOption: '1.8'
-      - task: Maven@4
-        displayName: UT integ-test
-        inputs:
-          mavenPomFile: 'pom.xml'
-          goals: 'test'
-          options: $(MVN_OPTS_TEST) -Pintegration-tests -DskipUTs=false -DskipITs=true -pl hudi-integ-test
-          publishJUnitResults: false
-          jdkVersionOption: '1.8'
-          mavenOptions: '-Xmx4g'
-      - task: AzureCLI@2
-        displayName: Prepare for IT
-        inputs:
-          azureSubscription: apachehudici-service-connection
-          scriptType: bash
-          scriptLocation: inlineScript
-          inlineScript: |
-            echo 'Downloading $(SPARK_ARCHIVE)'
-            az storage blob download -c ci-caches -n $(SPARK_ARCHIVE).tgz -f $(Pipeline.Workspace)/$(SPARK_ARCHIVE).tgz --account-name apachehudici
-            tar -xvf $(Pipeline.Workspace)/$(SPARK_ARCHIVE).tgz -C $(Pipeline.Workspace)/
-            mkdir /tmp/spark-events/
-      - script: |
-          export SPARK_HOME=$(Pipeline.Workspace)/$(SPARK_ARCHIVE)
-          mvn $(MVN_OPTS_TEST) -Pintegration-tests verify
-        displayName: IT
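The Azure IT job and its `SPARK_*` variables are removed here; the equivalent Spark 2.4 integration-test coverage moves to the new `integration-tests` job in `.github/workflows/bot.yml` above. A sketch of reproducing that run locally, assuming a Spark 2.4.4 (Hadoop 2.7) distribution is already unpacked:

```
# Sketch: local equivalent of the relocated integration-test run (local paths assumed)
mvn clean install -Dscala-2.11 -Dscala.binary.version=2.11 -Dspark2.4 -Pintegration-tests -DskipTests=true
export SPARK_HOME=/path/to/spark-2.4.4-bin-hadoop2.7   # hypothetical location
mkdir -p /tmp/spark-events
mvn verify -Dscala-2.11 -Dscala.binary.version=2.11 -Dspark2.4 -Pintegration-tests -pl '!hudi-flink-datasource/hudi-flink'
```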