refactor bundle tests setup
xushiyan committed Oct 25, 2022
1 parent 04f1a41 commit 4025ed7
Showing 16 changed files with 5,412 additions and 138 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bot.yml
@@ -72,7 +72,7 @@ jobs:
if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
run: |
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh $HUDI_VERSION
./packaging/bundle-validation/ci_run.sh $HUDI_VERSION
- name: Spark SQL Test
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
46 changes: 12 additions & 34 deletions packaging/bundle-validation/Dockerfile
@@ -14,38 +14,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
FROM adoptopenjdk/openjdk8:alpine

RUN apk add --no-cache --upgrade bash

RUN mkdir /opt/bundle-validation
ENV WORKDIR=/opt/bundle-validation
WORKDIR $WORKDIR

ARG HADOOP_VERSION=2.7.7
ARG HIVE_VERSION=3.1.3
ARG DERBY_VERSION=10.14.1.0
ARG SPARK_VERSION=3.1.3
ARG SPARK_HADOOP_VERSION=2.7

RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz
ENV HADOOP_HOME=$WORKDIR/hadoop-$HADOOP_VERSION

RUN wget https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz
ENV HIVE_HOME=$WORKDIR/apache-hive-$HIVE_VERSION-bin

RUN wget https://archive.apache.org/dist/db/derby/db-derby-$DERBY_VERSION/db-derby-$DERBY_VERSION-bin.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz
ENV DERBY_HOME=$WORKDIR/db-derby-$DERBY_VERSION-bin

RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -P "$WORKDIR" \
&& tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
&& rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION
COPY validate.sh .

ARG IMAGE_TAG=spark313hive313
FROM apachehudi/hudi-ci-bundle-validation-base:$IMAGE_TAG

# configure the stack
ADD . .
ENV HUDI_CONF_DIR=$WORKDIR/conf
RUN cp conf/hive-site.xml $HIVE_HOME/conf/
RUN cp conf/hive-site.xml $SPARK_HOME/conf/
RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
RUN cp conf/spark-defaults.conf $SPARK_HOME/conf/
RUN if [[ $SPARK_HOME == *"spark-3.2"* ]] || [[ $SPARK_HOME == *"spark-3.3"* ]]; \
then printf "\nspark.sql.catalog.spark_catalog org.apache.spark.sql.hudi.catalog.HoodieCatalog\n" >> $SPARK_HOME/conf/spark-defaults.conf; fi
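
With the refactor, this Dockerfile only layers Hudi-specific configuration onto a prebuilt base image, so a local rebuild reduces to choosing a base tag. A minimal sketch, assuming the spark313hive313 base tag has been published under apachehudi/ as ci_run.sh below expects:

# IMAGE_TAG picks the Spark/Hive stack baked into the base image.
cd packaging/bundle-validation
docker build \
  --build-arg IMAGE_TAG=spark313hive313 \
  -t hudi-ci-bundle-validation:spark313hive313 \
  .
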
49 changes: 49 additions & 0 deletions packaging/bundle-validation/Dockerfile-base
@@ -0,0 +1,49 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
FROM adoptopenjdk/openjdk8:alpine

RUN apk add --no-cache --upgrade bash

RUN mkdir /opt/bundle-validation
ENV WORKDIR=/opt/bundle-validation
WORKDIR $WORKDIR

ARG HADOOP_VERSION=2.7.7
ARG HIVE_VERSION=3.1.3
ARG DERBY_VERSION=10.14.1.0
ARG SPARK_VERSION=3.1.3
ARG SPARK_HADOOP_VERSION=2.7

RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz
ENV HADOOP_HOME=$WORKDIR/hadoop-$HADOOP_VERSION

RUN wget https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz
ENV HIVE_HOME=$WORKDIR/apache-hive-$HIVE_VERSION-bin

RUN wget https://archive.apache.org/dist/db/derby/db-derby-$DERBY_VERSION/db-derby-$DERBY_VERSION-bin.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz
ENV DERBY_HOME=$WORKDIR/db-derby-$DERBY_VERSION-bin

RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -P "$WORKDIR" \
&& tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
&& rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION
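
The base image pins its whole stack through build args, so producing a variant for another Spark profile is just a matter of overriding them. A sketch, following the apachehudi/hudi-ci-bundle-validation-base naming that ci_run.sh relies on (the push step assumes Docker Hub credentials):

# Build and publish a base image for a different Spark version.
docker build -f Dockerfile-base \
  --build-arg SPARK_VERSION=3.2.2 \
  --build-arg SPARK_HADOOP_VERSION=2.7 \
  -t apachehudi/hudi-ci-bundle-validation-base:spark322hive313 \
  .
docker push apachehudi/hudi-ci-bundle-validation-base:spark322hive313
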
46 changes: 17 additions & 29 deletions packaging/bundle-validation/ci_run.sh
@@ -18,8 +18,13 @@
# under the License.

# Note:
# this script is to run by GitHub Actions CI tasks from the project root directory
# and contains environment-specific variables
#
# This script is to
# - set the corresponding variables based on CI job's build profiles
# - prepare Hudi bundle jars for mounting into Docker container for validation
#
# This is to be run by GitHub Actions CI tasks from the project root directory
# and contains the CI environment-specific variables.

HUDI_VERSION=$1

@@ -54,32 +59,14 @@ elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
IMAGE_TAG=spark330hive313
fi

# Copy bundle jars
BUNDLE_VALIDATION_DIR=${GITHUB_WORKSPACE}/bundle-validation
mkdir $BUNDLE_VALIDATION_DIR
JARS_DIR=${BUNDLE_VALIDATION_DIR}/jars
mkdir $JARS_DIR
cp ${GITHUB_WORKSPACE}/packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar $JARS_DIR/
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-bundle/target/hudi-*-$HUDI_VERSION.jar $JARS_DIR/
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-slim-bundle/target/hudi-*-$HUDI_VERSION.jar $JARS_DIR/
# Copy bundle jars to temp dir for mounting
TMP_JARS_DIR=/tmp/jars/$(date +%s)
mkdir -p $TMP_JARS_DIR
cp ${GITHUB_WORKSPACE}/packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-slim-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
echo 'Validating jars below:'
ls -l $JARS_DIR

# Copy hive data
cp -r ${GITHUB_WORKSPACE}/packaging/bundle-validation/hive ${BUNDLE_VALIDATION_DIR}/

# Copy utilities data
cp -r ${GITHUB_WORKSPACE}/packaging/bundle-validation/utilities ${BUNDLE_VALIDATION_DIR}/
cp -r ${GITHUB_WORKSPACE}/docker/demo/data ${BUNDLE_VALIDATION_DIR}/utilities/
cp ${GITHUB_WORKSPACE}/docker/demo/config/schema.avsc ${BUNDLE_VALIDATION_DIR}/utilities/

# add shell args to utilities data
SHELL_ARGS=" --conf spark.serializer=org.apache.spark.serializer.KryoSerializer"
if [[ $SPARK_PROFILE = "spark3.2" || $SPARK_PROFILE = "spark3.3" ]]; then
SHELL_ARGS+=" --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog"
fi
SHELL_ARGS+=" --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension"
echo $SHELL_ARGS > ${BUNDLE_VALIDATION_DIR}/utilities/shell_args
ls -l $TMP_JARS_DIR

# build docker image
cd ${GITHUB_WORKSPACE}/packaging/bundle-validation || exit 1
@@ -89,8 +76,9 @@ docker build \
--build-arg DERBY_VERSION=$DERBY_VERSION \
--build-arg SPARK_VERSION=$SPARK_VERSION \
--build-arg SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION \
--build-arg IMAGE_TAG=$IMAGE_TAG \
-t hudi-ci-bundle-validation:$IMAGE_TAG \
.

# run script in docker
docker run -v ${GITHUB_WORKSPACE}/bundle-validation:/opt/bundle-validation/data -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
# run validation script in docker
docker run -v $TMP_JARS_DIR:/opt/bundle-validation/jars -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
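
Because the script only depends on GITHUB_WORKSPACE and the profile variables that the workflow exports, it can also be exercised outside CI. A hypothetical local run from the project root (profile values are illustrative):

export GITHUB_WORKSPACE=$PWD
export SPARK_PROFILE=spark3.1
export SCALA_PROFILE=scala-2.12
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
./packaging/bundle-validation/ci_run.sh $HUDI_VERSION
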
@@ -15,6 +15,8 @@
# limitations under the License.
#

spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
hoodie.upsert.shuffle.parallelism 8
hoodie.insert.shuffle.parallelism 8
hoodie.delete.shuffle.parallelism 8
hoodie.bulkinsert.shuffle.parallelism 8
hoodie.finalize.write.parallelism 8
22 changes: 22 additions & 0 deletions packaging/bundle-validation/conf/spark-defaults.conf
@@ -0,0 +1,22 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
spark.default.parallelism 8
spark.sql.shuffle.partitions 8
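
This file replaces the per-run shell_args mechanism that ci_run.sh used to write: once the Dockerfile copies it into $SPARK_HOME/conf/, every spark-shell and spark-submit in the container inherits the serializer, extension, and parallelism settings with no extra --conf flags. A quick sanity check inside the container (a sketch):

# No --conf flags needed; the value comes from spark-defaults.conf.
$SPARK_HOME/bin/spark-shell <<'EOF'
println(spark.conf.get("spark.sql.extensions"))
EOF
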
28 changes: 22 additions & 6 deletions packaging/bundle-validation/utilities/commands.scala
@@ -1,9 +1,25 @@
val hudiDf = spark.read.format("org.apache.hudi").load("/tmp/hudi-utilities-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/data/utilities/data")
hudiDf.registerTempTable("hudi_tbl")
inputDf.registerTempTable("src_tbl")
val hudiCount = spark.sql("select distinct date, key from hudi_tbl").count()
val srcCount = spark.sql("select distinct date, key from src_tbl").count()
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

val hudiDf = spark.read.format("hudi").load("/tmp/hudi-utilities-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/utilities/stocks/data")
val hudiCount = hudiDf.select("date", "key").distinct.count
val srcCount = inputDf.select("date", "key").distinct.count
if (hudiCount == srcCount) System.exit(0)
println(s"Counts don't match hudiCount: $hudiCount, srcCount: $srcCount")
System.exit(1)
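
The scriptlet reports success purely through its exit status, which keeps the shell side trivial. validate.sh is not part of this diff, so the following driver is an assumption about how it might invoke the check:

# Run the count comparison; the jar name is illustrative.
$SPARK_HOME/bin/spark-shell \
  --jars /opt/bundle-validation/jars/hudi-spark3.1-bundle_2.12-<HUDI_VERSION>.jar \
  -i utilities/commands.scala
[ $? -eq 0 ] || { echo "utilities bundle validation failed"; exit 1; }
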
23 changes: 23 additions & 0 deletions packaging/bundle-validation/utilities/hoodieapp.properties
@@ -0,0 +1,23 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

hoodie.datasource.write.recordkey.field=key
hoodie.datasource.write.partitionpath.field=date
hoodie.datasource.write.precombine.field=ts
hoodie.metadata.enable=true
hoodie.deltastreamer.source.dfs.root=file:///opt/bundle-validation/utilities/stocks/data
hoodie.deltastreamer.schemaprovider.target.schema.file=file:///opt/bundle-validation/utilities/stocks/schema.avsc
hoodie.deltastreamer.schemaprovider.source.schema.file=file:///opt/bundle-validation/utilities/stocks/schema.avsc
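
These properties feed HoodieDeltaStreamer the record keys and the stocks data/schema shipped with the test setup. The actual invocation lives in validate.sh, which this diff does not show; the sketch below uses standard Hudi utilities flags and an illustrative jar name:

$SPARK_HOME/bin/spark-submit \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
  /opt/bundle-validation/jars/hudi-utilities-bundle_2.12-<HUDI_VERSION>.jar \
  --props /opt/bundle-validation/utilities/hoodieapp.properties \
  --source-class org.apache.hudi.utilities.sources.JsonDFSSource \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
  --target-base-path /tmp/hudi-utilities-test/ \
  --target-table stocks \
  --table-type COPY_ON_WRITE
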
28 changes: 22 additions & 6 deletions packaging/bundle-validation/utilities/slimcommands.scala
@@ -1,9 +1,25 @@
val hudiDf = spark.read.format("org.apache.hudi").load("/tmp/hudi-utilities-slim-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/data/utilities/data")
hudiDf.registerTempTable("hudi_tbl")
inputDf.registerTempTable("src_tbl")
val hudiCount = spark.sql("select distinct date, key from hudi_tbl").count()
val srcCount = spark.sql("select distinct date, key from src_tbl").count()
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

val hudiDf = spark.read.format("hudi").load("/tmp/hudi-utilities-slim-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/utilities/stocks/data")
val hudiCount = hudiDf.select("date", "key").distinct.count
val srcCount = inputDf.select("date", "key").distinct.count
if (hudiCount == srcCount) System.exit(0)
println(s"Counts don't match hudiCount: $hudiCount, srcCount: $srcCount")
System.exit(1)
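
slimcommands.scala differs from commands.scala only in the table path: the slim utilities bundle ships without the Spark integration, so its validation pairs it with the matching hudi-spark bundle on the classpath. Roughly (again an assumption, with illustrative jar names):

# Same submit as above, but the slim bundle needs the spark bundle via --jars.
$SPARK_HOME/bin/spark-submit \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
  --jars /opt/bundle-validation/jars/hudi-spark3.1-bundle_2.12-<HUDI_VERSION>.jar \
  /opt/bundle-validation/jars/hudi-utilities-slim-bundle_2.12-<HUDI_VERSION>.jar \
  --props /opt/bundle-validation/utilities/hoodieapp.properties \
  --source-class org.apache.hudi.utilities.sources.JsonDFSSource \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
  --target-base-path /tmp/hudi-utilities-slim-test/ \
  --target-table stocks \
  --table-type COPY_ON_WRITE
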
