refactor bundle tests setup
xushiyan committed Oct 25, 2022
1 parent 04f1a41 commit 4025ed7
Showing 16 changed files with 5,412 additions and 138 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bot.yml
@@ -72,7 +72,7 @@ jobs:
if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
run: |
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh $HUDI_VERSION
./packaging/bundle-validation/ci_run.sh $HUDI_VERSION
- name: Spark SQL Test
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
46 changes: 12 additions & 34 deletions packaging/bundle-validation/Dockerfile
@@ -14,38 +14,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
FROM adoptopenjdk/openjdk8:alpine

RUN apk add --no-cache --upgrade bash

RUN mkdir /opt/bundle-validation
ENV WORKDIR=/opt/bundle-validation
WORKDIR $WORKDIR

ARG HADOOP_VERSION=2.7.7
ARG HIVE_VERSION=3.1.3
ARG DERBY_VERSION=10.14.1.0
ARG SPARK_VERSION=3.1.3
ARG SPARK_HADOOP_VERSION=2.7

RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz
ENV HADOOP_HOME=$WORKDIR/hadoop-$HADOOP_VERSION

RUN wget https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz
ENV HIVE_HOME=$WORKDIR/apache-hive-$HIVE_VERSION-bin

RUN wget https://archive.apache.org/dist/db/derby/db-derby-$DERBY_VERSION/db-derby-$DERBY_VERSION-bin.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz
ENV DERBY_HOME=$WORKDIR/db-derby-$DERBY_VERSION-bin

RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -P "$WORKDIR" \
&& tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
&& rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION
COPY validate.sh .

ARG IMAGE_TAG=spark313hive313
FROM apachehudi/hudi-ci-bundle-validation-base:$IMAGE_TAG

# configure the stack
ADD . .
ENV HUDI_CONF_DIR=$WORKDIR/conf
RUN cp conf/hive-site.xml $HIVE_HOME/conf/
RUN cp conf/hive-site.xml $SPARK_HOME/conf/
RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
RUN cp conf/spark-defaults.conf $SPARK_HOME/conf/
RUN if [[ $SPARK_HOME == *"spark-3.2"* ]] || [[ $SPARK_HOME == *"spark-3.3"* ]]; \
then printf "\nspark.sql.catalog.spark_catalog org.apache.spark.sql.hudi.catalog.HoodieCatalog\n" >> $SPARK_HOME/conf/spark-defaults.conf; fi
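
With the refactor, this Dockerfile only layers Hudi-specific configuration onto a prebuilt base image, so a local rebuild reduces to choosing a base tag. A minimal sketch, assuming the spark313hive313 base tag has been published under apachehudi/ as ci_run.sh below expects:

# IMAGE_TAG picks the Spark/Hive stack baked into the base image.
cd packaging/bundle-validation
docker build \
  --build-arg IMAGE_TAG=spark313hive313 \
  -t hudi-ci-bundle-validation:spark313hive313 \
  .
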
49 changes: 49 additions & 0 deletions packaging/bundle-validation/Dockerfile-base
@@ -0,0 +1,49 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
FROM adoptopenjdk/openjdk8:alpine

RUN apk add --no-cache --upgrade bash

RUN mkdir /opt/bundle-validation
ENV WORKDIR=/opt/bundle-validation
WORKDIR $WORKDIR

ARG HADOOP_VERSION=2.7.7
ARG HIVE_VERSION=3.1.3
ARG DERBY_VERSION=10.14.1.0
ARG SPARK_VERSION=3.1.3
ARG SPARK_HADOOP_VERSION=2.7

RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz
ENV HADOOP_HOME=$WORKDIR/hadoop-$HADOOP_VERSION

RUN wget https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz
ENV HIVE_HOME=$WORKDIR/apache-hive-$HIVE_VERSION-bin

RUN wget https://archive.apache.org/dist/db/derby/db-derby-$DERBY_VERSION/db-derby-$DERBY_VERSION-bin.tar.gz -P "$WORKDIR" \
&& tar -xf $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz -C $WORKDIR/ \
&& rm $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz
ENV DERBY_HOME=$WORKDIR/db-derby-$DERBY_VERSION-bin

RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -P "$WORKDIR" \
&& tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
&& rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION
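
The base image pins its whole stack through build args, so producing a variant for another Spark profile is just a matter of overriding them. A sketch, following the apachehudi/hudi-ci-bundle-validation-base naming that ci_run.sh relies on (the push step assumes Docker Hub credentials):

# Build and publish a base image for a different Spark version.
docker build -f Dockerfile-base \
  --build-arg SPARK_VERSION=3.2.2 \
  --build-arg SPARK_HADOOP_VERSION=2.7 \
  -t apachehudi/hudi-ci-bundle-validation-base:spark322hive313 \
  .
docker push apachehudi/hudi-ci-bundle-validation-base:spark322hive313
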
46 changes: 17 additions & 29 deletions packaging/bundle-validation/ci_run.sh
@@ -18,8 +18,13 @@
# under the License.

# Note:
# this script is to run by GitHub Actions CI tasks from the project root directory
# and contains environment-specific variables
#
# This script is to
# - set the corresponding variables based on CI job's build profiles
# - prepare Hudi bundle jars for mounting into Docker container for validation
#
# This is to be run by GitHub Actions CI tasks from the project root directory
# and contains the CI environment-specific variables.

HUDI_VERSION=$1

@@ -54,32 +59,14 @@ elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
IMAGE_TAG=spark330hive313
fi

# Copy bundle jars
BUNDLE_VALIDATION_DIR=${GITHUB_WORKSPACE}/bundle-validation
mkdir $BUNDLE_VALIDATION_DIR
JARS_DIR=${BUNDLE_VALIDATION_DIR}/jars
mkdir $JARS_DIR
cp ${GITHUB_WORKSPACE}/packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar $JARS_DIR/
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-bundle/target/hudi-*-$HUDI_VERSION.jar $JARS_DIR/
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-slim-bundle/target/hudi-*-$HUDI_VERSION.jar $JARS_DIR/
# Copy bundle jars to temp dir for mounting
TMP_JARS_DIR=/tmp/jars/$(date +%s)
mkdir -p $TMP_JARS_DIR
cp ${GITHUB_WORKSPACE}/packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-slim-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
echo 'Validating jars below:'
ls -l $JARS_DIR

# Copy hive data
cp -r ${GITHUB_WORKSPACE}/packaging/bundle-validation/hive ${BUNDLE_VALIDATION_DIR}/

# Copy utilities data
cp -r ${GITHUB_WORKSPACE}/packaging/bundle-validation/utilities ${BUNDLE_VALIDATION_DIR}/
cp -r ${GITHUB_WORKSPACE}/docker/demo/data ${BUNDLE_VALIDATION_DIR}/utilities/
cp ${GITHUB_WORKSPACE}/docker/demo/config/schema.avsc ${BUNDLE_VALIDATION_DIR}/utilities/

# add shell args to utilities data
SHELL_ARGS=" --conf spark.serializer=org.apache.spark.serializer.KryoSerializer"
if [[ $SPARK_PROFILE = "spark3.2" || $SPARK_PROFILE = "spark3.3" ]]; then
SHELL_ARGS+=" --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog"
fi
SHELL_ARGS+=" --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension"
echo $SHELL_ARGS > ${BUNDLE_VALIDATION_DIR}/utilities/shell_args
ls -l $TMP_JARS_DIR

# build docker image
cd ${GITHUB_WORKSPACE}/packaging/bundle-validation || exit 1
@@ -89,8 +76,9 @@ docker build \
--build-arg DERBY_VERSION=$DERBY_VERSION \
--build-arg SPARK_VERSION=$SPARK_VERSION \
--build-arg SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION \
--build-arg IMAGE_TAG=$IMAGE_TAG \
-t hudi-ci-bundle-validation:$IMAGE_TAG \
.

# run script in docker
docker run -v ${GITHUB_WORKSPACE}/bundle-validation:/opt/bundle-validation/data -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
# run validation script in docker
docker run -v $TMP_JARS_DIR:/opt/bundle-validation/jars -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
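
Because the script only depends on GITHUB_WORKSPACE and the profile variables that the workflow exports, it can also be exercised outside CI. A hypothetical local run from the project root (profile values are illustrative):

export GITHUB_WORKSPACE=$PWD
export SPARK_PROFILE=spark3.1
export SCALA_PROFILE=scala-2.12
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
./packaging/bundle-validation/ci_run.sh $HUDI_VERSION
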
@@ -15,6 +15,8 @@
# limitations under the License.
#

spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
hoodie.upsert.shuffle.parallelism 8
hoodie.insert.shuffle.parallelism 8
hoodie.delete.shuffle.parallelism 8
hoodie.bulkinsert.shuffle.parallelism 8
hoodie.finalize.write.parallelism 8
22 changes: 22 additions & 0 deletions packaging/bundle-validation/conf/spark-defaults.conf
@@ -0,0 +1,22 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
spark.default.parallelism 8
spark.sql.shuffle.partitions 8
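
This file replaces the per-run shell_args mechanism that ci_run.sh used to write: once the Dockerfile copies it into $SPARK_HOME/conf/, every spark-shell and spark-submit in the container inherits the serializer, extension, and parallelism settings with no extra --conf flags. A quick sanity check inside the container (a sketch):

# No --conf flags needed; the value comes from spark-defaults.conf.
$SPARK_HOME/bin/spark-shell <<'EOF'
println(spark.conf.get("spark.sql.extensions"))
EOF
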
28 changes: 22 additions & 6 deletions packaging/bundle-validation/utilities/commands.scala
@@ -1,9 +1,25 @@
val hudiDf = spark.read.format("org.apache.hudi").load("/tmp/hudi-utilities-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/data/utilities/data")
hudiDf.registerTempTable("hudi_tbl")
inputDf.registerTempTable("src_tbl")
val hudiCount = spark.sql("select distinct date, key from hudi_tbl").count()
val srcCount = spark.sql("select distinct date, key from src_tbl").count()
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

val hudiDf = spark.read.format("hudi").load("/tmp/hudi-utilities-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/utilities/stocks/data")
val hudiCount = hudiDf.select("date", "key").distinct.count
val srcCount = inputDf.select("date", "key").distinct.count
if (hudiCount == srcCount) System.exit(0)
println(s"Counts don't match hudiCount: $hudiCount, srcCount: $srcCount")
System.exit(1)
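
The scriptlet reports success purely through its exit status, which keeps the shell side trivial. validate.sh is not part of this diff, so the following driver is an assumption about how it might invoke the check:

# Run the count comparison; the jar name is illustrative.
$SPARK_HOME/bin/spark-shell \
  --jars /opt/bundle-validation/jars/hudi-spark3.1-bundle_2.12-<HUDI_VERSION>.jar \
  -i utilities/commands.scala
[ $? -eq 0 ] || { echo "utilities bundle validation failed"; exit 1; }
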
23 changes: 23 additions & 0 deletions packaging/bundle-validation/utilities/hoodieapp.properties
@@ -0,0 +1,23 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

hoodie.datasource.write.recordkey.field=key
hoodie.datasource.write.partitionpath.field=date
hoodie.datasource.write.precombine.field=ts
hoodie.metadata.enable=true
hoodie.deltastreamer.source.dfs.root=file:///opt/bundle-validation/utilities/stocks/data
hoodie.deltastreamer.schemaprovider.target.schema.file=file:///opt/bundle-validation/utilities/stocks/schema.avsc
hoodie.deltastreamer.schemaprovider.source.schema.file=file:///opt/bundle-validation/utilities/stocks/schema.avsc
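
These properties feed HoodieDeltaStreamer the record keys and the stocks data/schema shipped with the test setup. The actual invocation lives in validate.sh, which this diff does not show; the sketch below uses standard Hudi utilities flags and an illustrative jar name:

$SPARK_HOME/bin/spark-submit \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
  /opt/bundle-validation/jars/hudi-utilities-bundle_2.12-<HUDI_VERSION>.jar \
  --props /opt/bundle-validation/utilities/hoodieapp.properties \
  --source-class org.apache.hudi.utilities.sources.JsonDFSSource \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
  --target-base-path /tmp/hudi-utilities-test/ \
  --target-table stocks \
  --table-type COPY_ON_WRITE
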
28 changes: 22 additions & 6 deletions packaging/bundle-validation/utilities/slimcommands.scala
@@ -1,9 +1,25 @@
val hudiDf = spark.read.format("org.apache.hudi").load("/tmp/hudi-utilities-slim-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/data/utilities/data")
hudiDf.registerTempTable("hudi_tbl")
inputDf.registerTempTable("src_tbl")
val hudiCount = spark.sql("select distinct date, key from hudi_tbl").count()
val srcCount = spark.sql("select distinct date, key from src_tbl").count()
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

val hudiDf = spark.read.format("hudi").load("/tmp/hudi-utilities-slim-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/utilities/stocks/data")
val hudiCount = hudiDf.select("date", "key").distinct.count
val srcCount = inputDf.select("date", "key").distinct.count
if (hudiCount == srcCount) System.exit(0)
println(s"Counts don't match hudiCount: $hudiCount, srcCount: $srcCount")
System.exit(1)
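
slimcommands.scala differs from commands.scala only in the table path: the slim utilities bundle ships without the Spark integration, so its validation pairs it with the matching hudi-spark bundle on the classpath. Roughly (again an assumption, with illustrative jar names):

# Same submit as above, but the slim bundle needs the spark bundle via --jars.
$SPARK_HOME/bin/spark-submit \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
  --jars /opt/bundle-validation/jars/hudi-spark3.1-bundle_2.12-<HUDI_VERSION>.jar \
  /opt/bundle-validation/jars/hudi-utilities-slim-bundle_2.12-<HUDI_VERSION>.jar \
  --props /opt/bundle-validation/utilities/hoodieapp.properties \
  --source-class org.apache.hudi.utilities.sources.JsonDFSSource \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
  --target-base-path /tmp/hudi-utilities-slim-test/ \
  --target-table stocks \
  --table-type COPY_ON_WRITE
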
