From 99a8a55c0e59fa0c87cfd28d94c09ebf0d2b11f8 Mon Sep 17 00:00:00 2001 From: Hans Van Akelyen Date: Thu, 4 Aug 2022 15:02:51 +0200 Subject: [PATCH 1/2] HOP-4090: Add Spark integration test and fix starting SparkJob from Hop --- assemblies/plugins/engines/beam/pom.xml | 6 +- .../engines/beam/src/assembly/assembly.xml | 10 +- .../tech/parquet/src/assembly/assembly.xml | 1 - core/pom.xml | 8 +- .../Dockerfile.unit-tests-spark | 113 +++++++ .../integration-tests-base-spark.yaml | 34 ++ .../integration-tests-spark.yaml | 43 +++ .../integration-tests/spark/Dockerfile.master | 49 +++ .../integration-tests/spark/Dockerfile.worker | 50 +++ .../spark/scripts/execute-step.sh | 30 ++ .../spark/scripts/finish-step.sh | 30 ++ .../integration-tests/spark/scripts/master.sh | 33 ++ .../spark/scripts/wait-for-step.sh | 30 ++ .../integration-tests/spark/scripts/worker.sh | 31 ++ .../spark/0001-test-spark-cluster.hpl | 93 ++++++ integration-tests/spark/dev-env-config.json | 3 + integration-tests/spark/hop-config.json | 290 ++++++++++++++++++ .../spark/main-0001-test-spark-cluster.hwf | 92 ++++++ .../pipeline-run-configuration/local.json | 17 + .../pipeline-run-configuration/spark.json | 26 ++ .../workflow-run-configuration/local.json | 9 + integration-tests/spark/project-config.json | 15 + 22 files changed, 1001 insertions(+), 12 deletions(-) create mode 100644 docker/integration-tests/Dockerfile.unit-tests-spark create mode 100644 docker/integration-tests/integration-tests-base-spark.yaml create mode 100644 docker/integration-tests/integration-tests-spark.yaml create mode 100644 docker/integration-tests/spark/Dockerfile.master create mode 100644 docker/integration-tests/spark/Dockerfile.worker create mode 100644 docker/integration-tests/spark/scripts/execute-step.sh create mode 100644 docker/integration-tests/spark/scripts/finish-step.sh create mode 100644 docker/integration-tests/spark/scripts/master.sh create mode 100644 docker/integration-tests/spark/scripts/wait-for-step.sh create mode 100644 docker/integration-tests/spark/scripts/worker.sh create mode 100644 integration-tests/spark/0001-test-spark-cluster.hpl create mode 100644 integration-tests/spark/dev-env-config.json create mode 100644 integration-tests/spark/hop-config.json create mode 100644 integration-tests/spark/main-0001-test-spark-cluster.hwf create mode 100644 integration-tests/spark/metadata/pipeline-run-configuration/local.json create mode 100644 integration-tests/spark/metadata/pipeline-run-configuration/spark.json create mode 100644 integration-tests/spark/metadata/workflow-run-configuration/local.json create mode 100644 integration-tests/spark/project-config.json diff --git a/assemblies/plugins/engines/beam/pom.xml b/assemblies/plugins/engines/beam/pom.xml index 1f2a117c420..27f50d3b0e9 100644 --- a/assemblies/plugins/engines/beam/pom.xml +++ b/assemblies/plugins/engines/beam/pom.xml @@ -187,9 +187,9 @@ 2.10.1 - org.codehaus.woodstox - woodstox-core-asl - 4.4.1 + com.fasterxml.woodstox + woodstox-core + 5.0.3 com.google.guava diff --git a/assemblies/plugins/engines/beam/src/assembly/assembly.xml b/assemblies/plugins/engines/beam/src/assembly/assembly.xml index 64a4d12a579..b25f867db7e 100644 --- a/assemblies/plugins/engines/beam/src/assembly/assembly.xml +++ b/assemblies/plugins/engines/beam/src/assembly/assembly.xml @@ -163,8 +163,6 @@ io.dropwizard.metrics:metrics-json io.dropwizard.metrics:metrics-jvm io.github.classgraph:classgraph - - io.grpc:grpc-alts io.grpc:grpc-api io.grpc:grpc-auth @@ -195,7 +193,6 @@ javax.annotation:javax.annotation-api javax.servlet:javax.servlet-api javax.xml.bind:jaxb-api - javax.xml.stream:stax-api joda-time:joda-time net.razorvine:pyrolite net.sf.py4j:py4j @@ -279,7 +276,7 @@ org.codehaus.jackson:jackson-mapper-asl org.codehaus.jackson:jackson-xc org.codehaus.mojo:animal-sniffer-annotations - org.codehaus.woodstox:woodstox-core-asl + com.fasterxml.woodstox:woodstox-core org.conscrypt:conscrypt-openjdk-uber org.fusesource.leveldbjni:leveldbjni-all org.glassfish.hk2.external:aopalliance-repackaged @@ -287,6 +284,11 @@ org.glassfish.hk2:hk2-locator org.glassfish.hk2:hk2-utils org.glassfish.hk2:osgi-resource-locator + org.glassfish.jersey.containers:jersey-container-servlet + org.glassfish.jersey.containers:jersey-container-servlet-core + org.glassfish.jersey.core:jersey-client + org.glassfish.jersey.core:jersey-common + org.glassfish.jersey.core:jersey-server org.glassfish.jersey.media:jersey-media-jaxb org.lz4:lz4-java org.mortbay.jetty:jetty-util diff --git a/assemblies/plugins/tech/parquet/src/assembly/assembly.xml b/assemblies/plugins/tech/parquet/src/assembly/assembly.xml index dcfe1b45317..c2c6bc0ebd1 100644 --- a/assemblies/plugins/tech/parquet/src/assembly/assembly.xml +++ b/assemblies/plugins/tech/parquet/src/assembly/assembly.xml @@ -59,7 +59,6 @@ io.netty:netty:jar jakarta.activation:jakarta.activation:jar javax.xml.bind:jaxb-api:jar - javax.xml.stream:stax-api:jar org.apache.curator:curator-client:jar org.apache.curator:curator-framework:jar org.apache.curator:curator-recipes:jar diff --git a/core/pom.xml b/core/pom.xml index a41e0df8d86..407c4bdbbbe 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -55,7 +55,7 @@ 0.1.54 1.0.7 2.6.9 - 4.4.1 + 5.0.3 1.5 3.28.0-GA 1.1.1 @@ -476,9 +476,9 @@ - org.codehaus.woodstox - woodstox-core-asl - ${woodstox-core-asl.version} + com.fasterxml.woodstox + woodstox-core + ${woodstox-core.version} test diff --git a/docker/integration-tests/Dockerfile.unit-tests-spark b/docker/integration-tests/Dockerfile.unit-tests-spark new file mode 100644 index 00000000000..a97d5034cb8 --- /dev/null +++ b/docker/integration-tests/Dockerfile.unit-tests-spark @@ -0,0 +1,113 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM ubuntu +MAINTAINER Apache Hop + +# Argument Branch name, used to download correct version +ARG BRANCH_NAME +ENV BRANCH_NAME=$BRANCH_NAME +# path to where the artefacts should be deployed to +ENV DEPLOYMENT_PATH=/opt +# volume mount point +ENV VOLUME_MOUNT_POINT=/files +#Jenkins user an group +ARG JENKINS_USER=hop +ARG JENKINS_GROUP=hop +ARG JENKINS_UID=1000 +ARG JENKINS_GID=1000 +ARG GCP_KEY_FILE= +# Set system properties +ENV DEBIAN_FRONTEND=noninteractive + +# any JRE settings you want to pass on +# The “-XX:+AggressiveHeap” tells the container to use all memory assigned to the container. +# this removed the need to calculate the necessary heap Xmx +ENV HOP_OPTIONS=-XX:+AggressiveHeap + +# INSTALL REQUIRED PACKAGES AND ADJUST LOCALE +# procps: The package includes the programs ps, top, vmstat, w, kill, free, slabtop, and skill + +RUN apt-get update \ + && apt-get install --assume-yes \ + bash \ + curl \ + procps \ + git \ + python3-pip \ + openjdk-11-jre-headless \ + unzip \ + ttf-mscorefonts-installer \ + locales \ + && mkdir ${VOLUME_MOUNT_POINT} \ + && addgroup -gid ${JENKINS_GID} ${JENKINS_GROUP} \ + && useradd -m -d /home/${JENKINS_USER} -u ${JENKINS_UID} -g ${JENKINS_GROUP} ${JENKINS_USER} \ + && chown ${JENKINS_USER}:${JENKINS_GROUP} ${DEPLOYMENT_PATH} \ + && chown ${JENKINS_USER}:${JENKINS_GROUP} ${VOLUME_MOUNT_POINT} + +# Set Locale correctly +RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && \ + locale-gen +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US:en +ENV LC_ALL en_US.UTF-8 + +# Install parquet-tools from Python + +RUN pip3 install parquet-tools + +# Copy the hop package from the local resources folder to the container image directory + +COPY --chown=${JENKINS_USER}:${JENKINS_GROUP} ./assemblies/client/target/hop-* ${DEPLOYMENT_PATH}/hop.zip + +# Unzip and install in correct location + +RUN unzip ${DEPLOYMENT_PATH}/hop.zip -d ${DEPLOYMENT_PATH} \ + && rm ${DEPLOYMENT_PATH}/hop.zip \ + #Remove Jars for Spark + && rm ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/flink-shaded-jackson* \ + && rm ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/jackson-module-scala* \ + && rm ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/scala-java8-compat* \ + && rm ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/scala-library* \ + && rm ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/scala-parser-combinators* \ + #Add extra jars + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/com/fasterxml/jackson/module/jackson-module-scala_2.12/2.13.3/jackson-module-scala_2.12-2.13.3.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/json4s/json4s-ast_2.12/3.7.0-M5/json4s-ast_2.12-3.7.0-M5.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/json4s/json4s-core_2.12/3.7.0-M5/json4s-core_2.12-3.7.0-M5.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/json4s/json4s-jackson_2.12/3.7.0-M5/json4s-jackson_2.12-3.7.0-M5.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/json4s/json4s-scalap_2.12/3.7.0-M5/json4s-scalap_2.12-3.7.0-M5.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/log4j/log4j/1.2.17/log4j-1.2.17.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/scala-lang/scala-compiler/2.12.10/scala-compiler-2.12.10.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/scala-lang/scala-library/2.12.10/scala-library-2.12.10.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/scala-lang/modules/scala-parser-combinators_2.12/1.1.2/scala-parser-combinators_2.12-1.1.2.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/scala-lang/scala-reflect/2.12.10/scala-reflect-2.12.10.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/scala-lang/modules/scala-xml_2.12/1.2.0/scala-xml_2.12-1.2.0.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/apache/spark/spark-unsafe_2.12/3.1.3/spark-unsafe_2.12-3.1.3.jar \ + && wget -P ${DEPLOYMENT_PATH}/hop/plugins/engines/beam/lib/ https://repo1.maven.org/maven2/org/apache/xbean/xbean-asm7-shaded/4.15/xbean-asm7-shaded-4.15.jar \ + && chown -R ${JENKINS_USER}:${JENKINS_GROUP} ${DEPLOYMENT_PATH}/hop \ + && chmod 700 ${DEPLOYMENT_PATH}/hop/*.sh \ + && cd ${DEPLOYMENT_PATH}/hop \ + && ./hop-conf.sh --generate-fat-jar=/tmp/hop-fatjar.jar + +# make volume available so that hop pipeline and workflow files can be provided easily +VOLUME ["/files"] +USER ${JENKINS_USER} +ENV PATH=$PATH:${DEPLOYMENT_PATH}/hop +ENV GOOGLE_APPLICATION_CREDENTIALS="/tmp/google-key-apache-hop-it.json" +WORKDIR /home/${JENKINS_USER} +# CMD ["/bin/bash"] +ENTRYPOINT [] \ No newline at end of file diff --git a/docker/integration-tests/integration-tests-base-spark.yaml b/docker/integration-tests/integration-tests-base-spark.yaml new file mode 100644 index 00000000000..d6de1109318 --- /dev/null +++ b/docker/integration-tests/integration-tests-base-spark.yaml @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +version: '2.4' +services: + integration_test: + build: + context: ../../. + dockerfile: docker/integration-tests/Dockerfile.unit-tests-spark + args: + - JENKINS_USER=jenkins + - JENKINS_UID=1000 + - JENKINS_GROUP=jenkins + - JENKINS_GID=1000 + - GCP_KEY_FILE=./test + volumes: + - ../../integration-tests/:/files + environment: + - FLASK_ENV=docker + command: [ "bash", "-c", "/files/scripts/run-tests.sh ${PROJECT_NAME}" ] \ No newline at end of file diff --git a/docker/integration-tests/integration-tests-spark.yaml b/docker/integration-tests/integration-tests-spark.yaml new file mode 100644 index 00000000000..bae7899dfe2 --- /dev/null +++ b/docker/integration-tests/integration-tests-spark.yaml @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +version: '2.4' +services: + integration_test_spark: + extends: + file: integration-tests-base-spark.yaml + service: integration_test + depends_on: + - spark + links: + - spark + - spark-worker + + spark: + build: + context: ../../docker/integration-tests/spark/. + dockerfile: Dockerfile.master + environment: + - INIT_DAEMON_STEP=setup_spark + spark-worker: + build: + context: ../../docker/integration-tests/spark/. + dockerfile: Dockerfile.worker + depends_on: + - spark + environment: + - "SPARK_MASTER=spark://spark:7077" \ No newline at end of file diff --git a/docker/integration-tests/spark/Dockerfile.master b/docker/integration-tests/spark/Dockerfile.master new file mode 100644 index 00000000000..d4bb5db970e --- /dev/null +++ b/docker/integration-tests/spark/Dockerfile.master @@ -0,0 +1,49 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +FROM alpine:3.10 + +ENV ENABLE_INIT_DAEMON false +ENV INIT_DAEMON_BASE_URI http://identifier/init-daemon +ENV INIT_DAEMON_STEP spark_master_init + +ENV BASE_URL=https://archive.apache.org/dist/spark/ +ENV SPARK_VERSION=3.1.3 +ENV HADOOP_VERSION=3.2 +ENV SPARK_MASTER_PORT 7077 +ENV SPARK_MASTER_WEBUI_PORT 8080 +ENV SPARK_MASTER_LOG /spark/logs + +COPY ./scripts/wait-for-step.sh / +COPY ./scripts/execute-step.sh / +COPY ./scripts/finish-step.sh / +COPY ./scripts/master.sh / + +RUN apk add --no-cache curl bash openjdk11-jre nss libc6-compat coreutils procps \ + && ln -s /lib64/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2 \ + && chmod +x *.sh \ + && wget ${BASE_URL}/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ + && tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ + && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \ + && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ + && cd / + +#Give permission to execute scripts +RUN chmod +x /wait-for-step.sh && chmod +x /execute-step.sh && chmod +x /finish-step.sh && chmod +x /master.sh + +EXPOSE 8080 7077 6066 + +CMD ["/bin/bash", "/master.sh"] \ No newline at end of file diff --git a/docker/integration-tests/spark/Dockerfile.worker b/docker/integration-tests/spark/Dockerfile.worker new file mode 100644 index 00000000000..958625f1b8d --- /dev/null +++ b/docker/integration-tests/spark/Dockerfile.worker @@ -0,0 +1,50 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM alpine:3.10 + +ENV ENABLE_INIT_DAEMON false +ENV INIT_DAEMON_BASE_URI http://identifier/init-daemon +ENV INIT_DAEMON_STEP spark_master_init + +ENV BASE_URL=https://archive.apache.org/dist/spark/ +ENV SPARK_VERSION=3.1.3 +ENV HADOOP_VERSION=3.2 +ENV SPARK_WORKER_WEBUI_PORT 8081 +ENV SPARK_WORKER_LOG /spark/logs +ENV SPARK_MASTER "spark://spark:7077" + +COPY ./scripts/wait-for-step.sh / +COPY ./scripts/execute-step.sh / +COPY ./scripts/finish-step.sh / +COPY ./scripts/worker.sh / + +RUN apk add --no-cache curl bash openjdk11-jre nss libc6-compat coreutils procps \ + && ln -s /lib64/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2 \ + && chmod +x *.sh \ + && wget ${BASE_URL}/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ + && tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ + && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \ + && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ + && cd / + +#Give permission to execute scripts +RUN chmod +x /wait-for-step.sh && chmod +x /execute-step.sh && chmod +x /finish-step.sh && chmod +x /worker.sh + +EXPOSE 8081 + +CMD ["/bin/bash", "/worker.sh"] \ No newline at end of file diff --git a/docker/integration-tests/spark/scripts/execute-step.sh b/docker/integration-tests/spark/scripts/execute-step.sh new file mode 100644 index 00000000000..21850a4a907 --- /dev/null +++ b/docker/integration-tests/spark/scripts/execute-step.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if [ $ENABLE_INIT_DAEMON = "true" ] + then + echo "Execute step ${INIT_DAEMON_STEP} in pipeline" + while true; do + sleep 5 + echo -n '.' + string=$(curl -sL -w "%{http_code}" -X PUT $INIT_DAEMON_BASE_URI/execute?step=$INIT_DAEMON_STEP -o /dev/null) + [ "$string" = "204" ] && break + done + echo "Notified execution of step ${INIT_DAEMON_STEP}" +fi \ No newline at end of file diff --git a/docker/integration-tests/spark/scripts/finish-step.sh b/docker/integration-tests/spark/scripts/finish-step.sh new file mode 100644 index 00000000000..fc38a227978 --- /dev/null +++ b/docker/integration-tests/spark/scripts/finish-step.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if [ $ENABLE_INIT_DAEMON = "true" ] + then + echo "Finish step ${INIT_DAEMON_STEP} in pipeline" + while true; do + sleep 5 + echo -n '.' + string=$(curl -sL -w "%{http_code}" -X PUT $INIT_DAEMON_BASE_URI/finish?step=$INIT_DAEMON_STEP -o /dev/null) + [ "$string" = "204" ] && break + done + echo "Notified finish of step ${INIT_DAEMON_STEP}" +fi \ No newline at end of file diff --git a/docker/integration-tests/spark/scripts/master.sh b/docker/integration-tests/spark/scripts/master.sh new file mode 100644 index 00000000000..ae20d28bb42 --- /dev/null +++ b/docker/integration-tests/spark/scripts/master.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +export SPARK_MASTER_HOST=${SPARK_MASTER_HOST:-`hostname`} + +export SPARK_HOME=/spark + +. "/spark/sbin/spark-config.sh" + +. "/spark/bin/load-spark-env.sh" + +mkdir -p $SPARK_MASTER_LOG + +ln -sf /dev/stdout $SPARK_MASTER_LOG/spark-master.out + +cd /spark/bin && /spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master \ + --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out \ No newline at end of file diff --git a/docker/integration-tests/spark/scripts/wait-for-step.sh b/docker/integration-tests/spark/scripts/wait-for-step.sh new file mode 100644 index 00000000000..2bbefa824a6 --- /dev/null +++ b/docker/integration-tests/spark/scripts/wait-for-step.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if [ $ENABLE_INIT_DAEMON = "true" ] + then + echo "Validating if step ${INIT_DAEMON_STEP} can start in pipeline" + while true; do + sleep 5 + echo -n '.' + string=$(curl -s $INIT_DAEMON_BASE_URI/canStart?step=$INIT_DAEMON_STEP) + [ "$string" = "true" ] && break + done + echo "Can start step ${INIT_DAEMON_STEP}" +fi \ No newline at end of file diff --git a/docker/integration-tests/spark/scripts/worker.sh b/docker/integration-tests/spark/scripts/worker.sh new file mode 100644 index 00000000000..b1454625aca --- /dev/null +++ b/docker/integration-tests/spark/scripts/worker.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +export SPARK_HOME=/spark + +. "/spark/sbin/spark-config.sh" + +. "/spark/bin/load-spark-env.sh" + +mkdir -p $SPARK_WORKER_LOG + +ln -sf /dev/stdout $SPARK_WORKER_LOG/spark-worker.out + +/spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker \ + --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out \ No newline at end of file diff --git a/integration-tests/spark/0001-test-spark-cluster.hpl b/integration-tests/spark/0001-test-spark-cluster.hpl new file mode 100644 index 00000000000..1e86f3d71ba --- /dev/null +++ b/integration-tests/spark/0001-test-spark-cluster.hpl @@ -0,0 +1,93 @@ + + + + + 0001-test-flink-cluster + Y + + + + Normal + + + N + 1000 + 100 + - + 2022/07/20 13:48:20.461 + - + 2022/07/20 13:48:20.461 + + N + + + + + + Generate rows + Dummy (do nothing) + Y + + + + Generate rows + RowGenerator + + Y + + 1 + + none + + + + + 5000 + FiveSecondsAgo + N + 10 + now + + + 176 + 144 + + + + Dummy (do nothing) + Dummy + + Y + + 1 + + none + + + + + 416 + 144 + + + + + + diff --git a/integration-tests/spark/dev-env-config.json b/integration-tests/spark/dev-env-config.json new file mode 100644 index 00000000000..e091c09a874 --- /dev/null +++ b/integration-tests/spark/dev-env-config.json @@ -0,0 +1,3 @@ +{ + "variables" : [ ] +} \ No newline at end of file diff --git a/integration-tests/spark/hop-config.json b/integration-tests/spark/hop-config.json new file mode 100644 index 00000000000..102ac981d58 --- /dev/null +++ b/integration-tests/spark/hop-config.json @@ -0,0 +1,290 @@ +{ + "variables": [ + { + "name": "HOP_LENIENT_STRING_TO_NUMBER_CONVERSION", + "value": "N", + "description": "System wide flag to allow lenient string to number conversion for backward compatibility. If this setting is set to \"Y\", an string starting with digits will be converted successfully into a number. (example: 192.168.1.1 will be converted into 192 or 192.168 or 192168 depending on the decimal and grouping symbol). The default (N) will be to throw an error if non-numeric symbols are found in the string." + }, + { + "name": "HOP_COMPATIBILITY_DB_IGNORE_TIMEZONE", + "value": "N", + "description": "System wide flag to ignore timezone while writing date/timestamp value to the database." + }, + { + "name": "HOP_LOG_SIZE_LIMIT", + "value": "0", + "description": "The log size limit for all pipelines and workflows that don't have the \"log size limit\" property set in their respective properties." + }, + { + "name": "HOP_EMPTY_STRING_DIFFERS_FROM_NULL", + "value": "N", + "description": "NULL vs Empty String. If this setting is set to Y, an empty string and null are different. Otherwise they are not." + }, + { + "name": "HOP_MAX_LOG_SIZE_IN_LINES", + "value": "0", + "description": "The maximum number of log lines that are kept internally by Hop. Set to 0 to keep all rows (default)" + }, + { + "name": "HOP_MAX_LOG_TIMEOUT_IN_MINUTES", + "value": "1440", + "description": "The maximum age (in minutes) of a log line while being kept internally by Hop. Set to 0 to keep all rows indefinitely (default)" + }, + { + "name": "HOP_MAX_WORKFLOW_TRACKER_SIZE", + "value": "5000", + "description": "The maximum number of workflow trackers kept in memory" + }, + { + "name": "HOP_MAX_ACTIONS_LOGGED", + "value": "5000", + "description": "The maximum number of action results kept in memory for logging purposes." + }, + { + "name": "HOP_MAX_LOGGING_REGISTRY_SIZE", + "value": "10000", + "description": "The maximum number of logging registry entries kept in memory for logging purposes." + }, + { + "name": "HOP_LOG_TAB_REFRESH_DELAY", + "value": "1000", + "description": "The hop log tab refresh delay." + }, + { + "name": "HOP_LOG_TAB_REFRESH_PERIOD", + "value": "1000", + "description": "The hop log tab refresh period." + }, + { + "name": "HOP_PLUGIN_CLASSES", + "value": null, + "description": "A comma delimited list of classes to scan for plugin annotations" + }, + { + "name": "HOP_PLUGIN_PACKAGES", + "value": null, + "description": "A comma delimited list of packages to scan for plugin annotations (warning: slow!!)" + }, + { + "name": "HOP_TRANSFORM_PERFORMANCE_SNAPSHOT_LIMIT", + "value": "0", + "description": "The maximum number of transform performance snapshots to keep in memory. Set to 0 to keep all snapshots indefinitely (default)" + }, + { + "name": "HOP_ROWSET_GET_TIMEOUT", + "value": "50", + "description": "The name of the variable that optionally contains an alternative rowset get timeout (in ms). This only makes a difference for extremely short lived pipelines." + }, + { + "name": "HOP_ROWSET_PUT_TIMEOUT", + "value": "50", + "description": "The name of the variable that optionally contains an alternative rowset put timeout (in ms). This only makes a difference for extremely short lived pipelines." + }, + { + "name": "HOP_CORE_TRANSFORMS_FILE", + "value": null, + "description": "The name of the project variable that will contain the alternative location of the hop-transforms.xml file. You can use this to customize the list of available internal transforms outside of the codebase." + }, + { + "name": "HOP_CORE_WORKFLOW_ACTIONS_FILE", + "value": null, + "description": "The name of the project variable that will contain the alternative location of the hop-workflow-actions.xml file." + }, + { + "name": "HOP_SERVER_OBJECT_TIMEOUT_MINUTES", + "value": "1440", + "description": "This project variable will set a time-out after which waiting, completed or stopped pipelines and workflows will be automatically cleaned up. The default value is 1440 (one day)." + }, + { + "name": "HOP_PIPELINE_PAN_JVM_EXIT_CODE", + "value": null, + "description": "Set this variable to an integer that will be returned as the Pan JVM exit code." + }, + { + "name": "HOP_DISABLE_CONSOLE_LOGGING", + "value": "N", + "description": "Set this variable to Y to disable standard Hop logging to the console. (stdout)" + }, + { + "name": "HOP_REDIRECT_STDERR", + "value": "N", + "description": "Set this variable to Y to redirect stderr to Hop logging." + }, + { + "name": "HOP_REDIRECT_STDOUT", + "value": "N", + "description": "Set this variable to Y to redirect stdout to Hop logging." + }, + { + "name": "HOP_DEFAULT_NUMBER_FORMAT", + "value": null, + "description": "The name of the variable containing an alternative default number format" + }, + { + "name": "HOP_DEFAULT_BIGNUMBER_FORMAT", + "value": null, + "description": "The name of the variable containing an alternative default bignumber format" + }, + { + "name": "HOP_DEFAULT_INTEGER_FORMAT", + "value": null, + "description": "The name of the variable containing an alternative default integer format" + }, + { + "name": "HOP_DEFAULT_DATE_FORMAT", + "value": null, + "description": "The name of the variable containing an alternative default date format" + }, + { + "name": "HOP_DEFAULT_TIMESTAMP_FORMAT", + "value": null, + "description": "The name of the variable containing an alternative default timestamp format" + }, + { + "name": "HOP_DEFAULT_SERVLET_ENCODING", + "value": null, + "description": "Defines the default encoding for servlets, leave it empty to use Java default encoding" + }, + { + "name": "HOP_FAIL_ON_LOGGING_ERROR", + "value": "N", + "description": "Set this variable to Y when you want the workflow/pipeline fail with an error when the related logging process (e.g. to a database) fails." + }, + { + "name": "HOP_AGGREGATION_MIN_NULL_IS_VALUED", + "value": "N", + "description": "Set this variable to Y to set the minimum to NULL if NULL is within an aggregate. Otherwise by default NULL is ignored by the MIN aggregate and MIN is set to the minimum value that is not NULL. See also the variable HOP_AGGREGATION_ALL_NULLS_ARE_ZERO." + }, + { + "name": "HOP_AGGREGATION_ALL_NULLS_ARE_ZERO", + "value": "N", + "description": "Set this variable to Y to return 0 when all values within an aggregate are NULL. Otherwise by default a NULL is returned when all values are NULL." + }, + { + "name": "HOP_COMPATIBILITY_TEXT_FILE_OUTPUT_APPEND_NO_HEADER", + "value": "N", + "description": "Set this variable to Y for backward compatibility for the Text File Output transform. Setting this to Ywill add no header row at all when the append option is enabled, regardless if the file is existing or not." + }, + { + "name": "HOP_PASSWORD_ENCODER_PLUGIN", + "value": "Hop", + "description": "Specifies the password encoder plugin to use by ID (Hop is the default)." + }, + { + "name": "HOP_SYSTEM_HOSTNAME", + "value": null, + "description": "You can use this variable to speed up hostname lookup. Hostname lookup is performed by Hop so that it is capable of logging the server on which a workflow or pipeline is executed." + }, + { + "name": "HOP_SERVER_JETTY_ACCEPTORS", + "value": null, + "description": "A variable to configure jetty option: acceptors for Carte" + }, + { + "name": "HOP_SERVER_JETTY_ACCEPT_QUEUE_SIZE", + "value": null, + "description": "A variable to configure jetty option: acceptQueueSize for Carte" + }, + { + "name": "HOP_SERVER_JETTY_RES_MAX_IDLE_TIME", + "value": null, + "description": "A variable to configure jetty option: lowResourcesMaxIdleTime for Carte" + }, + { + "name": "HOP_COMPATIBILITY_MERGE_ROWS_USE_REFERENCE_STREAM_WHEN_IDENTICAL", + "value": "N", + "description": "Set this variable to Y for backward compatibility for the Merge Rows (diff) transform. Setting this to Y will use the data from the reference stream (instead of the comparison stream) in case the compared rows are identical." + }, + { + "name": "HOP_SPLIT_FIELDS_REMOVE_ENCLOSURE", + "value": "false", + "description": "Set this variable to false to preserve enclosure symbol after splitting the string in the Split fields transform. Changing it to true will remove first and last enclosure symbol from the resulting string chunks." + }, + { + "name": "HOP_ALLOW_EMPTY_FIELD_NAMES_AND_TYPES", + "value": "false", + "description": "Set this variable to TRUE to allow your pipeline to pass 'null' fields and/or empty types." + }, + { + "name": "HOP_GLOBAL_LOG_VARIABLES_CLEAR_ON_EXPORT", + "value": "false", + "description": "Set this variable to false to preserve global log variables defined in pipeline / workflow Properties -> Log panel. Changing it to true will clear it when export pipeline / workflow." + }, + { + "name": "HOP_FILE_OUTPUT_MAX_STREAM_COUNT", + "value": "1024", + "description": "This project variable is used by the Text File Output transform. It defines the max number of simultaneously open files within the transform. The transform will close/reopen files as necessary to insure the max is not exceeded" + }, + { + "name": "HOP_FILE_OUTPUT_MAX_STREAM_LIFE", + "value": "0", + "description": "This project variable is used by the Text File Output transform. It defines the max number of milliseconds between flushes of files opened by the transform." + }, + { + "name": "HOP_USE_NATIVE_FILE_DIALOG", + "value": "N", + "description": "Set this value to Y if you want to use the system file open/save dialog when browsing files" + }, + { + "name": "HOP_AUTO_CREATE_CONFIG", + "value": "Y", + "description": "Set this value to N if you don't want to automatically create a hop configuration file (hop-config.json) when it's missing" + } + ], + "LocaleDefault": "en_BE", + "guiProperties": { + "FontFixedSize": "13", + "MaxUndo": "100", + "DarkMode": "Y", + "FontNoteSize": "13", + "ShowOSLook": "Y", + "FontFixedStyle": "0", + "FontNoteName": ".AppleSystemUIFont", + "FontFixedName": "Monospaced", + "FontGraphStyle": "0", + "FontDefaultSize": "13", + "GraphColorR": "255", + "FontGraphSize": "13", + "IconSize": "32", + "BackgroundColorB": "255", + "FontNoteStyle": "0", + "FontGraphName": ".AppleSystemUIFont", + "FontDefaultName": ".AppleSystemUIFont", + "GraphColorG": "255", + "UseGlobalFileBookmarks": "Y", + "FontDefaultStyle": "0", + "GraphColorB": "255", + "BackgroundColorR": "255", + "BackgroundColorG": "255", + "WorkflowDialogStyle": "RESIZE,MAX,MIN", + "LineWidth": "1", + "ContextDialogShowCategories": "Y" + }, + "projectsConfig": { + "enabled": true, + "projectMandatory": true, + "environmentMandatory": true, + "defaultProject": "default", + "defaultEnvironment": null, + "standardParentProject": "default", + "standardProjectsFolder": null, + "projectConfigurations": [ + { + "projectName": "default", + "projectHome": "${HOP_CONFIG_FOLDER}", + "configFilename": "project-config.json" + } + ], + "lifecycleEnvironments": [ + { + "name": "dev", + "purpose": "Testing", + "projectName": "default", + "configurationFiles": [ + "${PROJECT_HOME}/dev-env-config.json" + ] + } + ], + "projectLifecycles": [] + } +} \ No newline at end of file diff --git a/integration-tests/spark/main-0001-test-spark-cluster.hwf b/integration-tests/spark/main-0001-test-spark-cluster.hwf new file mode 100644 index 00000000000..6b99c64239c --- /dev/null +++ b/integration-tests/spark/main-0001-test-spark-cluster.hwf @@ -0,0 +1,92 @@ + + + + main-0001-test-spark-cluster + Y + + + + - + 2022/07/20 13:47:38.846 + - + 2022/07/20 13:47:38.846 + + + + + Start + + SPECIAL + + N + 0 + 0 + 60 + 12 + 0 + 1 + 1 + N + 50 + 50 + + + + 0001-test-spark-cluster.hpl + + PIPELINE + + ${PROJECT_HOME}/0001-test-spark-cluster.hpl + N + N + N + N + N + + + N + N + Basic + N + Y + N + spark + + Y + + N + 208 + 50 + + + + + + Start + 0001-test-spark-cluster.hpl + Y + Y + Y + + + + + + diff --git a/integration-tests/spark/metadata/pipeline-run-configuration/local.json b/integration-tests/spark/metadata/pipeline-run-configuration/local.json new file mode 100644 index 00000000000..63794efcaf6 --- /dev/null +++ b/integration-tests/spark/metadata/pipeline-run-configuration/local.json @@ -0,0 +1,17 @@ +{ + "engineRunConfiguration": { + "Local": { + "feedback_size": "50000", + "sample_size": "100", + "sample_type_in_gui": "Last", + "rowset_size": "10000", + "safe_mode": false, + "show_feedback": false, + "topo_sort": false, + "gather_metrics": false + } + }, + "configurationVariables": [], + "name": "local", + "description": "Runs your pipelines locally with the standard local Hop pipeline engine" +} \ No newline at end of file diff --git a/integration-tests/spark/metadata/pipeline-run-configuration/spark.json b/integration-tests/spark/metadata/pipeline-run-configuration/spark.json new file mode 100644 index 00000000000..d15ae3dcfb7 --- /dev/null +++ b/integration-tests/spark/metadata/pipeline-run-configuration/spark.json @@ -0,0 +1,26 @@ +{ + "engineRunConfiguration": { + "BeamSparkPipelineEngine": { + "sparkMinReadTimeMillis": "", + "sparkMaster": "spark://spark:7077", + "sparkEnableSparkMetricSinks": false, + "streamingHopTransformsBufferSize": "", + "sparkCheckpointDir": "", + "sparkMaxRecordsPerBatch": "", + "userAgent": "Hop", + "sparkBatchIntervalMillis": "", + "pluginsToStage": "", + "tempLocation": "/var/folders/v0/hjm6kpgd0076ncdkpf8rg3dw0000gn/T/", + "sparkCheckpointDurationMillis": "", + "fatJar": "/tmp/hop-fatjar.jar", + "sparkBundleSize": "", + "sparkReadTimePercentage": "", + "streamingHopTransformsFlushInterval": "", + "transformPluginClasses": "", + "xpPluginClasses": "" + } + }, + "configurationVariables": [], + "name": "spark", + "description": "" +} \ No newline at end of file diff --git a/integration-tests/spark/metadata/workflow-run-configuration/local.json b/integration-tests/spark/metadata/workflow-run-configuration/local.json new file mode 100644 index 00000000000..e37a93039aa --- /dev/null +++ b/integration-tests/spark/metadata/workflow-run-configuration/local.json @@ -0,0 +1,9 @@ +{ + "engineRunConfiguration": { + "Local": { + "safe_mode": false + } + }, + "name": "local", + "description": "Runs your workflows locally with the standard local Hop workflow engine" +} \ No newline at end of file diff --git a/integration-tests/spark/project-config.json b/integration-tests/spark/project-config.json new file mode 100644 index 00000000000..899c6c10927 --- /dev/null +++ b/integration-tests/spark/project-config.json @@ -0,0 +1,15 @@ +{ + "metadataBaseFolder": "${PROJECT_HOME}/metadata", + "unitTestsBasePath": "${PROJECT_HOME}", + "dataSetsCsvFolder": "${PROJECT_HOME}/datasets", + "enforcingExecutionInHome": true, + "config": { + "variables": [ + { + "name": "HOP_LICENSE_HEADER_FILE", + "value": "${PROJECT_HOME}/../asf-header.txt", + "description": "This will automatically serialize the ASF license header into pipelines and workflows in the integration test projects" + } + ] + } +} \ No newline at end of file From 70764ac51927d4fb8af2854d16051a6285c07c11 Mon Sep 17 00:00:00 2001 From: Hans Van Akelyen Date: Thu, 4 Aug 2022 15:30:19 +0200 Subject: [PATCH 2/2] [DOC] fix broken link --- docs/hop-user-manual/modules/ROOT/pages/pipeline/transforms.adoc | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/hop-user-manual/modules/ROOT/pages/pipeline/transforms.adoc b/docs/hop-user-manual/modules/ROOT/pages/pipeline/transforms.adoc index c5760b96573..f0a584c9778 100644 --- a/docs/hop-user-manual/modules/ROOT/pages/pipeline/transforms.adoc +++ b/docs/hop-user-manual/modules/ROOT/pages/pipeline/transforms.adoc @@ -121,7 +121,6 @@ The pages nested under this topic contain information on how to use the transfor * xref:pipeline/transforms/ldapoutput.adoc[LDAP Output] * xref:pipeline/transforms/loadfileinput.adoc[Load file content in memory] * xref:pipeline/transforms/mail.adoc[Mail] -* xref:pipeline/transforms/mailvalidator.adoc[Mail Validator] * xref:pipeline/transforms/mapping-input.adoc[Mapping Input] * xref:pipeline/transforms/mapping-output.adoc[Mapping Output] * xref:pipeline/transforms/memgroupby.adoc[Memory Group By]