Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 75 additions & 2 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ jobs:
GITHUB_PREV_SHA: ${{ github.event.before }}
outputs:
required: ${{ steps.set-outputs.outputs.required }}
user: ${{ steps.set-outputs.outputs.user }}
img_tag: ${{ steps.set-outputs.outputs.img_tag }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
Expand All @@ -73,6 +75,9 @@ jobs:
- name: Check all modules
id: set-outputs
run: |
# Convert to lowercase to meet docker repo name requirement
echo ::set-output name=user::$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
echo ::set-output name=img_tag::$(echo "${{ inputs.branch }}-${{ github.run_id }}")
if [ -z "${{ inputs.jobs }}" ]; then
# is-changed.py is missing in branch-3.2, and it might run in scheduled build, see also SPARK-39517
pyspark=true; sparkr=true; tpcds=true; docker=true;
Expand Down Expand Up @@ -251,13 +256,57 @@ jobs:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"

pyspark:
infra-image:
# Builds the per-run PySpark CI container image and pushes it to GHCR under
# the lowercased repo owner (see precondition outputs `user` / `img_tag`).
# Gated on the same condition as the pyspark job that consumes the image.
needs: precondition
if: fromJson(needs.precondition.outputs.required).pyspark == 'true'
runs-on: ubuntu-latest
steps:
# Log in first so the final build-push step can push to ghcr.io.
# GITHUB_TOKEN has package-write permission for the repo's own namespace.
- name: Login to GitHub Container Registry
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Checkout Spark repository
uses: actions/checkout@v2
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
# For forks: squash-merge the fork's branch onto the upstream ref so the
# image is built from the contributor's changes (no commit is pushed).
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
-
name: Set up QEMU
uses: docker/setup-qemu-action@v1
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Build and push
id: docker_build
uses: docker/build-push-action@v2
with:
# Dockerfile lives in dev/infra/ (added in this same change).
context: ./dev/infra/
push: true
tags: |
ghcr.io/${{ needs.precondition.outputs.user }}/apache-spark-ci-image-pyspark-${{ needs.precondition.outputs.img_tag }}
# TODO: Change yikun cache to apache cache
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/yikun/apache-spark-github-action-image-cache:${{ inputs.branch }}

pyspark:
needs: [precondition, infra-image]
if: fromJson(needs.precondition.outputs.required).pyspark == 'true'
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-20.04
container:
image: dongjoon/apache-spark-github-action-image:20220207
image: ghcr.io/${{ needs.precondition.outputs.user }}/apache-spark-ci-image-pyspark-${{ needs.precondition.outputs.img_tag }}
strategy:
fail-fast: false
matrix:
Expand All @@ -281,6 +330,7 @@ jobs:
SKIP_UNIDOC: true
SKIP_MIMA: true
METASPACE_SIZE: 1g
GHCR_DEL_SECRET: ${{ secrets.GHCR_DEL }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
Expand All @@ -289,6 +339,9 @@ jobs:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Add GITHUB_WORKSPACE to git trust safe.directory
run: |
git config --global --add safe.directory ${GITHUB_WORKSPACE}
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
Expand Down Expand Up @@ -795,3 +848,23 @@ jobs:
with:
name: unit-tests-log-docker-integration--8-${{ inputs.hadoop }}-hive2.3
path: "**/target/unit-tests.log"

infra-image-post:
  # Deletes the per-run CI image pushed by the `infra-image` job.
  # Note that: there is only GHCR creation permission for secrets.GITHUB_TOKEN.
  # If you want to clean up the CI images, you need to:
  # - 1. Generate a token from https://github.com/settings/tokens with `write:packages` and `delete:packages`
  # - 2. Add the token as secret `GHCR_DEL` in https://github.com/{username}/spark/settings/secrets/actions
  # Considering there is no storage limit for GHCR, this step is optional.
  # https://docs.github.com/en/billing/managing-billing-for-github-packages/about-billing-for-github-packages#about-billing-for-github-packages
  # TODO: Switch to actions/delete-package-versions when it supports this:
  # https://github.com/actions/delete-package-versions/issues/74
  #
  # `precondition` must be in `needs` so `needs.precondition.outputs.img_tag`
  # resolves below; `always()` lets cleanup run even when the test jobs fail.
  needs: [precondition, pyspark, sparkr, lint]
  if: always()
  runs-on: ubuntu-latest
  env:
    GHCR_DEL_SECRET: ${{ secrets.GHCR_DEL }}
  steps:
    - name: Cleanup docker image
      # Only clean up the CI images when users set the `GHCR_DEL` secret in a fork repo.
      if: ${{ env.GHCR_DEL_SECRET != '' }}
      # Package name must match the image pushed by the `infra-image` job:
      # apache-spark-ci-image-pyspark-<img_tag>.
      run: >
        curl
        -X DELETE
        -H "Accept: application/vnd.github+json"
        -H "Authorization: token ${{ secrets.GHCR_DEL }}"
        https://api.github.com/user/packages/container/apache-spark-ci-image-pyspark-${{ needs.precondition.outputs.img_tag }}
55 changes: 55 additions & 0 deletions dev/infra/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Image for building and testing Spark branches. Based on Ubuntu 20.04.
FROM ubuntu:20.04

# Run every package install non-interactively (no tzdata/debconf prompts).
ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

ARG APT_INSTALL="apt-get install --no-install-recommends -y"

RUN apt-get clean
RUN apt-get update
RUN $APT_INSTALL software-properties-common git libxml2-dev pkg-config curl wget openjdk-8-jdk libpython3-dev python3-pip python3-setuptools python3.8 python3.9
# Pin the default `java` to JDK 8, which Spark's build requires.
RUN update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java

# Bootstrap pip for Python 3.9 and install the PySpark test dependencies.
# NOTE: version specifiers must be quoted — an unquoted `plotly>=4.8` is
# interpreted by the shell as an output redirection to a file named `=4.8`,
# silently dropping the version constraint.  `scikit-learn` is the real
# package name; `sklearn` is a deprecated PyPI shim.
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9
RUN python3.9 -m pip install 'numpy<1.23.0' pyarrow 'pandas<1.4.0' scipy xmlrunner 'plotly>=4.8' scikit-learn 'mlflow>=1.0' coverage matplotlib

RUN add-apt-repository ppa:pypy/ppa
RUN apt-get update
RUN $APT_INSTALL gfortran libopenblas-dev liblapack-dev

# PyPy 3.7 is installed from the upstream tarball; build-essential is needed
# so pip can compile the C extensions below (numpy/scipy) against it.
RUN $APT_INSTALL build-essential
RUN mkdir -p /usr/local/pypy/pypy3.7 && \
    curl -sqL https://downloads.python.org/pypy/pypy3.7-v7.3.7-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.7 --strip-components=1 && \
    ln -sf /usr/local/pypy/pypy3.7/bin/pypy /usr/local/bin/pypy3.7 && \
    ln -sf /usr/local/pypy/pypy3.7/bin/pypy /usr/local/bin/pypy3

RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
RUN pypy3 -m pip install 'numpy<1.23.0' 'pandas<1.4.0' scipy coverage matplotlib

# R toolchain for SparkR tests, from the CRAN Ubuntu repository.
# `add-apt-repository` both writes the source entry and refreshes the index,
# so the repo line is added exactly once (echoing it into sources.list as
# well would create a duplicate source entry).
RUN $APT_INSTALL gnupg ca-certificates pandoc
RUN gpg --keyserver keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9
# NOTE(review): `apt-key add` is deprecated in newer Ubuntu releases; still
# functional on 20.04 — migrate to a keyring file under /etc/apt/keyrings
# when the base image is bumped.
RUN gpg -a --export E084DAB9 | apt-key add -
RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/'
RUN apt-get update
RUN $APT_INSTALL r-base libcurl4-openssl-dev qpdf libssl-dev zlib1g-dev
RUN Rscript -e "install.packages(c('knitr', 'markdown', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2', 'xml2'), repos='https://cloud.r-project.org/')"