Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 75 additions & 2 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ jobs:
GITHUB_PREV_SHA: ${{ github.event.before }}
outputs:
required: ${{ steps.set-outputs.outputs.required }}
user: ${{ steps.set-outputs.outputs.user }}
img_tag: ${{ steps.set-outputs.outputs.img_tag }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
Expand All @@ -73,6 +75,9 @@ jobs:
- name: Check all modules
id: set-outputs
run: |
# Convert to lowercase to meet docker repo name requirement
echo ::set-output name=user::$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
echo ::set-output name=img_tag::$(echo "${{ inputs.branch }}-${{ github.run_id }}")
if [ -z "${{ inputs.jobs }}" ]; then
# is-changed.py is missing in branch-3.2, and it might run in scheduled build, see also SPARK-39517
pyspark=true; sparkr=true; tpcds=true; docker=true;
Expand Down Expand Up @@ -251,13 +256,57 @@ jobs:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"

pyspark:
infra-image:
# Builds the per-run PySpark CI container image and pushes it to GHCR under
# the lowercased repo owner (see precondition outputs `user` / `img_tag`).
# Gated on the same condition as the pyspark job that consumes the image.
needs: precondition
if: fromJson(needs.precondition.outputs.required).pyspark == 'true'
runs-on: ubuntu-latest
steps:
# Log in first so the final build-push step can push to ghcr.io.
# GITHUB_TOKEN has package-write permission for the repo's own namespace.
- name: Login to GitHub Container Registry
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Checkout Spark repository
uses: actions/checkout@v2
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
# For forks: squash-merge the fork's branch onto the upstream ref so the
# image is built from the contributor's changes (no commit is pushed).
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
-
name: Set up QEMU
uses: docker/setup-qemu-action@v1
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Build and push
id: docker_build
uses: docker/build-push-action@v2
with:
# Dockerfile lives in dev/infra/ (added in this same change).
context: ./dev/infra/
push: true
tags: |
ghcr.io/${{ needs.precondition.outputs.user }}/apache-spark-ci-image-pyspark-${{ needs.precondition.outputs.img_tag }}
# TODO: Change yikun cache to apache cache
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/yikun/apache-spark-github-action-image-cache:${{ inputs.branch }}

pyspark:
needs: [precondition, infra-image]
if: fromJson(needs.precondition.outputs.required).pyspark == 'true'
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-20.04
container:
image: dongjoon/apache-spark-github-action-image:20220207
image: ghcr.io/${{ needs.precondition.outputs.user }}/apache-spark-ci-image-pyspark-${{ needs.precondition.outputs.img_tag }}
strategy:
fail-fast: false
matrix:
Expand All @@ -281,6 +330,7 @@ jobs:
SKIP_UNIDOC: true
SKIP_MIMA: true
METASPACE_SIZE: 1g
GHCR_DEL_SECRET: ${{ secrets.GHCR_DEL }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
Expand All @@ -289,6 +339,9 @@ jobs:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Add GITHUB_WORKSPACE to git trust safe.directory
run: |
git config --global --add safe.directory ${GITHUB_WORKSPACE}
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
Expand Down Expand Up @@ -795,3 +848,23 @@ jobs:
with:
name: unit-tests-log-docker-integration--8-${{ inputs.hadoop }}-hive2.3
path: "**/target/unit-tests.log"

infra-image-post:
  # Deletes the per-run CI image pushed by the `infra-image` job.
  # Note that: there is only GHCR creation permission for secrets.GITHUB_TOKEN.
  # If you want to clean up the CI images, you need to:
  # - 1. Generate a token from https://github.com/settings/tokens with `write:packages` and `delete:packages`
  # - 2. Add the token as secret `GHCR_DEL` in https://github.com/{username}/spark/settings/secrets/actions
  # Considering there is no storage limit for GHCR, this step is optional.
  # https://docs.github.com/en/billing/managing-billing-for-github-packages/about-billing-for-github-packages#about-billing-for-github-packages
  # TODO: Switch to actions/delete-package-versions when it supports this:
  # https://github.com/actions/delete-package-versions/issues/74
  #
  # `precondition` must be in `needs` so `needs.precondition.outputs.img_tag`
  # resolves below; `always()` lets cleanup run even when the test jobs fail.
  needs: [precondition, pyspark, sparkr, lint]
  if: always()
  runs-on: ubuntu-latest
  env:
    GHCR_DEL_SECRET: ${{ secrets.GHCR_DEL }}
  steps:
    - name: Cleanup docker image
      # Only clean up the CI images when users set the `GHCR_DEL` secret in a fork repo.
      if: ${{ env.GHCR_DEL_SECRET != '' }}
      # Package name must match the image pushed by the `infra-image` job:
      # apache-spark-ci-image-pyspark-<img_tag>.
      run: >
        curl
        -X DELETE
        -H "Accept: application/vnd.github+json"
        -H "Authorization: token ${{ secrets.GHCR_DEL }}"
        https://api.github.com/user/packages/container/apache-spark-ci-image-pyspark-${{ needs.precondition.outputs.img_tag }}
55 changes: 55 additions & 0 deletions dev/infra/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Image for building and testing Spark branches. Based on Ubuntu 20.04.
FROM ubuntu:20.04

# Run every package install non-interactively (no tzdata/debconf prompts).
ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

ARG APT_INSTALL="apt-get install --no-install-recommends -y"

RUN apt-get clean
RUN apt-get update
RUN $APT_INSTALL software-properties-common git libxml2-dev pkg-config curl wget openjdk-8-jdk libpython3-dev python3-pip python3-setuptools python3.8 python3.9
# Pin the default `java` to JDK 8, which Spark's build requires.
RUN update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java

# Bootstrap pip for Python 3.9 and install the PySpark test dependencies.
# NOTE: version specifiers must be quoted — an unquoted `plotly>=4.8` is
# interpreted by the shell as an output redirection to a file named `=4.8`,
# silently dropping the version constraint.  `scikit-learn` is the real
# package name; `sklearn` is a deprecated PyPI shim.
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9
RUN python3.9 -m pip install 'numpy<1.23.0' pyarrow 'pandas<1.4.0' scipy xmlrunner 'plotly>=4.8' scikit-learn 'mlflow>=1.0' coverage matplotlib

RUN add-apt-repository ppa:pypy/ppa
RUN apt-get update
RUN $APT_INSTALL gfortran libopenblas-dev liblapack-dev

# PyPy 3.7 is installed from the upstream tarball; build-essential is needed
# so pip can compile the C extensions below (numpy/scipy) against it.
RUN $APT_INSTALL build-essential
RUN mkdir -p /usr/local/pypy/pypy3.7 && \
    curl -sqL https://downloads.python.org/pypy/pypy3.7-v7.3.7-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.7 --strip-components=1 && \
    ln -sf /usr/local/pypy/pypy3.7/bin/pypy /usr/local/bin/pypy3.7 && \
    ln -sf /usr/local/pypy/pypy3.7/bin/pypy /usr/local/bin/pypy3

RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
RUN pypy3 -m pip install 'numpy<1.23.0' 'pandas<1.4.0' scipy coverage matplotlib

# R toolchain for SparkR tests, from the CRAN Ubuntu repository.
# `add-apt-repository` both writes the source entry and refreshes the index,
# so the repo line is added exactly once (echoing it into sources.list as
# well would create a duplicate source entry).
RUN $APT_INSTALL gnupg ca-certificates pandoc
RUN gpg --keyserver keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9
# NOTE(review): `apt-key add` is deprecated in newer Ubuntu releases; still
# functional on 20.04 — migrate to a keyring file under /etc/apt/keyrings
# when the base image is bumped.
RUN gpg -a --export E084DAB9 | apt-key add -
RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/'
RUN apt-get update
RUN $APT_INSTALL r-base libcurl4-openssl-dev qpdf libssl-dev zlib1g-dev
RUN Rscript -e "install.packages(c('knitr', 'markdown', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2', 'xml2'), repos='https://cloud.r-project.org/')"