Commit b471a0e

Merge branch 'master' into SPARK-31709

yaooqinn committed Jul 31, 2020
2 parents: 3fbe65a + 9d7b1d9

Showing 1,716 changed files with 48,360 additions and 20,582 deletions.
329 changes: 223 additions & 106 deletions .github/workflows/master.yml
@@ -9,148 +9,265 @@ on:
     - master
 
 jobs:
+  # TODO(SPARK-32248): Recover JDK 11 builds
+  # Build: build Spark and run the tests for specified modules.
   build:
-
+    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
-        java: [ '1.8', '11' ]
-        hadoop: [ 'hadoop-2.7', 'hadoop-3.2' ]
-        hive: [ 'hive-1.2', 'hive-2.3' ]
-        exclude:
-        - java: '11'
-          hive: 'hive-1.2'
-        - hadoop: 'hadoop-3.2'
-          hive: 'hive-1.2'
-    name: Build Spark - JDK${{ matrix.java }}/${{ matrix.hadoop }}/${{ matrix.hive }}
-
+        java:
+          - 1.8
+        hadoop:
+          - hadoop3.2
+        hive:
+          - hive2.3
+        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
+        # Kinesis tests depends on external Amazon kinesis service.
+        # Note that the modules below are from sparktestsupport/modules.py.
+        modules:
+          - |-
+            core, unsafe, kvstore, avro,
+            network-common, network-shuffle, repl, launcher,
+            examples, sketch, graphx
+          - |-
+            catalyst, hive-thriftserver
+          - |-
+            streaming, sql-kafka-0-10, streaming-kafka-0-10,
+            mllib-local, mllib,
+            yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
+          - |-
+            pyspark-sql, pyspark-mllib, pyspark-resource
+          - |-
+            pyspark-core, pyspark-streaming, pyspark-ml
+          - |-
+            sparkr
+        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
+        included-tags: [""]
+        excluded-tags: [""]
+        comment: [""]
+        include:
+          # Hive tests
+          - modules: hive
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.SlowHiveTest
+            comment: "- slow tests"
+          - modules: hive
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.SlowHiveTest
+            comment: "- other tests"
+          # SQL tests
+          - modules: sql
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.ExtendedSQLTest
+            comment: "- slow tests"
+          - modules: sql
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.ExtendedSQLTest
+            comment: "- other tests"
+    env:
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+      INCLUDED_TAGS: ${{ matrix.included-tags }}
+      HADOOP_PROFILE: ${{ matrix.hadoop }}
+      HIVE_PROFILE: ${{ matrix.hive }}
+      # GitHub Actions' default miniconda to use in pip packaging test.
+      CONDA_PREFIX: /usr/share/miniconda
+      GITHUB_PREV_SHA: ${{ github.event.before }}
     steps:
-    - uses: actions/checkout@master
-    # We split caches because GitHub Action Cache has a 400MB-size limit.
-    - uses: actions/cache@v1
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+      # In order to fetch changed files
+      with:
+        fetch-depth: 0
+    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+    - name: Cache Scala, SBT, Maven and Zinc
+      uses: actions/cache@v1
       with:
         path: build
         key: build-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
           build-
-    - uses: actions/cache@v1
-      with:
-        path: ~/.m2/repository/com
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-
-    - uses: actions/cache@v1
-      with:
-        path: ~/.m2/repository/org
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-
-    - uses: actions/cache@v1
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
       with:
-        path: ~/.m2/repository/net
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-${{ hashFiles('**/pom.xml') }}
+        path: ~/.m2/repository
+        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-
-    - uses: actions/cache@v1
+          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-
+    - name: Cache Ivy local repository
+      uses: actions/cache@v2
       with:
-        path: ~/.m2/repository/io
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-${{ hashFiles('**/pom.xml') }}
+        path: ~/.ivy2/cache
+        key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
         restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-
-    - name: Set up JDK ${{ matrix.java }}
+          ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
+    - name: Install JDK ${{ matrix.java }}
       uses: actions/setup-java@v1
       with:
         java-version: ${{ matrix.java }}
-    - name: Build with Maven
-      run: |
-        export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
-        export MAVEN_CLI_OPTS="--no-transfer-progress"
-        mkdir -p ~/.m2
-        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -P${{ matrix.hive }} -Phive-thriftserver -P${{ matrix.hadoop }} -Phadoop-cloud -Djava.version=${{ matrix.java }} install
-        rm -rf ~/.m2/repository/org/apache/spark
+    # PySpark
+    - name: Install PyPy3
+      # Note that order of Python installations here matters because default python3 is
+      # overridden by pypy3.
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark')
+      with:
+        python-version: pypy3
+        architecture: x64
+    - name: Install Python 3.6
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark')
+      with:
+        python-version: 3.6
+        architecture: x64
+    - name: Install Python 3.8
+      uses: actions/setup-python@v2
+      # We should install one Python that is higher then 3+ for SQL and Yarn because:
+      # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
+      # - Yarn has a Python specific test too, for example, YarnClusterSuite.
+      if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      with:
+        python-version: 3.8
+        architecture: x64
+    - name: Install Python packages (Python 3.6 and PyPy3)
+      if: contains(matrix.modules, 'pyspark')
+      # PyArrow is not supported in PyPy yet, see ARROW-2651.
+      # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
+      run: |
+        python3.6 -m pip install numpy pyarrow pandas scipy
+        python3.6 -m pip list
+        pypy3 -m pip install numpy pandas
+        pypy3 -m pip list
+    - name: Install Python packages (Python 3.8)
+      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      run: |
+        python3.8 -m pip install numpy pyarrow pandas scipy
+        python3.8 -m pip list
+    # SparkR
+    - name: Install R 4.0
+      if: contains(matrix.modules, 'sparkr')
+      run: |
+        sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
+        curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
+        sudo apt-get update
+        sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
+    - name: Install R packages
+      if: contains(matrix.modules, 'sparkr')
+      run: |
+        # qpdf is required to reduce the size of PDFs to make CRAN check pass. See SPARK-32497.
+        sudo apt-get install -y libcurl4-openssl-dev qpdf
+        sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
+        # Show installed packages in R.
+        sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
+    # Run the tests.
+    - name: "Run tests: ${{ matrix.modules }}"
+      run: |
+        # Hive tests become flaky when running in parallel as it's too intensive.
+        if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
+        mkdir -p ~/.m2
+        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
+        rm -rf ~/.m2/repository/org/apache/spark
 
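A note on the matrix above: the four `include` entries append extra cells to the expanded matrix, which is how the hive and sql modules are split into a slow run and a fast run. The tags reach `./dev/run-tests` through the job-level `env`. A minimal sketch of what the first `include` entry effectively executes, with the matrix values expanded by hand (the sketch is illustrative and not part of the commit):

    # Sketch only: the hive "- slow tests" cell boils down to
    env:
      MODULES_TO_TEST: hive
      INCLUDED_TAGS: org.apache.spark.tags.SlowHiveTest
      EXCLUDED_TAGS: ""
    steps:
      - name: "Run tests: hive"
        run: |
          # Runs only the suites tagged with SlowHiveTest.
          ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
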
+  # Static analysis, and documentation build
   lint:
+    name: Linters, licenses, dependencies and documentation generation
     runs-on: ubuntu-latest
-    name: Linters (Java/Scala/Python), licenses, dependencies
     steps:
-    - uses: actions/checkout@master
-    - uses: actions/setup-java@v1
-      with:
-        java-version: '11'
-    - uses: actions/setup-python@v1
-      with:
-        python-version: '3.x'
-        architecture: 'x64'
-    - name: Scala
-      run: ./dev/lint-scala
-    - name: Java
-      run: ./dev/lint-java
-    - name: Python
-      run: |
-        pip install flake8 sphinx numpy
-        ./dev/lint-python
-    - name: License
-      run: ./dev/check-license
-    - name: Dependencies
-      run: ./dev/test-dependencies.sh
-
-  lintr:
-    runs-on: ubuntu-latest
-    name: Linter (R)
-    steps:
-    - uses: actions/checkout@master
-    - uses: actions/setup-java@v1
-      with:
-        java-version: '11'
-    - uses: r-lib/actions/setup-r@v1
-      with:
-        r-version: '3.6.2'
-    - name: Install lib
-      run: |
-        sudo apt-get install -y libcurl4-openssl-dev
-    - name: install R packages
-      run: |
-        sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')"
-        sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
-    - name: package and install SparkR
-      run: ./R/install-dev.sh
-    - name: lint-r
-      run: ./dev/lint-r
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.m2/repository
+        key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
+        restore-keys: |
+          docs-maven-
+    - name: Install JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 1.8
+    - name: Install Python 3.6
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.6
+        architecture: x64
+    - name: Install Python linter dependencies
+      run: |
+        # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+        # See also https://github.com/sphinx-doc/sphinx/issues/7551.
+        pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme
+    - name: Install R 4.0
+      run: |
+        sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
+        curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
+        sudo apt-get update
+        sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
+    - name: Install R linter dependencies and SparkR
+      run: |
+        sudo apt-get install -y libcurl4-openssl-dev
+        sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
+        sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
+        ./R/install-dev.sh
+    - name: Install Ruby 2.7 for documentation generation
+      uses: actions/setup-ruby@v1
+      with:
+        ruby-version: 2.7
+    - name: Install dependencies for documentation generation
+      run: |
+        sudo apt-get install -y libcurl4-openssl-dev pandoc
+        # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+        # See also https://github.com/sphinx-doc/sphinx/issues/7551.
+        pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme
+        gem install jekyll jekyll-redirect-from rouge
+        sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
+    - name: Scala linter
+      run: ./dev/lint-scala
+    - name: Java linter
+      run: ./dev/lint-java
+    - name: Python linter
+      run: ./dev/lint-python
+    - name: R linter
+      run: ./dev/lint-r
+    - name: License test
+      run: ./dev/check-license
+    - name: Dependencies test
+      run: ./dev/test-dependencies.sh
+    - name: Run documentation build
+      run: |
+        cd docs
+        jekyll build
 
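The `actions/cache` steps above lean on prefix fallback: when nothing matches `key` exactly (the `hashFiles('**/pom.xml')` suffix changes whenever any pom.xml changes), the most recently saved cache whose key starts with one of the `restore-keys` prefixes is restored instead, and a fresh cache is saved under the exact `key` at the end of a successful job. A minimal sketch of the pattern, with the step copied from the hunk above (the comments are explanatory and not part of the commit):

    - name: Cache Maven local repository
      uses: actions/cache@v2
      with:
        path: ~/.m2/repository
        # Exact hit when no pom.xml changed since the cache was saved...
        key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
        # ...otherwise restore the newest cache with this prefix and
        # re-save under the exact key at the end of the job.
        restore-keys: |
          docs-maven-
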
-  docs:
+  java11:
+    name: Java 11 build
     runs-on: ubuntu-latest
-    name: Generate documents
     steps:
-    - uses: actions/checkout@master
-    - uses: actions/cache@v1
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
       with:
         path: ~/.m2/repository
-        key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
+        key: java11-maven-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
-          docs-maven-repo-
-    - uses: actions/setup-java@v1
-      with:
-        java-version: '1.8'
-    - uses: actions/setup-python@v1
-      with:
-        python-version: '3.x'
-        architecture: 'x64'
-    - uses: actions/setup-ruby@v1
-      with:
-        ruby-version: '2.7'
-    - uses: r-lib/actions/setup-r@v1
-      with:
-        r-version: '3.6.2'
-    - name: Install lib and pandoc
-      run: |
-        sudo apt-get install -y libcurl4-openssl-dev pandoc
-    - name: Install packages
-      run: |
-        pip install sphinx mkdocs numpy
-        gem install jekyll jekyll-redirect-from rouge
-        sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')"
-    - name: Run jekyll build
-      run: |
-        cd docs
-        jekyll build
+          java11-maven-
+    - name: Install Java 11
+      uses: actions/setup-java@v1
+      with:
+        java-version: 11
+    - name: Build with Maven
+      run: |
+        export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
+        export MAVEN_CLI_OPTS="--no-transfer-progress"
+        mkdir -p ~/.m2
+        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
+        rm -rf ~/.m2/repository/org/apache/spark
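The new `java11` job is compile-only (`-DskipTests`): it keeps the build working on JDK 11 while the test matrix stays on 1.8 (see the TODO(SPARK-32248) note at the top of the hunk). Roughly the same check can be reproduced locally from a Spark checkout; a sketch, assuming JDK 11 is the active JDK and that `build/mvn` bootstraps its own Maven:

    export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g"
    ./build/mvn --no-transfer-progress -DskipTests -Pyarn -Pmesos -Pkubernetes \
      -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
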
1 change: 1 addition & 0 deletions .gitignore
@@ -64,6 +64,7 @@ python/lib/pyspark.zip
 python/.eggs/
 python/deps
 python/docs/_site/
+python/docs/source/reference/api/
 python/test_coverage/coverage_data
 python/test_coverage/htmlcov
 python/pyspark/python
5 changes: 2 additions & 3 deletions LICENSE
@@ -222,14 +222,13 @@ external/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaRe
 Python Software Foundation License
 ----------------------------------
 
-pyspark/heapq3.py
-python/docs/_static/copybutton.js
+python/docs/source/_static/copybutton.js
 
 BSD 3-Clause
 ------------
 
 python/lib/py4j-*-src.zip
-python/pyspark/cloudpickle.py
+python/pyspark/cloudpickle/*.py
 python/pyspark/join.py
 core/src/main/resources/org/apache/spark/ui/static/d3.min.js
 
(The diffs for the remaining 1,713 changed files are not shown.)