Merge remote-tracking branch 'origin/master' into parse-ansi-interval-literals

# Conflicts:
#	docs/sql-migration-guide.md

MaxGekk committed Apr 19, 2021
2 parents 68c36d1 + a74f601 commit 7908150
Showing 52 changed files with 683 additions and 274 deletions.
9 changes: 8 additions & 1 deletion .github/workflows/labeler.yml
@@ -17,11 +17,18 @@
# under the License.
#

name: "Pull Request Labeler"
# Intentionally has a general name because the test status check created
# in GitHub Actions currently randomly picks any associated workflow.
# So, the name was changed to make sense in that context too.
# See also https://github.community/t/specify-check-suite-when-creating-a-checkrun/118380/10

name: "On pull requests"
on: pull_request_target

jobs:
label:
name: Label pull requests
runs-on: ubuntu-latest
steps:
# In order to get back the negated matches like in the old config,
86 changes: 68 additions & 18 deletions .github/workflows/notify_test_workflow.yml
@@ -1,10 +1,35 @@
name: Notify test workflow
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Intentionally has a general name because the test status check created
# in GitHub Actions currently randomly picks any associated workflow.
# So, the name was changed to make sense in that context too.
# See also https://github.community/t/specify-check-suite-when-creating-a-checkrun/118380/10
name: On pull request update
on:
pull_request_target:
types: [opened, reopened, synchronize]

jobs:
notify:
name: Notify test workflow
runs-on: ubuntu-20.04
steps:
- name: "Notify test workflow"
@@ -13,28 +38,53 @@ jobs:
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const endpoint = "GET /repos/:owner/:repo/actions/workflows/:id/runs?&branch=:branch"
const endpoint = 'GET /repos/:owner/:repo/actions/workflows/:id/runs?&branch=:branch'
// TODO: Should use pull_request.user and pull_request.user.repos_url?
// If a different person creates a commit to another forked repo,
// it wouldn't be able to detect.
const params = {
owner: context.payload.pull_request.head.repo.owner.login,
repo: context.payload.pull_request.head.repo.name,
id: "build_and_test.yml",
id: 'build_and_test.yml',
branch: context.payload.pull_request.head.ref,
}
console.log('Ref: ' + context.payload.pull_request.head.ref)
console.log('SHA: ' + context.payload.pull_request.head.sha)
// Wait 3 seconds to make sure the fork repository triggered a workflow.
await new Promise(r => setTimeout(r, 3000))
const runs = await github.request(endpoint, params)
var runID = runs.data.workflow_runs[0].id
var msg = "**[Test build #" + runID + "]"
+ "(https://github.com/" + context.payload.pull_request.head.repo.full_name
+ "/actions/runs/" + runID + ")** "
+ "for PR " + context.issue.number
+ " at commit [`" + context.payload.pull_request.head.sha.substring(0, 7) + "`]"
+ "(https://github.com/" + context.payload.pull_request.head.repo.full_name
+ "/commit/" + context.payload.pull_request.head.sha + ")."
github.issues.createComment({
issue_number: context.issue.number,
owner: context.payload.repository.owner.login,
repo: context.payload.repository.name,
body: msg
const runID = runs.data.workflow_runs[0].id
// TODO: If no workflows were found, it's likely GitHub Actions was not enabled
if (runs.data.workflow_runs[0].head_sha != context.payload.pull_request.head.sha) {
throw new Error('There was a new unsynced commit pushed. Please retrigger the workflow.');
}
const runUrl = 'https://github.com/'
+ context.payload.pull_request.head.repo.full_name
+ '/actions/runs/'
+ runID
const name = 'Build and test'
const head_sha = context.payload.pull_request.head.sha
const status = 'queued'
github.checks.create({
...context.repo,
name,
head_sha,
status,
output: {
title: 'Test results',
summary: runUrl,
text: JSON.stringify({
owner: context.payload.pull_request.head.repo.owner.login,
repo: context.payload.pull_request.head.repo.name,
run_id: runID
})
}
})
95 changes: 95 additions & 0 deletions .github/workflows/update_build_status.yml
@@ -0,0 +1,95 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: Update build status workflow

on:
schedule:
- cron: "*/15 * * * *"

jobs:
update:
name: Update build status
runs-on: ubuntu-20.04
steps:
- name: "Update build status"
uses: actions/github-script@v3
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const endpoint = 'GET /repos/:owner/:repo/pulls?state=:state'
const params = {
owner: context.repo.owner,
repo: context.repo.repo,
state: 'open'
}
// See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
const maybeReady = ['behind', 'clean', 'draft', 'has_hooks', 'unknown', 'unstable'];
// Iterate open PRs
for await (const prs of github.paginate.iterator(endpoint,params)) {
// Each page
for await (const pr of prs.data) {
console.log('SHA: ' + pr.head.sha)
console.log(' Mergeable status: ' + pr.mergeable_state)
if (pr.mergeable_state == null || maybeReady.includes(pr.mergeable_state)) {
const checkRuns = await github.request('GET /repos/{owner}/{repo}/commits/{ref}/check-runs', {
owner: context.repo.owner,
repo: context.repo.repo,
ref: pr.head.sha
})
// Iterate over the GitHub Checks in the PR
for await (const cr of checkRuns.data.check_runs) {
if (cr.name == 'Build and test') {
// text contains parameters to make request in JSON.
const params = JSON.parse(cr.output.text)
// Get the workflow run in the forked repository
const run = await github.request('GET /repos/{owner}/{repo}/actions/runs/{run_id}', params)
// Keep syncing the status of the checks
if (run.data.status == 'completed') {
console.log(' Run ' + cr.id + ': set status (' + run.data.status + ') and conclusion (' + run.data.conclusion + ')')
const response = await github.request('PATCH /repos/{owner}/{repo}/check-runs/{check_run_id}', {
owner: context.repo.owner,
repo: context.repo.repo,
check_run_id: cr.id,
output: cr.output,
status: run.data.status,
conclusion: run.data.conclusion
})
} else {
console.log(' Run ' + cr.id + ': set status (' + run.data.status + ')')
const response = await github.request('PATCH /repos/{owner}/{repo}/check-runs/{check_run_id}', {
owner: context.repo.owner,
repo: context.repo.repo,
check_run_id: cr.id,
output: cr.output,
status: run.data.status,
})
}
break
}
}
}
}
}
@@ -156,18 +156,19 @@ private[history] class ApplicationCache(
*/
@throws[NoSuchElementException]
private def loadApplicationEntry(appId: String, attemptId: Option[String]): CacheEntry = {
logDebug(s"Loading application Entry $appId/$attemptId")
lazy val application = s"$appId/${attemptId.mkString}"
logDebug(s"Loading application Entry $application")
metrics.loadCount.inc()
val loadedUI = time(metrics.loadTimer) {
metrics.lookupCount.inc()
operations.getAppUI(appId, attemptId) match {
case Some(loadedUI) =>
logDebug(s"Loaded application $appId/$attemptId")
logDebug(s"Loaded application $application")
loadedUI
case None =>
metrics.lookupFailureCount.inc()
// guava's cache logs via java.util log, so is of limited use. Hence: our own message
logInfo(s"Failed to load application attempt $appId/$attemptId")
logInfo(s"Failed to load application attempt $application")
throw new NoSuchElementException(s"no application with application Id '$appId'" +
attemptId.map { id => s" attemptId '$id'" }.getOrElse(" and no attempt Id"))
}
@@ -182,7 +183,7 @@ private[history] class ApplicationCache(
new CacheEntry(loadedUI, completed)
} catch {
case e: Exception =>
logWarning(s"Failed to initialize application UI for $appId/$attemptId", e)
logWarning(s"Failed to initialize application UI for $application", e)
operations.detachSparkUI(appId, attemptId, loadedUI.ui)
throw e
}
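A minimal Scala sketch of the string the new `application` value formats to, assuming a made-up app id; it illustrates why the change reads better than interpolating the `Option` directly, and is not part of the patch itself.

```scala
// Mirrors the patched line: lazy val application = s"$appId/${attemptId.mkString}"
val appId = "application_1618800000000_0001"   // illustrative app id, not from the patch
def formatted(attemptId: Option[String]): String =
  s"$appId/${attemptId.mkString}"              // Option.mkString yields "" for None

formatted(Some("1"))  // "application_1618800000000_0001/1"
formatted(None)       // "application_1618800000000_0001/"
// The previous interpolation s"$appId/$attemptId" would have logged
// ".../Some(1)" or ".../None" instead.
```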
3 changes: 2 additions & 1 deletion core/src/main/scala/org/apache/spark/ui/UIUtils.scala
@@ -325,7 +325,8 @@ private[spark] object UIUtils extends Logging {
<div class="container-fluid">
<div class="row">
<div class="col-12">
<h3 style="vertical-align: bottom; display: inline-block;">
<h3 style="vertical-align: bottom; white-space: nowrap; overflow: hidden;
text-overflow: ellipsis;">
{title}
{helpButton}
</h3>
2 changes: 1 addition & 1 deletion dev/requirements.txt
@@ -1,5 +1,5 @@
flake8==3.5.0
jira==1.0.3
jira==2.0.0
PyGithub==1.26.0
sphinx
pydata_sphinx_theme
5 changes: 0 additions & 5 deletions docs/index.md
@@ -25,11 +25,6 @@ It provides high-level APIs in Java, Scala, Python and R,
and an optimized engine that supports general execution graphs.
It also supports a rich set of higher-level tools including [Spark SQL](sql-programming-guide.html) for SQL and structured data processing, [MLlib](ml-guide.html) for machine learning, [GraphX](graphx-programming-guide.html) for graph processing, and [Structured Streaming](structured-streaming-programming-guide.html) for incremental computation and stream processing.

# Security

Security in Spark is OFF by default. This could mean you are vulnerable to attack by default.
Please see [Spark Security](security.html) before downloading and running Spark.

# Downloading

Get Spark from the [downloads page](https://spark.apache.org/downloads.html) of the project website. This documentation is for Spark version {{site.SPARK_VERSION}}. Spark uses Hadoop's client libraries for HDFS and YARN. Downloads are pre-packaged for a handful of popular Hadoop versions.
5 changes: 0 additions & 5 deletions docs/quick-start.md
@@ -32,11 +32,6 @@ you can download a package for any version of Hadoop.

Note that, before Spark 2.0, the main programming interface of Spark was the Resilient Distributed Dataset (RDD). After Spark 2.0, RDDs are replaced by Dataset, which is strongly-typed like an RDD, but with richer optimizations under the hood. The RDD interface is still supported, and you can get a more detailed reference at the [RDD programming guide](rdd-programming-guide.html). However, we highly recommend switching to Dataset, which has better performance than RDD. See the [SQL programming guide](sql-programming-guide.html) for more information about Dataset.

# Security

Security in Spark is OFF by default. This could mean you are vulnerable to attack by default.
Please see [Spark Security](security.html) before running Spark.

# Interactive Analysis with the Spark Shell

## Basics
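A minimal spark-shell style sketch of the Dataset API referenced in the quick-start context above, assuming a `SparkSession` named `spark` with `spark.implicits._` available (as spark-shell provides); it is an illustration, not text from the quick-start guide.

```scala
import spark.implicits._

case class Person(name: String, age: Long)

// A Dataset is strongly-typed like an RDD, but with richer optimizations under the hood.
val people = Seq(Person("Ann", 30L), Person("Bob", 25L)).toDS()
people.filter(_.age > 26).show()
// Prints roughly:
// +----+---+
// |name|age|
// +----+---+
// | Ann| 30|
// +----+---+
```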
6 changes: 4 additions & 2 deletions docs/running-on-kubernetes.md
@@ -25,8 +25,10 @@ Kubernetes scheduler that has been added to Spark.

# Security

Security in Spark is OFF by default. This could mean you are vulnerable to attack by default.
Please see [Spark Security](security.html) and the specific advice below before running Spark.
Security features like authentication are not enabled by default. When deploying a cluster that is open to the internet
or an untrusted network, it's important to secure access to the cluster to prevent unauthorized applications
from running on the cluster.
Please see [Spark Security](security.html) and the specific security sections in this doc before running Spark.

## User Identity

4 changes: 3 additions & 1 deletion docs/running-on-mesos.md
@@ -32,7 +32,9 @@ The advantages of deploying Spark with Mesos include:

# Security

Security in Spark is OFF by default. This could mean you are vulnerable to attack by default.
Security features like authentication are not enabled by default. When deploying a cluster that is open to the internet
or an untrusted network, it's important to secure access to the cluster to prevent unauthorized applications
from running on the cluster.
Please see [Spark Security](security.html) and the specific security sections in this doc before running Spark.

# How it Works
4 changes: 3 additions & 1 deletion docs/running-on-yarn.md
@@ -26,7 +26,9 @@ was added to Spark in version 0.6.0, and improved in subsequent releases.

# Security

Security in Spark is OFF by default. This could mean you are vulnerable to attack by default.
Security features like authentication are not enabled by default. When deploying a cluster that is open to the internet
or an untrusted network, it's important to secure access to the cluster to prevent unauthorized applications
from running on the cluster.
Please see [Spark Security](security.html) and the specific security sections in this doc before running Spark.

# Launching Spark on YARN
Expand Down
5 changes: 4 additions & 1 deletion docs/security.md
@@ -23,7 +23,10 @@ license: |

# Spark Security: Things You Need To Know

Security in Spark is OFF by default. This could mean you are vulnerable to attack by default.
Security features like authentication are not enabled by default. When deploying a cluster that is open to the internet
or an untrusted network, it's important to secure access to the cluster to prevent unauthorized applications
from running on the cluster.

Spark supports multiple deployment types and each one supports different levels of security. Not
all deployment types will be secure in all environments and none are secure by default. Be
sure to evaluate your environment, what Spark supports, and take the appropriate measure to secure
4 changes: 3 additions & 1 deletion docs/spark-standalone.md
@@ -25,7 +25,9 @@ In addition to running on the Mesos or YARN cluster managers, Spark also provide

# Security

Security in Spark is OFF by default. This could mean you are vulnerable to attack by default.
Security features like authentication are not enabled by default. When deploying a cluster that is open to the internet
or an untrusted network, it's important to secure access to the cluster to prevent unauthorized applications
from running on the cluster.
Please see [Spark Security](security.html) and the specific security sections in this doc before running Spark.

# Installing Spark Standalone to a Cluster
Expand Down
2 changes: 2 additions & 0 deletions docs/sql-migration-guide.md
@@ -79,6 +79,8 @@ license: |

- In Spark 3.2, the `TRANSFORM` operator does not support aliases in inputs. In Spark 3.1 and earlier, a script transform could be written like `SELECT TRANSFORM(a AS c1, b AS c2) USING 'cat' FROM TBL`.

- In Spark 3.2, the `TRANSFORM` operator supports `ArrayType/MapType/StructType` without Hive SerDe. In this mode, `StructsToJson` is used to convert an `ArrayType/MapType/StructType` column to `STRING`, and `JsonToStructs` is used to parse the `STRING` back into `ArrayType/MapType/StructType`. In Spark 3.1, Spark only supports casting an `ArrayType/MapType/StructType` column to `STRING`, and cannot parse a `STRING` into `ArrayType/MapType/StructType` output columns.

- In Spark 3.2, the unit-to-unit interval literals like `INTERVAL '1-1' YEAR TO MONTH` are converted to ANSI interval types: `YearMonthIntervalType` or `DayTimeIntervalType`. In Spark 3.1 and earlier, such interval literals are converted to `CalendarIntervalType`. To restore the behavior before Spark 3.2, you can set `spark.sql.legacy.interval.enabled` to `true`.

## Upgrading from Spark SQL 3.0 to 3.1
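A minimal spark-shell style sketch of the interval migration note above, assuming Spark 3.2 and a `SparkSession` named `spark`; the printed schemas are approximate and the sketch is not part of the guide itself.

```scala
// A unit-to-unit interval literal now gets an ANSI interval type
// (YearMonthIntervalType here) instead of CalendarIntervalType.
spark.sql("SELECT INTERVAL '1-1' YEAR TO MONTH AS ym").printSchema()
// root
//  |-- ym: interval year to month (nullable = false)

// Restoring the pre-3.2 behavior, as the note describes:
spark.sql("SET spark.sql.legacy.interval.enabled=true")
spark.sql("SELECT INTERVAL '1-1' YEAR TO MONTH AS ym").printSchema()
// root
//  |-- ym: interval (nullable = false)
```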
@@ -90,7 +90,9 @@ public static int calculateBitSetWidthInBytes(int numFields) {
FloatType,
DoubleType,
DateType,
TimestampType
TimestampType,
YearMonthIntervalType,
DayTimeIntervalType
})));
}

(Diffs for the remaining changed files are not shown.)
