[SPARK-35878][CORE] Add fs.s3a.endpoint if unset and fs.s3a.endpoint.region is null. #33064

Closed
15 changes: 15 additions & 0 deletions core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -421,6 +421,9 @@ private[spark] object SparkHadoopUtil extends Logging {
* Returns a Configuration object with Spark configuration applied on top. Unlike
* the instance method, this will always return a Configuration instance, and not a
* cluster manager-specific type.
* The configuration will load all default values set in core-default.xml,
* and if found on the classpath, those of core-site.xml.
* This is done before the Spark overrides are applied.
*/
private[spark] def newConfiguration(conf: SparkConf): Configuration = {
val hadoopConf = new Configuration()
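
To make the documented behaviour concrete, here is a minimal sketch (Spark-internal code, since newConfiguration is private[spark]; the property and override value are chosen purely for illustration): the Hadoop defaults are loaded first, then the spark.hadoop.* overrides win.

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil

// core-default.xml supplies a default for io.file.buffer.size (4096);
// the spark.hadoop.* override below is applied on top of that loaded default.
val sparkConf = new SparkConf().set("spark.hadoop.io.file.buffer.size", "65536")
val hadoopConf: Configuration = SparkHadoopUtil.newConfiguration(sparkConf)
assert(hadoopConf.get("io.file.buffer.size") == "65536")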
@@ -487,6 +490,18 @@ private[spark] object SparkHadoopUtil extends Logging {
if (conf.getOption("spark.hadoop.fs.s3a.downgrade.syncable.exceptions").isEmpty) {
hadoopConf.set("fs.s3a.downgrade.syncable.exceptions", "true")
}
// In Hadoop 3.3.1, AWS region handling with the default "" endpoint only works
// in EC2 deployments or when the AWS CLI is installed.
// The workaround is to set the name of the S3 endpoint explicitly,
Member: We can use AWS_REGION as a workaround, too.

Contributor (Author): That's automatically picked up in the default chain, but since you have to set it everywhere it's not something you can rely on.
// if not already set. See HADOOP-17771.
// This change is harmless on older versions and compatible with
// later Hadoop releases
if (hadoopConf.get("fs.s3a.endpoint", "").isEmpty &&
hadoopConf.get("fs.s3a.endpoint.region") == null) {
Member: Two more spaces.
// set to US central endpoint which can also connect to buckets
// in other regions at the expense of a HEAD request during fs creation
hadoopConf.set("fs.s3a.endpoint", "s3.amazonaws.com")
}
}

private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = {
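A minimal sketch of the resulting behaviour, using the same call pattern as the new SparkHadoopUtilSuite below (the calling code has to sit in the org.apache.spark.deploy package to reach the package-private appendSparkHadoopConfigs): the endpoint default is applied only when neither fs.s3a.endpoint nor fs.s3a.endpoint.region has been configured.

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkConf

// No endpoint and no region configured: the central endpoint is pinned.
val defaults = new Configuration(false)
new SparkHadoopUtil().appendSparkHadoopConfigs(new SparkConf(), defaults)
assert(defaults.get("fs.s3a.endpoint") == "s3.amazonaws.com")

// Setting a region, even an empty one, opts back into the SDK's own region
// resolution, so no endpoint default is written.
val sdkResolved = new Configuration(false)
val sc = new SparkConf().set("spark.hadoop.fs.s3a.endpoint.region", "")
new SparkHadoopUtil().appendSparkHadoopConfigs(sc, sdkResolved)
assert(sdkResolved.get("fs.s3a.endpoint") == null)
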
96 changes: 96 additions & 0 deletions core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
@@ -0,0 +1,96 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.deploy

import org.apache.hadoop.conf.Configuration

import org.apache.spark.{SparkConf, SparkFunSuite}

class SparkHadoopUtilSuite extends SparkFunSuite {

/**
* Verify that spark.hadoop options are propagated, and that
* the default s3a options are set as expected.
*/
test("appendSparkHadoopConfigs with propagation and defaults") {
val sc = new SparkConf()
val hadoopConf = new Configuration(false)
sc.set("spark.hadoop.orc.filterPushdown", "true")
new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
assertConfigValue(hadoopConf, "orc.filterPushdown", "true" )
assertConfigValue(hadoopConf, "fs.s3a.downgrade.syncable.exceptions", "true")
Member: Thank you for adding this.
assertConfigValue(hadoopConf, "fs.s3a.endpoint", "s3.amazonaws.com")
}

/**
* An empty S3A endpoint will be overridden just as a null value
* would.
*/
test("appendSparkHadoopConfigs with S3A endpoint set to empty string") {
val sc = new SparkConf()
val hadoopConf = new Configuration(false)
sc.set("spark.hadoop.fs.s3a.endpoint", "")
new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
assertConfigValue(hadoopConf, "fs.s3a.endpoint", "s3.amazonaws.com")
}

/**
* Explicitly set the patched s3a options and verify that they are not overridden.
*/
test("appendSparkHadoopConfigs with S3A options explicitly set") {
val sc = new SparkConf()
val hadoopConf = new Configuration(false)
sc.set("spark.hadoop.fs.s3a.downgrade.syncable.exceptions", "false")
sc.set("spark.hadoop.fs.s3a.endpoint", "s3-eu-west-1.amazonaws.com")
new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
assertConfigValue(hadoopConf, "fs.s3a.downgrade.syncable.exceptions", "false")
assertConfigValue(hadoopConf, "fs.s3a.endpoint",
"s3-eu-west-1.amazonaws.com")
}

/**
* If the endpoint region is set (even to a blank string) in
* "spark.hadoop.fs.s3a.endpoint.region" then the endpoint is not set,
* even when the s3a endpoint is "".
* This supports a feature in later Hadoop versions where this configuration
* pair triggers a revert to the "SDK to work out the region" algorithm,
* which works on EC2 deployments.
*/
test("appendSparkHadoopConfigs with S3A endpoint region set to an empty string") {
val sc = new SparkConf()
val hadoopConf = new Configuration(false)
sc.set("spark.hadoop.fs.s3a.endpoint.region", "")
new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
// the endpoint value will not have been set
assertConfigValue(hadoopConf, "fs.s3a.endpoint", null)
}

/**
* Assert that a hadoop configuration option has the expected value.
* @param hadoopConf configuration to query
* @param key key to look up
* @param expected expected value.
*/
private def assertConfigValue(
hadoopConf: Configuration,
key: String,
expected: String): Unit = {
assert(hadoopConf.get(key) === expected,
s"Mismatch in expected value of $key")
}
}
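
Finally, a hedged user-facing sketch (the region and endpoint values are placeholders): jobs that want the SDK's own region resolution on EC2, or a specific regional endpoint, can suppress the new s3.amazonaws.com default through ordinary spark.hadoop.* properties, exactly as the tests above exercise.

import org.apache.spark.SparkConf

// Either name the region and let the S3A connector work out the endpoint
// (honoured on later Hadoop versions, per the doc comment above) ...
val byRegion = new SparkConf()
  .set("spark.hadoop.fs.s3a.endpoint.region", "eu-west-1")

// ... or pin a regional endpoint explicitly; either setting disables the default.
val byEndpoint = new SparkConf()
  .set("spark.hadoop.fs.s3a.endpoint", "s3-eu-west-1.amazonaws.com")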