From c7e817f79453cac08fabce388b8515a884e4d9e6 Mon Sep 17 00:00:00 2001
From: Marco Gaido
Date: Wed, 29 Nov 2017 10:42:55 +0100
Subject: [PATCH 1/2] [SPARK-22635][SQL][ORC] FileNotFoundException while
 reading ORC files containing special characters

---
 .../apache/spark/sql/hive/orc/OrcFileFormat.scala  | 14 +++++++-------
 .../spark/sql/hive/MetastoreDataSourcesSuite.scala |  3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
index 3b33a9ff082f3..ceb2441418200 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -59,8 +59,9 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
       sparkSession: SparkSession,
       options: Map[String, String],
       files: Seq[FileStatus]): Option[StructType] = {
+    val fileNames = files.map(_.getPath.toString)
     OrcFileOperator.readSchema(
-      files.map(_.getPath.toString),
+      fileNames,
       Some(sparkSession.sessionState.newHadoopConf())
     )
   }
@@ -133,10 +134,12 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
     (file: PartitionedFile) => {
       val conf = broadcastedHadoopConf.value.value
 
+      val filePath = new Path(new URI(file.filePath))
+
       // SPARK-8501: Empty ORC files always have an empty schema stored in their footer. In this
       // case, `OrcFileOperator.readSchema` returns `None`, and we can't read the underlying file
       // using the given physical schema. Instead, we simply return an empty iterator.
-      val isEmptyFile = OrcFileOperator.readSchema(Seq(file.filePath), Some(conf)).isEmpty
+      val isEmptyFile = OrcFileOperator.readSchema(Seq(filePath.toString), Some(conf)).isEmpty
       if (isEmptyFile) {
         Iterator.empty
       } else {
@@ -146,15 +149,12 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
         val job = Job.getInstance(conf)
         FileInputFormat.setInputPaths(job, file.filePath)
 
-        val fileSplit = new FileSplit(
-          new Path(new URI(file.filePath)), file.start, file.length, Array.empty
-        )
+        val fileSplit = new FileSplit(filePath, file.start, file.length, Array.empty)
         // Custom OrcRecordReader is used to get
         // ObjectInspector during recordReader creation itself and can
         // avoid NameNode call in unwrapOrcStructs per file.
         // Specifically would be helpful for partitioned datasets.
-        val orcReader = OrcFile.createReader(
-          new Path(new URI(file.filePath)), OrcFile.readerOptions(conf))
+        val orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf))
 
         new SparkOrcNewRecordReader(orcReader, conf, fileSplit.getStart, fileSplit.getLength)
       }

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index a1060476f2211..c8caba83bf365 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -1350,7 +1350,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
       withTempDir { dir =>
         val tmpFile = s"$dir/$nameWithSpecialChars"
         spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile)
-        spark.read.format(format).load(tmpFile)
+        val fileContent = spark.read.format(format).load(tmpFile)
+        checkAnswer(fileContent, Seq(Row("a"), Row("b")))
       }
     }
   }

From 8cda8c3cba5d64fb723748258a038ce16e7314a7 Mon Sep 17 00:00:00 2001
From: Marco Gaido
Date: Wed, 29 Nov 2017 17:14:57 +0100
Subject: [PATCH 2/2] minor: revert inferSchema change

---
 .../scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
index ceb2441418200..95741c7b30289 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -59,9 +59,8 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
       sparkSession: SparkSession,
       options: Map[String, String],
       files: Seq[FileStatus]): Option[StructType] = {
-    val fileNames = files.map(_.getPath.toString)
     OrcFileOperator.readSchema(
-      fileNames,
+      files.map(_.getPath.toString),
       Some(sparkSession.sessionState.newHadoopConf())
     )
   }
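
The core change in this series is the URI round-trip when building the Hadoop Path: PartitionedFile.filePath carries a URI-encoded string, so a directory named "sp ace" arrives as "sp%20ace", and constructing a Path straight from that string keeps the escape as literal characters, making the filesystem look up a file that does not exist. Below is a minimal standalone sketch of that difference; the file name and the demo object are made up for illustration and are not part of the patches.

    import java.net.URI

    import org.apache.hadoop.fs.Path

    // Hypothetical demo object; only the Path/URI behavior is illustrated.
    object SpecialCharPathSketch {
      def main(args: Array[String]): Unit = {
        // What PartitionedFile.filePath looks like for a directory "sp ace":
        val encoded = "file:/tmp/sp%20ace/part-00000.orc"

        // Path(String) keeps "%20" as three literal characters, so reads
        // through this Path fail with FileNotFoundException.
        val direct = new Path(encoded)

        // Path(URI) decodes the escape first and points at the real file,
        // mirroring the patch's new Path(new URI(file.filePath)).
        val decoded = new Path(new URI(encoded))

        println(direct)  // file:/tmp/sp%20ace/part-00000.orc
        println(decoded) // file:/tmp/sp ace/part-00000.orc
      }
    }

The test change follows the same logic: load(tmpFile) alone only infers the schema, so the added checkAnswer forces an actual scan of the rows and makes the regression test exercise the per-file read path that the bug lived in.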