From caeeb0d3cf9f0f898a8fa730723005e8c4ef77b5 Mon Sep 17 00:00:00 2001 From: s71955 Date: Tue, 11 Sep 2018 19:41:55 +0530 Subject: [PATCH] [SPARK-23425][SQL][FOLLOWUP] Support wildcards in HDFS path for load table command. What changes were proposed in this pull request? Updated the migration guide for the behavior changes made in the JIRA issue SPARK-23425. Added a test case verifying the LOAD command with a space character in a file name. How was this patch tested? Manually verified and added a unit test. --- docs/sql-programming-guide.md | 1 + .../spark/sql/hive/execution/SQLQuerySuite.scala | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 9da7d64322eb6..e262987ab23de 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1898,6 +1898,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see - Since Spark 2.4, File listing for compute statistics is done in parallel by default. This can be disabled by setting `spark.sql.parallelFileListingInStatsComputation.enabled` to `False`. - Since Spark 2.4, Metadata files (e.g. Parquet summary files) and temporary files are not counted as data files when calculating table size during Statistics computation. - Since Spark 2.4, empty strings are saved as quoted empty strings `""`. In version 2.3 and earlier, empty strings are equal to `null` values and do not reflect to any characters in saved CSV files. For example, the row of `"a", null, "", 1` was writted as `a,,,1`. Since Spark 2.4, the same row is saved as `a,,"",1`. To restore the previous behavior, set the CSV option `emptyValue` to empty (not quoted) string. + - Since Spark 2.4, the LOAD DATA command supports the wildcards `?` and `*`, which match any one character and zero or more characters, respectively. Example: `LOAD DATA INPATH '/tmp/folder*/'` or `LOAD DATA INPATH '/tmp/part-?'`. Special characters such as spaces now also work in paths. 
Example: `LOAD DATA INPATH '/tmp/folder name/'`. ## Upgrading From Spark SQL 2.3.0 to 2.3.1 and above diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 20c4c36c05091..e49aea267026e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1916,6 +1916,21 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } + test("SPARK-23425 Test LOAD DATA LOCAL INPATH with space in file name") { + withTempDir { dir => + val path = dir.toURI.toString.stripSuffix("/") + val dirPath = dir.getAbsoluteFile + for (i <- 1 to 3) { /* files part-r-0000 1..3; each name has a space */ + Files.write(s"$i", new File(dirPath, s"part-r-0000 $i"), StandardCharsets.UTF_8) + } + withTable("load_t") { /* load just one file, expect its single row */ + sql("CREATE TABLE load_t (a STRING)") + sql(s"LOAD DATA LOCAL INPATH '$path/part-r-0000 1' INTO TABLE load_t") + checkAnswer(sql("SELECT * FROM load_t"), Seq(Row("1"))) + } + } + } + test("Support wildcard character in folderlevel for LOAD DATA LOCAL INPATH") { withTempDir { dir => val path = dir.toURI.toString.stripSuffix("/")