diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
index 9d09cab4fd482..2367bb692a175 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
@@ -135,6 +135,8 @@ class CSVOptions(
   val positiveInf = parameters.getOrElse("positiveInf", "Inf")
   val negativeInf = parameters.getOrElse("negativeInf", "-Inf")
 
+  // When true, write a byte order mark (BOM) so Excel does not garble the file.
+  val bom = getBool("bom")
 
   val compressionCodec: Option[String] = {
     val name = parameters.get("compression").orElse(parameters.get("codec"))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CsvOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CsvOutputWriter.scala
index 2b549536ae486..881e8659ef646 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CsvOutputWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CsvOutputWriter.scala
@@ -39,6 +39,11 @@ class CsvOutputWriter(
 
   private val gen = new UnivocityGenerator(dataSchema, writer, params)
 
+  if (params.bom) {
+    // U+FEFF (the byte order mark) is encoded by the output charset, e.g. as EF BB BF in UTF-8.
+    writer.write(0xFEFF)
+  }
+
   if (params.headerFlag) {
     gen.writeHeaders()
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index fcb7bdc25f08f..f9340fdabf24c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -26,6 +26,7 @@ import java.util.Locale
 import java.util.zip.GZIPOutputStream
 
 import scala.collection.JavaConverters._
+import scala.collection.mutable
 import scala.util.Properties
 
 import com.univocity.parsers.common.TextParsingException
@@ -34,6 +35,7 @@ import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.io.compress.GzipCodec
 
 import org.apache.spark.{SparkConf, SparkException, TestUtils}
+import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
 import org.apache.spark.sql.{AnalysisException, Column, DataFrame, QueryTest, Row}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.internal.SQLConf
@@ -2353,6 +2355,45 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa
       assert(df.schema.last == StructField("col_mixed_types", StringType, true))
     }
   }
+
+  test("Write BOM so that characters are not garbled when opening CSV files with Excel") {
+    // scalastyle:off nonascii
+    val chinese = "我爱中文"
+    val korean = "나는 한국인을 좋아한다"
+    val japanese = "私は日本人が好き"
+    // scalastyle:on nonascii
+    val english = "I love English"
+
+    val df = spark.sql(s"SELECT '$chinese' AS Chinese, '$korean' AS Korean, " +
+      s"'$japanese' AS Japanese, '$english' AS English")
+
+    Seq(true, false).foreach { bom =>
+      withTempPath { p =>
+        val path = p.getAbsolutePath
+        df.write.option("bom", bom).csv(path)
+
+        val bytesReads = new mutable.ArrayBuffer[Long]()
+        val bytesReadListener = new SparkListener() {
+          override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
+            bytesReads += taskEnd.taskMetrics.inputMetrics.bytesRead
+          }
+        }
+        sparkContext.addSparkListener(bytesReadListener)
+        try {
+          spark.read.csv(path).limit(1).collect()
+          sparkContext.listenerBus.waitUntilEmpty(1000L)
+          if (bom) {
+            assert(bytesReads.sum === 202)
+          } else {
+            assert(bytesReads.sum === 196)
+          }
+        } finally {
+          sparkContext.removeSparkListener(bytesReadListener)
+        }
+        checkAnswer(spark.read.csv(path), Seq(Row(chinese, korean, japanese, english)))
+      }
+    }
+  }
 }
 
 class CSVv1Suite extends CSVSuite {
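
Usage sketch (not part of the patch; the output paths and sample data below are illustrative). With bom set to true, the writer emits U+FEFF as the first character of every part file; UTF-8 encodes it as the 3-byte sequence EF BB BF, which Excel uses to detect the encoding and render non-ASCII text correctly:

    // Hypothetical standalone example of the new write option.
    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq(("我爱中文", "I love English")).toDF("Chinese", "English")

    // Writes a BOM at the start of each part file, so Excel opens it cleanly.
    df.write.option("bom", true).csv("/tmp/csv-with-bom")

    // Default behavior is unchanged: no BOM is written.
    df.write.csv("/tmp/csv-without-bom")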