From 83dd40bf368ad77205e90c0ad06cb9202f4febc2 Mon Sep 17 00:00:00 2001
From: AbinayaJayaprakasam
Date: Wed, 26 Nov 2025 01:45:17 +0530
Subject: [PATCH 1/2] [SPARK-44988][SQL] Support reading Parquet TIMESTAMP(NANOS,false)

Convert TIMESTAMP(NANOS,*) to LongType regardless of the nanosAsLong config
so that Parquet files with nanosecond-precision timestamps can be read.

### What changes were proposed in this pull request?

Simplified the TIMESTAMP(NANOS) handling in ParquetSchemaConverter to always
convert to LongType, removing the nanosAsLong condition check that left
TIMESTAMP(NANOS,false) files unreadable.

### Why are the changes needed?

SPARK-40819 added spark.sql.legacy.parquet.nanosAsLong as a workaround for
TIMESTAMP(NANOS,true), but:
- It only worked for TIMESTAMP(NANOS,true), not for TIMESTAMP(NANOS,false)
- It required users to know about an obscure internal config flag
- It still required manual casting from Long to Timestamp

This fix makes all NANOS timestamps readable by default. Since Spark cannot
fully support nanosecond precision in its type system, converting to LongType
preserves the full precision while still allowing the files to be read.

### Does this PR introduce any user-facing change?

Yes. Parquet files with TIMESTAMP(NANOS,*) are now readable by default,
without any configuration. Values are read as LongType (nanoseconds since
epoch). Users can convert to a timestamp if needed:
`(col('nanos') / 1e9).cast('timestamp')`, as sketched below.
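For reference, a minimal end-to-end sketch (illustrative only, not part of
this patch): it writes a TIMESTAMP(NANOS,false) Parquet file with PyArrow,
then reads it back with Spark and converts the resulting long column. The
output path and the column name `nanos` are hypothetical, and a local Spark
session is assumed.

```python
import pyarrow as pa
import pyarrow.parquet as pq
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Write one nanosecond-precision value; a tz-naive Arrow timestamp is stored
# with isAdjustedToUTC=false. version="2.6" is passed so nanoseconds are kept
# rather than coerced to a coarser unit (assumed necessary on older PyArrow).
table = pa.table(
    {"nanos": pa.array([1668537129123534758], type=pa.timestamp("ns"))})
pq.write_table(table, "/tmp/timestamp-nanos.parquet", version="2.6")

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/tmp/timestamp-nanos.parquet")
df.printSchema()  # with this patch: nanos is read as long

# Nanoseconds since epoch -> timestamp (sub-microsecond digits are dropped).
df.select((col("nanos") / 1e9).cast("timestamp").alias("ts")).show(truncate=False)
```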
### How was this patch tested?

- Updated ParquetSchemaSuite test expectations (lines 1112-1121)
- All 110 tests in ParquetSchemaSuite pass
- Manually tested with a TIMESTAMP(NANOS,false) Parquet file generated via PyArrow

---
 .../parquet/ParquetSchemaConverter.scala           |  9 ++++++---
 .../datasources/parquet/ParquetSchemaSuite.scala   | 16 ++++++++--------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala
index 9e6f4447ca79..554e4e945b9a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala
@@ -309,10 +309,13 @@ class ParquetToSparkSchemaConverter(
         } else {
           TimestampNTZType
         }
-      // SPARK-40819: NANOS are not supported as a Timestamp, convert to LongType without
-      // timezone awareness to address behaviour regression introduced by SPARK-34661
+      // SPARK-40819 & SPARK-44988: NANOS are not supported as a Timestamp, convert to LongType.
+      // The nanosAsLong config was originally intended as a migration flag, but since Spark
+      // doesn't have full NANOS support and real-world files exist with TIMESTAMP(NANOS,*),
+      // we always convert to LongType regardless of the config to prevent unreadable files.
+      // This handles both TIMESTAMP(NANOS,true) and TIMESTAMP(NANOS,false).
       case timestamp: TimestampLogicalTypeAnnotation
-        if timestamp.getUnit == TimeUnit.NANOS && nanosAsLong =>
+        if timestamp.getUnit == TimeUnit.NANOS =>
         LongType
       case time: TimeLogicalTypeAnnotation
         if time.getUnit == TimeUnit.MICROS && !time.isAdjustedToUTC =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
index 56076175d60e..197461003d33 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
@@ -1109,15 +1109,15 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     }
   }
 
-  test("SPARK-40819: parquet file with TIMESTAMP(NANOS, true) (with default nanosAsLong=false)") {
+  test("SPARK-40819 & SPARK-44988: parquet file with TIMESTAMP(NANOS, true) " +
+    "(with default nanosAsLong=false)") {
+    // After SPARK-44988, TIMESTAMP(NANOS,*) is always converted to LongType regardless of
+    // the nanosAsLong config to ensure files are readable
+    val tsAttribute = "birthday"
     val testDataPath = testFile("test-data/timestamp-nanos.parquet")
-    checkError(
-      exception = intercept[AnalysisException] {
-        spark.read.parquet(testDataPath).collect()
-      },
-      condition = "PARQUET_TYPE_ILLEGAL",
-      parameters = Map("parquetType" -> "INT64 (TIMESTAMP(NANOS,true))")
-    )
+    val data = spark.read.parquet(testDataPath).select(tsAttribute)
+    assert(data.schema.fields.head.dataType == LongType)
+    assert(data.orderBy(desc(tsAttribute)).take(1).head.getAs[Long](0) == 1668537129123534758L)
   }
 
   test("SPARK-47261: parquet file with unsupported type") {

From 059c360defa0d12be9985f6790dd13e34fec5d42 Mon Sep 17 00:00:00 2001
From: AbinayaJayaprakasam
Date: Wed, 26 Nov 2025 13:00:46 +0530
Subject: [PATCH 2/2] Retrigger CI