Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-45433][SQL][3.4] Fix CSV/JSON schema inference when timestamps do not match specified timestampFormat #43343

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter}
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
import org.apache.spark.sql.types._

class CSVInferSchema(val options: CSVOptions) extends Serializable {
Expand Down Expand Up @@ -202,8 +203,11 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
// We can only parse the value as TimestampNTZType if it does not have zone-offset or
// time-zone component and can be parsed with the timestamp formatter.
// Otherwise, it is likely to be a timestamp with timezone.
if (timestampNTZFormatter.parseWithoutTimeZoneOptional(field, false).isDefined) {
SQLConf.get.timestampType
val timestampType = SQLConf.get.timestampType
if ((SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY ||
timestampType == TimestampNTZType) &&
timestampNTZFormatter.parseWithoutTimeZoneOptional(field, false).isDefined) {
timestampType
} else {
tryParseTimestamp(field)
}
Expand Down
Expand Up @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils

Expand Down Expand Up @@ -148,11 +149,13 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable {
val bigDecimal = decimalParser(field)
DecimalType(bigDecimal.precision, bigDecimal.scale)
}
val timestampType = SQLConf.get.timestampType
if (options.prefersDecimal && decimalTry.isDefined) {
decimalTry.get
} else if (options.inferTimestamp &&
} else if (options.inferTimestamp && (SQLConf.get.legacyTimeParserPolicy ==
LegacyBehaviorPolicy.LEGACY || timestampType == TimestampNTZType) &&
timestampNTZFormatter.parseWithoutTimeZoneOptional(field, false).isDefined) {
SQLConf.get.timestampType
timestampType
} else if (options.inferTimestamp &&
timestampFormatter.parseOptional(field).isDefined) {
TimestampType
Expand Down
Expand Up @@ -263,4 +263,14 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper {
inferSchema = new CSVInferSchema(options)
assert(inferSchema.inferField(DateType, "2012_12_12") == DateType)
}

test("SPARK-45433: inferring the schema when timestamps do not match specified timestampFormat" +
  " with only one row") {
  // The sample value ends in ".138", a fraction the configured pattern does not describe,
  // so inferring from this single value is expected to fall back to StringType.
  val formatOnly = Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss")
  val csvOptions = new CSVOptions(
    formatOnly,
    columnPruning = false,
    defaultTimeZoneId = "UTC")
  val inferredType =
    new CSVInferSchema(csvOptions).inferField(NullType, "2884-06-24T02:45:51.138")
  assert(inferredType == StringType)
}
}
Expand Up @@ -112,4 +112,12 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
checkType(Map("inferTimestamp" -> "true"), json, TimestampType)
checkType(Map("inferTimestamp" -> "false"), json, StringType)
}

test("SPARK-45433: inferring the schema when timestamps do not match specified timestampFormat" +
  " with only one row") {
  // Timestamp inference is switched on, but the lone value ends in ".138", which the
  // configured pattern does not describe; the expected inferred type is StringType.
  val jsonOptions = Map(
    "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss",
    "inferTimestamp" -> "true")
  val singleRow = """{"a": "2884-06-24T02:45:51.138"}"""
  checkType(jsonOptions, singleRow, StringType)
}
}