Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-32025][SQL] CSV schema inference problems with different types in the same column #28896

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -103,7 +103,16 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
typeSoFar
} else {
typeSoFar match {
case NullType => tryParseInteger(field)
case NullType =>
tryParseInteger(field) match {
planga82 marked this conversation as resolved.
Show resolved Hide resolved
case _: StringType =>
tryParseTimestamp(field) match {
case _: StringType =>
tryParseBoolean(field)
case timestamp => timestamp
}
case numeric => numeric
}
case IntegerType => tryParseInteger(field)
case LongType => tryParseLong(field)
planga82 marked this conversation as resolved.
Show resolved Hide resolved
case _: DecimalType =>
Expand Down Expand Up @@ -161,7 +170,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) {
DoubleType
} else {
tryParseTimestamp(field)
stringType()
}
}

Expand All @@ -170,7 +179,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
if ((allCatch opt timestampParser.parse(field)).isDefined) {
TimestampType
} else {
tryParseBoolean(field)
stringType()
}
}

Expand Down
Expand Up @@ -51,16 +51,37 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper {
val options = new CSVOptions(Map("timestampFormat" -> "yyyy-MM-dd HH:mm:ss"), false, "UTC")
val inferSchema = new CSVInferSchema(options)

assert(inferSchema.inferField(NullType, "1") == IntegerType)
assert(inferSchema.inferField(NullType, "1.0") == DoubleType)
assert(inferSchema.inferField(IntegerType, "1.0") == DoubleType)
assert(inferSchema.inferField(LongType, "1.0") == DoubleType)
assert(inferSchema.inferField(DoubleType, "1.0") == DoubleType)
assert(inferSchema.inferField(IntegerType, "test") == StringType)
assert(inferSchema.inferField(LongType, "test") == StringType)
assert(inferSchema.inferField(IntegerType, "1.0") == DoubleType)
assert(inferSchema.inferField(DoubleType, null) == DoubleType)
assert(inferSchema.inferField(DoubleType, "test") == StringType)
assert(inferSchema.inferField(LongType, "2015-08-20 14:57:00") == TimestampType)
assert(inferSchema.inferField(DoubleType, "2015-08-20 15:57:00") == TimestampType)
assert(inferSchema.inferField(LongType, "True") == BooleanType)
assert(inferSchema.inferField(IntegerType, "FALSE") == BooleanType)
assert(inferSchema.inferField(TimestampType, "FALSE") == BooleanType)
assert(inferSchema.inferField(IntegerType, null) == IntegerType)
assert(inferSchema.inferField(LongType, null) == LongType)
assert(inferSchema.inferField(DoubleType, null) == DoubleType)
assert(inferSchema.inferField(IntegerType, "2015-08-20 14:57:00") == StringType)
assert(inferSchema.inferField(LongType, "2015-08-20 14:57:00") == StringType)
assert(inferSchema.inferField(DoubleType, "2015-08-20 14:57:00") == StringType)
assert(inferSchema.inferField(IntegerType, "True") == StringType)
assert(inferSchema.inferField(LongType, "True") == StringType)
assert(inferSchema.inferField(DoubleType, "True") == StringType)

assert(inferSchema.inferField(NullType, "2015-08-20 14:57:00") == TimestampType)
assert(inferSchema.inferField(TimestampType, "2015-08-20 14:57:00") == TimestampType)
assert(inferSchema.inferField(TimestampType, null) == TimestampType)
assert(inferSchema.inferField(TimestampType, "1.0") == StringType)
assert(inferSchema.inferField(TimestampType, "False") == StringType)
assert(inferSchema.inferField(TimestampType, "test") == StringType)

assert(inferSchema.inferField(NullType, "True") == BooleanType)
assert(inferSchema.inferField(BooleanType, "FALSE") == BooleanType)
assert(inferSchema.inferField(BooleanType, null) == BooleanType)
assert(inferSchema.inferField(BooleanType, "2015-08-20 14:57:00") == StringType)
assert(inferSchema.inferField(BooleanType, "1.0") == StringType)
assert(inferSchema.inferField(BooleanType, "test") == StringType)

val textValueOne = Long.MaxValue.toString + "0"
val decimalValueOne = new java.math.BigDecimal(textValueOne)
Expand Down
4 changes: 4 additions & 0 deletions sql/core/src/test/resources/test-data/mixed-types1.csv
@@ -0,0 +1,4 @@
col_mixed_types
2012
1997
True
planga82 marked this conversation as resolved.
Show resolved Hide resolved
Expand Up @@ -67,6 +67,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa
private val valueMalformedFile = "test-data/value-malformed.csv"
private val badAfterGoodFile = "test-data/bad_after_good.csv"
private val malformedRowFile = "test-data/malformedRow.csv"
private val mixedTypes = "test-data/mixed-types1.csv"

/** Verifies data and schema. */
private def verifyCars(
Expand Down Expand Up @@ -2341,6 +2342,16 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa
checkAnswer(csv, Row(null))
}
}

// Regression test for SPARK-32025: a column whose values parse as different types
// (here integer-looking "2012"/"1997" followed by boolean-looking "True") must be
// inferred as StringType.
test("Test mixed types") {
  // Schema inference has to be switched on explicitly. With the default
  // (inferSchema = false) the CSV source reads every column as StringType, so the
  // assertion below would pass without exercising the inference code at all.
  val df = spark
    .read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(testFile(mixedTypes))

  // The fixture has a single column, so the last field is the mixed-type column.
  assert(df.schema.fields.last == StructField("col_mixed_types", StringType, true))
}
}

class CSVv1Suite extends CSVSuite {
Expand Down