Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-32025][SQL] Csv schema inference problems with different types in the same column #28896

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -102,20 +102,19 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
if (field == null || field.isEmpty || field == options.nullValue) {
typeSoFar
} else {
typeSoFar match {
val typeElemInfer = typeSoFar match {
case NullType => tryParseInteger(field)
case IntegerType => tryParseInteger(field)
case LongType => tryParseLong(field)
planga82 marked this conversation as resolved.
Show resolved Hide resolved
case _: DecimalType =>
// DecimalTypes have different precisions and scales, so we try to find the common type.
compatibleType(typeSoFar, tryParseDecimal(field)).getOrElse(StringType)
case _: DecimalType => tryParseDecimal(field)
case DoubleType => tryParseDouble(field)
case TimestampType => tryParseTimestamp(field)
case BooleanType => tryParseBoolean(field)
case StringType => StringType
case other: DataType =>
throw new UnsupportedOperationException(s"Unexpected data type $other")
}
compatibleType(typeSoFar, typeElemInfer).getOrElse(StringType)
}
}

Expand Down
Expand Up @@ -56,11 +56,11 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper {
assert(inferSchema.inferField(IntegerType, "1.0") == DoubleType)
assert(inferSchema.inferField(DoubleType, null) == DoubleType)
assert(inferSchema.inferField(DoubleType, "test") == StringType)
assert(inferSchema.inferField(LongType, "2015-08-20 14:57:00") == TimestampType)
assert(inferSchema.inferField(DoubleType, "2015-08-20 15:57:00") == TimestampType)
assert(inferSchema.inferField(LongType, "True") == BooleanType)
assert(inferSchema.inferField(IntegerType, "FALSE") == BooleanType)
assert(inferSchema.inferField(TimestampType, "FALSE") == BooleanType)
assert(inferSchema.inferField(LongType, "2015-08-20 14:57:00") == StringType)
assert(inferSchema.inferField(DoubleType, "2015-08-20 15:57:00") == StringType)
assert(inferSchema.inferField(LongType, "True") == StringType)
assert(inferSchema.inferField(IntegerType, "FALSE") == StringType)
assert(inferSchema.inferField(TimestampType, "FALSE") == StringType)

val textValueOne = Long.MaxValue.toString + "0"
val decimalValueOne = new java.math.BigDecimal(textValueOne)
Expand Down
Expand Up @@ -2341,6 +2341,18 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa
checkAnswer(csv, Row(null))
}
}

test("SPARK-32025: infer the schema from mixed-type values") {
  // Writes a one-column text file (header line, two integer-looking values, then a
  // boolean-looking value), reads it back as CSV with schema inference enabled, and
  // checks that the column is reported as a nullable string.
  withTempPath { dir =>
    val location = dir.getCanonicalPath
    val lines = Seq("col_mixed_types", "2012", "1997", "True")
    lines.toDS.write.text(location)

    val reader = spark.read
      .format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
    val df = reader.load(location)

    val expectedField = StructField("col_mixed_types", StringType, nullable = true)
    assert(df.schema.last == expectedField)
  }
}
}

class CSVv1Suite extends CSVSuite {
Expand Down