From 803a6a443ba9f7d3dc34d68b0d15f53c1b6054fb Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Sun, 15 Apr 2018 23:12:16 +0800 Subject: [PATCH 1/7] fix type coercion when promoting to StringType --- .../apache/spark/sql/catalyst/analysis/TypeCoercion.scala | 8 +++++++- .../spark/sql/catalyst/analysis/TypeCoercionSuite.scala | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index ec7e7761dc4c2..e2908d7991e5b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -178,7 +178,13 @@ object TypeCoercion { private def findWiderCommonType(types: Seq[DataType]): Option[DataType] = { types.foldLeft[Option[DataType]](Some(NullType))((r, c) => r match { case Some(d) => findWiderTypeForTwo(d, c) - case None => None + // Currently we find the wider common type by comparing the two types from left to right, + // this can be a problem when you have two data types which don't have a common type but each + // can be promoted to StringType. For instance, (TimestampType, IntegerType, StringType) + // should have StringType as the wider common type. 
+ case None if types.exists(_ == StringType) && + types.forall(stringPromotion(_, StringType).nonEmpty) => Some(StringType) + case _ => None }) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index 8ac49dc05e3cf..275303f694501 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -572,6 +572,11 @@ class TypeCoercionSuite extends AnalysisTest { Coalesce(Seq(nullLit, floatNullLit, doubleLit, stringLit)), Coalesce(Seq(Cast(nullLit, StringType), Cast(floatNullLit, StringType), Cast(doubleLit, StringType), Cast(stringLit, StringType)))) + + ruleTest(rule, + Coalesce(Seq(timestampLit, intLit, stringLit)), + Coalesce(Seq(Cast(timestampLit, StringType), Cast(intLit, StringType), + Cast(stringLit, StringType)))) } test("CreateArray casts") { From 571912f3ed21cd3753fa76225f88d0f6d8298989 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Tue, 17 Apr 2018 22:19:41 +0800 Subject: [PATCH 2/7] refactor and update migration guide. --- docs/sql-programming-guide.md | 1 + .../sql/catalyst/analysis/TypeCoercion.scala | 20 +++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 55d35b9dd31db..29111d2ce29c8 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1810,6 +1810,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see - Since Spark 2.4, writing a dataframe with an empty or nested empty schema using any file formats (parquet, orc, json, text, csv etc.) is not allowed. An exception is thrown when attempting to write dataframes with empty schema. 
- Since Spark 2.4, Spark compares a DATE type with a TIMESTAMP type after promotes both sides to TIMESTAMP. To set `false` to `spark.sql.hive.compareDateTimestampInTimestamp` restores the previous behavior. This option will be removed in Spark 3.0. - Since Spark 2.4, creating a managed table with nonempty location is not allowed. An exception is thrown when attempting to create a managed table with nonempty location. To set `true` to `spark.sql.allowCreatingManagedTableUsingNonemptyLocation` restores the previous behavior. This option will be removed in Spark 3.0. + - Since Spark 2.4, finding the widest common type for the arguments of a variadic function(e.g. IN/COALESCE) should always success when each of the types of arguments is either StringType or can be promoted to StringType. Previously this may throw an exception for some specific arguments ordering. ## Upgrading From Spark SQL 2.2 to 2.3 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index e2908d7991e5b..638168173c6b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -176,16 +176,16 @@ object TypeCoercion { } private def findWiderCommonType(types: Seq[DataType]): Option[DataType] = { - types.foldLeft[Option[DataType]](Some(NullType))((r, c) => r match { - case Some(d) => findWiderTypeForTwo(d, c) - // Currently we find the wider common type by comparing the two types from left to right, - // this can be a problem when you have two data types which don't have a common type but each - // can be promoted to StringType. For instance, (TimestampType, IntegerType, StringType) - // should have StringType as the wider common type. 
- case None if types.exists(_ == StringType) && - types.forall(stringPromotion(_, StringType).nonEmpty) => Some(StringType) - case _ => None - }) + // `findWiderTypeForTwo` doesn't satisfy the associative law, i.e. (a op b) op c may not equal + // to a op (b op c). This is only a problem when each of the types is StringType or can be + // promoted to StringType. For instance, (TimestampType, IntegerType, StringType) should have + // StringType as the wider common type. + val (stringTypes, nonStringTypes) = types.partition(_ == StringType) + (stringTypes.distinct ++ nonStringTypes).foldLeft[Option[DataType]](Some(NullType))((r, c) => + r match { + case Some(d) => findWiderTypeForTwo(d, c) + case _ => None + }) } /** From d77f2136451008b2be6f9f63c8ffcae9a39a2426 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Tue, 17 Apr 2018 23:15:19 +0800 Subject: [PATCH 3/7] update comments --- .../apache/spark/sql/catalyst/analysis/TypeCoercion.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 638168173c6b4..e6b161c33ef40 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -176,10 +176,10 @@ object TypeCoercion { } private def findWiderCommonType(types: Seq[DataType]): Option[DataType] = { - // `findWiderTypeForTwo` doesn't satisfy the associative law, i.e. (a op b) op c may not equal - // to a op (b op c). This is only a problem when each of the types is StringType or can be - // promoted to StringType. For instance, (TimestampType, IntegerType, StringType) should have - // StringType as the wider common type. + // findWiderTypeForTwo doesn't satisfy the associative law, i.e. (a op b) op c may not equal + // to a op (b op c). 
This is only a problem for StringType. Excluding StringType, + // findWiderTypeForTwo satisfies the associative law. For instance, (TimestampType, + // IntegerType, StringType) should have StringType as the wider common type. val (stringTypes, nonStringTypes) = types.partition(_ == StringType) (stringTypes.distinct ++ nonStringTypes).foldLeft[Option[DataType]](Some(NullType))((r, c) => r match { From 4ce5081fb2da8dabb216413fdda4da0f0b061f71 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Wed, 18 Apr 2018 10:44:51 +0800 Subject: [PATCH 4/7] fix also for array types --- .../apache/spark/sql/catalyst/analysis/TypeCoercion.scala | 4 +++- .../spark/sql/catalyst/analysis/TypeCoercionSuite.scala | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index e6b161c33ef40..8512340c7f2ca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -180,7 +180,9 @@ object TypeCoercion { // to a op (b op c). This is only a problem for StringType. Excluding StringType, // findWiderTypeForTwo satisfies the associative law. For instance, (TimestampType, // IntegerType, StringType) should have StringType as the wider common type. 
- val (stringTypes, nonStringTypes) = types.partition(_ == StringType) + val (stringTypes, nonStringTypes) = types.partition { t => + t == StringType || t == ArrayType(StringType) + } (stringTypes.distinct ++ nonStringTypes).foldLeft[Option[DataType]](Some(NullType))((r, c) => r match { case Some(d) => findWiderTypeForTwo(d, c) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index 275303f694501..fd6a3121663ed 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -539,6 +539,9 @@ class TypeCoercionSuite extends AnalysisTest { val floatLit = Literal.create(1.0f, FloatType) val timestampLit = Literal.create("2017-04-12", TimestampType) val decimalLit = Literal(new java.math.BigDecimal("1000000000000000000000")) + val tsArrayLit = Literal(Array(new Timestamp(System.currentTimeMillis()))) + val strArrayLit = Literal(Array("c")) + val intArrayLit = Literal(Array(1)) ruleTest(rule, Coalesce(Seq(doubleLit, intLit, floatLit)), @@ -577,6 +580,11 @@ class TypeCoercionSuite extends AnalysisTest { Coalesce(Seq(timestampLit, intLit, stringLit)), Coalesce(Seq(Cast(timestampLit, StringType), Cast(intLit, StringType), Cast(stringLit, StringType)))) + + ruleTest(rule, + Coalesce(Seq(tsArrayLit, intArrayLit, strArrayLit)), + Coalesce(Seq(Cast(tsArrayLit, ArrayType(StringType)), + Cast(intArrayLit, ArrayType(StringType)), Cast(strArrayLit, ArrayType(StringType))))) } test("CreateArray casts") { From 55eefe0883ad8aec6a79f56a42085ea645e8eecb Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Wed, 18 Apr 2018 21:55:58 +0800 Subject: [PATCH 5/7] refactor --- .../spark/sql/catalyst/analysis/TypeCoercion.scala | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 8512340c7f2ca..0264bbcebf8df 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -175,14 +175,22 @@ object TypeCoercion { }) } + /** + * Whether the data type contains StringType. + */ + def hasStringType(dt: DataType): Boolean = dt match { + case StringType => true + case ArrayType(et, _) => hasStringType(et) + // Add StructType if we support string promotion for struct fields in the future. + case _ => false + } + private def findWiderCommonType(types: Seq[DataType]): Option[DataType] = { // findWiderTypeForTwo doesn't satisfy the associative law, i.e. (a op b) op c may not equal // to a op (b op c). This is only a problem for StringType. Excluding StringType, // findWiderTypeForTwo satisfies the associative law. For instance, (TimestampType, // IntegerType, StringType) should have StringType as the wider common type. 
- val (stringTypes, nonStringTypes) = types.partition { t => - t == StringType || t == ArrayType(StringType) - } + val (stringTypes, nonStringTypes) = types.partition(hasStringType(_)) (stringTypes.distinct ++ nonStringTypes).foldLeft[Option[DataType]](Some(NullType))((r, c) => r match { case Some(d) => findWiderTypeForTwo(d, c) From c2abce2dcb4d4cdf822150b80cf83104cd9e41f3 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Wed, 18 Apr 2018 23:58:58 +0800 Subject: [PATCH 6/7] update comments --- .../apache/spark/sql/catalyst/analysis/TypeCoercion.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 0264bbcebf8df..281f206e8d59e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -187,9 +187,9 @@ object TypeCoercion { private def findWiderCommonType(types: Seq[DataType]): Option[DataType] = { // findWiderTypeForTwo doesn't satisfy the associative law, i.e. (a op b) op c may not equal - // to a op (b op c). This is only a problem for StringType. Excluding StringType, - // findWiderTypeForTwo satisfies the associative law. For instance, (TimestampType, - // IntegerType, StringType) should have StringType as the wider common type. + // to a op (b op c). This is only a problem for StringType or nested StringType in ArrayType. + // Excluding these types, findWiderTypeForTwo satisfies the associative law. For instance, + // (TimestampType, IntegerType, StringType) should have StringType as the wider common type. 
val (stringTypes, nonStringTypes) = types.partition(hasStringType(_)) (stringTypes.distinct ++ nonStringTypes).foldLeft[Option[DataType]](Some(NullType))((r, c) => r match { From ff514afd8adf817ee14a7504f0e9e8244c596ab6 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Thu, 19 Apr 2018 00:14:51 +0800 Subject: [PATCH 7/7] update migration guide --- docs/sql-programming-guide.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 29111d2ce29c8..e8ff1470970f7 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1810,8 +1810,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see - Since Spark 2.4, writing a dataframe with an empty or nested empty schema using any file formats (parquet, orc, json, text, csv etc.) is not allowed. An exception is thrown when attempting to write dataframes with empty schema. - Since Spark 2.4, Spark compares a DATE type with a TIMESTAMP type after promotes both sides to TIMESTAMP. To set `false` to `spark.sql.hive.compareDateTimestampInTimestamp` restores the previous behavior. This option will be removed in Spark 3.0. - Since Spark 2.4, creating a managed table with nonempty location is not allowed. An exception is thrown when attempting to create a managed table with nonempty location. To set `true` to `spark.sql.allowCreatingManagedTableUsingNonemptyLocation` restores the previous behavior. This option will be removed in Spark 3.0. - - Since Spark 2.4, finding the widest common type for the arguments of a variadic function(e.g. IN/COALESCE) should always success when each of the types of arguments is either StringType or can be promoted to StringType. Previously this may throw an exception for some specific arguments ordering. 
- + - Since Spark 2.4, the type coercion rules can automatically promote the argument types of the variadic SQL functions (e.g., IN/COALESCE) to the widest common type, regardless of the order of the input arguments. In prior Spark versions, the promotion could fail in some specific orders (e.g., TimestampType, IntegerType and StringType) and throw an exception. ## Upgrading From Spark SQL 2.2 to 2.3 - Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the referenced columns only include the internal corrupt record column (named `_corrupt_record` by default). For example, `spark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count()` and `spark.read.schema(schema).json(file).select("_corrupt_record").show()`. Instead, you can cache or save the parsed results and then send the same query. For example, `val df = spark.read.schema(schema).json(file).cache()` and then `df.filter($"_corrupt_record".isNotNull).count()`.