Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-35091][SPARK-35090][SQL] Support extract from ANSI Intervals #32351

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -2378,15 +2378,15 @@ object DatePart {
Literal(null, DoubleType)
} else {
val fieldStr = fieldEval.asInstanceOf[UTF8String].toString
val analysisException = QueryCompilationErrors.literalTypeUnsupportedForSourceTypeError(
fieldStr, source)
if (source.dataType == CalendarIntervalType) {
ExtractIntervalPart.parseExtractField(
fieldStr,
source,
throw analysisException)
} else {
DatePart.parseExtractField(fieldStr, source, throw analysisException)

def analysisException =
throw QueryCompilationErrors.literalTypeUnsupportedForSourceTypeError(fieldStr, source)

source.dataType match {
case YearMonthIntervalType | DayTimeIntervalType | CalendarIntervalType =>
ExtractIntervalPart.parseExtractField(fieldStr, source, analysisException)
case _ =>
DatePart.parseExtractField(fieldStr, source, analysisException)
}
}
}
Expand Down Expand Up @@ -2414,6 +2414,10 @@ object DatePart {
5
> SELECT _FUNC_('seconds', interval 5 hours 30 seconds 1 milliseconds 1 microseconds);
30.001001
> SELECT _FUNC_('MONTH', INTERVAL '2021-11' YEAR TO MONTH);
11
> SELECT _FUNC_('MINUTE', INTERVAL '123 23:55:59.002001' DAY TO SECOND);
55
""",
note = """
The _FUNC_ function is equivalent to the SQL-standard function `EXTRACT(field FROM source)`
Expand Down Expand Up @@ -2479,6 +2483,10 @@ case class DatePart(field: Expression, source: Expression, child: Expression)
5
> SELECT _FUNC_(seconds FROM interval 5 hours 30 seconds 1 milliseconds 1 microseconds);
30.001001
> SELECT _FUNC_(MONTH FROM INTERVAL '2021-11' YEAR TO MONTH);
11
> SELECT _FUNC_(MINUTE FROM INTERVAL '123 23:55:59.002001' DAY TO SECOND);
55
""",
note = """
The _FUNC_ function is equivalent to `date_part(field, source)`.
Expand Down
Expand Up @@ -29,67 +29,105 @@ import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.CalendarInterval

abstract class ExtractIntervalPart(
child: Expression,
abstract class ExtractIntervalPart[T](
val dataType: DataType,
func: CalendarInterval => Any,
funcName: String)
extends UnaryExpression with ExpectsInputTypes with NullIntolerant with Serializable {

override def inputTypes: Seq[AbstractDataType] = Seq(CalendarIntervalType)

override protected def nullSafeEval(interval: Any): Any = {
func(interval.asInstanceOf[CalendarInterval])
}

func: T => Any,
funcName: String) extends UnaryExpression with NullIntolerant with Serializable {
override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val iu = IntervalUtils.getClass.getName.stripSuffix("$")
defineCodeGen(ctx, ev, c => s"$iu.$funcName($c)")
}

override protected def nullSafeEval(interval: Any): Any = {
func(interval.asInstanceOf[T])
}
}

case class ExtractIntervalYears(child: Expression)
extends ExtractIntervalPart(child, IntegerType, getYears, "getYears") {
extends ExtractIntervalPart[CalendarInterval](IntegerType, getYears, "getYears") {
override protected def withNewChildInternal(newChild: Expression): ExtractIntervalYears =
copy(child = newChild)
}

case class ExtractIntervalMonths(child: Expression)
extends ExtractIntervalPart(child, ByteType, getMonths, "getMonths") {
extends ExtractIntervalPart[CalendarInterval](ByteType, getMonths, "getMonths") {
override protected def withNewChildInternal(newChild: Expression): ExtractIntervalMonths =
copy(child = newChild)
}

case class ExtractIntervalDays(child: Expression)
extends ExtractIntervalPart(child, IntegerType, getDays, "getDays") {
extends ExtractIntervalPart[CalendarInterval](IntegerType, getDays, "getDays") {
override protected def withNewChildInternal(newChild: Expression): ExtractIntervalDays =
copy(child = newChild)
}

case class ExtractIntervalHours(child: Expression)
extends ExtractIntervalPart(child, LongType, getHours, "getHours") {
extends ExtractIntervalPart[CalendarInterval](ByteType, getHours, "getHours") {
override protected def withNewChildInternal(newChild: Expression): ExtractIntervalHours =
copy(child = newChild)
}

case class ExtractIntervalMinutes(child: Expression)
extends ExtractIntervalPart(child, ByteType, getMinutes, "getMinutes") {
extends ExtractIntervalPart[CalendarInterval](ByteType, getMinutes, "getMinutes") {
override protected def withNewChildInternal(newChild: Expression): ExtractIntervalMinutes =
copy(child = newChild)
}

case class ExtractIntervalSeconds(child: Expression)
extends ExtractIntervalPart(child, DecimalType(8, 6), getSeconds, "getSeconds") {
extends ExtractIntervalPart[CalendarInterval](DecimalType(8, 6), getSeconds, "getSeconds") {
override protected def withNewChildInternal(newChild: Expression): ExtractIntervalSeconds =
copy(child = newChild)
}

case class ExtractANSIIntervalYears(child: Expression)
extends ExtractIntervalPart[Int](IntegerType, getYears, "getYears") {
override protected def withNewChildInternal(newChild: Expression): ExtractANSIIntervalYears =
copy(child = newChild)
}

case class ExtractANSIIntervalMonths(child: Expression)
extends ExtractIntervalPart[Int](ByteType, getMonths, "getMonths") {
override protected def withNewChildInternal(newChild: Expression): ExtractANSIIntervalMonths =
copy(child = newChild)
}

case class ExtractANSIIntervalDays(child: Expression)
extends ExtractIntervalPart[Long](IntegerType, getDays, "getDays") {
override protected def withNewChildInternal(newChild: Expression): ExtractANSIIntervalDays = {
copy(child = newChild)
}
}

case class ExtractANSIIntervalHours(child: Expression)
extends ExtractIntervalPart[Long](ByteType, getHours, "getHours") {
override protected def withNewChildInternal(newChild: Expression): ExtractANSIIntervalHours =
copy(child = newChild)
}

case class ExtractANSIIntervalMinutes(child: Expression)
extends ExtractIntervalPart[Long](ByteType, getMinutes, "getMinutes") {
override protected def withNewChildInternal(newChild: Expression): ExtractANSIIntervalMinutes =
copy(child = newChild)
}

case class ExtractANSIIntervalSeconds(child: Expression)
extends ExtractIntervalPart[Long](DecimalType(8, 6), getSeconds, "getSeconds") {
override protected def withNewChildInternal(newChild: Expression): ExtractANSIIntervalSeconds =
copy(child = newChild)
}

object ExtractIntervalPart {

def parseExtractField(
extractField: String,
source: Expression,
errorHandleFunc: => Nothing): Expression = extractField.toUpperCase(Locale.ROOT) match {
case "YEAR" if source.dataType == YearMonthIntervalType => ExtractANSIIntervalYears(source)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't we support all the shortcuts "YEAR" | "Y" | "YEARS" | "YR" | "YRS"? Can we merge the case?

case "YEAR" | "Y" | "YEARS" | "YR" | "YRS" => if (source.dataType == YearMonthIntervalType) ... else ...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For ANSI compliance, I didn't add the abbreviations. For inner consistency, I am OK to add them

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's add them. We will use the new interval types by default and this is a breaking change.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated

case "MONTH" if source.dataType == YearMonthIntervalType => ExtractANSIIntervalMonths(source)
case "DAY" if source.dataType == DayTimeIntervalType => ExtractANSIIntervalDays(source)
case "HOUR" if source.dataType == DayTimeIntervalType => ExtractANSIIntervalHours(source)
case "MINUTE" if source.dataType == DayTimeIntervalType => ExtractANSIIntervalMinutes(source)
case "SECOND" if source.dataType == DayTimeIntervalType => ExtractANSIIntervalSeconds(source)
case "YEAR" | "Y" | "YEARS" | "YR" | "YRS" => ExtractIntervalYears(source)
case "MONTH" | "MON" | "MONS" | "MONTHS" => ExtractIntervalMonths(source)
case "DAY" | "D" | "DAYS" => ExtractIntervalDays(source)
Expand Down
Expand Up @@ -592,7 +592,7 @@ object PushFoldableIntoBranches extends Rule[LogicalPlan] with PredicateHelper {
true
case _: CastBase => true
case _: GetDateField | _: LastDay => true
case _: ExtractIntervalPart => true
case _: ExtractIntervalPart[_] => true
case _: ArraySetLike => true
case _: ExtractValue => true
case _ => false
Expand Down
Expand Up @@ -54,31 +54,39 @@ object IntervalUtils {
}
import IntervalUnit._

def getYears(interval: CalendarInterval): Int = {
interval.months / MONTHS_PER_YEAR
}
def getYears(months: Int): Int = months / MONTHS_PER_YEAR

def getMonths(interval: CalendarInterval): Byte = {
(interval.months % MONTHS_PER_YEAR).toByte
}
def getYears(interval: CalendarInterval): Int = getYears(interval.months)

def getMonths(months: Int): Byte = (months % MONTHS_PER_YEAR).toByte

def getMonths(interval: CalendarInterval): Byte = getMonths(interval.months)

def getDays(microseconds: Long): Int = (microseconds / MICROS_PER_DAY).toInt

def getDays(interval: CalendarInterval): Int = {
val daysInMicroseconds = (interval.microseconds / MICROS_PER_DAY).toInt
val daysInMicroseconds = getDays(interval.microseconds)
Math.addExact(interval.days, daysInMicroseconds)
}

def getHours(interval: CalendarInterval): Long = {
(interval.microseconds % MICROS_PER_DAY) / MICROS_PER_HOUR
def getHours(microseconds: Long): Byte = {
((microseconds % MICROS_PER_DAY) / MICROS_PER_HOUR).toByte
}

def getMinutes(interval: CalendarInterval): Byte = {
((interval.microseconds % MICROS_PER_HOUR) / MICROS_PER_MINUTE).toByte
def getHours(interval: CalendarInterval): Byte = getHours(interval.microseconds)

def getMinutes(microseconds: Long): Byte = {
((microseconds % MICROS_PER_HOUR) / MICROS_PER_MINUTE).toByte
}

def getSeconds(interval: CalendarInterval): Decimal = {
Decimal(interval.microseconds % MICROS_PER_MINUTE, 8, 6)
def getMinutes(interval: CalendarInterval): Byte = getMinutes(interval.microseconds)

def getSeconds(microseconds: Long): Decimal = {
Decimal(microseconds % MICROS_PER_MINUTE, 8, 6)
}

def getSeconds(interval: CalendarInterval): Decimal = getSeconds(interval.microseconds)

private def toLongWithRange(
fieldName: IntervalUnit,
s: String,
Expand Down
Expand Up @@ -23,8 +23,8 @@ import java.time.temporal.ChronoUnit
import scala.language.implicitConversions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.{DateTimeTestUtils, IntervalUtils}
import org.apache.spark.sql.catalyst.util.DateTimeConstants._
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils
import org.apache.spark.sql.catalyst.util.IntervalUtils.{safeStringToInterval, stringToInterval}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{DayTimeIntervalType, Decimal, DecimalType, YearMonthIntervalType}
Expand Down Expand Up @@ -76,17 +76,17 @@ class IntervalExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
}

test("hours") {
checkEvaluation(ExtractIntervalHours("0 hours"), 0L)
checkEvaluation(ExtractIntervalHours("1 hour"), 1L)
checkEvaluation(ExtractIntervalHours("-1 hour"), -1L)
checkEvaluation(ExtractIntervalHours("23 hours"), 23L)
checkEvaluation(ExtractIntervalHours("-23 hours"), -23L)
checkEvaluation(ExtractIntervalHours("0 hours"), 0.toByte)
checkEvaluation(ExtractIntervalHours("1 hour"), 1.toByte)
checkEvaluation(ExtractIntervalHours("-1 hour"), -1.toByte)
checkEvaluation(ExtractIntervalHours("23 hours"), 23.toByte)
checkEvaluation(ExtractIntervalHours("-23 hours"), -23.toByte)
// Years, months and days must not be taken into account
checkEvaluation(ExtractIntervalHours("100 year 10 months 10 days 10 hours"), 10L)
checkEvaluation(ExtractIntervalHours("100 year 10 months 10 days 10 hours"), 10.toByte)
// Minutes should be taken into account
checkEvaluation(ExtractIntervalHours("10 hours 100 minutes"), 11L)
checkEvaluation(ExtractIntervalHours(largeInterval), 11L)
checkEvaluation(ExtractIntervalHours("25 hours"), 1L)
checkEvaluation(ExtractIntervalHours("10 hours 100 minutes"), 11.toByte)
checkEvaluation(ExtractIntervalHours(largeInterval), 11.toByte)
checkEvaluation(ExtractIntervalHours("25 hours"), 1.toByte)

}

Expand Down Expand Up @@ -410,4 +410,40 @@ class IntervalExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
DayTimeIntervalType, numType)
}
}

test("ANSI: extract years and months") {
Seq(Period.ZERO,
Period.ofMonths(100),
Period.ofMonths(-100),
Period.ofYears(100),
Period.ofYears(-100)).foreach { p =>
checkEvaluation(ExtractANSIIntervalYears(Literal(p)),
IntervalUtils.getYears(p.toTotalMonths.toInt))
checkEvaluation(ExtractANSIIntervalMonths(Literal(p)),
IntervalUtils.getMonths(p.toTotalMonths.toInt))
}
checkEvaluation(ExtractANSIIntervalYears(Literal(null, YearMonthIntervalType)), null)
checkEvaluation(ExtractANSIIntervalMonths(Literal(null, YearMonthIntervalType)), null)
}

test("ANSI: extract days, hours, minutes and seconds") {
Seq(Duration.ZERO,
Duration.ofMillis(1L * MILLIS_PER_DAY + 2 * MILLIS_PER_SECOND),
Duration.ofMillis(-1L * MILLIS_PER_DAY + 2 * MILLIS_PER_SECOND),
Duration.ofDays(100),
Duration.ofDays(-100),
Duration.ofHours(-100)).foreach { d =>

checkEvaluation(ExtractANSIIntervalDays(Literal(d)), d.toDays.toInt)
checkEvaluation(ExtractANSIIntervalHours(Literal(d)), (d.toHours % HOURS_PER_DAY).toByte)
checkEvaluation(ExtractANSIIntervalMinutes(Literal(d)),
(d.toMinutes % MINUTES_PER_HOUR).toByte)
checkEvaluation(ExtractANSIIntervalSeconds(Literal(d)),
IntervalUtils.getSeconds(IntervalUtils.durationToMicros(d)))
}
checkEvaluation(ExtractANSIIntervalDays(Literal(null, DayTimeIntervalType)), null)
checkEvaluation(ExtractANSIIntervalHours(Literal(null, DayTimeIntervalType)), null)
checkEvaluation(ExtractANSIIntervalMinutes(Literal(null, DayTimeIntervalType)), null)
checkEvaluation(ExtractANSIIntervalSeconds(Literal(null, DayTimeIntervalType)), null)
}
}
31 changes: 31 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/extract.sql
Expand Up @@ -128,3 +128,34 @@ select c - i from t;
select year(c - i) from t;
select extract(year from c - i) from t;
select extract(month from to_timestamp(c) - i) from t;

-- extract fields from year-month/day-time intervals
select extract(YEAR from interval '2-1' YEAR TO MONTH);
select date_part('YEAR', interval '2-1' YEAR TO MONTH);
select extract(YEAR from -interval '2-1' YEAR TO MONTH);
select extract(MONTH from interval '2-1' YEAR TO MONTH);
select date_part('MONTH', interval '2-1' YEAR TO MONTH);
select extract(MONTH from -interval '2-1' YEAR TO MONTH);
select date_part(NULL, interval '2-1' YEAR TO MONTH);

-- invalid
select extract(DAY from interval '2-1' YEAR TO MONTH);
select date_part('DAY', interval '2-1' YEAR TO MONTH);
select date_part('not_supported', interval '2-1' YEAR TO MONTH);

select extract(DAY from interval '123 12:34:56.789123123' DAY TO SECOND);
select date_part('DAY', interval '123 12:34:56.789123123' DAY TO SECOND);
select extract(DAY from -interval '123 12:34:56.789123123' DAY TO SECOND);
select extract(HOUR from interval '123 12:34:56.789123123' DAY TO SECOND);
select date_part('HOUR', interval '123 12:34:56.789123123' DAY TO SECOND);
select extract(HOUR from -interval '123 12:34:56.789123123' DAY TO SECOND);
select extract(MINUTE from interval '123 12:34:56.789123123' DAY TO SECOND);
select date_part('MINUTE', interval '123 12:34:56.789123123' DAY TO SECOND);
select extract(MINUTE from -interval '123 12:34:56.789123123' DAY TO SECOND);
select extract(SECOND from interval '123 12:34:56.789123123' DAY TO SECOND);
select date_part('SECOND', interval '123 12:34:56.789123123' DAY TO SECOND);
select extract(SECOND from -interval '123 12:34:56.789123123' DAY TO SECOND);
select date_part(NULL, interval '123 12:34:56.789123123' DAY TO SECOND);

select extract(MONTH from interval '123 12:34:56.789123123' DAY TO SECOND);
select date_part('not_supported', interval '123 12:34:56.789123123' DAY TO SECOND);