Skip to content

Commit

Permalink
HIVE-27199: Read TIMESTAMP WITH LOCAL TIME ZONE columns from text fil…
Browse files Browse the repository at this point in the history
…es using custom formats (Stamatis Zampetakis reviewed by Ayush Saxena, John Sherman, Attila Turóczy)

1. Support parsing TimestampTZ using the TimestampParser, which accepts
multiple DateTimeFormatters.
2. Pass timestamp.formats to the Lazy inspector handling
TIMESTAMP WITH LOCAL TIME ZONE and instantiate a TimestampParser.
3. Refactor TimestampTZUtil to allow passing different
DateTimeFormatters.
4. Add tests covering timestamps with 3 different formats (built-in,
plus 2 more not covered by the default).

These changes give more flexibility to users reading timestamps from
text files and also align the way TIMESTAMP and
TIMESTAMP WITH LOCAL TIME ZONE behave when a custom format is provided.

Closes #4170
  • Loading branch information
zabetak committed Apr 18, 2023
1 parent 796c909 commit f6ac8d4
Show file tree
Hide file tree
Showing 8 changed files with 245 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,13 @@ public static TimestampTZ parse(String s) {
}

/**
 * Parses {@code s} into a {@link TimestampTZ} using this class's default
 * {@code FORMATTER}. Convenience overload that delegates to
 * {@link #parse(String, ZoneId, DateTimeFormatter)}.
 *
 * @param s the text to parse
 * @param defaultTimeZone zone forwarded unchanged to the three-argument overload;
 *        presumably used when the text carries no zone of its own (TODO confirm)
 * @return whatever the three-argument overload returns for the default formatter
 */
public static TimestampTZ parse(String s, ZoneId defaultTimeZone) {
return parse(s, defaultTimeZone, FORMATTER);
}

public static TimestampTZ parse(String s, ZoneId defaultTimeZone, DateTimeFormatter formatter) {
// need to handle offsets with a single-digit hour, see JDK-8066806
s = handleSingleDigitHourOffset(s);
TemporalAccessor accessor = FORMATTER.parse(s);
TemporalAccessor accessor = formatter.parse(s);

LocalDate localDate = accessor.query(TemporalQueries.localDate());

Expand Down
19 changes: 19 additions & 0 deletions common/src/java/org/apache/hive/common/util/TimestampParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@

import java.math.BigDecimal;
import java.math.RoundingMode;
import java.time.DateTimeException;
import java.time.Instant;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
Expand All @@ -34,8 +36,12 @@
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Objects;

import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.common.type.TimestampTZ;
import org.apache.hadoop.hive.common.type.TimestampTZUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -199,6 +205,19 @@ public Timestamp parseTimestamp(final String text) {

}

/**
 * Parses the given text into a {@link TimestampTZ}, trying every configured
 * formatter before falling back to the default parsing of
 * {@link TimestampTZUtil#parse(String, ZoneId)}.
 *
 * <p>The formatters are tried in iteration order and the first successful
 * result is returned. A {@link DateTimeException} raised for one formatter is
 * ignored so the next one can be attempted; an exception raised by the final
 * fallback is not caught here and propagates to the caller.
 *
 * @param text the text to parse; must not be {@code null}
 * @param defaultTimeZone zone forwarded unchanged to {@link TimestampTZUtil}
 * @return the parsed value
 * @throws NullPointerException if {@code text} is {@code null}
 */
public TimestampTZ parseTimestamp(String text, ZoneId defaultTimeZone) {
  Objects.requireNonNull(text);
  for (DateTimeFormatter candidate : dtFormatters) {
    final TimestampTZ parsed;
    try {
      parsed = TimestampTZUtil.parse(text, defaultTimeZone, candidate);
    } catch (DateTimeException ignored) {
      // This formatter could not handle the text; move on to the next one.
      continue;
    }
    return parsed;
  }
  // No configured formatter succeeded (or none is configured): use the default.
  return TimestampTZUtil.parse(text, defaultTimeZone);
}


/**
* The goal of this class is to return a timestamp. A timestamp represents a
* single moment (instant) on the time line. However, some strings will not
Expand Down
20 changes: 20 additions & 0 deletions data/files/timestamps_mixed_formats.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
DEFAULT2016-05-03 12:26:34.123456789 Europe/London
DEFAULT2016-05-03 12:26:34.123456789Europe/London
DEFAULT2016-05-03 12:26:34.123456789 GMT+01:00
DEFAULT2016-05-03 12:26:34.123456789GMT+01:00
DEFAULT2016-05-03 12:26:34.123456789 GMT+1:00
DEFAULT2016-05-03 12:26:34.123456789GMT+1:00
DEFAULT2016-05-03 12:26:34.123456789 +01:00
DEFAULT2016-05-03 12:26:34.123456789+01:00
DEFAULT2016-05-03 12:26:34.123456789 +1:00
DEFAULT2016-05-03 12:26:34.123456789
DEFAULT2016-05-03 12:26:34.123450000
DEFAULT2016-05-03 12:26:34.12345
DEFAULT2016-05-03 12:26:34.1
DEFAULT2016-05-03 12:26:34.0
DEFAULT2016-05-03 12:26:34
DEFAULT2016-05-03
FORMAT12016-05-03T12:26:34Europe/London
FORMAT12016-05-03T12:26:34+01:00
FORMAT2May 3 2016 12:26:34
FORMAT2May 03 2016 12:26:34
21 changes: 21 additions & 0 deletions ql/src/test/queries/clientpositive/timestamptz_formats.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
CREATE TABLE timestampltz_formats (
formatid string,
tsval timestamp with local time zone
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe';

LOAD DATA LOCAL INPATH '../../data/files/timestamps_mixed_formats.txt' overwrite into table timestampltz_formats;

SELECT formatid, tsval FROM timestampltz_formats;

ALTER TABLE timestampltz_formats SET SERDEPROPERTIES ("timestamp.formats"="yyyy-MM-dd'T'HH:mm:ssVV");

SELECT formatid, tsval FROM timestampltz_formats;

ALTER TABLE timestampltz_formats SET SERDEPROPERTIES ("timestamp.formats"="MMM d yyyy HH:mm:ss");

SELECT formatid, tsval FROM timestampltz_formats;

ALTER TABLE timestampltz_formats SET SERDEPROPERTIES ("timestamp.formats"="yyyy-MM-dd'T'HH:mm:ssVV,MMM d yyyy HH:mm:ss");

SELECT formatid, tsval FROM timestampltz_formats;
160 changes: 160 additions & 0 deletions ql/src/test/results/clientpositive/llap/timestamptz_formats.q.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
PREHOOK: query: CREATE TABLE timestampltz_formats (
formatid string,
tsval timestamp with local time zone
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@timestampltz_formats
POSTHOOK: query: CREATE TABLE timestampltz_formats (
formatid string,
tsval timestamp with local time zone
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@timestampltz_formats
PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/timestamps_mixed_formats.txt' overwrite into table timestampltz_formats
PREHOOK: type: LOAD
#### A masked pattern was here ####
PREHOOK: Output: default@timestampltz_formats
POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/timestamps_mixed_formats.txt' overwrite into table timestampltz_formats
POSTHOOK: type: LOAD
#### A masked pattern was here ####
POSTHOOK: Output: default@timestampltz_formats
PREHOOK: query: SELECT formatid, tsval FROM timestampltz_formats
PREHOOK: type: QUERY
PREHOOK: Input: default@timestampltz_formats
#### A masked pattern was here ####
POSTHOOK: query: SELECT formatid, tsval FROM timestampltz_formats
POSTHOOK: type: QUERY
POSTHOOK: Input: default@timestampltz_formats
#### A masked pattern was here ####
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 12:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 12:26:34.12345 US/Pacific
DEFAULT 2016-05-03 12:26:34.12345 US/Pacific
DEFAULT 2016-05-03 12:26:34.1 US/Pacific
DEFAULT 2016-05-03 12:26:34.0 US/Pacific
DEFAULT 2016-05-03 12:26:34.0 US/Pacific
DEFAULT 2016-05-03 00:00:00.0 US/Pacific
FORMAT1 NULL
FORMAT1 NULL
FORMAT2 NULL
FORMAT2 NULL
PREHOOK: query: ALTER TABLE timestampltz_formats SET SERDEPROPERTIES ("timestamp.formats"="yyyy-MM-dd'T'HH:mm:ssVV")
PREHOOK: type: ALTERTABLE_SERDEPROPERTIES
PREHOOK: Input: default@timestampltz_formats
PREHOOK: Output: default@timestampltz_formats
POSTHOOK: query: ALTER TABLE timestampltz_formats SET SERDEPROPERTIES ("timestamp.formats"="yyyy-MM-dd'T'HH:mm:ssVV")
POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES
POSTHOOK: Input: default@timestampltz_formats
POSTHOOK: Output: default@timestampltz_formats
PREHOOK: query: SELECT formatid, tsval FROM timestampltz_formats
PREHOOK: type: QUERY
PREHOOK: Input: default@timestampltz_formats
#### A masked pattern was here ####
POSTHOOK: query: SELECT formatid, tsval FROM timestampltz_formats
POSTHOOK: type: QUERY
POSTHOOK: Input: default@timestampltz_formats
#### A masked pattern was here ####
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 12:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 12:26:34.12345 US/Pacific
DEFAULT 2016-05-03 12:26:34.12345 US/Pacific
DEFAULT 2016-05-03 12:26:34.1 US/Pacific
DEFAULT 2016-05-03 12:26:34.0 US/Pacific
DEFAULT 2016-05-03 12:26:34.0 US/Pacific
DEFAULT 2016-05-03 00:00:00.0 US/Pacific
FORMAT1 2016-05-03 04:26:34.0 US/Pacific
FORMAT1 2016-05-03 04:26:34.0 US/Pacific
FORMAT2 NULL
FORMAT2 NULL
PREHOOK: query: ALTER TABLE timestampltz_formats SET SERDEPROPERTIES ("timestamp.formats"="MMM d yyyy HH:mm:ss")
PREHOOK: type: ALTERTABLE_SERDEPROPERTIES
PREHOOK: Input: default@timestampltz_formats
PREHOOK: Output: default@timestampltz_formats
POSTHOOK: query: ALTER TABLE timestampltz_formats SET SERDEPROPERTIES ("timestamp.formats"="MMM d yyyy HH:mm:ss")
POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES
POSTHOOK: Input: default@timestampltz_formats
POSTHOOK: Output: default@timestampltz_formats
PREHOOK: query: SELECT formatid, tsval FROM timestampltz_formats
PREHOOK: type: QUERY
PREHOOK: Input: default@timestampltz_formats
#### A masked pattern was here ####
POSTHOOK: query: SELECT formatid, tsval FROM timestampltz_formats
POSTHOOK: type: QUERY
POSTHOOK: Input: default@timestampltz_formats
#### A masked pattern was here ####
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 12:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 12:26:34.12345 US/Pacific
DEFAULT 2016-05-03 12:26:34.12345 US/Pacific
DEFAULT 2016-05-03 12:26:34.1 US/Pacific
DEFAULT 2016-05-03 12:26:34.0 US/Pacific
DEFAULT 2016-05-03 12:26:34.0 US/Pacific
DEFAULT 2016-05-03 00:00:00.0 US/Pacific
FORMAT1 NULL
FORMAT1 NULL
FORMAT2 2016-05-03 12:26:34.0 US/Pacific
FORMAT2 2016-05-03 12:26:34.0 US/Pacific
PREHOOK: query: ALTER TABLE timestampltz_formats SET SERDEPROPERTIES ("timestamp.formats"="yyyy-MM-dd'T'HH:mm:ssVV,MMM d yyyy HH:mm:ss")
PREHOOK: type: ALTERTABLE_SERDEPROPERTIES
PREHOOK: Input: default@timestampltz_formats
PREHOOK: Output: default@timestampltz_formats
POSTHOOK: query: ALTER TABLE timestampltz_formats SET SERDEPROPERTIES ("timestamp.formats"="yyyy-MM-dd'T'HH:mm:ssVV,MMM d yyyy HH:mm:ss")
POSTHOOK: type: ALTERTABLE_SERDEPROPERTIES
POSTHOOK: Input: default@timestampltz_formats
POSTHOOK: Output: default@timestampltz_formats
PREHOOK: query: SELECT formatid, tsval FROM timestampltz_formats
PREHOOK: type: QUERY
PREHOOK: Input: default@timestampltz_formats
#### A masked pattern was here ####
POSTHOOK: query: SELECT formatid, tsval FROM timestampltz_formats
POSTHOOK: type: QUERY
POSTHOOK: Input: default@timestampltz_formats
#### A masked pattern was here ####
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 04:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 12:26:34.123456789 US/Pacific
DEFAULT 2016-05-03 12:26:34.12345 US/Pacific
DEFAULT 2016-05-03 12:26:34.12345 US/Pacific
DEFAULT 2016-05-03 12:26:34.1 US/Pacific
DEFAULT 2016-05-03 12:26:34.0 US/Pacific
DEFAULT 2016-05-03 12:26:34.0 US/Pacific
DEFAULT 2016-05-03 00:00:00.0 US/Pacific
FORMAT1 2016-05-03 04:26:34.0 US/Pacific
FORMAT1 2016-05-03 04:26:34.0 US/Pacific
FORMAT2 2016-05-03 12:26:34.0 US/Pacific
FORMAT2 2016-05-03 12:26:34.0 US/Pacific
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,10 @@
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.time.DateTimeException;
import java.time.ZoneId;
import java.time.format.DateTimeParseException;

import org.apache.hadoop.hive.common.type.TimestampTZ;
import org.apache.hadoop.hive.common.type.TimestampTZUtil;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.io.TimestampLocalTZWritable;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyTimestampLocalTZObjectInspector;
Expand Down Expand Up @@ -70,10 +69,10 @@ public void init(ByteArrayRef bytes, int start, int length) {
logExceptionMessage(bytes, start, length,
serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME.toUpperCase());
} else {
t = TimestampTZUtil.parse(s, timeZone);
t = getInspector().getParser().parseTimestamp(s, timeZone);
isNull = false;
}
} catch (DateTimeParseException e) {
} catch (DateTimeException e) {
isNull = true;
logExceptionMessage(bytes, start, length, serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME.toUpperCase());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ public static AbstractPrimitiveLazyObjectInspector<?> getLazyObjectInspector(
return getLazyBooleanObjectInspector(lazyParams.isExtendedBooleanLiteral());
case TIMESTAMP:
return getLazyTimestampObjectInspector(lazyParams.getTimestampFormats());
case TIMESTAMPLOCALTZ:
return new LazyTimestampLocalTZObjectInspector((TimestampLocalTZTypeInfo)typeInfo, lazyParams.getTimestampFormats());
default:
return getLazyObjectInspector(typeInfo);
}
Expand All @@ -172,9 +174,6 @@ public static AbstractPrimitiveLazyObjectInspector<?> getLazyObjectInspector(
case DECIMAL:
poi = new LazyHiveDecimalObjectInspector((DecimalTypeInfo)typeInfo);
break;
case TIMESTAMPLOCALTZ:
poi = new LazyTimestampLocalTZObjectInspector((TimestampLocalTZTypeInfo)typeInfo);
break;
default:
throw new RuntimeException(
"Primitive type " + typeInfo.getPrimitiveCategory() + " should not take parameters");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,24 @@
import org.apache.hadoop.hive.serde2.lazy.LazyTimestampLocalTZ;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampLocalTZObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TimestampLocalTZTypeInfo;
import org.apache.hive.common.util.TimestampParser;

import java.util.Collections;
import java.util.List;

public class LazyTimestampLocalTZObjectInspector
extends AbstractPrimitiveLazyObjectInspector<TimestampLocalTZWritable>
implements TimestampLocalTZObjectInspector {

private final TimestampParser parser;

/**
 * Creates an inspector without custom timestamp formats by delegating to the
 * two-argument constructor with an empty format list.
 *
 * @param typeInfo type information passed on to the two-argument constructor
 */
protected LazyTimestampLocalTZObjectInspector(TimestampLocalTZTypeInfo typeInfo) {
this(typeInfo, Collections.emptyList());
}

LazyTimestampLocalTZObjectInspector(TimestampLocalTZTypeInfo typeInfo, List<String> formats) {
super(typeInfo);
this.parser = new TimestampParser(formats == null ? Collections.emptyList() : formats);
}

@Override
Expand All @@ -49,4 +60,8 @@ public TimestampTZ getPrimitiveJavaObject(Object o) {
public Object copyObject(Object o) {
return o == null ? null : new LazyTimestampLocalTZ((LazyTimestampLocalTZ) o);
}

/**
 * Returns the {@link TimestampParser} that was created when this inspector
 * was constructed.
 *
 * @return the parser held by this inspector
 */
public TimestampParser getParser() {
return parser;
}
}

0 comments on commit f6ac8d4

Please sign in to comment.