diff --git a/analysis/datetime/javatime/javatime.go b/analysis/datetime/javatime/javatime.go new file mode 100644 index 000000000..0b525fa16 --- /dev/null +++ b/analysis/datetime/javatime/javatime.go @@ -0,0 +1,240 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package javatime + +import ( + "fmt" + "strings" + "time" + + "github.com/blevesearch/bleve/v2/analysis" + "github.com/blevesearch/bleve/v2/registry" +) + +const Name = "javastyle" + +var textLiteralDelimiter byte = '\'' // single quote + +// Java style date format strings are described in +// https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html +// +// Some format specifiers are not specified in the Go time package, such as: +// - 'V' for timezone name, like 'Europe/Berlin' or 'America/New_York'. +// - 'Q' for quarter of year, like Q3 or 3rd Quarter. +// - 'zzzz' for full name of timezone like "Japan Standard Time" or "Eastern Standard Time". +// - 'O' for localized zone-offset, like GMT+8 or GMT+08:00. +// - '[]' for optional section of the format. +// - 'G' for era, like AD or BC. +// - 'W' for week of month. +// - 'D' for day of year. +// So date strings containing these elements cannot be parsed.
+var timeElementToLayout = map[byte]map[int]string{ + 'M': { + 4: "January", + 3: "Jan", // MMM = short month name + 2: "01", // MM = month of year (2 digits) (01-12) + 1: "1", // M = month of year (1 digit) (1-12) + }, + 'd': { + 2: "02", // dd = day of month (2 digits) (01-31) + 1: "2", // d = day of month (1 digit) (1-31) + }, + 'a': { + 2: "pm", // aa = am/pm marker (lowercase layout) + 1: "PM", // a = AM/PM marker (uppercase layout) + }, + 'H': { + 2: "15", // HH = hour (24 hour clock) (2 digits) + 1: "15", // H = hour (24 hour clock); same layout as HH, Go has no unpadded 24-hour form + }, + 'm': { + 2: "04", // mm = minute (2 digits) + 1: "4", // m = minute (1 digit) + }, + 's': { + 2: "05", // ss = seconds (2 digits) + 1: "5", // s = seconds (1 digit) + }, + + // timezone offsets from UTC below + 'X': { + 5: "Z07:00:00", // XXXXX = timezone offset (+-hh:mm:ss) + 4: "Z070000", // XXXX = timezone offset (+-hhmmss) + 3: "Z07:00", // XXX = timezone offset (+-hh:mm) + 2: "Z0700", // XX = timezone offset (+-hhmm) + 1: "Z07", // X = timezone offset (+-hh) + }, + 'x': { + 5: "-07:00:00", // xxxxx = timezone offset (+-hh:mm:ss) + 4: "-070000", // xxxx = timezone offset (+-hhmmss) + 3: "-07:00", // xxx = timezone offset (+-hh:mm) + 2: "-0700", // xx = timezone offset (+-hhmm) + 1: "-07", // x = timezone offset (+-hh) + }, +} + +type DateTimeParser struct { + layouts []string +} + +func New(layouts []string) *DateTimeParser { + return &DateTimeParser{ + layouts: layouts, + } +} + +func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) { + for _, layout := range p.layouts { + rv, err := time.Parse(layout, input) + if err == nil { + return rv, layout, nil + } + } + return time.Time{}, "", analysis.ErrInvalidDateTime +} + +func letterCounter(layout string, idx int) int { + count := 1 + for idx+count < len(layout) { + if layout[idx+count] == layout[idx] { + count++ + } else { + break + } + } + return count +} + +func invalidFormatError(character byte, count int) error { + return fmt.Errorf("invalid format string, unknown format
specifier: " + strings.Repeat(string(character), count)) +} + +func parseJavaString(layout string) (string, error) { + var dateTimeLayout strings.Builder + + for idx := 0; idx < len(layout); { + // check if the character is a text literal delimiter (') + if layout[idx] == textLiteralDelimiter { + if idx+1 < len(layout) && layout[idx+1] == textLiteralDelimiter { + // if the next character is also a text literal delimiter, then + // copy the character as is + dateTimeLayout.WriteByte(textLiteralDelimiter) + idx += 2 + continue + } + // find the next text literal delimiter + for idx++; idx < len(layout); idx++ { + if layout[idx] == textLiteralDelimiter { + break + } + dateTimeLayout.WriteByte(layout[idx]) + } + // idx can either be equal to len(layout) if the text literal delimiter is not found + // after the first text literal delimiter or it will be equal to the index of the + // second text literal delimiter + if idx == len(layout) { + // text literal delimiter not found error + return "", fmt.Errorf("invalid format string, expected text literal delimiter: " + string(textLiteralDelimiter)) + } + // increment idx to skip the second text literal delimiter + idx++ + continue + } + // check if character is a letter in english alphabet - a-zA-Z which are reserved + // for format specifiers + if (layout[idx] >= 'a' && layout[idx] <= 'z') || (layout[idx] >= 'A' && layout[idx] <= 'Z') { + // find the number of times the character occurs consecutively + count := letterCounter(layout, idx) + character := layout[idx] + // first check the table + if layout, ok := timeElementToLayout[character][count]; ok { + dateTimeLayout.WriteString(layout) + } else { + switch character { + case 'y', 'u', 'Y': + // year: a count of 2 maps to "06"; other counts format "2006" with a minimum width of count + if count == 2 { + dateTimeLayout.WriteString("06") + } else { + format := fmt.Sprintf("%%0%ds", count) + dateTimeLayout.WriteString(fmt.Sprintf(format, "2006")) + } + case 'h', 'K': + // hour (1-12) + if count == 2 { + dateTimeLayout.WriteString("03") + } else if count == 1
{ + dateTimeLayout.WriteString("3") + } else { + return "", invalidFormatError(character, count) + } + case 'E': + // day of week: EEEE = full name, E/EE/EEE = short name + if count == 4 { + dateTimeLayout.WriteString("Monday") + } else if count <= 3 { + dateTimeLayout.WriteString("Mon") + } else { + return "", invalidFormatError(character, count) + } + case 'S': + // fraction of second + if count > 9 { + return "", invalidFormatError(character, count) + } + dateTimeLayout.WriteString(strings.Repeat(string('0'), count)) + case 'z': + // timezone name: 1 to 4 letters all map to the abbreviation layout "MST" + if count < 5 { + dateTimeLayout.WriteString("MST") + } else { + return "", invalidFormatError(character, count) + } + default: + return "", invalidFormatError(character, count) + } + } + idx += count + } else { + // copy the character as is + dateTimeLayout.WriteByte(layout[idx]) + idx++ + } + } + return dateTimeLayout.String(), nil +} + +func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) { + layouts, ok := config["layouts"].([]interface{}) + if !ok { + return nil, fmt.Errorf("must specify layouts") + } + var layoutStrs []string + for _, layout := range layouts { + layoutStr, ok := layout.(string) + if ok { + layout, err := parseJavaString(layoutStr) + if err != nil { + return nil, err + } + layoutStrs = append(layoutStrs, layout) + } + } + return New(layoutStrs), nil +} + +func init() { + registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) +} diff --git a/analysis/datetime/javatime/javatime_test.go b/analysis/datetime/javatime/javatime_test.go new file mode 100644 index 000000000..9195e40b0 --- /dev/null +++ b/analysis/datetime/javatime/javatime_test.go @@ -0,0 +1,88 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package javatime + +import ( + "fmt" + "testing" +) + +func TestConversionFromJavaStyle(t *testing.T) { + tests := []struct { + input string + output string + err error + }{ + { + input: "yyyy-MM-dd", + output: "2006-01-02", + err: nil, + }, + { + input: "uuu/M''''dd'T'HH:m:ss.SSS", + output: "2006/1''02T15:4:05.000", + err: nil, + }, + { + input: "YYYY-MM-dd'T'H:mm:ss zzz", + output: "2006-01-02T15:04:05 MST", + err: nil, + }, + { + input: "MMMM dd yyyy', 'HH:mm:ss.SSS", + output: "January 02 2006, 15:04:05.000", + }, + { + input: "h 'o'''' clock' a, XXX", + output: "3 o' clock PM, Z07:00", + err: nil, + }, + { + input: "YYYY-MM-dd'T'HH:mm:ss'Z'", + output: "2006-01-02T15:04:05Z", + err: nil, + }, + { + input: "E MMM d H:m:s z Y", + output: "Mon Jan 2 15:4:5 MST 2006", + err: nil, + }, + { + input: "E MMM d H:m:s z Y", + output: "Mon Jan 2 15:4:5 MST 2006", + err: nil, + }, + { + input: "E MMM DD H:m:s z Y", + output: "", + err: fmt.Errorf("invalid format string, unknown format specifier: DD"), + }, + { + input: "E MMM''''' H:m:s z Y", + output: "", + err: fmt.Errorf("invalid format string, expected text literal delimiter: '"), + }, + } + for _, test := range tests { + out, err := parseJavaString(test.input) + if err != nil && test.err == nil || err == nil && test.err != nil { + t.Fatalf("expected error %v, got error %v", test.err, err) + } + if out != test.output { + t.Fatalf("expected output %v, got %v", test.output, out) + } + } + +} diff --git a/analysis/datetime/percent/percent.go b/analysis/datetime/percent/percent.go index 
d44091117..39e62ce92 100644 --- a/analysis/datetime/percent/percent.go +++ b/analysis/datetime/percent/percent.go @@ -27,38 +27,47 @@ const Name = "percentstyle" var formatDelimiter byte = '%' -var timezoneSpecifier byte = 'Z' - var formatSpecifierToLayout = map[byte]string{ + // format specifiers as per strftime in the C standard library + // https://man7.org/linux/man-pages/man3/strftime.3.html + formatDelimiter: string(formatDelimiter), - 'd': "2", - 'D': "02", - 'm': "1", - 'M': "01", - 'y': "06", - 'Y': "2006", - 'b': "Jan", - 'B': "January", - 'a': "Mon", - 'A': "Monday", - 'h': "3", - 'H': "03", - 'O': "15", - 'i': "4", - 'I': "04", - 's': "5", - 'S': "05", - 'p': "PM", - 'P': "pm", - 'N': ".999999999", + 'a': "Mon", // %a = short weekday name + 'A': "Monday", // %A = full weekday name + 'd': "02", // %d = day of month (2 digits) (01-31) + 'e': "2", // %e = day of month (1 digit) (1-31) + 'b': "Jan", // %b = short month name + 'B': "January", // %B = full month name + 'm': "01", // %m = month of year (2 digits) (01-12) + 'y': "06", // %y = year without century + 'Y': "2006", // %Y = year with century + 'H': "15", // %H = hour (24 hour clock) (2 digits) + 'I': "03", // %I = hour (12 hour clock) (2 digits) + 'l': "3", // %l = hour (12 hour clock) (1 digit) + 'p': "PM", // %p = PM/AM + 'P': "pm", // %P = pm/am (lowercase) + 'M': "04", // %M = minute (2 digits) + 'S': "05", // %S = seconds (2 digits) + 'f': "999999", // .%f = fraction of seconds - up to microseconds (6 digits) - deci/milli/micro + 'Z': "MST", // %Z = timezone name (GMT, JST, UTC etc) + // %z is present in timezone options + + // some additional specifiers not in strftime, to support things such as + // no 0 padding in month, minute and seconds, nanosecond precision, etc + 'o': "1", // %o = month of year (1 digit) (1-12) + 'i': "4", // %i = minute (1 digit) + 's': "5", // %s = seconds (1 digit) + 'N': "999999999", // .%N = fraction of seconds - up to nanoseconds (9 digits) -
milli/micro/nano } +// some additional options for timezone +// such as allowing colon in timezone offset and specifying the seconds var timezoneOptions = map[string]string{ - "Z:M": "Z07:00", - "Z:S": "Z07:00:00", - "ZH": "Z07", - "ZM": "Z0700", - "ZS": "Z070000", + "z": "Z0700", // %z = timezone offset in +-hhmm / +-(2 digit hour)(2 digit minute) +0500, -0600 etc + "z:M": "Z07:00", // %z:M = timezone offset(+-hh:mm) / +-(2 digit hour):(2 digit minute) +05:00, -06:00 etc + "z:S": "Z07:00:00", // %z:S = timezone offset(+-hh:mm:ss) / +-(2 digit hour):(2 digit minute):(2 digit second) +05:20:00, -06:30:00 etc + "zH": "Z07", // %zH = timezone offset(+-hh) / +-(2 digit hour) +05, -06 etc + "zS": "Z070000", // %zS = timezone offset(+-hhmmss) / +-(2 digit hour)(2 digit minute)(2 digit second) +052000, -063000 etc } type DateTimeParser struct { @@ -71,24 +80,24 @@ func New(layouts []string) *DateTimeParser { } } -func checkTZOptions(formatString string, idx int) (string, int, error) { - key := "Z" - if idx+1 >= len(formatString) { - return "", 0, fmt.Errorf("invalid format string, expected character after " + string(timezoneSpecifier)) - } - if formatString[idx+1] == ':' { - // check if there is a character after the colon - if idx+2 >= len(formatString) { - return "", 0, fmt.Errorf("invalid format string, expected character after colon") +func checkTZOptions(formatString string, idx int) (string, int) { + // idx is pointing to % + // idx + 1 is pointing to z + if idx+2 < len(formatString) { + if formatString[idx+2] == ':' { + // check if there is a character after the colon + if idx+3 < len(formatString) && (formatString[idx+3] == 'M' || formatString[idx+3] == 'S') { + return timezoneOptions[fmt.Sprintf("z:%s", string(formatString[idx+3]))], idx + 4 + } + // %z: alone, or %z: followed by a character other than M or S, detected; return the default layout Z0700 and increment idx by 2 to print : literally + return timezoneOptions["z"], idx + 2 + } else if formatString[idx+2] == 'H' || formatString[idx+2] == 'S' { + // %zH
or %zS detected; return the layouts Z07 / z070000 and increment idx by 2 to point to the next character + // after %zH or %zS + return timezoneOptions[fmt.Sprintf("z%s", string(formatString[idx+2]))], idx + 3 } - key += ":" - idx++ - } - key += string(formatString[idx+1]) - if layout, ok := timezoneOptions[key]; ok { - return layout, idx + 2, nil } - return "", 0, fmt.Errorf("invalid format string, unknown timezone specifier: " + key) + return timezoneOptions["z"], idx + 2 } func parseFormatString(formatString string) (string, error) { @@ -96,9 +105,9 @@ func parseFormatString(formatString string) (string, error) { // iterate over the format string and replace the format specifiers with // the corresponding golang constants for idx := 0; idx < len(formatString); { - // check if the character is a format specifier + // check if the character is a format delimiter (%) if formatString[idx] == formatDelimiter { - // check if there is a character after the format specifier + // check if there is a character after the format delimiter (%) if idx+1 >= len(formatString) { return "", fmt.Errorf("invalid format string, expected character after " + string(formatDelimiter)) } @@ -106,15 +115,11 @@ func parseFormatString(formatString string) (string, error) { if layout, ok := formatSpecifierToLayout[formatSpecifier]; ok { dateTimeLayout.WriteString(layout) idx += 2 - } else if formatSpecifier == timezoneSpecifier { + } else if formatSpecifier == 'z' { // did not find a valid specifier // check if it is for timezone var tzLayout string - var err error - tzLayout, idx, err = checkTZOptions(formatString, idx+1) - if err != nil { - return "", err - } + tzLayout, idx = checkTZOptions(formatString, idx) dateTimeLayout.WriteString(tzLayout) } else { return "", fmt.Errorf("invalid format string, unknown format specifier: " + string(formatSpecifier)) diff --git a/analysis/datetime/percent/percent_test.go b/analysis/datetime/percent/percent_test.go index 560083ff2..9856a7640 100644 ---
a/analysis/datetime/percent/percent_test.go +++ b/analysis/datetime/percent/percent_test.go @@ -27,53 +27,78 @@ func TestConversionFromPercentStyle(t *testing.T) { }{ { input: "%Y-%m-%d", - output: "2006-1-2", + output: "2006-01-02", err: nil, }, { - input: "%Y/%M%%%%%DT%H%i:%S", - output: "2006/01%%02T034:05", + input: "%Y/%m%%%%%dT%H%M:%S", + output: "2006/01%%02T1504:05", err: nil, }, { - input: "%Y-%M-%DT%O:%I:%S%ZM", - output: "2006-01-02T15:04:05Z0700", + input: "%Y-%m-%dT%H:%M:%S %Z%z", + output: "2006-01-02T15:04:05 MSTZ0700", err: nil, }, { - input: "%B %D, %Y %H:%I %P %Z:M", - output: "January 02, 2006 03:04 pm Z07:00", + input: "%B %e, %Y %l:%i %P %z:M", + output: "January 2, 2006 3:4 pm Z07:00", err: nil, }, { - input: "Hour %O Minute %iseconds %S%N Timezone:%Z:S, Weekday %a; Day %D Month %b, Year %y", - output: "Hour 15 Minute 4seconds 05.999999999 Timezone:Z07:00:00, Weekday Mon; Day 02 Month Jan, Year 06", + input: "Hour %H Minute %Mseconds %S.%N Timezone:%Z:S, Weekday %a; Day %d Month %b, Year %y", + output: "Hour 15 Minute 04seconds 05.999999999 Timezone:MST:S, Weekday Mon; Day 02 Month Jan, Year 06", err: nil, }, { - input: "%Y-%M-%D%T%O:%I:%S%ZM", - output: "", - err: fmt.Errorf("invalid format string, unknown format specifier: T"), + input: "%Y-%m-%dT%H:%M:%S.%N", + output: "2006-01-02T15:04:05.999999999", + err: nil, }, { - input: "%Y-%M-%DT%O:%I%S%ZM%", - output: "", - err: fmt.Errorf("invalid format string, invalid format string, expected character after %%"), + input: "%H:%M:%S %Z %z", + output: "15:04:05 MST Z0700", + err: nil, }, { - input: "%Y-%M-%DT%O:%I:%S%Z", - output: "", - err: fmt.Errorf("invalid format string, expected character after Z"), + input: "%H:%M:%S %Z %z:", + output: "15:04:05 MST Z0700:", + err: nil, + }, + { + input: "%H:%M:%S %Z %z:M", + output: "15:04:05 MST Z07:00", + err: nil, }, { - input: "%Y-%M-%DT%O:%I:%S%Z:", + input: "%H:%M:%S %Z %z:A", + output: "15:04:05 MST Z0700:A", + err: nil, + }, + { + input: "%H:%M:%S 
%Z %zM", + output: "15:04:05 MST Z0700M", + err: nil, + }, + { + input: "%H:%M:%S %Z %zS", + output: "15:04:05 MST Z070000", + err: nil, + }, + { + input: "%H:%M:%S %Z %z%Z %zS%z:%zH", + output: "15:04:05 MST Z0700MST Z070000Z0700:Z07", + err: nil, + }, + { + input: "%Y-%m-%d%T%H:%M:%S %ZM", output: "", - err: fmt.Errorf("invalid format string, expected character after colon"), + err: fmt.Errorf("invalid format string, unknown format specifier: T"), }, { - input: "%O:%I:%S%Z%H:%M:%S", + input: "%Y-%m-%dT%H:%M:%S %ZM%", output: "", - err: fmt.Errorf("invalid format string, unknown timezone specifier: Z%%"), + err: fmt.Errorf("invalid format string, invalid format string, expected character after %%"), }, } for _, test := range tests { diff --git a/config/config.go b/config/config.go index 2f6df4f4d..acd2cbeaa 100644 --- a/config/config.go +++ b/config/config.go @@ -70,7 +70,9 @@ import ( // date time parsers _ "github.com/blevesearch/bleve/v2/analysis/datetime/flexible" + _ "github.com/blevesearch/bleve/v2/analysis/datetime/javatime" _ "github.com/blevesearch/bleve/v2/analysis/datetime/optional" + _ "github.com/blevesearch/bleve/v2/analysis/datetime/percent" _ "github.com/blevesearch/bleve/v2/analysis/datetime/sanitized" _ "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds" _ "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds"