From 62e747bbe8f745220c56b6c770b01d52b412fdfc Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 7 Sep 2023 22:31:18 +0530 Subject: [PATCH] MB-58034: Enable datetime parser customization in the date range query object (#1860) - Add a new query DateRangeStringQuery, which takes in the start and end bounds as strings and then parses them using the DateTimeParser set in the DateRangeStringQuery struct, if present and defaults to using the package level QueryDateTimeParser. - ParseQuery, which analyzes a marshalled json to identify the query to run, will now default to DateRangeStringQuery instead, which is preferable because it already has the dates in string format, and unmarshalling to DateRangeQuery will always trigger parsing of the date string with only QueryDateTimeParser, leaving no room for user configuration. --- mapping/index.go | 17 -- search/query/date_range.go | 4 +- search/query/date_range_string.go | 176 ++++++++++++++++++ search/query/query.go | 2 +- search/query/query_test.go | 2 +- search_test.go | 294 ++++++++++++++++++++++++++++++ 6 files changed, 474 insertions(+), 21 deletions(-) create mode 100644 search/query/date_range_string.go diff --git a/mapping/index.go b/mapping/index.go index e2ac99f39..99642bc40 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -417,23 +417,6 @@ func (im *IndexMappingImpl) DateTimeParserNamed(name string) analysis.DateTimePa return dateTimeParser } -func (im *IndexMappingImpl) datetimeParserNameForPath(path string) string { - - // first we look for explicit mapping on the field - for _, docMapping := range im.TypeMapping { - pathMapping, _ := docMapping.documentMappingForPath(path) - if pathMapping != nil { - if len(pathMapping.Fields) > 0 { - if pathMapping.Fields[0].Analyzer != "" { - return pathMapping.Fields[0].Analyzer - } - } - } - } - - return im.DefaultDateTimeParser -} - func (im *IndexMappingImpl) AnalyzeText(analyzerName string, text []byte) (analysis.TokenStream, error) { analyzer, err := 
im.cache.AnalyzerNamed(analyzerName) if err != nil { diff --git a/search/query/date_range.go b/search/query/date_range.go index 34844c976..bbb2a54ef 100644 --- a/search/query/date_range.go +++ b/search/query/date_range.go @@ -30,10 +30,10 @@ import ( index "github.com/blevesearch/bleve_index_api" ) -// QueryDateTimeParser controls the default query date time parser +// QueryDateTimeParser controls the default query date time parser. var QueryDateTimeParser = optional.Name -// QueryDateTimeFormat controls the format when Marshaling to JSON +// QueryDateTimeFormat controls the format when Marshaling to JSON. var QueryDateTimeFormat = time.RFC3339 var cache = registry.NewCache() diff --git a/search/query/date_range_string.go b/search/query/date_range_string.go new file mode 100644 index 000000000..b5e5c1701 --- /dev/null +++ b/search/query/date_range_string.go @@ -0,0 +1,176 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package query + +import ( + "context" + "fmt" + "math" + "time" + + "github.com/blevesearch/bleve/v2/mapping" + "github.com/blevesearch/bleve/v2/numeric" + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/searcher" + index "github.com/blevesearch/bleve_index_api" +) + +// DateRangeStringQuery represents a query for a range of date values. +// Start and End are the range endpoints, as strings. 
+// Start and End are parsed using DateTimeParser, which is a custom date time parser +// defined in the index mapping. If DateTimeParser is not specified, then the +// package-level QueryDateTimeParser is used. +type DateRangeStringQuery struct { + Start string `json:"start,omitempty"` + End string `json:"end,omitempty"` + InclusiveStart *bool `json:"inclusive_start,omitempty"` + InclusiveEnd *bool `json:"inclusive_end,omitempty"` + FieldVal string `json:"field,omitempty"` + BoostVal *Boost `json:"boost,omitempty"` + DateTimeParser string `json:"datetime_parser,omitempty"` +} + +// NewDateRangeStringQuery creates a new Query for ranges +// of date values. +// Date strings are parsed using the DateTimeParser field of the query struct, +// which is a custom date time parser defined in the index mapping. +// If DateTimeParser is not specified, then the +// package-level QueryDateTimeParser is used. +// Either, but not both, endpoints can be empty. +func NewDateRangeStringQuery(start, end string) *DateRangeStringQuery { + return NewDateRangeStringInclusiveQuery(start, end, nil, nil) +} + +// NewDateRangeStringInclusiveQuery creates a new Query for ranges +// of date values. +// Date strings are parsed using the DateTimeParser field of the query struct, +// which is a custom date time parser defined in the index mapping. +// If DateTimeParser is not specified, then the +// package-level QueryDateTimeParser is used. +// Either, but not both, endpoints can be empty. +// startInclusive and endInclusive control inclusion of the endpoints. 
+func NewDateRangeStringInclusiveQuery(start, end string, startInclusive, endInclusive *bool) *DateRangeStringQuery { + return &DateRangeStringQuery{ + Start: start, + End: end, + InclusiveStart: startInclusive, + InclusiveEnd: endInclusive, + } +} + +func (q *DateRangeStringQuery) SetBoost(b float64) { + boost := Boost(b) + q.BoostVal = &boost +} + +func (q *DateRangeStringQuery) Boost() float64 { + return q.BoostVal.Value() +} + +func (q *DateRangeStringQuery) SetField(f string) { + q.FieldVal = f +} + +func (q *DateRangeStringQuery) Field() string { + return q.FieldVal +} + +func (q *DateRangeStringQuery) SetDateTimeParser(d string) { + q.DateTimeParser = d +} + +func (q *DateRangeStringQuery) DateTimeParserName() string { + return q.DateTimeParser +} + +func (q *DateRangeStringQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { + field := q.FieldVal + if q.FieldVal == "" { + field = m.DefaultSearchField() + } + + dateTimeParserName := QueryDateTimeParser + if q.DateTimeParser != "" { + dateTimeParserName = q.DateTimeParser + } + dateTimeParser := m.DateTimeParserNamed(dateTimeParserName) + if dateTimeParser == nil { + return nil, fmt.Errorf("no dateTimeParser named '%s' registered", dateTimeParserName) + } + + var startTime, endTime time.Time + var err error + if q.Start != "" { + startTime, _, err = dateTimeParser.ParseDateTime(q.Start) + if err != nil { + return nil, fmt.Errorf("%v, date time parser name: %s", err, dateTimeParserName) + } + } + if q.End != "" { + endTime, _, err = dateTimeParser.ParseDateTime(q.End) + if err != nil { + return nil, fmt.Errorf("%v, date time parser name: %s", err, dateTimeParserName) + } + } + + min, max, err := q.parseEndpoints(startTime, endTime) + if err != nil { + return nil, err + } + return searcher.NewNumericRangeSearcher(ctx, i, min, max, q.InclusiveStart, q.InclusiveEnd, field, q.BoostVal.Value(), options) +} + +func (q 
*DateRangeStringQuery) parseEndpoints(startTime, endTime time.Time) (*float64, *float64, error) { + min := math.Inf(-1) + max := math.Inf(1) + + if startTime.IsZero() && endTime.IsZero() { + return nil, nil, fmt.Errorf("date range query must specify at least one of start/end") + } + + if !startTime.IsZero() { + if !isDateTimeWithinRange(startTime) { + // overflow + return nil, nil, fmt.Errorf("invalid/unsupported date range, start: %v", q.Start) + } + startInt64 := startTime.UnixNano() + min = numeric.Int64ToFloat64(startInt64) + } + if !endTime.IsZero() { + if !isDateTimeWithinRange(endTime) { + // overflow + return nil, nil, fmt.Errorf("invalid/unsupported date range, end: %v", q.End) + } + endInt64 := endTime.UnixNano() + max = numeric.Int64ToFloat64(endInt64) + } + + return &min, &max, nil +} + +func (q *DateRangeStringQuery) Validate() error { + // either start or end must be specified + if q.Start == "" && q.End == "" { + return fmt.Errorf("date range query must specify at least one of start/end") + } + return nil +} + +func isDateTimeWithinRange(t time.Time) bool { + if t.Before(MinRFC3339CompatibleTime) || t.After(MaxRFC3339CompatibleTime) { + return false + } + return true +} diff --git a/search/query/query.go b/search/query/query.go index a4e0f015a..1ef97ff8a 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -185,7 +185,7 @@ func ParseQuery(input []byte) (Query, error) { _, hasStart := tmp["start"] _, hasEnd := tmp["end"] if hasStart || hasEnd { - var rv DateRangeQuery + var rv DateRangeStringQuery err := json.Unmarshal(input, &rv) if err != nil { return nil, err diff --git a/search/query/query_test.go b/search/query/query_test.go index 0082e8acf..228fb65cd 100644 --- a/search/query/query_test.go +++ b/search/query/query_test.go @@ -176,7 +176,7 @@ func TestParseQuery(t *testing.T) { { input: []byte(`{"start":"` + startDateStr + `","end":"` + endDateStr + `","field":"desc"}`), output: func() Query { - q := NewDateRangeQuery(startDate, 
endDate) + q := NewDateRangeStringQuery(startDateStr, endDateStr) q.SetField("desc") return q }(), diff --git a/search_test.go b/search_test.go index 414c907c8..6ddb861a9 100644 --- a/search_test.go +++ b/search_test.go @@ -2473,3 +2473,297 @@ func TestCustomDateTimeParserLayoutValidation(t *testing.T) { } } } + +func TestDateRangeStringQuery(t *testing.T) { + idxMapping := NewIndexMapping() + + err := idxMapping.AddCustomDateTimeParser("customDT", map[string]interface{}{ + "type": sanitized.Name, + "layouts": []interface{}{ + "02/01/2006 15:04:05", + "2006/01/02 3:04PM", + }, + }) + + if err != nil { + t.Fatal(err) + } + + err = idxMapping.AddCustomDateTimeParser("queryDT", map[string]interface{}{ + "type": sanitized.Name, + "layouts": []interface{}{ + "02/01/2006 3:04PM", + }, + }) + + if err != nil { + t.Fatal(err) + } + + dtmap := NewDateTimeFieldMapping() + dtmap.DateFormat = "customDT" + idxMapping.DefaultMapping.AddFieldMappingsAt("date", dtmap) + + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := New(tmpIndexPath, idxMapping) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + documents := map[string]map[string]interface{}{ + "doc1": { + "date": "2001/08/20 6:00PM", + }, + "doc2": { + "date": "20/08/2001 18:00:20", + }, + "doc3": { + "date": "20/08/2001 18:10:00", + }, + "doc4": { + "date": "2001/08/20 6:15PM", + }, + "doc5": { + "date": "20/08/2001 18:20:00", + }, + } + + batch := idx.NewBatch() + for docID, doc := range documents { + err := batch.Index(docID, doc) + if err != nil { + t.Fatal(err) + } + } + err = idx.Batch(batch) + if err != nil { + t.Fatal(err) + } + + type testResult struct { + docID string // doc ID of the hit + hitField string // fields returned as part of the hit + } + + type testStruct struct { + start string + end string + field string + dateTimeParser string // name of the custom date time parser to use if nil, use 
QueryDateTimeParser + includeStart bool + includeEnd bool + expectedHits []testResult + err error + } + + testQueries := []testStruct{ + // test cases with RFC3339 parser and toggling includeStart and includeEnd + { + start: "2001-08-20T18:00:00", + end: "2001-08-20T18:10:00", + field: "date", + includeStart: true, + includeEnd: true, + expectedHits: []testResult{ + { + docID: "doc1", + hitField: "2001/08/20 6:00PM", + }, + { + docID: "doc2", + hitField: "20/08/2001 18:00:20", + }, + { + docID: "doc3", + hitField: "20/08/2001 18:10:00", + }, + }, + }, + { + start: "2001-08-20T18:00:00", + end: "2001-08-20T18:10:00", + field: "date", + includeStart: false, + includeEnd: true, + expectedHits: []testResult{ + { + docID: "doc2", + hitField: "20/08/2001 18:00:20", + }, + { + docID: "doc3", + hitField: "20/08/2001 18:10:00", + }, + }, + }, + { + start: "2001-08-20T18:00:00", + end: "2001-08-20T18:10:00", + field: "date", + includeStart: false, + includeEnd: false, + expectedHits: []testResult{ + { + docID: "doc2", + hitField: "20/08/2001 18:00:20", + }, + }, + }, + // test cases with custom parser and omitting start and end + { + start: "20/08/2001 18:00:00", + end: "2001/08/20 6:10PM", + field: "date", + dateTimeParser: "customDT", + includeStart: true, + includeEnd: true, + expectedHits: []testResult{ + { + docID: "doc1", + hitField: "2001/08/20 6:00PM", + }, + { + docID: "doc2", + hitField: "20/08/2001 18:00:20", + }, + { + docID: "doc3", + hitField: "20/08/2001 18:10:00", + }, + }, + }, + { + end: "20/08/2001 18:15:00", + field: "date", + dateTimeParser: "customDT", + includeStart: true, + includeEnd: true, + expectedHits: []testResult{ + { + docID: "doc1", + hitField: "2001/08/20 6:00PM", + }, + { + docID: "doc2", + hitField: "20/08/2001 18:00:20", + }, + { + docID: "doc3", + hitField: "20/08/2001 18:10:00", + }, + { + docID: "doc4", + hitField: "2001/08/20 6:15PM", + }, + }, + }, + { + start: "2001/08/20 6:15PM", + field: "date", + dateTimeParser: "customDT", + 
includeStart: true, + includeEnd: true, + expectedHits: []testResult{ + { + docID: "doc4", + hitField: "2001/08/20 6:15PM", + }, + { + docID: "doc5", + hitField: "20/08/2001 18:20:00", + }, + }, + }, + { + start: "20/08/2001 6:15PM", + field: "date", + dateTimeParser: "queryDT", + includeStart: true, + includeEnd: true, + expectedHits: []testResult{ + { + docID: "doc4", + hitField: "2001/08/20 6:15PM", + }, + { + docID: "doc5", + hitField: "20/08/2001 18:20:00", + }, + }, + }, + // error path test cases + { + field: "date", + dateTimeParser: "customDT", + includeStart: true, + includeEnd: true, + err: fmt.Errorf("date range query must specify at least one of start/end"), + }, + { + field: "date", + includeStart: true, + includeEnd: true, + err: fmt.Errorf("date range query must specify at least one of start/end"), + }, + { + start: "2001-08-20T18:00:00", + end: "2001-08-20T18:10:00", + field: "date", + dateTimeParser: "customDT", + err: fmt.Errorf("unable to parse datetime with any of the layouts, date time parser name: customDT"), + }, + { + start: "3001-08-20T18:00:00", + end: "2001-08-20T18:10:00", + field: "date", + err: fmt.Errorf("invalid/unsupported date range, start: 3001-08-20T18:00:00"), + }, + { + start: "2001/08/20 6:00PM", + end: "3001/08/20 6:30PM", + field: "date", + dateTimeParser: "customDT", + err: fmt.Errorf("invalid/unsupported date range, end: 3001/08/20 6:30PM"), + }, + } + + for _, dtq := range testQueries { + var err error + dateQuery := query.NewDateRangeStringInclusiveQuery(dtq.start, dtq.end, &dtq.includeStart, &dtq.includeEnd) + dateQuery.SetDateTimeParser(dtq.dateTimeParser) + dateQuery.SetField(dtq.field) + + sr := NewSearchRequest(dateQuery) + sr.SortBy([]string{dtq.field}) + sr.Fields = []string{dtq.field} + + res, err := idx.Search(sr) + if err != nil { + if dtq.err == nil { + t.Fatalf("expected no error, got: %v", err) + } + if dtq.err.Error() != err.Error() { + t.Fatalf("expected error: %v, got: %v", dtq.err, err) + } + continue + 
} + if len(res.Hits) != len(dtq.expectedHits) { + t.Fatalf("expected %d hits, got %d", len(dtq.expectedHits), len(res.Hits)) + } + for i, hit := range res.Hits { + if hit.ID != dtq.expectedHits[i].docID { + t.Fatalf("expected docID %s, got %s", dtq.expectedHits[i].docID, hit.ID) + } + if hit.Fields[dtq.field].(string) != dtq.expectedHits[i].hitField { + t.Fatalf("expected hit field %s, got %s", dtq.expectedHits[i].hitField, hit.Fields[dtq.field]) + } + } + } +}