Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MB-31210: Support Fuzzy MatchPhrase, MultiPhrase and Phrase queries #1847

Merged
merged 8 commits into from Aug 24, 2023
12 changes: 12 additions & 0 deletions search/query/match_phrase.go
Expand Up @@ -29,6 +29,8 @@ type MatchPhraseQuery struct {
FieldVal string `json:"field,omitempty"`
Analyzer string `json:"analyzer,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Prefix int `json:"prefix_length"`
Fuzziness int `json:"fuzziness"`
}

// NewMatchPhraseQuery creates a new Query object
Expand Down Expand Up @@ -58,6 +60,14 @@ func (q *MatchPhraseQuery) SetField(f string) {
q.FieldVal = f
}

// SetFuzziness stores f in the query's Fuzziness field (JSON key
// "fuzziness"). Searcher copies this value onto the multi-phrase query it
// builds from the analyzed phrase.
func (q *MatchPhraseQuery) SetFuzziness(f int) {
q.Fuzziness = f
}

// SetPrefix stores p in the query's Prefix field (JSON key "prefix_length").
// Searcher copies this value onto the multi-phrase query it builds from the
// analyzed phrase.
func (q *MatchPhraseQuery) SetPrefix(p int) {
q.Prefix = p
}

// Field returns the field name currently stored in FieldVal.
func (q *MatchPhraseQuery) Field() string {
return q.FieldVal
}
Expand All @@ -84,6 +94,8 @@ func (q *MatchPhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m
phrase := tokenStreamToPhrase(tokens)
phraseQuery := NewMultiPhraseQuery(phrase, field)
phraseQuery.SetBoost(q.BoostVal.Value())
phraseQuery.SetFuzziness(q.Fuzziness)
phraseQuery.SetPrefix(q.Prefix)
return phraseQuery.Searcher(ctx, i, m, options)
}
noneQuery := NewMatchNoneQuery()
Expand Down
20 changes: 16 additions & 4 deletions search/query/multi_phrase.go
Expand Up @@ -26,9 +26,11 @@ import (
)

type MultiPhraseQuery struct {
Terms [][]string `json:"terms"`
Field string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Terms [][]string `json:"terms"`
Field string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Fuzziness int `json:"fuzziness"`
Prefix int `json:"prefix_length"`
CascadingRadium marked this conversation as resolved.
Show resolved Hide resolved
}

// NewMultiPhraseQuery creates a new Query for finding
Expand All @@ -47,6 +49,14 @@ func NewMultiPhraseQuery(terms [][]string, field string) *MultiPhraseQuery {
}
}

// SetFuzziness stores f in the query's Fuzziness field (JSON key
// "fuzziness"). Searcher passes the value to
// searcher.NewMultiPhraseSearcher.
func (q *MultiPhraseQuery) SetFuzziness(f int) {
q.Fuzziness = f
}

// SetPrefix stores p in the query's Prefix field (JSON key "prefix_length").
// Searcher passes the value to searcher.NewMultiPhraseSearcher.
func (q *MultiPhraseQuery) SetPrefix(p int) {
q.Prefix = p
}

func (q *MultiPhraseQuery) SetBoost(b float64) {
boost := Boost(b)
q.BoostVal = &boost
Expand All @@ -57,7 +67,7 @@ func (q *MultiPhraseQuery) Boost() float64 {
}

func (q *MultiPhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
return searcher.NewMultiPhraseSearcher(ctx, i, q.Terms, q.Field, options)
return searcher.NewMultiPhraseSearcher(ctx, i, q.Terms, q.Prefix, q.Fuzziness, q.Field, q.BoostVal.Value(), options)
}

func (q *MultiPhraseQuery) Validate() error {
Expand All @@ -77,5 +87,7 @@ func (q *MultiPhraseQuery) UnmarshalJSON(data []byte) error {
q.Terms = tmp.Terms
q.Field = tmp.Field
q.BoostVal = tmp.BoostVal
q.Fuzziness = tmp.Fuzziness
q.Prefix = tmp.Prefix
return nil
}
20 changes: 16 additions & 4 deletions search/query/phrase.go
Expand Up @@ -26,9 +26,11 @@ import (
)

type PhraseQuery struct {
Terms []string `json:"terms"`
Field string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Terms []string `json:"terms"`
Field string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Fuzziness int `json:"fuzziness"`
Prefix int `json:"prefix_length"`
}

// NewPhraseQuery creates a new Query for finding
Expand All @@ -49,12 +51,20 @@ func (q *PhraseQuery) SetBoost(b float64) {
q.BoostVal = &boost
}

// SetFuzziness stores f in the query's Fuzziness field (JSON key
// "fuzziness"). Searcher passes the value to searcher.NewPhraseSearcher.
func (q *PhraseQuery) SetFuzziness(f int) {
q.Fuzziness = f
}

// SetPrefix stores p in the query's Prefix field (JSON key "prefix_length").
// Searcher passes the value to searcher.NewPhraseSearcher.
func (q *PhraseQuery) SetPrefix(p int) {
q.Prefix = p
}

// Boost returns the boost for this query as reported by q.BoostVal.Value().
func (q *PhraseQuery) Boost() float64 {
return q.BoostVal.Value()
}

func (q *PhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
return searcher.NewPhraseSearcher(ctx, i, q.Terms, q.Field, options)
return searcher.NewPhraseSearcher(ctx, i, q.Terms, q.Prefix, q.Fuzziness, q.Field, q.BoostVal.Value(), options)
}

func (q *PhraseQuery) Validate() error {
Expand All @@ -74,5 +84,7 @@ func (q *PhraseQuery) UnmarshalJSON(data []byte) error {
q.Terms = tmp.Terms
q.Field = tmp.Field
q.BoostVal = tmp.BoostVal
q.Fuzziness = tmp.Fuzziness
q.Prefix = tmp.Prefix
return nil
}
48 changes: 24 additions & 24 deletions search/query/query.go
Expand Up @@ -72,25 +72,18 @@ func ParseQuery(input []byte) (Query, error) {
if err != nil {
return nil, err
}
_, isMatchQuery := tmp["match"]
_, hasFuzziness := tmp["fuzziness"]
if hasFuzziness && !isMatchQuery {
_, isMatchQuery := tmp["match"]
_, isMatchPhraseQuery := tmp["match_phrase"]
_, hasTerms := tmp["terms"]
if hasFuzziness && !isMatchQuery && !isMatchPhraseQuery && !hasTerms {
var rv FuzzyQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
_, isTermQuery := tmp["term"]
if isTermQuery {
var rv TermQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
if isMatchQuery {
var rv MatchQuery
err := json.Unmarshal(input, &rv)
Expand All @@ -99,7 +92,6 @@ func ParseQuery(input []byte) (Query, error) {
}
return &rv, nil
}
_, isMatchPhraseQuery := tmp["match_phrase"]
if isMatchPhraseQuery {
var rv MatchPhraseQuery
err := json.Unmarshal(input, &rv)
Expand All @@ -108,18 +100,6 @@ func ParseQuery(input []byte) (Query, error) {
}
return &rv, nil
}
_, hasMust := tmp["must"]
_, hasShould := tmp["should"]
_, hasMustNot := tmp["must_not"]
if hasMust || hasShould || hasMustNot {
var rv BooleanQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
_, hasTerms := tmp["terms"]
if hasTerms {
var rv PhraseQuery
err := json.Unmarshal(input, &rv)
Expand All @@ -134,6 +114,26 @@ func ParseQuery(input []byte) (Query, error) {
}
return &rv, nil
}
_, isTermQuery := tmp["term"]
if isTermQuery {
var rv TermQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
_, hasMust := tmp["must"]
_, hasShould := tmp["should"]
_, hasMustNot := tmp["must_not"]
if hasMust || hasShould || hasMustNot {
var rv BooleanQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
_, hasConjuncts := tmp["conjuncts"]
if hasConjuncts {
var rv ConjunctionQuery
Expand Down
4 changes: 4 additions & 0 deletions search/searcher/search_fuzzy.go
Expand Up @@ -61,6 +61,10 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
if ctx != nil {
reportIOStats(ctx, dictBytesRead)
search.RecordSearchCost(ctx, search.AddM, dictBytesRead)
fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
if fuzzyTermMatches != nil {
fuzzyTermMatches.(map[string][]string)[term] = candidates
}
}

return NewMultiTermSearcher(ctx, indexReader, candidates, field,
Expand Down
93 changes: 74 additions & 19 deletions search/searcher/search_phrase.go
Expand Up @@ -41,6 +41,8 @@ type PhraseSearcher struct {
paths []phrasePath
locations []search.Location
initialized bool
// map a term to a list of fuzzy terms that match it
fuzzyTermMatches map[string][]string
}

func (s *PhraseSearcher) Size() int {
Expand All @@ -64,22 +66,39 @@ func (s *PhraseSearcher) Size() int {
return sizeInBytes
}

func NewPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms []string, field string, options search.SearcherOptions) (*PhraseSearcher, error) {
func NewPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms []string,
prefix, fuzziness int, field string, boost float64, options search.SearcherOptions) (*PhraseSearcher, error) {

// turn flat terms []string into [][]string
mterms := make([][]string, len(terms))
for i, term := range terms {
mterms[i] = []string{term}
}
return NewMultiPhraseSearcher(ctx, indexReader, mterms, field, options)
return NewMultiPhraseSearcher(ctx, indexReader, mterms, prefix, fuzziness, field, boost, options)
}

func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms [][]string, field string, options search.SearcherOptions) (*PhraseSearcher, error) {
func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms [][]string,
prefix, fuzziness int, field string, boost float64, options search.SearcherOptions) (*PhraseSearcher, error) {

options.IncludeTermVectors = true
var termPositionSearchers []search.Searcher
var err error
var ts search.Searcher
var fuzzyTermMatches map[string][]string
if fuzziness > 0 {
fuzzyTermMatches = make(map[string][]string)
ctx = context.WithValue(ctx, search.FuzzyMatchPhraseKey, fuzzyTermMatches)
}
for _, termPos := range terms {
if len(termPos) == 1 && termPos[0] != "" {
// single term
ts, err := NewTermSearcher(ctx, indexReader, termPos[0], field, 1.0, options)
if fuzziness > 0 {
// fuzzy
ts, err = NewFuzzySearcher(ctx, indexReader, termPos[0], prefix, fuzziness, field, boost, options)
} else {
// non-fuzzy
ts, err = NewTermSearcher(ctx, indexReader, termPos[0], field, boost, options)
}
if err != nil {
// close any searchers already opened
for _, ts := range termPositionSearchers {
Expand All @@ -95,7 +114,13 @@ func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader,
if term == "" {
continue
}
ts, err := NewTermSearcher(ctx, indexReader, term, field, 1.0, options)
if fuzziness > 0 {
// fuzzy
ts, err = NewFuzzySearcher(ctx, indexReader, term, prefix, fuzziness, field, boost, options)
} else {
// non-fuzzy
ts, err = NewTermSearcher(ctx, indexReader, term, field, boost, options)
}
if err != nil {
// close any searchers already opened
for _, ts := range termPositionSearchers {
Expand Down Expand Up @@ -128,8 +153,9 @@ func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader,

// build our searcher
rv := PhraseSearcher{
mustSearcher: mustSearcher,
terms: terms,
mustSearcher: mustSearcher,
terms: terms,
fuzzyTermMatches: fuzzyTermMatches,
}
rv.computeQueryNorm()
return &rv, nil
Expand Down Expand Up @@ -213,7 +239,7 @@ func (s *PhraseSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch,

// checkCurrMustMatch is solely concerned with determining if the DocumentMatch
// pointed to by s.currMust (which satisfies the pre-condition searcher)
// also satisfies the phase constraints. if so, it returns a DocumentMatch
// also satisfies the phrase constraints. if so, it returns a DocumentMatch
// for this document, otherwise nil
func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.DocumentMatch {
s.locations = s.currMust.Complete(s.locations)
Expand Down Expand Up @@ -244,7 +270,7 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D

// checkCurrMustMatchField is solely concerned with determining if one
// particular field within the currMust DocumentMatch Locations
// satisfies the phase constraints (possibly more than once). if so,
// satisfies the phrase constraints (possibly more than once). if so,
// the matching field term locations are appended to the provided
// slice
func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext,
Expand All @@ -253,7 +279,21 @@ func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext,
if s.path == nil {
s.path = make(phrasePath, 0, len(s.terms))
}
s.paths = findPhrasePaths(0, nil, s.terms, tlm, s.path[:0], 0, s.paths[:0])
var tlmPtr *search.TermLocationMap = &tlm
if s.fuzzyTermMatches != nil {
// if fuzzy search, we need to expand the tlm to include all the fuzzy matches
// Example - term is "foo" and fuzzy matches are "foo", "fool", "food"
// the non expanded tlm will be:
// foo -> Locations[foo]
// fool -> Locations[fool]
// food -> Locations[food]
// the expanded tlm will be:
// foo -> [Locations[foo], Locations[fool], Locations[food]]
expandedTlm := make(search.TermLocationMap)
s.expandFuzzyMatches(tlm, expandedTlm)
tlmPtr = &expandedTlm
}
s.paths = findPhrasePaths(0, nil, s.terms, *tlmPtr, s.path[:0], 0, s.paths[:0])
for _, p := range s.paths {
for _, pp := range p {
ftls = append(ftls, search.FieldTermLocation{
Expand All @@ -271,6 +311,16 @@ func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext,
return ftls
}

// expandFuzzyMatches fills expandedTlm with one entry per query term recorded
// in s.fuzzyTermMatches. Each entry holds the locations tlm has for the term
// itself, followed by the locations of every fuzzy candidate recorded for
// that term, so a lookup keyed by the original query term sees all of its
// fuzzy matches.
//
// tlm is only read. expandedTlm must be non-nil; it is written to.
func (s *PhraseSearcher) expandFuzzyMatches(tlm search.TermLocationMap, expandedTlm search.TermLocationMap) {
	for term, fuzzyMatches := range s.fuzzyTermMatches {
		// Cap the slice at its length so the appends below always copy into
		// a fresh backing array rather than writing into spare capacity of
		// the slice that is still owned by tlm.
		locations := tlm[term]
		locations = locations[:len(locations):len(locations)]
		for _, fuzzyMatch := range fuzzyMatches {
			if fuzzyMatch == term {
				// The candidate list may contain the term itself; its
				// locations are already present via tlm[term], so adding
				// them again would produce duplicate phrase paths.
				continue
			}
			locations = append(locations, tlm[fuzzyMatch]...)
		}
		expandedTlm[term] = locations
	}
}

type phrasePart struct {
term string
loc *search.Location
Expand Down Expand Up @@ -300,26 +350,31 @@ func (p phrasePath) String() string {
return rv
}

// findPhrasePaths is a function to identify phase matches from a set
// findPhrasePaths is a function to identify phrase matches from a set
// of known term locations. it is recursive so care must be taken with
// arguments and return values.
//
// prevPos - the previous location, 0 on first invocation
//
// ap - array positions of the first candidate phrase part to
// which further recursive phrase parts must match,
// nil on initial invocation or when there are no array positions
// which further recursive phrase parts must match,
// nil on initial invocation or when there are no array positions
//
// phraseTerms - slice containing the phrase terms,
// may contain empty string as placeholder (don't care)
// may contain empty string as placeholder (don't care)
//
// tlm - the Term Location Map containing all relevant term locations
//
// p - the current path being explored (appended to in recursive calls)
// this is the primary state being built during the traversal
// this is the primary state being built during the traversal
//
// remainingSlop - amount of sloppiness that's allowed, which is the
// sum of the editDistances from each matching phrase part,
// where 0 means no sloppiness allowed (all editDistances must be 0),
// decremented during recursion
// sum of the editDistances from each matching phrase part, where 0 means no
// sloppiness allowed (all editDistances must be 0), decremented during recursion
//
// rv - the final result being appended to by all the recursive calls
//
// returns slice of paths, or nil if invocation did not find any successul paths
// returns slice of paths, or nil if invocation did not find any successful paths
func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string,
tlm search.TermLocationMap, p phrasePath, remainingSlop int, rv []phrasePath) []phrasePath {
// no more terms
Expand Down