Skip to content

Commit

Permalink
mango: correct text index selection for queries with $regex (#4458)
Browse files Browse the repository at this point in the history
* mango: Remove unused `op_insert`

The `op_insert` elements in the abstract representation of the
translated Lucene queries do not seem to be produced anywhere in
the code.  This might have been left over a while ago, and now
retire it.

* mango: Remove unused directory include

* mango: Equip text index selection with tests, specs, and docs

- Add specifications for the important functions that play some
  role in the text index selection.  This would help to understand
  the implicit contracts around them and the associated data flow.

- Introduce `test_utils:as_selector/1` to make it easier to build
  valid Mango selectors for testing.  On the top level, it uses
  Erlang maps to ensure the structural consistency of the input
  (selectors are JSON objects that can be considered maps).  Maps
  are then validated and normalized by `jiffy` and Mango's internal
  normalization rules for selectors for additional correctness,
  they eventually become embedded JSON objects.  This facilities
  writing better unit tests that are closer to the real-world use.
  At the same time, it comes with a dependency on these tools and
  their misbehavior can cause test failures.

- Add unit tests for the major functions that contribute to the
  index selection logic and boost the test coverage of the
  `mango_idx_text` and `mango_selector_text` modules.  That is
  important because running integration tests on a higher level
  requires a working Clouseau instance, which may not always be
  available.  With these unit tests in place, changes in the code
  can be tracked easily.  Also, the test cases can aid the reader
  to get a better understanding of the assumed behavior.

- Explain the purpose of `mango_idx_text:is_usable/3` as this is
  not trivial to catch at the first sight.  Thanks @mikerhodes for
  providing the input.

* mango: Refactor index selection tests

* mango: Correct text index selection for `$regex`

For the `$regex` operator, text indexes can be overly permissive
which can cause that they are selected even if they could not
serve the corresponding query.  Rework the interpreteration of
`$regex` to avoid such problems.
  • Loading branch information
pgj committed Mar 10, 2023
1 parent 8ec7b57 commit f40147b
Show file tree
Hide file tree
Showing 6 changed files with 666 additions and 22 deletions.
6 changes: 6 additions & 0 deletions src/couch/src/test_util.erl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

-export([shuffle/1]).

-export([as_selector/1]).

-include_lib("couch/include/couch_eunit.hrl").
-include_lib("couch/include/couch_db.hrl").
-include("couch_db_int.hrl").
Expand Down Expand Up @@ -488,3 +490,7 @@ shuffle(List) ->
Paired = [{couch_rand:uniform(), I} || I <- List],
Sorted = lists:sort(Paired),
[I || {_, I} <- Sorted].

%% Create a valid Mango selector from an Erlang map.
as_selector(Map) ->
mango_selector:normalize(jiffy:decode(jiffy:encode(Map))).
10 changes: 10 additions & 0 deletions src/mango/src/mango.hrl
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,13 @@
% the License.

-define(MANGO_ERROR(R), throw({mango_error, ?MODULE, R})).

-type abstract_text_selector() :: {'op_and', [abstract_text_selector()]}
| {'op_or', [abstract_text_selector()]}
| {'op_not', {abstract_text_selector(), abstract_text_selector()}}
| {'op_not', {_, 'false'}}
| {'op_field', {iolist() | binary(), _}}
| {'op_fieldname', {_, _}}
| {'op_null', {_, _}}
| {'op_default', _}
| {'op_regex', binary()}.
137 changes: 135 additions & 2 deletions src/mango/src/mango_idx_text.erl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
-include("mango.hrl").
-include("mango_idx.hrl").

-ifdef(TEST).
-import(test_util, [as_selector/1]).
-endif.

validate_new(#idx{} = Idx, Db) ->
{ok, Def} = do_validate(Idx#idx.def),
maybe_reject_index_all_req(Def, Db),
Expand Down Expand Up @@ -127,6 +131,14 @@ columns(Idx) ->
)
end.

% Mind that `is_usable/3` is not about "what fields can be answered by
% the index" but instead more along the lines of "this index will
% ensure that all the documents that should be returned for the query
% will be, because we checked that all the bits of the query that
% imply `$exists` for a field are used when we check that the indexing
% process will have included all the relevant documents in the index".
-spec is_usable(#idx{}, SelectorObject, _) -> boolean() when
SelectorObject :: any().
is_usable(_, Selector, _) when Selector =:= {[]} ->
false;
is_usable(Idx, Selector, _) ->
Expand Down Expand Up @@ -303,10 +315,15 @@ construct_analyzer({Props}) ->
]}
end.

-spec indexable_fields(SelectorObject) -> Fields when
SelectorObject :: any(),
Fields :: [binary()].
indexable_fields(Selector) ->
TupleTree = mango_selector_text:convert([], Selector),
indexable_fields([], TupleTree).

-spec indexable_fields(Fields, abstract_text_selector()) -> Fields when
Fields :: [binary()].
indexable_fields(Fields, {op_and, Args}) when is_list(Args) ->
lists:foldl(
fun(Arg, Fields0) -> indexable_fields(Fields0, Arg) end,
Expand Down Expand Up @@ -344,8 +361,6 @@ indexable_fields(Fields, {op_not, {ExistsQuery, Arg}}) when is_tuple(Arg) ->
% forces "$exists" : false to use _all_docs
indexable_fields(_, {op_not, {_, false}}) ->
[];
indexable_fields(Fields, {op_insert, Arg}) when is_binary(Arg) ->
Fields;
%% fieldname.[]:length is not a user defined field.
indexable_fields(Fields, {op_field, {[_, <<":length">>], _}}) ->
Fields;
Expand All @@ -360,6 +375,11 @@ indexable_fields(Fields, {op_fieldname, {_, _}}) ->
%% Similar idea to op_fieldname but with fieldname:null
indexable_fields(Fields, {op_null, {_, _}}) ->
Fields;
%% Regular expression matching should be an exception to the rule
%% above because the type of the associated field is exact, it must be
%% a string.
indexable_fields(Fields, {op_regex, Name}) ->
[iolist_to_binary([Name, ":string"]) | Fields];
indexable_fields(Fields, {op_default, _}) ->
[<<"$default">> | Fields].

Expand Down Expand Up @@ -456,4 +476,117 @@ warn_index_all({Idx, Db}) ->
?assertThrow({test_error, logged_warning}, validate_new(Idx, Db))
end).

indexable_fields_test() ->
?assertEqual(
[<<"$default">>, <<"field1:boolean">>, <<"field2:number">>, <<"field3:string">>],
indexable_fields(
as_selector(
#{
<<"$default">> => #{<<"$text">> => <<"text">>},
<<"field1">> => true,
<<"field2">> => 42,
<<"field3">> => #{<<"$regex">> => <<".*">>}
}
)
)
),
?assertEqual(
[<<"f1:string">>, <<"f2:string">>, <<"f3:string">>, <<"f4:string">>, <<"f5:string">>],
lists:sort(
indexable_fields(
as_selector(
#{
<<"$and">> =>
[
#{<<"f1">> => <<"v1">>},
#{<<"f2">> => <<"v2">>}
],
<<"$or">> =>
[
#{<<"f3">> => <<"v3">>},
#{<<"f4">> => <<"v4">>}
],
<<"$not">> => #{<<"f5">> => <<"v5">>}
}
)
)
)
),

?assertEqual(
[],
indexable_fields(
as_selector(
#{
<<"field1">> => null,
<<"field2">> => #{<<"$size">> => 3},
<<"field3">> => #{<<"$type">> => <<"type">>}
}
)
)
),
?assertEqual(
[],
indexable_fields(
as_selector(
#{
<<"$and">> =>
[
#{<<"f1">> => null},
#{<<"f2">> => null}
],
<<"$or">> =>
[
#{<<"f3">> => null},
#{<<"f4">> => null}
],
<<"$not">> => #{<<"f5">> => null}
}
)
)
).

is_usable_test() ->
?assertNot(is_usable(undefined, {[]}, undefined)),

AllFieldsIndex = #idx{def = {[{<<"fields">>, <<"all_fields">>}]}},
?assert(is_usable(AllFieldsIndex, undefined, undefined)),

Field1 = {[{<<"name">>, <<"field1">>}, {<<"type">>, <<"string">>}]},
Field2 = {[{<<"name">>, <<"field2">>}, {<<"type">>, <<"number">>}]},
Index = #idx{def = {[{<<"fields">>, [Field1, Field2]}]}},
?assert(is_usable(Index, as_selector(#{<<"field1">> => <<"value">>}), undefined)),
?assertNot(is_usable(Index, as_selector(#{<<"field1">> => 42}), undefined)),
?assertNot(is_usable(Index, as_selector(#{<<"field3">> => true}), undefined)),
?assert(
is_usable(Index, as_selector(#{<<"field1">> => #{<<"$type">> => <<"string">>}}), undefined)
),
?assert(
is_usable(Index, as_selector(#{<<"field1">> => #{<<"$type">> => <<"boolean">>}}), undefined)
),
?assert(
is_usable(Index, as_selector(#{<<"field3">> => #{<<"$type">> => <<"boolean">>}}), undefined)
),
?assert(is_usable(Index, as_selector(#{<<"field1">> => #{<<"$exists">> => true}}), undefined)),
?assert(is_usable(Index, as_selector(#{<<"field1">> => #{<<"$exists">> => false}}), undefined)),
?assert(is_usable(Index, as_selector(#{<<"field3">> => #{<<"$exists">> => true}}), undefined)),
?assert(is_usable(Index, as_selector(#{<<"field3">> => #{<<"$exists">> => false}}), undefined)),
?assert(
is_usable(Index, as_selector(#{<<"field1">> => #{<<"$regex">> => <<".*">>}}), undefined)
),
?assertNot(
is_usable(Index, as_selector(#{<<"field2">> => #{<<"$regex">> => <<".*">>}}), undefined)
),
?assertNot(
is_usable(Index, as_selector(#{<<"field3">> => #{<<"$regex">> => <<".*">>}}), undefined)
),
?assertNot(
is_usable(Index, as_selector(#{<<"field1">> => #{<<"$nin">> => [1, 2, 3]}}), undefined)
),
?assert(
is_usable(Index, as_selector(#{<<"field2">> => #{<<"$nin">> => [1, 2, 3]}}), undefined)
),
?assertNot(
is_usable(Index, as_selector(#{<<"field3">> => #{<<"$nin">> => [1, 2, 3]}}), undefined)
).
-endif.

0 comments on commit f40147b

Please sign in to comment.