feat(#950): indexing keyword fields with textual info (#1197)
* feat(search): indexing keyword fields with textual info

* fix: properties -> fields

(cherry picked from commit ab80cbc)

ci: include es 8.0 in build process (#1286)

* ci: include es 8.0 in build process

* chore: wip

* fix(es-mapping): avoid nested multi-fields in mapping

* fix(search): use id for default sorting

(cherry picked from commit 2eb5276)

fix(#1286): backward comp. sorting by id (#1304)

* fix(search): backward comp. sorting by id

* fix: error normalizing sort

* chore: dockerfile

(cherry picked from commit a3b0552)
frascuchon committed Mar 28, 2022
1 parent 5ac148e commit 70f25a5
Showing 8 changed files with 60 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
-elasticsearch: [ 7.4, 7.10.2, 7.14, 7.16, 7.17 ]
+elasticsearch: [ 7.4, 7.10.2, 7.14, 7.16, 7.17, 8.0 ]
defaults:
run:
shell: bash -l {0}
2 changes: 1 addition & 1 deletion release.Dockerfile
@@ -9,7 +9,7 @@ RUN wget -O /wait-for-it.sh https://raw.githubusercontent.com/vishnubob/wait-for
&& find /packages/*.whl -exec pip install {}[server] \;

# See <https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker#module_name>
-ENV MODULE_NAME="rubrix.server.server"
+ENV MODULE_NAME="rubrix"
ENV VARIABLE_NAME="app"

CMD /wait-for-it.sh $ELASTICSEARCH -- /start.sh
11 changes: 6 additions & 5 deletions src/rubrix/server/commons/es_helpers.py
@@ -29,6 +29,7 @@
TaskStatus,
)
from rubrix.server.tasks.commons.api import EsRecordDataFieldNames
+from rubrix.server.tasks.commons.dao.es_config import mappings

SUPPORTED_LANGUAGES = ["es", "en", "fr", "de"]
DATASETS_RECORDS_INDEX_TEMPLATE = {
@@ -104,19 +105,19 @@


def nested_mappings_from_base_model(model_class: Type[BaseModel]) -> Dict[str, Any]:
-def resolve_type(info):
+def resolve_mapping(info) -> Dict[str, Any]:
the_type = info.get("type")
if the_type == "number":
return "float"
return {"type": "float"}
if the_type == "integer":
return "integer"
return "keyword"
return {"type": "integer"}
return mappings.keyword_field(enable_text_search=True)

return {
"type": "nested",
"include_in_root": True,
"properties": {
key: {"type": resolve_type(info)}
key: resolve_mapping(info)
for key, info in model_class.schema()["properties"].items()
},
}
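For context, a minimal sketch of what this helper now emits for a hypothetical Pydantic model. The class below is illustrative and not part of the repository, the import path is assumed from the file location, and the exact keyword/text layout comes from mappings.keyword_field in es_config.py further down:

from pydantic import BaseModel

from rubrix.server.commons.es_helpers import nested_mappings_from_base_model


class ExampleMetrics(BaseModel):
    # Hypothetical model used only to illustrate the generated mapping
    value: str
    score: float
    length: int


mapping = nested_mappings_from_base_model(ExampleMetrics)
# Roughly:
# {
#     "type": "nested",
#     "include_in_root": True,
#     "properties": {
#         "value": {"type": "keyword", "ignore_above": ..., "fields": {"text": {...}}},
#         "score": {"type": "float"},
#         "length": {"type": "integer"},
#     },
# }
# i.e. string properties become keyword fields with text search enabled instead of
# bare "keyword" types, while numeric properties keep their numeric mappings.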
1 change: 0 additions & 1 deletion src/rubrix/server/commons/es_wrapper.py
@@ -382,7 +382,6 @@ def get_field_mapping(
return {
key: list(definition["mapping"].values())[0]["type"]
for key, definition in response[index]["mappings"].items()
-if not key.endswith(".raw") # Drop raw version of fields
}
except NotFoundError:
# No mapping data
21 changes: 20 additions & 1 deletion src/rubrix/server/tasks/commons/dao/dao.py
@@ -249,11 +249,13 @@ def search_records(
{**(search.aggregations or {})} if compute_aggregations else {}
)

+sort_config = self.__normalize_sort_config__(records_index, sort=search.sort)
+
es_query = {
"_source": {"excludes": exclude_fields or []},
"from": record_from,
"query": search.query or {"match_all": {}},
"sort": search.sort or [{"_id": {"order": "asc"}}],
"sort": sort_config,
"aggs": aggregation_requests,
"highlight": self.__configure_query_highlight__(),
}
@@ -307,6 +309,23 @@ def search_records(

return result

+def __normalize_sort_config__(
+self, index: str, sort: Optional[List[Dict[str, Any]]] = None
+) -> List[Dict[str, Any]]:
+id_field = "id"
+id_keyword_field = "id.keyword"
+sort_config = []
+
+for sort_field in sort or [{id_field: {"order": "asc"}}]:
+for field in sort_field:
+if field == id_field and self._es.get_field_mapping(
+index=index, field_name=id_keyword_field
+):
+sort_config.append({id_keyword_field: sort_field[field]})
+else:
+sort_config.append(sort_field)
+return sort_config
+
def scan_dataset(
self,
dataset: BaseDatasetDB,
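For reference, a standalone sketch of the sort normalization above, with a plain boolean standing in for the self._es.get_field_mapping lookup. Elasticsearch 8.0 disables fielddata on _id by default, so the old default sort on _id no longer works there; the method therefore sorts on id, switches to the id.keyword sub-field whenever the index maps one, and keeps the original sort clause in every other case so older indexes keep working:

from typing import Any, Dict, List, Optional


def normalize_sort(
    sort: Optional[List[Dict[str, Any]]],
    has_id_keyword: bool,  # stands in for self._es.get_field_mapping(index, "id.keyword")
) -> List[Dict[str, Any]]:
    """Rewrite sort clauses on "id" to "id.keyword" when that sub-field is mapped."""
    sort_config = []
    for sort_field in sort or [{"id": {"order": "asc"}}]:
        for field in sort_field:
            if field == "id" and has_id_keyword:
                sort_config.append({"id.keyword": sort_field[field]})
            else:
                sort_config.append(sort_field)
    return sort_config


# normalize_sort(None, has_id_keyword=True)   -> [{"id.keyword": {"order": "asc"}}]
# normalize_sort(None, has_id_keyword=False)  -> [{"id": {"order": "asc"}}]
# normalize_sort([{"score": {"order": "desc"}}], has_id_keyword=True)
#                                              -> [{"score": {"order": "desc"}}]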
33 changes: 25 additions & 8 deletions src/rubrix/server/tasks/commons/dao/es_config.py
@@ -12,21 +12,30 @@

class mappings:
@staticmethod
-def keyword_field():
+def keyword_field(enable_text_search: bool = False):
"""Mappings config for keyword field"""
-return {
+mapping = {
"type": "keyword",
# TODO: Use environment var and align with fields validators
"ignore_above": MAX_KEYWORD_LENGTH,
}
+if enable_text_search:
+text_field = mappings.text_field()
+text_field_fields = text_field.pop("fields", {})
+mapping["fields"] = {"text": text_field, **text_field_fields}
+return mapping

@staticmethod
-def path_match_keyword_template(path: str):
+def path_match_keyword_template(
+path: str, enable_text_search_in_keywords: bool = False
+):
"""Dynamic template mappings config for keyword field based on path match"""
return {
"path_match": path,
"match_mapping_type": "string",
"mapping": mappings.keyword_field(),
"mapping": mappings.keyword_field(
enable_text_search=enable_text_search_in_keywords
),
}

@staticmethod
@@ -130,11 +139,19 @@ def tasks_common_settings():


def dynamic_metrics_text():
return {"metrics.*": mappings.path_match_keyword_template(path="metrics.*")}
return {
"metrics.*": mappings.path_match_keyword_template(
path="metrics.*", enable_text_search_in_keywords=False
)
}


def dynamic_metadata_text():
return {"metadata.*": mappings.path_match_keyword_template(path="metadata.*")}
return {
"metadata.*": mappings.path_match_keyword_template(
path="metadata.*", enable_text_search_in_keywords=True
)
}


def tasks_common_mappings():
@@ -152,8 +169,8 @@ def tasks_common_mappings():
"status": mappings.keyword_field(),
"event_timestamp": {"type": "date"},
"last_updated": {"type": "date"},
"annotated_by": mappings.keyword_field(),
"predicted_by": mappings.keyword_field(),
"annotated_by": mappings.keyword_field(enable_text_search=True),
"predicted_by": mappings.keyword_field(enable_text_search=True),
"metrics": {"dynamic": True, "type": "object"},
"metadata": {"dynamic": True, "type": "object"},
},
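The keyword_field change above is what keeps multi-fields flat: chained multi-fields (a fields section inside another fields section) are deprecated in Elasticsearch 7.x and rejected by 8.0, which is the "avoid nested multi-fields in mapping" fix from the commit message. A small sketch of the merge, with an assumed shape for mappings.text_field() (its real definition is not part of this diff) and 128 standing in for MAX_KEYWORD_LENGTH:

# Assumed return value of mappings.text_field(), for illustration only
text_field = {
    "type": "text",
    "fields": {"exact": {"type": "text", "analyzer": "whitespace"}},
}

mapping = {"type": "keyword", "ignore_above": 128}

# Same merge as keyword_field(enable_text_search=True): the text mapping becomes the
# "text" sub-field and any sub-fields it carried are hoisted up to the same level,
# so "fields" never ends up nested inside "fields".
text_field_fields = text_field.pop("fields", {})
mapping["fields"] = {"text": text_field, **text_field_fields}

# mapping ==
# {
#     "type": "keyword",
#     "ignore_above": 128,
#     "fields": {
#         "text": {"type": "text"},
#         "exact": {"type": "text", "analyzer": "whitespace"},
#     },
# }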
4 changes: 2 additions & 2 deletions src/rubrix/server/tasks/text_classification/dao/es_config.py
@@ -27,8 +27,8 @@ def text_classification_mappings():
},
"predicted": mappings.keyword_field(),
"multi_label": {"type": "boolean"},
"annotated_as": mappings.keyword_field(),
"predicted_as": mappings.keyword_field(),
"annotated_as": mappings.keyword_field(enable_text_search=True),
"predicted_as": mappings.keyword_field(enable_text_search=True),
"score": mappings.decimal_field(),
},
"dynamic_templates": [
10 changes: 5 additions & 5 deletions src/rubrix/server/tasks/token_classification/dao/es_config.py
@@ -19,7 +19,7 @@ def mentions_mappings():

def token_classification_mappings():
metrics_mentions_mappings = nested_mappings_from_base_model(MentionMetrics)
-_mentions_mappings = mentions_mappings()
+_mentions_mappings = mentions_mappings() # TODO: remove
return {
"_source": mappings.source(
excludes=[
@@ -36,11 +36,11 @@ def token_classification_mappings():
),
"properties": {
"predicted": mappings.keyword_field(),
"annotated_as": mappings.keyword_field(),
"predicted_as": mappings.keyword_field(),
"annotated_as": mappings.keyword_field(enable_text_search=True),
"predicted_as": mappings.keyword_field(enable_text_search=True),
"score": {"type": "float"},
"predicted_mentions": _mentions_mappings,
"mentions": _mentions_mappings,
"predicted_mentions": _mentions_mappings, # TODO: remove
"mentions": _mentions_mappings, # TODO: remove
"tokens": mappings.keyword_field(),
# TODO: This must be unified with metrics.py module
"metrics.tokens": nested_mappings_from_base_model(TokenMetrics),
