feat(#950): indexing keyword fields with textual info (#1197)
* feat(search): indexing keyword fields with textual info

* fix: properties -> fields

(cherry picked from commit ab80cbc)

ci: include es 8.0 in build process (#1286)

* ci: include es 8.0 in build process

* chore: wip

* fix(es-mapping): avoid nested multi-fields in mapping

* fix(search): use id for default sorting

(cherry picked from commit 2eb5276)

fix(#1286): backward comp. sorting by id (#1304)

* fix(search): backward comp. sorting by id

* fix: error normalizing sort

* chore: dockerfile

(cherry picked from commit a3b0552)
frascuchon committed Mar 28, 2022
1 parent 5ac148e commit 70f25a5
Showing 8 changed files with 60 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
-elasticsearch: [ 7.4, 7.10.2, 7.14, 7.16, 7.17 ]
+elasticsearch: [ 7.4, 7.10.2, 7.14, 7.16, 7.17, 8.0 ]
defaults:
run:
shell: bash -l {0}
2 changes: 1 addition & 1 deletion release.Dockerfile
@@ -9,7 +9,7 @@ RUN wget -O /wait-for-it.sh https://raw.githubusercontent.com/vishnubob/wait-for
&& find /packages/*.whl -exec pip install {}[server] \;

# See <https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker#module_name>
-ENV MODULE_NAME="rubrix.server.server"
+ENV MODULE_NAME="rubrix"
ENV VARIABLE_NAME="app"

CMD /wait-for-it.sh $ELASTICSEARCH -- /start.sh
11 changes: 6 additions & 5 deletions src/rubrix/server/commons/es_helpers.py
@@ -29,6 +29,7 @@
TaskStatus,
)
from rubrix.server.tasks.commons.api import EsRecordDataFieldNames
+from rubrix.server.tasks.commons.dao.es_config import mappings

SUPPORTED_LANGUAGES = ["es", "en", "fr", "de"]
DATASETS_RECORDS_INDEX_TEMPLATE = {
@@ -104,19 +105,19 @@


def nested_mappings_from_base_model(model_class: Type[BaseModel]) -> Dict[str, Any]:
-def resolve_type(info):
+def resolve_mapping(info) -> Dict[str, Any]:
the_type = info.get("type")
if the_type == "number":
return "float"
return {"type": "float"}
if the_type == "integer":
return "integer"
return "keyword"
return {"type": "integer"}
return mappings.keyword_field(enable_text_search=True)

return {
"type": "nested",
"include_in_root": True,
"properties": {
key: {"type": resolve_type(info)}
key: resolve_mapping(info)
for key, info in model_class.schema()["properties"].items()
},
}
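For context, a minimal sketch of what this helper now emits for a hypothetical Pydantic model. The class below is illustrative and not part of the repository, the import path is assumed from the file location, and the exact keyword/text layout comes from mappings.keyword_field in es_config.py further down:

from pydantic import BaseModel

from rubrix.server.commons.es_helpers import nested_mappings_from_base_model


class ExampleMetrics(BaseModel):
    # Hypothetical model used only to illustrate the generated mapping
    value: str
    score: float
    length: int


mapping = nested_mappings_from_base_model(ExampleMetrics)
# Roughly:
# {
#     "type": "nested",
#     "include_in_root": True,
#     "properties": {
#         "value": {"type": "keyword", "ignore_above": ..., "fields": {"text": {...}}},
#         "score": {"type": "float"},
#         "length": {"type": "integer"},
#     },
# }
# i.e. string properties become keyword fields with text search enabled instead of
# bare "keyword" types, while numeric properties keep their numeric mappings.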
1 change: 0 additions & 1 deletion src/rubrix/server/commons/es_wrapper.py
@@ -382,7 +382,6 @@ def get_field_mapping(
return {
key: list(definition["mapping"].values())[0]["type"]
for key, definition in response[index]["mappings"].items()
-if not key.endswith(".raw") # Drop raw version of fields
}
except NotFoundError:
# No mapping data
21 changes: 20 additions & 1 deletion src/rubrix/server/tasks/commons/dao/dao.py
@@ -249,11 +249,13 @@ def search_records(
{**(search.aggregations or {})} if compute_aggregations else {}
)

+sort_config = self.__normalize_sort_config__(records_index, sort=search.sort)
+
es_query = {
"_source": {"excludes": exclude_fields or []},
"from": record_from,
"query": search.query or {"match_all": {}},
"sort": search.sort or [{"_id": {"order": "asc"}}],
"sort": sort_config,
"aggs": aggregation_requests,
"highlight": self.__configure_query_highlight__(),
}
@@ -307,6 +309,23 @@ def search_records(

return result

+def __normalize_sort_config__(
+self, index: str, sort: Optional[List[Dict[str, Any]]] = None
+) -> List[Dict[str, Any]]:
+id_field = "id"
+id_keyword_field = "id.keyword"
+sort_config = []
+
+for sort_field in sort or [{id_field: {"order": "asc"}}]:
+for field in sort_field:
+if field == id_field and self._es.get_field_mapping(
+index=index, field_name=id_keyword_field
+):
+sort_config.append({id_keyword_field: sort_field[field]})
+else:
+sort_config.append(sort_field)
+return sort_config
+
def scan_dataset(
self,
dataset: BaseDatasetDB,
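For reference, a standalone sketch of the sort normalization above, with a plain boolean standing in for the self._es.get_field_mapping lookup. Elasticsearch 8.0 disables fielddata on _id by default, so the old default sort on _id no longer works there; the method therefore sorts on id, switches to the id.keyword sub-field whenever the index maps one, and keeps the original sort clause in every other case so older indexes keep working:

from typing import Any, Dict, List, Optional


def normalize_sort(
    sort: Optional[List[Dict[str, Any]]],
    has_id_keyword: bool,  # stands in for self._es.get_field_mapping(index, "id.keyword")
) -> List[Dict[str, Any]]:
    """Rewrite sort clauses on "id" to "id.keyword" when that sub-field is mapped."""
    sort_config = []
    for sort_field in sort or [{"id": {"order": "asc"}}]:
        for field in sort_field:
            if field == "id" and has_id_keyword:
                sort_config.append({"id.keyword": sort_field[field]})
            else:
                sort_config.append(sort_field)
    return sort_config


# normalize_sort(None, has_id_keyword=True)   -> [{"id.keyword": {"order": "asc"}}]
# normalize_sort(None, has_id_keyword=False)  -> [{"id": {"order": "asc"}}]
# normalize_sort([{"score": {"order": "desc"}}], has_id_keyword=True)
#                                              -> [{"score": {"order": "desc"}}]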
33 changes: 25 additions & 8 deletions src/rubrix/server/tasks/commons/dao/es_config.py
@@ -12,21 +12,30 @@

class mappings:
@staticmethod
-def keyword_field():
+def keyword_field(enable_text_search: bool = False):
"""Mappings config for keyword field"""
-return {
+mapping = {
"type": "keyword",
# TODO: Use environment var and align with fields validators
"ignore_above": MAX_KEYWORD_LENGTH,
}
+if enable_text_search:
+text_field = mappings.text_field()
+text_field_fields = text_field.pop("fields", {})
+mapping["fields"] = {"text": text_field, **text_field_fields}
+return mapping

@staticmethod
-def path_match_keyword_template(path: str):
+def path_match_keyword_template(
+path: str, enable_text_search_in_keywords: bool = False
+):
"""Dynamic template mappings config for keyword field based on path match"""
return {
"path_match": path,
"match_mapping_type": "string",
"mapping": mappings.keyword_field(),
"mapping": mappings.keyword_field(
enable_text_search=enable_text_search_in_keywords
),
}

@staticmethod
@@ -130,11 +139,19 @@ def tasks_common_settings():


def dynamic_metrics_text():
return {"metrics.*": mappings.path_match_keyword_template(path="metrics.*")}
return {
"metrics.*": mappings.path_match_keyword_template(
path="metrics.*", enable_text_search_in_keywords=False
)
}


def dynamic_metadata_text():
return {"metadata.*": mappings.path_match_keyword_template(path="metadata.*")}
return {
"metadata.*": mappings.path_match_keyword_template(
path="metadata.*", enable_text_search_in_keywords=True
)
}


def tasks_common_mappings():
@@ -152,8 +169,8 @@ def tasks_common_mappings():
"status": mappings.keyword_field(),
"event_timestamp": {"type": "date"},
"last_updated": {"type": "date"},
"annotated_by": mappings.keyword_field(),
"predicted_by": mappings.keyword_field(),
"annotated_by": mappings.keyword_field(enable_text_search=True),
"predicted_by": mappings.keyword_field(enable_text_search=True),
"metrics": {"dynamic": True, "type": "object"},
"metadata": {"dynamic": True, "type": "object"},
},
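The keyword_field change above is what keeps multi-fields flat: chained multi-fields (a fields section inside another fields section) are deprecated in Elasticsearch 7.x and rejected by 8.0, which is the "avoid nested multi-fields in mapping" fix from the commit message. A small sketch of the merge, with an assumed shape for mappings.text_field() (its real definition is not part of this diff) and 128 standing in for MAX_KEYWORD_LENGTH:

# Assumed return value of mappings.text_field(), for illustration only
text_field = {
    "type": "text",
    "fields": {"exact": {"type": "text", "analyzer": "whitespace"}},
}

mapping = {"type": "keyword", "ignore_above": 128}

# Same merge as keyword_field(enable_text_search=True): the text mapping becomes the
# "text" sub-field and any sub-fields it carried are hoisted up to the same level,
# so "fields" never ends up nested inside "fields".
text_field_fields = text_field.pop("fields", {})
mapping["fields"] = {"text": text_field, **text_field_fields}

# mapping ==
# {
#     "type": "keyword",
#     "ignore_above": 128,
#     "fields": {
#         "text": {"type": "text"},
#         "exact": {"type": "text", "analyzer": "whitespace"},
#     },
# }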
4 changes: 2 additions & 2 deletions src/rubrix/server/tasks/text_classification/dao/es_config.py
@@ -27,8 +27,8 @@ def text_classification_mappings():
},
"predicted": mappings.keyword_field(),
"multi_label": {"type": "boolean"},
"annotated_as": mappings.keyword_field(),
"predicted_as": mappings.keyword_field(),
"annotated_as": mappings.keyword_field(enable_text_search=True),
"predicted_as": mappings.keyword_field(enable_text_search=True),
"score": mappings.decimal_field(),
},
"dynamic_templates": [
10 changes: 5 additions & 5 deletions src/rubrix/server/tasks/token_classification/dao/es_config.py
@@ -19,7 +19,7 @@ def mentions_mappings():

def token_classification_mappings():
metrics_mentions_mappings = nested_mappings_from_base_model(MentionMetrics)
-_mentions_mappings = mentions_mappings()
+_mentions_mappings = mentions_mappings() # TODO: remove
return {
"_source": mappings.source(
excludes=[
@@ -36,11 +36,11 @@ def token_classification_mappings():
),
"properties": {
"predicted": mappings.keyword_field(),
"annotated_as": mappings.keyword_field(),
"predicted_as": mappings.keyword_field(),
"annotated_as": mappings.keyword_field(enable_text_search=True),
"predicted_as": mappings.keyword_field(enable_text_search=True),
"score": {"type": "float"},
"predicted_mentions": _mentions_mappings,
"mentions": _mentions_mappings,
"predicted_mentions": _mentions_mappings, # TODO: remove
"mentions": _mentions_mappings, # TODO: remove
"tokens": mappings.keyword_field(),
# TODO: This must be unified with metrics.py module
"metrics.tokens": nested_mappings_from_base_model(TokenMetrics),
