diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0d912f5bf8..5cf89e619c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - elasticsearch: [ 7.4, 7.10.2, 7.14, 7.16, 7.17 ] + elasticsearch: [ 7.4, 7.10.2, 7.14, 7.16, 7.17, 8.0 ] defaults: run: shell: bash -l {0} diff --git a/release.Dockerfile b/release.Dockerfile index 98d7bdf2c1..06a24d77c6 100644 --- a/release.Dockerfile +++ b/release.Dockerfile @@ -9,7 +9,7 @@ RUN wget -O /wait-for-it.sh https://raw.githubusercontent.com/vishnubob/wait-for && find /packages/*.whl -exec pip install {}[server] \; # See -ENV MODULE_NAME="rubrix.server.server" +ENV MODULE_NAME="rubrix" ENV VARIABLE_NAME="app" CMD /wait-for-it.sh $ELASTICSEARCH -- /start.sh diff --git a/src/rubrix/server/commons/es_helpers.py b/src/rubrix/server/commons/es_helpers.py index 1ec1b86442..49bdd8d5dd 100644 --- a/src/rubrix/server/commons/es_helpers.py +++ b/src/rubrix/server/commons/es_helpers.py @@ -29,6 +29,7 @@ TaskStatus, ) from rubrix.server.tasks.commons.api import EsRecordDataFieldNames +from rubrix.server.tasks.commons.dao.es_config import mappings SUPPORTED_LANGUAGES = ["es", "en", "fr", "de"] DATASETS_RECORDS_INDEX_TEMPLATE = { @@ -104,19 +105,19 @@ def nested_mappings_from_base_model(model_class: Type[BaseModel]) -> Dict[str, Any]: - def resolve_type(info): + def resolve_mapping(info) -> Dict[str, Any]: the_type = info.get("type") if the_type == "number": - return "float" + return {"type": "float"} if the_type == "integer": - return "integer" - return "keyword" + return {"type": "integer"} + return mappings.keyword_field(enable_text_search=True) return { "type": "nested", "include_in_root": True, "properties": { - key: {"type": resolve_type(info)} + key: resolve_mapping(info) for key, info in model_class.schema()["properties"].items() }, } diff --git a/src/rubrix/server/commons/es_wrapper.py b/src/rubrix/server/commons/es_wrapper.py index f79b267386..1a71c2fb2d 100644 --- a/src/rubrix/server/commons/es_wrapper.py +++ b/src/rubrix/server/commons/es_wrapper.py @@ -382,7 +382,6 @@ def get_field_mapping( return { key: list(definition["mapping"].values())[0]["type"] for key, definition in response[index]["mappings"].items() - if not key.endswith(".raw") # Drop raw version of fields } except NotFoundError: # No mapping data diff --git a/src/rubrix/server/tasks/commons/dao/dao.py b/src/rubrix/server/tasks/commons/dao/dao.py index d05af71eae..97759baeb3 100644 --- a/src/rubrix/server/tasks/commons/dao/dao.py +++ b/src/rubrix/server/tasks/commons/dao/dao.py @@ -249,11 +249,13 @@ def search_records( {**(search.aggregations or {})} if compute_aggregations else {} ) + sort_config = self.__normalize_sort_config__(records_index, sort=search.sort) + es_query = { "_source": {"excludes": exclude_fields or []}, "from": record_from, "query": search.query or {"match_all": {}}, - "sort": search.sort or [{"_id": {"order": "asc"}}], + "sort": sort_config, "aggs": aggregation_requests, "highlight": self.__configure_query_highlight__(), } @@ -307,6 +309,23 @@ def search_records( return result + def __normalize_sort_config__( + self, index: str, sort: Optional[List[Dict[str, Any]]] = None + ) -> List[Dict[str, Any]]: + id_field = "id" + id_keyword_field = "id.keyword" + sort_config = [] + + for sort_field in sort or [{id_field: {"order": "asc"}}]: + for field in sort_field: + if field == id_field and self._es.get_field_mapping( + index=index, field_name=id_keyword_field + ): + sort_config.append({id_keyword_field: sort_field[field]}) + else: + sort_config.append(sort_field) + return sort_config + def scan_dataset( self, dataset: BaseDatasetDB, diff --git a/src/rubrix/server/tasks/commons/dao/es_config.py b/src/rubrix/server/tasks/commons/dao/es_config.py index 656284856b..33086b2e4e 100644 --- a/src/rubrix/server/tasks/commons/dao/es_config.py +++ b/src/rubrix/server/tasks/commons/dao/es_config.py @@ -12,21 +12,30 @@ class mappings: @staticmethod - def keyword_field(): + def keyword_field(enable_text_search: bool = False): """Mappings config for keyword field""" - return { + mapping = { "type": "keyword", # TODO: Use environment var and align with fields validators "ignore_above": MAX_KEYWORD_LENGTH, } + if enable_text_search: + text_field = mappings.text_field() + text_field_fields = text_field.pop("fields", {}) + mapping["fields"] = {"text": text_field, **text_field_fields} + return mapping @staticmethod - def path_match_keyword_template(path: str): + def path_match_keyword_template( + path: str, enable_text_search_in_keywords: bool = False + ): """Dynamic template mappings config for keyword field based on path match""" return { "path_match": path, "match_mapping_type": "string", - "mapping": mappings.keyword_field(), + "mapping": mappings.keyword_field( + enable_text_search=enable_text_search_in_keywords + ), } @staticmethod @@ -130,11 +139,19 @@ def tasks_common_settings(): def dynamic_metrics_text(): - return {"metrics.*": mappings.path_match_keyword_template(path="metrics.*")} + return { + "metrics.*": mappings.path_match_keyword_template( + path="metrics.*", enable_text_search_in_keywords=False + ) + } def dynamic_metadata_text(): - return {"metadata.*": mappings.path_match_keyword_template(path="metadata.*")} + return { + "metadata.*": mappings.path_match_keyword_template( + path="metadata.*", enable_text_search_in_keywords=True + ) + } def tasks_common_mappings(): @@ -152,8 +169,8 @@ def tasks_common_mappings(): "status": mappings.keyword_field(), "event_timestamp": {"type": "date"}, "last_updated": {"type": "date"}, - "annotated_by": mappings.keyword_field(), - "predicted_by": mappings.keyword_field(), + "annotated_by": mappings.keyword_field(enable_text_search=True), + "predicted_by": mappings.keyword_field(enable_text_search=True), "metrics": {"dynamic": True, "type": "object"}, "metadata": {"dynamic": True, "type": "object"}, }, diff --git a/src/rubrix/server/tasks/text_classification/dao/es_config.py b/src/rubrix/server/tasks/text_classification/dao/es_config.py index 11785a724c..6500dc2c9e 100644 --- a/src/rubrix/server/tasks/text_classification/dao/es_config.py +++ b/src/rubrix/server/tasks/text_classification/dao/es_config.py @@ -27,8 +27,8 @@ def text_classification_mappings(): }, "predicted": mappings.keyword_field(), "multi_label": {"type": "boolean"}, - "annotated_as": mappings.keyword_field(), - "predicted_as": mappings.keyword_field(), + "annotated_as": mappings.keyword_field(enable_text_search=True), + "predicted_as": mappings.keyword_field(enable_text_search=True), "score": mappings.decimal_field(), }, "dynamic_templates": [ diff --git a/src/rubrix/server/tasks/token_classification/dao/es_config.py b/src/rubrix/server/tasks/token_classification/dao/es_config.py index fa0c08713f..871669c0ab 100644 --- a/src/rubrix/server/tasks/token_classification/dao/es_config.py +++ b/src/rubrix/server/tasks/token_classification/dao/es_config.py @@ -19,7 +19,7 @@ def mentions_mappings(): def token_classification_mappings(): metrics_mentions_mappings = nested_mappings_from_base_model(MentionMetrics) - _mentions_mappings = mentions_mappings() + _mentions_mappings = mentions_mappings() # TODO: remove return { "_source": mappings.source( excludes=[ @@ -36,11 +36,11 @@ def token_classification_mappings(): ), "properties": { "predicted": mappings.keyword_field(), - "annotated_as": mappings.keyword_field(), - "predicted_as": mappings.keyword_field(), + "annotated_as": mappings.keyword_field(enable_text_search=True), + "predicted_as": mappings.keyword_field(enable_text_search=True), "score": {"type": "float"}, - "predicted_mentions": _mentions_mappings, - "mentions": _mentions_mappings, + "predicted_mentions": _mentions_mappings, # TODO: remove + "mentions": _mentions_mappings, # TODO: remove "tokens": mappings.keyword_field(), # TODO: This must be unified with metrics.py module "metrics.tokens": nested_mappings_from_base_model(TokenMetrics),