feat(#945): change es analyzers using environment variables (#1022)

* feat: change text analyzers using environment variables * docs: include new env vars reference in doc * Apply suggestions from code review Co-authored-by: Daniel Vila Suero <daniel@recogn.ai> Co-authored-by: Daniel Vila Suero <daniel@recogn.ai>
argilla-io · Jan 25, 2022 · 1ab4a14 · 1ab4a14
1 parent 84b6ada
commit 1ab4a14
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 9 deletions.
diff --git a/docs/getting_started/advanced_setup_guides.rst b/docs/getting_started/advanced_setup_guides.rst
@@ -57,9 +57,9 @@ All you need to take into account is:
 
 * Rubrix creates an index template for these indices, so you may provide related template privileges to this ES role.
 
-Rubrix uses the ``ELASTICSEARCH`` environment variable to set the ES connection. 
+Rubrix uses the ``ELASTICSEARCH`` environment variable to set the ES connection.
 
-You can provide the credentials using the following scheme: 
+You can provide the credentials using the following scheme:
 
 .. code-block:: bash
 
@@ -70,6 +70,18 @@ Below you can see a screenshot for setting up a new *rubrix* Role and its permis
 :raw-html-m2r:`<img src="https://user-images.githubusercontent.com/2518789/142883104-f4f20cf0-34a0-47ff-8ee3-ab9f4644271c.png"/>`
 
 
+Change elasticsearch index analyzers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default, for indexing text fields, Rubrix uses the ``standard`` analyzer for general search and the ``whitespace`` analyzer for more exact queries (required by certain rules in the weak supervision module).
+If those analyzers don't fit your use case, you can change them using the following environment variables:
+``RUBRIX_DEFAULT_ES_SEARCH_ANALYZER`` and ``RUBRIX_EXACT_ES_SEARCH_ANALYZER``.
+
+Note that the analyzer names you provide must refer to built-in Elasticsearch analyzers. If you want to use a
+customized analyzer, you should first define it in an index template matching the Rubrix index names (``.rubrix*.records-v0``),
+and then provide the analyzer name using the corresponding environment variable.
+
+
 Deploy to aws instance using docker-machine
 -------------------------------------------
 

diff --git a/src/rubrix/server/commons/settings.py b/src/rubrix/server/commons/settings.py
@@ -55,18 +55,23 @@ class ApiSettings(BaseSettings):
     __DATASETS_INDEX_NAME__ = ".rubrix<NAMESPACE>.datasets-v0"
     __DATASETS_RECORDS_INDEX_NAME__ = ".rubrix<NAMESPACE>.dataset.{}.records-v0"
 
-    only_bulk_api: bool = False
     elasticsearch: str = "http://localhost:9200"
     cors_origins: List[str] = ["*"]
 
     docs_enabled: bool = True
 
+    namespace: str = Field(default=None, regex=r"^[a-z]+$")
+
+    # Analyzer configuration
+    default_es_search_analyzer: str = "standard"
+    exact_es_search_analyzer: str = "whitespace"
+    # This line will be enabled once words field won't be used anymore
+    # wordcloud_es_search_analyzer: str = "multilingual_stop_analyzer"
+
     es_records_index_shards: int = 1
     es_records_index_replicas: int = 0
     disable_es_index_template_creation: bool = False
 
-    namespace: str = Field(default=None, regex=r"^[a-z]+$")
-
     metadata_fields_limit: int = Field(
         default=50, gt=0, le=100, description="Max number of fields in metadata"
     )
@@ -92,6 +97,10 @@ class Config:
             "namespace": {
                 "env": "RUBRIX_NAMESPACE",
             },
+            "default_es_search_analyzer": {
+                "env": "RUBRIX_DEFAULT_ES_SEARCH_ANALYZER",
+            },
+            "exact_es_search_analyzer": {"env": "RUBRIX_EXACT_ES_SEARCH_ANALYZER"},
         }
 
 

diff --git a/src/rubrix/server/tasks/commons/dao/es_config.py b/src/rubrix/server/tasks/commons/dao/es_config.py
@@ -32,26 +32,39 @@ def path_match_keyword_template(path: str):
     @staticmethod
     def words_text_field():
         """Mappings config for old `word` field. Deprecated"""
+
+        default_analyzer = settings.default_es_search_analyzer
+        exact_analyzer = settings.exact_es_search_analyzer
+
+        if default_analyzer == "standard":
+            default_analyzer = MULTILINGUAL_STOP_ANALYZER_REF
+
+        if exact_analyzer == "whitespace":
+            exact_analyzer = EXTENDED_ANALYZER_REF
+
         return {
             "type": "text",
             "fielddata": True,
-            "analyzer": MULTILINGUAL_STOP_ANALYZER_REF,
+            "analyzer": default_analyzer,
             "fields": {
                 "extended": {
                     "type": "text",
-                    "analyzer": EXTENDED_ANALYZER_REF,
+                    "analyzer": exact_analyzer,
                 }
             },
         }
 
     @staticmethod
     def text_field():
         """Mappings config for textual field"""
+        default_analyzer = settings.default_es_search_analyzer
+        exact_analyzer = settings.exact_es_search_analyzer
+
         return {
             "type": "text",
-            "analyzer": "standard",
+            "analyzer": default_analyzer,
             "fields": {
-                "exact": {"type": "text", "analyzer": "whitespace"},
+                "exact": {"type": "text", "analyzer": exact_analyzer},
                 "wordcloud": {
                     "type": "text",
                     "analyzer": MULTILINGUAL_STOP_ANALYZER_REF,