Fix text calcer options: parse "Calcer:key=value" strings

ref:127f4db9b9794ccf924bf070a25cf309934138b6
catboost · Nov 20, 2021 · 6bfbe66 · 6bfbe66
1 parent 6a60043
commit 6bfbe66
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 5 deletions.
diff --git a/catboost/private/libs/options/text_processing_options.cpp b/catboost/private/libs/options/text_processing_options.cpp
@@ -121,12 +121,25 @@ namespace NCatboostOptions {
             return;
         }
         TString calcerName;
-        if (options.Has("calcer_type")) {
+
+        if (options.IsString()) {
+            TStringBuf name, calcersOptions;
+            TStringBuf(options.GetString()).Split(':', name, calcersOptions);
+            calcerName = name;
+            CalcerOptions->InsertValue("calcer_type", calcerName);
+            for (TStringBuf stringParam : StringSplitter(calcersOptions).Split(':')) {
+                TStringBuf key, value;
+                stringParam.Split('=', key, value);
+                CalcerOptions->InsertValue(key, value);
+            }
+        } else {
+            CB_ENSURE(options.IsMap(),
+                      "We only support string and dictionaries as featurization options for value "
+                      << options.GetStringRobust() << " with type " << options.GetType());
             calcerName = options["calcer_type"].GetString();
             CalcerOptions.Set(options);
-        } else {
-            calcerName = options.GetString();
         }
+
         EFeatureCalcerType calcerType;
 
         CB_ENSURE(TryFromString<EFeatureCalcerType>(calcerName, calcerType),

diff --git a/catboost/python-package/ut/medium/test.py b/catboost/python-package/ut/medium/test.py
@@ -1220,7 +1220,7 @@ def fill_check_model(params, train_file, test_file, cd_file):
             {'dictionary_id': 'BiGram', 'token_level_type': 'Letter', 'occurrence_lower_bound': '1', 'gram_order': '2'},
             {'dictionary_id': 'Word', 'occurrence_lower_bound': '1'},
         ],
-        'feature_calcers': ['NaiveBayes', 'BoW'],
+        'feature_calcers': ['NaiveBayes', 'BoW:top_tokens_count=10'],
         'iterations': 10,
         'loss_function': 'MultiClass',
         'task_type': task_type,
@@ -1308,7 +1308,7 @@ def test_fit_with_texts(task_type):
             {'dictionary_id': 'BiGram', 'token_level_type': 'Letter', 'occurrence_lower_bound': '1', 'gram_order': '2'},
             {'dictionary_id': 'Word', 'occurrence_lower_bound': '1'},
         ],
-        'feature_calcers': ['NaiveBayes', 'BoW'],
+        'feature_calcers': ['NaiveBayes', 'BoW:top_tokens_count=10'],
         'iterations': 100,
         'loss_function': 'MultiClass',
         'task_type': task_type,