Skip to content

Commit

Permalink
Text calcers options fix
Browse files (browse the repository at this point in the history)
ref:127f4db9b9794ccf924bf070a25cf309934138b6
  • Loading branch information
grag90 committed Nov 20, 2021
1 parent 6a60043 commit 6bfbe66
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 5 deletions.
19 changes: 16 additions & 3 deletions catboost/private/libs/options/text_processing_options.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -121,12 +121,25 @@ namespace NCatboostOptions {
return;
}
TString calcerName;
if (options.Has("calcer_type")) {

if (options.IsString()) {
TStringBuf name, calcersOptions;
TStringBuf(options.GetString()).Split(':', name, calcersOptions);
calcerName = name;
CalcerOptions->InsertValue("calcer_type", calcerName);
for (TStringBuf stringParam : StringSplitter(calcersOptions).Split(':')) {
TStringBuf key, value;
stringParam.Split('=', key, value);
CalcerOptions->InsertValue(key, value);
}
} else {
CB_ENSURE(options.IsMap(),
"We only support string and dictionaries as featurization options for value "
<< options.GetStringRobust() << " with type " << options.GetType());
calcerName = options["calcer_type"].GetString();
CalcerOptions.Set(options);
} else {
calcerName = options.GetString();
}

EFeatureCalcerType calcerType;

CB_ENSURE(TryFromString<EFeatureCalcerType>(calcerName, calcerType),
Expand Down
4 changes: 2 additions & 2 deletions catboost/python-package/ut/medium/test.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1220,7 +1220,7 @@ def fill_check_model(params, train_file, test_file, cd_file):
{'dictionary_id': 'BiGram', 'token_level_type': 'Letter', 'occurrence_lower_bound': '1', 'gram_order': '2'},
{'dictionary_id': 'Word', 'occurrence_lower_bound': '1'},
],
'feature_calcers': ['NaiveBayes', 'BoW'],
'feature_calcers': ['NaiveBayes', 'BoW:top_tokens_count=10'],
'iterations': 10,
'loss_function': 'MultiClass',
'task_type': task_type,
Expand Down Expand Up @@ -1308,7 +1308,7 @@ def test_fit_with_texts(task_type):
{'dictionary_id': 'BiGram', 'token_level_type': 'Letter', 'occurrence_lower_bound': '1', 'gram_order': '2'},
{'dictionary_id': 'Word', 'occurrence_lower_bound': '1'},
],
'feature_calcers': ['NaiveBayes', 'BoW'],
'feature_calcers': ['NaiveBayes', 'BoW:top_tokens_count=10'],
'iterations': 100,
'loss_function': 'MultiClass',
'task_type': task_type,
Expand Down

0 comments on commit 6bfbe66

Please sign in to comment.