Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix problem with newline in vocab files #619

Merged
merged 1 commit into from
Jul 22, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,6 @@ src/artm/messages.pb.h
/python/dist/*

/src/artm/version.h

# configuration file for YouCompleteMe
.ycm_extra_conf.py
3 changes: 2 additions & 1 deletion src/artm/core/collection_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ CollectionParser::TokenMap CollectionParser::ParseVocabBagOfWordsUci() {
int token_id = 0;
while (!vocab.eof()) {
std::getline(vocab, str);
if (vocab.eof())
if (vocab.eof() && str.empty())
break;

boost::algorithm::trim(str);
Expand Down Expand Up @@ -445,3 +445,4 @@ void CollectionParser::Parse() {

} // namespace core
} // namespace artm
// vim: set ts=2 sw=2:
3 changes: 2 additions & 1 deletion src/artm/core/dictionary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ Dictionary::Gather(const GatherDictionaryArgs& args,
int token_id = 0;
while (!vocab.eof()) {
std::getline(vocab, str);
if (vocab.eof())
if (vocab.eof() && str.empty())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good!

break;

boost::algorithm::trim(str);
Expand Down Expand Up @@ -545,3 +545,4 @@ float Dictionary::CountTopicCoherence(const std::vector<core::Token>& tokens_to_

} // namespace core
} // namespace artm
// vim: set ts=2 sw=2:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// vim: set ts=2 sw=2:
Is this intentional?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, with additions in .gitignore.
Does it break something?

61 changes: 37 additions & 24 deletions src/artm_tests/collection_parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,36 +46,48 @@ TEST(CollectionParser, UciBagOfWords) {

artm::MasterModelConfig master_config;
::artm::MasterModel mc(master_config);
artm::GatherDictionaryArgs gather_config;
gather_config.set_data_path(target_folder);
gather_config.set_vocab_file_path(config.vocab_file_path());
gather_config.set_dictionary_target_name("mydictionary");
mc.GatherDictionary(gather_config);

::artm::GetDictionaryArgs get_dictionary_args;
get_dictionary_args.set_dictionary_name("mydictionary");
auto dictionary = mc.GetDictionary(get_dictionary_args);
ASSERT_EQ(dictionary.token_size(), 3);
auto dictionary_checker = [&mc, &target_folder] (
const std::string &path_to_vocab,
const std::string &dict_name) -> void {
// first of all, we gather dictionary into the core
artm::GatherDictionaryArgs gather_config;
gather_config.set_data_path(target_folder);
gather_config.set_vocab_file_path(path_to_vocab);
gather_config.set_dictionary_target_name(dict_name);
mc.GatherDictionary(gather_config);

// next, we retrieve it from the core
::artm::GetDictionaryArgs get_dictionary_args;
get_dictionary_args.set_dictionary_name(dict_name);
auto dict = mc.GetDictionary(get_dictionary_args);

// now we check its consistency
ASSERT_EQ(dict.token_size(), 3);

EXPECT_EQ(dict.token(0), "token1");
EXPECT_EQ(dict.token(1), "token2");
EXPECT_EQ(dict.token(2), "token3");

EXPECT_EQ(dictionary.token(0), "token1");
EXPECT_EQ(dictionary.token(1), "token2");
EXPECT_EQ(dictionary.token(2), "token3");
EXPECT_EQ(dict.class_id(0), "@default_class");
EXPECT_EQ(dict.class_id(1), "@default_class");
EXPECT_EQ(dict.class_id(2), "@default_class");

EXPECT_EQ(dictionary.class_id(0), "@default_class");
EXPECT_EQ(dictionary.class_id(1), "@default_class");
EXPECT_EQ(dictionary.class_id(2), "@default_class");
EXPECT_EQ(dict.token_df(0), 1);
EXPECT_EQ(dict.token_df(1), 2);
EXPECT_EQ(dict.token_df(2), 2);

EXPECT_EQ(dictionary.token_df(0), 1);
EXPECT_EQ(dictionary.token_df(1), 2);
EXPECT_EQ(dictionary.token_df(2), 2);
EXPECT_EQ(dict.token_tf(0), 5);
EXPECT_EQ(dict.token_tf(1), 4);
EXPECT_EQ(dict.token_tf(2), 9);

EXPECT_EQ(dictionary.token_tf(0), 5);
EXPECT_EQ(dictionary.token_tf(1), 4);
EXPECT_EQ(dictionary.token_tf(2), 9);
ASSERT_APPROX_EQ(dict.token_value(0), 5.0 / 18.0);
ASSERT_APPROX_EQ(dict.token_value(1), 2.0 / 9.0);
ASSERT_APPROX_EQ(dict.token_value(2), 0.5);
};

ASSERT_APPROX_EQ(dictionary.token_value(0), 5.0 / 18.0);
ASSERT_APPROX_EQ(dictionary.token_value(1), 2.0 / 9.0);
ASSERT_APPROX_EQ(dictionary.token_value(2), 0.5);
dictionary_checker(config.vocab_file_path(), "default_dictionary");
dictionary_checker("../../../test_data/vocab.parser_test_no_newline.txt", "no_newline_dictionary");
Copy link
Contributor

@ofrei ofrei Jul 22, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

../../../test_data/vocab.parser_test_no_newline.txt

../../.. in C++ tests is a bad practice, but I see it is used across multiple tests. So feel free to keep this code like this, but if you have time it would be nice to replace replace ../../../test_data with BIGARTM_UNITTEST_DATA env variable. I think that should work, because we anyway have to set this variable for python tests.


try { boost::filesystem::remove_all(target_folder); }
catch (...) {}
Expand Down Expand Up @@ -241,3 +253,4 @@ TEST(CollectionParser, VowpalWabbit) {
try { boost::filesystem::remove_all(target_folder); }
catch (...) {}
}
// vim: set ts=2 sw=2 sts=2:
3 changes: 3 additions & 0 deletions test_data/vocab.parser_test_no_newline.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
token1
token2
token3