Skip to content

Commit

Permalink
Merge pull request #157 from sashafrey/master
Browse files Browse the repository at this point in the history
Refactor disk usage in artm_tests to make them more robust
  • Loading branch information
bigartm committed Mar 14, 2015
2 parents ff4d3a7 + b29a6f5 commit 3351f86
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 33 deletions.
45 changes: 21 additions & 24 deletions src/artm_tests/collection_parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,30 +7,18 @@
#include "artm/messages.pb.h"
#include "artm/cpp_interface.h"

static void Cleanup() {
// Clean all .batches files
if (boost::filesystem::exists("collection_parser_test")) {
boost::filesystem::recursive_directory_iterator it("collection_parser_test");
boost::filesystem::recursive_directory_iterator endit;
while (it != endit) {
if (boost::filesystem::is_regular_file(*it)) {
if (it->path().extension() == ".batch" || it->path().extension() == ".dictionary")
boost::filesystem::remove(*it);
}

++it;
}
}
}
#include "artm_tests/test_mother.h"

namespace fs = boost::filesystem;

// To run this particular test:
// artm_tests.exe --gtest_filter=CollectionParser.*
TEST(CollectionParser, UciBagOfWords) {
Cleanup();
std::string target_folder = artm::test::Helpers::getUniqueString();

::artm::CollectionParserConfig config;
config.set_format(::artm::CollectionParserConfig_Format_BagOfWordsUci);
config.set_target_folder("collection_parser_test/");
config.set_target_folder(target_folder);
config.set_dictionary_file_name("test_parser.dictionary");
config.set_cooccurrence_file_name("test_parser.cooc.dictionary");
config.add_cooccurrence_token("token1");
Expand All @@ -44,7 +32,7 @@ TEST(CollectionParser, UciBagOfWords) {
ASSERT_EQ(dictionary_parsed->entry_size(), 3);

std::shared_ptr< ::artm::DictionaryConfig> dictionary_loaded = ::artm::LoadDictionary(
"collection_parser_test/test_parser.dictionary");
(fs::path(target_folder) / "test_parser.dictionary").string());
ASSERT_EQ(dictionary_parsed->entry_size(), dictionary_loaded->entry_size());

ASSERT_EQ(dictionary_loaded->entry_size(), 3);
Expand All @@ -65,7 +53,7 @@ TEST(CollectionParser, UciBagOfWords) {
ASSERT_EQ(dictionary_loaded->entry(2).token_count(), 9);

std::shared_ptr< ::artm::DictionaryConfig> cooc_dictionary_loaded = ::artm::LoadDictionary(
"collection_parser_test/test_parser.cooc.dictionary");
(fs::path(target_folder) / fs::path("test_parser.cooc.dictionary")).string());
ASSERT_EQ(cooc_dictionary_loaded->entry_size(), 3);
ASSERT_EQ(cooc_dictionary_loaded->entry(0).key_token(), "token1~token2");
ASSERT_EQ(cooc_dictionary_loaded->entry(0).items_count(), 1);
Expand All @@ -74,7 +62,7 @@ TEST(CollectionParser, UciBagOfWords) {
ASSERT_EQ(cooc_dictionary_loaded->entry(2).key_token(), "token2~token3");
ASSERT_EQ(cooc_dictionary_loaded->entry(2).items_count(), 2);

boost::filesystem::recursive_directory_iterator it("collection_parser_test");
boost::filesystem::recursive_directory_iterator it(target_folder);
boost::filesystem::recursive_directory_iterator endit;
int batches_count = 0;
while (it != endit) {
Expand All @@ -89,6 +77,9 @@ TEST(CollectionParser, UciBagOfWords) {
}

ASSERT_EQ(batches_count, 2);

try { boost::filesystem::remove_all(target_folder); }
catch (...) {}
}

TEST(CollectionParser, ErrorHandling) {
Expand All @@ -115,26 +106,29 @@ TEST(CollectionParser, ErrorHandling) {
}

// Parses the MatrixMarket-format collection from test_data (deerwestere.txt /
// deerwestere.mm) and checks that the dictionary returned by ParseCollection
// has 12 entries.
TEST(CollectionParser, MatrixMarket) {
Cleanup();
// Each run gets its own target folder name.
std::string target_folder = artm::test::Helpers::getUniqueString();

::artm::CollectionParserConfig config;
config.set_format(::artm::CollectionParserConfig_Format_MatrixMarket);
// NOTE(review): the target folder is set twice in a row; presumably the second
// call overrides the literal path, which would make this first call redundant.
// TODO confirm and drop whichever call is stale.
config.set_target_folder("collection_parser_test/");
config.set_target_folder(target_folder);
config.set_num_items_per_batch(10000);
config.set_vocab_file_path("../../../test_data/deerwestere.txt");
config.set_docword_file_path("../../../test_data/deerwestere.mm");
config.set_dictionary_file_name("test_parser.dictionary");

std::shared_ptr< ::artm::DictionaryConfig> dictionary_parsed = ::artm::ParseCollection(config);
ASSERT_EQ(dictionary_parsed->entry_size(), 12);

// Best-effort removal of the target folder: any exception from remove_all is
// deliberately ignored.
// NOTE(review): if the ASSERT_EQ above fails and aborts the test body, this
// cleanup is presumably never reached. Verify against gtest semantics.
try { boost::filesystem::remove_all(target_folder); }
catch (...) {}
}

TEST(CollectionParser, Multiclass) {
Cleanup();
std::string target_folder = artm::test::Helpers::getUniqueString();

::artm::CollectionParserConfig config;
config.set_format(::artm::CollectionParserConfig_Format_BagOfWordsUci);
config.set_target_folder("collection_parser_test/");
config.set_target_folder(target_folder);
config.set_dictionary_file_name("test_parser.dictionary");
config.set_vocab_file_path("../../../test_data/vocab.parser_test_multiclass.txt");
config.set_docword_file_path("../../../test_data/docword.parser_test.txt");
Expand All @@ -148,4 +142,7 @@ TEST(CollectionParser, Multiclass) {
ASSERT_EQ(dictionary_parsed->entry(1).class_id(), "@default_class");
ASSERT_EQ(dictionary_parsed->entry(2).key_token(), "token3");
ASSERT_EQ(dictionary_parsed->entry(2).class_id(), "class1");

try { boost::filesystem::remove_all(target_folder); }
catch (...) {}
}
15 changes: 13 additions & 2 deletions src/artm_tests/cpp_interface_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@

#include "artm/core/internals.pb.h"

#include "artm_tests/test_mother.h"

// Empty test body: nothing is exercised or asserted here.
// NOTE(review): presumably a canary that only verifies the test binary builds
// and the CppInterface suite is discovered; confirm intent.
TEST(CppInterface, Canary) {
}

void BasicTest(bool is_network_mode, bool is_proxy_mode) {
std::string target_path = artm::test::Helpers::getUniqueString();
const int nTopics = 5;

// Endpoints:
Expand All @@ -33,7 +36,7 @@ void BasicTest(bool is_network_mode, bool is_proxy_mode) {
master_config.set_create_endpoint("tcp://*:5555");
master_config.set_connect_endpoint("tcp://localhost:5555");
master_config.add_node_connect_endpoint("tcp://localhost:5556");
master_config.set_disk_path(".");
master_config.set_disk_path(target_path);

// Clean all .batches files
boost::filesystem::recursive_directory_iterator it(".");
Expand Down Expand Up @@ -71,6 +74,7 @@ void BasicTest(bool is_network_mode, bool is_proxy_mode) {
::artm::MasterProxyConfig master_proxy_config;
master_proxy_config.mutable_config()->CopyFrom(master_config);
master_proxy_config.set_node_connect_endpoint("tcp://localhost:5557");
master_proxy_config.set_communication_timeout(1000);
master_component.reset(new ::artm::MasterComponent(master_proxy_config));
}

Expand Down Expand Up @@ -144,7 +148,7 @@ void BasicTest(bool is_network_mode, bool is_proxy_mode) {
}

// Index doc-token matrix
if (is_network_mode) artm::SaveBatch(batch, "00b6d631-46a6-4edf-8ef6-016c7b27d9f0.batch");
if (is_network_mode) artm::SaveBatch(batch, target_path);

std::shared_ptr<artm::TopicModel> topic_model;
double expected_normalizer = 0;
Expand Down Expand Up @@ -403,6 +407,13 @@ void BasicTest(bool is_network_mode, bool is_proxy_mode) {
EXPECT_EQ(new_topic_model4->topic_name(1), model_config.topic_name(2));
EXPECT_EQ(new_topic_model4->topic_name(2), model_config.topic_name(3));
EXPECT_EQ(new_topic_model4->topic_name(3), model_config.topic_name(4));

master_component.reset();
node_controller_master.reset();
node_controller.reset();

try { boost::filesystem::remove_all(target_path); }
catch (...) {}
}

// artm_tests.exe --gtest_filter=CppInterface.BasicTest_StandaloneMode
Expand Down
7 changes: 7 additions & 0 deletions src/artm_tests/test_mother.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@
namespace artm {
namespace test {

class Helpers {
public:
static std::string getUniqueString() {
return boost::lexical_cast<std::string>(boost::uuids::random_generator()());
}
};

class TestMother {
public:
TestMother() : nTopics(10), regularizer_name("regularizer1") {}
Expand Down
22 changes: 15 additions & 7 deletions src/cpp_client/srcmain.cc
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,15 @@ int execute(const artm_options& options) {
unique_tokens = ::artm::ParseCollection(collection_parser_config);
std::cout << "OK.\n";
} else {
std::cout << "Reuse " << batch_files_count << " batches in folder '" << options.batch_folder << "\n";
std::cout << "Loading dictionary file... ";
unique_tokens = ::artm::LoadDictionary((fs::path(options.batch_folder) / options.dictionary_file).string());
std::cout << "OK.\n";
std::cout << "Reuse " << batch_files_count << " batches in folder '" << options.batch_folder << "'\n";
std::string dictionary_full_filename = (fs::path(options.batch_folder) / options.dictionary_file).string();
if (fs::exists(dictionary_full_filename)) {
std::cout << "Loading dictionary file... ";
unique_tokens = ::artm::LoadDictionary(dictionary_full_filename);
std::cout << "OK.\n";
} else {
std::cout << "Dictionary file " << dictionary_full_filename << " does not exist; BigARTM will use all tokens from batches.\n";
}
}

// Step 3. Create master component.
Expand All @@ -308,7 +313,9 @@ int execute(const artm_options& options) {
master_component.reset(new MasterComponent(master_config));
}

Dictionary dictionary(*master_component, *unique_tokens);
std::shared_ptr<Dictionary> dictionary;
if (unique_tokens != nullptr)
dictionary.reset(new Dictionary(*master_component, *unique_tokens));

// Step 4. Configure regularizers.
std::vector<std::shared_ptr<artm::Regularizer>> regularizers;
Expand All @@ -324,7 +331,8 @@ int execute(const artm_options& options) {

// Step 5. Create and initialize model.
Model model(*master_component, model_config);
model.Initialize(dictionary);
if (dictionary != nullptr)
model.Initialize(*dictionary);

for (int iter = 0; iter < options.num_iters; ++iter) {
{
Expand Down Expand Up @@ -470,7 +478,7 @@ int main(int argc, char * argv[]) {
// options.docword = "D:\\datasets\\docword.kos.txt";
// options.vocab = "D:\\datasets\\vocab.kos.txt";

bool show_help = vm.count("help");
bool show_help = (vm.count("help") > 0);
if (options.docword.empty() || options.vocab.empty()) {
// Show help if user neither provided batch folder, nor docword/vocab files
if (!options.b_reuse_batch && (!vm.count("batch_folder") || vm["batch_folder"].defaulted())) show_help = true;
Expand Down

0 comments on commit 3351f86

Please sign in to comment.