Merge pull request #158 from sashafrey/master

#140 cpp_client should clean temporary folder with batches after run
bigartm · Mar 15, 2015 · 828c651 · 828c651
2 parents ed4d79e + 3e17983
commit 828c651
Show file tree

Hide file tree

Showing 8 changed files with 244 additions and 123 deletions.
diff --git a/docs/ref/cpp_client.txt b/docs/ref/cpp_client.txt
@@ -30,9 +30,6 @@ You may append the following options to customize the resulting topic model:
 
 You may also apply the following optimizations that should not change the resulting model
 
-  * ``--reuse_batches`` skips parsing of *docword* and *vocab* files, and tries to use batches located in ``--batch_folder``.
-    You may download pre-parsed batches by links provided in :doc:`/download` section.
-
   * ``-p`` allows you to specify number of concurrent processors.
     The recommended value is to use the number of logical cores on your machine.
 
@@ -49,52 +46,57 @@ You may also apply the following optimizations that should not change the result
 
 .. code-block:: bash
 
-   >cpp_client --help
-   BigARTM - library for advanced topic modeling (http://bigartm.org):
-
-   Basic options:
-     -h [ --help ]                         display this help message
-     -d [ --docword ] arg                  docword file in UCI format
-     -v [ --vocab ] arg                    vocab file in UCI format
-     -t [ --num_topic ] arg (=16)          number of topics
-     -p [ --num_processors ] arg (=2)      number of concurrent processors
-     -i [ --num_iters ] arg (=10)          number of outer iterations
-     --num_inner_iters arg (=10)           number of inner iterations
-     --reuse_theta                         reuse theta between iterations
-     --batch_folder arg (=batches)         temporary folder to store batches
-     --dictionary_file arg (=filename of dictionary file)
-     --reuse_batches                       reuse batches found in batch_folder
-                                           (default = false)
-     --items_per_batch arg (=500)          number of items per batch
-     --tau_phi arg (=0)                    regularization coefficient for PHI
+    >cpp_client --help
+    BigARTM - library for advanced topic modeling (http://bigartm.org):
+
+    Basic options:
+      -h [ --help ]                        display this help message
+      -d [ --docword ] arg                 docword file in UCI format
+      -v [ --vocab ] arg                   vocab file in UCI format
+      -b [ --batch_folder ] arg            If docword or vocab arguments are not
+                                           provided, cpp_client will try to read
+                                           pre-parsed batches from batch_folder
+                                           location. Otherwise, if both docword and
+                                           vocab arguments are provided, cpp_client
+                                           will parse data and store batches in
+                                           batch_folder location.
+      -t [ --num_topic ] arg (=16)         number of topics
+      -p [ --num_processors ] arg (=2)     number of concurrent processors
+      -i [ --num_iters ] arg (=10)         number of outer iterations
+      --num_inner_iters arg (=10)          number of inner iterations
+      --reuse_theta                        reuse theta between iterations
+      --dictionary_file arg (=dictionary)  filename of dictionary file
+      --items_per_batch arg (=500)         number of items per batch
+      --tau_phi arg (=0)                   regularization coefficient for PHI
                                            matrix
-     --tau_theta arg (=0)                  regularization coefficient for THETA
+      --tau_theta arg (=0)                 regularization coefficient for THETA
                                            matrix
-     --tau_decor arg (=0)                  regularization coefficient for topics
-                                           decorrelation (use with care, since
-                                           this value heavily depends on the size
-                                           of the dataset)
-     --paused                              wait for keystroke (allows to attach a
+      --tau_decor arg (=0)                 regularization coefficient for topics
+                                           decorrelation (use with care, since this
+                                           value heavily depends on the size of the
+                                           dataset)
+      --paused                             wait for keystroke (allows to attach a
                                            debugger)
-     --no_scores                           disable calculation of all scores
-     --update_every arg (=0)               [online algorithm] requests an update
-                                           of the model after update_every
-                                           document
-     --parsing_format arg (=0)             parsing format (0 - UCI, 1 - matrix
+      --no_scores                          disable calculation of all scores
+      --update_every arg (=0)              [online algorithm] requests an update of
+                                           the model after update_every document
+      --parsing_format arg (=0)            parsing format (0 - UCI, 1 - matrix
                                            market)
-     --disk_cache_folder arg               disk cache folder
-     --merger_queue_size arg               size of the merger queue
-
-   Networking options (experimental):
-     --nodes arg                  endpoints of the remote nodes (enables network
-                                  modus operandi)
-     --localhost arg (=localhost) DNS name or the IP address of the localhost
-     --port arg (=5550)           port to use for master node
-     --proxy arg                  proxy endpoint
-     --timeout arg (=1000)        network communication timeout in milliseconds
-
-   Examples:
-           cpp_client -d docword.kos.txt -v vocab.kos.txt
-           set GLOG_logtostderr=1 & cpp_client -d docword.kos.txt -v vocab.kos.txt
-
-For further details please refer to the `source code <https://raw.githubusercontent.com/bigartm/bigartm/master/src/cpp_client/srcmain.cc>`_ of cpp_client.
+      --disk_cache_folder arg              disk cache folder
+      --merger_queue_size arg              size of the merger queue
+
+    Networking options:
+      --nodes arg                          endpoints of the remote nodes (enables
+                                           network modus operandi)
+      --localhost arg (=localhost)         DNS name or the IP address of the
+                                           localhost
+      --port arg (=5550)                   port to use for master node
+      --proxy arg                          proxy endpoint
+      --timeout arg (=1000)                network communication timeout in
+                                           milliseconds
+
+    Examples:
+            cpp_client -d docword.kos.txt -v vocab.kos.txt
+            set GLOG_logtostderr=1 & cpp_client -d docword.kos.txt -v vocab.kos.txt
+
+For further details please refer to the `source code <https://raw.githubusercontent.com/bigartm/bigartm/master/src/cpp_client/srcmain.cc>`_ of cpp_client.
diff --git a/docs/ref/messages.txt b/docs/ref/messages.txt
@@ -1569,6 +1569,7 @@ Represents a configuration of a collection parser.
     optional int32 num_items_per_batch = 6 [default = 1000];
     optional string cooccurrence_file_name = 7;
     repeated string cooccurrence_token = 8;
+    optional bool use_unity_based_indices = 9 [default = true];
   }
 
 .. attribute:: CollectionParserConfig.format
@@ -1696,6 +1697,9 @@ Represents a configuration of a collection parser.
    A cooccurrence of the pair `<first>~<second>` will be collected only when both tokens are present in
    :attr:`CollectionParserConfig.cooccurrence_token`.
 
+.. attribute:: CollectionParserConfig.use_unity_based_indices
+   A flag indicating whether to interpret indices in docword file as unity-based or as zero-based.
+   By default ``use_unity_based_indices = True``, as required by UCI bag-of-words format.
 
 .. _SynchronizeModelArgs:
 

diff --git a/src/artm/core/collection_parser.cc b/src/artm/core/collection_parser.cc
@@ -164,7 +164,8 @@ std::shared_ptr<DictionaryConfig> CollectionParser::ParseDocwordBagOfWordsUci(To
 
   int item_id, token_id, token_count;
   for (std::string token; docword >> item_id >> token_id >> token_count;) {
-    token_id--;  // convert 1-based to zero-based index
+    if (config_.use_unity_based_indices())
+      token_id--;  // convert 1-based to zero-based index
 
     if (token_map->find(token_id) == token_map->end())  {
       std::stringstream ss;
@@ -173,7 +174,8 @@ std::shared_ptr<DictionaryConfig> CollectionParser::ParseDocwordBagOfWordsUci(To
       if (token_id == -1) {
         ss << ". wordID column appears to be zero-based in the docword file being parsed. "
            << "UCI format defines wordID column to be unity-based. "
-           << "Please, increase wordID by one in your input data.";
+           << "Please, set CollectionParserConfig.use_unity_based_indices=false "
+           << "or increase wordID by one in your input data";
       } else {
         ss << ". Token_id value is outside of the expected range.";
       }

diff --git a/src/artm/messages.pb.cc b/src/artm/messages.pb.cc