Merge pull request #114 from sashafrey/master

Enhanced CPU performance in Processor.cc
bigartm · Feb 15, 2015 · bc0affb · bc0affb
2 parents 76f2901 + a2bf516
commit bc0affb
Show file tree

Hide file tree

Showing 24 changed files with 1,968 additions and 486 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -54,7 +54,7 @@ else()
   message("-- Warning: BigARTM has not been tested with '${CMAKE_CXX_COMPILER_ID}' compiler.")
 endif()
 
-set(Boost_ADDITIONAL_VERSIONS "1.56" "1.56.0" "1.55" "1.55.0" "1.54" "1.54.0" "1.53" "1.53.0" "1.52" "1.52.0" "1.51" "1.51.0" "1.50" "1.50.0" "1.49" "1.49.0" "1.48" "1.48.0" "1.47" "1.47.0" "1.46" "1.46.0" "1.45" "1.45.0" "1.44" "1.44.0" "1.42" "1.42.0" "1.41.0" "1.41" "1.40.0" "1.40")
+set(Boost_ADDITIONAL_VERSIONS "1.57" "1.57.0" "1.56" "1.56.0" "1.55" "1.55.0" "1.54" "1.54.0" "1.53" "1.53.0" "1.52" "1.52.0" "1.51" "1.51.0" "1.50" "1.50.0" "1.49" "1.49.0" "1.48" "1.48.0" "1.47" "1.47.0" "1.46" "1.46.0" "1.45" "1.45.0" "1.44" "1.44.0" "1.42" "1.42.0" "1.41.0" "1.41" "1.40.0" "1.40")
 
 # find boost
 find_package(Boost REQUIRED)

diff --git a/docs/download.txt b/docs/download.txt
@@ -2,10 +2,12 @@ Download
 ========
 
 * Windows - latest release
-	* https://github.com/bigartm/bigartm/releases/download/v0.5.6/BigARTM_v0.5.6_win32.7z
-	* https://github.com/bigartm/bigartm/releases/download/v0.5.6/BigARTM_v0.5.6_x64.7z
+	* https://github.com/bigartm/bigartm/releases/download/v0.5.7/BigARTM_v0.5.7_win32.7z
+	* https://github.com/bigartm/bigartm/releases/download/v0.5.7/BigARTM_v0.5.7_x64.7z
 
 * Windows - previous releases
+	* https://github.com/bigartm/bigartm/releases/download/v0.5.6/BigARTM_v0.5.6_win32.7z
+	* https://github.com/bigartm/bigartm/releases/download/v0.5.6/BigARTM_v0.5.6_x64.7z
 	* https://github.com/bigartm/bigartm/releases/download/v0.5.5/BigARTM_v0.5.5_win32.7z
 	* https://github.com/bigartm/bigartm/releases/download/v0.5.5/BigARTM_v0.5.5_x64.7z
 	* https://github.com/bigartm/bigartm/releases/download/v0.5.4/BigARTM_v0.5.4_win32.7z

diff --git a/docs/ref/c_interface.txt b/docs/ref/c_interface.txt
@@ -340,35 +340,42 @@ ArtmDisposeDictionary
 ArtmAddBatch
 ------------
 
-.. c:function::  int ArtmAddBatch(int master_id, int length, const char* batch)
+.. c:function::  int ArtmAddBatch(int master_id, int length, const char* add_batch_args)
 
-   Adds an instance of :ref:`Batch` class to the master component.
+   Adds batch for processing.
 
    :param int master_id: The ID of a master component or a master proxy,
       returned by either :c:func:`ArtmCreateMasterComponent` or :c:func:`ArtmCreateMasterProxy` method.
 
-   :param const_char* batch:
-      Serialized :ref:`Batch` message.
+   :param const_char* add_batch_args:
+      Serialized :ref:`AddBatchArgs` message,
+      describing the arguments of this operation.
 
-   :param int length: The length in bytes of the *batch* message.
+   :param int length: The length in bytes of the *add_batch_args* message.
 
    :return: Returns :c:macro:`ARTM_SUCCESS` value if operation succeeded,
             otherwise returns one of the :ref:`error codes <error-codes>`.
 
    This operation is only allowed when
+   :attr:`MasterComponentConfig.online_batch_processing` is set to *True* and
    :attr:`MasterComponentConfig.modus_operandi` is set to *Local*.
 
 ArtmInvokeIteration
 -------------------
 
-.. c:function::  int ArtmInvokeIteration(int master_id, int iterations_count)
+.. c:function::  int ArtmInvokeIteration(int master_id, int length, const char* invoke_iteration_args)
 
    Invokes several iterations over the collection. 
 
    :param int master_id: The ID of a master component or a master proxy,
       returned by either :c:func:`ArtmCreateMasterComponent` or :c:func:`ArtmCreateMasterProxy` method.
 
-   :param int iterations_count: The number of iterations to perform.
+   :param const_char* invoke_iteration_args:
+      Serialized :ref:`InvokeIterationArgs` message,
+      describing the arguments of this operation.
+
+   :param int length:
+      The length in bytes of the *invoke_iteration_args* message.
 
    :return: Returns :c:macro:`ARTM_SUCCESS` value if operation succeeded,
             otherwise returns one of the :ref:`error codes <error-codes>`.
@@ -421,14 +428,18 @@ ArtmInitializeModel
 ArtmWaitIdle
 ------------
 
-.. c:function::  int ArtmWaitIdle(int master_id, int timeout_milliseconds)
+.. c:function::  int ArtmWaitIdle(int master_id, int length, const char* wait_idle_args)
 
    Awaits for ongoing iterations. 
 
    :param int master_id: The ID of a master component or a master proxy,
       returned by either :c:func:`ArtmCreateMasterComponent` or :c:func:`ArtmCreateMasterProxy` method.
 
-   :param int timeout_milliseconds: Timeout in milliseconds. Pass *timeout = -1* to allow infinite timeout.
+   :param const_char* wait_idle_args:
+      Serialized :ref:`WaitIdleArgs` message,
+      describing the arguments of this operation.
+
+   :param int length: The length in bytes of the *wait_idle_args* message.
 
    :return: Returns :c:macro:`ARTM_SUCCESS` value if operation succeeded,
             otherwise returns one of the :ref:`error codes <error-codes>`.

diff --git a/docs/ref/cpp_client.txt b/docs/ref/cpp_client.txt
@@ -25,15 +25,8 @@ You may append the following options to customize the resulting topic model:
     Currently *cpp_client* does not allow you to customize regularizer weights for different topics
     and for different iterations. This limitation is only related to *cpp_client*,
     and you can simply achieve this by using BigARTM interface (either in Python or in C++).
-  * ``--online_decay`` and ``--online_period`` are the parameters of online algorithm.
-    *online_period* specifies time interval in milliseconds between model synchronization.
-    Once per this time interval the topic model will be re-calculated as follows:
-    *n_wt* counters in the old model are decreased according to *online_decay* coefficient,
-    and increased by *n_wt* counters calculated by all processors since the last model synchronization.
-    The suggested way to choose *online_period* is to measure single iteration time for *cpp_client* in offline mode
-    (e.g. without *online_period* option). Then set *online_period* to ``lambda = 0.25`` of the iteration time.
-    For larger collections it may be more efficient to set *online_period* to ``lambda = 0.1`` of the iteration time,
-    but then remember to update *online_decay* parameter accordingly (``1-lambda`` should be a reasonable value).
+  * ``--update_every`` is a parameter of the online algorithm.
+    When specified, the model will be updated every *update_every* documents.
 
 You may also apply the following optimizations that should not change the resulting model
 
@@ -51,6 +44,9 @@ You may also apply the following optimizations that should not change the result
     This parameter allows you to specify a writable disk location where BigARTM can cache Theta matrix
     between iterations to avoid storing it in main memory.
 
+  * ``--merger_queue_size`` limits the size of the merger queue. Decreasing the size of the queue might
+    reduce memory usage, but may also decrease CPU utilization of the processors.
+
 .. code-block:: bash
 
    >cpp_client --help
@@ -81,13 +77,13 @@ You may also apply the following optimizations that should not change the result
      --paused                              wait for keystroke (allows to attach a
                                            debugger)
      --no_scores                           disable calculation of all scores
-     --online_period arg (=0)              period in milliseconds between model
-                                           synchronization on the online algorithm
-     --online_decay arg (=0.75)            decay coefficient [0..1] for online
-                                           algorithm
+     --update_every arg (=0)               [online algorithm] requests an update
+                                           of the model after update_every
+                                           document
      --parsing_format arg (=0)             parsing format (0 - UCI, 1 - matrix
                                            market)
      --disk_cache_folder arg               disk cache folder
+     --merger_queue_size arg               size of the merger queue
 
    Networking options (experimental):
      --nodes arg                  endpoints of the remote nodes (enables network

diff --git a/docs/ref/cpp_interface.txt b/docs/ref/cpp_interface.txt
@@ -37,7 +37,7 @@ MasterComponent
      Returns mutable configuration of the master component.
      Remember to call :cpp:func:`Reconfigure` to propagate your changes to master component.
 
-  .. cpp:function:: void InvokeIteration(int iterations_count)
+  .. cpp:function:: void InvokeIteration(int iterations_count = 1)
 
      Invokes certain number of iterations.
 

diff --git a/docs/ref/messages.txt b/docs/ref/messages.txt
@@ -90,6 +90,17 @@ of tokens in the Batch.
     optional string name = 1 [default = "@body"];
     repeated int32 token_id = 2;
     repeated int32 token_count = 3;
+    repeated int32 token_offset = 4;
+
+    optional string string_value = 5;
+    optional int64 int_value = 6;
+    optional double double_value = 7;
+    optional string date_value = 8;
+
+    repeated string string_array = 16;
+    repeated int64 int_array = 17;
+    repeated double double_array = 18;
+    repeated string date_array = 19;
   }
 
 
@@ -111,6 +122,7 @@ items in one batch are always processed sequentially.
     repeated string token = 1;
     repeated Item item = 2;
     repeated string class_id = 3;
+    optional string description = 4;
   }
 
 .. attribute:: Batch.token
@@ -127,6 +139,13 @@ items in one batch are always processed sequentially.
    This repeated field must have the same length as :attr:`token`.
    This value is optional; use an empty list to indicate that all tokens belong to the default class.
 
+.. attribute:: Batch.description
+
+   An optional text description of the batch.
+   You may describe, for example, the source of the batch,
+   the preprocessing technique, and the structure of its fields.
+
+
 .. _Stream:
 
 Stream
@@ -899,6 +918,8 @@ Represents a configuration of a perplexity score.
     optional string stream_name = 2 [default = "@global"];
     optional Type model_type = 3 [default = UnigramDocumentModel];
     optional string dictionary_name = 4;
+    optional float theta_sparsity_eps = 5 [default = 1e-37];
+    repeated string theta_sparsity_topic_name = 6;
   }
 
 .. attribute:: PerplexityScoreConfig.field_name
@@ -926,6 +947,9 @@ Represents a result of calculation of a perplexity score.
     optional double raw = 2;
     optional double normalizer = 3;
     optional int32 zero_words = 4;
+    optional double theta_sparsity_value = 5;
+    optional int32 theta_sparsity_zero_topics = 6;
+    optional int32 theta_sparsity_total_topics = 7;
   }
 
 .. attribute:: PerplexityScore.value
@@ -947,6 +971,9 @@ Represents a result of calculation of a perplexity score.
    A number of tokens that have zero probability p(w|t,d) in a document.
    Such tokens are evaluated based on the unigram document model or the unigram collection model.
 
+.. attribute:: PerplexityScore.theta_sparsity_value
+
+   A fraction of zero entries in the theta matrix.
 
 .. _SparsityThetaScoreConfig:
 
@@ -1834,7 +1861,7 @@ Represents an argument of get score operation.
     optional Batch batch = 3;
   }
 
-.. attribute:: GetScore ValueArgs.model_name
+.. attribute:: GetScoreValueArgs.model_name
 
   The name of the model to retrieve the score for.
 
@@ -1847,3 +1874,62 @@ Represents an argument of get score operation.
   The :ref:`Batch` to calculate the score.
   This option is only applicable to cumulative scores.
   When not provided the score will be reported for all batches processed since last :c:func:`ArtmInvokeIteration`.
+
+
+.. _AddBatchArgs:
+
+AddBatchArgs
+============
+
+Represents an argument of :c:func:`ArtmAddBatch` operation.
+
+.. code-block:: bash
+
+  message AddBatchArgs {
+    optional Batch batch = 1;
+    optional int32 timeout_milliseconds = 2;
+  }
+
+.. attribute:: AddBatchArgs.batch
+
+  The :ref:`Batch` to add.
+
+.. attribute:: AddBatchArgs.timeout_milliseconds
+
+  Timeout in milliseconds for this operation.
+
+
+.. _InvokeIterationArgs:
+
+InvokeIterationArgs
+===================
+
+Represents an argument of :c:func:`ArtmInvokeIteration` operation.
+
+.. code-block:: bash
+
+  message InvokeIterationArgs {
+    optional int32 iterations_count = 1 [default = 1];
+  }
+
+.. attribute:: InvokeIterationArgs.iterations_count
+
+  An integer value describing how many iterations to invoke.
+
+
+.. _WaitIdleArgs:
+
+WaitIdleArgs
+============
+
+Represents an argument of :c:func:`ArtmWaitIdle` operation.
+
+.. code-block:: bash
+
+  message WaitIdleArgs {
+    optional int32 timeout_milliseconds = 1 [default = -1];
+  }
+
+.. attribute:: WaitIdleArgs.timeout_milliseconds
+
+  Timeout in milliseconds for this operation.
+  The default value of *-1* requests an infinite timeout.
diff --git a/src/artm/core/common.h b/src/artm/core/common.h
@@ -7,6 +7,7 @@
 #include <functional>
 #include <memory>
 #include <string>
+#include <sstream>
 #include <unordered_map>
 
 #include "boost/uuid/uuid.hpp"
@@ -147,15 +148,30 @@ inline bool make_rpcz_call_no_throw(std::function<void()> f, const std::string&
 class CuckooWatch {
  public:
   explicit CuckooWatch(std::string message)
-      : message_(message), start_(std::chrono::system_clock::now()) {}
+      : message_(message), submessage_(), start_(std::chrono::system_clock::now()), parent_(nullptr) {}
+  CuckooWatch(std::string message, CuckooWatch* parent)
+      : message_(message), submessage_(), start_(std::chrono::system_clock::now()), parent_(parent) {}
+
   ~CuckooWatch() {
     auto delta = (std::chrono::system_clock::now() - start_);
     auto delta_ms = std::chrono::duration_cast<std::chrono::milliseconds>(delta);
-    LOG(INFO) << message_ << " " << delta_ms.count() << " milliseconds.";
+    if (parent_ == nullptr) {
+      std::stringstream ss;
+      ss << delta_ms.count() << "ms in " + message_;
+      if (!submessage_.empty())
+        ss << " [including " << submessage_ << "]";
+      LOG(INFO) << ss.str();
+    } else if (delta_ms.count() > 0) {
+      std::stringstream ss;
+      ss << delta_ms.count() << "ms in " << message_;
+      parent_->submessage_ += ss.str();
+    }
   }
 
  private:
   std::string message_;
+  std::string submessage_;
+  CuckooWatch* parent_;
   std::chrono::time_point<std::chrono::system_clock> start_;
 };
 

diff --git a/src/artm/core/merger.cc b/src/artm/core/merger.cc
@@ -493,17 +493,35 @@ void Merger::SynchronizeModel(const ModelName& model_name, float decay_weight,
     std::shared_ptr<ModelConfig> target_config = target_model_config_.get(name);
 
     // Accumulate counters in topic model with decay coefficient.
-    auto new_ttm = std::make_shared< ::artm::core::TopicModel>(
-      *old_ttm, decay_weight, target_config == nullptr ? current_config : *target_config);
+    std::shared_ptr< ::artm::core::TopicModel> new_ttm;
+    {
+      CuckooWatch cuckoo2("copy&decay, ", &cuckoo);
+      new_ttm = std::make_shared< ::artm::core::TopicModel>(
+        *old_ttm, decay_weight, target_config == nullptr ? current_config : *target_config);
+    }
     target_model_config_.set(name, nullptr);
-    // Apply increment
-    if (inc_ttm != topic_model_inc_.end())
+
+
+    if (inc_ttm != topic_model_inc_.end()) {
+      CuckooWatch cuckoo2("ApplyDiff, ", &cuckoo);
       new_ttm->ApplyDiff(*inc_ttm->second, apply_weight);
+    }
 
-    if (invoke_regularizers)
+    if (invoke_regularizers) {
+      CuckooWatch cuckoo2("InvokePhiRegularizers, ", &cuckoo);
       InvokePhiRegularizers(new_ttm.get());
+    }
+
+    {
+      CuckooWatch cuckoo2("CalcNormalizers, ", &cuckoo);
+      new_ttm->CalcNormalizers();
+    }
+
+    {
+      CuckooWatch cuckoo2("CalcPwt", &cuckoo);
+      new_ttm->CalcPwt();   // calculate pwt matrix
+    }
 
-    new_ttm->CalcNormalizers();
     topic_model_.set(name, new_ttm);
 
     topic_model_inc_.erase(name);
@@ -543,6 +561,7 @@ void Merger::InitializeModel(const InitializeModelArgs& args) {
   }
 
   new_ttm->CalcNormalizers();
+  new_ttm->CalcPwt();   // calculate pwt matrix
   topic_model_.set(args.model_name(), new_ttm);
 }