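// NOTE: web-page capture residue ("Permalink", "Find file", "Copy path",
// contributor notices) preceded the schema and has been turned into this
// comment so the file parses. Original listing: 842 lines (722 sloc), 24.9 KB.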
syntax = "proto2";
// File messages.proto defines all messages that can be transferred into or out of the BigARTM library.
package artm;
// An array of single-precision floating point values.
message FloatArray {
  // The values, stored with packed encoding.
  repeated float value = 1 [packed = true];
}
// An array of boolean values.
message BoolArray {
  // The values, stored with packed encoding.
  repeated bool value = 1 [packed = true];
}
// An array of 32-bit integer values.
message IntArray {
  // The values, stored with packed encoding.
  repeated int32 value = 1 [packed = true];
}
// An array of string values.
message StringArray {
  repeated string value = 1;
}
// Represents a unit of textual information.
message Item {
  optional int32 id = 1;

  // Obsolete in BigARTM v0.8.0. Marked deprecated so that generated code
  // flags remaining uses; the field number stays allocated so that data
  // written by older versions can still be parsed.
  repeated Field field = 2 [deprecated = true];

  optional string title = 3;

  // Token ids of this item.
  // Presumably indices into the token list of the enclosing Batch — TODO confirm.
  repeated int32 token_id = 4;
  repeated float token_weight = 5;

  // Start index of each transaction within token_id.
  // The last element contains the total number of tokens.
  // For regular text this is 0, 1, 2, 3, ..., num_tokens.
  repeated int32 transaction_start_index = 6;

  // Has one element fewer than transaction_start_index.
  // Each element is the id of the transaction typename that the
  // corresponding transaction belongs to.
  repeated int32 transaction_typename_id = 7;
}
// Represents a field within an item.
// Obsolete in BigARTM v0.8.0.
message Field {
  // Field name; defaults to "@body".
  optional string name = 1 [default = "@body"];
  repeated int32 token_id = 2;

  // Obsolete in BigARTM v0.7.2. Marked deprecated so that generated code
  // flags remaining uses; the field number stays allocated for old data.
  repeated int32 token_count = 3 [deprecated = true];

  repeated int32 token_offset = 4;

  // Single typed values.
  optional string string_value = 5;
  optional int64 int_value = 6;
  optional double double_value = 7;
  optional string date_value = 8;

  // NOTE(review): field numbers 9-15 are not used by this message.

  // Arrays of typed values.
  repeated string string_array = 16;
  repeated int64 int_array = 17;
  repeated double double_array = 18;
  repeated string date_array = 19;

  repeated float token_weight = 20;
}
// A set of items.
message Batch {
  repeated string token = 1;
  repeated string class_id = 2;

  // The items contained in this batch.
  repeated Item item = 3;

  optional string description = 4;
  optional string id = 5;

  // String identifiers of each transaction type.
  repeated string transaction_typename = 6;
}
// Settings (name, tau, gamma) of one regularizer; used as the element type
// of RegularizeModelArgs.regularizer_settings.
message RegularizerSettings {
  optional string name = 1;
  optional float tau = 2;
  optional float gamma = 3;
}
// Kinds of regularizers. RegularizerType_Unknown (9999) is the default of
// RegularizerConfig.type.
enum RegularizerType {
  RegularizerType_SmoothSparseTheta = 0;
  RegularizerType_SmoothSparsePhi = 1;
  RegularizerType_DecorrelatorPhi = 2;
  RegularizerType_MultiLanguagePhi = 3;
  RegularizerType_LabelRegularizationPhi = 4;
  RegularizerType_SpecifiedSparsePhi = 5;
  RegularizerType_ImproveCoherencePhi = 6;
  RegularizerType_SmoothPtdw = 7;
  RegularizerType_TopicSelectionTheta = 8;
  RegularizerType_BitermsPhi = 9;
  RegularizerType_HierarchySparsingTheta = 10;
  RegularizerType_TopicSegmentationPtdw = 11;
  RegularizerType_SmoothTimeInTopicsPhi = 12;
  RegularizerType_NetPlsaPhi = 13;

  // Sentinel for "type not specified".
  RegularizerType_Unknown = 9999;
}
// Configuration of a general regularizer.
message RegularizerConfig {
  optional string name = 1;

  // Kind of regularizer; RegularizerType_Unknown unless set explicitly.
  optional RegularizerType type = 2 [default = RegularizerType_Unknown];

  // Opaque bytes; presumably the serialized type-specific config message
  // (e.g. SmoothSparsePhiConfig) — TODO confirm.
  optional bytes config = 3;

  optional float tau = 4;
  optional float gamma = 5;

  // Presumably a JSON alternative to `config` — TODO confirm.
  optional string config_json = 6;
}
// Configuration of a SmoothSparse Theta regularizer.
message SmoothSparseThetaConfig {
  repeated string topic_name = 1;
  repeated float alpha_iter = 2;
  optional TransformConfig transform_config = 3;
  repeated string item_title = 4;

  // One FloatArray per entry.
  // Presumably parallel to item_title — TODO confirm.
  repeated FloatArray item_topic_multiplier = 5;
}
// Configuration of a SmoothSparse Phi regularizer.
message SmoothSparsePhiConfig {
  repeated string topic_name = 1;
  repeated string class_id = 2;
  optional string dictionary_name = 3;
  optional TransformConfig transform_config = 4;
}
// Configuration of a Decorrelator Phi regularizer.
message DecorrelatorPhiConfig {
  repeated string topic_name = 1;
  repeated string class_id = 2;

  // Presumably three parallel arrays describing (first topic, second topic,
  // value) triples — TODO confirm.
  repeated string first_topic_name = 3;
  repeated string second_topic_name = 4;
  repeated float value = 5;
}
// Configuration of a MultiLanguage Phi regularizer.
// Currently has no fields.
message MultiLanguagePhiConfig {}
// Configuration of a LabelRegularization Phi regularizer.
message LabelRegularizationPhiConfig {
  repeated string topic_name = 1;
  repeated string class_id = 2;
  optional string dictionary_name = 3;
}
// Configuration of a SpecifiedSparse Phi regularizer.
message SpecifiedSparsePhiConfig {
  // Selects what is sparsed: topics or tokens.
  enum SparseMode {
    SparseTopics = 0;
    SparseTokens = 1;
  }

  repeated string topic_name = 1;

  // Defaults to "@default_class".
  optional string class_id = 2 [default = "@default_class"];

  // Defaults to 20.
  optional int32 max_elements_count = 3 [default = 20];

  // Defaults to 0.99.
  optional float probability_threshold = 4 [default = 0.99];

  // Defaults to SparseTopics.
  optional SparseMode mode = 5 [default = SparseTopics];
}
// Configuration of an ImproveCoherence Phi regularizer.
message ImproveCoherencePhiConfig {
  repeated string topic_name = 1;
  repeated string class_id = 2;
  optional string dictionary_name = 3;
}
// Configuration of a Smooth Ptdw regularizer.
message SmoothPtdwConfig {
  // Available smoothing kinds. Numbering starts at 1.
  enum SmoothType {
    MovingAverage = 1;
    MovingProduct = 2;
  }

  // Defaults to MovingAverage.
  optional SmoothType type = 1 [default = MovingAverage];

  // NOTE(review): field number 2 is not used by this message.

  // Defaults to 10.
  optional int32 window = 3 [default = 10];

  // Defaults to 1.0.
  optional float threshold = 4 [default = 1.0];
}
// Configuration of a TopicSelection Theta regularizer.
message TopicSelectionThetaConfig {
  repeated string topic_name = 1;

  // User-counted value = n / (n_t * |T|).
  repeated float topic_value = 2;

  repeated float alpha_iter = 3;
}
// Configuration of a Biterms Phi regularizer.
message BitermsPhiConfig {
  repeated string topic_name = 1;
  repeated string class_id = 2;
  optional string dictionary_name = 3;
}
// Configuration of a HierarchySparsing Theta regularizer.
message HierarchySparsingThetaConfig {
  repeated string topic_name = 1;

  // User-counted value p(supertopic); optional.
  repeated float parent_topic_proportion = 2;

  repeated float alpha_iter = 3;
}
// Configuration of a Topic Segmentation regularizer.
message TopicSegmentationPtdwConfig {
  repeated string background_topic_names = 1;

  // NOTE(review): field number 2 is not used by this message.

  // Defaults to 10.
  optional int32 window = 3 [default = 10];

  // Defaults to 0.5.
  optional float threshold = 4 [default = 0.5];
}
// Configuration of a SmoothTimeInTopics Phi regularizer.
message SmoothTimeInTopicsPhiConfig {
  repeated string topic_name = 1;

  // Defaults to "@default_class".
  optional string class_id = 2 [default = "@default_class"];
}
// Configuration of a NetPlsa Phi regularizer.
message NetPlsaPhiConfig {
  optional string class_id = 1;
  repeated string topic_name = 2;

  // Vertex description.
  repeated string vertex_name = 3;
  repeated float vertex_weight = 4;

  // Edge description. Presumably three parallel arrays whose indices refer
  // to entries of vertex_name — TODO confirm.
  repeated int32 first_vertex_index = 5;
  repeated int32 second_vertex_index = 6;
  repeated float edge_weight = 7;

  // Defaults to false.
  optional bool symmetric_edge_weights = 8 [default = false];
}
// Describes a transform function f(x).
message TransformConfig {
  enum TransformType {
    // f(x) = ln(x)
    Logarithm = 0;
    // f(x) = a * x^n
    Polynomial = 1;
    // f(x) = 1
    Constant = 2;
  }

  // Which function to apply; defaults to Constant.
  optional TransformType type = 1 [default = Constant];

  // Exponent n of the Polynomial form; defaults to 1.
  optional float n = 2 [default = 1];

  // Multiplier a of the Polynomial form; defaults to 1.
  optional float a = 3 [default = 1];
}
// Kinds of scores. ScoreType_Unknown (9999) is the default of
// ScoreConfig.type and ScoreData.type.
enum ScoreType {
  ScoreType_Perplexity = 0;
  ScoreType_SparsityTheta = 1;
  ScoreType_SparsityPhi = 2;
  ScoreType_ItemsProcessed = 3;
  ScoreType_TopTokens = 4;
  ScoreType_ThetaSnippet = 5;
  ScoreType_TopicKernel = 6;
  ScoreType_TopicMassPhi = 7;
  ScoreType_ClassPrecision = 8;
  ScoreType_PeakMemory = 9;
  ScoreType_BackgroundTokensRatio = 10;

  // Sentinel for "type not specified".
  ScoreType_Unknown = 9999;
}
// Configuration of a general score.
message ScoreConfig {
  optional string name = 1;

  // Kind of score; ScoreType_Unknown unless set explicitly.
  optional ScoreType type = 2 [default = ScoreType_Unknown];

  // Opaque bytes; presumably the serialized type-specific config message
  // (e.g. PerplexityScoreConfig) — TODO confirm.
  optional bytes config = 3;

  optional string model_name = 4;

  // Presumably a JSON alternative to `config` — TODO confirm.
  optional string config_json = 5;
}
// Result of a score calculation.
message ScoreData {
  optional string name = 1;

  // Kind of score; ScoreType_Unknown unless set explicitly.
  optional ScoreType type = 2 [default = ScoreType_Unknown];

  // Opaque bytes; presumably the serialized type-specific score message
  // (e.g. PerplexityScore) — TODO confirm.
  optional bytes data = 3;

  // Presumably a JSON alternative to `data` — TODO confirm.
  optional string data_json = 4;
}
// A list of score results.
message ScoreArray {
  repeated ScoreData score = 1;
}
// Configuration of a perplexity score.
message PerplexityScoreConfig {
  enum Type {
    UnigramDocumentModel = 0;
    UnigramCollectionModel = 1;
  }

  // Defaults to UnigramDocumentModel.
  optional Type model_type = 1 [default = UnigramDocumentModel];

  optional string dictionary_name = 2;
  repeated string class_id = 3;
  repeated string transaction_typename = 4;
}
// Result of calculating a perplexity score.
message PerplexityScore {
  // Per-transaction-typename breakdown.
  message TransactionNameInfo {
    optional string transaction_typename = 1;

    // Here and below many small values are summed,
    // so the double type is required.
    optional double raw = 2;
    optional double normalizer = 3;
    optional int64 zero_words = 4;
  }

  // General perplexity value, for all cases.
  optional float value = 1;

  // Fields for the case of all transaction typenames.
  optional double raw = 2;
  optional double normalizer = 3;
  optional int64 zero_words = 4;

  // Field for the case of custom transaction types.
  repeated TransactionNameInfo transaction_typename_info = 5;
}
// Configuration of a theta sparsity score.
message SparsityThetaScoreConfig {
  // NOTE(review): field numbers 1 and 2 are not used by this message.

  // Defaults to 1e-37.
  optional float eps = 3 [default = 1e-37];

  repeated string topic_name = 4;
}
// Result of calculating a theta sparsity score.
message SparsityThetaScore {
  optional float value = 1;
  optional int64 zero_topics = 2;
  optional int64 total_topics = 3;
}
// Configuration of a phi sparsity score.
message SparsityPhiScoreConfig {
  // Defaults to 1e-37.
  optional float eps = 1 [default = 1e-37];

  // Defaults to "@default_class".
  optional string class_id = 2 [default = "@default_class"];

  repeated string topic_name = 3;
}
// Result of calculating a phi sparsity score.
message SparsityPhiScore {
  optional float value = 1;
  optional int64 zero_tokens = 2;
  optional int64 total_tokens = 3;
}
// Configuration of an items processed score.
// Currently has no fields.
message ItemsProcessedScoreConfig {}
// Result of calculating an items processed score.
// Every field defaults to 0.
message ItemsProcessedScore {
  optional int32 value = 1 [default = 0];
  optional int32 num_batches = 2 [default = 0];
  optional float token_weight = 3 [default = 0];
  optional float token_weight_in_effect = 4 [default = 0];
}
// Configuration of a top tokens score.
message TopTokensScoreConfig {
  // Defaults to 10.
  optional int32 num_tokens = 1 [default = 10];

  // Defaults to "@default_class".
  optional string class_id = 2 [default = "@default_class"];

  repeated string topic_name = 3;
  optional string cooccurrence_dictionary_name = 4;

  // Defaults to 1e-37.
  optional float eps = 5 [default = 1e-37];
}
// Result of calculating a top tokens score.
message TopTokensScore {
  optional int32 num_entries = 1;

  // Presumably parallel arrays with one element per reported entry
  // — TODO confirm.
  repeated string topic_name = 2;
  repeated int32 topic_index = 3;
  repeated string token = 4;
  repeated float weight = 5;

  repeated float coherence = 6;
  optional float average_coherence = 7;
}
// Configuration of a theta snippet score.
message ThetaSnippetScoreConfig {
  // NOTE(review): field numbers 1-3 are not used by this message.

  // Defaults to 10.
  optional int32 num_items = 4 [default = 10];
}
// Result of calculating a theta snippet score.
message ThetaSnippetScore {
  repeated int32 item_id = 1;

  // One FloatArray per entry; presumably parallel to item_id — TODO confirm.
  repeated FloatArray values = 2;
}
// Configuration of a topic kernel score.
message TopicKernelScoreConfig {
  // Defaults to 1e-37.
  optional float eps = 1 [default = 1e-37];

  // Defaults to "@default_class".
  optional string class_id = 2 [default = "@default_class"];

  repeated string topic_name = 3;

  // Defaults to 0.1.
  optional float probability_mass_threshold = 4 [default = 0.1];

  optional string cooccurrence_dictionary_name = 5;
}
// Result of calculating a topic kernel score.
message TopicKernelScore {
  // Repeated kernel characteristics.
  // Presumably one element per topic — TODO confirm.
  repeated float kernel_size = 1;
  repeated float kernel_purity = 2;
  repeated float kernel_contrast = 3;

  // Averages of the three characteristics above.
  optional float average_kernel_size = 4;
  optional float average_kernel_purity = 5;
  optional float average_kernel_contrast = 6;

  repeated float coherence = 7;
  optional float average_coherence = 8;

  // One StringArray per entry.
  repeated StringArray kernel_tokens = 9;

  repeated string topic_name = 10;
}
// Configuration of a "topic part in nwt" (topic mass phi) score.
message TopicMassPhiScoreConfig {
  // Defaults to 1e-37.
  optional float eps = 1 [default = 1e-37];

  repeated string class_id = 2;
  repeated string topic_name = 3;
}
// Result of calculating a "topic part in nwt" (topic mass phi) score.
message TopicMassPhiScore {
  optional float value = 1;
  repeated string topic_name = 2;
  repeated float topic_ratio = 3;
  repeated float topic_mass = 4;
}
// Configuration of a class precision score.
// Currently has no fields.
message ClassPrecisionScoreConfig {}
// Result of calculating a class precision score.
message ClassPrecisionScore {
  optional float value = 1;
  optional float error = 2;
  optional float total = 3;
}
// Configuration of a peak memory score.
// Currently has no fields.
message PeakMemoryScoreConfig {}
// Result of calculating a peak memory score.
message PeakMemoryScore {
  // NOTE(review): the unit is not stated in this file — TODO confirm.
  optional int64 value = 1;
}
// Configuration of a background tokens ratio score.
message BackgroundTokensRatioScoreConfig {
  // Defaults to 0.5.
  optional float delta_threshold = 1 [default = 0.5];

  // Defaults to true.
  optional bool save_tokens = 2 [default = true];

  // Defaults to true.
  optional bool direct_kl = 3 [default = true];

  // Defaults to "@default_class".
  optional string class_id = 4 [default = "@default_class"];
}
// Result of calculating a background tokens ratio score.
message BackgroundTokensRatioScore {
  optional float value = 1;
  repeated string token = 2;
}
// Represents a topic model (or an operation on a topic model).
message TopicModel {
  // Defaults to "@model".
  optional string name = 1 [default = "@model"];

  optional int32 num_topics = 2;
  repeated string topic_name = 3;
  repeated string token = 4;

  // One FloatArray per entry.
  repeated FloatArray token_weights = 5;

  repeated string class_id = 6;

  // NOTE(review): field number 7 is not used by this message.

  // One IntArray per entry.
  repeated IntArray topic_indices = 8;

  // NNZ for sparse retrieval.
  optional int64 num_values = 9;
}
// Represents a theta matrix.
message ThetaMatrix {
  // NOTE(review): field number 1 is not used by this message.

  repeated int32 item_id = 2;

  // One FloatArray per entry.
  repeated FloatArray item_weights = 3;

  repeated string topic_name = 4;
  optional int32 num_topics = 5;
  repeated string item_title = 6;

  // One IntArray per entry.
  repeated IntArray topic_indices = 7;

  // NNZ for sparse retrieval.
  optional int64 num_values = 8;
}
// Configuration of a collection parser.
message CollectionParserConfig {
  // Supported input formats.
  enum CollectionFormat {
    BagOfWordsUci = 0;
    MatrixMarket = 1;
    VowpalWabbit = 2;
  }

  // Batch naming schemes.
  enum BatchNameType {
    Guid = 0;
    Code = 1;
  }

  // Defaults to BagOfWordsUci.
  optional CollectionFormat format = 1 [default = BagOfWordsUci];

  optional string docword_file_path = 2;
  optional string vocab_file_path = 3;
  optional string target_folder = 4;

  // Defaults to 1000.
  optional int32 num_items_per_batch = 5 [default = 1000];

  // Defaults to true.
  optional bool use_unity_based_indices = 6 [default = true];

  // Defaults to Guid.
  optional BatchNameType name_type = 7 [default = Guid];

  optional int32 num_threads = 8;
  repeated string class_id = 9;

  // Co-occurrence related file paths.
  optional string cooc_tf_file_path = 10;
  optional string cooc_df_file_path = 11;
  optional string ppmi_tf_file_path = 12;
  optional string ppmi_df_file_path = 13;

  // Co-occurrence gathering switches.
  optional bool gather_cooc_tf = 14;
  optional bool gather_cooc_df = 15;
  optional bool gather_cooc = 16;

  // Defaults to 10.
  optional int32 cooc_window_width = 17 [default = 10];

  // Defaults to 1.
  optional int32 cooc_min_tf = 18 [default = 1];

  // Defaults to 1.
  optional int32 cooc_min_df = 19 [default = 1];
}
// Miscellaneous statistics produced by the collection parser.
message CollectionParserInfo {
  optional int64 num_items = 1;
  optional int64 num_batches = 2;
  optional int64 dictionary_size = 3;
  optional int64 num_tokens = 4;
  optional float total_token_weight = 5;
}
// Configuration of a cooccurrence collector.
message CooccurrenceCollectorConfig {
  // Gathering switches.
  optional bool gather_cooc = 1;
  optional bool gather_cooc_tf = 2;
  optional bool gather_cooc_df = 3;

  // NOTE(review): the name is misspelled ("symetric"); it is kept as is
  // because renaming would change the generated accessors.
  optional bool use_symetric_cooc = 4;

  optional bool calculate_ppmi_tf = 5;
  optional bool calculate_ppmi_df = 6;

  // File and folder paths.
  optional string vw_file_path = 7;
  optional string vocab_file_path = 8;
  optional string target_folder = 9;
  optional string cooc_tf_file_path = 10;
  optional string cooc_df_file_path = 11;
  optional string ppmi_tf_file_path = 12;
  optional string ppmi_df_file_path = 13;

  // Defaults to 10.
  optional int32 cooc_window_width = 14 [default = 10];

  // Defaults to 1.
  optional int32 cooc_min_tf = 15 [default = 1];

  // Defaults to 1.
  optional int32 cooc_min_df = 16 [default = 1];

  optional int32 max_num_of_open_files = 17;

  // Defaults to 1000.
  optional int32 num_items_per_batch = 18 [default = 1000];

  optional int32 num_of_cpu = 19;
  optional int64 total_num_of_pairs = 20;
  optional int32 total_num_of_documents = 21;
  repeated string class_id = 22;
}
// Arguments of the 'initialize model' operation.
message InitializeModelArgs {
  optional string model_name = 1;
  optional string dictionary_name = 2;

  // NOTE(review): field number 3 is not used by this message.

  repeated string topic_name = 4;

  // Defaults to -1.
  optional int32 seed = 5 [default = -1];
}
// Represents a static dictionary.
message DictionaryData {
  optional string name = 1;
  repeated string token = 2;
  repeated string class_id = 3;

  // For scores and regularizers.
  repeated float token_value = 4;

  // Token frequency.
  repeated float token_tf = 5;

  // Number of document occurrences.
  repeated float token_df = 6;

  repeated int32 cooc_first_index = 7;
  repeated int32 cooc_second_index = 8;
  repeated float cooc_value = 9;

  // Number of documents in the collection.
  optional int64 num_items_in_collection = 10;

  // These two arrays should be either empty, or have the same size as
  // cooc_value and be consistent with the cooc indices.
  repeated float cooc_tf = 11;
  repeated float cooc_df = 12;
}
// Arguments of the 'filter dictionary' operation.
message FilterDictionaryArgs {
  optional string dictionary_name = 1;
  optional string dictionary_target_name = 2;
  optional string class_id = 3;

  // Bounds on df, df rate and tf.
  optional float min_df = 4;
  optional float max_df = 5;
  optional float min_df_rate = 6;
  optional float max_df_rate = 7;
  optional float min_tf = 8;
  optional float max_tf = 9;

  optional int64 max_dictionary_size = 10;

  // Defaults to false.
  optional bool recalculate_value = 11 [default = false];
}
// Arguments of the 'gather dictionary' operation.
message GatherDictionaryArgs {
  optional string dictionary_target_name = 1;
  optional string data_path = 2;
  optional string cooc_file_path = 3;
  optional string vocab_file_path = 4;

  // Defaults to false.
  optional bool symmetric_cooc_values = 5 [default = false];

  repeated string batch_path = 6;
}
// Arguments of the 'get dictionary' operation.
message GetDictionaryArgs {
  optional string dictionary_name = 1;
}
// Layout of a retrieved matrix; used by GetTopicModelArgs.matrix_layout and
// GetThetaMatrixArgs.matrix_layout.
enum MatrixLayout {
  MatrixLayout_Dense = 0;
  MatrixLayout_Sparse = 1;
}
// Arguments of the 'get topic model' operation.
message GetTopicModelArgs {
  optional string model_name = 1;
  repeated string topic_name = 2;
  repeated string token = 3;
  repeated string class_id = 4;

  // Deprecated, use matrix_layout. Marked with the `deprecated` option so
  // that generated code flags remaining uses; the wire format is unchanged.
  optional bool use_sparse_format = 5 [deprecated = true];

  // Defaults to 1e-37.
  optional float eps = 6 [default = 1e-37];

  // NOTE(review): field number 7 is not used by this message.

  // Requested layout; defaults to MatrixLayout_Dense.
  optional MatrixLayout matrix_layout = 8 [default = MatrixLayout_Dense];
}
// Arguments of the 'get theta matrix' operation.
message GetThetaMatrixArgs {
  // NOTE(review): field numbers 1, 2, 4 and 5 are not used by this message.

  repeated string topic_name = 3;

  // Deprecated, use matrix_layout. Marked with the `deprecated` option so
  // that generated code flags remaining uses; the wire format is unchanged.
  optional bool use_sparse_format = 6 [deprecated = true];

  // Defaults to 1e-37.
  optional float eps = 7 [default = 1e-37];

  // Requested layout; defaults to MatrixLayout_Dense.
  optional MatrixLayout matrix_layout = 8 [default = MatrixLayout_Dense];
}
// Arguments of the 'get score value' operation.
message GetScoreValueArgs {
  // NOTE(review): field number 1 is not used by this message.
  optional string score_name = 2;
}
// Arguments of the 'get score array' operation.
message GetScoreArrayArgs {
  // NOTE(review): field number 1 is not used by this message.
  optional string score_name = 2;
}
// Arguments of the 'export model' operation.
message ExportModelArgs {
  optional string file_name = 1;
  optional string model_name = 2;
}
// Arguments of the 'import model' operation.
message ImportModelArgs {
  optional string file_name = 1;
  optional string model_name = 2;
}
// Arguments of the 'export score tracker' operation.
message ExportScoreTrackerArgs {
  optional string file_name = 1;
}
// Arguments of the 'import score tracker' operation.
message ImportScoreTrackerArgs {
  optional string file_name = 1;
}
// Arguments of the 'attach model' operation.
message AttachModelArgs {
  optional string model_name = 1;
}
// Kinds of theta matrix output; used by ProcessBatchesArgs.theta_matrix_type
// and TransformMasterModelArgs.theta_matrix_type.
enum ThetaMatrixType {
  ThetaMatrixType_None = 0;
  ThetaMatrixType_Dense = 1;
  ThetaMatrixType_Sparse = 2;
  ThetaMatrixType_Cache = 3;
  ThetaMatrixType_DensePtdw = 4;
  ThetaMatrixType_SparsePtdw = 5;
}
// Arguments of the 'process batches' operation.
// NOTE(review): field numbers 5, 12, 13 and 16 are not used by this message.
message ProcessBatchesArgs {
  optional string nwt_target_name = 1;
  repeated string batch_filename = 2;
  optional string pwt_source_name = 3;

  // Defaults to 10.
  optional int32 num_document_passes = 4 [default = 10];

  // Presumably parallel arrays (regularizer name and its tau) — TODO confirm.
  repeated string regularizer_name = 6;
  repeated float regularizer_tau = 7;

  // Presumably parallel arrays (class id and its weight) — TODO confirm.
  repeated string class_id = 8;
  repeated float class_weight = 9;

  // Defaults to false.
  optional bool reuse_theta = 10 [default = false];

  // Defaults to true.
  optional bool opt_for_avx = 11 [default = true];

  // Defaults to ThetaMatrixType_Cache.
  optional ThetaMatrixType theta_matrix_type = 14 [default = ThetaMatrixType_Cache];

  repeated float batch_weight = 15;
  optional string predict_class_id = 17;

  // Batches passed inline, in addition to batch_filename.
  repeated Batch batch = 18;

  // Defaults to false.
  optional bool use_random_theta = 19 [default = false];

  repeated string topic_name = 20;

  // Presumably parallel arrays (typename and its weight) — TODO confirm.
  repeated string transaction_typename = 21;
  repeated float transaction_weight = 22;

  // Defaults to true.
  optional bool reset_nwt = 23 [default = true];
}
// Result of the 'process batches' operation.
message ProcessBatchesResult {
  repeated ScoreData score_data = 1;
  optional ThetaMatrix theta_matrix = 2;
}
// Arguments of the 'merge model' operation.
message MergeModelArgs {
  optional string nwt_target_name = 1;

  // Presumably parallel arrays (source name and its weight) — TODO confirm.
  repeated string nwt_source_name = 2;
  repeated float source_weight = 3;

  repeated string topic_name = 4;
  optional string dictionary_name = 5;
}
// Arguments of the 'regularize model' operation.
message RegularizeModelArgs {
  optional string rwt_target_name = 1;
  optional string pwt_source_name = 2;
  optional string nwt_source_name = 3;

  // Settings (name, tau, gamma) of each regularizer.
  repeated RegularizerSettings regularizer_settings = 4;
}
// Arguments of the 'normalize model' operation.
message NormalizeModelArgs {
  optional string pwt_target_name = 1;
  optional string nwt_source_name = 2;
  optional string rwt_source_name = 3;
}
// Arguments of the 'import dictionary' operation.
message ImportDictionaryArgs {
  optional string file_name = 1;
  optional string dictionary_name = 2;
}
// Arguments of the 'export dictionary' operation.
message ExportDictionaryArgs {
  optional string file_name = 1;
  optional string dictionary_name = 2;
}
// Arguments of the 'duplicate master component' operation.
// Currently has no fields.
message DuplicateMasterComponentArgs {}
// Arguments of the 'get master component info' operation.
// Currently has no fields.
message GetMasterComponentInfoArgs {}
// Describes a master component: its config plus summaries of its
// regularizers, scores, dictionaries, models, cache entries and batches.
// NOTE(review): field numbers 1 and 8 are not used by this message.
message MasterComponentInfo {
  // Summary of one regularizer.
  message RegularizerInfo {
    optional string name = 1;
    optional string type = 2;
  }

  // Summary of one score.
  message ScoreInfo {
    optional string name = 1;
    optional string type = 2;
  }

  // Summary of one dictionary.
  message DictionaryInfo {
    optional string name = 1;
    optional int64 num_entries = 2;
    optional int64 byte_size = 3;
  }

  // Summary of one batch.
  message BatchInfo {
    optional string name = 1;
    optional int32 num_items = 2;
    optional int32 num_tokens = 3;
  }

  // Summary of one model.
  message ModelInfo {
    optional string name = 1;
    optional string type = 2;
    optional int32 num_topics = 3;
    optional int32 num_tokens = 4;
    optional int64 byte_size = 5;
  }

  // Summary of one cache entry.
  message CacheEntryInfo {
    optional string key = 1;
    // NOTE(review): int32 here, whereas DictionaryInfo.byte_size and
    // ModelInfo.byte_size are int64.
    optional int32 byte_size = 2;
  }

  optional MasterModelConfig config = 2;
  repeated RegularizerInfo regularizer = 3;
  repeated ScoreInfo score = 4;
  repeated DictionaryInfo dictionary = 5;
  repeated ModelInfo model = 6;
  repeated CacheEntryInfo cache_entry = 7;
  optional int32 processor_queue_size = 9;
  repeated BatchInfo batch = 10;
  optional int32 num_processors = 11;
}
// Arguments of the 'import batches' operation.
message ImportBatchesArgs {
  // NOTE(review): field numbers 1 and 2 are not used by this message.
  repeated Batch batch = 3;
}
// Arguments of the 'await operation' call.
message AwaitOperationArgs {
  // Timeout in milliseconds; defaults to -1.
  // Presumably -1 means "no timeout" — TODO confirm.
  optional int32 timeout_milliseconds = 1 [default = -1];
}
// Configuration of a master model.
// NOTE(review): field numbers 12 and 14 are not used by this message.
message MasterModelConfig {
  repeated string topic_name = 1;

  // Presumably parallel arrays (class id and its weight) — TODO confirm.
  repeated string class_id = 2;
  repeated float class_weight = 3;

  repeated ScoreConfig score_config = 4;
  repeated RegularizerConfig regularizer_config = 5;
  optional int32 num_processors = 6;

  // Defaults to "pwt".
  optional string pwt_name = 7 [default = "pwt"];

  // Defaults to "nwt".
  optional string nwt_name = 8 [default = "nwt"];

  optional int32 num_document_passes = 9;

  // Defaults to false.
  optional bool reuse_theta = 10 [default = false];

  // Defaults to true.
  optional bool opt_for_avx = 11 [default = true];

  optional string disk_cache_path = 13;

  // Defaults to false.
  optional bool cache_theta = 15 [default = false];

  // Defaults to the empty string.
  optional string ptd_name = 16 [default = ""];

  // Presumably parallel arrays (typename and its weight) — TODO confirm.
  repeated string transaction_typename = 17;
  repeated float transaction_weight = 18;

  optional int32 parent_master_model_id = 19;

  // Defaults to 1.0.
  optional float parent_master_model_weight = 20 [default = 1.0];
}
// Arguments of the 'fit offline' operation of a master model.
message FitOfflineMasterModelArgs {
  // Presumably parallel arrays (batch file and its weight) — TODO confirm.
  repeated string batch_filename = 1;
  repeated float batch_weight = 2;

  // Defaults to 1.
  optional int32 num_collection_passes = 3 [default = 1];

  optional string batch_folder = 4;

  // Defaults to true.
  optional bool reset_nwt = 5 [default = true];
}
// Arguments of the 'fit online' operation of a master model.
message FitOnlineMasterModelArgs {
  // Presumably parallel arrays (batch file and its weight) — TODO confirm.
  repeated string batch_filename = 1;
  repeated float batch_weight = 2;

  // Presumably parallel arrays describing each model update — TODO confirm.
  repeated int32 update_after = 3;
  repeated float apply_weight = 4;
  repeated float decay_weight = 5;

  // Defaults to false.
  optional bool asynchronous = 6 [default = false];
}
// Arguments of the 'transform' operation of a master model.
message TransformMasterModelArgs {
  // Input batches: passed inline and/or by file name.
  repeated Batch batch = 1;
  repeated string batch_filename = 2;

  // Defaults to ThetaMatrixType_Dense.
  optional ThetaMatrixType theta_matrix_type = 3 [default = ThetaMatrixType_Dense];

  optional string predict_class_id = 4;
}
// Arguments of the 'configure logging' operation.
message ConfigureLoggingArgs {
  // If specified, logfiles are written into this directory instead of the
  // default logging directory.
  optional string log_dir = 1;

  // Messages logged at a lower level than this don't actually get logged
  // anywhere.
  optional int32 minloglevel = 2;

  // Log messages at a level >= this flag are automatically sent to stderr
  // in addition to log files.
  optional int32 stderrthreshold = 3;

  // Log messages go to stderr instead of logfiles.
  optional bool logtostderr = 4;

  // Color messages logged to stderr (if supported by terminal).
  optional bool colorlogtostderr = 5;

  // Log messages go to stderr in addition to logfiles.
  optional bool alsologtostderr = 6;

  // Buffer log messages for at most this many seconds.
  optional int32 logbufsecs = 7;

  // Buffer log messages logged at this level or lower
  // (-1 means don't buffer; 0 means buffer INFO only; ...).
  optional int32 logbuflevel = 8;

  // Approximate maximum log file size (in MB).
  // A value of 0 will be silently overridden to 1.
  optional int32 max_log_size = 9;

  // Stop attempting to log to disk if the disk is full.
  optional bool stop_logging_if_full_disk = 10;
}
// Arguments of the 'clear theta cache' operation.
// Currently has no fields.
message ClearThetaCacheArgs {
}
// Arguments of the 'clear score cache' operation.
// Currently has no fields.
message ClearScoreCacheArgs {
}
// Arguments of the 'clear score array cache' operation.
// Currently has no fields.
message ClearScoreArrayCacheArgs {
}