Skip to content

Commit

Permalink
optimize relative coefs and add test (#899)
Browse files Browse the repository at this point in the history
  • Loading branch information
MelLain committed Apr 1, 2018
1 parent 258df36 commit fa4e2c1
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 35 deletions.
59 changes: 24 additions & 35 deletions src/artm/core/phi_matrix_operations.cc
Original file line number Diff line number Diff line change
Expand Up @@ -277,11 +277,7 @@ void PhiMatrixOperations::InvokePhiRegularizers(
continue;
}

// count n and r_i for relative regularization, if necessary
// prepare next structure with parameters:
// pair of pairs, first pair --- n and n_t, second one --- r_i and r_it
std::unordered_map<core::ClassId, std::pair<std::pair<double, std::vector<float> >,
std::pair<double, std::vector<float> > > > parameters;
std::unordered_map<ClassId, std::vector<float>> relative_coefficients;
std::vector<bool> topics_to_regularize;

if (relative_reg) {
Expand Down Expand Up @@ -311,17 +307,17 @@ void PhiMatrixOperations::InvokePhiRegularizers(
topics_to_regularize.assign(topic_size, true);
}

std::vector<float> r_it = std::vector<float>(topic_size, 0.0f);
std::vector<float> coefficients = std::vector<float>(topic_size, 0.0f);
for (const auto& class_id : class_ids) {
auto iter = n_t_all.find(NormalizerKey(class_id, TransactionType(class_id)));
if (iter != n_t_all.end()) {
double n = 0.0;
double r_i = 0.0;
std::vector<float> r_it;
std::vector<float> n_t = iter->second;

for (int topic_id = 0; topic_id < topic_size; ++topic_id) {
if (!topics_to_regularize[topic_id]) {
r_it.push_back(-1.0f);
continue;
}
n += n_t[topic_id];
Expand All @@ -335,19 +331,21 @@ void PhiMatrixOperations::InvokePhiRegularizers(
r_it_current += fabs(local_r_wt.get(token_id, topic_id));
}

r_it.push_back(r_it_current);
r_it[topic_id] = r_it_current;
r_i += r_it_current;
}

auto pair_n = std::pair<double, std::vector<float> >(n, n_t);
auto pair_r = std::pair<double, std::vector<float> >(r_i, r_it);
auto pair_data = std::pair<std::pair<double, std::vector<float> >,
std::pair<double, std::vector<float> > >(pair_n, pair_r);
for (int topic_id = 0; topic_id < topic_size; ++topic_id) {
if (!topics_to_regularize[topic_id]) {
continue;
}

auto pair_last = std::pair<core::ClassId,
std::pair<std::pair<double, std::vector<float> >,
std::pair<double, std::vector<float> > > >(iter->first.class_id(), pair_data);
parameters.insert(pair_last);
float gamma = reg_iterator->gamma();
coefficients[topic_id] = gamma * (n_t[topic_id] / r_it[topic_id]) +
(1 - gamma) * static_cast<float>(n / r_i);
}

relative_coefficients.insert(std::make_pair(iter->first.class_id(), coefficients));
} else {
LOG(WARNING) << "No class_id " << class_id << " in model";
}
Expand All @@ -356,29 +354,20 @@ void PhiMatrixOperations::InvokePhiRegularizers(

for (int token_id = 0; token_id < token_size; ++token_id) {
const auto& class_id = n_wt.token(token_id).class_id;
auto iter = parameters.find(class_id);
if (relative_reg) {
if (iter == parameters.end()) {
LOG(WARNING) << "No relative coefficients parameters for class_id " << class_id;
continue;
}
auto iter = relative_coefficients.find(class_id);

if (relative_reg && iter == relative_coefficients.end()) {
LOG(WARNING) << "No relative coefficients were provided for class_id " << class_id;
continue;
}
// ToDo (MelLain): move this loop outside the outer one
for (int topic_id = 0; topic_id < topic_size; ++topic_id) {
float coefficient = 1.0f;
if (relative_reg) {
if (!topics_to_regularize[topic_id]) {
continue;
}

float gamma = reg_iterator->gamma();
float n_t = iter->second.first.second[topic_id];
double n = iter->second.first.first;
float r_it = iter->second.second.second[topic_id];
double r_i = iter->second.second.first;
coefficient = gamma * (n_t / r_it) + (1 - gamma) * static_cast<float>(n / r_i);
for (int topic_id = 0; topic_id < topic_size; ++topic_id) {
if (relative_reg && !topics_to_regularize[topic_id]) {
continue;
}

// update global r_wt using coefficient and tau
float coefficient = relative_reg ? iter->second[topic_id] : 1.0f;
float increment = coefficient * tau * local_r_wt.get(token_id, topic_id);
r_wt->increase(token_id, topic_id, increment);
}
Expand Down
71 changes: 71 additions & 0 deletions src/artm_tests/regularizers_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,74 @@ TEST(Regularizers, NetPlsa) {
ASSERT_NEAR(real_values[i], expected_values[i], 1.0e-3);
}
}

// artm_tests.exe --gtest_filter=Regularizers.RelativeRegularization
//
// Regression test for relative regularization of Phi: fits a model with a
// SmoothSparsePhi regularizer that has both tau and gamma set, and checks the
// SparsityPhi score after every offline pass against stored reference values.
TEST(Regularizers, RelativeRegularization) {
  const int nTopics = 500;
  const int nTokens = 500;
  const int nDocs = 100;

  // generate batch: a dictionary of nTokens tokens named "token<i>"
  auto batch = std::make_shared< ::artm::Batch>();
  batch->set_id(artm::test::Helpers::getUniqueString());

  for (int i = 0; i < nTokens; i++) {
    std::stringstream str;
    str << "token" << i;
    batch->add_token(str.str());
  }

  // every item references each token id once, with weight 1.0
  for (int i = 0; i < nDocs; ++i) {
    artm::Item* item = batch->add_item();
    std::stringstream str;
    str << "item_" << i;
    item->set_title(str.str());
    for (int iToken = 0; iToken < nTokens; ++iToken) {
      item->add_transaction_token_id(iToken);
      // start index equals the number of start indices added so far (0, 1, 2, ...)
      // presumably this makes every transaction a single token -- TODO confirm
      item->add_transaction_start_index(item->transaction_start_index_size());
      item->add_token_weight(1.0);
    }
  }

  // part 1
  // create master
  ::artm::MasterModelConfig master_config = ::artm::test::TestMother::GenerateMasterModelConfig(nTopics);
  master_config.set_cache_theta(true);

  // create regularizer
  ::artm::RegularizerConfig* regularizer_config = master_config.add_regularizer_config();

  regularizer_config->set_name("SparsePhi");
  regularizer_config->set_type(::artm::RegularizerType_SmoothSparsePhi);
  regularizer_config->set_tau(-0.5);
  // gamma is the weight used by PhiMatrixOperations::InvokePhiRegularizers to mix
  // the per-topic ratio (n_t / r_it) with the global ratio (n / r_i)
  regularizer_config->set_gamma(0.5);

  // NOTE(review): the regularizer type is SmoothSparsePhi, yet the serialized
  // config is a default-constructed DecorrelatorPhiConfig. Presumably harmless
  // because no fields are set, but confirm which config message was intended.
  regularizer_config->set_config(::artm::DecorrelatorPhiConfig().SerializeAsString());

  // create sparsity score
  ::artm::ScoreConfig* score_config = master_config.add_score_config();

  score_config->set_name("SparsityPhi");
  score_config->set_type(::artm::ScoreType_SparsityPhi);
  score_config->set_config(::artm::SparsityPhiScore().SerializeAsString());

  artm::MasterModel master(master_config);
  ::artm::test::Api api(master);

  // reference SparsityPhi value expected after each successive offline pass;
  // the number of passes performed below is taken from the size of this vector
  const std::vector<double> true_score = { 0.249724, 0.390548, 0.48292, 0.549428, 0.60086,
                                           0.641332, 0.673568, 0.70006, 0.722924, 0.741688,
                                           0.758396, 0.773184, 0.78594, 0.797584, 0.808048,
                                           0.816872, 0.82518, 0.832504, 0.839472, 0.845976 };

  auto offline_args = api.Initialize({ batch });
  for (size_t i = 0; i < true_score.size(); ++i) {
    master.FitOfflineModel(offline_args);

    ::artm::GetScoreArrayArgs args;
    args.set_score_name("SparsityPhi");

    auto sparsity_scores = master.GetScoreArrayAs< ::artm::SparsityPhiScore>(args);
    // one score entry is expected per completed pass
    ASSERT_EQ(sparsity_scores.size(), i + 1);
    ASSERT_APPROX_EQ(sparsity_scores.back().value(), true_score[i]);
  }
}

0 comments on commit fa4e2c1

Please sign in to comment.