/**
* \file mcmc.h
*
* This file implements Markov chain Monte Carlo inference for Dirichlet
* processes (DP) and hierarchical Dirichlet processes (HDP). More
* specifically, this file implements a Gibbs sampler for the Chinese
* restaurant representation. See hdp.h for a description of DPs, HDPs, and the
* Chinese restaurant representation.
*
* Code example
* ------------
*
* The following is an example of an HDP mixture model where the base
* distribution is a symmetric Dirichlet distribution with parameter 1. The
* likelihood distribution is a categorical over the set of ASCII characters,
* which we encode as an unsigned int in `{1, 2, ..., 256}`.
* ```{.cpp}
* #include <hdp/mcmc.h>
* using namespace core;
*
* constexpr unsigned int depth = 2;
*
* template<typename BaseDistribution, typename Likelihood, typename K, typename V>
* void posterior_predictive_probability(
* const hdp_sampler<BaseDistribution, Likelihood, K, V>& sampler,
* const cache<BaseDistribution, Likelihood, K, V>& cache,
* const K& observation)
* {
* array<weighted_feature_set<V>> paths(32);
* auto root_probabilities = cache.compute_root_probabilities(sampler, observation);
* predict(sampler, observation, paths, root_probabilities);
* cleanup_root_probabilities(root_probabilities, sampler.posterior.length);
*
* sort(paths, feature_set_sorter(depth));
* for (unsigned int i = 0; i < paths.length; i++) {
* unsigned int*& path = paths[i].features.features;
* print("probability of drawing '", stdout);
* print((char) observation, stdout);
* if (path[0] == IMPLICIT_NODE) {
* print("' from any other child node: ", stdout);
* } else {
* print("' from child node ", stdout);
* print(path[0], stdout); print(": ", stdout);
* }
* print(exp(paths[i].log_probability), stdout);
* print('\n', stdout);
* free(paths[i]);
* }
* }
*
* template<typename BaseDistribution, typename Likelihood, typename K, typename V>
* void do_inference(hdp<BaseDistribution, Likelihood, K, V>& h) {
* hdp_sampler<BaseDistribution, Likelihood, K, V> sampler(h);
* cache<BaseDistribution, Likelihood, K, V> cache(sampler);
*
* prepare_sampler(sampler, cache);
* for (unsigned int i = 0; i < 200; i++)
* sample_hdp<true>(sampler, cache);
* for (unsigned int i = 0; i < 800; i++) {
* sample_hdp<true>(sampler, cache);
* if (i % 5 == 0) sampler.add_sample();
* }
*
* posterior_predictive_probability(sampler, cache, (unsigned int) '+');
* posterior_predictive_probability(sampler, cache, (unsigned int) '-');
* posterior_predictive_probability(sampler, cache, (unsigned int) 'a');
* }
*
* int main() {
* double alpha[] = {1.0e6, 1.0e-2};
* dirichlet<double> base_distribution(256, 1.0);
* hdp<dirichlet<double>, dense_categorical<double>, unsigned int, double> h(base_distribution, alpha, depth);
*
* unsigned int first_child[] = { 1 };
* unsigned int second_child[] = { 2 };
* for (unsigned int i = 0; i < 100; i++) {
* add(h, first_child, depth, (unsigned int) '+');
* add(h, second_child, depth, (unsigned int) '-');
* }
*
* do_inference(h);
* }
* ```
* In this example, we use the following model:
* \f[
* \begin{align*}
* H &= \text{Dirichlet}([1, \ldots, 1]), \\
* G^{\textbf{0}} &\sim DP(H, 10^6), \\
* G^{\textbf{1}}, G^{\textbf{2}}, G^{\textbf{3}} &\sim DP(G^{\textbf{0}}, 10^{-2}), \\
* x^{\textbf{1}}_1, \ldots, x^{\textbf{1}}_{100} &\sim G^{\textbf{1}}, \\
* x^{\textbf{2}}_1, \ldots, x^{\textbf{2}}_{100} &\sim G^{\textbf{2}}, \\
* y^{\textbf{1}}_i &\sim \text{Categorical}(x^{\textbf{1}}_i) \text{ for } i = 1, \ldots, 100, \\
* y^{\textbf{2}}_i &\sim \text{Categorical}(x^{\textbf{2}}_i) \text{ for } i = 1, \ldots, 100.
* \end{align*}
* \f]
* The only variables we observe are: \f$ y^{\textbf{1}}_i = \texttt{`+'} \f$
* and \f$ y^{\textbf{2}}_i = \texttt{`-'} \f$ for
* \f$ i = 1, \ldots, 100 \f$. We want to compute the probabilities of:
* \f[
* \begin{equation*}
* p(y^{\textbf{1}}_{new} | \boldsymbol{y}), p(y^{\textbf{2}}_{new} | \boldsymbol{y}), p(y^{\textbf{3}}_{new} | \boldsymbol{y}),
* \end{equation*}
* \f]
* where \f$ \boldsymbol{y} \triangleq \{y^{\textbf{1}}_1, \ldots, y^{\textbf{1}}_{100}, y^{\textbf{2}}_1, \ldots, y^{\textbf{2}}_{100}\} \f$
* is the set of all observations, and each \f$ y^{\textbf{n}}_{new} \f$ is a
* new (previously unobserved) sample drawn from \f$ x^{\textbf{n}}_{new} \f$
* which is in turn drawn from \f$ G^{\textbf{n}} \f$ (i.e. the posterior
* predictive distribution). The above code does exactly this, and the expected
* output is:
* ```
* probability of drawing '+' from child node 1: 0.283680
* probability of drawing '+' from child node 2: 0.002809
* probability of drawing '+' from any other child node: 0.003907
* probability of drawing '-' from child node 1: 0.002809
* probability of drawing '-' from child node 2: 0.283680
* probability of drawing '-' from any other child node: 0.003907
* probability of drawing 'a' from child node 1: 0.002809
* probability of drawing 'a' from child node 2: 0.002809
* probability of drawing 'a' from any other child node: 0.003906
* ```
*
* The function `do_inference` constructs the hdp_sampler and ::cache
* structures necessary to perform MCMC sampling. hdp_sampler (and
* node_sampler) stores the variables used by the sampling algorithm. ::cache
* is a structure that optimizes the sampling algorithm for particular choices
* of the base distribution and likelihood. Once these are constructed, the
* function executes 200 "burn-in" iterations, allowing the MCMC to mix
* (converge to the true posterior). It then performs 800 more iterations,
* keeping every fifth sample (to reduce autocorrelation among the kept
* samples, since we want them to be as independent as possible). Finally,
* the function `posterior_predictive_probability` is called to compute the
* above probabilities.
*
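* For instance, 160 samples are retained in this example (every fifth of the
* 800 post-burn-in iterations). They are stored in `sampler.posterior`, so a
* quick sanity check (using this file's convention of casting the array
* length to `unsigned int`) is:
* ```{.cpp}
* fprintf(stdout, "retained samples: %u\n", (unsigned int) sampler.posterior.length);
* ```
*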
* A unit test is also available in mcmc.cpp, as another example.
*
* <!-- Created on: Jul 26, 2015
* Author: asaparov -->
*/
#ifndef MCMC_H_
#define MCMC_H_
#include <core/io.h>
#include <math/distributions.h>
#include <math/log.h>
#include <algorithm>
#include "hdp.h"
#include "cache.h"
namespace detail {
template<typename Stream, typename Printer> static auto test_level_printable(int) ->
decltype(bool(print(0u, std::declval<Stream&>(), 0u, std::declval<Printer&>())), std::true_type{});
template<typename Stream, typename Printer> static auto test_level_printable(long) -> std::false_type;
}
template<typename Stream, typename Printer> struct is_level_printable : decltype(::detail::test_level_printable<Stream, Printer>(0)){};
template<typename Stream, typename Printer,
typename std::enable_if<!is_level_printable<Stream, Printer>::value>::type* = nullptr>
inline bool print(unsigned int u, Stream& stream, unsigned int level, Printer& printer) {
return print(u, stream, printer);
}
/* forward declarations */
#if !defined(DOXYGEN_IGNORE)
template<typename DataDistribution, typename NodeType, typename BaseDistribution, typename Observations>
inline bool sample_initial_assignment(
NodeType& n,
const BaseDistribution& pi,
const Observations& observations,
unsigned int& assignment);
template<bool Collapsed, typename NodeType, typename Observations, typename Cache>
bool select_table(NodeType& n,
const Observations& observations,
unsigned int old_table,
unsigned int selected_table,
unsigned int& new_table,
Cache& cache);
#endif
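/* Returns true if the two histograms are equal, treating any entries beyond
 * the length of the shorter histogram as zeros. */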
static inline bool compare_histograms(
const unsigned int* first, const unsigned int* second,
unsigned int first_length, unsigned int second_length)
{
if (first_length < second_length) {
for (unsigned int i = 0; i < first_length; i++)
if (first[i] != second[i])
return false;
for (unsigned int i = first_length; i < second_length; i++)
if (second[i] != 0)
return false;
} else {
for (unsigned int i = 0; i < second_length; i++)
if (first[i] != second[i])
return false;
for (unsigned int i = second_length; i < first_length; i++)
if (first[i] != 0)
return false;
}
return true;
}
static inline void add_to_histogram(unsigned int* histogram,
unsigned int histogram_length, unsigned int item)
{
#if !defined(NDEBUG)
if (item >= histogram_length)
fprintf(stderr, "add_to_histogram WARNING: Index out of bounds.\n");
#endif
histogram[item]++;
}
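/* A single posterior sample of the Gibbs sampler state at a non-root node:
 * the size of each table, each table's assignment to a table in the root
 * node, and the table assignment of every observation at this node. */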
template<typename V>
struct node_sample {
unsigned int* table_sizes;
unsigned int* root_assignments;
unsigned int customer_count;
unsigned int table_count;
unsigned int* table_assignments;
node_sample(unsigned int table_count, unsigned int observation_count) {
if (!initialize(table_count, observation_count)) {
fprintf(stderr, "node_sample ERROR: Error during initialization.\n");
exit(EXIT_FAILURE);
}
}
inline unsigned int root_assignment(unsigned int i) const {
return root_assignments[i];
}
template<typename Metric>
static inline long unsigned int size_of(const node_sample<V>& sample, const Metric& metric) {
return 2 * sample.table_count * sizeof(unsigned int)
+ sample.customer_count * sizeof(unsigned int)
+ core::size_of(sample.table_count) + core::size_of(sample.customer_count);
}
static inline void free(node_sample<V>& sample) {
sample.free();
}
private:
inline bool initialize(unsigned int num_tables, unsigned int observation_count) {
table_sizes = (unsigned int*) malloc(sizeof(unsigned int) * num_tables);
if (table_sizes == NULL) {
fprintf(stderr, "node_sample.initialize ERROR: Unable to initialize table_sizes.\n");
return false;
}
if (observation_count == 0)
observation_count = 1;
table_assignments = (unsigned int*) malloc(sizeof(unsigned int) * observation_count);
if (table_assignments == NULL) {
fprintf(stderr, "node_sample.initialize ERROR: Unable to initialize table_assignments.\n");
core::free(table_sizes);
return false;
}
table_count = num_tables;
root_assignments = (unsigned int*) malloc(sizeof(unsigned int) * num_tables);
if (root_assignments == NULL) {
fprintf(stderr, "node_sample.initialize ERROR: Unable to initialize root_assignments.\n");
core::free(table_sizes);
core::free(table_assignments);
return false;
}
return true;
}
inline void free() {
core::free(table_sizes);
core::free(table_assignments);
core::free(root_assignments);
}
template<typename A>
friend bool init(node_sample<A>&, unsigned int, unsigned int);
};
template<typename V>
bool init(node_sample<V>& sample, unsigned int table_count, unsigned int observation_count) {
return sample.initialize(table_count, observation_count);
}
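/* Auxiliary information needed to read and write node_sample and
 * collapsed_root_sample objects, since the samples do not store their own
 * observation counts. */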
template<typename AtomScribe>
struct node_sample_scribe {
AtomScribe& atom_scribe;
unsigned int observation_count;
unsigned int root_cluster_count;
node_sample_scribe(AtomScribe& atom_scribe, unsigned int observation_count, unsigned int root_cluster_count) :
atom_scribe(atom_scribe), observation_count(observation_count), root_cluster_count(root_cluster_count) { }
};
template<typename V, typename AtomReader>
bool read(node_sample<V>& sample, FILE* in, node_sample_scribe<AtomReader>& reader) {
if (!read(sample.table_count, in)) return false;
sample.table_sizes = (unsigned int*) malloc(sizeof(unsigned int) * sample.table_count);
if (sample.table_sizes == NULL)
return false;
sample.root_assignments = (unsigned int*) malloc(sizeof(unsigned int) * sample.table_count);
if (sample.root_assignments == NULL) {
free(sample.table_sizes);
return false;
}
sample.table_assignments = (unsigned int*) malloc(sizeof(unsigned int) * reader.observation_count);
if (sample.table_assignments == NULL) {
free(sample.root_assignments);
free(sample.table_sizes);
return false;
}
if (!read(sample.table_sizes, in, sample.table_count)) return false;
if (!read(sample.root_assignments, in, sample.table_count)) return false;
if (!read(sample.table_assignments, in, reader.observation_count)) return false;
sample.customer_count = 0;
for (unsigned int i = 0; i < sample.table_count; i++)
sample.customer_count += sample.table_sizes[i];
return true;
}
template<typename V, typename AtomWriter>
bool write(const node_sample<V>& sample, FILE* out, node_sample_scribe<AtomWriter>& writer) {
if (!write(sample.table_count, out)) return false;
if (!write(sample.table_sizes, out, sample.table_count)) return false;
if (!write(sample.root_assignments, out, sample.table_count)) return false;
if (!write(sample.table_assignments, out, writer.observation_count)) return false;
return true;
}
/**
* This structure stores Gibbs sampling variables for a single HDP non-root
* node (i.e. a ::node object). These "sampler" structures form a tree parallel
* to the HDP hierarchy consisting of ::hdp and ::node objects.
*
* Since the Gibbs sampler is derived using the Chinese restaurant
* representation, every node_sampler contains an infinite number of tables,
* but we only store the non-empty tables. The number of non-empty tables is
* given by node_sampler::table_count. However, during inference, the number of
* occupied tables may increase and decrease. As such, the node keeps a larger
* "capacity" of tables node_sampler::table_capacity to avoid reallocating
* memory at every sampling step. As long as `table_count <= table_capacity`,
* the algorithm doesn't need to reallocate memory. Every observation drawn
* from this node is assigned to a table (just as a customer picks a table at
* which to sit). All tables in this node are, in turn, sampled from the
* distribution of the parent node. As such, the tables in this node are
* assigned a table in the parent node.
*
* For an example of how to perform MCMC inference with DPs and HDPs, see the
* [code example above](#code-example).
*
* \tparam K the generic type of the observations drawn from this distribution.
* \tparam V the type of the probabilities.
*/
template<typename K, typename V>
struct node_sampler
{
/**
* The generic type of the observations drawn from this distribution.
*/
typedef K atom_type;
/**
* The type of the probabilities.
*/
typedef V value_type;
/**
* The type of a child node of this node in the sampler hierarchy.
*/
typedef node_sampler<K, V> child_type;
/**
* The "sampler" structures form a tree parallel to the HDP hierarchy which
* consists of ::hdp and ::node objects. This typedef returns the type of
* the object that this node_sampler represents in the HDP hierarchy.
*/
typedef node<K, V> node_type;
/**
* A native array containing the child nodes of this node in the sampler
* hierarchy. This array is parallel to the node::children and
* hdp::children array maps.
*/
node_sampler<K, V>* children;
/**
* A pointer to the parent node in the sampler hierarchy.
*/
node_sampler<K, V>* parent;
/**
* The "sampler" structures form a tree parallel to the HDP hierarchy which
* consists of ::hdp and ::node objects. This is a pointer to the node in
* the HDP hierarchy that is parallel to this node.
*/
node<K, V>* n;
/**
* <!-- state variables for Gibbs sampling -->
*/
/**
* An array parallel to node::observations that indicates which table in
* this node each observation is assigned to.
*/
unsigned int* observation_assignments;
/**
* An array with length node_sampler::table_count (and capacity
* node_sampler::table_capacity) that keeps track of the number of
* customers sitting at each table.
*/
unsigned int* table_sizes;
/**
* Each table in this node is assigned to a table in the parent node. This
* array with length node_sampler::table_count (and capacity
* node_sampler::table_capacity) keeps track of the assignments of tables in
* this node to tables in the parent node.
*/
unsigned int* table_assignments;
/**
* Since every table in every node in the sampler hierarchy is assigned to
* a table in the parent node, we can follow the chain of table assignments
* to tables in the root node. This array with length
* node_sampler::table_count (and capacity node_sampler::table_capacity)
* keeps track of the assignments of tables in this node to tables in the
* root node.
*/
unsigned int* root_assignments;
/**
* This array of array_multiset objects keeps track of, for each table in this
* node, the set of observations assigned to that table, either directly or
* indirectly via descendant nodes in the hierarchy. This array has length
* node_sampler::table_count and capacity node_sampler::table_capacity.
*/
array_multiset<K>* descendant_observations;
/**
* The number of occupied tables in the Chinese restaurant represented by
* this sampler node.
*/
unsigned int table_count;
/**
* The current capacity of tables in this sampler node.
*/
unsigned int table_capacity;
/**
* The total number of customers sitting at all tables at this sampler node.
*/
unsigned int customer_count;
array<node_sample<V>> posterior;
template<typename BaseDistribution, typename DataDistribution>
node_sampler(
const hdp_sampler<BaseDistribution, DataDistribution, K, V>& root,
node<K, V>& n, child_type* parent, unsigned int table_capacity) :
parent(parent), n(&n), posterior(4)
{
if (!initialize(root, table_capacity))
exit(EXIT_FAILURE);
}
~node_sampler() { free(); }
inline V alpha() const {
return n->get_alpha();
}
inline V log_alpha() const {
return n->get_log_alpha();
}
inline unsigned int child_count() const {
return (unsigned int) n->children.size;
}
inline const child_type& get_child(unsigned int key, bool& contains) const {
unsigned int index = n->children.index_of(key);
contains = (index < child_count());
return children[index];
}
inline unsigned int child_key(unsigned int index) const {
return n->children.keys[index];
}
inline unsigned int observation_count() const {
return (unsigned int) n->observations.length;
}
inline const K& get_observation(unsigned int index) const {
return n->observations[index];
}
inline unsigned int root_assignment(unsigned int i) const {
return root_assignments[i];
}
inline unsigned int table_assignment(unsigned int i) const {
return table_assignments[i];
}
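/* Applies a relabeling of this node's tables (given by `table_map`) to the
 * structures that reference them: the parent-table assignments of the child
 * nodes and the observation assignments at this node. */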
template<typename Cache>
inline void relabel_tables(const unsigned int* table_map, const Cache& cache) {
/* apply the table map to the tables in the child nodes */
for (unsigned int i = 0; i < child_count(); i++)
children[i].relabel_parent_tables(table_map);
for (unsigned int i = 0; i < observation_count(); i++)
observation_assignments[i] = table_map[observation_assignments[i]];
}
void relabel_parent_tables(const unsigned int* parent_table_map) {
for (unsigned int i = 0; i < table_count; i++)
table_assignments[i] = parent_table_map[table_assignments[i]];
}
bool get_observations(array_multiset<K>& dst, unsigned int assignment) const {
for (unsigned int i = 0; i < table_count; i++) {
if (table_assignments[i] == assignment && !dst.add(descendant_observations[i])) {
fprintf(stderr, "node_sampler.get_observations ERROR: Unable "
"to add descendant observations to multiset.\n");
return false;
}
}
return true;
}
template<typename Cache>
inline void move_table(unsigned int src, unsigned int dst, const Cache& cache) {
table_assignments[dst] = table_assignments[src];
table_sizes[dst] = table_sizes[src];
root_assignments[dst] = root_assignments[src];
array_multiset<K>::move(descendant_observations[src], descendant_observations[dst]);
}
template<typename Observations, typename Cache>
inline bool move_to_table(const Observations& observations,
unsigned int src, unsigned int dst, Cache& cache)
{
if (!descendant_observations[dst].add(observations)) {
fprintf(stderr, "node_sampler.move_to_table ERROR: Unable to add observations to new table.\n");
return false;
}
descendant_observations[src].subtract(observations);
descendant_observations[src].remove_zeros();
cache.on_move_to_table(*this, src, dst, observations);
return true;
}
template<typename Observations, typename Cache>
inline bool move_to_new_table(const Observations& observations, unsigned int src, Cache& cache)
{
if (!descendant_observations[table_count - 1].add(observations)) {
fprintf(stderr, "node_sampler.move_to_new_table ERROR: Unable to add observations to new table.\n");
return false;
}
descendant_observations[src].subtract(observations);
descendant_observations[src].remove_zeros();
cache.on_move_to_new_table(*this, src, observations);
return true;
}
template<typename Observations, typename Cache>
inline bool add_to_table(
const Observations& observations,
unsigned int table, Cache& cache)
{
if (!descendant_observations[table].add(observations)) {
fprintf(stderr, "node_sampler.add_to_table ERROR: Unable to add observations to new table.\n");
return false;
}
cache.on_add_to_table(*this, table, observations);
return true;
}
template<typename Observations, typename Cache>
inline bool add_to_new_table(const Observations& observations, Cache& cache)
{
if (!descendant_observations[table_count - 1].add(observations)) {
fprintf(stderr, "node_sampler.add_to_new_table ERROR: Unable to add observations to new table.\n");
return false;
}
cache.on_add_to_new_table(*this, observations);
return true;
}
template<typename Observations, typename Cache>
inline bool remove_from_table(
const Observations& observations,
unsigned int table, Cache& cache)
{
descendant_observations[table].subtract(observations);
descendant_observations[table].remove_zeros();
cache.on_remove_from_table(*this, table, observations);
return true;
}
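/* Creates a new (empty) table, doubling the table capacity as needed. The
 * caller is responsible for setting the new table's size and its assignments
 * in the parent and root nodes. */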
inline bool new_table() {
unsigned int new_capacity = table_capacity;
while (new_capacity < table_count + 2)
new_capacity *= 2;
if (table_capacity != new_capacity && !resize(new_capacity)) {
fprintf(stderr, "node_sampler.new_table ERROR: Unable to expand table array.\n");
return false;
}
table_capacity = new_capacity;
if (!init(descendant_observations[table_count], 4)) {
fprintf(stderr, "node_sampler.new_table ERROR: Unable to initialize new table.\n");
return false;
}
table_count++;
return true;
}
/* NOTE: this does not change the total number of tables */
inline void free_table(unsigned int i) {
core::free(descendant_observations[i]);
}
inline void set_table_count(unsigned int new_count) {
table_count = new_count;
}
inline bool add_sample() {
return add_sample(table_count);
}
bool add_sample(unsigned int root_cluster_count) {
for (unsigned int i = 0; i < child_count(); i++)
children[i].add_sample(root_cluster_count);
if (!posterior.ensure_capacity(posterior.length + 1)) {
fprintf(stderr, "node_sampler.add_sample ERROR: Unable to expand posterior sample array.\n");
return false;
} else if (!init(posterior[(unsigned int) posterior.length], table_count, observation_count())) {
fprintf(stderr, "node_sampler.add_sample ERROR: Unable to initialize new sample.\n");
return false;
}
node_sample<V>& sample = posterior[(unsigned int) posterior.length];
for (unsigned int i = 0; i < observation_count(); i++)
sample.table_assignments[i] = observation_assignments[i];
for (unsigned int i = 0; i < table_count; i++) {
sample.root_assignments[i] = root_assignments[i];
sample.table_sizes[i] = table_sizes[i];
}
sample.customer_count = customer_count;
posterior.length++;
return true;
}
void clear_samples() {
for (unsigned int i = 0; i < child_count(); i++)
children[i].clear_samples();
for (unsigned int i = 0; i < posterior.length; i++)
core::free(posterior[i]);
posterior.clear();
}
static inline void move(
const node_sampler<K, V>& src,
node_sampler<K, V>& dst)
{
dst.table_sizes = src.table_sizes;
core::move(src.observation_assignments, dst.observation_assignments);
core::move(src.posterior, dst.posterior);
dst.table_assignments = src.table_assignments;
dst.root_assignments = src.root_assignments;
dst.descendant_observations = src.descendant_observations;
dst.table_count = src.table_count;
dst.table_capacity = src.table_capacity;
dst.customer_count = src.customer_count;
dst.children = src.children;
dst.parent = src.parent;
dst.n = src.n;
}
template<typename Metric>
static inline long unsigned int size_of(
const node_sampler<K, V>& n, const Metric& metric)
{
long unsigned int sum = core::size_of(n.observation_assignments)
+ core::size_of(n.children, make_key_value_metric(default_metric(), metric))
+ core::size_of(n.customer_count) + core::size_of(n.parent)
+ core::size_of(n.observations, metric) + core::size_of(n.table_count)
+ core::size_of(n.table_capacity) + core::size_of(n.posterior);
for (unsigned int i = 0; i < n.table_count; i++)
sum += core::size_of(n.descendant_observations[i], metric);
sum += sizeof(array_multiset<K>) * (n.table_capacity - n.table_count);
sum += 3 * sizeof(unsigned int) * n.table_capacity;
return sum;
}
static inline void free(node_sampler<K, V>& n) {
n.free();
core::free(n.posterior);
}
private:
template<typename BaseDistribution, typename DataDistribution>
inline bool initialize(
const hdp_sampler<BaseDistribution, DataDistribution, K, V>& root,
unsigned int initial_table_capacity)
{
table_sizes = NULL;
table_count = 0;
table_capacity = initial_table_capacity;
table_assignments = NULL;
root_assignments = NULL;
descendant_observations = NULL;
observation_assignments = NULL;
children = NULL;
customer_count = 0;
children = (node_sampler<K, V>*) malloc(sizeof(node_sampler<K, V>) * n->children.capacity);
observation_assignments = (unsigned int*) calloc(n->observations.capacity, sizeof(unsigned int));
if (children == NULL || observation_assignments == NULL || !resize(initial_table_capacity)) {
fprintf(stderr, "node_sampler.initialize ERROR: Unable to initialize tables.\n");
free(0); return false;
}
/* add the observations from the hdp node */
for (unsigned int i = 0; i < n->children.size; i++) {
if (!init(children[i], root, n->children.values[i], this, 8)) {
fprintf(stderr, "node_sampler.initialize ERROR: Unable to initialize child node.\n");
free(i); return false;
}
for (unsigned int j = 0; j < children[i].table_count; j++) {
/* assign the tables in the child nodes to tables in this node */
if (!sample_initial_assignment<DataDistribution>(
*this, root.pi(), children[i].descendant_observations[j], children[i].table_assignments[j]))
{
fprintf(stderr, "node_sampler.initialize ERROR: Unable to add to descendant_observations.\n");
free(i + 1); return false;
}
}
}
for (unsigned int i = 0; i < observation_count(); i++) {
if (!sample_initial_assignment<DataDistribution>(
*this, root.pi(), n->observations[i], observation_assignments[i]))
{
fprintf(stderr, "node_sampler.initialize ERROR: Unable to add to descendant_observations.\n");
free(0); return false;
}
}
return true;
}
inline void free_children(unsigned int child_count, const unsigned int* indices = NULL) {
if (indices == NULL) {
for (unsigned int i = 0; i < child_count; i++)
core::free(children[i]);
} else {
for (unsigned int i = 0; i < child_count; i++)
core::free(children[indices[i]]);
}
core::free(children);
}
inline void free(unsigned int child_count, const unsigned int* indices = NULL) {
if (children != NULL)
free_children(child_count, indices);
for (unsigned int i = 0; i < posterior.length; i++)
core::free(posterior[i]);
for (unsigned int i = 0; i < table_count; i++)
core::free(descendant_observations[i]);
if (table_assignments != NULL) core::free(table_assignments);
if (root_assignments != NULL) core::free(root_assignments);
if (descendant_observations != NULL) core::free(descendant_observations);
if (observation_assignments != NULL) core::free(observation_assignments);
if (table_sizes != NULL) core::free(table_sizes);
}
inline void free() { free(child_count()); }
inline bool resize(unsigned int new_capacity)
{
unsigned int* new_sizes = (unsigned int*) realloc(
table_sizes, sizeof(unsigned int) * new_capacity);
if (new_sizes == NULL) {
fprintf(stderr, "node_sampler.resize ERROR: Unable to expand table_sizes.\n");
return false;
}
table_sizes = new_sizes;
unsigned int* new_table_assignments = (unsigned int*) realloc(
table_assignments, sizeof(unsigned int) * new_capacity);
if (new_table_assignments == NULL) {
fprintf(stderr, "node_sampler.resize ERROR: Unable to expand table_assignments.\n");
return false;
}
table_assignments = new_table_assignments;
unsigned int* new_root_assignments = (unsigned int*) realloc(
root_assignments, sizeof(unsigned int) * new_capacity);
if (new_root_assignments == NULL) {
fprintf(stderr, "node_sampler.resize ERROR: Unable to expand root_assignments.\n");
return false;
}
root_assignments = new_root_assignments;
array_multiset<K>* new_descendant_observations = (array_multiset<K>*) realloc(
static_cast<void*>(descendant_observations), sizeof(array_multiset<K>) * new_capacity);
if (new_descendant_observations == NULL) {
fprintf(stderr, "node_sampler.resize ERROR: Unable to expand descendant_observations.\n");
return false;
}
descendant_observations = new_descendant_observations;
return true;
}
template<typename A, typename B, typename C, typename D>
friend bool init(node_sampler<C, D>&, const hdp_sampler<A, B, C, D>&,
node<C, D>&, node_sampler<C, D>*, unsigned int);
template<typename A, typename B, typename C>
friend bool read_sampler_node(A&, B&, typename A::node_type&, C&);
template<typename A, typename B, typename C, typename D, typename E>
friend bool read(node_sampler<A, B>&, C&, node<A, B>&, D&, E&);
};
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
inline bool init(node_sampler<K, V>& sampler,
const hdp_sampler<BaseDistribution, DataDistribution, K, V>& root,
node<K, V>& n, node_sampler<K, V>* parent, unsigned int table_capacity)
{
sampler.n = &n;
sampler.parent = parent;
if (!array_init(sampler.posterior, 32)) {
fprintf(stderr, "init ERROR: Unable to initialize posterior sample array in node.\n");
return false;
} else if (!sampler.initialize(root, table_capacity)) {
fprintf(stderr, "init ERROR: Error during initialization of node.\n");
free(sampler.posterior);
return false;
}
return true;
}
template<typename K, typename V>
bool copy(const node_sampler<K, V>& src, node_sampler<K, V>& dst,
const hash_map<const node<K, V>*, node<K, V>*>& node_map,
node_sampler<K, V>* parent = NULL)
{
dst.n = node_map.get(src.n);
dst.parent = parent;
dst.table_count = src.table_count;
dst.table_capacity = src.table_count;
dst.customer_count = src.customer_count;
if (!array_init(dst.posterior, 1))
return false;
dst.children = (node_sampler<K, V>*) malloc(sizeof(node_sampler<K, V>) * src.child_count());
if (dst.children == NULL) {
free(dst.posterior);
return false;
}
dst.observation_assignments = (unsigned int*) malloc(sizeof(unsigned int) * src.observation_count());
if (dst.observation_assignments == NULL) {
free(dst.posterior);
free(dst.children);
return false;
}
memcpy(dst.observation_assignments, src.observation_assignments, sizeof(unsigned int) * src.observation_count());
dst.descendant_observations = (array_multiset<K>*) malloc(sizeof(array_multiset<K>) * src.table_count);
if (dst.descendant_observations == NULL) {
free(dst.posterior);
free(dst.observation_assignments);
free(dst.children);
return false;
}
dst.table_sizes = (unsigned int*) malloc(sizeof(unsigned int) * src.table_count);
if (dst.table_sizes == NULL) {
free(dst.posterior);
free(dst.observation_assignments);
free(dst.children);
free(dst.descendant_observations);
return false;
}
memcpy(dst.table_sizes, src.table_sizes, sizeof(unsigned int) * src.table_count);
dst.root_assignments = (unsigned int*) malloc(sizeof(unsigned int) * src.table_count);
if (dst.root_assignments == NULL) {
free(dst.posterior);
free(dst.observation_assignments);
free(dst.children);
free(dst.descendant_observations);
free(dst.table_sizes);
return false;
}
memcpy(dst.root_assignments, src.root_assignments, sizeof(unsigned int) * src.table_count);
dst.table_assignments = (unsigned int*) malloc(sizeof(unsigned int) * src.table_count);
if (dst.table_assignments == NULL) {
free(dst.posterior);
free(dst.observation_assignments);
free(dst.children);
free(dst.descendant_observations);
free(dst.table_sizes);
free(dst.root_assignments);
return false;
}
memcpy(dst.table_assignments, src.table_assignments, sizeof(unsigned int) * src.table_count);
for (unsigned int i = 0; i < src.table_count; i++)
if (!init(dst.descendant_observations[i], src.descendant_observations[i]))
return false; /* TODO: free memory (also add appropriate error messages) */
for (unsigned int i = 0; i < src.child_count(); i++)
if (!copy(src.children[i], dst.children[i], node_map, &dst))
return false; /* TODO: free memory (also add appropriate error messages) */
return true;
}
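/* A single posterior sample of the Gibbs sampler state at the root node: the
 * multiset of observations at each table (assigned directly or via descendant
 * nodes), the size of each table, and the table assignment of every
 * observation drawn directly at the root. */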
template<typename K, typename V>
struct collapsed_root_sample {
array_multiset<K>* descendant_observations;
unsigned int* table_sizes;
unsigned int table_count;
unsigned int customer_count;
unsigned int* table_assignments;
collapsed_root_sample(unsigned int table_count, unsigned int observation_count) : table_count(table_count) {
if (!initialize(table_count, observation_count)) {
fprintf(stderr, "collapsed_root_sample ERROR: Error during initialization.\n");
exit(EXIT_FAILURE);
}
}
~collapsed_root_sample() { free(); }
inline unsigned int root_assignment(unsigned int i) const {
return i;
}
template<typename Metric>
static inline long unsigned int size_of(const collapsed_root_sample<K, V>& sample, const Metric& metric) {
long unsigned int sum = core::size_of(sample.table_count) + core::size_of(sample.customer_count);
for (unsigned int i = 0; i < sample.table_count; i++)
sum += core::size_of(sample.descendant_observations[i], metric);
return sum + (sample.table_count + sample.customer_count) * sizeof(unsigned int);
}
static void free(collapsed_root_sample<K, V>& sample) {
sample.free();
}
private:
inline bool initialize(unsigned int table_count, unsigned int observation_count) {
descendant_observations = (array_multiset<K>*) malloc(sizeof(array_multiset<K>) * table_count);
if (descendant_observations == NULL) {
fprintf(stderr, "collapsed_root_sample.initialize ERROR: Unable to"
" initialize descendant_observations.\n");
return false;
}
table_sizes = (unsigned int*) malloc(sizeof(unsigned int) * table_count);
if (table_sizes == NULL) {
fprintf(stderr, "collapsed_root_sample.initialize ERROR:"
" Unable to initialize table_sizes.\n");
core::free(descendant_observations);
return false;
}
if (observation_count == 0)
observation_count = 1;
table_assignments = (unsigned int*) malloc(sizeof(unsigned int) * observation_count);
if (table_assignments == NULL) {
fprintf(stderr, "collapsed_root_sample.initialize ERROR:"
" Unable to initialize table_assignments.\n");
core::free(descendant_observations);
core::free(table_sizes);
return false;
}
return true;
}
inline void free() {
for (unsigned int i = 0; i < table_count; i++)
core::free(descendant_observations[i]);
core::free(descendant_observations);
core::free(table_sizes);
core::free(table_assignments);
}
template<typename A, typename B>
friend bool init(collapsed_root_sample<A, B>&, unsigned int, unsigned int);
};
template<typename K, typename V>
bool init(collapsed_root_sample<K, V>& sample, unsigned int table_count, unsigned int observation_count) {
sample.table_count = table_count;
return sample.initialize(table_count, observation_count);
}
template<typename K, typename V, typename AtomReader>
bool read(collapsed_root_sample<K, V>& sample, FILE* in, node_sample_scribe<AtomReader>& reader)
{
if (!read(sample.table_count, in)) return false;
sample.descendant_observations = (array_multiset<K>*)
malloc(sizeof(array_multiset<K>) * sample.table_count);
if (sample.descendant_observations == NULL)
return false;
sample.table_sizes = (unsigned int*) malloc(sizeof(unsigned int) * sample.table_count);
if (sample.table_sizes == NULL) {
free(sample.descendant_observations);
return false;
}
sample.table_assignments = (unsigned int*) malloc(sizeof(unsigned int) * reader.observation_count);
if (sample.table_assignments == NULL) {
free(sample.descendant_observations);
free(sample.table_sizes);
return false;
}
for (unsigned int i = 0; i < sample.table_count; i++) {
if (!read(sample.descendant_observations[i], in, reader.atom_scribe))
return false;
}
if (!read(sample.table_sizes, in, sample.table_count)) return false;
if (!read(sample.table_assignments, in, reader.observation_count)) return false;
sample.customer_count = 0;
for (unsigned int i = 0; i < sample.table_count; i++)
sample.customer_count += sample.table_sizes[i];
return true;
}
template<typename K, typename V, typename AtomWriter>
bool write(const collapsed_root_sample<K, V>& sample, FILE* out, node_sample_scribe<AtomWriter>& writer)
{
if (!write(sample.table_count, out)) return false;
for (unsigned int i = 0; i < sample.table_count; i++) {
if (!write(sample.descendant_observations[i], out, writer.atom_scribe))
return false;
}
if (!write(sample.table_sizes, out, sample.table_count)) return false;
if (!write(sample.table_assignments, out, writer.observation_count)) return false;
return true;
}
/**
* This structure stores Gibbs sampling variables for an HDP root node (i.e. an
* ::hdp object). These "sampler" structures form a tree parallel to the HDP
* hierarchy consisting of ::hdp and ::node objects.
*
* Since the Gibbs sampler is derived using the Chinese restaurant
* representation, every hdp_sampler contains an infinite number of tables, but
* we only store the non-empty tables. The number of non-empty tables is given
* by hdp_sampler::table_count. However, during inference, the number of
* occupied tables may increase and decrease. As such, the node keeps a larger
* "capacity" of tables hdp_sampler::table_capacity to avoid reallocating
* memory at every sampling step. As long as `table_count <= table_capacity`,
* the algorithm doesn't need to reallocate memory. Every observation drawn
* from this node is assigned to a table (just as a customer picks a table at
* which to sit).
*
* For an example of how to perform MCMC inference with DPs and HDPs, see the
* [code example above](#code-example).
*
* \tparam BaseDistribution the type of the base distribution (see ::hdp).
* \tparam DataDistribution the type of the likelihood (see ::hdp).
* \tparam K the generic type of the observations drawn from this distribution.
* \tparam V the type of the probabilities.
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
struct hdp_sampler
{
/**
* The generic type of the observations drawn from this distribution.
*/
typedef K atom_type;
/**
* The type of the probabilities.
*/
typedef V value_type;
/**
* The type of the base distribution (see ::hdp).
*/
typedef BaseDistribution base_distribution_type;
/**
* The type of the likelihood (see ::hdp).
*/
typedef DataDistribution data_distribution_type;
/**
* The type of a child node of this node in the sampler hierarchy.
*/
typedef node_sampler<K, V> child_type;
/**
* The "sampler" structures form a tree parallel to the HDP hierarchy which
* consists of ::hdp and ::node objects. This typedef returns the type of
* the object that this hdp_sampler represents in the HDP hierarchy.
*/
typedef hdp<BaseDistribution, DataDistribution, K, V> node_type;
/**
* A native array containing the child nodes of this node in the sampler
* hierarchy. This array is parallel to the node::children and
* hdp::children array maps.
*/
node_sampler<K, V>* children;
/**
* The "sampler" structures form a tree parallel to the HDP hierarchy which
* consists of ::hdp and ::node objects. This is a pointer to the node in
* the HDP hierarchy that is parallel to this node.
*/
node_type* n;
/**
* <!-- state variables for Gibbs sampling -->
*/
/**
* An array parallel to hdp::observations that indicates which table in
* this node each observation is assigned to.
*/
unsigned int* observation_assignments;
/**
* An array with length hdp_sampler::table_count (and capacity
* hdp_sampler::table_capacity) that keeps track of the number of
* customers sitting at each table.
*/
unsigned int* table_sizes;
/**
* This array of array_multiset objects keeps track of, for each table in
* this node, the set of observations assigned to that table, either
* directly or indirectly via descendant nodes in the hierarchy. This array
* has length hdp_sampler::table_count and capacity
* hdp_sampler::table_capacity.
*/
array_multiset<K>* descendant_observations;
/**
* The number of occupied tables in the Chinese restaurant represented by
* this sampler root node.
*/
unsigned int table_count;
/**
* The current capacity of tables in this sampler root node.
*/
unsigned int table_capacity;
/**
* The total number of customers sitting at all tables at this sampler
* root node.
*/
unsigned int customer_count;
array<collapsed_root_sample<K, V>> posterior;
/**
* Constructs this root sampler node with the given `root` node from the
* HDP hierarchy. All existing observations are assigned a table in the
* hierarchy one-by-one, each time sampling the table assignment from the
* conditional distribution: given that all other table assignments are
* fixed, the assignment for each observation is sampled. This constructor
* also initializes all descendant sampler nodes, recursively, parallel to
* the HDP hierarchy.
*/
hdp_sampler(hdp<BaseDistribution, DataDistribution, K, V>& root) :
n(&root), table_count(0), table_capacity(8), posterior(32)
{
if (!initialize())
exit(EXIT_FAILURE);
}
~hdp_sampler() { free(); }
inline V alpha() const {
return n->get_alpha();
}
inline V log_alpha() const {
return n->get_log_alpha();
}
inline const BaseDistribution& pi() const {
return n->pi;
}
inline unsigned int child_count() const {
return (unsigned int) n->children.size;
}
inline const node_sampler<K, V>& get_child(unsigned int key, bool& contains) const {
unsigned int index = n->children.index_of(key);
contains = (index < child_count());
return children[index];
}
inline unsigned int child_key(unsigned int index) const {
return n->children.keys[index];
}
inline unsigned int observation_count() const {
return (unsigned int) n->observations.length;
}
inline const K& get_observation(unsigned int index) const {
return n->observations[index];
}
inline unsigned int root_assignment(unsigned int i) const {
return i;
}
inline unsigned int table_assignment(unsigned int i) const {
return i;
}
inline void move_table(unsigned int src, unsigned int dst,
cache<BaseDistribution, DataDistribution, K, V>& cache)
{
array_multiset<K>::move(descendant_observations[src], descendant_observations[dst]);
table_sizes[dst] = table_sizes[src];
cache.on_move_table(src, dst);
}
template<typename Observations>
inline bool move_to_table(const Observations& observations,
unsigned int src, unsigned int dst,
cache<BaseDistribution, DataDistribution, K, V>& cache)
{
if (!descendant_observations[dst].add(observations)) {
fprintf(stderr, "hdp_sampler.move_to_table ERROR: Unable to initialize new multiset.\n");
return false;
}
descendant_observations[src].subtract(observations);
descendant_observations[src].remove_zeros();
return cache.on_move_to_table(*this, src, dst, observations);
}
template<typename Observations>
inline bool move_to_new_table(
const Observations& observations, unsigned int src,
cache<BaseDistribution, DataDistribution, K, V>& cache)
{
if (!descendant_observations[table_count - 1].add(observations)) {
fprintf(stderr, "hdp_sampler.move_to_new_table ERROR: Unable to initialize new multiset.\n");
return false;
}
descendant_observations[src].subtract(observations);
descendant_observations[src].remove_zeros();
return cache.on_move_to_new_table(*this, src, observations);
}
template<typename Observations>
inline bool add_to_table(
const Observations& observations, unsigned int table,
cache<BaseDistribution, DataDistribution, K, V>& cache)
{
if (!descendant_observations[table].add(observations)) {
fprintf(stderr, "hdp_sampler.add_to_table ERROR: Unable to initialize new multiset.\n");
return false;
}
return cache.on_add_to_table(*this, table, observations);
}
template<typename Observations>
inline bool add_to_new_table(const Observations& observations,
cache<BaseDistribution, DataDistribution, K, V>& cache)
{
if (!descendant_observations[table_count - 1].add(observations)) {
fprintf(stderr, "hdp_sampler.add_to_new_table ERROR: Unable to initialize new multiset.\n");
return false;
}
return cache.on_add_to_new_table(*this, observations);
}
template<typename Observations>
inline bool remove_from_table(
const Observations& observations, unsigned int table,
cache<BaseDistribution, DataDistribution, K, V>& cache)
{
descendant_observations[table].subtract(observations);
descendant_observations[table].remove_zeros();
return cache.on_remove_from_table(*this, table, observations);
}
inline bool new_table() {
unsigned int new_capacity = table_capacity;
while (new_capacity < table_count + 2)
new_capacity *= 2;
if (table_capacity != new_capacity && !resize(new_capacity)) {
fprintf(stderr, "hdp_sampler.new_table ERROR: Unable to expand table array.\n");
return false;
}
table_capacity = new_capacity;
if (!init(descendant_observations[table_count], 8)) {
fprintf(stderr, "hdp_sampler.new_table ERROR: Unable to initialize new multiset.\n");
return false;
}
table_count++;
return true;
}
inline bool new_initial_table() {
if (!init(descendant_observations[table_count], 8)) {
fprintf(stderr, "hdp_sampler.new_table ERROR: Unable to initialize new multiset.\n");
return false;
}
table_sizes[table_count] = 0;
table_count++;
return true;
}
/* NOTE: this does not change the total number of tables */
inline void free_table(unsigned int i) {
core::free(descendant_observations[i]);
}
inline void set_table_count(unsigned int new_count) {
table_count = new_count;
}
bool add_sample() {
for (unsigned int i = 0; i < child_count(); i++)
children[i].add_sample(table_count);
if (!posterior.ensure_capacity(posterior.length + 1)) {
fprintf(stderr, "hdp_sampler.add_sample ERROR: Unable to expand posterior sample array.\n");
return false;
} else if (!init(posterior[(unsigned int) posterior.length], table_count, observation_count())) {
fprintf(stderr, "hdp_sampler.add_sample ERROR: Unable to initialize new sample.\n");
return false;
}
collapsed_root_sample<K, V>& sample = posterior[(unsigned int) posterior.length];
for (unsigned int i = 0; i < observation_count(); i++)
sample.table_assignments[i] = observation_assignments[i];
sample.table_count = 0;
for (unsigned int i = 0; i < table_count; i++) {
if (!array_multiset<K>::copy(descendant_observations[i], sample.descendant_observations[i])) {
fprintf(stderr, "hdp_sampler.add_sample ERROR: Unable to copy descendant observations.\n");
core::free(sample);
return false;
}
sample.table_sizes[i] = table_sizes[i];
sample.table_count++;
}
sample.customer_count = customer_count;
posterior.length++;
return true;
}
inline void clear_samples() {
for (unsigned int i = 0; i < child_count(); i++)
children[i].clear_samples();
for (unsigned int i = 0; i < posterior.length; i++)
core::free(posterior[i]);
posterior.clear();
}
inline void relabel_tables(const unsigned int* table_map,
cache<BaseDistribution, DataDistribution, K, V>& cache)
{
/* apply the table map to the tables in the child nodes */
for (unsigned int i = 0; i < child_count(); i++)
children[i].relabel_parent_tables(table_map);
for (unsigned int i = 0; i < observation_count(); i++)
observation_assignments[i] = table_map[observation_assignments[i]];
cache.relabel_tables(table_map);
}
template<typename Metric>
static inline long unsigned int size_of(
const hdp<BaseDistribution, DataDistribution, K, V>& h, const Metric& metric) {
long unsigned int sum = core::size_of(h.children) + core::size_of(h.observations)
+ core::size_of(h.table_count) + core::size_of(h.table_capacity)
+ core::size_of(h.customer_count) + core::size_of(h.posterior);
sum += sizeof(unsigned int) * h.depth; /* for alpha */
for (unsigned int i = 0; i < h.table_count; i++)
sum += core::size_of(h.descendant_observations[i]);
sum += (sizeof(DataDistribution) + sizeof(array_multiset<K>)) * (h.table_capacity - h.table_count);
sum += sizeof(unsigned int) * h.table_capacity; /* for table_sizes */
return sum;
}
static inline void free(hdp_sampler<BaseDistribution, DataDistribution, K, V>& h) {
h.free();
core::free(h.posterior);
}
private:
bool initialize() {
table_count = 0;
children = NULL;
observation_assignments = NULL;
descendant_observations = NULL;
table_sizes = NULL;
customer_count = 0;
children = (node_sampler<K, V>*) malloc(sizeof(node_sampler<K, V>) * n->children.capacity);
observation_assignments = (unsigned int*) calloc(sizeof(unsigned int), n->observations.capacity);
if (children == NULL || observation_assignments == NULL || !resize(table_capacity)) {
fprintf(stderr, "hdp_sampler.initialize ERROR: Unable to initialize tables.\n");
free(0); return false;
}
/* add the observations from the hdp */
for (unsigned int i = 0; i < n->children.size; i++) {
if (!init(children[i], *this, n->children.values[i], (node_sampler<K, V>*) NULL, 8)) {
fprintf(stderr, "hdp_sampler.initialize ERROR: Unable to initialize child node.\n");
free(i); return false;
}
for (unsigned int j = 0; j < children[i].table_count; j++) {
/* assign the tables in the child nodes to tables in this node */
if (!sample_initial_assignment<DataDistribution>(
*this, pi(), children[i].descendant_observations[j], children[i].table_assignments[j]))
{
fprintf(stderr, "hdp_sampler.initialize ERROR: Unable to add to descendant_observations.\n");
free(i + 1); return false;
}
}
}
for (unsigned int i = 0; i < observation_count(); i++) {
if (!sample_initial_assignment<DataDistribution>(
*this, pi(), n->observations[i], observation_assignments[i]))
{
fprintf(stderr, "hdp_sampler.initialize ERROR: Unable to add to descendant_observations.\n");
free(0); return false;
}
}
recompute_root_assignments(*this);
return true;
}
inline void free_children(unsigned int child_count, const unsigned int* indices = NULL) {
if (indices == NULL) {
for (unsigned int i = 0; i < child_count; i++)
core::free(children[i]);
} else {
for (unsigned int i = 0; i < child_count; i++)
core::free(children[indices[i]]);
}
core::free(children);
}
inline void free(unsigned int child_count, const unsigned int* indices = NULL) {
if (children != NULL)
free_children(child_count, indices);
if (descendant_observations != NULL) {
for (unsigned int i = 0; i < table_count; i++)
core::free(descendant_observations[i]);
}
for (unsigned int i = 0; i < posterior.length; i++)
core::free(posterior[i]);
if (observation_assignments != NULL) core::free(observation_assignments);
if (descendant_observations != NULL) core::free(descendant_observations);
if (table_sizes != NULL) core::free(table_sizes);
}
inline void free() { free(child_count()); }
inline bool resize(unsigned int new_capacity) {
array_multiset<K>* new_descendant_observations = (array_multiset<K>*)
realloc(static_cast<void*>(descendant_observations), sizeof(array_multiset<K>) * new_capacity);
if (new_descendant_observations == NULL) {
fprintf(stderr, "hdp_sampler.resize ERROR: Unable to expand descendant_observations.\n");
return false;
}
descendant_observations = new_descendant_observations;
unsigned int* new_table_sizes = (unsigned int*)
realloc(table_sizes, sizeof(unsigned int) * new_capacity);
if (new_table_sizes == NULL) {
fprintf(stderr, "hdp_sampler.resize ERROR: Unable to expand table_sizes.\n");
return false;
}
table_sizes = new_table_sizes;
return true;
}
template<typename A, typename B, typename C, typename D>
friend bool init(hdp_sampler<A, B, C, D>&, hdp<A, B, C, D>&, unsigned int);
template<typename A, typename B, typename C>
friend bool read_sampler_node(A&, B&, typename A::node_type&, C&);
};
/**
* Initializes the given root sampler node `h` with the given `root` node from
* the HDP hierarchy. All existing observations are assigned a table in the
* hierarchy one-by-one, each time sampling the table assignment from the
* conditional distribution: given that all other table assignments are fixed,
* the assignment for each observation is sampled. This function also
* initializes all descendant sampler nodes, recursively, parallel to the HDP
* hierarchy.
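*
* Unlike the hdp_sampler constructor, which calls `exit` on failure, `init`
* reports failure through its return value. A minimal sketch (following the
* raw-storage initialization pattern this library uses elsewhere; `h` is an
* existing ::hdp):
* ```{.cpp}
* hdp_sampler<BaseDistribution, DataDistribution, K, V>* sampler =
*     (hdp_sampler<BaseDistribution, DataDistribution, K, V>*)
*     malloc(sizeof(hdp_sampler<BaseDistribution, DataDistribution, K, V>));
* if (sampler == NULL || !init(*sampler, h))
*     return false;
* ```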
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
bool init(hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
hdp<BaseDistribution, DataDistribution, K, V>& root, unsigned int initial_table_count = 1)
{
h.n = &root;
h.table_count = 0;
h.table_capacity = 1 << (core::log2(initial_table_count) + 2);
if (!array_init(h.posterior, 32)) {
fprintf(stderr, "init ERROR: Unable to initialize posterior sample array in hdp.\n");
return false;
}
return h.initialize();
}
/**
* Copies the HDP sampler hierarchy rooted at `src` into a new sampler
* hierarchy rooted at `dst` (which should be uninitialized). The new sampler
* hierarchy may be associated with a new HDP hierarchy, rooted at `new_root`,
* in which case, `node_map` should contain a map of pointers to nodes from the
* old hierarchy to pointers to nodes in the new hierarchy. The `copy` function
* for ::hdp is one way to produce such a map.
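*
* A usage sketch: `h` and `sampler` form an existing hierarchy, `new_h` is a
* copy of the HDP hierarchy, and `new_sampler` is uninitialized storage for
* the new sampler (the ::hdp `copy` overload is assumed here to populate
* `node_map`; see hdp.h for its exact signature):
* ```{.cpp}
* hash_map<const node<K, V>*, node<K, V>*> node_map(64);
* if (!copy(h, new_h, node_map)
*  || !copy(sampler, new_sampler, new_h, node_map))
*     return false;
* ```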
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
bool copy(
const hdp_sampler<BaseDistribution, DataDistribution, K, V>& src,
hdp_sampler<BaseDistribution, DataDistribution, K, V>& dst,
hdp<BaseDistribution, DataDistribution, K, V>& new_root,
const hash_map<const node<K, V>*, node<K, V>*>& node_map)
{
dst.n = &new_root;
dst.table_count = src.table_count;
dst.table_capacity = src.table_count;
dst.customer_count = src.customer_count;
if (!array_init(dst.posterior, 1))
return false;
dst.children = (node_sampler<K, V>*) malloc(sizeof(node_sampler<K, V>) * src.child_count());
if (dst.children == NULL) {
free(dst.posterior);
return false;
}
dst.observation_assignments = (unsigned int*) malloc(sizeof(unsigned int) * src.observation_count());
if (dst.observation_assignments == NULL) {
free(dst.posterior);
free(dst.children);
return false;
}
memcpy(dst.observation_assignments, src.observation_assignments, sizeof(unsigned int) * src.observation_count());
dst.descendant_observations = (array_multiset<K>*) malloc(sizeof(array_multiset<K>) * src.table_count);
if (dst.descendant_observations == NULL) {
free(dst.posterior);
free(dst.observation_assignments);
free(dst.children);
return false;
}
dst.table_sizes = (unsigned int*) malloc(sizeof(unsigned int) * src.table_count);
if (dst.table_sizes == NULL) {
free(dst.posterior);
free(dst.observation_assignments);
free(dst.children);
free(dst.descendant_observations);
return false;
}
memcpy(dst.table_sizes, src.table_sizes, sizeof(unsigned int) * src.table_count);
for (unsigned int i = 0; i < src.table_count; i++)
if (!init(dst.descendant_observations[i], src.descendant_observations[i]))
return false; /* TODO: free memory (also add appropriate error messages) */
for (unsigned int i = 0; i < src.child_count(); i++)
if (!copy(src.children[i], dst.children[i], node_map))
return false; /* TODO: free memory (also add appropriate error messages) */
return true;
}
/**
* Prints the given sampler `node` to `out`.
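*
* For example, assuming `default_scribe` (from core) satisfies both scribe
* concepts below for the key and observation types, a minimal usage sketch is:
* ```{.cpp}
* default_scribe scribe;
* print(sampler, stdout, scribe, scribe);
* ```
*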
* \tparam NodeType either node_sampler or hdp_sampler.
* \tparam KeyPrinter a scribe type for which the function
* `bool print(unsigned int, Stream&, KeyPrinter&)` or
* `bool print(unsigned int, Stream&, unsigned int, KeyPrinter&)` is defined
* (if the latter is defined, it will be called, and the level of the
* printed node is passed as the third argument). This scribe is used to
* print the `unsigned int` keys of child nodes in the hierarchy.
* \tparam AtomPrinter a scribe type for which the function
* `bool print(const K&, Stream&, AtomPrinter&)` is defined. This scribe
* is used to print the observations in the hierarchy.
*/
template<typename NodeType, typename Stream, typename KeyPrinter,
typename AtomPrinter, typename K = typename NodeType::atom_type>
bool print(const NodeType& node, Stream& out,
KeyPrinter& key_printer, AtomPrinter& atom_printer, unsigned int level = 0)
{
bool success = true;
if (node.observation_count() > 0) {
array_multiset<K> multiset = array_multiset<K>(node.observation_count());
for (const K& observation : node.n->observations)
multiset.add_unsorted(observation);
insertion_sort(multiset.counts.keys, multiset.counts.values, (unsigned int) multiset.counts.size, default_sorter());
for (unsigned int i = 0; i < multiset.counts.size; i++) {
if (i > 0) success &= print(", ", out);
success &= print(multiset.counts.keys[i], out, atom_printer)
&& print(':', out) && print(multiset.counts.values[i], out);
}
}
bool first = true;
for (unsigned int i = 0; i < node.child_count(); i++) {
if (node.children[i].customer_count == 0) continue;
if (first) first = false;
else success &= print(' ', out);
success &= print('{', out) && print(node.child_key(i), out, level, key_printer) && print(": ", out)
&& print(node.children[i], out, key_printer, atom_printer, level + 1) && print('}', out);
}
return success;
}
template<typename K, typename V>
inline void fix_parent_pointers(node_sampler<K, V>& node) {
for (unsigned int i = 0; i < node.child_count(); i++)
node.children[i].parent = &node;
}
template<typename K, typename V>
inline node_sampler<K, V>* get_parent(node_sampler<K, V>& parent) {
return &parent;
}
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
constexpr node_sampler<K, V>* get_parent(hdp_sampler<BaseDistribution, DataDistribution, K, V>& parent) {
return NULL;
}
template<typename K, typename V, typename Stream, typename ParentType, typename KeyReader>
bool read(node_sampler<K, V>& n, Stream& stream, node<K, V>& hdp_node, ParentType& parent, KeyReader& key_reader)
{
n.parent = get_parent(parent);
n.table_assignments = NULL;
n.root_assignments = NULL;
if (!read_sampler_node(n, stream, hdp_node, key_reader))
return false;
if (!read(n.table_assignments, stream, n.table_count)) {
fprintf(stderr, "read ERROR: Insufficient memory for table assignment array.\n");
n.free(); free(n.posterior); return false;
}
return true;
}
/**
* Reads the sampler hierarchy rooted at `n` from `stream`. This function
* recursively reads all descendant nodes in the sampler hierarchy.
* \tparam KeyReader a scribe type for which the function
* `bool read(unsigned int&, Stream&, KeyReader&)` is defined. This scribe is
* used to read the `unsigned int` keys of child nodes in the hierarchy.
*/
template<typename BaseDistribution, typename DataDistribution,
typename K, typename V, typename Stream, typename KeyReader>
bool read(hdp_sampler<BaseDistribution, DataDistribution, K, V>& n,
Stream& stream, hdp<BaseDistribution, DataDistribution, K, V>& h,
KeyReader& key_reader)
{
if (!read_sampler_node(n, stream, h, key_reader)) return false;
recompute_root_assignments(n);
return true;
}
template<typename NodeType, typename Stream, typename KeyWriter>
bool write_sampler_node(const NodeType& n, Stream& stream, KeyWriter& key_writer)
{
if (!write(n.table_count, stream)
|| !write(n.n->children.keys, stream, n.child_count(), key_writer)
|| !write(n.observation_assignments, stream, n.observation_count()))
return false;
for (unsigned int i = 0; i < n.child_count(); i++)
if (!write(n.children[i], stream, key_writer)) return false;
return true;
}
template<typename K, typename V, typename Stream, typename KeyWriter>
bool write(const node_sampler<K, V>& n, Stream& stream, KeyWriter& key_writer)
{
return write_sampler_node(n, stream, key_writer)
&& write(n.table_assignments, stream, n.table_count);
}
/**
* Writes the sampler hierarchy rooted at `n` to `stream`. This function
* recursively writes all descendant nodes in the sampler hierarchy.
* \tparam KeyWriter a scribe type for which the function
* `bool write(unsigned int, Stream&, KeyWriter&)` is defined. This scribe is
* used to write the `unsigned int` keys of child nodes in the hierarchy.
*/
template<typename BaseDistribution, typename DataDistribution,
typename K, typename V, typename Stream, typename KeyWriter>
bool write(const hdp_sampler<BaseDistribution, DataDistribution, K, V>& n, Stream& stream, KeyWriter& key_writer)
{
return write_sampler_node(n, stream, key_writer);
}
template<typename NodeType, typename Stream, typename KeyReader>
bool read_sampler_node(NodeType& n, Stream& stream,
typename NodeType::node_type& hdp_node,
KeyReader& key_reader)
{
unsigned int table_count;
if (!array_init(n.posterior, 32)) {
return false;
} else if (!read(table_count, stream)) {
free(n.posterior);
return false;
}
n.table_count = 0;
n.customer_count = 0;
n.table_capacity = max(1u, table_count);
n.n = &hdp_node;
n.descendant_observations = NULL; n.table_sizes = NULL;
n.children = (typename NodeType::child_type*) malloc(sizeof(typename NodeType::child_type) * hdp_node.children.capacity);
n.observation_assignments = (unsigned int*) calloc(hdp_node.observations.capacity, sizeof(unsigned int));
unsigned int* keys = (unsigned int*) malloc(sizeof(unsigned int) * hdp_node.children.size);
unsigned int* indices = (unsigned int*) malloc(sizeof(unsigned int) * hdp_node.children.size);
unsigned int* inverse_indices = (unsigned int*) malloc(sizeof(unsigned int) * hdp_node.children.size);
if (n.children == NULL || n.observation_assignments == NULL || keys == NULL || indices == NULL || inverse_indices == NULL
|| !n.resize(n.table_capacity) || !read(keys, stream, hdp_node.children.size, key_reader)
|| !read(n.observation_assignments, stream, (unsigned int) hdp_node.observations.length))
{
fprintf(stderr, "read_sampler_node ERROR: Unable to initialize tables.\n");
n.free(0); free(n.posterior); return false;
}
memset(n.table_sizes, 0, sizeof(unsigned int) * table_count);
for (unsigned int i = 0; i < table_count; i++) {
if (!init(n.descendant_observations[i], 8)) {
fprintf(stderr, "read_sampler_node ERROR: Unable to initialize descendant observation multisets.\n");
n.free(0); free(n.posterior); free(keys); free(indices); free(inverse_indices); return false;
}
n.table_count++;
}
for (unsigned int i = 0; i < hdp_node.children.size; i++) {
indices[i] = i;
inverse_indices[i] = i;
}
if (hdp_node.children.size > 1) {
sort(keys, indices, hdp_node.children.size, default_sorter());
sort(indices, inverse_indices, hdp_node.children.size);
}
for (unsigned int i = 0; i < hdp_node.children.size; i++)
free(keys[i]);
free(indices); free(keys);
for (unsigned int i = 0; i < hdp_node.children.size; i++) {
if (!read(n.children[inverse_indices[i]], stream, hdp_node.children.values[inverse_indices[i]], n, key_reader)) {
fprintf(stderr, "read_sampler_node ERROR: Unable to read child node.\n");
n.free(i, inverse_indices); free(n.posterior); free(inverse_indices); return false;
}
}
free(inverse_indices);
/* compute descendant observations, table sizes, and customer count */
for (unsigned int i = 0; i < n.observation_count(); i++) {
unsigned int table = n.observation_assignments[i];
if (!n.descendant_observations[table].add(n.get_observation(i))) {
fprintf(stderr, "read_sampler_node ERROR: Unable to add observation.\n");
n.free(); free(n.posterior); return false;
}
n.table_sizes[table]++;
n.customer_count++;
}
for (unsigned int i = 0; i < n.child_count(); i++) {
for (unsigned int j = 0; j < n.children[i].table_count; j++) {
unsigned int table = n.children[i].table_assignments[j];
if (!n.descendant_observations[table].add(n.children[i].descendant_observations[j])) {
fprintf(stderr, "read_sampler_node ERROR: Unable to add child node's observations.\n");
n.free(); free(n.posterior); return false;
}
n.table_sizes[table]++;
n.customer_count++;
}
}
return true;
}
template<typename DataDistribution, typename NodeType, typename BaseDistribution, typename Observations>
inline bool sample_initial_assignment(
NodeType& n,
const BaseDistribution& pi,
const Observations& observations,
unsigned int& assignment)
{
typedef typename NodeType::value_type V;
/* compute likelihood of the observations for each root cluster */
array<V> distribution = array<V>(n.table_count + 1);
for (unsigned int i = 0; i < n.table_count; i++)
distribution[i] = DataDistribution::log_conditional(pi, observations, n.descendant_observations[i]);
distribution[n.table_count] = DataDistribution::log_probability(pi, observations);
distribution.length = n.table_count + 1;
normalize_exp(distribution.data, (unsigned int) distribution.length);
assignment = sample_categorical(distribution.data, (unsigned int) distribution.length);
if (assignment == n.table_count) {
/* create the table before writing its size, since new_table() may resize the table arrays */
if (!n.new_table()) return false;
n.table_sizes[assignment] = 1;
} else {
n.table_sizes[assignment]++;
}
n.customer_count++;
return n.descendant_observations[assignment].add(observations);
}
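/* The following structures pair a sampler node with the root of its
hierarchy, so that functions operating on a node during sampling also have
access to root-level information such as the root table distribution. */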
template<typename RootType>
struct root_position {
typedef RootType root_type;
typedef typename RootType::atom_type atom_type;
typedef typename RootType::value_type value_type;
RootType& node;
root_position(RootType& root) : node(root) { }
inline unsigned int depth() const {
return node.depth;
}
inline const RootType& get_root() const {
return node;
}
inline RootType& get_root() {
return node;
}
};
template<typename RootType>
inline root_position<RootType> make_root_node(RootType& root) {
return root_position<RootType>(root);
}
template<typename RootType>
struct child_position {
typedef RootType root_type;
typedef typename RootType::child_type node_type;
typedef typename RootType::atom_type atom_type;
typedef typename RootType::value_type value_type;
RootType& root;
node_type& node;
child_position(RootType& root, node_type& node) : root(root), node(node) { }
inline unsigned int depth() const {
return root.depth;
}
inline RootType& get_root() {
return root;
}
inline const RootType& get_root() const {
return root;
}
};
template<typename RootType>
inline child_position<RootType> make_child_node(
const root_position<RootType>& root, typename RootType::child_type& node) {
return child_position<RootType>(root.node, node);
}
template<typename RootType>
inline child_position<RootType> make_child_node(
const child_position<RootType>& root, typename RootType::child_type& node) {
return child_position<RootType>(root.root, node);
}
template<bool UpdateCustomerCount, typename K, typename V, typename Cache>
inline void decrement_table(node_sampler<K, V>& n, unsigned int table, Cache& cache)
{
#if !defined(NDEBUG)
if (n.table_sizes[table] == 0) {
fprintf(stderr, "decrement_table ERROR: table_sizes is invalid.\n");
return;
}
#endif
n.table_sizes[table]--;
if (UpdateCustomerCount)
n.customer_count--;
}
template<bool UpdateCustomerCount, typename BaseDistribution,
typename DataDistribution, typename K, typename V>
inline void decrement_table(
hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
unsigned int table, cache<BaseDistribution, DataDistribution, K, V>& cache)
{
#if !defined(NDEBUG)
if (h.table_sizes[table] == 0) {
fprintf(stderr, "decrement_table ERROR: table_sizes is invalid.\n");
return;
}
#endif
cache.on_change_table_size(table, h.table_sizes[table] - 1);
h.table_sizes[table]--;
if (UpdateCustomerCount)
h.customer_count--;
}
template<bool UpdateCustomerCount, typename K, typename V, typename Cache>
inline void increment_table(node_sampler<K, V>& n, unsigned int table, Cache& cache)
{
n.table_sizes[table]++;
if (UpdateCustomerCount)
n.customer_count++;
}
template<bool UpdateCustomerCount, typename BaseDistribution,
typename DataDistribution, typename K, typename V>
inline void increment_table(
hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
unsigned int table, cache<BaseDistribution, DataDistribution, K, V>& cache)
{
cache.on_change_table_size(table, h.table_sizes[table] + 1);
h.table_sizes[table]++;
if (UpdateCustomerCount)
h.customer_count++;
}
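/* Computes the number of customers seated at each table of `n` (direct
observations as well as tables of child nodes that are assigned to it) and
accumulates the counts into the histogram `dst`. */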
template<typename NodeType>
void compute_table_sizes(const NodeType& n, unsigned int* dst, unsigned int dst_length) {
for (unsigned int i = 0; i < n.observation_count(); i++)
add_to_histogram(dst, dst_length, n.observation_assignments[i]);
for (unsigned int i = 0; i < n.child_count(); i++) {
typename NodeType::child_type& child = n.children[i];
for (unsigned int j = 0; j < child.table_count; j++)
if (child.descendant_observations[j].counts.size != 0)
add_to_histogram(dst, dst_length, child.table_assignments[j]);
}
}
template<typename NodeType>
bool check_table(const NodeType& n, unsigned int table,
const array_multiset<typename NodeType::atom_type>& expected_observations)
{
bool success = true;
auto computed_observations =
array_multiset<typename NodeType::atom_type>((unsigned int) expected_observations.counts.size);
for (unsigned int i = 0; i < n.child_count(); i++) {
if (!n.children[i].get_observations(computed_observations, table)) {
fprintf(stderr, "check_table ERROR: Unable to add observations from child node.\n");
return false;
}
}
for (unsigned int i = 0; i < n.observation_count(); i++) {
if (n.observation_assignments[i] == table)
computed_observations.add(n.n->observations[i]);
}
if (computed_observations == expected_observations)
return success;
fprintf(stderr, "check_table ERROR: descendant_observations of table %u is invalid.\n", table);
return false;
}
template<typename K, typename V>
bool check_tables(const node_sampler<K, V>& n)
{
unsigned int* expected_sizes = (unsigned int*) calloc(n.table_count, sizeof(unsigned int));
compute_table_sizes(n, expected_sizes, n.table_count);
if (!compare_histograms(expected_sizes, n.table_sizes, n.table_count, n.table_count)) {
fprintf(stderr, "check_tables WARNING: table_sizes is inconsistent.\n");
free(expected_sizes);
return false;
}
free(expected_sizes);
for (unsigned int i = 0; i < n.child_count(); i++) {
if (n.children[i].parent != &n) {
fprintf(stderr, "check_tables WARNING: Parent pointer is inconsistent.\n");
return false;
}
if (!check_tables(n.children[i]))
return false;
}
for (unsigned int i = 0; i < n.table_count; i++)
if (!check_table(n, i, n.descendant_observations[i]))
return false;
return true;
}
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
bool check_tables(const hdp_sampler<BaseDistribution, DataDistribution, K, V>& h)
{
unsigned int* expected_table_sizes = (unsigned int*) calloc(h.table_count, sizeof(unsigned int));
compute_table_sizes(h, expected_table_sizes, h.table_count);
if (!compare_histograms(expected_table_sizes, h.table_sizes, h.table_count, h.table_count))
{
fprintf(stderr, "check_tables WARNING: table_sizes is inconsistent.\n");
free(expected_table_sizes);
return false;
}
free(expected_table_sizes);
for (unsigned int i = 0; i < h.child_count(); i++) {
if (h.children[i].parent != NULL) {
fprintf(stderr, "check_tables WARNING: Parent pointer is inconsistent.\n");
return false;
}
if (!check_tables(h.children[i]))
return false;
}
for (unsigned int i = 0; i < h.table_count; i++)
if (!check_table(h, i, h.descendant_observations[i]))
return false;
return true;
}
template<typename NodeType>
bool tables_sorted(const NodeType& h)
{
for (unsigned int i = 0; i < h.child_count(); i++)
if (!tables_sorted(h.children[i]))
return false;
for (unsigned int i = 0; i < h.table_count; i++)
if (!h.descendant_observations[i].is_sorted())
return false;
return true;
}
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
bool is_valid(const hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
const cache<BaseDistribution, DataDistribution, K, V>& cache)
{
if (!tables_sorted(h)) {
fprintf(stderr, "is_valid WARNING: Table array_multisets are not sorted.\n");
return false;
} else if (!check_tables(h)) {
fprintf(stderr, "sample_assignment WARNING: The HDP structure is invalid.\n");
return false;
} else if (!cache.is_valid(h)) {
fprintf(stderr, "sample_assignment WARNING: Table distribution cache is invalid.\n");
return false;
}
return true;
}
template<typename K, typename V>
void prepare_sampler(node_sampler<K, V>& n, unsigned int root_table_count)
{
#if !defined(NDEBUG)
/* for debugging, check the consistency of the table_sizes data structure */
unsigned int* expected_sizes = (unsigned int*) calloc(n.table_count, sizeof(unsigned int));
compute_table_sizes(n, expected_sizes, n.table_count);
if (!compare_histograms(expected_sizes, n.table_sizes, n.table_count, n.table_count))
fprintf(stderr, "prepare_sampler WARNING: table_sizes is inconsistent.\n");
free(expected_sizes);
#endif
/* sort any multisets */
for (unsigned int j = 0; j < n.table_count; j++) {
if (n.descendant_observations[j].counts.size <= 1)
continue;
quick_sort(n.descendant_observations[j].counts.keys,
n.descendant_observations[j].counts.values,
(unsigned int) n.descendant_observations[j].counts.size, default_sorter());
}
/* recurse to all descendants */
for (unsigned int i = 0; i < n.child_count(); i++)
prepare_sampler(n.children[i], root_table_count);
}
/**
* Performs the necessary operations to prepare the variables in the sampler
* hierarchy given by `h` and the `cache` for calls to sample_hdp. This
* function also recursively prepares all descendant sampler nodes.
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
inline void prepare_sampler(
hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
cache<BaseDistribution, DataDistribution, K, V>& cache)
{
#if !defined(NDEBUG)
/* for debugging, check the consistency of the table_sizes data structure */
unsigned int* expected_sizes = (unsigned int*) calloc(h.table_count, sizeof(unsigned int));
compute_table_sizes(h, expected_sizes, h.table_count);
if (!compare_histograms(expected_sizes, h.table_sizes, h.table_count, h.table_count))
fprintf(stderr, "prepare_sampler WARNING: table_sizes is inconsistent.\n");
free(expected_sizes);
#endif
h.clear_samples();
cache.prepare_sampler(h);
/* recurse to all descendants */
for (unsigned int i = 0; i < h.child_count(); i++)
prepare_sampler(h.children[i], h.table_count);
}
template<typename K, typename V, typename NodeType>
void recompute_root_assignments(node_sampler<K, V>& n, const NodeType& parent) {
for (unsigned int i = 0; i < n.table_count; i++) {
unsigned int assignment = n.table_assignments[i];
n.root_assignments[i] = parent.root_assignment(assignment);
}
for (unsigned int i = 0; i < n.child_count(); i++)
recompute_root_assignments(n.children[i], n);
}
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
void recompute_root_assignments(hdp_sampler<BaseDistribution, DataDistribution, K, V>& h) {
for (unsigned int i = 0; i < h.child_count(); i++)
recompute_root_assignments(h.children[i], h);
}
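/* Removes tables with no customers from `n` and all of its descendants,
compacting the table arrays and relabeling the remaining tables;
`table_map` maps each old table index to its new index. */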
template<typename NodeType, typename Cache>
void remove_empty_tables(NodeType& n, Cache& cache)
{
/* recursively remove empty tables in the child nodes */
for (unsigned int i = 0; i < n.child_count(); i++)
remove_empty_tables(n.children[i], cache);
/* compute the table map for the children of this node */
unsigned int new_length = 0;
unsigned int* table_map = (unsigned int*) alloca(
sizeof(unsigned int) * n.table_count);
for (unsigned int i = 0; i < n.table_count; i++) {
if (n.table_sizes[i] == 0) {
n.free_table(i);
} else {
table_map[i] = new_length;
if (new_length != i)
n.move_table(i, new_length, cache);
new_length++;
}
}
n.set_table_count(new_length);
n.relabel_tables(table_map, cache);
}
template<typename V>
struct sample_result {
unsigned int selected_table;
V new_table_probability;
};
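/* Samples a table at the root of the hierarchy. Under the Chinese restaurant
representation, an existing table is selected with probability proportional
to its mass in `root_distribution`, and a new table with probability
proportional to alpha times the prior probability of the observations;
`new_table_probability` is the log prior, and the exp(... - max) rescaling
keeps the new-table term on the same scale as `root_distribution`. */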
template<bool Collapsed, typename RootType, typename Distribution, typename V>
inline sample_result<V> sample_table(
const root_position<RootType>& n,
const Distribution& root_distribution,
const V& new_table_probability)
{
/* sample either a new or old table at the root */
V max = root_distribution.max();
V old_table = root_distribution.sum() / (n.node.alpha() + n.node.customer_count);
V new_table = exp(n.node.log_alpha() + new_table_probability - max) / (n.node.alpha() + n.node.customer_count);
V uniform = sample_uniform<double>() * (old_table + new_table);
if (old_table == 0.0 || uniform < new_table)
return {n.node.table_count, old_table + new_table};
else {
unsigned int selected = sample_categorical(
root_distribution.probabilities(),
root_distribution.sum(), n.node.table_count);
return {selected, old_table + new_table};
}
}
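/* Samples a table at an internal node: each existing table j is chosen with
probability proportional to its size times the root-level likelihood of its
root assignment, and with the remaining probability (proportional to alpha
times the mass returned by the parent) the choice is deferred to the parent.
A returned index of at least table_count encodes the table selected at an
ancestor. */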
template<bool Collapsed, typename RootType, typename Distribution, typename V>
inline sample_result<V> sample_table(
const child_position<RootType>& n,
const Distribution& root_distribution,
const V& new_table_probability)
{
V* probabilities = (V*) malloc(sizeof(V) * (n.node.table_count + 1));
for (unsigned int j = 0; j < n.node.table_count; j++)
probabilities[j] = n.node.table_sizes[j] / (n.node.alpha() + n.node.customer_count)
* root_distribution.likelihood(n.node.root_assignment(j));
sample_result<V> parent_result;
if (n.node.parent == NULL) {
parent_result = sample_table<Collapsed>(
make_root_node(n.root), root_distribution, new_table_probability);
} else {
parent_result = sample_table<Collapsed>(make_child_node(n, *n.node.parent),
root_distribution, new_table_probability);
}
probabilities[n.node.table_count] = parent_result.new_table_probability
* n.node.alpha() / (n.node.alpha() + n.node.customer_count);
unsigned int selected = sample_categorical(probabilities, n.node.table_count + 1);
if (selected == n.node.table_count)
selected += parent_result.selected_table;
V sum = probabilities[n.node.table_count];
free(probabilities);
return {selected, sum};
}
/**
* <!-- Below are functions for adding new customers to existing or new tables. -->
*/
/* initializes a new table at the existing position 'index' */
template<bool Collapsed, typename RootType, typename Observations, typename Cache>
void replace_with_new_table(
child_position<RootType>& n,
unsigned int index,
const Observations& observations,
unsigned int selected_parent_table,
Cache& cache)
{
/* need to draw a new assignment for the parent table */
unsigned int& new_table_assignment = n.node.table_assignments[index];
if (n.node.parent != NULL) {
auto parent = make_child_node(n, *n.node.parent);
select_table<Collapsed>(parent, observations,
selected_parent_table, new_table_assignment, cache);
n.node.root_assignments[index] = parent.node.root_assignment(new_table_assignment);
} else {
auto root = make_root_node(n.root);
select_table<Collapsed>(root, observations,
selected_parent_table, new_table_assignment, cache);
n.node.root_assignments[index] = new_table_assignment;
}
}
template<bool Collapsed, typename RootType, typename Observations, typename Cache>
inline void assign_new_table(
root_position<RootType>& n,
const Observations& observations,
unsigned int selected_parent_table,
const Cache& cache)
{
if (!Collapsed)
fprintf(stderr, "assign_new_table ERROR: Unsupported operation.\n");
//n.node.pi.sample(n.node.phi[n.node.table_count - 1]);
}
template<bool Collapsed, typename RootType, typename Observations, typename Cache>
inline void assign_new_table(
child_position<RootType>& n,
const Observations& observations,
unsigned int selected_parent_table,
Cache& cache)
{
replace_with_new_table<Collapsed>(n,
n.node.table_count - 1, observations, selected_parent_table, cache);
}
template<bool Collapsed, typename NodeType, typename Observations, typename Cache>
bool move_to_new_table(NodeType& n,
const Observations& observations,
unsigned int selected_parent_table,
Cache& cache)
{
/* create a new table, and add the observations to it */
if (!n.node.add_to_new_table(observations, cache))
return false;
assign_new_table<Collapsed>(n, observations, selected_parent_table, cache);
return true;
}
template<typename RootType, typename Observations, typename Cache>
bool add_to_table(root_position<RootType>& n,
unsigned int dst, const Observations& observations, Cache& cache)
{
return n.node.add_to_table(observations, dst, cache);
}
template<typename RootType, typename Observations, typename Cache>
bool add_to_table(child_position<RootType>& n,
unsigned int dst, const Observations& observations, Cache& cache)
{
if (!n.node.add_to_table(observations, dst, cache))
return false;
/* recursively add the observations to the parent table to which table `dst` is assigned */
if (n.node.parent == NULL) {
auto root = make_root_node(n.root);
return add_to_table(root, n.node.table_assignments[dst], observations, cache);
} else {
auto parent = make_child_node(n, *n.node.parent);
return add_to_table(parent, n.node.table_assignments[dst], observations, cache);
}
}
template<bool Collapsed, typename NodeType, typename Observations, typename Cache>
bool add_to_new_table(NodeType& n,
const Observations& observations,
unsigned int selected_parent_table,
Cache& cache)
{
/* create a new table, and add the observations to it */
if (!n.node.add_to_new_table(observations, cache))
return false;
assign_new_table<Collapsed>(n, observations, selected_parent_table, cache);
return true;
}
template<bool Collapsed, typename NodeType, typename Observations, typename Cache>
bool select_table(NodeType& n,
const Observations& observations,
unsigned int selected_table,
unsigned int& new_table,
Cache& cache)
{
if (selected_table < n.node.table_count) {
/* selected an existing table in this level */
new_table = selected_table;
n.node.table_sizes[selected_table]++;
n.node.customer_count++;
add_to_table(n, selected_table, observations, cache);
} else {
if (!n.node.new_table()) {
fprintf(stderr, "select_table ERROR: Unable to create table.\n");
return false;
}
new_table = n.node.table_count - 1;
n.node.table_sizes[new_table] = 1;
n.node.customer_count++;
move_to_new_table<Collapsed>(n, observations,
selected_table - (n.node.table_count - 1), cache);
}
return true;
}
template<bool Collapsed, typename NodeType, typename BaseDistribution, typename Observations, typename Cache>
inline bool sample_new_assignment(
NodeType& n,
const BaseDistribution& pi,
const Observations& observations,
unsigned int& assignment,
Cache& cache)
{
typedef typename NodeType::value_type V;
/* compute likelihood of the observations for each root cluster */
auto& root = n.get_root();
auto root_distribution = cache.compute_root_distribution(root, observations);
V new_table_probability = NodeType::root_type::data_distribution_type::log_probability(root.pi(), observations);
sample_result<V> result = sample_table<Collapsed>(n, root_distribution, new_table_probability);
cache.on_sample_table(root, root_distribution, root.table_count, observations);
root_distribution.free();
/* select the new table assignment */
if (result.selected_table < n.node.table_count) {
/* we chose a table at this level */
increment_table<true>(n.node, result.selected_table, cache);
assignment = result.selected_table;
add_to_table(n, result.selected_table, observations, cache);
} else {
/* we chose a table at a higher level */
unsigned int new_table = n.node.table_count;
if (!n.node.new_table()) {
fprintf(stderr, "sample_assignment ERROR: Unable to create table.\n");
return false;
}
n.node.table_sizes[new_table] = 1;
n.node.customer_count++;
assignment = new_table;
move_to_new_table<Collapsed>(n, observations,
result.selected_table - (n.node.table_count - 1), cache);
}
return true;
}
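/* Walks down `path` and, at each node, computes the root table at which a new
customer *would* be seated under the current seating arrangement. A node
that would open a new table defers to its parent's choice, so UINT_MAX is
returned only if a fresh table would be created at the root. */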
template<typename NodeType, typename Cache>
unsigned int sample_table(const NodeType& n,
const unsigned int* path,
unsigned int length, Cache& cache)
{
typedef typename NodeType::value_type V;
/* first, compute the table that *would* be sampled if we were sampling from this node */
unsigned int table;
V random = (n.customer_count + n.alpha()) * sample_uniform<V>();
if (random < n.customer_count) {
table = n.root_assignment(select_categorical(n.table_sizes, random, n.table_count));
} else {
table = UINT_MAX;
}
if (length == 0)
return table;
unsigned int index = strict_linear_search(
n.n->children.keys, *path, 0, (unsigned int) n.n->children.size);
if (index > 0 && n.n->children.keys[index - 1] == *path) {
/* a child node with this key already exists, so recurse into it */
const auto& child = n.children[index - 1];
unsigned int child_table = sample_table(child, path + 1, length - 1, cache);
if (child_table == UINT_MAX) {
return table;
} else {
return child_table;
}
} else {
return table;
}
}
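/* Samples an observation from the node at `path` (with `length` indices): if
a new root table is chosen, `output` is drawn from the base distribution;
otherwise, it is drawn from the data distribution conditioned on the
observations at the selected root table. */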
template<typename BaseDistribution, typename DataDistribution,
typename K, typename V, typename Cache, typename... Args>
bool sample(const hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
K& output, const unsigned int* path, unsigned int length, Cache& cache, Args&&... args)
{
unsigned int root_table = sample_table(h, path, length, cache);
if (root_table == UINT_MAX) {
/* we chose to sample a new table at the root */
return sample(h.pi(), output, std::forward<Args>(args)...);
} else {
return DataDistribution::sample(h.pi(), h.descendant_observations[root_table], output);
}
}
template<typename RootType>
inline node_sampler<typename RootType::atom_type, typename RootType::value_type>*
get_node_pointer(child_position<RootType>& child) {
return &child.node;
}
template<typename RootType>
inline node_sampler<typename RootType::atom_type, typename RootType::value_type>*
get_node_pointer(root_position<RootType>& child) {
return NULL;
}
template<bool Collapsed = true, typename NodeType, typename BaseDistribution, typename Cache>
bool add(NodeType n, const BaseDistribution& pi,
const typename NodeType::value_type* alpha,
const unsigned int* path, unsigned int length,
const typename NodeType::atom_type& observation, Cache& cache)
{
typedef typename NodeType::atom_type K;
typedef typename NodeType::value_type V;
if (length == 0) {
size_t old_capacity = n.node.n->observations.capacity;
if (!n.node.n->observations.add(observation)) {
fprintf(stderr, "add ERROR: Unable to add new observation.\n");
return false;
} else if (old_capacity < n.node.n->observations.capacity) {
if (!resize(n.node.observation_assignments, n.node.n->observations.capacity)) {
free(n.node.n->observations.last());
n.node.n->observations.length--;
return false;
}
}
return sample_new_assignment<Collapsed>(n, pi, observation,
n.node.observation_assignments[n.node.n->observations.length - 1], cache);
}
size_t old_capacity = n.node.n->children.capacity;
if (!n.node.n->children.ensure_capacity((unsigned int) n.node.n->children.size + 1)) {
fprintf(stderr, "add ERROR: Unable to expand children map.\n");
return false;
} else if (old_capacity < n.node.n->children.capacity) {
if (!resize(n.node.children, n.node.n->children.capacity))
return false;
/* pointers to parent and hdp nodes could be invalidated here, so fix them */
for (unsigned int i = 0; i < n.node.n->children.size; i++) {
n.node.children[i].n = &n.node.n->children.values[i];
fix_parent_pointers(n.node.children[i]);
}
}
#if !defined(NDEBUG)
if (*path == IMPLICIT_NODE)
fprintf(stderr, "add WARNING: The specified HDP path is ambiguous.\n");
#endif
unsigned int index = strict_linear_search(n.node.n->children.keys, *path, 0, (unsigned int) n.node.n->children.size);
if (index > 0 && n.node.n->children.keys[index - 1] == *path) {
/* a child node with this key already exists, so recurse into it */
return add(make_child_node(n, n.node.children[index - 1]), pi, alpha + 1, path + 1, length - 1, observation, cache);
} else {
/* there is no child node with the given key, so create it */
shift_right(n.node.n->children.keys, (unsigned int) n.node.n->children.size, index);
shift_right(n.node.n->children.values, (unsigned int) n.node.n->children.size, index);
node<K, V>& child = n.node.n->children.values[index];
if (!init(child, *alpha)) {
fprintf(stderr, "add ERROR: Error creating new child.\n");
return false;
}
shift_right(n.node.children, n.node.child_count(), index);
n.node.n->children.keys[index] = *path;
n.node.n->children.size++;
/* TODO: this insertion algorithm can be made more efficient */
if (!init(n.node.children[index], n.get_root(), child, get_node_pointer(n), 4)) {
free(child);
return false;
}
/* correct the node pointers */
for (unsigned int i = 0; i < n.node.child_count(); i++) {
n.node.children[i].n = &n.node.n->children.values[i];
fix_parent_pointers(n.node.children[i]);
}
return add(make_child_node(n, n.node.children[index]), pi, alpha + 1, path + 1, length - 1, observation, cache);
}
}
/**
* Adds the given `observation` to the HDP hierarchy and sampler hierarchy
* represented by `h`. The node to which to add the observation is given by the
* `path`, which provides the indices to follow from each node to its child. If
* a node does not exist, it is created, with its concentration parameter
* copied from the appropriate index in hdp::alpha. Thus, `depth` cannot be
* larger than `hdp::depth` of `h`. If `depth == 1`, the observation is added
* to the root node.
*
* This function also assigns the observation to a table according to its
* conditional distribution: holding all other random variables fixed
* (including the other seating assignments), the seating assignment of the
* new observation is sampled.
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V, typename Cache>
inline bool add(hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
const unsigned int* path, unsigned int depth, const K& observation, Cache& cache)
{
return add(make_root_node(h), h.n->pi, h.n->alpha + 1, path, depth, observation, cache);
}
template<typename RootType, typename Observations, typename Cache>
bool remove_from_table(RootType& n,
RootType& root, unsigned int table,
const Observations& observations, Cache& cache)
{
return n.remove_from_table(observations, table, cache);
}
template<typename RootType, typename Observations, typename Cache>
bool remove_from_table(
typename RootType::child_type& n,
RootType& root, unsigned int table,
const Observations& observations, Cache& cache)
{
if (!n.remove_from_table(observations, table, cache))
return false;
/* if the table becomes empty, "delete" it (remove_empty_tables will properly remove it) */
if (n.parent == NULL) {
if (n.descendant_observations[table].counts.size == 0)
decrement_table<true>(root, n.table_assignments[table], cache);
return remove_from_table(root, root, n.table_assignments[table], observations, cache);
} else {
if (n.descendant_observations[table].counts.size == 0)
decrement_table<true>(*n.parent, n.table_assignments[table], cache);
return remove_from_table(*n.parent, root, n.table_assignments[table], observations, cache);
}
}
template<typename NodeType, typename RootType, typename Cache>
bool remove(NodeType& n, RootType& root,
const unsigned int* path, unsigned int length,
const typename RootType::atom_type& observation, Cache& cache)
{
if (length == 0) {
unsigned int index = n.n->observations.index_of(observation);
unsigned int last = (unsigned int) (n.observation_count() - 1);
if (index == n.observation_count()) {
fprintf(stderr, "remove WARNING: Observation does not exist at this node.\n");
return true;
}
unsigned int table = n.observation_assignments[index];
decrement_table<true>(n, table, cache);
cache.on_remove(observation);
remove_from_table(n, root, table, observation, cache);
if (index + 1 < n.observation_count()) {
core::swap(n.n->observations[index], n.n->observations[last]);
core::swap(n.observation_assignments[index], n.observation_assignments[last]);
}
free(n.n->observations.last());
n.n->observations.length--;
return true;
}
unsigned int index = n.n->children.index_of(*path);
if (index == n.n->children.size) {
fprintf(stderr, "remove WARNING: Given path does not exist in HDP tree.\n");
return true;
}
return remove(n.children[index], root, path + 1, length - 1, observation, cache);
}
/**
* Removes the given `observation` from the HDP hierarchy and sampler hierarchy
* represented by `h`. The node from which to remove the observation is given
* by the `path`, which provides the indices to follow from each node to its
* child. If a node does not exist, a warning is printed. `depth` cannot be
* larger than `hdp::depth` of `h`. If `depth == 1`, the observation is removed
* from the root node.
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V, typename Cache>
inline bool remove(hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
const unsigned int* path, unsigned int depth, const K& observation, Cache& cache)
{
return remove(h, h, path, depth, observation, cache);
}
/**
* <!-- Below are functions for moving existing customers to existing or new tables. -->
*/
template<typename RootType, typename Observations, typename Cache>
bool move_to_table(root_position<RootType>& n,
unsigned int src, unsigned int dst,
const Observations& observations, Cache& cache)
{
if (src == dst) return true;
return n.node.move_to_table(observations, src, dst, cache);
}
template<typename RootType, typename Observations, typename Cache>
bool move_to_table(child_position<RootType>& n,
unsigned int src, unsigned int dst,
const Observations& observations, Cache& cache)
{
if (src == dst) return true;
if (!n.node.move_to_table(observations, src, dst, cache))
return false;
/* if the table becomes empty, "delete" it (remove_empty_tables will properly remove it) */
bool success;
if (n.node.parent == NULL) {
auto root = make_root_node(n.root);
if (n.node.descendant_observations[src].counts.size == 0)
decrement_table<true>(n.root, n.node.table_assignments[src], cache);
success = move_to_table(root, n.node.table_assignments[src],
n.node.table_assignments[dst], observations, cache);
} else {
auto parent = make_child_node(n, *n.node.parent);
if (n.node.descendant_observations[src].counts.size == 0)
decrement_table<true>(parent.node, n.node.table_assignments[src], cache);
success = move_to_table(parent, n.node.table_assignments[src],
n.node.table_assignments[dst], observations, cache);
}
#if !defined(NDEBUG)
/* for debugging, check the consistency of the table_sizes data structure */
unsigned int* expected_sizes = (unsigned int*) calloc(n.node.table_count, sizeof(unsigned int));
compute_table_sizes(n.node, expected_sizes, n.node.table_count);
if (!compare_histograms(expected_sizes, n.node.table_sizes, n.node.table_count, n.node.table_count))
fprintf(stderr, "move_to_table WARNING: table_sizes is inconsistent.\n");
free(expected_sizes);
#endif
return success;
}
/* initializes a new table at the existing position 'index' */
template<bool RemoveOldTable, bool Collapsed,
typename RootType, typename Observations, typename Cache>
void replace_with_new_table(
root_position<RootType>& n,
unsigned int old_table, unsigned int index,
const Observations& observations,
unsigned int selected_parent_table,
Cache& cache)
{
if (!Collapsed)
fprintf(stderr, "assign_new_table ERROR: Unsupported operation.\n");
//n.node.pi.sample(n.node.phi[index]);
}
/* initializes a new table at the existing position 'index' */
template<bool RemoveOldTable, bool Collapsed,
typename RootType, typename Observations, typename Cache>
void replace_with_new_table(
child_position<RootType>& n,
unsigned int old_table, unsigned int index,
const Observations& observations,
unsigned int selected_parent_table,
Cache& cache)
{
/* need to draw a new assignment for the parent table */
unsigned int& new_table_assignment = n.node.table_assignments[index];
unsigned int old_parent_table = n.node.table_assignments[old_table];
if (n.node.parent != NULL) {
auto parent = make_child_node(n, *n.node.parent);
if (RemoveOldTable)
decrement_table<true>(parent.node, old_parent_table, cache);
select_table<Collapsed>(parent, observations, old_parent_table,
selected_parent_table, new_table_assignment, cache);
n.node.root_assignments[index] = parent.node.root_assignment(new_table_assignment);
} else {
auto root = make_root_node(n.root);
if (RemoveOldTable)
decrement_table<true>(n.root, old_parent_table, cache);
select_table<Collapsed>(root, observations, old_parent_table,
selected_parent_table, new_table_assignment, cache);
n.node.root_assignments[index] = new_table_assignment;
}
}
template<bool Collapsed, typename RootType, typename Observations, typename Cache>
inline void assign_new_table(
root_position<RootType>& n,
unsigned int src,
const Observations& observations,
unsigned int selected_parent_table,
const Cache& cache)
{
if (!Collapsed)
fprintf(stderr, "assign_new_table ERROR: Unsupported operation.\n");
//n.node.pi.sample(n.node.phi[n.node.table_count - 1]);
}
template<bool Collapsed, typename RootType, typename Observations, typename Cache>
inline void assign_new_table(
child_position<RootType>& n,
unsigned int src,
const Observations& observations,
unsigned int selected_parent_table,
Cache& cache)
{
if (n.node.descendant_observations[src].counts.size != 0)
replace_with_new_table<false, Collapsed>(n, src,
n.node.table_count - 1, observations, selected_parent_table, cache);
else replace_with_new_table<true, Collapsed>(n, src,
n.node.table_count - 1, observations, selected_parent_table, cache);
}
template<bool Collapsed, typename NodeType, typename Observations, typename Cache>
bool move_to_new_table(
NodeType& n, unsigned int src,
const Observations& observations,
unsigned int selected_parent_table,
Cache& cache)
{
/* create a new table, and add the observations to it */
if (!n.node.move_to_new_table(observations, src, cache))
return false;
assign_new_table<Collapsed>(n, src, observations, selected_parent_table, cache);
return true;
}
template<bool Collapsed, typename NodeType, typename Observations, typename Cache>
bool select_table(NodeType& n,
const Observations& observations,
unsigned int old_table,
unsigned int selected_table,
unsigned int& new_table,
Cache& cache)
{
if (selected_table < n.node.table_count) {
/* selected an existing table in this level */
new_table = selected_table;
n.node.table_sizes[selected_table]++;
n.node.customer_count++;
move_to_table(n, old_table, selected_table, observations, cache);
} else {
if (!n.node.new_table()) {
fprintf(stderr, "select_table ERROR: Unable to create table.\n");
return false;
}
new_table = n.node.table_count - 1;
n.node.table_sizes[new_table] = 1;
n.node.customer_count++;
move_to_new_table<Collapsed>(n, old_table, observations,
selected_table - (n.node.table_count - 1), cache);
}
#if !defined(NDEBUG)
/* for debugging, check the consistency of the table_sizes data structure */
unsigned int* expected_sizes = (unsigned int*) calloc(n.node.table_count, sizeof(unsigned int));
compute_table_sizes(n.node, expected_sizes, n.node.table_count);
if (!compare_histograms(expected_sizes, n.node.table_sizes, n.node.table_count, n.node.table_count))
fprintf(stderr, "select_table WARNING: table_sizes is inconsistent.\n");
free(expected_sizes);
#endif
return true;
}
template<bool Collapsed, typename NodeType, typename Observations, typename Cache>
void sample_assignment(NodeType& n,
const Observations& observations,
unsigned int& assignment,
unsigned int old_root_assignment,
Cache& cache)
{
#if !defined(NDEBUG)
if (assignment >= n.node.table_count || n.node.table_sizes[assignment] == 0) {
fprintf(stderr, "sample_assignment ERROR: Invalid table assignment multiset.\n");
return;
}
#endif
typedef typename NodeType::root_type::value_type V;
decrement_table<false>(n.node, assignment, cache);
/* compute likelihood of the observations for each root cluster */
auto& root = n.get_root();
auto root_distribution = cache.compute_root_distribution(root, observations, old_root_assignment);
V new_table_probability = NodeType::root_type::data_distribution_type::log_probability(root.pi(), observations);
sample_result<V> result = sample_table<Collapsed>(n, root_distribution, new_table_probability);
cache.on_sample_table(root, root_distribution, old_root_assignment, observations);
/* select the new table assignment */
if (result.selected_table < n.node.table_count) {
/* we chose a table at this level */
increment_table<false>(n.node, result.selected_table, cache);
unsigned int old_table = assignment;
assignment = result.selected_table;
move_to_table(n, old_table, result.selected_table, observations, cache);
} else {
/* we chose a table at a higher level */
if (n.node.table_sizes[assignment] == 0) {
increment_table<false>(n.node, assignment, cache);
replace_with_new_table<true, Collapsed>(n, assignment, assignment,
observations, result.selected_table - n.node.table_count, cache);
} else {
unsigned int new_table = n.node.table_count;
if (!n.node.new_table()) {
fprintf(stderr, "sample_assignment ERROR: Unable to create table.\n");
return;
}
n.node.table_sizes[new_table] = 1;
unsigned int old_table = assignment;
assignment = new_table;
move_to_new_table<Collapsed>(n, old_table, observations,
result.selected_table - (n.node.table_count - 1), cache);
}
}
cache.on_finish_sampling(root, root_distribution, old_root_assignment);
root_distribution.free();
#if !defined(NDEBUG)
/* for debugging, check the consistency of this data structure */
if (!tables_sorted(n.get_root()))
fprintf(stderr, "sample_assignment WARNING: Table array_multisets are not sorted.\n");
/*if (!check_tables(n.get_root()))
fprintf(stderr, "sample_assignment WARNING: The HDP structure is invalid.\n");
if (!cache.is_valid(root))
fprintf(stderr, "sample_assignment WARNING: Table distribution cache is invalid.\n");*/
unsigned int computed_sum = 0;
for (unsigned int i = 0; i < n.node.table_count; i++)
computed_sum += n.node.table_sizes[i];
if (computed_sum != n.node.customer_count) {
fprintf(stderr, "sample_assignment WARNING: customer_count is incorrect.\n");
n.node.customer_count = computed_sum;
}
#endif
}
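/* Resamples the seating assignments in the subtree rooted at `n`: child
subtrees are visited first, then the assignments of each child's tables at
this node, then the assignments of the observations at this node, all in
random order. Singleton tables are resampled by passing the bare
observation rather than the multiset. */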
template<bool Collapsed, typename NodeType, typename Cache>
void sample_nodes(NodeType& n, unsigned int depth, Cache& cache)
{
/* visit each child node in random order */
unsigned int* child_order = (unsigned int*) alloca(sizeof(unsigned int) * n.node.child_count());
for (unsigned int i = 0; i < n.node.child_count(); i++)
child_order[i] = i;
if (n.node.child_count() > 1) shuffle(child_order, (unsigned int) n.node.child_count());
for (unsigned int i = 0; i < n.node.child_count(); i++) {
/* visit nodes in prefix order, so recurse first */
auto& child = n.node.children[child_order[i]];
auto child_node = make_child_node(n, child);
sample_nodes<Collapsed>(child_node, depth + 1, cache);
/* visit each table in the child node in random order (we use malloc here,
since alloca'd memory inside this loop would not be reclaimed until the
function returns) */
unsigned int* order = (unsigned int*) malloc(sizeof(unsigned int) * child.table_count);
for (unsigned int j = 0; j < child.table_count; j++)
order[j] = j;
if (child.table_count > 1) shuffle(order, child.table_count);
/* sample the assignment for every table at this node */
for (unsigned int j = 0; j < child.table_count; j++) {
unsigned int index = order[j];
if (child.descendant_observations[index].counts.size == 0)
continue;
auto& observations = child.descendant_observations[index];
if (observations.counts.size == 1 && observations.counts.values[0] == 1) {
sample_assignment<Collapsed>(n, observations.counts.keys[0],
child.table_assignments[index], child.root_assignments[index], cache);
} else {
sample_assignment<Collapsed>(n, observations,
child.table_assignments[index], child.root_assignments[index], cache);
}
}
free(order);
}
/* visit each observation in this node in random order */
unsigned int* order = (unsigned int*) malloc(sizeof(unsigned int) * n.node.observation_count());
for (unsigned int i = 0; i < n.node.observation_count(); i++)
order[i] = i;
if (n.node.observation_count() > 1)
shuffle(order, (unsigned int) n.node.observation_count());
/* sample the table assignment for each observation at this node */
for (unsigned int i = 0; i < n.node.observation_count(); i++) {
unsigned int& table_assignment = n.node.observation_assignments[order[i]];
sample_assignment<Collapsed>(
n, n.node.get_observation(order[i]), table_assignment,
n.node.root_assignment(table_assignment), cache);
}
free(order);
}
/**
* Runs a single Gibbs iteration on the sampler node `h` and all descendant
* sampler nodes. This function assumes prepare_sampler was called beforehand.
* prepare_sampler does not need to be called again before the next iteration.
* \tparam Collapsed if `true`, the collapsed Gibbs sampler is used, where the
* per-table draws phi from the base distribution at the root are integrated
* out. If `false`, the uncollapsed Gibbs sampler is used. However, at this
* time, only collapsed Gibbs sampling is supported.
*/
template<bool Collapsed, typename BaseDistribution,
typename DataDistribution, typename K, typename V>
inline void sample_hdp(
hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
cache<BaseDistribution, DataDistribution, K, V>& cache)
{
auto root = make_root_node(h);
sample_nodes<Collapsed>(root, 1, cache);
cache.on_finished_sampling_hdp();
remove_empty_tables(h, cache);
recompute_root_assignments(h);
}
/**
* <!-- Auxiliary variable sampler for HDP hyperparameters.
* (Teh et al. 2006 appendix A, Escobar and West 1995 section 6) -->
*/
template<typename K, typename V>
inline void set_alpha(node_sampler<K, V>& n, const V* alpha) {
n.n->alpha = *alpha;
n.n->log_alpha = log(*alpha);
for (unsigned int i = 0; i < n.child_count(); i++)
set_alpha(n.children[i], alpha + 1);
}
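/* Accumulates the auxiliary variables of the concentration parameter sampler
(Teh et al. 2006, appendix A): for a node with n customers and t tables,
s ~ Bernoulli(n / (n + alpha)) and w ~ Beta(alpha + 1, n); `s_sum`
accumulates t - s and `log_w_sum` accumulates log w. */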
template<typename NodeType>
inline void compute_auxiliary(const NodeType& n,
unsigned int& s_sum, typename NodeType::value_type& log_w_sum)
{
if (n.customer_count > 0) {
s_sum += n.table_count - sample_bernoulli(n.customer_count / (n.customer_count + n.alpha()));
log_w_sum += log(sample_beta(n.alpha() + 1, (typename NodeType::value_type) n.customer_count));
}
}
template<typename K, typename V>
inline void sample_alpha_each_node(
node_sampler<K, V>& n,
const V* a, const V* b)
{
unsigned int s_sum = 0; V log_w_sum = 0.0;
compute_auxiliary(n, s_sum, log_w_sum);
n.n->alpha = sample_gamma(*a + s_sum, *b - log_w_sum);
n.n->log_alpha = log(n.alpha());
for (unsigned int i = 0; i < n.child_count(); i++)
sample_alpha_each_node(n.children[i], a + 1, b + 1);
}
/**
* This function implements the auxiliary variable sampler for inferring the
* HDP concentration parameters. This function assumes that hdp::alpha is
* distributed according to a Gamma distribution with parameters `a[level]` and
* `b[level]`.
*
* This function samples a distinct alpha at every node in the hierarchy. The
* function sample_alpha_each_level, on the other hand, samples a distinct
* alpha at every *level* in the hierarchy (it assumes that hdp::alpha is the
* same across all nodes in the same level).
*
* \param a an array of shape parameters for the Gamma prior on hdp::alpha,
* with length hdp::depth, one element for every level in the hierarchy.
* \param b an array of rate parameters for the Gamma prior on hdp::alpha, with
* length hdp::depth, one element for every level in the hierarchy.
* \see [Teh et al. 2006 appendix A](https://people.eecs.berkeley.edu/~jordan/papers/hdp.pdf),
* [Escobar and West 1995 section 6](http://users.wpi.edu/~balnan/Escobar-West-1995.pdf).
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
inline void sample_alpha_each_node(
hdp_sampler<BaseDistribution, DataDistribution, K, V>& n,
const V* a, const V* b)
{
unsigned int s_sum = 0; V log_w_sum = 0.0;
compute_auxiliary(n, s_sum, log_w_sum);
n.n->alpha[0] = sample_gamma(*a + s_sum, *b - log_w_sum);
n.n->log_alpha = log(n.alpha());
for (unsigned int i = 0; i < n.child_count(); i++)
sample_alpha_each_node(n.children[i], a + 1, b + 1);
}
template<typename NodeType>
inline void sample_alpha(NodeType& n,
unsigned int* s_sum,
typename NodeType::value_type* log_w_sum)
{
compute_auxiliary(n, *s_sum, *log_w_sum);
for (unsigned int i = 0; i < n.child_count(); i++)
sample_alpha(n.children[i], s_sum + 1, log_w_sum + 1);
}
/**
* This function implements the auxiliary variable sampler for inferring the
* HDP concentration parameters. This function assumes that hdp::alpha is
* distributed according to a Gamma distribution with parameters `a[level]` and
* `b[level]`.
*
* This function samples a distinct alpha at every *level* in the hierarchy (it
* assumes that hdp::alpha is the same across all nodes in the same level). The
* function sample_alpha_each_node, on the other hand, does not make this
* assumption and samples a distinct hdp::alpha at every node in the hierarchy.
*
* \param a an array of shape parameters for the Gamma prior on hdp::alpha,
* with length hdp::depth, one element for every level in the hierarchy.
* \param b an array of rate parameters for the Gamma prior on hdp::alpha, with
* length hdp::depth, one element for every level in the hierarchy.
* \see [Teh et al. 2006 appendix A](https://people.eecs.berkeley.edu/~jordan/papers/hdp.pdf),
* [Escobar and West 1995 section 6](http://users.wpi.edu/~balnan/Escobar-West-1995.pdf).
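*
* A minimal usage sketch, assuming `sampler` and `cache` were prepared with
* prepare_sampler, and with hypothetical Gamma(1, 0.1) priors on alpha at
* both levels of a depth-2 hierarchy:
* ```{.cpp}
* double a[] = { 1.0, 1.0 };
* double b[] = { 0.1, 0.1 };
* for (unsigned int i = 0; i < 1000; i++) {
* sample_hdp<true>(sampler, cache);
* sample_alpha_each_level(sampler, a, b);
* }
* ```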
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
inline bool sample_alpha_each_level(
hdp_sampler<BaseDistribution, DataDistribution, K, V>& n,
const V* a, const V* b)
{
/* use malloc rather than alloca, since alloca does not report allocation failure */
unsigned int* s_sum = (unsigned int*) malloc(sizeof(unsigned int) * n.n->depth);
V* log_w_sum = (V*) malloc(sizeof(V) * n.n->depth);
if (s_sum == NULL || log_w_sum == NULL) {
fprintf(stderr, "sample_alpha_each_level ERROR: Out of memory.\n");
if (s_sum != NULL) free(s_sum);
if (log_w_sum != NULL) free(log_w_sum);
return false;
}
for (unsigned int i = 0; i < n.n->depth; i++) {
s_sum[i] = 0;
log_w_sum[i] = 0.0;
}
sample_alpha(n, s_sum, log_w_sum);
for (unsigned int i = 0; i < n.n->depth; i++) {
n.n->alpha[i] = sample_gamma(a[i] + s_sum[i], b[i] - log_w_sum[i]);
if (n.n->alpha[i] == 0.0)
n.n->alpha[i] = 1.0e-16; /* guard against zero, since log(alpha) below would be -inf */
}
free(s_sum); free(log_w_sum);
n.n->log_alpha = log(n.alpha());
for (unsigned int i = 0; i < n.child_count(); i++)
set_alpha(n.children[i], n.n->alpha + 1);
return true;
}
/**
* Returns the joint probability of the concentration parameters hdp::alpha and
* node::alpha under the assumption that alpha is distributed according to a
* Gamma distribution with parameters `a[level]` and `b[level]`. This function
* also assumes that alpha is the same across all nodes in the same level.
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
inline V log_probability_each_level(
const hdp_sampler<BaseDistribution, DataDistribution, K, V>& n,
const V* a, const V* b)
{
V score = 0.0;
for (unsigned int i = 0; i < n.n->depth; i++)
score += log_probability_gamma(n.n->alpha[i], a[i], b[i]);
return score;
}
/**
* <!-- Logic for computing the probability of a path given an observation. -->
*/
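/* A state in the best-first search over paths in the hierarchy: `path` is the
partial path leading to node `n` at depth `level`, `probabilities` holds one
value per posterior sample (the probability of the observation along the
path so far, used as the prior at the next level), and `maximum` orders the
states in the priority queue. */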
template<typename K, typename V>
struct hdp_search_state {
unsigned int* path;
V* probabilities;
V maximum;
const node_sampler<K, V>* n;
unsigned int level;
static inline void free(hdp_search_state<K, V>& state) {
core::free(state.path);
core::free(state.probabilities);
}
};
template<typename K, typename V>
inline bool operator < (const hdp_search_state<K, V>& first, const hdp_search_state<K, V>& second)
{
return first.maximum < second.maximum;
}
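/* Computes the predictive probability of an observation at a node for a
single posterior sample: the table sizes weight the root-level
probabilities of their root assignments, the prior is weighted by alpha,
and the total is normalized by (customer_count + alpha). */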
template<typename V, typename NodeSample>
inline V compute_probability(const NodeSample& sample,
const V* root_probabilities, const V& prior, const V& alpha)
{
V probability = 0.0;
for (unsigned int i = 0; i < sample.table_count; i++)
probability += sample.table_sizes[i] * root_probabilities[sample.root_assignments[i]];
probability += alpha * prior;
return probability / (sample.customer_count + alpha);
}
template<typename V, typename RootSample>
inline V compute_root_probability(const RootSample& sample,
const V* root_probabilities, const V& prior, const V& alpha)
{
V probability = 0.0;
for (unsigned int i = 0; i < sample.table_count; i++)
probability += sample.table_sizes[i] * root_probabilities[i];
probability += alpha * prior;
return probability / (sample.customer_count + alpha);
}
template<typename V, typename NodeSample>
inline V compute_probability(const NodeSample& sample,
const array<unsigned int>& root_tables,
const V& prior, const V& alpha)
{
V probability = 0.0;
for (unsigned int i = 0; i < sample.table_count; i++) {
if (root_tables.contains(sample.root_assignments[i]))
probability += sample.table_sizes[i];
}
probability += alpha * prior;
return probability / (sample.customer_count + alpha);
}
template<typename V, typename RootSample>
inline V compute_root_probability(const RootSample& sample,
const array<unsigned int>& root_tables,
const V& prior, const V& alpha)
{
V probability = 0.0;
for (unsigned int table : root_tables)
probability += sample.table_sizes[table];
probability += alpha * prior;
return probability / (sample.customer_count + alpha);
}
template<typename V>
inline V sum(const V* values, unsigned int length) {
V sum = 0.0;
for (unsigned int i = 0; i < length; i++)
sum += values[i];
return sum;
}
template<typename NodeType, typename V, typename FeatureSet>
inline void complete_path(
const NodeType& node, const V* probabilities, unsigned int label,
unsigned int sample_count, const unsigned int* path,
const unsigned int* const* excluded, const unsigned int* excluded_counts,
unsigned int level, unsigned int depth, array<FeatureSet>& x)
{
unsigned int max_excluded_count = 0;
if (label == IMPLICIT_NODE)
max_excluded_count = node.child_count() + excluded_counts[level];
V probability = sum(probabilities, sample_count) / sample_count;
if (probability == 0.0) {
return;
} else if (!x.ensure_capacity(x.length + 1)) {
fprintf(stderr, "complete_path ERROR: Unable to expand x array.\n");
return;
} else if (!init(x[(unsigned int) x.length], depth - 1)) {
fprintf(stderr, "complete_path ERROR: Unable to initialize feature sequence.\n");
return;
}
FeatureSet& completed = x[(unsigned int) x.length];
x.length++;
for (unsigned int i = 0; i < depth - 1; i++) {
if (i == level) {
completed.set_feature(i, label);
} else {
completed.set_feature(i, path[i]);
if (path[i] == IMPLICIT_NODE && excluded != NULL
&& !completed.set_excluded(i, excluded[i], excluded_counts[i])) {
free(completed);
return;
}
}
}
completed.set_probability(probability);
/* compute the union of the set of child node ids and the excluded set */
if (max_excluded_count == 0) return;
completed.ensure_excluded_capacity(level, max_excluded_count);
auto do_union = [&](unsigned int child_id, unsigned int i, unsigned int j) {
completed.exclude_unsorted(level, child_id);
};
set_union(do_union, do_union, do_union,
node.n->children.keys, node.child_count(),
excluded[level], excluded_counts[level]);
completed.sort_excluded(level);
}
template<typename K, typename V, typename RootDistributionType, typename FeatureSet>
inline void complete_path(const node_sampler<K, V>& leaf,
const RootDistributionType& root_probabilities, const unsigned int* path,
const V* probabilities, unsigned int new_key,
unsigned int level, unsigned int depth, array<FeatureSet>& x)
{
#if !defined(NDEBUG)
if (level + 1 != depth - 1)
fprintf(stderr, "complete_path WARNING: This function should only be called at the leaves.\n");
#endif
V probability = 0.0;
for (unsigned int i = 0; i < leaf.posterior.length; i++) {
probability += compute_probability(leaf.posterior[i],
root_probabilities[i], probabilities[i], leaf.alpha());
}
probability /= leaf.posterior.length;
if (probability == 0.0) {
return;
} else if (!x.ensure_capacity(x.length + 1)) {
fprintf(stderr, "complete_path ERROR: Unable to expand x array.\n");
return;
} else if (!init(x[(unsigned int) x.length], depth - 1)) {
fprintf(stderr, "complete_path ERROR: Unable to initialize feature sequence.\n");
return;
}
FeatureSet& completed = x[(unsigned int) x.length];
x.length++;
for (unsigned int i = 0; i < level; i++)
completed.set_feature(i, path[i]);
completed.set_feature(level, new_key);
completed.set_probability(probability);
}
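/* Computes an optimistic (upper-bound) score for the subtree below `n`: for
 * each posterior sample, the score is the larger of the parent-level term
 * `probabilities[i]` and the largest root-table probability among the
 * occupied tables of that sample, and these per-sample scores are summed.
 * `predict` uses this score to order its best-first search queue. */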
template<typename NodeType, typename V>
inline V compute_maximum(const NodeType& n,
const V* const* root_probabilities, const V* probabilities)
{
V maximum = 0.0;
for (unsigned int i = 0; i < n.posterior.length; i++) {
V maximum_i = probabilities[i];
const V* root = root_probabilities[i];
auto& sample = n.posterior[i];
for (unsigned int j = 0; j < sample.table_count; j++)
if (sample.table_sizes[j] > 0 && root[sample.root_assignment(j)] > maximum_i)
maximum_i = root[sample.root_assignment(j)];
maximum += maximum_i;
}
return maximum;
}
template<typename NodeType, typename V>
inline V compute_maximum(const NodeType& n,
const array<unsigned int>* root_tables,
const V* probabilities)
{
V maximum = 0.0;
for (unsigned int i = 0; i < n.posterior.length; i++) {
V maximum_i = probabilities[i];
const array<unsigned int>& root = root_tables[i];
auto& sample = n.posterior[i];
for (unsigned int j = 0; j < sample.table_count; j++) {
if (sample.table_sizes[j] > 0 && root.contains(sample.root_assignment(j))) {
maximum_i = 1.0;
break;
}
}
maximum += maximum_i;
}
return maximum;
}
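/* Like the overloads above, but restricted to the children of `n` whose keys
 * do not appear in the sorted `excluded` array: the returned score is the
 * maximum of the per-child scores over the non-excluded children. */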
template<typename NodeType, typename RootDistributionType, typename V>
inline V compute_maximum(const NodeType& n,
const RootDistributionType& root_probabilities, const V* probabilities,
const unsigned int* excluded, unsigned int excluded_count)
{
if (excluded_count == 0)
return compute_maximum(n, root_probabilities, probabilities);
V maximum = 0.0;
unsigned int i = 0, j = 0;
while (i < excluded_count && j < n.children.size) {
if (excluded[i] == n.children.keys[j]) {
/* this child is excluded */
i++; j++;
} else if (excluded[i] < n.children.keys[j]) {
i++;
} else {
V new_maximum = compute_maximum(n.children.values[j], root_probabilities, probabilities);
if (new_maximum > maximum) maximum = new_maximum;
j++;
}
}
while (j < n.children.size) {
V new_maximum = compute_maximum(n.children.values[j], root_probabilities, probabilities);
if (new_maximum > maximum) maximum = new_maximum;
j++;
}
return maximum;
}
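/* Pushes a new search state for `child` onto the priority queue or, at the
 * deepest level, completes the path directly. The state stores the extended
 * path, the per-sample predictive probabilities at `child`, and the
 * optimistic score from `compute_maximum`. The caller is expected to have
 * ensured sufficient capacity in `queue`. */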
template<typename K, typename V, typename RootDistributionType, typename FeatureSet>
void push_node_state(const node_sampler<K, V>& child,
const RootDistributionType& root_probabilities,
const unsigned int* path, const V* probabilities,
unsigned int new_key, unsigned int level, unsigned int depth,
array<hdp_search_state<K, V>>& queue, array<FeatureSet>& x)
{
if (level + 1 == depth - 1) {
/* this is the last level */
complete_path(child, root_probabilities, path,
probabilities, new_key, level, depth, x);
return;
}
#if !defined(NDEBUG)
/* we expect the caller to ensure there is enough capacity */
if (queue.length == queue.capacity) {
fprintf(stderr, "push_node_state ERROR: No capacity in the queue.\n");
return;
}
#endif
hdp_search_state<K, V>& state = queue[(unsigned int) queue.length];
state.probabilities = (V*) malloc(sizeof(V) * child.posterior.length);
if (state.probabilities == NULL) {
fprintf(stderr, "push_node_state ERROR: Out of memory.\n");
return;
}
state.path = (unsigned int*) malloc((depth - 1) * sizeof(unsigned int));
if (state.path == NULL) {
fprintf(stderr, "push_node_state ERROR: Out of memory.\n");
free(state.probabilities);
return;
}
for (unsigned int i = 0; i < level; i++)
state.path[i] = path[i];
state.path[level] = new_key;
for (unsigned int i = level + 1; i + 1 < depth; i++)
state.path[i] = path[i];
state.level = level + 1;
state.n = &child;
for (unsigned int i = 0; i < child.posterior.length; i++) {
state.probabilities[i] = compute_probability(child.posterior[i],
root_probabilities[i], probabilities[i], child.alpha());
}
state.maximum = compute_maximum(child, root_probabilities, state.probabilities);
queue.length++;
std::push_heap(queue.data, queue.data + queue.length);
}
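/* Expands a single search state at node `n`. If `path[level]` is
 * IMPLICIT_NODE, the path for descendants not explicitly stored in the tree
 * is completed, and every explicit child not in the excluded set is pushed
 * onto the queue. If `path[level]` is an explicit key, the corresponding
 * child is pushed if it exists; otherwise the path is completed using the
 * prior probabilities at `n`. UNION_NODE is not yet supported. */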
template<typename NodeType, typename RootDistributionType, typename K, typename V, typename FeatureSet>
void process_search_state(const NodeType& n,
const RootDistributionType& root_probabilities, const unsigned int* path,
const unsigned int* const* excluded, const unsigned int* excluded_count,
const V* probabilities, unsigned int level, unsigned int depth,
array<hdp_search_state<K, V>>& queue, array<FeatureSet>& x)
{
bool contains;
if (path[level] == IMPLICIT_NODE) {
/* compute the score for a descendant that is not explicitly stored in the tree */
complete_path(n, probabilities, IMPLICIT_NODE,
(unsigned int) n.posterior.length, path, excluded, excluded_count, level, depth, x);
if (!queue.ensure_capacity(queue.length + n.child_count())) {
fprintf(stderr, "process_search_state ERROR: Unable to expand search queue.\n");
return;
}
/* consider all child nodes not in the 'excluded' set */
unsigned int i = 0, j = 0;
while (i < n.child_count() && j < excluded_count[level]) {
unsigned int child_id = n.child_key(i);
if (child_id == excluded[level][j]) {
i++; j++; continue;
} else if (child_id < excluded[level][j]) {
push_node_state(n.children[i], root_probabilities,
path, probabilities, n.child_key(i), level, depth, queue, x);
i++;
} else {
j++;
}
}
while (i < n.child_count()) {
push_node_state(n.children[i], root_probabilities, path,
probabilities, n.child_key(i), level, depth, queue, x);
i++;
}
} else if (path[level] == UNION_NODE) {
fprintf(stderr, "process_search_state ERROR: Support for UNION_NODE is unimplemented.");
exit(EXIT_FAILURE);
/* TODO: implement this; it should look something like below */
/*for (unsigned int i = 0; i < excluded_count; i++) {
auto& child = n.get_child(excluded[i], contains);
if (contains) {
push_node_state(child, root_probabilities, path,
probabilities, excluded[i], level, depth, queue, x);
} else {
complete_path(n, probabilities, excluded[i],
(unsigned int) n.posterior.length, path, NULL, 0, level, depth, x);
}
}*/
} else {
auto& child = n.get_child(path[level], contains);
if (contains) {
push_node_state(child, root_probabilities, path,
probabilities, path[level], level, depth, queue, x);
} else {
complete_path(n, probabilities, path[level],
(unsigned int) n.posterior.length, path, NULL, 0, level, depth, x);
}
}
}
/**
* Returns a copy of the root probabilities from `src`, which this function
* assumes were computed from the sampler hierarchy rooted at `h` using the
* function `V** compute_root_probabilities(const hdp_sampler<BaseDistribution, DataDistribution, K, V>&, const K&)`
* in ::cache.
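 *
 * A minimal usage sketch, reusing the `sampler` and `cache` from the code
 * example at the top of this file, and assuming the specialization of ::cache
 * whose `compute_root_probabilities` returns `V**` (all other names here are
 * illustrative):
 * ```{.cpp}
 * double** src = cache.compute_root_probabilities(sampler, observation);
 * double** copied = copy_root_probabilities(sampler, src);
 * if (copied != NULL) {
 *     // ... use `copied` independently of `src` ...
 *     cleanup_root_probabilities(copied, sampler.posterior.length);
 * }
 * cleanup_root_probabilities(src, sampler.posterior.length);
 * ```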
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
V** copy_root_probabilities(const hdp_sampler<BaseDistribution, DataDistribution, K, V>& h, const V* const* src)
{
V** root_probabilities = (V**) malloc(sizeof(V*) * h.posterior.length);
if (root_probabilities == NULL) {
fprintf(stderr, "copy_root_probabilities ERROR: Out of memory.\n");
return NULL;
}
for (unsigned int i = 0; i < h.posterior.length; i++) {
root_probabilities[i] = (V*) malloc(sizeof(V) * h.posterior[i].table_count);
if (root_probabilities[i] == NULL) {
fprintf(stderr, "predict ERROR: Insufficient memory for root_probabilities[%u].\n", i);
cleanup_root_probabilities(root_probabilities, i);
return NULL;
}
memcpy(root_probabilities[i], src[i], sizeof(V) * h.posterior[i].table_count);
}
return root_probabilities;
}
/**
* Returns a copy of the root probabilities from `src`, which this function
* assumes were computed from the sampler hierarchy rooted at `h` using the
* function `V** compute_root_probabilities(const hdp_sampler<BaseDistribution, DataDistribution, K, V>&, const K*, unsigned int)`
* in ::cache.
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
V** copy_root_probabilities(
const hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
const V* const* src, unsigned int observation_count)
{
V** root_probabilities = (V**) malloc(sizeof(V*) * observation_count);
if (root_probabilities == NULL) {
fprintf(stderr, "copy_root_probabilities ERROR: Out of memory.\n");
return NULL;
}
for (unsigned int i = 0; i < observation_count; i++) {
root_probabilities[i] = (V*) malloc(sizeof(V) * h.table_count);
if (root_probabilities[i] == NULL) {
fprintf(stderr, "predict ERROR: Insufficient memory for root_probabilities[%u].\n", i);
cleanup_root_probabilities(root_probabilities, i);
return NULL;
}
memcpy(root_probabilities[i], src[i], sizeof(V) * h.table_count);
}
return root_probabilities;
}
/**
* Returns a copy of the root probabilities from `src`, which this function
* assumes were computed from the sampler hierarchy rooted at `h` using the
* function `array<unsigned int>* compute_root_probabilities(const hdp_sampler<BaseDistribution, DataDistribution, K, V>&, const K*, unsigned int)`
* in the specialization of ::cache.
*/
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
array<unsigned int>* copy_root_probabilities(
const hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
const array<unsigned int>* src, unsigned int observation_count)
{
array<unsigned int>* root_probabilities = (array<unsigned int>*)
malloc(sizeof(array<unsigned int>) * observation_count);
if (root_probabilities == NULL) {
fprintf(stderr, "copy_root_probabilities ERROR: Out of memory.\n");
return NULL;
	}
	for (unsigned int i = 0; i < observation_count; i++) {
		if (!array_init(root_probabilities[i], max((unsigned int) src[i].length, 1u))) {
			fprintf(stderr, "copy_root_probabilities ERROR: Out of memory.\n");
			cleanup_root_probabilities(root_probabilities, i);
			return NULL;
}
memcpy(root_probabilities[i].data, src[i].data, sizeof(unsigned int) * src[i].length);
root_probabilities[i].length = src[i].length;
}
return root_probabilities;
}
template<typename BaseDistribution, typename DataDistribution, typename K, typename V>
unsigned int* compute_root_counts(const hdp_sampler<BaseDistribution, DataDistribution, K, V>& h)
{
/* initialize and store the list of root table counts */
unsigned int* root_table_counts = (unsigned int*) malloc(sizeof(unsigned int) * h.posterior.length);
if (root_table_counts == NULL) {
fprintf(stderr, "predict ERROR: Insufficient memory for root_table_counts.\n");
return NULL;
}
for (unsigned int i = 0; i < h.posterior.length; i++)
root_table_counts[i] = h.posterior[i].table_count;
return root_table_counts;
}
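/* Computes an upper bound on the posterior predictive probability of
 * `observation` at the root, averaged over posterior samples, for a Dirichlet
 * base distribution with a dense categorical likelihood: for each sample, the
 * bound is the largest posterior mean (pi + count) / (pi_sum + total) over
 * the root tables whose seatings include the observation, or the prior mean
 * pi / pi_sum if no such table exists. */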
template<typename BaseDistribution, typename DataDistribution, typename K, typename V,
typename std::enable_if<is_dirichlet<BaseDistribution>::value>::type* = nullptr>
V max_root_probability(
const hdp<BaseDistribution, dense_categorical<V>, K, V>& h,
const K& observation, cache<BaseDistribution, dense_categorical<V>, K, V>* caches)
{
V pi = h.pi.get_for_atom(observation);
if (h.posterior.length == 0)
return pi / h.pi.sum();
V maximum = 0.0;
for (unsigned int i = 0; i < h.posterior.length; i++) {
V maximum_i = pi / h.pi.sum();
/* TODO: test numerical stability */
bool contains;
array_multiset<unsigned int>& table_counts = caches[i].get_table_counts(observation, contains);
if (contains) {
for (unsigned int j = 0; j < table_counts.counts.size; j++) {
unsigned int table = table_counts.counts.keys[j];
V value = (pi + table_counts.counts.values[j])
/ (h.pi.sum() + h.posterior[i].descendant_observations[table].total());
if (value > maximum_i)
maximum_i = value;
}
}
maximum += maximum_i;
}
return maximum / h.posterior.length;
}
/**
* Computes the posterior predictive probability of `observation` being drawn
* from the nodes characterized by the path set. The path set is specified by
* the variables `path`, `excluded`, and `excluded_counts`. Whenever an element
* of `path` is ::IMPLICIT_NODE, this indicates that any child node at that
 * level lies within the path set. Exclusions can be specified via
 * `excluded[i]`, a native array containing a *sorted* list of keys to exclude
 * from the path set at level `i`; its length is given by
 * `excluded_counts[i]`.
*
* This function uses the posterior samples stored in hdp_sampler::posterior
* and node_sampler::posterior to perform its computations.
*
* The results are stored in the array `x` of `FeatureSet` objects.
*
* \tparam FeatureSet a "weighted feature set" type that can be initialized
* with the function `bool init(FeatureSet&, unsigned int)` where the
* unsigned int indicates the length of the feature set, and the following
* public member functions are defined:
* `set_feature(unsigned int, unsigned int)`, `set_probability(double)`,
* `ensure_excluded_capacity(unsigned int, unsigned int)`,
* `set_excluded(unsigned int, const unsigned int*, unsigned int)`,
* `exclude_unsorted(unsigned int, unsigned int)`, and
 * `sort_excluded(unsigned int)`. weighted_feature_set is an example of a
 * type that satisfies these requirements.
* \tparam RootDistributionType the return type of the function
* `compute_root_probabilities(const hdp_sampler<BaseDistribution, DataDistribution, K, V>&, const K&)`
* in ::cache. Note that depending on BaseDistribution and
* DataDistribution, the return type may vary, since a different
* specialization of ::cache may be used.
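 *
 * The following is a minimal sketch of a call to this overload, reusing the
 * `sampler` and `cache` from the code example at the top of this file (so
 * `depth = 2` and each path has a single level); all other names are
 * illustrative. The path set here contains every child of the root except
 * the child with key 2:
 * ```{.cpp}
 * unsigned int path[] = { IMPLICIT_NODE };
 * unsigned int excluded_level_0[] = { 2 }; // must be sorted
 * const unsigned int* excluded[] = { excluded_level_0 };
 * unsigned int excluded_counts[] = { 1 };
 *
 * array<weighted_feature_set<double>> paths(8);
 * auto root_probabilities = cache.compute_root_probabilities(sampler, observation);
 * predict(sampler, observation, path, excluded, excluded_counts, paths, root_probabilities);
 * cleanup_root_probabilities(root_probabilities, sampler.posterior.length);
 * for (unsigned int i = 0; i < paths.length; i++)
 *     free(paths[i]);
 * ```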
*/
template<typename BaseDistribution, typename DataDistribution,
typename K, typename V, typename FeatureSet, typename RootDistributionType>
void predict(
const hdp_sampler<BaseDistribution, DataDistribution, K, V>& h,
const K& observation, const unsigned int* path,
const unsigned int* const* excluded, const unsigned int* excluded_counts,
array<FeatureSet>& x, const RootDistributionType& root_probabilities)
{
#if !defined(NDEBUG)
if (x.length > 0) {
fprintf(stderr, "predict WARNING: Provided 'x' is non-empty.\n");
for (unsigned int i = 0; i < x.length; i++)
free(x[i]);
x.clear();
}
#endif
V pi = DataDistribution::probability(h.pi(), observation);
if (h.posterior.length == 0) {
if (!x.ensure_capacity(1) || !init(x[0], h.n->depth - 1))
return;
x.length = 1;
x[0].set_probability(pi);
for (unsigned int i = 0; i + 1 < h.n->depth; i++) {
x[0].set_feature(i, path[i]);
if (path[i] == IMPLICIT_NODE)
x[0].set_excluded(i, excluded[i], excluded_counts[i]);
}
return;
}
V* probabilities = (V*) malloc(sizeof(V) * h.posterior.length);
if (probabilities == NULL) {
fprintf(stderr, "predict ERROR: Insufficient memory for probabilities vector.\n");
return;
}
for (unsigned int i = 0; i < h.posterior.length; i++)
probabilities[i] = compute_root_probability(h.posterior[i], root_probabilities[i], pi, h.alpha());
/* TODO: initial array size can be set more intelligently (depending on termination condition) */
array<hdp_search_state<K, V>> queue = array<hdp_search_state<K, V>>(1024);
process_search_state(h, root_probabilities, path,
excluded, excluded_counts, probabilities, 0, h.n->depth, queue, x);
free(probabilities);
while (queue.length > 0) {
std::pop_heap(queue.data, queue.data + queue.length);
hdp_search_state<K, V> state = queue.last();
queue.length--;
process_search_state(*state.n, root_probabilities, state.path,
excluded, excluded_counts, state.probabilities, state.level, h.n->depth, queue, x);
free(state);
}
	/* the search loop above runs until the queue is empty, so this cleanup
	   only matters if an early termination condition is added later */
	for (unsigned int i = 0; i < queue.length; i++)
		free(queue[i]);
}
/**
* Computes the posterior predictive probability of `observation` being drawn
* from the nodes characterized by the path set. The path set is specified by
* the variables `path`, `excluded`, and `excluded_counts`. Whenever an element
* of `path` is ::IMPLICIT_NODE, this indicates that any child node at that
* level lies within the path set.
*
* This function uses the posterior samples stored in hdp_sampler::posterior
* and node_sampler::posterior to perform its computations.
*
* The results are stored in the array `x` of `FeatureSet` objects.
*
* \tparam FeatureSet a "weighted feature set" type that can be initialized
* with the function `bool init(FeatureSet&, unsigned int)` where the
* unsigned int indicates the length of the feature set, and the following
* public member functions are defined:
* `set_feature(unsigned int, unsigned int)`, `set_probability(double)`,
* `ensure_excluded_capacity(unsigned int, unsigned int)`,
* `set_excluded(unsigned int, const unsigned int*, unsigned int)`,
* `exclude_unsorted(unsigned int, unsigned int)`, and
 * `sort_excluded(unsigned int)`. weighted_feature_set is an example of a
 * type that satisfies these requirements.
* \tparam RootDistributionType the return type of the function
* `compute_root_probabilities(const hdp_sampler<BaseDistribution, DataDistribution, K, V>&, const K&)`
* in ::cache. Note that depending on BaseDistribution and
* DataDistribution, the return type may vary, since a different
* specialization of ::cache may be used.
*/