Skip to content

Commit

Permalink
Update more docmentation
Browse files Browse the repository at this point in the history
  • Loading branch information
Ben committed Jun 10, 2015
1 parent b85d0f1 commit b805d2a
Show file tree
Hide file tree
Showing 11 changed files with 123 additions and 119 deletions.
5 changes: 1 addition & 4 deletions apop.m4.h
Expand Up @@ -482,9 +482,6 @@ apop_model * apop_model_fix_params_get_base(apop_model *model_in);






//////vtables
/** \cond doxy_ignore */

Expand Down Expand Up @@ -582,7 +579,7 @@ apop_data * apop_histograms_test_goodness_of_fit(apop_model *h0, apop_model *h1)
apop_data * apop_test_kolmogorov(apop_model *m1, apop_model *m2);
apop_data *apop_data_pmf_compress(apop_data *in);
Apop_var_declare( apop_data * apop_data_to_bins(apop_data const *indata, apop_data const *binspec, int bin_count, char close_top_bin) )
Apop_var_declare( apop_model * apop_model_to_pmf(apop_model *model, apop_data *binspec, long int draws, int bin_count, gsl_rng *rng) )
Apop_var_declare( apop_model * apop_model_to_pmf(apop_model *model, apop_data *binspec, long int draws, int bin_count) )

//text conveniences
Apop_var_declare( char* apop_text_paste(apop_data const*strings, char *between, char *before, char *after, char *between_cols, int (*prune)(apop_data* ! int ! int ! void*), void* prune_parameter) )
Expand Down
6 changes: 3 additions & 3 deletions apop_asst.m4.c
Expand Up @@ -389,15 +389,15 @@ gsl_rng *apop_rng_get_thread_base(int thread){
\exception out->error=='d' Trouble drawing from the distribution for at least one row. That row is set to all \c NAN.
\li Prints a warning if you send in a non-<tt>NULL apop_data</tt> set, but its \c matrix element is \c NULL, when <tt>apop_opts.verbose>=1</tt>.
\li See also \ref apop_draw, which makes a single draw.
\li Random numbers are generated using RNGs from \ref apop_rng_get_thread, qv.
Here is a two-line program to draw a different set of ten Standard Normals on every run (provided runs are more than a second apart):
\include draw_some_normals.c
*/
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD apop_data *apop_model_draws(apop_model *model, int count, apop_data *draws){
apop_model * apop_varad_var(model, NULL);
Apop_stopif(!model, apop_return_data_error(n), 0, "Input model is NULL.");
Expand Down
13 changes: 5 additions & 8 deletions apop_hist.m4.c
Expand Up @@ -13,32 +13,29 @@ Copyright (c) 2006--2007, 2010, 2013 by Ben Klemens. Licensed under the GPLv2;
/** Make random draws from an \ref apop_model, and bin them using a binspec in the style
of \ref apop_data_to_bins. If you have a data set that used the same binspec, you now have synced histograms, which you can plot or sensibly test hypotheses about.
The output is normalized to integrate to one.
\param binspec A description of the bins in which to place the draws; see \ref apop_data_to_bins. (default: as in \ref apop_data_to_bins.)
\param model The model to be drawn from. Because this function works via random draws, the model needs to have a
\c draw method. (No default)
\param draws The number of random draws to make. (arbitrary default = 10,000)
\param bin_count If no bin spec, the number of bins to use (default: as per \ref apop_data_to_bins, \f$\sqrt(N)\f$)
\param rng The \c gsl_rng used to make random draws. (default: an RNG from \ref apop_rng_get_thread)
\return An \ref apop_pmf model.
\return An \ref apop_pmf model, with a new binned data set attached (which you may
have to <tt>apop_data_free(output_model->data)</tt> to prevent memory leaks). The
weights on the data set are normalized to sum to one.
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD apop_model *apop_model_to_pmf(apop_model *model, apop_data *binspec, long int draws, int bin_count, gsl_rng *rng){
APOP_VAR_HEAD apop_model *apop_model_to_pmf(apop_model *model, apop_data *binspec, long int draws, int bin_count){
apop_model* apop_varad_var(model, NULL);
Apop_assert(model && model->draw, "The second argument needs to be an apop_model with a 'draw' function "
"that I can use to make random draws.");
apop_data* apop_varad_var(binspec, NULL);
int apop_varad_var(bin_count, 0);
long int apop_varad_var(draws, 1e4);
gsl_rng *apop_varad_var(rng, apop_rng_get_thread())
APOP_VAR_ENDHEAD
Get_vmsizes(binspec);
apop_data *outd = apop_data_alloc(draws, model->dsize);
for (long int i=0; i< draws; i++)
apop_draw(Apop_rv(outd, i)->data, rng, model);
apop_data *outd = apop_model_draws(model, draws);
apop_data *outbinned = apop_data_to_bins(outd, binspec, .bin_count=bin_count);
apop_data_free(outd);
apop_vector_normalize(outbinned->weights);
Expand Down
67 changes: 35 additions & 32 deletions apop_mcmc.m4.c
Expand Up @@ -158,9 +158,11 @@ can be reported to you. That run is done using \c model->data as input.
\return On return, \c out is filled with the next step in the Markov chain. The <tt>->data</tt> element of the PMF model is extended to include the additional steps in the chain.
If a proposal failed the model constraints, then return 1; else return 0. See the notes in the documentation for \ref apop_model_metropolis.
\li After pulling the attached settings group, the parent model is ignored. One expects that \c base_model in the mcmc settings group == the parent model.
\li If your settings break the model parameters into several chunks, this function returns after stepping through all chunks.
\li After pulling the attached settings group, the parent model is ignored. One expects
that \c base_model in the mcmc settings group == the parent model.
\li If your settings break the model parameters into several chunks, this function
returns after stepping through all chunks.
\ingroup all_public
*/
int apop_model_metropolis_draw(double *out, gsl_rng* rng, apop_model *model){
apop_mcmc_settings *s = apop_settings_get_group(model, apop_mcmc);
Expand Down Expand Up @@ -211,14 +213,25 @@ Markov chain Monte Carlo</a> to make draws from the given model.
The basic storyline is that draws are made from a proposal distribution, and the
likelihood of your model given your data and the drawn parameters evaluated. At each
step, a new set of proposal parameters are drawn, and if either they are more likely
step, a new set of proposal parameters are drawn, and if they are more likely
than the previous set the new proposal is accepted as the next step, else with probability (prob of new params)/(prob of old params),
they are accepted as the next step anyway. Otherwise the last accepted proposal is repeated.
The output is an \ref apop_pmf model with a data set listing the draws that were
accepted, including those repetitions. The output model is modified so that subsequent
draws are one more step from the Markov chain, via \ref apop_model_metropolis_draw.
\param d The \ref apop_data set used for evaluating the likelihood of a proposed parameter set.
\param rng A \c gsl_rng, probably allocated via \ref apop_rng_alloc. (Default: an RNG from \ref apop_rng_get_thread)
\param m The \ref apop_model from which parameters are being drawn. (No default; must not be \c NULL)
\return A modified \ref apop_pmf model representing the results of the search. It has
a specialized \c draw method that returns another step from the Markov chain with each draw.
\exception out->error='c' Proposal was outside of a constraint; see below.
\li If a proposal fails to meet the \c constraint element of the model you input, then
the proposal is thrown out and a new one selected. By the default proposal
distribution, this is not mathematically correct (it breaks detailed balance),
Expand All @@ -231,46 +244,36 @@ Attach an \ref apop_mcmc_settings group to your model to specify the proposal
distribution, burnin, and other details of the search. See the \ref apop_mcmc_settings
documentation for details.
\li The default proposal includes an adaptive step: you specify a target accept rate
\li The default proposal includes an adaptive step: you specify a target accept rate
(default: .35), and if the accept rate is currently higher the variance of the proposals
is widened to explore more of the space; if the accept rate is currently lower the
variance is narrowed to stay closer to the last accepted proposal. Technically, this
breaks ergodicity of the Markov chain, but the consensus seems to be that this is
not a serious problem. If it does concern you, you can set the \c base_adapt_fn in the \ref apop_mcmc_settings group to a do-nothing function, or one that damps its adaptation as \f$n\to\infty\f$.
\li If you have a univariate model, \ref apop_arms_draw may be a suitable simpler alternative.
\li Note the \c gibbs_chunks element of the \ref apop_mcmc_settings group. If you set \c
\li If you have a univariate model, \ref apop_arms_draw may be a suitable simpler alternative.
\li Note the \c gibbs_chunks element of the \ref apop_mcmc_settings group. If you set \c
gibbs_chunks='a', all parameters are drawn as a set, and accepted/rejected as a set. The
variances are adapted at an identical rate. If you set \c gibbs_chunks='i',
then each scalar parameter is assigned its own proposal distribution, which is adapted
at its own pace. With \c gibbs_chunks='b' (the default), then each of the vector, matrix,
and weights of your model's parameters are drawn/accepted/adapted as a block (and so
on to additional chunks if your model has <tt>->more</tt> pages). This works well for
complex models which naturally break down into subsets of parameters.
Each chunk counts as a step in the Markov chain. Therefore, if there are several chunks,
you can expect chunks to repeat from step to step. If you want a draw after cycling through all chunks, try using \ref apop_model_metropolis_draw, which has that behavior.
\param d The \ref apop_data set used for evaluating the likelihood of a proposed parameter set.
\param rng A \c gsl_rng, probably allocated via \ref apop_rng_alloc. (Default: an RNG from \ref apop_rng_get_thread)
\param m The \ref apop_model from which parameters are being drawn. (No default; must not be \c NULL)
\li If the likelihood model has \c NULL parameters, I will allocate them. That means you can use
one of the stock models that ship with Apophenia. If I need to run the model's prep
routine to get the size of the parameters, then I will make a copy of the likelihood
model, run prep, and then allocate parameters for that copy of a model.
\li On exit, the \c parameters element of your likelihood model has the last accepted parameter proposal.
\li If you set <tt>apop_opts.verbose=2</tt> or greater, I will report the accept rate of the M-H sampler. It is a common rule of thumb to select a proposal so that this is between 20% and 50%. Set <tt>apop_opts.verbose=3</tt> to see the stream of proposal points, their likelihoods, and the acceptance odds. You may want to set <tt>apop_opts.log_file=fopen("yourlog", "w")</tt> first.
\return A modified \ref apop_pmf model representing the results of the search. It has
a specialized \c draw method that returns another step from the Markov chain with each draw.
\exception out->error='c' Proposal was outside of a constraint; see above.
\li Each chunk counts as a step in the Markov chain. Therefore, if there are
several chunks, you can expect chunks to repeat from step to step. If you want a
draw after cycling through all chunks, try using \ref apop_model_metropolis_draw,
which has that behavior.
\li If the likelihood model has \c NULL parameters, I will allocate them. That
means you can use one of the stock models that ship with Apophenia. If I need
to run the model's prep routine to get the size of the parameters, then I will
make a copy of the likelihood model, run prep, and then allocate parameters
for that copy of a model.
\li On exit, the \c parameters element of your likelihood model has the last accepted parameter proposal.
\li If you set <tt>apop_opts.verbose=2</tt> or greater, I will report the accept
rate of the M-H sampler. It is a common rule of thumb to select a proposal so that
this is between 20% and 50%. Set <tt>apop_opts.verbose=3</tt> to see the stream
of proposal points, their likelihoods, and the acceptance odds. You may want to
set <tt>apop_opts.log_file=fopen("yourlog", "w")</tt> first.
\li This function uses the \ref designated syntax for inputs.
*/
Expand Down
2 changes: 1 addition & 1 deletion apop_missing_data.m4.c
Expand Up @@ -127,7 +127,7 @@ necessary data-parameter switching to make that happen.
\param mvn A parametrized \ref apop_model from which you expect the data was derived.
if \c NULL, then I'll use the Multivariate Normal that best fits the data after listwise deletion.
\return An estimated <tt>apop_ml_impute_model</tt>. Also, the data input will be filled in and ready to use.
\return An estimated \ref apop_model. Also, the data input will be filled in and ready to use.
*/
apop_model * apop_ml_impute(apop_data *d, apop_model* mvn){
if (!mvn){
Expand Down
51 changes: 25 additions & 26 deletions apop_mle.m4.c
@@ -1,11 +1,4 @@
/** \file apop_mle.c The MLE functions. Call them with an \ref apop_model.
This file includes a number of distributions and models whose parameters one would estimate using maximum likelihood techniques.
It has (more-or-less) a single public function: \ref apop_maximum_likelihood, and you don't even need to use that one, because the \c apop_estimate function defaults to using it if there is no model-specific estimation routine provided.
At the bottom are the maximum likelihood procedures themselves. There are four: Newton-type derivative methods, the no-derivative version, the with-derivative version, and the simulated annealing routine.*/

/** \file apop_mle.c */
/*Copyright (c) 2006--2010 by Ben Klemens. Licensed under the GPLv2; see COPYING. */
#include "apop_internal.h"
#include <setjmp.h>
Expand Down Expand Up @@ -116,15 +109,17 @@ static void apop_internal_numerical_gradient(apop_fn_with_params ll,

/**The GSL provides one-dimensional numerical differentiation; here's the multidimensional extension.
\param data The data set to use for all evaluations. It remains constant throughout.
\param model The model, expressing the function whose derivative is sought. The gradient is taken via small changes along the model parameters.
\param delta The size of the differential. If you explicitly give me a \c delta, I'll use it. If \c delta is not specified,
but \c model has \c method_settings of type \c apop_ml_params, then the \c delta element is used for the differential. Else, I use 1e-3.
\param data The \ref apop_data set to use for all evaluations.
\param model The \ref apop_model, expressing the function whose derivative is sought. The gradient is taken via small changes along the model parameters.
\param delta The size of the differential. (default: 1e-3, but see below)
\code
gsl_vector *gradient = apop_numerical_gradient(data, your_parametrized_model);
\endcode
\li If you do not set \ref delta as an input, I first look for an \ref apop_mle_settings
group attached to the input model, and check that for a \c delta element. If that is
also missing, use the default of 1e-3.
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD gsl_vector * apop_numerical_gradient(apop_data *data, apop_model *model, double delta){
Expand Down Expand Up @@ -175,15 +170,16 @@ static long double apop_fn_for_infomatrix(apop_data *d, apop_model *m){
apop_model *apop_model_for_infomatrix = &(apop_model){"Ad hoc model for working out the information matrix.",
.log_likelihood = apop_fn_for_infomatrix};

/** Numerically estimate the matrix of second derivatives of the
parameter values. The math is
simply a series of re-evaluations at small differential steps. [Therefore, it may be expensive to do this for a very computationally-intensive model.]
/** Numerically estimate the matrix of second derivatives of the parameter values, via
a series of re-evaluations at small differential steps. [Therefore, it may be expensive
to do this for a very computationally-intensive model.]
\param data The data at which the model was estimated
\param model The model, with parameters already estimated
\param delta the step size for the differentials. The current default is around 1e-3.
\param data The \ref apop_data at which the model was estimated (default: \c NULL)
\param model The \ref apop_model, with parameters already estimated (no default, must not be \c NULL)
\param delta the step size for the differentials. (default: 1e-3, but see below)
\return The matrix of estimated second derivatives at the given data and parameter values.
\li If you do not set \ref delta as an input, I first look for an \ref apop_mle_settings group attached to the input model, and check that for a \c delta element. If that is also missing, use the default of 1e-3.
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD apop_data * apop_model_hessian(apop_data * data, apop_model *model, double delta){
Expand Down Expand Up @@ -232,8 +228,8 @@ I follow Efron and Hinkley in using the estimated information matrix---the value
\param model A model whose parameters have been estimated.
\param delta The differential by which to step for sampling changes. (default currently = 1e-3)
\return A covariance matrix for the data. Also, if the data does not have a
<tt>"Covariance"</tt> page, I'll set it to the result as well [i.e., I won't overwrite an
existing covar].
<tt>"<Covariance>"</tt> page, I'll set it to the result as well [i.e., I won't overwrite an
existing covariance page].
This function uses the \ref designated syntax for inputs.
*/
Expand Down Expand Up @@ -598,34 +594,37 @@ Onecheck(Newton hybrid no scale)
return 1;
}

/** The maximum likelihood calculations.
/** Find the likelihood-maximizing parameters of a model given data.
\li I assume that \c apop_prep has been called on your model. The easiest way to guarantee this is to use \ref apop_estimate, which calls this function if the input model has no \c estimate method.
\li I assume that \ref apop_prep has been called on your model. The easiest way to guarantee this is to use \ref apop_estimate, which calls this function if the input model has no \c estimate method.
\li All of the settings are specified by adding a
\ref apop_mle_settings struct to your model, so see the many notes there. Notably,
the default method is the Fletcher-Reeves conjugate gradient method, and if your model
does not have a dlog likelihood function, then a numeric gradient will be calculated
via \ref apop_numerical_gradient. Add a \ref apop_mle_settings group to your model
for other methods, including the Nelder-Mead simplex and simulated annealing.
via \ref apop_numerical_gradient. Add an \ref apop_mle_settings group to your model
to set tuning parameters or select other methods, including the Nelder-Mead simplex,
simulated annealing, and root-finding.
\param data An \ref apop_data set.
\param dist The \ref apop_model object: \ref apop_gamma, \ref apop_probit, \ref apop_zipf, &amp;c. You can add
an \c apop_mle_settings struct to it (<tt>Apop_model_add_group(your_model, apop_mle,
.verbose=1, .method="PR cg", and_so_on)</tt>). So, see the \c apop_mle_settings
documentation for the many options, such as choice of method and tuning parameters.
.verbose=1, .method="PR cg", and_so_on)</tt>).
\return None, but the input model is modified to include the parameter estimates, &c.
\li There is auxiliary info in the <tt>->info</tt> element of the post-estimation struct. Get elements via, e.g.:
\code
apop_model *est = apop_estimate(your_data, apop_probit);
int status = apop_data_get(est->info, .rowname="status");
if (status)
//trouble
else
//optimum found
apop_data_print(est->parameters); //Here are the estimated parameters
\endcode
\li During the search for an optimum, ctrl-C (SIGINT) will halt the search, and the function will return whatever parameters the search was on at the time.
Expand Down

0 comments on commit b805d2a

Please sign in to comment.