# accord-net/framework

Switch branches/tags
Nothing to show
Fetching contributors…
Cannot retrieve contributors at this time
834 lines (739 sloc) 33 KB
 // Accord Statistics Library // The Accord.NET Framework // http://accord-framework.net // // Copyright © César Souza, 2009-2017 // cesarsouza at gmail.com // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA // namespace Accord.Statistics.Distributions.Multivariate { using System; using Accord.Math; using Accord.Math.Decompositions; using Accord.Statistics.Distributions; using Accord.Statistics.Distributions.Fitting; using Accord.Statistics.Distributions.Univariate; using System.Runtime.CompilerServices; using Accord.Math.Random; using Accord.Compat; /// /// Multivariate Normal (Gaussian) distribution. /// /// /// /// /// The Gaussian is the most widely used distribution for continuous /// variables. In the case of many variables, it is governed by two /// parameters, the mean vector and the variance-covariance matrix. /// /// When a covariance matrix given to the class constructor is not positive /// definite, the distribution is degenerate and this may be an indication /// indication that it may be entirely contained in a r-dimensional subspace. /// Applying a rotation to an orthogonal basis to recover a non-degenerate /// r-dimensional distribution may help in this case. /// /// /// References: /// /// /// Ai Access. Glossary of Data Modeling. Positive definite matrix. Available on: /// http://www.aiaccess.net/English/Glossaries/GlosMod/e_gm_positive_definite_matrix.htm /// /// /// /// /// /// The following example shows how to create a Multivariate Gaussian /// distribution with known parameters mean vector and covariance matrix /// /// /// // Create a multivariate Gaussian distribution /// var dist = new MultivariateNormalDistribution( /// /// // mean vector mu /// mean: new double[] /// { /// 4, 2 /// }, /// /// // covariance matrix sigma /// covariance: new double[,] /// { /// { 0.3, 0.1 }, /// { 0.1, 0.7 } /// } /// ); /// /// // Common measures /// double[] mean = dist.Mean; // { 4, 2 } /// double[] median = dist.Median; // { 4, 2 } /// double[] var = dist.Variance; // { 0.3, 0.7 } (diagonal from cov) /// double[,] cov = dist.Covariance; // { { 0.3, 0.1 }, { 0.1, 0.7 } } /// /// // Probability mass functions /// double pdf1 = dist.ProbabilityDensityFunction(new double[] { 2, 5 }); // 0.000000018917884164743237 /// double pdf2 = dist.ProbabilityDensityFunction(new double[] { 4, 2 }); // 0.35588127170858852 /// double pdf3 = dist.ProbabilityDensityFunction(new double[] { 3, 7 }); // 0.000000000036520107734505265 /// double lpdf = dist.LogProbabilityDensityFunction(new double[] { 3, 7 }); // -24.033158110192296 /// /// // Cumulative distribution function (for up to two dimensions) /// double cdf = dist.DistributionFunction(new double[] { 3, 5 }); // 0.033944035782101548 /// /// // Generate samples from this distribution: /// double[][] sample = dist.Generate(1000000); /// /// /// /// The following example demonstrates how to fit a multivariate Gaussian to /// a set of observations. Since those observations would lead to numerical /// difficulties, the example also demonstrates how to specify a regularization /// constant to avoid getting a . /// /// /// /// double[][] observations = /// { /// new double[] { 1, 2 }, /// new double[] { 1, 2 }, /// new double[] { 1, 2 }, /// new double[] { 1, 2 } /// }; /// /// // Create a multivariate Gaussian for 2 dimensions /// var normal = new MultivariateNormalDistribution(2); /// /// // Specify a regularization constant in the fitting options /// NormalOptions options = new NormalOptions() { Regularization = double.Epsilon }; /// /// // Fit the distribution to the data /// normal.Fit(observations, options); /// /// // Check distribution parameters /// double[] mean = normal.Mean; // { 1, 2 } /// double[] var = normal.Variance; // { 4.9E-324, 4.9E-324 } (almost machine zero) /// /// /// /// The next example shows how to estimate a Gaussian distribution from data /// available inside a Microsoft Excel spreadsheet using the ExcelReader class. /// /// /// // Create a new Excel reader to read data from a spreadsheet /// ExcelReader reader = new ExcelReader(@"test.xls", hasHeaders: false); /// /// // Extract the "Data" worksheet from the xls /// DataTable table = reader.GetWorksheet("Data"); /// /// // Convert the data table to a jagged matrix /// double[][] observations = table.ToArray(); /// /// /// // Estimate a new Multivariate Normal Distribution from the observations /// var dist = MultivariateNormalDistribution.Estimate(observations, new NormalOptions() /// { /// Regularization = 1e-10 // this value will be added to the diagonal until it becomes positive-definite /// }); /// /// /// /// /// [Serializable] public class MultivariateNormalDistribution : MultivariateContinuousDistribution, IFittableDistribution, ISampleableDistribution { // Distribution parameters private double[] mean; private double[,] covariance; private CholeskyDecomposition chol; private SingularValueDecomposition svd; private double lnconstant; // Derived measures private double[] variance; /// /// Constructs a multivariate Gaussian distribution /// with zero mean vector and identity covariance matrix. /// /// /// The number of dimensions in the distribution. /// public MultivariateNormalDistribution(int dimension) : this(dimension, true) { } private MultivariateNormalDistribution(int dimension, bool init) : base(dimension) { if (init) { // init is set to false during cloning double[] mean = new double[dimension]; double[,] cov = Matrix.Identity(dimension); var chol = new CholeskyDecomposition(cov); initialize(mean, cov, chol, svd: null); } } /// /// Constructs a multivariate Gaussian distribution /// with given mean vector and covariance matrix. /// /// /// The mean vector μ (mu) for the distribution. /// The covariance matrix Σ (sigma) for the distribution. /// public MultivariateNormalDistribution(double[] mean, double[][] covariance) : this(mean, covariance.ToMatrix()) { } /// /// Constructs a multivariate Gaussian distribution /// with given mean vector and covariance matrix. /// /// /// The mean vector μ (mu) for the distribution. /// public MultivariateNormalDistribution(double[] mean) : this(mean, Matrix.Identity(mean.Length)) { } /// /// Constructs a multivariate Gaussian distribution /// with given mean vector and covariance matrix. /// /// /// The mean vector μ (mu) for the distribution. /// The covariance matrix Σ (sigma) for the distribution. /// public MultivariateNormalDistribution(double[] mean, double[,] covariance) : base(mean.Length) { int rows = covariance.GetLength(0); int cols = covariance.GetLength(1); if (rows != cols) throw new DimensionMismatchException("covariance", "Covariance matrix should be square."); if (mean.Length != rows) throw new DimensionMismatchException("covariance", "Covariance matrix should have the same dimensions as mean vector's length."); var chol = new CholeskyDecomposition(covariance); if (!chol.IsPositiveDefinite) throw new NonPositiveDefiniteMatrixException("Covariance matrix is not positive definite." + " If are trying to estimate a distribution from data, please try using the Estimate method."); initialize(mean, covariance, chol, svd: null); } private void initialize(double[] m, double[,] cov, CholeskyDecomposition cd, SingularValueDecomposition svd) { int k = m.Length; this.mean = m; this.covariance = cov; this.chol = cd; this.svd = svd; if (chol != null || svd != null) { // Transforming to log operations, we have: double lndet = (cd != null) ? cd.LogDeterminant : svd.LogPseudoDeterminant; // So the log(constant) could be computed as: lnconstant = -(Constants.Log2PI * k + lndet) * 0.5; } } /// /// Gets the Mean vector μ (mu) for /// the Gaussian distribution. /// /// public override double[] Mean { get { return mean; } } /// /// Gets the Variance vector diag(Σ), the diagonal of /// the sigma matrix, for the Gaussian distribution. /// /// public override double[] Variance { get { if (variance == null) variance = Matrix.Diagonal(covariance); return variance; } } /// /// Gets the variance-covariance matrix /// Σ (sigma) for the Gaussian distribution. /// /// public override double[,] Covariance { get { return covariance; } } /// /// Computes the cumulative distribution function for distributions /// up to two dimensions. For more than two dimensions, this method /// is not supported. /// /// protected internal override double InnerDistributionFunction(params double[] x) { if (Dimension == 1) { double stdDev = Math.Sqrt(Covariance[0, 0]); if (stdDev == 0) return (x[0] == mean[0]) ? 1 : 0; double z = (x[0] - mean[0]) / stdDev; return Normal.Function(z); } if (Dimension == 2) { double sigma1 = Math.Sqrt(Covariance[0, 0]); double sigma2 = Math.Sqrt(Covariance[1, 1]); double rho = Covariance[0, 1] / (sigma1 * sigma2); if (Double.IsNaN(rho)) return (x.IsEqual(mean)) ? 1 : 0; double z = (x[0] - mean[0]) / sigma1; double w = (x[1] - mean[1]) / sigma2; return Normal.Bivariate(z, w, rho); } throw new NotSupportedException("The cumulative distribution " + "function is only available for up to two dimensions."); } /// /// Gets the complementary cumulative distribution function /// (ccdf) for this distribution evaluated at point x. /// This function is also known as the Survival function. /// /// /// /// The Complementary Cumulative Distribution Function (CCDF) is /// the complement of the Cumulative Distribution Function, or 1 /// minus the CDF. /// /// protected internal override double InnerComplementaryDistributionFunction(params double[] x) { if (Dimension == 1) { double stdDev = Math.Sqrt(Covariance[0, 0]); double z = (x[0] - mean[0]) / stdDev; if (stdDev == 0) return (x[0] == mean[0]) ? 0 : 1; return Normal.Complemented(z); } if (Dimension == 2) { double sigma1 = Math.Sqrt(Covariance[0, 0]); double sigma2 = Math.Sqrt(Covariance[1, 1]); double rho = Covariance[0, 1] / (sigma1 * sigma2); if (Double.IsNaN(rho)) return (x.IsEqual(mean)) ? 0 : 1; double z = (x[0] - mean[0]) / sigma1; double w = (x[1] - mean[1]) / sigma2; return Normal.BivariateComplemented(z, w, rho); } throw new NotSupportedException("The cumulative distribution " + "function is only available for up to two dimensions."); } /// /// Gets the probability density function (pdf) for /// this distribution evaluated at point x. /// /// /// A single point in the distribution range. For a /// univariate distribution, this should be a single /// double value. For a multivariate distribution, /// this should be a double array. /// /// /// The probability of x occurring /// in the current distribution. /// /// /// /// The Probability Density Function (PDF) describes the /// probability that a given value x will occur. /// /// protected internal override double InnerProbabilityDensityFunction(params double[] x) { return Math.Exp(-0.5 * Mahalanobis(x) + lnconstant); } /// /// Gets the log-probability density function (pdf) /// for this distribution evaluated at point x. /// /// /// A single point in the distribution range. For a /// univariate distribution, this should be a single /// double value. For a multivariate distribution, /// this should be a double array. /// /// /// The logarithm of the probability of x /// occurring in the current distribution. /// /// protected internal override double InnerLogProbabilityDensityFunction(params double[] x) { return -0.5 * Mahalanobis(x) + lnconstant; } /// /// Gets the Mahalanobis distance between a sample and this distribution. /// /// /// A point in the distribution space. /// /// The Mahalanobis distance between the point and this distribution. /// #if NET45 || NET46 || NET462 || NETSTANDARD2_0 [MethodImpl(MethodImplOptions.AggressiveInlining)] #endif public double Mahalanobis(double[] x) { if (x.Length != Dimension) throw new DimensionMismatchException("x", "The vector should have the same dimension as the distribution."); var z = new double[mean.Length]; for (int i = 0; i < x.Length; i++) z[i] = x[i] - mean[i]; double[] a = (chol == null) ? svd.Solve(z) : chol.Solve(z); double b = 0; for (int i = 0; i < z.Length; i++) b += a[i] * z[i]; return b; } /// /// Fits the underlying distribution to a given set of observations. /// /// /// The array of observations to fit the model against. The array /// elements can be either of type double (for univariate data) or /// type double[] (for multivariate data). /// The weight vector containing the weight for each of the samples. /// Optional arguments which may be used during fitting, such /// as regularization constants and additional parameters. /// /// /// Please see . /// /// public override void Fit(double[][] observations, double[] weights, IFittingOptions options) { NormalOptions normalOptions = options as NormalOptions; if (options != null && normalOptions == null) throw new ArgumentException("The specified options' type is invalid.", "options"); Fit(observations, weights, normalOptions); } /// /// Fits the underlying distribution to a given set of observations. /// /// /// The array of observations to fit the model against. The array /// elements can be either of type double (for univariate data) or /// type double[] (for multivariate data). /// The weight vector containing the weight for each of the samples. /// Optional arguments which may be used during fitting, such /// as regularization constants and additional parameters. /// /// /// Please see . /// /// public void Fit(double[][] observations, double[] weights, NormalOptions options) { double[] mean; double[,] cov; if (weights != null) { #if DEBUG for (int i = 0; i < weights.Length; i++) { if (Double.IsNaN(weights[i]) || Double.IsInfinity(weights[i])) throw new ArgumentException("Invalid numbers in the weight vector.", "weights"); } #endif // Compute weighted mean vector mean = Measures.WeightedMean(observations, weights); // Compute weighted covariance matrix if (options != null && options.Diagonal) cov = Matrix.Diagonal(Measures.WeightedVariance(observations, weights, mean)); else cov = Measures.WeightedCovariance(observations, weights, mean); } else { // Compute mean vector mean = Measures.Mean(observations, dimension: 0); // Compute covariance matrix if (options != null && options.Diagonal) cov = Matrix.Diagonal(Measures.Variance(observations, mean)); else cov = Measures.Covariance(observations, mean).ToMatrix(); } if (options != null && options.Shared) { options.Postprocessing = (components, pi) => { var gaussians = components.To(); // Compute pooled variance double[,] pooledCov = Measures.PooledCovariance(gaussians.Apply(x => x.covariance), pi); // Decompose only once decompose(options, pooledCov, out chol, out svd); foreach (var gaussian in gaussians) gaussian.initialize(gaussian.mean, pooledCov, chol, svd); }; initialize(mean, cov, null, null); } else { decompose(options, cov, out chol, out svd); initialize(mean, cov, chol, svd); } } private static void decompose(NormalOptions options, double[,] cov, out CholeskyDecomposition chol, out SingularValueDecomposition svd) { svd = null; chol = null; if (options == null) { // We don't have options. Just attempt a standard fitting. If // the matrix is not positive semi-definite, throw an exception. chol = new CholeskyDecomposition(cov); if (!chol.IsPositiveDefinite) { throw new NonPositiveDefiniteMatrixException("Covariance matrix is not positive " + "definite. Try specifying a regularization constant in the fitting options " + "(there is an example in the Multivariate Normal Distribution documentation)."); } return; } else { // We have options. In this case, we will either be using the SVD // or we can add a regularization constant until the covariance // matrix becomes positive semi-definite. if (options.Robust) { // No need to apply a regularization constant in this case svd = new SingularValueDecomposition(cov, true, true, true); } else { // Apply a regularization constant until covariance becomes positive chol = new CholeskyDecomposition(cov); // Check if need to add a regularization constant double regularization = options.Regularization; if (regularization > 0) { int dimension = cov.Rows(); while (!chol.IsPositiveDefinite) { for (int i = 0; i < dimension; i++) { for (int j = 0; j < dimension; j++) if (Double.IsNaN(cov[i, j]) || Double.IsInfinity(cov[i, j])) cov[i, j] = 0.0; cov[i, i] += regularization; } chol = new CholeskyDecomposition(cov, false, true); } } if (!chol.IsPositiveDefinite) { throw new NonPositiveDefiniteMatrixException("Covariance matrix is not positive " + "definite. Try specifying a regularization constant in the fitting options " + "(there is an example in the Multivariate Normal Distribution documentation)."); } } } } /// /// Estimates a new Normal distribution from a given set of observations. /// /// public static MultivariateNormalDistribution Estimate(double[][] observations) { return Estimate(observations, null, null); } /// /// Estimates a new Normal distribution from a given set of observations. /// /// /// /// Please see . /// /// public static MultivariateNormalDistribution Estimate(double[][] observations, NormalOptions options) { return Estimate(observations, null, options); } /// /// Estimates a new Normal distribution from a given set of observations. /// /// /// /// Please see . /// /// public static MultivariateNormalDistribution Estimate(double[][] observations, double[] weights) { MultivariateNormalDistribution n = new MultivariateNormalDistribution(observations[0].Length); n.Fit(observations, weights, null); return n; } /// /// Estimates a new Normal distribution from a given set of observations. /// /// /// /// Please see . /// /// public static MultivariateNormalDistribution Estimate(double[][] observations, double[] weights, NormalOptions options) { MultivariateNormalDistribution n = new MultivariateNormalDistribution(observations[0].Length); n.Fit(observations, weights, options); return n; } /// /// Creates a new object that is a copy of the current instance. /// /// /// /// A new object that is a copy of this instance. /// /// public override object Clone() { var clone = new MultivariateNormalDistribution(this.Dimension, false); clone.lnconstant = lnconstant; clone.covariance = (double[,])covariance.Clone(); clone.mean = (double[])mean.Clone(); if (chol != null) clone.chol = (CholeskyDecomposition)chol.Clone(); if (svd != null) clone.svd = (SingularValueDecomposition)svd.Clone(); return clone; } /// /// Converts this multivariate /// normal distribution into a joint distribution /// of independent normal distributions. /// /// /// /// A independent joint distribution of /// normal distributions. /// /// public Independent ToIndependentNormalDistribution() { NormalDistribution[] components = new NormalDistribution[this.Dimension]; for (int i = 0; i < components.Length; i++) components[i] = new NormalDistribution(this.Mean[i], Math.Sqrt(this.Variance[i])); return new Independent(components); } /// /// Generates a random vector of observations from the current distribution. /// /// /// The number of samples to generate. /// The location where to store the samples. /// The random number generator to use as a source of randomness. /// Default is to use . /// /// A random vector of observations drawn from this distribution. /// public override double[][] Generate(int samples, double[][] result, Random source) { if (chol == null) throw new NonPositiveDefiniteMatrixException("Covariance matrix is not positive definite."); double[,] A = chol.LeftTriangularFactor; double[] z = new double[Dimension]; double[] u = Mean; for (int i = 0; i < samples; i++) { NormalDistribution.Random(Dimension, result: z, source: source); Matrix.Dot(A, z, result: result[i]); Elementwise.Add(result[i], u, result: result[i]); } return result; } /// /// Creates a new univariate Normal distribution. /// /// /// The mean value for the distribution. /// The standard deviation for the distribution. /// /// A object that /// actually represents a . /// public static MultivariateNormalDistribution Univariate(double mean, double stdDev) { return new MultivariateNormalDistribution(new[] { mean }, new[,] { { stdDev * stdDev } }); } /// /// Creates a new bivariate Normal distribution. /// /// /// The mean value for the first variate in the distribution. /// The mean value for the second variate in the distribution. /// The standard deviation for the first variate. /// The standard deviation for the second variate. /// The correlation coefficient between the two distributions. /// /// A bi-dimensional . /// public static MultivariateNormalDistribution Bivariate(double mean1, double mean2, double stdDev1, double stdDev2, double rho) { double[] mean = { mean1, mean2 }; double[,] covariance = { { stdDev1 * stdDev1, stdDev1 * stdDev2 * rho }, { stdDev1 * stdDev2 * rho, stdDev2 * stdDev2 }, }; return new MultivariateNormalDistribution(mean, covariance); } /// /// Returns a that represents this instance. /// /// /// The format. /// The format provider. /// /// /// A that represents this instance. /// /// public override string ToString(string format, IFormatProvider formatProvider) { return String.Format(formatProvider, "Normal(X; μ, Σ)"); } /// /// Generates a random vector of observations from a distribution with the given parameters. /// /// /// The number of samples to generate. /// The mean vector μ (mu) for the distribution. /// The covariance matrix Σ (sigma) for the distribution. /// /// A random vector of observations drawn from this distribution. /// public static double[][] Generate(int samples, double[] mean, double[,] covariance) { var normal = new MultivariateNormalDistribution(mean, covariance); return normal.Generate(samples); } } }