Skip to content
This repository has been archived by the owner on Nov 19, 2020. It is now read-only.

Commit

Permalink
Mitigating the impact of a numerical precision issue when normalizing…
Browse files Browse the repository at this point in the history
… distances to probabilities in the K-Means++ initialization.

Updates GH-259: K-means clustering exception
  • Loading branch information
cesarsouza committed Jun 29, 2017
1 parent deb93fb commit 0b8aa30
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 23 deletions.
13 changes: 11 additions & 2 deletions Sources/Accord.MachineLearning/Clustering/CentroidCluster`3.cs
Expand Up @@ -136,8 +136,17 @@ public virtual void Randomize(TData[] points, Seeding strategy = Seeding.KMeansP
for (int i = 0; i < D.Length; i++)
D[i] /= sum;

// Sample randomly using the probabilities
idx = GeneralDiscreteDistribution.Random(D);
try
{
// Sample randomly using the probabilities
idx = GeneralDiscreteDistribution.Random(D);
}
catch (InvalidOperationException)
{
// Degenerate case: numerical inaccuracy when normalizing
// the point-centroid distances to become probabilities
idx = r.Next(0, points.Length);
}
}

// 3. Choose one new data point at random as a new center, using a weighted
Expand Down
47 changes: 31 additions & 16 deletions Sources/Accord.MachineLearning/Clustering/KMeans/KMeans.cs
Expand Up @@ -407,24 +407,20 @@ protected void ComputeInformation(double[][] data, int[] labels)
if (ComputeCovariances)
{
// Compute cluster information (optional)
Parallel.For(0, clusters.Count, ParallelOptions, i =>
// Note: If you get OutOfMemoryExceptions here, just disable the
// computation of variances by setting ComputeCovariances = false
if (ParallelOptions.MaxDegreeOfParallelism == 1)
{
double[][] centroids = clusters.Centroids;
// Extract the data for the current cluster
double[][] sub = data.Get(labels.Find(x => x == i));
if (sub.Length > 0)
{
// Compute the current cluster variance
clusters.Covariances[i] = sub.Covariance(centroids[i]);
}
else
for (int i = 0; i < clusters.Count; i++)
innerComputeCovariance(data, labels, i);
}
else
{
Parallel.For(0, clusters.Count, ParallelOptions, i =>
{
// The cluster doesn't have any samples
clusters.Covariances[i] = Jagged.Zeros(Dimension, Dimension);
}
});
innerComputeCovariance(data, labels, i);
});
}
}

if (ComputeError)
Expand All @@ -433,6 +429,25 @@ protected void ComputeInformation(double[][] data, int[] labels)
}
}

/// <summary>
///   Fills in the covariance entry of a single cluster. Called once per
///   cluster index, either from a plain loop or from a parallel loop.
/// </summary>
/// <param name="data">The data samples.</param>
/// <param name="labels">The label associated with each sample in <paramref name="data"/>.</param>
/// <param name="i">The index of the cluster whose covariance entry is written.</param>
private void innerComputeCovariance(double[][] data, int[] labels, int i)
{
    double[][] centroids = clusters.Centroids;

    // Select only the samples whose label matches this cluster
    var members = labels.Find(label => label == i);
    double[][] samples = data.Get(members);

    if (samples.Length == 0)
    {
        // Nothing was assigned to this cluster: store an
        // all-zeros Dimension-by-Dimension matrix instead
        clusters.Covariances[i] = Jagged.Zeros(Dimension, Dimension);
        return;
    }

    // Covariance of the cluster samples, passing the cluster centroid along
    clusters.Covariances[i] = samples.Covariance(centroids[i]);
}

/// <summary>
/// Divides the input data into K clusters.
/// </summary>
Expand Down
Expand Up @@ -861,10 +861,13 @@ public static int Random(double[] probabilities, Random source, bool log = false
}
}

if (cumulativeSum == 0)
if (cumulativeSum < 1e-100)
throw new ArgumentException("probabilities", "All probabilities are zero.");

throw new InvalidOperationException("Generated value is not between 0 and 1.");
throw new InvalidOperationException("The given probabilities do not sum up to one. Please normalize them by " +
"dividing the probabilities by their sum. If the probabilities have already been normalized, this can be due " +
"a numerical inaccuracy. If this is the case, try transforming the probabilities to logarithms and including " +
"'log = true' in the arguments of the GeneralDiscreteDistribution.Random(double[] probabilities, bool log) function.");
}


Expand Down
Expand Up @@ -26,6 +26,7 @@ namespace Accord.Tests.Statistics
using NUnit.Framework;
using Accord.Math;
using System.Globalization;
using System;

[TestFixture]
public class DiscreteDistributionTest
Expand Down Expand Up @@ -225,7 +226,7 @@ public void FitTest_vector_inputs()

// --

double[][] values2 =
double[][] values2 =
{
new[] { 1.00, 0.00, 0.00, 0.00 },
new[] { 0.00, 0.00, 0.00, 0.00 },
Expand All @@ -241,7 +242,7 @@ public void FitTest_vector_inputs()
Assert.IsTrue(Matrix.IsEqual(expected, actual2));


double[][] values3 =
double[][] values3 =
{
new[] { 1.00, 0.00, 0.00, 0.00 },
new[] { 0.00, 1.00, 0.00, 0.00 },
Expand All @@ -257,7 +258,7 @@ public void FitTest_vector_inputs()
Assert.IsTrue(Matrix.IsEqual(expected, actual3));


double[][] values4 =
double[][] values4 =
{
new[] { 0.50, 0.00, 0.00, 0.00 },
new[] { 0.00, 0.00, 0.00, 0.00 },
Expand Down Expand Up @@ -460,5 +461,13 @@ public void GenerateTest()
Assert.AreEqual(0.3, target.Frequencies[2], 0.01);
}
}


[Test]
public void RandomTest()
{
    // Non-zero values that sum to far less than one: the sampler
    // is expected to report them as an invalid operation.
    Assert.Throws<InvalidOperationException>(delegate
    {
        GeneralDiscreteDistribution.Random(new[] { 1e-14, 1e-15 });
    });

    // All-zero values: the sampler is expected to reject the argument.
    Assert.Throws<ArgumentException>(delegate
    {
        GeneralDiscreteDistribution.Random(new[] { 0.0, 0.0 });
    });
}
}
}

0 comments on commit 0b8aa30

Please sign in to comment.