Skip to content

Commit

Permalink
Added Gaussian Multivariate Distribution algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
Alonso Vidales committed Mar 13, 2014
1 parent 3f42194 commit 7dd9180
Show file tree
Hide file tree
Showing 5 changed files with 466 additions and 0 deletions.
27 changes: 27 additions & 0 deletions README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ package ml
- Logistic Regression
- Neural Networks
- Collaborative Filtering
- Gaussian Multivariate Distribution for anomaly detection systems

Is implemented too the fmincg function in order to calculate the optimal
theta configuration to reduce the cost value for all the implemented
Expand Down Expand Up @@ -152,6 +153,32 @@ type DataSet interface {



type MultGaussianDist struct {
X [][]float64 // Data to train the gaussian distribution
Sigma2 []float64
Mu []float64 // Means for each feature
}
Anomaly detection implementation using Multivariate Gaussian
Distribution:
http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Density_function


func MultVarGaussianDistLoadFromFile(filePath string) (gd *MultGaussianDist)
Creates a MultGaussianDist object from the content of a space-separated
CSV file where each line is a sample and each column a feature


func (gd *MultGaussianDist) CalculateMuSigma()
To be called before the GetProbability method in order to calculate the
means and sigma parameters for all the training set

func (gd *MultGaussianDist) GetProbability(data [][]float64) (p []float64)
Returns the probability of anomaly for each sample; each row of data is a
sample to study and each column a feature. Determine an epsilon, and
when p(x) < epsilon you may have an anomaly; you can use
SelectThreshold in order to calculate the best epsilon


type NeuralNet struct {
// Training set of values for each feature, the first dimension are the test cases
X [][]float64
Expand Down
1 change: 1 addition & 0 deletions ml.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
// - Logistic Regression
// - Neural Networks
// - Collaborative Filtering
// - Gaussian Multivariate Distribution for anomaly detection systems
//
// Is implemented too the fmincg function in order to calculate the optimal
// theta configuration to reduce the cost value for all the implemented solutions.
Expand Down
99 changes: 99 additions & 0 deletions multivariate_gaussian_normal_distribution.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package ml

import (
"io/ioutil"
"math"
"strconv"
"strings"
)

// Anomaly detection implementation using a Multivariate Gaussian Distribution
// with a diagonal covariance matrix (features are treated as independent):
// http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Density_function
type MultGaussianDist struct {
X [][]float64 // Training data: one row per sample, one column per feature
Sigma2 []float64 // Per-feature variances, populated by CalculateMuSigma
Mu []float64 // Per-feature means (not medians), populated by CalculateMuSigma
}

// CalculateMuSigma computes the per-feature mean (Mu) and population
// variance (Sigma2, dividing by n rather than n-1) over the training set X.
// It must be called before GetProbability. When the training set is empty
// it returns without touching Mu or Sigma2 instead of panicking.
func (gd *MultGaussianDist) CalculateMuSigma() {
	// Guard: the original implementation panicked on len(gd.X[0]) for an
	// empty training set.
	if len(gd.X) == 0 {
		return
	}
	features := len(gd.X[0])
	samples := float64(len(gd.X))

	// Mean of each feature, accumulated over all samples.
	gd.Mu = make([]float64, features)
	for _, x := range gd.X {
		for i, val := range x {
			gd.Mu[i] += val
		}
	}
	for i := range gd.Mu {
		gd.Mu[i] /= samples
	}

	// Population variance of each feature around its mean.
	gd.Sigma2 = make([]float64, features)
	for _, x := range gd.X {
		for i, val := range x {
			d := val - gd.Mu[i]
			gd.Sigma2[i] += d * d // same value as math.Pow(d, 2), no call overhead
		}
	}
	for i := range gd.Sigma2 {
		gd.Sigma2[i] /= samples
	}
}

// GetProbability returns the probability density of each sample in data under
// the fitted distribution; CalculateMuSigma must have been called first.
// Each row of data is a sample and each column a feature. Determine an
// epsilon: when p(x) < epsilon you may have an anomaly; you can use
// SelectThreshold in order to calculate the best epsilon.
//
// Unlike the previous implementation, data is NOT modified: the mean is
// subtracted into a local value instead of mutating the caller's slices.
// The floating-point operations are kept in the exact same form and order
// as before, so returned values are bit-identical.
func (gd *MultGaussianDist) GetProbability(data [][]float64) (p []float64) {
	// Determinant of the diagonal covariance matrix is the product of the
	// per-feature variances.
	detSig := 1.0
	for _, s := range gd.Sigma2 {
		detSig *= s
	}

	// Normalization constant (2*pi)^(-k/2) * |Sigma|^(-1/2). Using len(gd.Mu)
	// for k avoids the panic the old code hit on empty data (len(data[0])).
	base := math.Pow(2*math.Pi, -(float64(len(gd.Mu)))/2) * math.Pow(detSig, -0.5)

	p = make([]float64, len(data))
	for i := range data {
		for c := range data[i] {
			d := data[i][c] - gd.Mu[c] // center locally; do not clobber data
			p[i] += -0.5 * d * d * (1 / gd.Sigma2[c])
		}
		p[i] = base * math.Pow(math.E, p[i])
	}
	return
}

// MultVarGaussianDistLoadFromFile creates a MultGaussianDist object from the
// content of a space-separated file where each line is a sample and each
// column a feature. It panics if the file cannot be read or a value cannot
// be parsed as a float.
//
// Fixes over the previous version: blank lines are skipped instead of
// truncating the rest of the file, and a file without a trailing newline no
// longer causes an index-out-of-range panic (X is built with append rather
// than pre-sized to len(lines)-1).
func MultVarGaussianDistLoadFromFile(filePath string) (gd *MultGaussianDist) {
	strInfo, err := ioutil.ReadFile(filePath)
	if err != nil {
		panic(err)
	}

	lines := strings.Split(string(strInfo), "\n")
	gd = &MultGaussianDist{
		X: make([][]float64, 0, len(lines)),
	}
	for _, line := range lines {
		if line == "" {
			continue // tolerate blank/trailing lines anywhere in the file
		}

		parts := strings.Split(line, " ")
		sample := make([]float64, len(parts))
		for c, value := range parts {
			floatVal, err := strconv.ParseFloat(value, 64)
			if err != nil {
				panic(err)
			}
			sample[c] = floatVal
		}
		gd.X = append(gd.X, sample)
	}

	return
}
32 changes: 32 additions & 0 deletions multivariate_gaussian_normal_distribution_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package ml

import (
"fmt"
"testing"
)

// TestMultivariateGaussianDistribution fits the distribution on the sample
// data set and checks GetProbability against precomputed expected values.
// (Fixes the "Distreibution" typo in the old test name and replaces exact
// float64 equality with a tight tolerance.)
func TestMultivariateGaussianDistribution(t *testing.T) {
	fmt.Println("Testing Multivariate Gaussian Distribution implementation...")
	gd := MultVarGaussianDistLoadFromFile(
		"test_data/multivarite_gaussian_normal_distribution.dat")
	gd.CalculateMuSigma()
	probs := gd.GetProbability([][]float64{
		{13.0468, 14.7412},
		{13.4085, 13.7633},
		{14.1959, 15.8532},
		{14.9147, 16.1743},
	})

	expectedRes := []float64{
		0.06470823722117165,
		0.050304834321416796,
		0.07244977886910747,
		0.0503144083695188,
	}

	if len(probs) != len(expectedRes) {
		t.Fatal("Expected", len(expectedRes), "probabilities, but obtained:", len(probs))
	}
	for i, res := range expectedRes {
		// Tolerance comparison: exact == on float64 is brittle across
		// architectures and compiler versions.
		diff := res - probs[i]
		if diff < 0 {
			diff = -diff
		}
		if diff > 1e-12 {
			t.Error("The expected value is:", res, "but the obtained was:", probs[i])
		}
	}
}
Loading

0 comments on commit 7dd9180

Please sign in to comment.