forked from adrg/strutil
-
Notifications
You must be signed in to change notification settings - Fork 0
/
jaccard.go
65 lines (54 loc) · 1.55 KB
/
jaccard.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
package metrics
import (
"strings"
"github.com/andvikt/strutil/internal/ngram"
)
// Jaccard represents the Jaccard index for measuring the similarity
// between sequences. Its fields configure how Compare prepares and
// tokenizes the two input strings.
// For more information see https://en.wikipedia.org/wiki/Jaccard_index.
type Jaccard struct {
// CaseSensitive specifies if the string comparison is case sensitive.
// When it is false, Compare lowercases both inputs before comparing them.
CaseSensitive bool
// NgramSize represents the size (in characters) of the tokens generated
// when comparing the input sequences. Compare substitutes a size of 2
// when this value is less than or equal to 0.
NgramSize int
}
// NewJaccard allocates a Jaccard string metric and populates it with the
// default configuration.
//
// Default options:
//
//	CaseSensitive: true
//	NgramSize: 2
func NewJaccard() *Jaccard {
	metric := new(Jaccard)
	metric.CaseSensitive = true
	metric.NgramSize = 2
	return metric
}
// Compare computes the Jaccard similarity coefficient of a and b as
// common / (totalA + totalB - common), where the counts are the ones reported
// by ngram.Intersection for the two inputs. The similarity is meant to be a
// number between 0 and 1, with larger values indicating closer matches.
//
// Both inputs are lowercased first when m.CaseSensitive is false. Two empty
// inputs yield 1. An n-gram size of 2 is used if m.NgramSize is less than or
// equal to 0.
func (m *Jaccard) Compare(a, b string) float64 {
	// Fold the case of both terms when the comparison must ignore it.
	if !m.CaseSensitive {
		a, b = strings.ToLower(a), strings.ToLower(b)
	}

	// Two empty terms are considered a perfect match.
	left, right := []rune(a), []rune(b)
	if len(left)+len(right) == 0 {
		return 1
	}

	// Start from the default n-gram size and only override it with a
	// positive configured value.
	n := 2
	if m.NgramSize > 0 {
		n = m.NgramSize
	}

	// The intersection map itself is not needed, only the counts.
	_, shared, countA, countB := ngram.Intersection(left, right, n)

	// Without any n-grams there is nothing to compare.
	sum := countA + countB
	if sum == 0 {
		return 0
	}

	// The union size is the sum of both totals minus the shared n-grams,
	// which would otherwise be counted twice.
	union := sum - shared
	return float64(shared) / float64(union)
}