forked from adrg/strutil
-
Notifications
You must be signed in to change notification settings - Fork 0
/
levenshtein.go
106 lines (88 loc) · 2.56 KB
/
levenshtein.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package metrics
import (
"strings"
"unicode/utf8"
"github.com/andvikt/strutil/internal/mathutil"
)
// Levenshtein represents the Levenshtein metric for measuring the similarity
// between sequences. The cost of each edit operation is configurable.
// For more information see https://en.wikipedia.org/wiki/Levenshtein_distance.
type Levenshtein struct {
// CaseSensitive specifies if the string comparison is case sensitive.
// When false, both terms are lowercased before they are compared.
CaseSensitive bool
// InsertCost represents the Levenshtein cost of a character insertion.
InsertCost int
// DeleteCost represents the Levenshtein cost of a character deletion.
DeleteCost int
// ReplaceCost represents the Levenshtein cost of a character substitution.
ReplaceCost int
}
// NewLevenshtein creates a Levenshtein string metric configured with the
// default options listed below.
//
// Default options:
//
//	CaseSensitive: true
//	InsertCost: 1
//	DeleteCost: 1
//	ReplaceCost: 1
func NewLevenshtein() *Levenshtein {
	metric := new(Levenshtein)
	metric.CaseSensitive = true

	// Every edit operation carries a unit cost by default.
	metric.InsertCost = 1
	metric.DeleteCost = 1
	metric.ReplaceCost = 1
	return metric
}
// Compare returns the Levenshtein similarity of a and b. Larger similarity
// numbers indicate closer matches, with 1 meaning a distance of zero. Two
// empty strings are treated as a perfect match and yield 1.
//
// The similarity is computed as 1 - distance/maxLen, where maxLen is the rune
// count of the longer term. With the default unit costs the result lies
// between 0 and 1; edit costs greater than 1 can push the distance past
// maxLen and therefore the result below 0.
func (m *Levenshtein) Compare(a, b string) float64 {
	distance, maxLen := m.distance(a, b)
	if maxLen == 0 {
		// Both terms are empty. Dividing 0 by 0 would produce NaN, so
		// report the perfect match explicitly.
		return 1
	}
	return 1 - float64(distance)/float64(maxLen)
}
// Distance computes the Levenshtein distance between a and b. The lower the
// distance, the closer the match; a distance of 0 means the strings are
// identical.
func (m *Levenshtein) Distance(a, b string) int {
	// The second result is the length used for normalization, which is not
	// needed for the raw distance.
	result, _ := m.distance(a, b)
	return result
}
// distance computes the weighted Levenshtein distance between a and b. It
// returns the distance together with the rune count of the longer term, which
// Compare uses to normalize the distance.
//
// Terms are compared rune by rune, so a multi-byte UTF-8 character counts as
// a single edit unit. If CaseSensitive is false, both terms are lowercased
// before the comparison. Two empty terms yield (0, 0).
func (m *Levenshtein) distance(a, b string) (int, int) {
	// Check if both terms are empty.
	lenA, lenB := utf8.RuneCountInString(a), utf8.RuneCountInString(b)
	if lenA == 0 && lenB == 0 {
		return 0, 0
	}

	// Check if one of the terms is empty.
	maxLen := mathutil.Max(lenA, lenB)
	if lenA == 0 {
		return m.InsertCost * lenB, maxLen
	}
	if lenB == 0 {
		return m.DeleteCost * lenA, maxLen
	}

	// Lower terms if case insensitive comparison is specified.
	if !m.CaseSensitive {
		a = strings.ToLower(a)
		b = strings.ToLower(b)
	}

	// Decode b once so it can be indexed by rune position. Indexing the
	// strings directly would compare bytes, which is wrong for multi-byte
	// characters. The term a is decoded on the fly by ranging over it.
	runesB := []rune(b)

	// Initialize cost slice: turning an empty prefix of a into the first j
	// runes of b takes j insertions.
	prevCol := make([]int, len(runesB)+1)
	for j := range prevCol {
		prevCol[j] = j * m.InsertCost
	}

	// Calculate distance.
	col := make([]int, len(runesB)+1)
	for _, runeA := range a {
		// Turning the current prefix of a into an empty term takes one more
		// deletion than the previous prefix did.
		col[0] = prevCol[0] + m.DeleteCost

		for j, runeB := range runesB {
			delCost := prevCol[j+1] + m.DeleteCost
			insCost := col[j] + m.InsertCost

			subCost := prevCol[j]
			if runeA != runeB {
				subCost += m.ReplaceCost
			}

			col[j+1] = mathutil.Min(delCost, insCost, subCost)
		}

		// The column just computed becomes the previous one; the old
		// previous column is reused as scratch space.
		col, prevCol = prevCol, col
	}

	return prevCol[len(runesB)], maxLen
}