-
-
Notifications
You must be signed in to change notification settings - Fork 83
/
util.go
107 lines (92 loc) · 2.89 KB
/
util.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
package regex
import (
"github.com/hedhyw/rex/pkg/dialect/base"
"github.com/hedhyw/rex/pkg/rex"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
"regexp"
"strings"
"unicode"
)
// AnyWordChar returns a character class built from the Unicode letter
// category (unicode.L) and the rex digit characters. It is the set of
// characters this package treats as belonging to a word.
func AnyWordChar() base.ClassToken {
	letters := rex.Chars.Unicode(unicode.L)
	digits := rex.Chars.Digits()

	return rex.Common.Class(letters, digits)
}
// AnyNonWordChar returns the negated counterpart of AnyWordChar: a character
// class built with NotClass from the Unicode letter category (unicode.L) and
// the rex digit characters.
func AnyNonWordChar() base.ClassToken {
	letters := rex.Chars.Unicode(unicode.L)
	digits := rex.Chars.Digits()

	return rex.Common.NotClass(letters, digits)
}
// OpeningPunctuationToken returns a character class made of the runes that
// may precede a word: an opening parenthesis, a single quote and a double
// quote.
func OpeningPunctuationToken() base.ClassToken {
	const openingRunes = "('\""

	return rex.Chars.Runes(openingRunes)
}
// ClosingPunctuationToken returns a character class made of the runes that
// may follow a word: comma, semicolon, colon, question mark, exclamation
// mark, hyphen, closing parenthesis, single quote and double quote.
func ClosingPunctuationToken() base.ClassToken {
	const closingRunes = ",;:?!-)'\""

	return rex.Chars.Runes(closingRunes)
}
// MidWordPunctuationToken returns a character class made of the runes that
// may appear between the word-character runs of a single word: the single
// quote (apostrophe) and the hyphen.
func MidWordPunctuationToken() base.ClassToken {
	const midWordRunes = "'-"

	return rex.Chars.Runes(midWordRunes)
}
// TrimmedWordToken returns a non-captured group describing a word without
// any surrounding punctuation. It is a composite of two forms, listed in this
// order:
//
//   - an uppercase character followed by a period, repeated at least twice
//     (for example "U.S.");
//   - one or more word characters, optionally followed by any number of
//     "mid-word punctuation, then more word characters" continuations
//     (for example "well-known" or "don't").
func TrimmedWordToken() base.GroupToken {
	// The dotted form: an (uppercase, '.') pair occurring two or more times.
	dottedInitials := rex.Group.NonCaptured(
		rex.Chars.Upper(),
		rex.Chars.Single('.'),
	).Repeat().EqualOrMoreThan(2)

	// A continuation joins another run of word characters onto the word via
	// at least one mid-word punctuation rune.
	continuation := rex.Group.NonCaptured(
		MidWordPunctuationToken().Repeat().OneOrMore(),
		AnyWordChar().Repeat().OneOrMore(),
	).Repeat().ZeroOrMore()

	plainWord := rex.Group.NonCaptured(
		AnyWordChar().Repeat().OneOrMore(),
		continuation,
	)

	return rex.Group.Composite(dottedInitials, plainWord).NonCaptured()
}
// WordToken returns a non-captured group describing a word together with its
// adjacent punctuation: zero or more opening punctuation runes, the trimmed
// word itself, then zero or more closing punctuation runes.
func WordToken() base.GroupToken {
	leading := OpeningPunctuationToken().Repeat().ZeroOrMore()
	word := TrimmedWordToken()
	trailing := ClosingPunctuationToken().Repeat().ZeroOrMore()

	return rex.Group.Define(leading, word, trailing).NonCaptured()
}
// wordTokenRegex is the compiled form of WordToken. It is built once during
// package initialization; MustCompile is a Must-style helper, so an invalid
// pattern would presumably panic at start-up rather than at first use.
var wordTokenRegex = rex.New(WordToken()).MustCompile()

// WordTokenRegex returns the package-level regular expression compiled from
// WordToken. Every call returns the same *regexp.Regexp value.
func WordTokenRegex() *regexp.Regexp { return wordTokenRegex }
// NormalizeString lowercases input, applies Unicode normalization to it and
// returns the word tokens found in the result joined by single spaces.
//
// Only text matched by WordTokenRegex is kept; whatever lies between matches
// (whitespace and any other separators) is dropped. The result is the empty
// string when no token is found.
func NormalizeString(input string) string {
	lowered := strings.ToLower(input)

	// NFD followed by NFC leaves the text in composed (NFC) form, so that
	// canonically equivalent code point sequences end up identical.
	normalized, _, err := transform.String(transform.Chain(norm.NFD, norm.NFC), lowered)
	if err != nil {
		// On failure transform.String only returns the part of the text it
		// managed to convert. Keep the complete lowercased text rather than
		// silently tokenizing a truncated string.
		normalized = lowered
	}

	// Only the whole match is needed, so submatches are not requested.
	words := WordTokenRegex().FindAllString(normalized, -1)
	tokens := make([]string, 0, len(words))
	for _, word := range words {
		if word != "" {
			tokens = append(tokens, word)
		}
	}

	return strings.Join(tokens, " ")
}
// QuotedStringToken returns a non-captured group describing a string wrapped
// in the given quote character class: the quote, a body, and the quote again.
//
// The body is any number of repetitions of a composite with two members, in
// this order: a backslash followed by the quote (an escaped quote), or a
// character that is not in the quote class. The composite itself is not
// marked as non-captured.
func QuotedStringToken(quoteCharToken base.ClassToken) base.GroupToken {
	escapedQuote := rex.Group.Define(rex.Chars.Runes("\\"), quoteCharToken).NonCaptured()
	nonQuoteChar := rex.Common.NotClass(quoteCharToken)

	body := rex.Group.Composite(escapedQuote, nonQuoteChar).Repeat().ZeroOrMore()

	return rex.Group.Define(quoteCharToken, body, quoteCharToken).NonCaptured()
}
// searchTokenRegex is the compiled expression used to tokenize search
// strings. It repeats, zero or more times, a non-captured composite of three
// defined groups, in this order: a single-quoted string, a double-quoted
// string, and a word token optionally preceded by hyphens.
//
// SearchStringToNormalizedTokens reads submatches 1, 3 and 5 of this
// expression, so the order and the capturing of the groups built here must
// stay in step with that function.
var searchTokenRegex = func() *regexp.Regexp {
	singleQuoted := rex.Group.Define(QuotedStringToken(rex.Chars.Runes("'")))
	doubleQuoted := rex.Group.Define(QuotedStringToken(rex.Chars.Runes("\"")))
	dashedWord := rex.Group.Define(rex.Chars.Runes(`-`).Repeat().ZeroOrMore(), WordToken())

	alternatives := rex.Group.Composite(singleQuoted, doubleQuoted, dashedWord).NonCaptured()

	return rex.New(alternatives.Repeat().ZeroOrMore()).MustCompile()
}()
// SearchStringToNormalizedTokens splits a search string into tokens using
// searchTokenRegex and returns them in match order.
//
// For every match, the first non-empty submatch among groups 1, 3 and 5
// (checked in that order) contributes one token. Groups 1 and 3 are kept
// verbatim, while group 5 is lowercased. Presumably groups 1 and 3 are the
// single- and double-quoted alternatives and group 5 the plain word
// alternative; verify against searchTokenRegex. Matches in which all three
// groups are empty contribute nothing, and the result is nil when no token is
// produced.
func SearchStringToNormalizedTokens(input string) []string {
	var tokens []string

	for _, groups := range searchTokenRegex.FindAllStringSubmatch(input, -1) {
		switch {
		case groups[1] != "":
			tokens = append(tokens, groups[1])
		case groups[3] != "":
			tokens = append(tokens, groups[3])
		case groups[5] != "":
			tokens = append(tokens, strings.ToLower(groups[5]))
		}
	}

	return tokens
}
// NormalizeSearchString tokenizes input with SearchStringToNormalizedTokens
// and joins the resulting tokens with single spaces. It returns the empty
// string when no token is produced.
func NormalizeSearchString(input string) string {
	tokens := SearchStringToNormalizedTokens(input)

	return strings.Join(tokens, " ")
}