-
Notifications
You must be signed in to change notification settings - Fork 88
/
tokenize.go
52 lines (46 loc) · 1.66 KB
/
tokenize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
package keyword
import (
"log/slog"
"regexp"
"strings"
"unicode"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
var (
	// puncChars matches one or more consecutive ASCII punctuation characters.
	// NOTE(review): not referenced anywhere in this chunk — presumably used
	// elsewhere in the package (e.g. a slug/identifier helper); verify before removing.
	puncChars = regexp.MustCompile(`[[:punct:]]+`)
	// nonTokenChars matches runs of characters that are not letters (\pL),
	// numbers (\pN), or whitespace — i.e. anything that cannot be part of a token.
	nonTokenChars = regexp.MustCompile(`[^\pL\pN\s]+`)
)
// TokenizeText splits free-form text in to tokens, including lower-case, unicode normalization, and some unicode folding.
//
// The intent is for this to work similarly to an NLP tokenizer, as might be used in a fulltext search engine, and enable fast matching to a list of known tokens. It might eventually even do stemming, removing pluralization (trailing "s" for English), etc.
func TokenizeText(text string) []string {
	// transform.Chain transformers carry internal state and are not safe for
	// concurrent use, so the chain must be re-constructed on every call to
	// prevent a race condition. NFD decomposes, runes.Remove strips combining
	// marks (unicode.Mn, e.g. accents), NFC re-composes.
	normFunc := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
	// Replace every run of non-letter/non-number/non-whitespace characters
	// with a single space, then lower-case. After this pass the string
	// contains only letters, numbers, and whitespace, so a second
	// ReplaceAllString/ToLower pass (as in the original) is a guaranteed
	// no-op and has been removed.
	split := strings.ToLower(nonTokenChars.ReplaceAllString(text, " "))
	normalized, _, err := transform.String(normFunc, split)
	if err != nil {
		// best-effort: log and fall back to the un-normalized string rather
		// than dropping the text entirely
		slog.Warn("unicode normalization error", "err", err)
		normalized = split
	}
	return strings.Fields(normalized)
}
// splitIdentRune reports whether c should be treated as a separator when
// splitting an identifier: any rune that is neither a letter nor a number.
func splitIdentRune(c rune) bool {
	if unicode.IsLetter(c) {
		return false
	}
	return !unicode.IsNumber(c)
}
// TokenizeIdentifier splits an identifier in to tokens, dropping any
// single-character tokens.
//
// For example, the-handle.bsky.social would be split in to ["the", "handle", "bsky", "social"]
func TokenizeIdentifier(orig string) []string {
	parts := strings.FieldsFunc(orig, splitIdentRune)
	tokens := make([]string, 0, len(parts))
	for _, part := range parts {
		slug := Slugify(part)
		// NOTE: the length check is in bytes, not runes, so a single
		// multi-byte rune passes the filter (same behavior as before)
		if len(slug) <= 1 {
			continue
		}
		tokens = append(tokens, slug)
	}
	return tokens
}