-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.go
65 lines (58 loc) · 1.4 KB
/
utils.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
package segment
import (
"regexp"
"strings"
)
// Patterns consulted by the filter that init registers through AddFilter.
var (
	// reHanExists matches a run of one or more Han-script characters
	// anywhere in the input (the pattern is not anchored).
	reHanExists = regexp.MustCompile(`\p{Han}+`)

	// reNoWords matches a whole token that either starts with one or more
	// digits followed by any number of word characters, or consists of word
	// characters and ends with one or more digits (e.g. "123", "123abc",
	// "abc123").
	reNoWords = regexp.MustCompile(`^([\d]+[\w]*|[\w]+[\d]+)$`)
)

// Dictionary selection state.
var (
	// DictFile names the segmentation dictionary file.
	DictFile string

	// lastLoadedDictFile records the DictFile value for which SplitWords or
	// SplitWordsBy last triggered ReloadDict.
	lastLoadedDictFile string
)
// init registers the package's built-in token filter via AddFilter.
//
// The filter returns false for tokens that should be ignored and true
// otherwise:
//   - a token containing Han characters is accepted only when it is at least
//     6 bytes long;
//   - any other token is rejected when it matches reNoWords;
//   - everything else is accepted.
func init() {
	AddFilter(func(token string) bool {
		if reHanExists.MatchString(token) {
			// Ignore single characters: the check is on byte length, so
			// anything shorter than 6 bytes (less than two 3-byte UTF-8
			// characters) is dropped.
			return len(token) >= 6
		}
		return !reNoWords.MatchString(token)
	})
}
// SplitWords segments b into words.
//
// When DictFile is non-empty and differs from the value recorded in
// lastLoadedDictFile, the dictionary is reloaded through ReloadDict before
// segmenting. The input is converted to a string and passed, together with
// args, to Segment on the value returned by Default().
func SplitWords(b []byte, args ...string) []string {
	if DictFile != "" && DictFile != lastLoadedDictFile {
		lastLoadedDictFile = DictFile
		// NOTE(review): the ReloadDict error is discarded, and the file is
		// already recorded as loaded, so this function will not retry a
		// failed load for the same DictFile value.
		_ = ReloadDict()
	}
	seg := Default()
	return seg.Segment(string(b), args...)
}
// SplitWordsBy segments b into words using the given mode.
//
// When DictFile is non-empty and differs from the value recorded in
// lastLoadedDictFile, the dictionary is reloaded through ReloadDict before
// segmenting. The input is converted to a string and passed, together with
// mode and args, to SegmentBy on the value returned by Default().
func SplitWordsBy(b []byte, mode string, args ...string) []string {
	if DictFile != "" && DictFile != lastLoadedDictFile {
		lastLoadedDictFile = DictFile
		// NOTE(review): the ReloadDict error is discarded, and the file is
		// already recorded as loaded, so this function will not retry a
		// failed load for the same DictFile value.
		_ = ReloadDict()
	}
	seg := Default()
	return seg.SegmentBy(string(b), mode, args...)
}
// ReloadDict reloads the dictionary by calling LoadDict on the value returned
// by Default().
//
// If dictFiles is non-empty, its first element is the file that gets loaded
// and any further elements are ignored; otherwise the package-level DictFile
// is used. The error from LoadDict is returned unchanged.
func ReloadDict(dictFiles ...string) error {
	var path string
	if len(dictFiles) == 0 {
		path = DictFile
	} else {
		path = dictFiles[0]
	}
	return Default().LoadDict(path)
}
// SplitWordsAsString segments b with SplitWords and returns the resulting
// words joined by single spaces, with leading and trailing whitespace
// trimmed. args are forwarded to SplitWords unchanged.
func SplitWordsAsString(b []byte, args ...string) string {
	return strings.TrimSpace(strings.Join(SplitWords(b, args...), " "))
}