/
mspm.go
82 lines (67 loc) · 1.85 KB
/
mspm.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
// Package mspm provides a model that holds a trie of the patterns (terms) to be searched for in a document.
package mspm
import (
"bufio"
"io"
"io/ioutil"
"github.com/BlackRabbitt/mspm/ds/trie"
)
// M is an mspm model: a named trie of the patterns to search for in documents.
// Create one with NewModel, load patterns with Build and query it with
// MultiTermMatch.
type M struct {
Name string // Name is the caller-chosen label of this model.
// trieNode is the node every match attempt starts from; NewModel creates it
// and Build inserts the patterns into it.
trieNode *trie.HashNode
}
// Output is the result type returned by MultiTermMatch.
// Key (string)  - a term found in the document.
// Value (int32) - how many times that term was counted in the document.
type Output map[string]int32
// NewModel creates a model labelled name whose trie is a freshly created
// trie.NewHashNode. Call Build on the result to load terms into it.
func NewModel(name string) *M {
	model := new(M)
	model.Name = name
	model.trieNode = trie.NewHashNode()
	return model
}
// Build loads terms into the model's trie. words is read line by line and
// every line, without its line terminator, is inserted as one term.
//
// NOTE(review): the scanner's error (scanner.Err) is never inspected, so a
// read failure or an over-long line silently ends the load early.
func (model *M) Build(words io.Reader) {
	root := model.trieNode
	for lines := bufio.NewScanner(words); lines.Scan(); {
		root.Insert(lines.Bytes())
	}
}
// MultiTermMatch reads the whole document and counts the terms of the model's
// trie that occur in it.
//
// The document is scanned left to right, byte by byte. At every position the
// longest term starting there is taken, counted, and scanning resumes right
// after it, so counted occurrences never overlap. When no term starts at a
// position the scan advances by a single byte, so a term that begins inside a
// failed partial match is still found, and a term that ends exactly at the end
// of the document is counted as well.
//
// The returned Output maps each term found to its number of occurrences; it is
// empty, but never nil, when nothing matches. A non-nil error is the error
// reported while reading document and is returned together with an empty
// Output.
//
// Each start position walks at most as deep as the trie, so the cost is
// bounded by len(document) times the trie depth.
func (model *M) MultiTermMatch(document io.Reader) (output Output, err error) {
	output = make(Output)

	content, err := ioutil.ReadAll(document)
	if err != nil {
		return output, err
	}

	root := model.trieNode
	for start := 0; start < len(content); {
		// matchEnd is the exclusive end of the longest term found so far that
		// begins at start; -1 means no term begins there.
		matchEnd := -1
		node := root
		for i := start; i < len(content); i++ {
			next := node.Children[content[i]]
			if next == nil {
				break
			}
			node = next
			if !node.FinalState {
				continue
			}
			matchEnd = i + 1
			if len(node.Children) == 0 {
				// Leaf: no longer term can extend this one.
				break
			}
		}

		if matchEnd < 0 {
			// Nothing starts here; retry from the very next byte so that no
			// candidate start position is skipped.
			start++
			continue
		}
		output[string(content[start:matchEnd])]++
		start = matchEnd
	}
	return output, nil
}