-
-
Notifications
You must be signed in to change notification settings - Fork 5
/
vocabulary.go
47 lines (42 loc) · 1.15 KB
/
vocabulary.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
package model
import (
"github.com/adalkiran/llama-nuts-and-bolts/src/sentencepiece"
)
// Literal piece strings for the special tokens this package refers to by name.
const (
// unknownToken is the piece text NewVocabulary looks up to fill Vocabulary.UnknownId.
unknownToken = "<unk>"
// beginOfSentenceToken is the piece text NewVocabulary looks up to fill Vocabulary.BeginOfSentenceId.
beginOfSentenceToken = "<s>"
// endOfSentenceToken is the piece text NewVocabulary looks up to fill Vocabulary.EndOfSentenceId.
endOfSentenceToken = "</s>"
// whitespaceEscapeToken is the UTF-8 encoding of U+2581 (LOWER ONE EIGHTH BLOCK).
// NOTE(review): presumably the marker SentencePiece substitutes for spaces;
// its use is not visible in this part of the file — confirm against callers.
whitespaceEscapeToken = "\xe2\x96\x81"
// unknownOutputToken is the UTF-8 encoding of U+2585 (LOWER FIVE EIGHTHS BLOCK).
// NOTE(review): presumably the text emitted for an unknown token when decoding;
// its use is not visible in this part of the file — confirm against callers.
unknownOutputToken = "\xe2\x96\x85"
)
// Vocabulary is a bidirectional lookup between token piece text and token ids,
// together with the ids of a few special tokens. Build it with NewVocabulary.
type Vocabulary struct {
// TokenToId maps a piece's text to its id, which is the piece's index in IdToToken.
TokenToId map[string]TokenId
// IdToToken holds the pieces taken from the model proto; the slice index is the token id.
IdToToken []sentencepiece.SentencePiece
// BeginOfSentenceId is the id of the "<s>" piece, or -1 if NewVocabulary did not find it.
BeginOfSentenceId TokenId
// EndOfSentenceId is the id of the "</s>" piece, or -1 if NewVocabulary did not find it.
EndOfSentenceId TokenId
// UnknownId is the id of the "<unk>" piece, or -1 if NewVocabulary did not find it.
UnknownId TokenId
// PadId is initialised to -1 by NewVocabulary, which never looks up a padding piece.
PadId TokenId
}
// NewVocabulary builds a Vocabulary from the pieces of a SentencePiece model.
//
// The returned Vocabulary's IdToToken is the slice pointed to by
// vocabModelProto.Pieces (it is not copied), and TokenToId maps every piece's
// text to its index in that slice. When the same piece text occurs more than
// once, the highest index is the one kept in the map.
//
// UnknownId, BeginOfSentenceId and EndOfSentenceId are resolved by looking up
// "<unk>", "<s>" and "</s>" respectively; each stays -1 when its piece is
// absent. PadId is always -1.
//
// NewVocabulary panics if vocabModelProto or vocabModelProto.Pieces is nil.
func NewVocabulary(vocabModelProto *sentencepiece.ModelProto) *Vocabulary {
	pieces := *vocabModelProto.Pieces

	vocab := &Vocabulary{
		TokenToId:         make(map[string]TokenId, len(pieces)),
		IdToToken:         pieces,
		UnknownId:         -1,
		BeginOfSentenceId: -1,
		EndOfSentenceId:   -1,
		PadId:             -1,
	}

	// A piece's id is simply its position in the model's piece list.
	for idx := range pieces {
		vocab.TokenToId[pieces[idx].Piece] = TokenId(idx)
	}

	// Resolve the special tokens; any that are missing keep their -1 default.
	specials := []struct {
		piece string
		field *TokenId
	}{
		{unknownToken, &vocab.UnknownId},
		{beginOfSentenceToken, &vocab.BeginOfSentenceId},
		{endOfSentenceToken, &vocab.EndOfSentenceId},
	}
	for _, special := range specials {
		if id, found := vocab.TokenToId[special.piece]; found {
			*special.field = id
		}
	}

	return vocab
}