/
document.go
126 lines (108 loc) · 3.14 KB
/
document.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
package prose
// A DocOpt represents a setting that changes the document creation process.
//
// Options are applied in order via NewDocument; each one may mutate the
// Document under construction and/or the effective DocOpts.
//
// For example, it might disable named-entity extraction:
//
//	doc := prose.NewDocument("...", prose.WithExtraction(false))
type DocOpt func(doc *Document, opts *DocOpts)
// DocOpts controls the Document creation process:
type DocOpts struct {
	Extract  bool // If true, include named-entity extraction
	Segment  bool // If true, include segmentation
	Tag      bool // If true, include POS tagging
	Tokenize bool // If true, include tokenization
}
// WithTokenization can enable (the default) or disable tokenization.
func WithTokenization(include bool) DocOpt {
	// Note: tagging and entity extraction both depend on tokenization;
	// NewDocument still tokenizes if either of those stages is enabled.
	return func(_ *Document, opts *DocOpts) { opts.Tokenize = include }
}
// WithTagging can enable (the default) or disable POS tagging.
func WithTagging(include bool) DocOpt {
	return func(_ *Document, opts *DocOpts) { opts.Tag = include }
}
// WithSegmentation can enable (the default) or disable sentence segmentation.
func WithSegmentation(include bool) DocOpt {
	return func(_ *Document, opts *DocOpts) { opts.Segment = include }
}
// WithExtraction can enable (the default) or disable named-entity extraction.
func WithExtraction(include bool) DocOpt {
	return func(_ *Document, opts *DocOpts) { opts.Extract = include }
}
// UsingModel sets the Model used during the Document creation process,
// replacing the default model otherwise chosen by NewDocument.
//
// (The previous comment here was copy-pasted from WithExtraction and
// described entity extraction, which this option does not control.)
func UsingModel(model *Model) DocOpt {
	return func(doc *Document, opts *DocOpts) {
		doc.Model = model
	}
}
// A Document represents a parsed body of text.
type Document struct {
	Model *Model // The model used to process this document.
	Text  string // The raw input text.

	// TODO: Store offsets (begin, end) instead of `text` field.
	entities  []Entity   // Populated only when extraction is enabled.
	sentences []Sentence // Populated only when segmentation is enabled.
	tokens    []*Token   // Populated only when tokenization (or a stage requiring it) is enabled.
}
// Tokens returns `doc`'s tokens, dereferenced into a freshly allocated
// slice of values so callers cannot mutate the document's internal state.
func (doc *Document) Tokens() []Token {
	out := make([]Token, len(doc.tokens))
	for i, tok := range doc.tokens {
		out[i] = *tok
	}
	return out
}
// Sentences returns `doc`'s sentences.
//
// NOTE(review): unlike Tokens, this returns the internal slice directly,
// so callers share backing storage with the Document.
func (doc *Document) Sentences() []Sentence {
	return doc.sentences
}
// Entities returns `doc`'s entities.
//
// NOTE(review): unlike Tokens, this returns the internal slice directly,
// so callers share backing storage with the Document.
func (doc *Document) Entities() []Entity {
	return doc.entities
}
// defaultOpts enables every pipeline stage; DocOpt options passed to
// NewDocument selectively turn stages off (or replace the model).
var defaultOpts = DocOpts{
	Tokenize: true,
	Segment:  true,
	Tag:      true,
	Extract:  true,
}
// NewDocument creates a Document according to the user-specified options.
//
// For example,
//
//	doc := prose.NewDocument("...")
//
// Stages run in dependency order: segmentation, tokenization, tagging,
// extraction. Tokenization is forced on whenever tagging or extraction is
// requested, since both operate on tokens.
func NewDocument(text string, opts ...DocOpt) (*Document, error) {
	doc := Document{Text: text}

	// Start from the all-stages-on defaults and let each option adjust them.
	base := defaultOpts
	for _, applyOpt := range opts {
		applyOpt(&doc, &base)
	}

	// Fall back to the default model unless UsingModel supplied one.
	if doc.Model == nil {
		doc.Model = DefaultModel(base.Tag, base.Extract)
	}

	if base.Segment {
		segmenter := NewPunktSentenceTokenizer()
		doc.sentences = segmenter.Segment(text)
	}
	if base.Tokenize || base.Tag || base.Extract {
		tokenizer := NewIterTokenizer()
		doc.tokens = append(doc.tokens, tokenizer.Tokenize(text)...)
	}
	if base.Tag || base.Extract {
		doc.tokens = doc.Model.Tagger.Tag(doc.tokens)
	}
	if base.Extract {
		doc.tokens = doc.Model.extracter.classify(doc.tokens)
		doc.entities = doc.Model.extracter.chunk(doc.tokens)
	}

	// No pipeline stage above reports an error (the original declared a
	// `pipeError` that was never assigned), so the error result is always
	// nil; the (*Document, error) signature is kept for API compatibility.
	return &doc, nil
}