/
basically.go
62 lines (53 loc) · 2.21 KB
/
basically.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
package basically
// Document is the top-level abstraction over a single piece of text. It is
// the entry point for both summarizing that text and extracting keywords
// from it.
type Document interface {
	// Summarize returns the sentences chosen for the summary, or an error.
	//
	// NOTE(review): length presumably bounds the number of sentences
	// returned, threshold presumably acts as a similarity cut-off, and focus
	// presumably biases ranking toward the given text -- confirm against the
	// implementation.
	Summarize(length int, threshold float64, focus string) ([]*Sentence, error)

	// Highlight returns the keywords extracted from the text, or an error.
	//
	// NOTE(review): length presumably bounds the number of keywords and
	// merge presumably controls whether related keywords are combined --
	// confirm against the implementation.
	Highlight(length int, merge bool) ([]*Keyword, error)

	// Characters reports two integer counts for the text.
	//
	// NOTE(review): what each of the two counts measures is not visible
	// from this declaration -- confirm against the implementation.
	Characters() (int, int)
}
// Parser turns raw document text into sentences and tokens. Implementations
// are also expected to carry out auxiliary work such as part-of-speech
// tagging and sentiment analysis while parsing.
type Parser interface {
	// ParseDocument parses doc and returns its sentences and its tokens, or
	// an error.
	//
	// NOTE(review): the effect of quote is not visible from this
	// declaration -- confirm against the implementation.
	ParseDocument(doc string, quote bool) ([]*Sentence, []*Token, error)
}
// Summarizer picks out the key sentences of a document.
type Summarizer interface {
	// Initialize supplies the summarizer with the sentences to work on, the
	// similarity function, the token filter handed to that function, an
	// optional focus sentence, and a threshold.
	//
	// NOTE(review): how focusString and threshold influence the result is
	// not visible from this declaration -- confirm against the
	// implementation.
	Initialize(
		sents []*Sentence,
		similar Similarity,
		filter TokenFilter,
		focusString *Sentence,
		threshold float64,
	)

	// Rank runs the ranking step; iters is presumably the number of
	// iterations to perform (TODO confirm).
	Rank(iters int)
}
// Highlighter picks out the key words of a document.
type Highlighter interface {
	// Initialize supplies the highlighter with the tokens to work on, the
	// token filter to apply, and a window size.
	//
	// NOTE(review): window is presumably a co-occurrence window measured in
	// tokens -- confirm against the implementation.
	Initialize(tokens []*Token, filter TokenFilter, window int)

	// Rank runs the ranking step; iters is presumably the number of
	// iterations to perform (TODO confirm).
	Rank(iters int)

	// Highlight returns the extracted keywords, or an error.
	//
	// NOTE(review): length presumably bounds the number of keywords and
	// merge presumably controls whether related keywords are combined --
	// confirm against the implementation.
	Highlight(length int, merge bool) ([]*Keyword, error)
}
// TokenFilter is a predicate over a single token. It serves as an
// allow-list or deny-list style filter that is applied to tokens before
// similarity is calculated.
//
// NOTE(review): whether a true result means "keep" or "discard" is not
// visible from this declaration -- confirm against the callers.
type TokenFilter func(tok *Token) bool
// Similarity scores how alike two sentences are. Each sentence is passed as
// its token slice (n1 and n2), and filter is the token filter to apply
// before the score is computed.
type Similarity func(n1 []*Token, n2 []*Token, filter TokenFilter) float64
// Token is one unit of text produced by tokenization, for example a word or
// a punctuation symbol.
type Token struct {
	// Tag is the part-of-speech tag assigned to the token.
	Tag string
	// Text is the token's literal content.
	Text string
	// Order is the token's position within the text.
	Order int
}
// Keyword is a single keyword produced by highlighting a document. It pairs
// the raw word with the weight associated with it.
type Keyword struct {
	// Word is the raw keyword text.
	Word string
	// Weight is the weight of the keyword.
	Weight float64
}
// Sentence is one sentence of the text, together with the data attached to
// it during parsing and ranking.
type Sentence struct {
	// Raw is the sentence as an unprocessed string.
	Raw string
	// Tokens is the sentence in tokenized form.
	Tokens []*Token
	// Sentiment is the sentence's sentiment score.
	Sentiment float64
	// Score is the score (weight) of the sentence.
	Score float64
	// Bias is the bias assigned to the sentence for ranking.
	Bias float64
	// Order is the sentence's position within the text.
	Order int
}