/
hyphen.go
173 lines (153 loc) · 4.36 KB
/
hyphen.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// Package hyphen hyphenates text using existing Hunspell hyphenation dictionaries.
//
// This is a port of https://github.com/Kozea/Pyphen
package hyphen
import (
"strings"
"sync"
"unicode"
"github.com/benoitkugler/textlayout/language"
)
var (
// dictionariesCache stores the dictionaries that have already been parsed,
// keyed by dictionary file name, so NewHyphener parses each file at most once.
dictionariesCache = map[string]hyphDicReference{}
// dictionariesCacheLock guards every access to dictionariesCache.
dictionariesCacheLock sync.Mutex
)
// Hyphener computes the hyphenation points of words from one hyphenation
// dictionary. Use NewHyphener to build one.
type Hyphener struct {
// hd holds the dictionary patterns together with a cache of the
// positions already computed for a word.
hd hyphDic
// left and right are the minimum number of runes that must remain
// before, respectively after, a hyphenation point; points closer to
// an edge of the word are discarded.
left, right int
}
// NewHyphener returns a Hyphener using the dictionary selected for `lang`
// (after applying LanguageFallback).
//
// `left` and `right` are the minimum number of runes to keep before and
// after a hyphenation point.
//
// Parsed dictionaries are kept in a package-level cache protected by a
// mutex, so a given dictionary file is parsed only once.
func NewHyphener(lang language.Language, left, right int) Hyphener {
	filename := languages[LanguageFallback(lang)]

	dictionariesCacheLock.Lock()
	defer dictionariesCacheLock.Unlock()

	reference, cached := dictionariesCache[filename]
	if !cached {
		// The error is deliberately ignored: the test suite asserts that
		// parsing the bundled dictionaries does not fail.
		reference, _ = parseHyphDic(dictionaries, filename)
		dictionariesCache[filename] = reference
	}

	return Hyphener{
		hd: hyphDic{
			cache: make(map[string][]dataOrInt),
			data:  reference,
		},
		left:  left,
		right: right,
	}
}
// positions returns the points where `word` can be hyphenated, as computed
// by `hyphDic.positions`, after dropping the points that leave fewer than
// `h.left` runes before them or fewer than `h.right` runes after them.
func (h Hyphener) positions(word []rune) []dataOrInt {
	var kept []dataOrInt
	last := len(word) - h.right // greatest index still accepted
	for _, point := range h.hd.positions(word) {
		if point.V < h.left || point.V > last {
			continue // too close to one of the word edges
		}
		kept = append(kept, point)
	}
	return kept
}
// Iterate returns every hyphenation possibility for `word`, the longest
// first. Each element of the returned slice is the start of the word, up
// to the hyphenation point.
//
// For a nonstandard hyphenation, the start is cut at the point shifted by
// the rule's Index and the first part of the rule's change is appended to
// it, upper-cased when every rune of `word` is upper case.
func (h Hyphener) Iterate(word string) []string {
	runes := []rune(word)
	points := h.positions(runes)
	starts := make([]string, 0, len(points))

	// true when no rune of word fails unicode.IsUpper
	allUpper := true
	for _, r := range word {
		if !unicode.IsUpper(r) {
			allUpper = false
			break
		}
	}

	// walk the points backwards so that the longest start comes first
	for i := len(points) - 1; i >= 0; i-- {
		point := points[i]
		cut, tail := point.V, ""
		if rule := point.Data; rule != nil { // nonstandard hyphenation
			cut += rule.Index
			tail = rule.Changes[0]
			if allUpper {
				tail = strings.ToUpper(tail)
			}
		}
		starts = append(starts, string(runes[:cut])+tail)
	}
	return starts
}
// hyphDicReference is the parsed content of a hyphenation dictionary.
// NewHyphener stores it in the package-level cache and copies it into
// each Hyphener it returns.
type hyphDicReference struct {
// Patterns is looked up with the substrings of the lower-cased word
// surrounded by dots (see hyphDic.positions).
Patterns map[string]pattern
// MaxLength bounds the length of the substrings looked up in Patterns.
MaxLength int // in runes
}
// hyphDic pairs the data of a dictionary with a cache of the positions
// already computed from it.
type hyphDic struct {
// cache maps a lower-cased word to the positions computed for it.
cache map[string][]dataOrInt
// data holds the patterns used to compute the positions.
data hyphDicReference
}
// positions returns the list of positions where the word can be hyphenated.
//
// E.g. for the Dutch word 'lettergrepen' this method returns `[3, 6, 9]`.
//
// Each position is a [dataOrInt]: if its Data field is not nil, it
// contains information about nonstandard hyphenation at that point.
//
// The word is lower-cased before the lookup. Results are stored in
// `dic.cache`, keyed by the lower-cased word, and later calls return the
// cached slice itself, so callers must not modify it. When `dic.cache` is
// nil (zero-value hyphDic) the result is simply not cached, instead of
// panicking on a write to a nil map.
func (dic hyphDic) positions(word_ []rune) []dataOrInt {
	word := strings.ToLower(string(word_))
	if points, ok := dic.cache[word]; ok {
		return points
	}

	// the dots mark the word boundaries, which patterns may refer to
	pointedWord := []rune("." + word + ".")
	// references[i] accumulates the greatest value found by any matching
	// pattern for the i-th inter-letter position of pointedWord
	references := make([]dataOrInt, len(pointedWord)+1)

	for i := 0; i < len(pointedWord)-1; i++ {
		// try every substring starting at i, up to MaxLength runes
		end := i + dic.data.MaxLength
		if end > len(pointedWord) {
			end = len(pointedWord)
		}
		for j := i + 1; j <= end; j++ {
			pat, ok := dic.data.Patterns[string(pointedWord[i:j])]
			if !ok {
				continue
			}
			// element-wise maximum of the pattern values and the
			// references they apply to
			first := i + pat.Start
			window := references[first : first+len(pat.Values)]
			for k := range window {
				if pat.Values[k].V > window[k].V {
					window[k] = pat.Values[k]
				}
			}
		}
	}

	// an odd value marks a hyphenation point; i-1 accounts for the leading dot
	var points []dataOrInt
	for i, reference := range references {
		if reference.V%2 != 0 {
			points = append(points, dataOrInt{V: i - 1, Data: reference.Data})
		}
	}

	if dic.cache != nil { // assigning to a nil map would panic
		dic.cache[word] = points
	}
	return points
}
// pattern is the value associated with one entry of hyphDicReference.Patterns.
type pattern struct {
// Values are merged element by element into the references of the
// word being hyphenated, keeping the entry with the greater V.
Values []dataOrInt
// Start is the offset, counted from the beginning of the matched
// substring, of the reference that Values[0] applies to.
Start int
}
// dataOrInt is an integer, optionally carrying nonstandard hyphenation data.
// It is used both for the values of a pattern and for the positions
// returned by hyphDic.positions, where V is an index in runes into the word.
type dataOrInt struct {
Data *complexHyphenation // optional
V int
}
// complexHyphenation stores information about nonstandard hyphenation at a point.
type complexHyphenation struct {
// the two parts of a change like `'ff=f'`, that describes how
// hyphenation should take place. Changes[0] is appended to the start
// of the word by Hyphener.Iterate.
// NOTE(review): Changes[1] is presumably the text beginning the
// remainder of the word; it is not used in this file — confirm.
Changes [2]string
// where to substitute the change, counting from the current point
Index int
// how many characters to remove while substituting the nonstandard
// hyphenation
Cut int
}
// LanguageFallback returns the first language of the inheritance chain of
// `lang` for which a dictionary is available, or the empty language when
// there is none.
//
// The chain follows the normal truncation inheritance, see
// http://www.unicode.org/reports/tr35/#Locale_Inheritance
//
// This function needs aliases including scripts for languages with
// multiple regions available.
func LanguageFallback(lang language.Language) language.Language {
	for _, candidate := range lang.SimpleInheritance() {
		_, known := languages[candidate]
		if !known {
			continue
		}
		return candidate
	}
	return ""
}