-
Notifications
You must be signed in to change notification settings - Fork 0
/
slug.go
196 lines (180 loc) · 6.03 KB
/
slug.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
package slug
import (
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
"html"
"regexp"
"unicode"
)
// Slugify a string. We attempt to do this so as to produce slugs
// that are as identical as possible to the (Ruby) Mongoid slug
// package. That package actually uses another package, Stringex
// (https://github.com/rsl/stringex) to generate the slug strings.
// Stringex, in turn, provides a special adapter for Mongoid. We
// use Stringex as the reference implementation without the
// exclusion, limit or downcase options.
//
// def to_url(options = {})
// return self if options[:exclude] && options[:exclude].include?(self)
// options = stringex_default_options.merge(options)
// whitespace_replacement_token = options[:replace_whitespace_with]
// dummy = remove_formatting(options).
// replace_whitespace(whitespace_replacement_token).
// collapse(whitespace_replacement_token).
// limit(options[:limit], options[:truncate_words], whitespace_replacement_token)
// dummy.downcase! unless options[:force_downcase] == false
// dummy
// end
//
// This method makes use of the following routine to normalize all
// kinds of things. We don't perform all of these normalizations.
//
// def remove_formatting(options = {})
// strip_html_tags.
// convert_smart_punctuation.
// convert_accented_html_entities.
// convert_vulgar_fractions.
// convert_unreadable_control_characters.
// convert_miscellaneous_html_entities.
// convert_miscellaneous_characters(options).
// to_ascii.
// # NOTE: String#to_ascii may convert some Unicode characters to ascii we'd already transliterated
// # so we need to do it again just to be safe
// convert_miscellaneous_characters(options).
// collapse
// end
//
func Slugify(s string) string {
	// Words in the generated slug are joined with a hyphen.
	const separator = "-"
	return slugify(s, separator)
}
// slugify builds a lowercase slug from s, joining its words with w.
//
// s is first passed through StripHTMLTags (defined elsewhere in the package),
// has its HTML entities unescaped and its control characters removed, has
// diacritics folded to their base letters, and has currency amounts and
// selected symbols spelled out as words. The cleaned text is then scanned rune
// by rune:
//
//   - letters and numbers are lowercased and kept;
//   - any run of whitespace and/or runes equal to w that sits between two
//     kept runes collapses into a single w;
//   - every other rune is dropped without producing a separator.
//
// A separator is never emitted before the first kept rune or after the last.
func slugify(s, w string) string {
	s = StripHTMLTags(s)
	s = html.UnescapeString(s)
	s = stripControlCharacters(s)
	s = normalizeDiacritics(s)
	s = convertCurrenciesToWords(s)
	s = convertSymbolsToWords(s)

	// Accumulate into a byte buffer; appending to a string inside the loop
	// would copy the whole slug for every kept rune.
	out := make([]byte, 0, len(s))

	// pending records that whitespace or a literal w has been seen since the
	// last kept rune. One flag covers both: tracking them separately left one
	// of the two set after a separator was written, which then injected a
	// stray separator between the next two letters (in this loop,
	// "foo - bar" came out as "foo-b-ar").
	pending := false
	for _, r := range s {
		switch {
		case unicode.IsSpace(r), string(r) == w:
			pending = true
		case unicode.IsLetter(r), unicode.IsNumber(r):
			if pending {
				// Only join words; never lead with a separator.
				if len(out) > 0 {
					out = append(out, w...)
				}
				pending = false
			}
			out = append(out, string(unicode.ToLower(r))...)
		}
	}
	return string(out)
}
// isMn reports whether r belongs to the Unicode category Mn (nonspacing
// marks).
func isMn(r rune) bool {
	return unicode.In(r, unicode.Mn)
}
// normalizeDiacritics converts characters with diacritical marks to their
// unaccented/base counterparts. If the transformation reports an error, s is
// returned unchanged. See also:
// http://stackoverflow.com/questions/26722450/remove-diacritics-using-go
func normalizeDiacritics(s string) string {
	// Decompose (NFD) so accents become separate combining marks, drop the
	// nonspacing marks, then recompose (NFC) whatever is left.
	t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
	result, _, err := transform.String(t, s)
	if err != nil {
		// On error the result only covers a prefix of s; keeping the
		// original text is safer than silently truncating it.
		return s
	}
	return result
}
// smartPunct maps each "smart" punctuation rune to its plain ASCII
// counterpart: double-quote variants to ", single-quote variants to ', and
// the horizontal ellipsis to three full stops.
var smartPunct = map[rune]string{
	// Double quotation marks.
	'«': `"`,
	'»': `"`,
	'“': `"`,
	'”': `"`,
	'„': `"`,
	'‟': `"`,
	'❝': `"`,
	'❞': `"`,
	'〝': `"`,
	'〞': `"`,
	'〟': `"`,
	'"': `"`,

	// Single quotation marks.
	'‘': `'`,
	'’': `'`,
	'‚': `'`,
	'‛': `'`,
	'‹': `'`,
	'›': `'`,
	'❛': `'`,
	'❜': `'`,

	// Ellipsis.
	'…': `...`,
}
// normalizeSmartPunctuation replaces every rune of s that has an entry in
// smartPunct (typographic double and single quotation marks and the
// horizontal ellipsis) with its ASCII replacement. All other runes are copied
// through unchanged.
func normalizeSmartPunctuation(s string) string {
	// Accumulate into a byte buffer; appending to a string inside the loop
	// would copy the whole result for every rune.
	out := make([]byte, 0, len(s))
	for _, r := range s {
		if plain, ok := smartPunct[r]; ok {
			out = append(out, plain...)
			continue
		}
		out = append(out, string(r)...)
	}
	return string(out)
}
// stripControlCharacters returns s with every rune for which
// unicode.IsControl reports true removed. All other runes are kept in their
// original order.
func stripControlCharacters(s string) string {
	// Accumulate into a byte buffer; appending to a string inside the loop
	// would copy the whole result for every kept rune.
	out := make([]byte, 0, len(s))
	for _, r := range s {
		if unicode.IsControl(r) {
			continue
		}
		out = append(out, string(r)...)
	}
	return string(out)
}
// regexpReplace pairs a compiled regular expression with the replacement
// text that is substituted for each of its matches.
type regexpReplace struct {
	// Expr is the pattern to search for.
	Expr *regexp.Regexp
	// Replace is the replacement template handed to Expr.ReplaceAllString.
	Replace string
}
// termSymbols lists, in application order, the patterns used to spell out
// selected symbols as English words. Most patterns absorb the whitespace
// around the symbol and pad the replacement word with spaces. The ellipsis
// rule precedes the single-dot rule, so runs of three or more dots are
// consumed first. (Running one regular expression per symbol is a fairly
// expensive way to do this.)
var termSymbols = []regexpReplace{
	{Expr: regexp.MustCompile(`\s*&\s*`), Replace: " and "},
	{Expr: regexp.MustCompile(`\s*@\s*`), Replace: " at "},
	{Expr: regexp.MustCompile(`\s*º\s*`), Replace: " degrees "},
	{Expr: regexp.MustCompile(`\s*°\s*`), Replace: " degrees "},
	{Expr: regexp.MustCompile(`\s*÷\s*`), Replace: " divided by "},
	{Expr: regexp.MustCompile(`\s*\.{3,}\s*`), Replace: " ellipsis "},
	{Expr: regexp.MustCompile(`(\S|^)\.(\S)`), Replace: "$1 dot $2"},
	{Expr: regexp.MustCompile(`\s*=\s*`), Replace: " equals "},
	{Expr: regexp.MustCompile(`\s*%\s*`), Replace: " percent "},
	{Expr: regexp.MustCompile(`\s*(\\|\/|/)\s*`), Replace: " slash "},
	{Expr: regexp.MustCompile(`\s*\*\s*`), Replace: " star "},
}
// convertSymbolsToWords rewrites s by applying every termSymbols rule in
// order, replacing certain special symbols with their word counterparts.
func convertSymbolsToWords(s string) string {
	out := s
	for i := range termSymbols {
		rule := &termSymbols[i]
		out = rule.Expr.ReplaceAllString(out, rule.Replace)
	}
	return out
}
// currencySymbols lists, in application order, the patterns used to spell out
// currency amounts as words. An amount is only recognised when the currency
// symbol is preceded by whitespace or the start of the string and the digits
// are followed by whitespace or the end of the string. (Running one regular
// expression per form is a fairly expensive way to do this.)
var currencySymbols = []regexpReplace{
	{Expr: regexp.MustCompile(`(?:\s|^)€(\d+)(?:\s|$)`), Replace: " $1 euros "},
	{Expr: regexp.MustCompile(`(?:\s|^)€(\d+)\.(\d+)(?:\s|$)`), Replace: " $1 euros $2 cents "},
	{Expr: regexp.MustCompile(`(?:\s|^)\$(\d+)(?:\s|$)`), Replace: " $1 dollars "},
	{Expr: regexp.MustCompile(`(?:\s|^)\$(\d+)\.(\d+)(?:\s|$)`), Replace: " $1 dollars $2 cents "},
	{Expr: regexp.MustCompile(`(?:\s|^)£(\d+)(?:\s|$)`), Replace: " $1 pounds "},
	{Expr: regexp.MustCompile(`(?:\s|^)£(\d+)\.(\d+)(?:\s|$)`), Replace: " $1 pounds $2 pence "},
	{Expr: regexp.MustCompile(`(?:\s|^)¥(\d+)(?:\s|$)`), Replace: " $1 yen "},
	{Expr: regexp.MustCompile(`(?:\s|^)R\$(\d+)(?:\s|$)`), Replace: " $1 reais "},
	{Expr: regexp.MustCompile(`(?:\s|^)R\$(\d+)\.(\d+)(?:\s|$)`), Replace: " $1 reais $2 cents "},
}
// convertCurrenciesToWords rewrites s by applying every currencySymbols rule
// in order, replacing currency amounts with their spelled-out form.
func convertCurrenciesToWords(s string) string {
	for i := 0; i < len(currencySymbols); i++ {
		s = currencySymbols[i].Expr.ReplaceAllString(s, currencySymbols[i].Replace)
	}
	return s
}