forked from go-enry/go-license-detector
/
normalize.go
195 lines (170 loc) · 6.42 KB
/
normalize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
package normalize
import (
"bytes"
"regexp"
"strings"
"unicode"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
// Package-level patterns and replacers used by LicenseText, Relax and Split.
// They are compiled once at package initialization. The numbered comments refer
// to sections of the SPDX license matching guidelines.
var (
	// lineEndingsRe matches "\r\n" and a lone "\r".
	lineEndingsRe = regexp.MustCompile(`\r\n?`)
	// 3.1.1 All whitespace should be treated as a single blank space.
	// NOTE(review): the last character of the class below renders as a plain space;
	// confirm whether a non-breaking or other Unicode space was intended there.
	whitespaceRe         = regexp.MustCompile(`[ \t\f\r ]+`)
	trailingWhitespaceRe = regexp.MustCompile(`(?m)[ \t\f\r ]$`)
	// licenseHeaderRe matches "license"/"licence", an optional dot and two newlines.
	licenseHeaderRe = regexp.MustCompile(`(licen[cs]e)\.?\n\n`)
	// leadingWhitespaceRe matches a space at the start of a line (together with the
	// newline right after it, if any) or an empty line.
	leadingWhitespaceRe = regexp.MustCompile(`(?m)^(( \n?)|\n)`)
	// 5.1.2 Hyphens, Dashes Any hyphen, dash, en dash, em dash, or other variation should be
	// considered equivalent.
	punctuationRe = regexp.MustCompile(`[-‒–—―⁓⸺⸻~˗‐‑⁃⁻₋−∼⎯⏤─➖𐆑֊﹘﹣-]+`)
	// 5.1.3 Quotes Any variation of quotations (single, double, curly, etc.) should be considered
	// equivalent.
	quotesRe = regexp.MustCompile(`["'“”‘’„‚«»‹›❛❜❝❞\x60]+`)
	// 7.1.1 Where a line starts with a bullet, number, letter, or some form of a list item
	// (determined where list item is followed by a space, then the text of the sentence), ignore
	// the list item for matching purposes.
	bulletRe = regexp.MustCompile(`(?m)^(([-*✱﹡•●⚫⏺🞄∙⋅])|([(\[{]?\d+[.)\]}] ?)|([(\[{]?[a-z][.)\]}] ?)|([(\[{]?i+[.)\]} ] ?))`)
	// 8.1.1 The words in the following columns are considered equivalent and interchangeable.
	//
	// strings.Replacer makes a single pass and, at each position, tries the old
	// strings in argument order. A key that is a prefix of its own replacement
	// would therefore corrupt text that is already in canonical form
	// ("fulfill" -> "fulfilll"), so such keys are preceded by an identity pair
	// for the canonical spelling.
	wordReplacer = strings.NewReplacer(
		"acknowledgment", "acknowledgement",
		"analogue", "analog",
		"analyse", "analyze",
		"artefact", "artifact",
		"authorisation", "authorization",
		"authorised", "authorized",
		"calibre", "caliber",
		"cancelled", "canceled",
		"capitalisations", "capitalizations",
		"catalogue", "catalog",
		"categorise", "categorize",
		"centre", "center",
		"emphasised", "emphasized",
		"favour", "favor",
		"favourite", "favorite",
		// identity guard: keeps "fulfill"/"fulfillment" intact
		"fulfill", "fulfill",
		"fulfil", "fulfill",
		"fulfilment", "fulfillment",
		"initialise", "initialize",
		"judgment", "judgement",
		"labelling", "labeling",
		"labour", "labor",
		"licence", "license",
		"maximise", "maximize",
		"modelled", "modeled",
		"modelling", "modeling",
		"offence", "offense",
		"optimise", "optimize",
		"organisation", "organization",
		"organise", "organize",
		"practise", "practice",
		"programme", "program",
		"realise", "realize",
		"recognise", "recognize",
		"signalling", "signaling",
		// all three spellings end up as "sublicense"; replacements are not
		// re-scanned, so "sub license" has to map to the final form directly
		"sub-license", "sublicense",
		"sub license", "sublicense",
		"utilisation", "utilization",
		"whilst", "while",
		// identity guard: keeps "wilfull"/"wilfully" intact
		"wilfull", "wilfull",
		"wilful", "wilfull",
		"non-commercial", "noncommercial",
		"per cent", "percent",
		"copyright owner", "copyright",
	)
	// 9.1.1 "©", "(c)", or "Copyright" should be considered equivalent and interchangeable.
	copyrightRe = regexp.MustCompile(`copyright|\(c\)`)
	trademarkRe = regexp.MustCompile(`trademark(s?)|\(tm\)`)
	// extra cleanup
	brokenLinkRe = regexp.MustCompile(`http s ://`)
	// urlCleanupRe matches an http(s) URL wrapped in <> or (); group 1 is the bare URL.
	urlCleanupRe = regexp.MustCompile(`[<(](http(s?)://[^\s]+)[)>]`)
	// copyrightLineRe matches whole lines that start with "©", or consist only of
	// "all rights reserved" (optional dot) or a spelling of "license".
	copyrightLineRe = regexp.MustCompile(`(?m)^((©.*)|(all rights reserved(\.)?)|(li[cs]en[cs]e))\n`)
	// nonAlphaNumRe matches everything except "-", space, newline, a-z and 0-9.
	nonAlphaNumRe = regexp.MustCompile(`[^- \na-z0-9]`)
	// used in Split()
	splitRe = regexp.MustCompile(`\n\s*[^a-zA-Z0-9_,()]{3,}\s*\n`)
)
// Strictness selects how aggressively LicenseText normalizes its input. Levels are
// ordered: a larger value means more normalization. The defined levels are
// `Enforced`, `Moderate` and `Relaxed`.
type Strictness int

// Normalization levels, from least to most aggressive.
const (
	// Enforced applies only the official SPDX matching guidelines.
	Enforced Strictness = iota
	// Moderate is Enforced plus the removal of dots and of copyright lines.
	Moderate
	// Relaxed is Moderate plus Unicode normalization and the removal of all
	// non-alphanumeric characters.
	Relaxed
)
// LicenseText normalizes a license text so that it can be analyzed and compared.
// The steps follow the SPDX matching guidelines at
// https://spdx.org/spdx-license-list/matching-guidelines
//
// strictness controls how far the normalization goes:
//   - Enforced: only the guideline steps and URL cleanup are applied.
//   - Moderate: additionally all dots and the lines matched by copyrightLineRe are removed.
//   - Relaxed: the Moderate result is passed through Relax and returned as is. On
//     this path the license header placeholder is not stripped from the result.
func LicenseText(text string, strictness Strictness) string {
	// sub runs a single regexp substitution over the text being normalized.
	sub := func(re *regexp.Regexp, repl string) {
		text = re.ReplaceAllString(text, repl)
	}

	// Line endings: "\r\n" and lone "\r" become "\n".
	sub(lineEndingsRe, "\n")

	// 4. Capitalization
	text = strings.ToLower(text)

	// 3. Whitespace
	sub(whitespaceRe, " ")
	sub(trailingWhitespaceRe, "")
	// Swap the empty line after "license" for a placeholder line; the placeholder
	// is stripped again right before the final return. Presumably this shields
	// the header from the empty-line removal below - confirm.
	sub(licenseHeaderRe, "$1\nthisislikelyalicenseheaderplaceholder\n")
	sub(leadingWhitespaceRe, "")

	// 5. Punctuation
	sub(punctuationRe, "-")
	sub(quotesRe, "\"")

	// 7. Bullets and Numbering
	sub(bulletRe, "")

	// 8. Varietal Word Spelling
	text = wordReplacer.Replace(text)

	// 9. Copyright Symbol
	sub(copyrightRe, "©")
	sub(trademarkRe, "™")

	// Repair broken URLs found in SPDX source texts, then drop the <> or ()
	// decoration around URLs.
	sub(brokenLinkRe, "https://")
	sub(urlCleanupRe, "$1")

	// Collapse runs of the same non-alphanumeric character into one occurrence.
	var collapsed strings.Builder
	collapsed.Grow(len(text))
	var prev rune
	for _, r := range text {
		if unicode.IsLetter(r) || unicode.IsDigit(r) || r != prev {
			prev = r
			collapsed.WriteRune(r)
		}
	}
	text = collapsed.String()

	if strictness > Enforced {
		// Trailing dots are a common source of mismatches.
		text = strings.Replace(text, ".", "", -1)
		// Copyright lines are usually custom and may occur several times.
		sub(copyrightLineRe, "")
	}

	if strictness > Moderate {
		return Relax(text)
	}

	sub(leadingWhitespaceRe, "")
	return strings.Replace(text, "thisislikelyalicenseheaderplaceholder", "", -1)
}
// Relax applies very aggressive normalization rules to text:
//  1. Unicode decomposition (NFD), removal of all nonspacing marks (unicode.Mn)
//     and recomposition (NFC);
//  2. removal of every character matched by nonAlphaNumRe;
//  3. removal of leading spaces and empty lines (leadingWhitespaceRe);
//  4. replacement of each pair of adjacent spaces with a single space.
//
// It returns the normalized text.
func Relax(text string) string {
	buffer := &bytes.Buffer{}
	writer := transform.NewWriter(
		buffer, transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC))
	// Errors are deliberately ignored: normalization is best effort and whatever
	// has reached the buffer is used.
	_, _ = writer.Write([]byte(text))
	_ = writer.Close()
	text = buffer.String()
	text = nonAlphaNumRe.ReplaceAllString(text, "")
	text = leadingWhitespaceRe.ReplaceAllString(text, "")
	// Removing characters can leave two spaces next to each other ("a . b" ->
	// "a  b"); squeeze such pairs. Replacing a single space with a single space
	// would be a no-op.
	text = strings.Replace(text, "  ", " ", -1)
	return text
}
// Split applies heuristics to split the text into several parts.
//
// The text is cut at every match of splitRe. The returned slice always starts
// with the full text; when at least one cut was made, the resulting pieces
// follow it in order.
func Split(text string) []string {
	pieces := splitRe.Split(text, -1)
	if len(pieces) <= 1 {
		// Nothing to split on: only the full text is returned.
		return []string{text}
	}
	return append([]string{text}, pieces...)
}