-
Notifications
You must be signed in to change notification settings - Fork 1
/
prototyping.go
300 lines (245 loc) · 7.8 KB
/
prototyping.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
package safelinks
import (
"log"
"net/url"
"regexp"
"strings"
"unicode"
)
// SafeLinksURLRequiredPrefix is the required prefix for all Safe Links URLs.
// The URL pattern matching functions in this file return early with an error
// when the given input does not contain this prefix.
const SafeLinksURLRequiredPrefix = "https://"
// FoundURLPattern is an unvalidated URL pattern match found in given input.
type FoundURLPattern struct {
// Input *string
// startPosition and endPosition are the start and end positions recorded
// for the match by GetURLPatternsUsingIndex (kept for potential debugging).
// The other pattern matching functions in this file do not set them.
startPosition int
endPosition int
// URLPattern is the matched text. It has not been validated as a URL.
URLPattern string
}
// SafeLinkURL contains the encoded and decoded URLs for a matched Safe Link.
type SafeLinkURL struct {
// EncodedURL is the Safe Links URL itself. SafeLinkURLsFromURLs populates
// it from the String method of the parsed URL.
EncodedURL string
// DecodedURL is the original URL carried in the "url" query parameter of
// the Safe Links URL. SafeLinkURLsFromURLs passes that value through
// cleanURL before storing it here.
DecodedURL string
}
// ValidURL reports whether the given input string can be parsed as a URL.
//
// The check is exactly as strict as url.Parse: true is returned whenever
// url.Parse accepts the input without error, false otherwise.
func ValidURL(input string) bool {
	_, parseErr := url.Parse(input)

	return parseErr == nil
}
// ValidSafeLinkURL reports whether the given url.URL is a valid Safe Links
// URL.
//
// The decision is delegated to assertValidURLParameter: the URL is considered
// valid only when that check returns no error.
func ValidSafeLinkURL(input *url.URL) bool {
	return assertValidURLParameter(input) == nil
}
// GetURLPatternsUsingRegex parses the given input and returns a collection of
// FoundURLPattern values.
//
// Since all Safe Links URLs observed in the wild begin with a HTTPS scheme we
// require that all matched URL patterns begin with that protocol scheme. nil
// is returned if no patterns using that scheme are found.
//
// NOTE: Validation is not performed to ensure that matched patterns are valid
// URLs.
//
// Internal logic uses a regular expression to match URL patterns beginning
// with 'https://' and ending with a whitespace character.
func GetURLPatternsUsingRegex(input string) ([]FoundURLPattern, error) {
// urls := make([]url.URL, 0, 5)
urlPatterns := make([]FoundURLPattern, 0, 5)
if !strings.Contains(input, SafeLinksURLRequiredPrefix) {
return nil, ErrNoURLsFound
}
// This works but would match regular http:// prefixes:
//
// https://www.honeybadger.io/blog/a-definitive-guide-to-regular-expressions-in-go/
// urlRegex := `https?://\S+|www\.\S+`
urlRegex := SafeLinksURLRequiredPrefix + `\S+|www\.\S+`
r := regexp.MustCompile(urlRegex)
matches := r.FindAllString(input, -1)
log.Println("Matches:", matches)
for _, match := range matches {
urlPatterns = append(
urlPatterns,
FoundURLPattern{
URLPattern: match,
},
)
}
return urlPatterns, nil
}
// GetURLPatternsUsingIndex parses the given input and returns a collection of
// FoundURLPattern values.
//
// Since all Safe Links URLs observed in the wild begin with a HTTPS scheme we
// require that all matched URL patterns begin with that protocol scheme. nil
// and ErrNoURLsFound are returned if no patterns using that scheme are found.
//
// NOTE: Validation has not been performed to ensure that matched patterns are
// valid URLs.
//
// Internal logic uses slice indexing/iteration to match URL patterns
// beginning with 'https://' and running up to (but not including) the next
// whitespace character or the end of the input. A bare prefix with nothing
// but whitespace (or nothing at all) after it is not reported. The
// startPosition and endPosition fields of each returned value are offsets
// into the original input.
func GetURLPatternsUsingIndex(input string) ([]FoundURLPattern, error) {
	if !strings.Contains(input, SafeLinksURLRequiredPrefix) {
		return nil, ErrNoURLsFound
	}

	urlPatterns := make([]FoundURLPattern, 0, 5)

	// remaining is the not-yet-scanned tail of input; offset is the position
	// within input at which remaining begins.
	remaining := input
	offset := 0

	for {
		urlStart := strings.Index(remaining, SafeLinksURLRequiredPrefix)
		if urlStart == -1 {
			break
		}

		// First position after the prefix. This is always within bounds
		// because the prefix was found at urlStart.
		afterPrefix := urlStart + len(SafeLinksURLRequiredPrefix)

		// The helper takes the full string plus the position to start
		// scanning from and returns an index into that same string.
		urlEnd := getURLIndexEndPosition(remaining, afterPrefix)

		// Only record matches that have at least one character following
		// the prefix.
		if urlEnd > afterPrefix {
			urlPatterns = append(
				urlPatterns,
				FoundURLPattern{
					// recording for later potential debugging
					startPosition: offset + urlStart,
					endPosition:   offset + urlEnd,
					URLPattern:    remaining[urlStart:urlEnd],
				},
			)
		}

		// Abort further processing if the match ran to the end of what is
		// left to scan.
		if urlEnd >= len(remaining) {
			break
		}

		// Otherwise, continue scanning from the end of this match. urlEnd is
		// always past the prefix so the loop is guaranteed to make progress.
		remaining = remaining[urlEnd:]
		offset += urlEnd
	}

	if len(urlPatterns) == 0 {
		return nil, ErrNoURLsFound
	}

	return urlPatterns, nil
}
// GetURLPatternsUsingPrefixMatchingOnFields splits the given input on
// whitespace and returns a FoundURLPattern value for every resulting field
// which begins with the required 'https://' prefix.
//
// All Safe Links URLs observed in the wild use the HTTPS scheme, so fields
// without that prefix are ignored. nil and ErrNoURLsFound are returned if the
// input does not contain the prefix at all or if no field begins with it.
//
// NOTE: The returned patterns have not been validated as URLs.
func GetURLPatternsUsingPrefixMatchingOnFields(input string) ([]FoundURLPattern, error) {
	if !strings.Contains(input, SafeLinksURLRequiredPrefix) {
		return nil, ErrNoURLsFound
	}

	found := make([]FoundURLPattern, 0, 5)

	for _, token := range strings.Fields(input) {
		if !strings.HasPrefix(token, SafeLinksURLRequiredPrefix) {
			continue
		}

		found = append(found, FoundURLPattern{URLPattern: token})
	}

	if len(found) == 0 {
		return nil, ErrNoURLsFound
	}

	return found, nil
}
// URLs parses the given input and returns a collection of *url.URL values.
//
// Since all Safe Links URLs observed in the wild begin with a HTTPS scheme we
// require that all matched URLs begin with that protocol scheme. nil and
// ErrNoURLsFound are returned if no valid URLs using that scheme are found.
//
// Matched patterns which lack the required prefix or which url.Parse rejects
// are skipped rather than treated as fatal; an error from the pattern matcher
// itself is returned as-is.
func URLs(input string) ([]*url.URL, error) {
	// GetURLPatternsUsingIndex and GetURLPatternsUsingPrefixMatchingOnFields
	// are alternative matchers with the same signature.
	urlPatterns, err := GetURLPatternsUsingRegex(input)
	if err != nil {
		return nil, err
	}

	urls := make([]*url.URL, 0, len(urlPatterns))

	for _, pattern := range urlPatterns {
		// Enforce the documented scheme requirement here as well so that the
		// result does not depend on how strict the matcher is.
		if !strings.HasPrefix(pattern.URLPattern, SafeLinksURLRequiredPrefix) {
			continue
		}

		u, parseErr := url.Parse(pattern.URLPattern)
		if parseErr != nil {
			// Best-effort: one malformed pattern should not prevent the
			// remaining patterns from being evaluated.
			continue
		}

		urls = append(urls, u)
	}

	if len(urls) == 0 {
		return nil, ErrNoURLsFound
	}

	return urls, nil
}
// SafeLinkURLsFromURLs filters the given collection of URLs down to those
// which ValidSafeLinkURL accepts and returns a SafeLinkURL value for each.
//
// For every accepted URL the encoded form is the URL's string representation
// and the decoded form is the value of its "url" query parameter after being
// passed through cleanURL. Results are returned in input order and are *not*
// deduplicated. nil and ErrNoSafeLinkURLsFound are returned if no URL is
// accepted.
func SafeLinkURLsFromURLs(urls []*url.URL) ([]SafeLinkURL, error) {
	results := make([]SafeLinkURL, 0, len(urls))

	for i := range urls {
		candidate := urls[i]

		if ValidSafeLinkURL(candidate) {
			embedded := candidate.Query().Get("url")

			results = append(results, SafeLinkURL{
				EncodedURL: candidate.String(),
				DecodedURL: cleanURL(embedded),
			})
		}
	}

	if len(results) > 0 {
		return results, nil
	}

	return nil, ErrNoSafeLinkURLsFound
}
// SafeLinkURLs extracts URLs from the given input via URLs and then hands
// them to SafeLinkURLsFromURLs to obtain the encoded/decoded Safe Links
// pairs. Deduplication is *not* performed.
//
// An error from either step is returned to the caller; this includes the
// case where no valid matches are found.
func SafeLinkURLs(input string) ([]SafeLinkURL, error) {
	parsedURLs, parseErr := URLs(input)
	if parseErr != nil {
		return nil, parseErr
	}

	return SafeLinkURLsFromURLs(parsedURLs)
}
// FromURLs is a shorter name for SafeLinkURLsFromURLs: it evaluates the
// given collection of URLs and returns a SafeLinkURL value for each one
// found to be encoded as a Safe Link. Deduplication is *not* performed.
//
// An error is returned if no valid matches are found.
func FromURLs(urls []*url.URL) ([]SafeLinkURL, error) {
	safeLinks, err := SafeLinkURLsFromURLs(urls)

	return safeLinks, err
}
// getURLIndexEndPosition accepts an input string and a starting position
// within it and returns the byte index of the first whitespace character
// found at or after that position. Whitespace is assumed to be the separator
// used to indicate the end of a URL pattern.
//
// The returned value is an index into input (not relative to startPos), so
// input[startPos:result] is the run of non-whitespace text. len(input) is
// returned if no whitespace is found or if startPos is at or beyond the end
// of input. A negative startPos is treated as zero.
func getURLIndexEndPosition(input string, startPos int) int {
	switch {
	case startPos < 0:
		startPos = 0
	case startPos >= len(input):
		return len(input)
	}

	// IndexFunc reports a byte offset, which keeps the result correct when
	// the text contains multi-byte characters.
	if i := strings.IndexFunc(input[startPos:], unicode.IsSpace); i >= 0 {
		return startPos + i
	}

	return len(input)
}