-
Notifications
You must be signed in to change notification settings - Fork 6
/
regexpcreator.js
405 lines (369 loc) · 14.2 KB
/
regexpcreator.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
/**
* Creates regular expressions based on specified settings
* @example
* new RegExpCreator({caseSensitive: true, diacritics: false}).create('lorem');
* // => /()(lorem)/gm
*/
class RegExpCreator {
/**
* @typedef RegExpCreator~accuracyObj
* @type {object.<string>}
* @property {string} value - An accuracy string value
* @property {string[]} limiters - A custom array of limiters. For example
* <code>["-", ","]</code>
*/
/**
* @typedef RegExpCreator~accuracy
* @type {string}
* @property {"partially"|"complementary"|"exactly"|RegExpCreator~accuracyObj}
* [accuracy="partially"] - Either one of the following string values:
* <ul>
* <li><i>partially</i>: When searching for "lor" only "lor" inside
* "lorem" will be marked</li>
* <li><i>complementary</i>: When searching for "lor" the whole word
* "lorem" will be marked</li>
* <li><i>exactly</i>: When searching for "lor" only those exact words
* will be marked. In this example nothing inside "lorem".
* </ul>
* Or an object containing two properties:
* <ul>
* <li><i>value</i>: The value must be "exactly" or "complementary"</li>
* <li><i>limiters</i>: A custom array of string limiters</li>
* </ul>
*/
/**
* @typedef RegExpCreator~wildcards
* @type {string}
* @property {"disabled"|"enabled"|"withSpaces"}
* [wildcards="disabled"] - Set to any of the following string values:
* <ul>
* <li><i>disabled</i>: Disable wildcard usage</li>
* <li><i>enabled</i>: When searching for "lor?m", the "?" will match zero
* or one non-space character (e.g. "lorm", "loram", "lor3m", etc). When
* searching for "lor*m", the "*" will match zero or more non-space
* characters (e.g. "lorm", "loram", "lor123m", etc).</li>
* <li><i>withSpaces</i>: When searching for "lor?m", the "?" will
* match zero or one space or non-space character (e.g. "lor m", "loram",
* etc). When searching for "lor*m", the "*" will match zero or more space
* or non-space characters (e.g. "lorm", "lore et dolor ipsum", "lor: m",
* etc).</li>
* </ul>
*/
/**
* @typedef RegExpCreator~ignorePunctuation
* @type {string[]}
* @property {string} The strings in this setting will contain punctuation
* marks that will be ignored:
* <ul>
* <li>These punctuation marks can be between any characters, e.g. setting
* this option to <code>["'"]</code> would match "Worlds", "World's" and
* "Wo'rlds"</li>
* <li>One or more apostrophes between the letters would still produce a
* match (e.g. "W'o''r'l'd's").</li>
* <li>A typical setting for this option could be as follows:
* <pre>ignorePunctuation: ":;.,-–—‒_(){}[]!'\"+=".split(""),</pre> This
* setting includes common punctuation as well as a minus, en-dash,
* em-dash and figure-dash
* ({@link https://en.wikipedia.org/wiki/Dash#Figure_dash ref}), as well
* as an underscore.</li>
* </ul>
*/
/**
* @typedef RegExpCreator~options
* @type {object.<string>}
* @property {boolean} [diacritics=true] - If diacritic characters should be
* matched. ({@link https://en.wikipedia.org/wiki/Diacritic Diacritics})
* @property {object.<string|string[]>} [synonyms] - An object with synonyms.
* The key will be a synonym for the value and the value for the key
* @property {RegExpCreator~accuracy} [accuracy]
* @property {boolean} [caseSensitive=false] - Whether to search case sensitive
* @property {boolean} [ignoreJoiners=false] - Whether to ignore word
* joiners inside of key words. These include soft-hyphens, zero-width
* space, zero-width non-joiners and zero-width joiners.
* @property {RegExpCreator~ignorePunctuation} [ignorePunctuation]
* @property {RegExpCreator~wildcards} [wildcards]
*/
/**
* @typedef RegExpCreator~patternObj
* @type {object}
* @property {string} lookbehind - A lookbehind capturing group
* @property {string} pattern - A string pattern
* @property {string} lookahead - A positive lookahead assertion
*/
/**
* @param {RegExpCreator~options} [options] - Optional options object
*/
constructor(options) {
this.opt = Object.assign({}, {
'diacritics': true,
'synonyms': {},
'accuracy': 'partially',
'caseSensitive': false,
'ignoreJoiners': false,
'ignorePunctuation': [],
'wildcards': 'disabled'
}, options);
}
/**
* Creates a regular expression to match the specified search term considering
* the available option settings
* @param {string} str - The search term to be used
* @param {boolean} patterns - Whether to return an object with pattern parts or RegExp object
* @return {RegExpCreator~patternObj|RegExp}
*/
create(str, patterns) {
str = this.checkWildcardsEscape(str);
if (Object.keys(this.opt.synonyms).length) {
str = this.createSynonyms(str);
}
const joiners = this.getJoinersPunctuation();
if (joiners) {
str = this.setupIgnoreJoiners(str);
}
if (this.opt.diacritics) {
str = this.createDiacritics(str);
}
str = str.replace(/\s+/g, '[\\s]+');
if (joiners) {
str = this.createJoiners(str, joiners);
}
if (this.opt.wildcards !== 'disabled') {
str = this.createWildcards(str);
}
const obj = this.createAccuracy(str);
return (patterns
? obj
: new RegExp(`${obj.lookbehind}(${obj.pattern})${obj.lookahead}`, `g${this.opt.caseSensitive ? '' : 'i'}`));
}
/**
* Creates a single combine pattern from an array of string considering the available option settings
* @param {Array} array - The array of string
* @param {boolean} capture - Whether to wrap an individual pattern in a capturing or non-capturing group
* @return {RegExpCreator~patternObj|null}
*/
createCombinePattern(array, capture) {
if ( !Array.isArray(array) || !array.length) {
return null;
}
const group = capture ? '(' : '(?:',
obj = this.create(array[0], true),
lookbehind = obj.lookbehind,
lookahead = obj.lookahead,
pattern = this.distinct(array.map(str => `${group}${this.create(str, true).pattern})`)).join('|');
return { lookbehind, pattern, lookahead };
}
/**
* Sort array from longest entry to shortest
* @param {array} arry - The array to sort
* @return {array}
*/
sortByLength(arry) {
return arry.sort((a, b) => a.length === b.length ?
// sort a-z for same length elements
(a > b ? 1 : -1) :
b.length - a.length
);
}
/**
* Escapes RegExp special characters
* @param {string} str - The string to escape
* @return {string}
*/
escape(str) {
return str.replace(/[[\]/{}()*+?.\\^$|]/g, '\\$&');
}
/**
* In a character set only '-^]\\' characters must be escaped;
* the characters '^' at the beginning, '-' in the middle affects character set
* @param {string} str - The string to escape
* @return {string}
*/
escapeCharSet(str) {
return str.replace(/[-^\]\\]/g, '\\$&');
}
/**
* Splits string if par is string, removes duplicates
* @param {array|string} par - The parameter to process
* @return {array}
*/
toArrayIfString(par) {
return par && par.length ? this.distinct(typeof par === 'string' ? par.split('') : par) : [];
}
/**
* Removes duplicate or empty entries
* @param {array} array - The array to process
* @return {array}
*/
distinct(array) {
const result = [];
array.forEach(item => {
if (item.trim() && result.indexOf(item) === -1) {
result.push(item);
}
});
return result;
}
/**
* Creates a regular expression string to match the defined synonyms
* @param {string} str - The search term to be used
* @return {string}
*/
createSynonyms(str) {
const syn = this.opt.synonyms,
flags = 'g' + (this.opt.caseSensitive ? '' : 'i');
for (const key in syn) {
if (syn.hasOwnProperty(key)) {
let array = Array.isArray(syn[key]) ? syn[key] : [syn[key]];
array.unshift(key);
array = this.sortByLength(this.distinct(array)).map(term => this.checkWildcardsEscape(term));
if (array.length > 1) {
const pattern = array.map(k => this.escape(k)).join('|');
str = str.replace(new RegExp(pattern, flags), `(?:${array.join('|')})`);
}
}
}
return str;
}
/**
* Check wildcards option creates placeholders in the regular expression string to allow later
* insertion of wildcard patterns and escapes RegExp special characters
* @param {string} str - The search term
* @return {string}
*/
checkWildcardsEscape(str) {
if (this.opt.wildcards !== 'disabled') {
// replace single character wildcard with unicode 0001
str = str.replace(/(\\)*\?/g, (m, gr1) => gr1 ? '?' : '\u0001')
// replace multiple character wildcard with unicode 0002
.replace(/(\\)*\*/g, (m, gr1) => gr1 ? '*' : '\u0002');
}
return this.escape(str);
}
/**
* Replaces the wildcard placeholders in a regular expression string
* @param {string} str - The search term to be used
* @return {string}
*/
createWildcards(str) {
// default to "enable" (i.e. to not include spaces)
// "withSpaces" uses `[\\S\\s]` instead of `.` because the latter
// does not match new line characters
const spaces = this.opt.wildcards === 'withSpaces',
boundary = this.opt.blockElementsBoundary,
anyChar = spaces && boundary ? '[^' + (boundary.char ? boundary.char : '\x01') + ']*?' : '[\\S\\s]*?';
return str
// replace unicode 0001 with a RegExp class to match any single
// character, or any single non-whitespace character depending
// on the setting
.replace(/\u0001/g, spaces ? '[\\S\\s]?' : '\\S?')
// replace unicode 0002 with a RegExp class to match zero or
// more characters, or zero or more non-whitespace characters
// depending on the setting
.replace(/\u0002/g, spaces ? anyChar : '\\S*');
}
/**
* Creates placeholders in the regular expression string to allow later insertion of
* designated characters (soft hyphens, zero width characters, and punctuation)
* @param {string} str - The search term to be used
* @return {string}
*/
setupIgnoreJoiners(str) {
// It's not added '\0' after `(?:` grouping construct, around `|`, before `)` chars, and at the end of a string,
// not breaks the grouping construct `(?:`
return str.replace(/(\(\?:|\|)|\\?.(?=([|)]|$)|.)/g, (m, gr1, gr2) => {
return gr1 || typeof gr2 !== 'undefined' ? m : m + '\u0000';
});
}
/**
* Replaces '\u0000' placeholders in a regular expression string by designated
* characters (soft hyphens, zero width characters, and punctuation) based on the
* specified option values of <code>ignorePunctuation</code> and
* <code>ignoreJoiners</code>
* @param {string} str - The search term to be used
* @return {string}
*/
createJoiners(str, joiners) {
return str.split(/\u0000+/).join(`[${joiners}]*`);
}
/**
* Creates a punctuation and/or joiners pattern
* @return {string}
*/
getJoinersPunctuation() {
let punct = this.toArrayIfString(this.opt.ignorePunctuation),
str = '';
if (punct.length) {
str = this.escapeCharSet(punct.join(''));
}
if (this.opt.ignoreJoiners) {
// u+00ad = soft hyphen
// u+200b = zero-width space
// u+200c = zero-width non-joiner
// u+200d = zero-width joiner
str += '\\u00ad\\u200b\\u200c\\u200d';
}
return str;
}
/**
* Creates a regular expression string to match diacritics
* @param {string} str - The search term to be used
* @return {string}
*/
createDiacritics(str) {
const caseSensitive = this.opt.caseSensitive,
array = [
'aàáảãạăằắẳẵặâầấẩẫậäåāą', 'AÀÁẢÃẠĂẰẮẲẴẶÂẦẤẨẪẬÄÅĀĄ',
'cçćč', 'CÇĆČ', 'dđď', 'DĐĎ', 'eèéẻẽẹêềếểễệëěēę', 'EÈÉẺẼẸÊỀẾỂỄỆËĚĒĘ',
'iìíỉĩịîïī', 'IÌÍỈĨỊÎÏĪ', 'lł', 'LŁ', 'nñňń', 'NÑŇŃ',
'oòóỏõọôồốổỗộơởỡớờợöøō', 'OÒÓỎÕỌÔỒỐỔỖỘƠỞỠỚỜỢÖØŌ', 'rř', 'RŘ',
'sšśșş', 'SŠŚȘŞ', 'tťțţ', 'TŤȚŢ', 'uùúủũụưừứửữựûüůū', 'UÙÚỦŨỤƯỪỨỬỮỰÛÜŮŪ',
'yýỳỷỹỵÿ', 'YÝỲỶỸỴŸ', 'zžżź', 'ZŽŻŹ'
];
return str.split('').map(ch => {
for (let i = 0; i < array.length; i += 2) {
if (caseSensitive) {
if (array[i].indexOf(ch) !== -1) {
return '[' + array[i] + ']';
} else if (array[i+1].indexOf(ch) !== -1) {
return '[' + array[i+1] + ']';
}
} else if (array[i].indexOf(ch) !== -1 || array[i+1].indexOf(ch) !== -1) {
return '[' + array[i] + array[i+1] + ']';
}
}
return ch;
}).join('');
}
/**
* Creates a regular expression string to match the specified string with the
* defined accuracy. As in the regular expression of "exactly" can be a group
* containing a blank at the beginning, all regular expressions will be
* created with two groups. The first group can be ignored (may contain
* the said blank), the second contains the actual match
* @param {string} str - The searm term to be used
* @return {RegExpCreator~patternObj}
*/
createAccuracy(str) {
const chars = '!"#$%&\'()*+,\\-./:;<=>?@[\\]\\\\^_`{|}~¡¿';
let accuracy = this.opt.accuracy,
lookbehind = '()',
pattern = str,
lookahead = '',
limiters;
if (typeof accuracy !== 'string') {
limiters = this.toArrayIfString(accuracy.limiters);
limiters = limiters.length ? limiters : null;
accuracy = accuracy.value;
}
if (accuracy === 'complementary') {
let joins ='\\s' + (limiters ? this.escapeCharSet(limiters.join('')) : chars);
pattern = `[^${joins}]*${str}[^${joins}]*`;
} else if (accuracy === 'exactly') {
let joins = limiters ? '|' + limiters.map(ch => this.escape(ch)).join('|') : '';
lookbehind = `(^|\\s${joins})`;
lookahead = `(?=$|\\s${joins})`;
}
return { lookbehind, pattern, lookahead };
}
}
export default RegExpCreator;