forked from wp-plugins/word-stats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
basic-string-tools.js
177 lines (154 loc) · 6.05 KB
/
basic-string-tools.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
/*
Javascript version of basic_string_tools
by: Fran Ontanaya <email@franontanaya.com>
Version: 0.1
License: GPLv2
*/
/*
 Source of a bracketed regular-expression character class that matches
 breaking hyphens and combining marks. bstSimpleBoundaries compiles it with
 new RegExp( bstAllCombiningMarks, "g" ) and deletes every match.
 NOTE(review): the Combining Diacritical Marks entry starts at U+031C, not at
 the start of that block (U+0300) - confirm this is intentional.
*/
var bstAllCombiningMarks = "[" + [
    "\u00AD\u2010",       // Breaking hyphens
    "\u031C-\u0361",      // Combining Diacritical Marks
    "\u20D0-\u20F0",      // Combining Diacritical Marks for Symbols
    "\u1DC0-\u1DFF",      // Combining Diacritical Marks Supplement
    "\uFE20-\uFE26",      // Combining Half Marks
    "\u0483-\u0489",      // Cyrillic
    "\uA66F-\uA67D",      // Cyrillic Extended-B
    "\u0951-\u0954",      // Devanagari
    "\u037A\u0384-\u0385", // Greek and Coptic
    "\u3099-\u309C",      // Hiragana
    "\u30FB-\u30FE"       // Katakana
].join( "" ) + "]";
// * This list is incomplete.
// ToDo: Add option to enable only selected blocks for better performance?
/*
 Body of a regular-expression character class (WITHOUT the surrounding
 brackets) listing the characters that count as part of a word.
 bstSimpleBoundaries embeds it as "[^" + bstAllWordChars + "...]", compiled
 without the "u" flag, so only BMP characters (\uXXXX, exactly four hex digits)
 can be listed here.

 Corrections made to the original list:
 - "\u1F200-\u1F2FF" (Enclosed Ideographic Supplement) and "\u1D400-\u1D7FF}"
   (Mathematical Alphanumeric Symbols) were removed. A \u escape takes only
   four hex digits, so they were really read as \u1F20, the range "0"-\u1F2F
   and "F" (resp. \u1D40, "0"-\u1D7F, "F", "}"), which silently turned every
   character from "0" up to U+1F2F into a word character. Supplementary-plane
   characters cannot be expressed in this class at all.
 - The stray "-" between \u0950 and \u0955 (Devanagari) put a literal hyphen
   in the class; it is gone.
 - Latin-1 now starts its last range at \u00F8 (the letter o with stroke was
   skipped) and Cyrillic resumes at \u048A (letters U+048A-U+0497 were skipped).

 NOTE(review): scripts and dependent signs that are not listed below (for
 example Devanagari U+0901-U+0903 / U+093C or Malayalam U+0D46-U+0D4D) used to
 be matched only through the accidental range described above. They are now
 treated as separators until they are added here or to the combining marks.
*/
var bstAllWordChars =
    "A-Za-z0-9" + // Basic Latin
    "\uFB00-\uFB4F" + // Alphabetic Presentation Forms (ToDo: Split ligated forms)
    "\u0621-\u064A\u0660-\u0669\u066E-\u06D3\u06D5\u06EE-\u06FF" + // Arabic
    "\uFB50-\uFBB1" + // Arabic Presentation Forms A
    "\uFE80-\uFEFC" + // Arabic Presentation Forms B
    "\u0750-\u077F" + // Arabic Supplement
    "\u20A0-\u20CF" + // Currency symbols
    "\u0400-\u0482\u048A-\u04FF" + // Cyrillic
    "\u2DE0-\u2DFF" + // Cyrillic Extended-A
    "\uA640-\uA66E\uA680-\uA697" + // Cyrillic Extended-B
    "\u0500-\u0525" + // Cyrillic Supplement
    "\u0904-\u0939\u093E-\u0950\u0955-\u096F\u0972-\u097F" + // Devanagari
    "\uA8E0-\uA8F0" + // Devanagari Extended
    "\u10A0-\u10FA" + // Georgian
    "\u0386\u0388-\u03FF" + // Greek and Coptic
    "\u1F00-\u1FBC\u1FC2-\u1FCC\u1FD0-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FFC" + // Greek Extended
    "\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFF9D" + // Halfwidth and Fullwidth Forms
    "\u05D0-\u05EA" + // Hebrew
    "\u3040-\u3096" + // Hiragana
    "\u30A1-\u30FA" + // Katakana
    "\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF" + // Latin-1 Supplement
    "\u0100-\u017F" + // Latin Extended-A
    "\u1E00-\u1EFF" + // Latin Extended Additional
    "\u0180-\u024F" + // Latin Extended-B
    "\u2C60-\u2C7F" + // Latin Extended-C
    "\uA726-\uA787" + // Latin Extended-D
    "\u0D05-\u0D39\u0D3E-\u0D44" + // Malayalam
    "\u0710-\u072F\u074D-\u074F" + // Syriac
    "\u1700-\u1714"; // Tagalog
// * This list is incomplete.
/* Regex source: an apostrophe that is NOT followed by a basic Latin letter. */
var bstInWordMarks = "'(?![A-Za-z])";
/*
 Regex source for short pauses: either three consecutive dots, or a single
 semicolon, colon, U+2026, U+2015, or one of the Greek dots U+00B7 / U+0387.
*/
var bstAllShortPauses = [
    "[.]{3}",
    "[;:\u2026\u2015\u00B7\u0387]"
].join( "|" );
// * This list is incomplete
/**
 * Tell whether a text matches at least one of the given patterns.
 * Every pattern is compiled case-insensitively ("i" flag).
 *
 * @param {Array|Object} regexArray Collection of regular-expression sources.
 * @param {string} text Text to test.
 * @returns {boolean} true as soon as one pattern matches, false otherwise
 *                    (including when the collection is empty).
 */
function bstMatchRegArray( regexArray, text ) {
    var key;
    var regex;
    for ( key in regexArray ) {
        /* Skip enumerable properties inherited from the prototype chain
           (e.g. helpers other scripts add to Array.prototype); only the
           collection's own entries are patterns. */
        if ( !Object.prototype.hasOwnProperty.call( regexArray, key ) ) { continue; }
        regex = new RegExp( regexArray[ key ], "i" );
        if ( text.match( regex ) !== null ) { return true; }
    }
    return false;
}
/**
 * Strip markup from an HTML string by handing it to the browser's parser.
 *
 * The string is assigned to the innerHTML of a freshly created, unattached
 * div and the element's text is read back.
 * NOTE(review): innerHTML parses whatever markup it is given - confirm that
 * callers only pass trusted content.
 *
 * @param {string} text HTML source.
 * @returns {string} The element's textContent, else its innerText, else "".
 */
function bstHtmlStripper( text ) {
    var holder = document.createElement( "div" );
    holder.innerHTML = text;
    /* Prefer textContent; fall back to innerText, then to an empty string. */
    return holder.textContent || holder.innerText || "";
}
/* Note that parsing for non-Latin scripts may be incomplete. */
/**
 * Reduce a text to a simple form in which "." ends a sentence, "," marks a
 * short pause and spaces separate words.
 *
 * Reads the file-level pattern sources bstAllCombiningMarks, bstAllShortPauses,
 * bstInWordMarks and bstAllWordChars.
 *
 * Side effect: installs String.prototype.reverse if no such function exists.
 *
 * @param {string} text Text to simplify.
 * @returns {Array} Array object carrying two named properties:
 *                  "text" (the simplified text) and "alphanumeric" (the
 *                  simplified text with every non-word character removed).
 */
function bstSimpleBoundaries( text ) {
    /* Private helper: reversing the string lets a lookahead act as the
       lookbehind that older JavaScript engines do not support. */
    function reverseString( s ) {
        return s.split( "" ).reverse().join( "" );
    }
    /* Kept only for backward compatibility with code that calls
       "...".reverse(); installed once instead of on every call.
       NOTE(review): remove once no caller relies on it. */
    if ( typeof String.prototype.reverse !== "function" ) {
        String.prototype.reverse = function () {
            return reverseString( this );
        };
    }

    /* Replace some special characters (= - [ ] \ no-break space ") with spaces */
    text = text.replace( /[=\-\[\]\\\u00A0"]/g, " " );
    /* Remove combining marks et al, as they are meaningless for this purpose and can split words */
    text = text.replace( new RegExp( bstAllCombiningMarks, "g" ), "" );
    /* Replace an ellipsis with a comma when it is followed by whitespace and
       a lowercase letter. Written as a regex literal: inside a string literal
       "\s" collapses to a plain "s", so the lookahead never saw whitespace. */
    text = text.replace( /(?:\u2026|\.{3})(?=\s[a-z])/g, "," );
    /* Typical end of sentence, including unicodes and all remaining ellipses:
       any run of these characters becomes a single full stop */
    text = text.replace( /[!?.;\u06D4\u203C\u2047-\u2049\u2026]+/g, "." );
    /* Replace all remaining short pauses with commas */
    text = text.replace( new RegExp( bstAllShortPauses, "g" ), "," );
    /* Remove single quotes around words, but not inside words: one pass on the
       text (quote not followed by a letter) and one on the reversed text
       (quote not preceded by a letter). */
    var inWordMarks = new RegExp( bstInWordMarks, "g" );
    text = text.replace( inWordMarks, " " );
    text = reverseString( text ).replace( inWordMarks, " " );
    text = reverseString( text );
    /* Replace non-word characters, save short pauses and end of sentence, with spaces */
    text = text.replace( new RegExp( "[^" + bstAllWordChars + ",'.\n]", "g" ), " " );

    var result = [];
    result[ "text" ] = text;
    result[ "alphanumeric" ] = text.replace( new RegExp( "[^" + bstAllWordChars + "]", "g" ), "" );
    return result;
}
/**
 * Drop the first and the last item of an array when they are empty, i.e.
 * exactly "" or "\n". At most one item is removed from each end and the
 * input array itself is never modified.
 *
 * @param {Array} array Items to trim (typically the result of String.split).
 * @returns {Array} The same array when nothing had to be removed, otherwise
 *                  a shortened copy.
 */
function bstTrimArray( array ) {
    /* Strict comparison: with "==" values such as 0, false or [] were
       treated as empty strings and wrongly dropped. */
    function isBlank( item ) {
        return item === "" || item === "\n";
    }
    if ( isBlank( array[ 0 ] ) ) { array = array.slice( 1 ); }
    /* Checked after the first cut, so a one-item array is only cut once. */
    if ( isBlank( array[ array.length - 1 ] ) ) { array = array.slice( 0, -1 ); }
    return array;
}
/**
 * Remove leading and trailing space characters (" " only; tabs, newlines and
 * other whitespace are left alone).
 *
 * @param {string} s String to trim.
 * @returns {string} The part of s between its first and last non-space
 *                   character, or "" when s is empty or all spaces.
 */
function trim( s ) {
    var start = 0;
    var end = s.length; /* exclusive end of the kept part */
    while ( start < end && s.charAt( start ) === " " ) {
        start += 1;
    }
    while ( end > start && s.charAt( end - 1 ) === " " ) {
        end -= 1;
    }
    return s.slice( start, end );
}
/**
 * Tidy the spacing of a simplified text: delete every run of spaces that
 * directly precedes a full stop or a newline, then strip the spaces at both
 * ends with trim().
 *
 * @param {string} text Text to tidy.
 * @returns {string} The tidied text.
 */
function bstTrimText( text ) {
    var spacesBeforeBreak = / +(?=[.\n])/g;
    return trim( text.replace( spacesBeforeBreak, "" ) );
}
/**
 * Split a simplified text into sentences. Any run of full stops and/or
 * newlines is one separator; the pieces are passed through bstTrimArray.
 *
 * @param {string} text Simplified text.
 * @returns {Array} Whatever bstTrimArray returns for the list of pieces.
 */
function bstSplitSentences( text ) {
    var sentenceBreak = /[.\n]+/;
    return bstTrimArray( text.split( sentenceBreak ) );
}
/**
 * Split a simplified text into words. Any run of spaces, commas, full stops
 * and/or newlines is one separator; the pieces are passed through
 * bstTrimArray.
 *
 * @param {string} text Simplified text.
 * @returns {Array} Whatever bstTrimArray returns for the list of pieces.
 */
function bstSplitWords( text ) {
    var wordBreak = /[ ,.\n]+/;
    return bstTrimArray( text.split( wordBreak ) );
}
/**
 * Entry point: simplify a text and split it into sentences and words.
 *
 * @param {string} text Text to analyse.
 * @returns {Array} Array object carrying four named properties:
 *   "text"         - simplified text as returned by bstSimpleBoundaries
 *                    (before bstTrimText is applied),
 *   "sentences"    - result of bstSplitSentences on the trimmed text,
 *   "words"        - result of bstSplitWords on the trimmed text,
 *   "alphanumeric" - the "alphanumeric" value from bstSimpleBoundaries.
 *   For an empty input all four are the empty string.
 */
function bstSplitText( text ) {
    var stats = [];

    /* Nothing to analyse: every field is an empty string. */
    if ( text == "" ) {
        stats.text = "";
        stats.sentences = "";
        stats.words = "";
        stats.alphanumeric = "";
        return stats;
    }

    var simplified = bstSimpleBoundaries( text );
    /* Sentences and words are taken from the trimmed text, while the
       reported "text" keeps the untrimmed version. */
    var trimmed = bstTrimText( simplified.text );
    stats.text = simplified.text;
    stats.sentences = bstSplitSentences( trimmed );
    stats.words = bstSplitWords( trimmed );
    stats.alphanumeric = simplified.alphanumeric;
    return stats;
}