Skip to content
This repository has been archived by the owner on Sep 21, 2021. It is now read-only.

Commit

Permalink
Merge pull request #13 from antislice/upstream-pr
Browse files Browse the repository at this point in the history
Fix bugs in cleanText() and wordCount(), add some tests
  • Loading branch information
cgiffard committed Sep 20, 2016
2 parents 43e0e3a + 31aa398 commit 2d1f0d6
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
node_modules
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ JavaScript port of [TextStatistics.php](https://github.com/DaveChild/Text-Statis
I've done what I think is a reasonably faithful port. Documentation incoming!
I removed a lot of the original comments during the port, but seeing as the API remained largely the same, I'll add them in shortly.

Same goes for a test suite - I'll get something working in node in a bit. :)
The beginning of a test suite in [Mocha](https://mochajs.org/) is here, covering cleaning the text and some cases of word and sentence counting.

## Installation

Expand Down
28 changes: 15 additions & 13 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,23 @@

fullStopTags.forEach(function(tag) {
text = text.replace("</" + tag + ">",".");
})
});

text = text
.replace(/<[^>]+>/g, "") // Strip tags
.replace(/[,:;()\-]/, " ") // Replace commans, hyphens etc (count them as spaces)
.replace(/[\.!?]/, ".") // Unify terminators
.replace(/^\s+/,"") // Strip leading whitespace
.replace(/[ ]*(\n|\r\n|\r)[ ]*/," ") // Replace new lines with spaces
.replace(/([\.])[\. ]+/,".") // Check for duplicated terminators
.replace(/[ ]*([\.])/,". ") // Pad sentence terminators
.replace(/\s+/," ") // Remove multiple spaces
.replace(/\s+$/,""); // Strip trailing whitespace
.replace(/[,:;()\/&+]|\-\-/g, " ") // Replace commas, hyphens etc (count them as spaces)
.replace(/[\.!?]/g, ".") // Unify terminators
.replace(/^\s+/, "") // Strip leading whitespace
.replace(/[\.]?(\w+)[\.]?(\w+)@(\w+)[\.](\w+)[\.]?/g, "$1$2@$3$4") // strip periods in email addresses (so they remain counted as one word)
.replace(/[ ]*(\n|\r\n|\r)[ ]*/g, ".") // Replace new lines with periods
.replace(/([\.])[\.]+/g, ".") // Check for duplicated terminators
.replace(/[ ]*([\.])/g, ". ") // Pad sentence terminators
.replace(/\s+/g, " ") // Remove multiple spaces
.replace(/\s+$/, ""); // Strip trailing whitespace

text += "."; // Add final terminator, just in case it's missing.

if(text.slice(-1) != '.') {
text += "."; // Add final terminator, just in case it's missing.
}
return text;
}

Expand Down Expand Up @@ -84,7 +86,7 @@

TextStatistics.prototype.wordCount = function(text) {
text = text ? cleanText(text) : this.text;
return text.split(/[^a-z0-9]+/i).length || 1;
return text.split(/[^a-z0-9\'@\.\-]+/i).length || 1;
};

TextStatistics.prototype.averageWordsPerSentence = function(text) {
Expand Down Expand Up @@ -210,7 +212,7 @@
wordPartCount = word
.split(/[^aeiouy]+/ig)
.filter(function(wordPart) {
return !!wordPart.replace(/\s+/ig,"").length
return !!wordPart.replace(/\s+/ig,"").length;
})
.length;

Expand Down
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
},
"main": "index.js",
"dependencies": {},
"devDependencies": {},
"devDependencies": {
"mocha": "^3.0.2"
},
"optionalDependencies": {}
}
1 change: 1 addition & 0 deletions test/mocha.opts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
--reporter nyan
181 changes: 181 additions & 0 deletions test/testCleanText.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
var assert = require('assert');
var TextStatistics = require('../index.js');

describe('TextStatistics', function() {
  /**
   * Registers one Mocha test case that builds a TextStatistics from `input`
   * and checks that the resulting `.text` is exactly `expected`.
   *
   * @param {string} description - Test title passed to `it`.
   * @param {string} input - Raw text handed to TextStatistics.
   * @param {string} expected - The cleaned text the instance should expose.
   */
  function itCleans(description, input, expected) {
    it(description, function() {
      var ts = TextStatistics(input);
      // strictEqual: a non-string `.text` must not pass through coercion.
      assert.strictEqual(ts.text, expected);
    });
  }

  // this is called when you "make" a TextStatistics
  describe('#cleanText()', function() {
    itCleans('should add a final terminator if it\'s missing', 'Hello friend', 'Hello friend.');
    itCleans('should not add a final terminator if there is a \'.\'', 'Hello friend.', 'Hello friend.');

    context('trailing whitespace', function() {
      itCleans('should strip spaces', 'Hello friend. ', 'Hello friend.');
      itCleans('should strip newlines', 'Hello friend.\n\n', 'Hello friend.');
      itCleans('should strip \\r\\n thing', 'Hello friend.\r\n', 'Hello friend.');
      itCleans('should strip tabs', 'Hello friend.\t', 'Hello friend.');
    });

    context('leading whitespace', function() {
      itCleans('should strip spaces', ' Hello friend.', 'Hello friend.');
      itCleans('should strip newlines', '\n\nHello friend.', 'Hello friend.');
      itCleans('should strip \\r\\n thing', '\r\nHello friend.', 'Hello friend.');
      itCleans('should strip tabs', '\tHello friend.', 'Hello friend.');
    });

    // The input must really contain runs of spaces; otherwise it already
    // equals the expected output and the test cannot fail.
    itCleans('should remove multiple spaces between words', 'Hello    good   friend.', 'Hello good friend.');

    itCleans('should un-duplicate terminators', 'Hello... Friend..', 'Hello. Friend.');
    itCleans('should pad terminators with a space', 'Hello.Good.Friend.', 'Hello. Good. Friend.');

    context('unify terminators', function() {
      itCleans('should replace all !! with ..', 'Hello! Friend!', 'Hello. Friend.');
      itCleans('should replace all ?? with ..', 'Hello? Friend?', 'Hello. Friend.');
    });

    context('replacing newlines with terminators', function() {
      var expected = 'bulleted list here we go. nice dog. good dog.';

      itCleans('should replace \\n', 'bulleted list here we go\nnice dog\ngood dog', expected);
      itCleans('should replace \\r\\n', 'bulleted list here we go\r\nnice dog\r\ngood dog', expected);
      itCleans('should replace \\r', 'bulleted list here we go\rnice dog\rgood dog', expected);
    });

    // Periods inside an address are removed so the address stays one word.
    context('stripping periods from email addresses', function() {
      itCleans('should replace a single period', 'textstatistics@example.com', 'textstatistics@examplecom.');
      itCleans('should replace a single period in the first part', 'text.statistics@example.com', 'textstatistics@examplecom.');
      itCleans('should replace two periods in the first part', 'text.stat.istics@example.com', 'textstatistics@examplecom.');
      itCleans('should replace periods with a subdomain', 'textstatistics@test.example.com', 'textstatistics@testexamplecom.');
      itCleans('should replace periods with a subdomain and before the @', 'text.stat.istics@test.example.com', 'textstatistics@testexamplecom.');
    });

    context('replacing non-terminator punctuation', function() {
      itCleans('should replace commas with spaces', 'Hello, hi, friend.', 'Hello hi friend.');
      itCleans('should replace colons with spaces', 'Hello: hi: friend.', 'Hello hi friend.');
      itCleans('should replace semicolons with spaces', 'Hello; hi; friend.', 'Hello hi friend.');
      itCleans('should replace parentheses with spaces', '(Hello (hi) friend).', 'Hello hi friend.');
      itCleans('should replace slashes with spaces', 'Hello/hi/friend.', 'Hello hi friend.');
      itCleans('should replace double hyphens with spaces', 'Hello--hi--friend.', 'Hello hi friend.');
      itCleans('should not replace a single dash with spaces', 'Hi-di-ho friend-person!', 'Hi-di-ho friend-person.');
      itCleans('should replace pluses with spaces', 'Hello + hi+friend.', 'Hello hi friend.');
      itCleans('should replace ampersands with spaces', 'Hello&hi & friend.', 'Hello hi friend.');

      // Pending test (no callback): Mocha reports it as pending, not failed.
      it('should replace em-dash with spaces'); // can I do that?
    });
  });
});
54 changes: 54 additions & 0 deletions test/testCountMethods.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
var assert = require('assert');
var TextStatistics = require('../index.js');

describe('TextStatistics', function() {

  // Assertions use assert.strictEqual(actual, expected): the argument order
  // keeps Mocha's "expected"/"actual" failure output the right way round, and
  // strict comparison rejects values such as '1' or true that loosely equal 1.

  describe('#sentenceCount()', function() {
    it('should count a single sentence', function() {
      var ts = TextStatistics('see spot run.');
      assert.strictEqual(ts.sentenceCount(), 1);
    });

    it('should count a single sentence with a comma', function() {
      var ts = TextStatistics('see, spot runs.');
      assert.strictEqual(ts.sentenceCount(), 1);
    });

    it('should count a few simple sentences', function() {
      var ts = TextStatistics('see spot run. good job spot. have a treat.');
      assert.strictEqual(ts.sentenceCount(), 3);
    });
  });

  describe('#wordCount()', function() {
    it('a string w/o words should have word count of one, because dividing by zero', function() {
      var ts = TextStatistics('.');
      assert.strictEqual(ts.wordCount(), 1);
    });

    it('should count the number of words in a text', function() {
      var ts = TextStatistics('see spot run');
      assert.strictEqual(ts.wordCount(), 3);
    });

    it('should not count words with an apostrophe as two words', function() {
      var ts = TextStatistics('they\'re');
      assert.strictEqual(ts.wordCount(), 1);
    });

    it('should not count the empty string after a period as a word', function() {
      var ts = TextStatistics('dog.');
      assert.strictEqual(ts.wordCount(), 1);
    });

    it('should count an email address as a single word', function() {
      var ts = TextStatistics('textstatistics@example.com');
      assert.strictEqual(ts.wordCount(), 1);
    });

    it('should count words with a dash as a single word', function() {
      var ts = TextStatistics('long-term');
      assert.strictEqual(ts.wordCount(), 1);
    });
  });
});

0 comments on commit 2d1f0d6

Please sign in to comment.