Skip to content

Commit

Permalink
Merge pull request #16 from cgiffard/cleanup_branch
Browse files Browse the repository at this point in the history
bug fixes and cleanup.
  • Loading branch information
adam-zethraeus committed May 13, 2014
2 parents c6ca973 + 129dd10 commit e65fe59
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 77 deletions.
174 changes: 103 additions & 71 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,94 +10,110 @@ var XRegexp = require('xregexp').XRegExp;
"keygen", "link", "meta", "param", "source", "track", "wbr"
];

var downsize = function (text, options, offset) {
var downsize = function (text, inputOptions, offset) {
var stack = [],
pointer = 0,
tagName = "",
parseState = 0,
countState = {},
trackedState = {unitCount: 0, countState: false},
tagBuffer = "",
truncatedText = "";

var options = options && typeof options === "object" ? options : {},
wordChars = options.wordChars instanceof RegExp ?
options.wordChars : XRegexp("[\\p{L}0-9\\-\\']", "i");

var keepContext = !!options.contextualTags,
contextualTags = (
keepContext && Array.isArray(options.contextualTags) ?
options.contextualTags : []
);

function count(chr, track) {
var limit = options.words || (options.characters + 1) || Infinity,
contextualTagPresent = false,
stackIndex = 0;

if (!("unitCount" in track))
track.unitCount = 0;

// Tick-tock state storage for counting words
// If it doesn't exist, initialise it with value of current char
if (!("countState" in track))
track.countState = !!wordChars.test(chr + "");

if (options.words) {
if (!!wordChars.test(chr + "") !== track.countState) {

track.countState = !!wordChars.test(chr + "");
var COUNT_CHARACTERS = -1,
COUNT_WORDS = -2;

// Only count the words on the "tock", or we'd be counting
// them twice.
if (!track.countState)
track.unitCount++;
}

// We pass in empty values to count word boundries
// defined by tags.
// This isn't relevant to character truncation.
} else if (chr !== "") {

track.unitCount++;
}
var options = inputOptions && typeof inputOptions === "object" ? inputOptions : {},
wordChars = options.wordChars instanceof RegExp ?
options.wordChars : new XRegexp("[\\p{L}0-9\\-\\']", "i");
options.countingType = !isNaN(Number(options.words)) ? COUNT_WORDS : COUNT_CHARACTERS;
options.keepContext = !!options.contextualTags;
options.contextualTags = options.keepContext &&
Array.isArray(options.contextualTags) ?
options.contextualTags : [];
options.limit = (options.countingType === COUNT_WORDS) ? Number(options.words) :
Number(options.characters);
options.limit = isNaN(options.limit) ? Infinity : options.limit;

// Reports whether truncation may stop at the current position.
// Reads the enclosing function's trackedState, options and stack;
// takes no arguments and, apart from its local loop index, modifies nothing.
// Returns false while the counted units are below the limit, or while a
// tag listed in contextualTags is still open on the stack; true otherwise.
// NOTE(review): this listing is a rendered diff, so some statements appear
// twice (the earlier `track`/`limit`/`keepContext` form followed by the
// `trackedState`/`options.*` form) — confirm against the real file.
function isAtLimit() {
var stackIndex = 0;

// Return true when we've hit our limit
if (track.unitCount < limit)
if (trackedState.unitCount < options.limit) {
return false;
}

// If we've got no special context to retain, do an early return.
if (!keepContext)
if (!options.keepContext) {
return true;

for (; stackIndex < stack.length; stackIndex++)
if (~contextualTags.indexOf(getTagName(stack[stackIndex])))
}

// Walk every tag still open on the stack. `~indexOf(...)` is truthy
// for any index other than -1, i.e. when the tag name is in the list.
for (; stackIndex < stack.length; stackIndex++) {
if (~options.contextualTags.indexOf(getTagName(stack[stackIndex]))) {
return false;
}
}

// There are no contextual tags left, we can stop.
return true;
}

/**
 * Advances the shared unit counter for one character of visible text.
 *
 * Reads `options.countingType` and `wordChars` from the enclosing scope and
 * updates `trackedState` in place.
 *
 * - Word mode: a unit is counted on the transition from a word character to
 *   a non-word character (the "tock"). An empty string acts as a boundary.
 * - Character mode: every non-empty `chr` counts as one unit.
 *
 * @param {string} chr - A single character of text, or "" to mark a word
 *     boundary defined by a tag.
 */
function count(chr) {
    // TODO: 'Tock' for word counting happens when next whitespace is added.
    // i.e. it then needs stripping.
    // Should a pointer be passed to count instead of the chr?
    // This would allow forward lookup and allow 'Tock' on final char.
    var isWordChar;

    switch (options.countingType) {
    case COUNT_WORDS:
        // A caller-supplied RegExp may carry the `g` or `y` flag, which
        // makes test() stateful through lastIndex. Reset it and evaluate
        // the character exactly once so the result cannot flip between
        // two consecutive calls.
        wordChars.lastIndex = 0;
        isWordChar = !!wordChars.test(chr + "");

        if (isWordChar !== trackedState.countState) {
            trackedState.countState = isWordChar;

            // Only count the words on the "tock", or we'd be counting
            // them twice.
            if (!isWordChar) {
                trackedState.unitCount++;
            }
        }
        break;

    case COUNT_CHARACTERS:
        // We pass in empty values to count word boundaries
        // defined by tags.
        // This isn't relevant to character truncation.
        if (chr !== "") {
            trackedState.unitCount++;
        }
        break;
    }
}

// Define our parse states
var PARSER_UNINITIALISED = 0,
PARSER_TAG_COMMENCED = 1,
PARSER_TAG_STRING = -1,
PARSER_TAG_STRING_SINGLE = -2,
PARSER_COMMENT = -3;

for (; pointer < text.length; pointer++) {
var exit = false;
for (; pointer < text.length && !exit; pointer++) {

if (parseState !== PARSER_UNINITIALISED)
if (parseState !== PARSER_UNINITIALISED) {
tagBuffer += text[pointer];
}

switch (text[pointer]) {

case "<":
// Ooh look — we're starting a new tag.
// (Provided we're in uninitialised state and the next
// character is a word character, exclamation mark or slash)

if (parseState === PARSER_UNINITIALISED &&
text[pointer + 1].match(/[a-z0-9\-\_\/\!]/)) {

if (isAtLimit()) {
exit = true;
break;
}
parseState = PARSER_TAG_COMMENCED;
tagBuffer += text[pointer];
}
Expand Down Expand Up @@ -168,55 +184,68 @@ var XRegexp = require('xregexp').XRegExp;
stack.pop();
}

// Nope, it's an opening tag.
} else {
// Nope, it's an opening tag.

// Don't push self closing or void elements on to
// the stack, since they have no effect on nesting.

if (voidElements.indexOf(tagName) < 0 &&
!tagBuffer.match(/\/\s*>$/)) {

stack.push(tagBuffer);
}
}

tagBuffer = "";

// Closed tags are word boundaries. Count!
// Because we've reset our parser state we need
// to manually short circuit the logic that comes next.
if (!count("", countState)) continue;
}
if (!isAtLimit()) {
count("");
continue;
}

if (parseState === PARSER_COMMENT &&
text.substring(pointer - 2, pointer) === "--") {
} else if (parseState === PARSER_COMMENT) {
if (text.substring(pointer - 2, pointer) === "--") {
parseState = PARSER_UNINITIALISED;
truncatedText += tagBuffer;
tagBuffer = "";

parseState = PARSER_UNINITIALISED;
truncatedText += tagBuffer;
tagBuffer = "";

// Closed tags are word boundries. Count!
if (!count("", countState)) continue;
// Closed tags are word boundaries. Count!
if (!isAtLimit()) {
count("");
continue;
}
}
}

break;
break; // break switch
}

// We're not inside a tag, comment, attribute, or string.
// This is just text.
if (!parseState) {
if (parseState === PARSER_UNINITIALISED) {

// Have we had enough of a good thing?
if (count(text[pointer], countState)) break;
if (isAtLimit()) {
// console.log("limit at: '" + text[pointer] +"'");
// console.log(trackedState.unitCount);
break;
}
count(text[pointer]);

// Nope, we still thirst for more.
truncatedText += text[pointer];
}
}

} // end of main parsing for loop

// 'Tock' for word counting happens when next whitespace is added.
// Strip this and any other trailing whitespace.
// TODO: what should the whitespace behavior be?
truncatedText = truncatedText.trim();

if (options.append && (stack.length || tagBuffer.length)) {
truncatedText = truncatedText.trim() + options.append;
truncatedText += options.append;
}

// Append anything still left in the buffer
Expand All @@ -236,7 +265,9 @@ var XRegexp = require('xregexp').XRegExp;

// We didn't get a tag name, so return nothing. Better than
// a bad prediction, or a junk tag.
if (!tagName) return "";
if (!tagName) {
return "";
}

return "</" + tagName + ">";
}
Expand All @@ -247,8 +278,9 @@ var XRegexp = require('xregexp').XRegExp;
}

// Export to node
if (typeof module !== 'undefined' && module.exports)
if (typeof module !== 'undefined' && module.exports) {
return module.exports = downsize;
}

// Nope, export to the browser instead.
exportTo.downsize = downsize;
Expand Down
48 changes: 42 additions & 6 deletions test.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ chai.should();

describe("Word-wise truncation", function () {

it("should be able to handle tagless input", function () {
downsize("this is a test of tagless input", {words: 5})
.should.equal("this is a test of");
});

it("should be able to truncate across nested tags", function () {
downsize("<p>this is a <strong>test of word downsizing</strong></p>", {words: 5})
.should.equal("<p>this is a <strong>test of</strong></p>");
Expand Down Expand Up @@ -83,11 +88,6 @@ describe("Word-wise truncation", function () {
.should.equal("<p>Рэпудёандаэ конжыквуюнтюр эю</p>");
});

it("should properly properly character-truncate across tag boundries", function () {
downsize("<p>abcdefghij</p><p>klmnop</p><p>qrs</p>", {characters: 15})
.should.equal("<p>abcdefghij</p><p>klmno</p>");
});

it("should not have trailing empty tags", function () {
downsize("<p>there are five words here</p><i>what</i>", {words: 5})
.should.equal("<p>there are five words here</p>");
Expand All @@ -103,6 +103,34 @@ describe("Word-wise truncation", function () {
.should.equal("<ul><li>item one</li><li>item two</li><li>item three</li></ul>");
});

it("should handle truncation to zero words", function () {
downsize("<p>this is a <strong>test of word downsizing</strong></p>", {words: 0})
.should.equal("");
});

it("should handle truncation to zero words with a string number input for backwards compatibility", function () {
downsize("<p>this is a <strong>test of word downsizing</strong></p>", {words: "0"})
.should.equal("");
});

});

describe("Character based truncation", function () {

it("should be able to handle tagless input", function () {
downsize("this is a test of tagless input", {characters: 6})
.should.equal("this i");
});

it("should properly character-truncate across tag boundries", function () {
downsize("<p>abcdefghij</p><p>klmnop</p><p>qrs</p>", {characters: 15})
.should.equal("<p>abcdefghij</p><p>klmno</p>");

downsize("<p>a</p><p>b</p><p>cdefghij</p><p>klmnop</p><p>qrs</p>", {characters: 15})
.should.equal("<p>a</p><p>b</p><p>cdefghij</p><p>klmno</p>");

});

it("should await the end of the containing paragraph", function () {
downsize("<p>there are many more than seven characters in this paragraph</p><p>this is unrelated</p>", {characters: 7, contextualTags: ["p", "ul", "ol", "pre", "blockquote"]})
.should.equal("<p>there are many more than seven characters in this paragraph</p>");
Expand All @@ -111,10 +139,12 @@ describe("Word-wise truncation", function () {
});

describe("Appending", function () {
it("should properly append an ellipsis where required", function () {
// Character-based truncation: the appended text must land inside the last
// tag that is still open when the 15-character limit is reached.
it("should properly append an ellipsis where required for character truncation", function () {
    downsize("<p>abcdefghij</p><p>klmnop</p><p>qrs</p>", {characters: 15, append: "..."})
        .should.equal("<p>abcdefghij</p><p>klmno...</p>");
});

// Word-based truncation: the appended text follows the second word.
it("should properly append an ellipsis where required for word truncation", function () {
    downsize("<p>here's some text.</p>", {words: 2, append: "... (read more)"})
        .should.equal("<p>here's some... (read more)</p>");
});
Expand All @@ -123,6 +153,12 @@ describe("Appending", function () {
downsize("<p>here's some text.</p>", {words: 5, append: "..."})
.should.equal("<p>here's some text.</p>");
});

it("should not have trailing empty tags", function () {
downsize("<p>characters</p><i>what</i>", {characters: 10})
.should.equal("<p>characters</p>");
});

});

describe("Performance", function () {
Expand Down

0 comments on commit e65fe59

Please sign in to comment.