diff --git a/README.md b/README.md index 4a841b7..e3fcf3c 100644 --- a/README.md +++ b/README.md @@ -180,6 +180,10 @@ sanitizeHtml( ## Changelog +1.4.3: invokes itself recursively until the markup stops changing to guard against [this issue](https://github.com/fb55/htmlparser2/issues/105). Bump to htmlparser2 version 3.7.x. + +1.4.1, 1.4.2: more tests. + 1.4.0: ability to allow all attributes or tags through by setting `allowedAttributes` and/or `allowedTags` to false. Thanks to Anand Thakker. 1.3.0: `attribs` now available on frames passed to exclusive filter. diff --git a/index.js b/index.js index 8a3c01c..3aba678 100644 --- a/index.js +++ b/index.js @@ -4,7 +4,11 @@ var he = require('he'); module.exports = sanitizeHtml; -function sanitizeHtml(html, options) { +// Ignore the _recursing flag; it's there for recursive +// invocation as a guard against this exploit: +// https://github.com/fb55/htmlparser2/issues/105 + +function sanitizeHtml(html, options, _recursing) { var result = ''; function Frame(tag, attribs) { @@ -87,8 +91,8 @@ function sanitizeHtml(html, options) { var skipText = false; var parser = new htmlparser.Parser({ onopentag: function(name, attribs) { - var frame = new Frame(name, attribs); - stack.push(frame); + var frame = new Frame(name, attribs); + stack.push(frame); var skip = false; if (_.has(transformTagsMap, name)) { @@ -198,6 +202,18 @@ function sanitizeHtml(html, options) { }); parser.write(html); parser.end(); + + // Invoke recursively until we stop finding + // clever little nesting exploits + if (!_recursing) { + while (true) { + var newResult = sanitizeHtml(result, options, true); + if (newResult === result) { + return result; + } + result = newResult; + } + } return result; function escapeHtml(s) { diff --git a/package.json b/package.json index 44d5763..cf20799 100644 --- a/package.json +++ b/package.json @@ -22,7 +22,7 @@ "license": "MIT", "dependencies": { "he": "~0.4.1", - "htmlparser2": "~3.3.0", + "htmlparser2": "3.7.x", "lodash": "2.4.x" } } diff --git a/test/test.js b/test/test.js index 86e3b2a..d4c36ea 100644 --- a/test/test.js +++ b/test/test.js @@ -281,6 +281,21 @@ describe('sanitizeHtml', function() { } ), '' - ) + ); + }); + it('should not be faked out by double <', function() { + assert.equal( + sanitizeHtml('<img src="javascript:evil"/>' + ), + '' + ); + // I don't love what I get back here obviously, but + // it is not an attack vector, although it might be parsed + // by some browsers as containing an unbalanced close tag. + assert.equal( + sanitizeHtml('<a href="javascript:evil"/>' + ), + '<a href="javascript:evil"/>' + ); }); });