Skip to content

Commit

Permalink
Removed the code to combine pairs of unicode character entities into …
Browse files Browse the repository at this point in the history
…a single value. Updated the tests to reflect this case. Added a giant comment to the tests to explain the reasoning.
  • Loading branch information
Mike Lee committed Feb 20, 2018
1 parent 3376bc7 commit ec18902
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 8 deletions.
6 changes: 2 additions & 4 deletions sax.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,10 @@ function parse(source,defaultNSMapCopy,entityMap,domBuilder,errorHandler){
}
}
function appendText(end){//has some bugs
// support for xml entities encoded using &#xAA;&#xBB; where AA+BB are hex
var xt = source.substring(start,end).replace(/&#x(\w+);&#x(\w+);/g, function(a, b, c) { return new Buffer(b + c, 'hex').toString() });
xt = xt.replace(/&#?\w+;/g,entityReplacer);
var xt = source.substring(start,end).replace(/&#?\w+;/g,entityReplacer);
locator&&position(start);
domBuilder.characters(xt,0,end-start);
start = end
start = end;
}
function position(start,m){
while(start>=endPos && (m = linePattern.exec(source))){
Expand Down
37 changes: 33 additions & 4 deletions test/node.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,46 @@ wows.describe('XML Node Parse').addBatch({
var dom = new DOMParser().parseFromString('<xml>test value</xml>');
var root = dom.documentElement;
console.assert ( root.firstChild.textContent =='test value');
},
},
/**
* These tests were added to confirm a change that removed logic which was
* inappropriately converting pairs of Unicode code point character entities
* into a single value.
*
* The removed logic appears to violate the XML 1.0 specification, Section 4.1,
* "Character and Entity References":
*
* > If the character reference begins with " &#x ", the digits and letters
* > up to the terminating ; provide a hexadecimal representation of the
* > character's code point in ISO/IEC 10646. If it begins just with " &# ",
* > the digits up to the terminating ; provide a decimal representation of
* > the character's code point.
*
* {@link https://www.w3.org/TR/xml/#sec-references}
*/
'text node with two one-byte character entities': function () {
var dom = new DOMParser().parseFromString('<xml>&lt;inner&gt;&lt;/inner&gt;</xml>');
var root = dom.documentElement;
console.assert ( root.firstChild.textContent =='<inner><inner>');
},
console.assert ( root.firstChild.textContent =='<inner></inner>');
},
'text node with a two-byte character entity': function () {
var dom = new DOMParser().parseFromString('<xml>f&#xC3;&#xBC;n</xml>');
var root = dom.documentElement;
console.assert ( root.firstChild.textContent =='fün');
},
'text node with a single two-byte character entity': function () {
var dom = new DOMParser().parseFromString('<xml>f&#x00FC;n</xml>');
var root = dom.documentElement;
console.assert ( root.firstChild.textContent =='fün');
},
},
'text node with two one-byte Unicode character entities': function () {
var dom = new DOMParser().parseFromString('<xml>kchen&#x4E03;&#x5473;@shichimitogarashi.org</xml>');
var root = dom.documentElement;
console.assert ( root.firstChild.textContent =='kchen七味@shichimitogarashi.org');
},
/**
* End of unicode tests
*/
'append node': function () {
var dom = new DOMParser().parseFromString('<xml/>');
var child = dom.createElement("child");
Expand Down

0 comments on commit ec18902

Please sign in to comment.