Skip to content

Commit

Permalink
Fix Big5-HKSCS encoding to prefer non-HKSCS codes in case of multiple…
Browse files Browse the repository at this point in the history
… options (fixes #264)
  • Loading branch information
ashtuchkin committed May 23, 2021
1 parent 9627ecf commit ed88711
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 3 deletions.
14 changes: 13 additions & 1 deletion encodings/dbcs-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,19 @@ module.exports = {
'big5hkscs': {
type: '_dbcs',
table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) },
encodeSkipVals: [0xa2cc],
encodeSkipVals: [
// Although Encoding Standard says we should avoid encoding to HKSCS area (See Step 1 of
// https://encoding.spec.whatwg.org/#index-big5-pointer), we still do it to increase compatibility with ICU.
// But if a single unicode point can be encoded both as HKSCS and regular Big5, we prefer the latter.
0x8e69, 0x8e6f, 0x8e7e, 0x8eab, 0x8eb4, 0x8ecd, 0x8ed0, 0x8f57, 0x8f69, 0x8f6e, 0x8fcb, 0x8ffe,
0x906d, 0x907a, 0x90c4, 0x90dc, 0x90f1, 0x91bf, 0x92af, 0x92b0, 0x92b1, 0x92b2, 0x92d1, 0x9447, 0x94ca,
0x95d9, 0x96fc, 0x9975, 0x9b76, 0x9b78, 0x9b7b, 0x9bc6, 0x9bde, 0x9bec, 0x9bf6, 0x9c42, 0x9c53, 0x9c62,
0x9c68, 0x9c6b, 0x9c77, 0x9cbc, 0x9cbd, 0x9cd0, 0x9d57, 0x9d5a, 0x9dc4, 0x9def, 0x9dfb, 0x9ea9, 0x9eef,
0x9efd, 0x9f60, 0x9fcb, 0xa077, 0xa0dc, 0xa0df, 0x8fcc, 0x92c8, 0x9644, 0x96ed,

// Step 2 of https://encoding.spec.whatwg.org/#index-big5-pointer: Use last pointer for U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345
0xa2a4, 0xa2a5, 0xa2a7, 0xa2a6, 0xa2cc, 0xa2ce,
],
},

'cnbig5': 'big5hkscs',
Expand Down
32 changes: 30 additions & 2 deletions generation/gen-dbcs.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ async.parallel({
}

// Calculate difference between big5 and cp950, and write it to a file.
// See http://encoding.spec.whatwg.org/#big5-encoder
// See http://encoding.spec.whatwg.org/#big5
var big5add = {}
for (var i = 0x8100; i < 0x10000; i++) { // Lead byte is 0x81 .. 0xFE
var trail = i & 0xFF;
Expand All @@ -41,7 +41,35 @@ async.parallel({
big5add[i] = big5Char;
}

// Add char sequences that are not in the index file (as given in http://encoding.spec.whatwg.org/#big5-encoder)
// Calculate HKSCS codes that are duplicates of big5 codes and need to be skipped when encoding.
// Nothing is written to a file here: the codes are only printed, and the log message points at
// encodeSkipVals in big5hkscs as the place they belong.
console.log("Duplicate HKSCS codes that need to be skipped when encoded (see encodeSkipVals in big5hkscs): ")
// Pass 1: collect every value mapped from a code in 0xA100..0xFFFF. big5codes is used as a set keyed
// by the mapped value; an entry in big5add takes precedence over the one in data.cp950.
var big5codes = {};
for (var i = 0xA100; i < 0x10000; i++) {
var uCharCode = (big5add[i] !== undefined) ? big5add[i] : data.cp950[i];
if (uCharCode !== undefined) {
big5codes[uCharCode] = true;
}
}
// Pass 2: walk the codes 0x8100..0xA0FF with the same lookup and print (as hex) each code whose
// value was already seen in pass 1, i.e. a value that is also reachable from the upper range.
for (var i = 0x8100; i < 0xA100; i++) {
var uCharCode = (big5add[i] !== undefined) ? big5add[i] : data.cp950[i];
if (uCharCode !== undefined && big5codes[uCharCode]) {
console.log("0x"+i.toString(16));
}
}

// NOTE(review): `big5Char` is last seen in the loop that fills big5add, and `lead`, `pointer` and `d`
// have no visible declaration in this part of the file. This looks like leftover debugging output;
// confirm it is intended (and that `d` exists at this point) or remove it.
// As written: for a defined big5Char, a code with lead byte below 0xA1 is stored in d[big5Char]
// (logging first if that char was already stored); otherwise a message is logged when d already
// holds an entry for that char.
if (big5Char !== undefined) {
if (lead < 0xA1) {
if (d[big5Char] !== undefined) {
console.log("duplicate in first: "+ pointer + " char " + big5Char);
}
d[big5Char] = i;
} else if (d[big5Char] !== undefined) {
console.log("dup 0x"+d[big5Char].toString(16) + " -> " + i.toString(16))
}

}

// Add char sequences that are not in the index file (as given in http://encoding.spec.whatwg.org/#big5-decoder)
// Convert a Big5 index pointer into its two-byte code, packed as (lead << 8) + trail.
// Each lead byte covers 157 pointers; trail positions below 0x3F are offset by 0x40,
// the remaining ones by 0x62.
function toIdx(pointer) {
    var leadByte = Math.floor(pointer / 157) + 0x81;
    var trailPos = pointer % 157;
    var trailOffset;
    if (trailPos < 0x3F) {
        trailOffset = 0x40;
    } else {
        trailOffset = 0x62;
    }
    return (leadByte << 8) + (trailPos + trailOffset);
}
big5add[toIdx(1133)] = [0x00CA, 0x0304];
big5add[toIdx(1135)] = [0x00CA, 0x030C];
Expand Down
4 changes: 4 additions & 0 deletions test/big5-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,8 @@ describe("Big5 tests", function() {
// 十 must encode to the regular Big5 code a451.
it("Big5 correctly encodes 十", function() {
    var encodedHex = iconv.encode("十", "big5").toString('hex');
    assert.strictEqual(encodedHex, "a451");
});

// Regression test for issue #264: 起 must encode to b05f.
it("Big5 correctly encodes 起 (issue #264)", function() {
    var encodedHex = iconv.encode("起", "big5").toString('hex');
    assert.strictEqual(encodedHex, "b05f");
});
});

0 comments on commit ed88711

Please sign in to comment.