Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

present an api

  • Loading branch information...
commit 81e2bd945a62c3865e9e62e82d9cadb926c0ab77 1 parent fad3a1c
@arlolra authored
Showing with 102 additions and 49 deletions.
  1. +42 −44 cesu-8.js
  2. +4 −4 package.json
  3. +44 −1 readme.md
  4. +12 −0 test.js
View
86 cesu-8.js
@@ -6,6 +6,7 @@ const Iconv = require('iconv').Iconv
// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/String/fromCharCode
function fixedFromCharCode(codePt) {
+
if (codePt > 0xFFFF) {
codePt -= 0x10000
return String.fromCharCode(
@@ -17,62 +18,59 @@ function fixedFromCharCode(codePt) {
}
-// emoji 🍨 🍩 🍪
-var arr = [
- 101, 109,111, 106, 105, 32, 240, 159, 141, 168, 32,
- 240, 159, 141, 169, 32, 240, 159, 141, 170, 10
-]
+var iconv16 = new Iconv('UTF-8', 'UTF-16')
+ , iconv8 = new Iconv('UTF-16', 'UTF-8')
-// do this w/o iconv
-var iconv = new Iconv('UTF-8', 'UTF-16')
- , buf = iconv.convert(new Buffer(arr))
+module.exports = {
+ toString: function (buf) {
-var i = 0
- , len = buf.length
- , last = ''
- , string = ''
+ // accepts a utf-8 encoded buffer
+ if (!Buffer.isBuffer(buf)) return
-for (; i < len; i++) {
- last += buf[i].toString(16)
- if (i % 2 != 0) {
- string += fixedFromCharCode(parseInt('0x' + last))
- last = ''
- }
-}
+ // do this w/o iconv
+ var utf16buf = iconv16.convert(buf)
+ var i = 0
+ , len = utf16buf.length
+ , last = ''
+ , string = ''
+ for (; i < len; i++) {
+ last += utf16buf[i].toString(16)
+ if (i % 2 != 0) {
+ string += fixedFromCharCode(parseInt('0x' + last))
+ last = ''
+ }
+ }
-// we now have a string that we can do whatever with
-// and should be fine. no different than in the browser.
-// when you're done with it, convert it back to utf-8
+ return string
+ },
+ toBuffer: function (string) {
+ var i = 0
+ , len = string.length
+ , utf16buf = new Buffer(len * 2)
+ , hex, ii, arr, j
-// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/String/charCodeAt
+ for (; i < len; i++) {
-function fixedCharCodeAt(str, idx) {
- idx = idx || 0
- var code = str.charCodeAt(idx)
- var hi, low
- if (0xD800 <= code && code <= 0xDBFF) {
- hi = code
- low = str.charCodeAt(idx+1)
- if (isNaN(low)) {
- throw 'High surrogate not followed by low surrogate'
- }
- return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
- }
- if (0xDC00 <= code && code <= 0xDFFF) {
- return false
- }
- return code
-}
+ hex = string.charCodeAt(i).toString(16)
+
+ hex = hex.split('')
+ j = hex.length
+ for (; j < 4; j++) {
+ hex.unshift('0')
+ }
+ ii = i * 2
+ utf16buf[ii] = parseInt('0x' + hex.slice(0, 2).join(''))
+ utf16buf[ii + 1] = parseInt('0x' + hex.slice(2, 4).join(''))
-i = 0
-len = string.length
+ }
+
+ return iconv8.convert(utf16buf)
+ }
-for (; i < len; i++) {
- console.log(fixedCharCodeAt(string, i).toString(16))
}
View
8 package.json
@@ -2,16 +2,16 @@
"author": "Arlo Breault <arlolra@gmail.com>",
"name": "cesu-8",
"description": "Convert a UTF-8 buffer to a CESU-8 string",
+ "keywords": ["utf-8", "utf8", "cesu-8", "emoji", "surrogate pair"],
"version": "0.0.1",
"repository": {
- "url": ""
+ "url": "https://github.com/arlolra/node-cesu-8"
},
"engines": {
- "node": "0.6.x"
+ "node": "*"
},
"dependencies": {
"iconv": "1.1.3"
},
- "devDependencies": {},
- "optionalDependencies": {}
+ "main": "cesu-8.js"
}
View
45 readme.md
@@ -1,3 +1,7 @@
+CESU-8 Encoding for Node.js
+===========================
+
+
An Explanation
--------------
@@ -20,4 +24,43 @@ http://www.unicode.org/reports/tr26/
This library shows that you can take a UTF-8 encoded buffer, convert it to UTF-16 with [node-iconv](https://github.com/bnoordhuis/node-iconv) and then build a string with the surrogate pairs. There are a few issues with this: for example, `String.length` will be one longer than expected, but that's the way it should work, and it is how it works in the browser. Otherwise, parsing JSON, the original use case, will be fine.
-This can probably be done purely in Javascript, ie. no libiconv, or even pushed down to C++, as a new encoding, CESU-8.
+This can probably be done purely in JavaScript, i.e. without libiconv, or even pushed down to C++ as a new encoding, CESU-8.
+
+
+Install
+-------
+
+With everyone's favourite package manager,
+
+ npm install cesu-8
+
+
+Example Usage
+-------------
+
+ var cesu = require('cesu-8')
+
+ // emoji 🍨 🍩 🍪
+ var arr = [
+ 101, 109, 111, 106, 105, 32, 240, 159, 141, 168,
+ 32, 240, 159, 141, 169, 32, 240, 159, 141, 170
+ ] // these are the octets for the above string in utf8
+
+ var utf8buffer = new Buffer(arr)
+
+ var mystring = cesu.toString(utf8buffer)
+
+ // do whatever you want with the string
+ // have fun
+ // seriously
+
+ // convert it back to a buffer
+ var backtobuf = cesu.toBuffer(mystring)
+ process.stdout.write(backtobuf)
+
+
+Todo
+----
+
+- Find out what the first char is: `0xFEFF` (probably the UTF-16 byte order mark emitted by iconv)
+- Remove iconv dependency.
View
12 test.js
@@ -0,0 +1,12 @@
+var cesu = require('./cesu-8.js')
+
+// emoji 🍨 🍩 🍪
+var arr = [
+ 101, 109, 111, 106, 105, 32, 240, 159, 141, 168,
+ 32, 240, 159, 141, 169, 32, 240, 159, 141, 170, 10
+]
+
+var string = cesu.toString(new Buffer(arr))
+var buf = cesu.toBuffer(string)
+
+process.stdout.write(buf)
Please sign in to comment.
Something went wrong with that request. Please try again.