more work on parser, getting UTF-8 unescaping working

brianmario · Aug 3, 2009 · 36f20ec · 36f20ec
1 parent c0aec93
commit 36f20ec
Show file tree

Hide file tree

Showing 4 changed files with 72 additions and 19 deletions.
diff --git a/benchmark/parser.rb b/benchmark/parser.rb
@@ -14,21 +14,21 @@
 Benchmark.bm do |x|
   puts "yajl-ruby"
   x.report do
-    Yajl::Parser.parse(json)
+    yajl = Yajl::Parser.parse(json)
   end
 
   puts "JsonMachine"
   x.report do
-    JsonMachine::Parser.new.parse(json)
+    json_machine = JsonMachine::Parser.new.parse(json)
   end
 
-  puts "JSON (pure)"
-  x.report do
-    JSON.parse(json, :max_nesting => false)
-  end
+  # puts "JSON (pure)"
+  # x.report do
+  #   JSON.parse(json, :max_nesting => false)
+  # end
 
   # puts "ActiveSupport"
   # x.report do
   #   active_support = ActiveSupport::JSON.decode(json)
   # end
-end
+end
diff --git a/lib/core_ext/string.rb b/lib/core_ext/string.rb
@@ -0,0 +1,60 @@
+# encoding: UTF-8
+
+require 'strscan'
+
+class String
+
+  # Decodes JSON-style "\uXXXX" escape sequences into UTF-8 characters.
+  #
+  # The decoding rules were ported from Yajl (http://github.com/lloyd/yajl).
+  # Neither Ruby 1.8's Iconv class nor 1.9's native Unicode support seemed to
+  # handle surrogate pairs (I may be wrong about that): the example below
+  # raised an exception on every attempt to decode it, while Yajl decodes it
+  # fine, so its logic lives here in pure Ruby.
+  #
+  # Rules:
+  # * "\uXXXX" (exactly four hex digits) becomes the UTF-8 encoding of that
+  #   code point.
+  # * A high surrogate (\uD800-\uDBFF) immediately followed by a low
+  #   surrogate (\uDC00-\uDFFF) is combined into a single code point.
+  # * An unpaired surrogate is replaced by "?", as Yajl does, instead of
+  #   producing invalid UTF-8.
+  # * Everything else, including malformed escapes such as "\u12", is copied
+  #   through unchanged.
+  #
+  # Takes an escaped string such as:
+  #   "\u004d\u0430\u4e8c\ud800\udf02"
+  #
+  # And returns a new unescaped UTF-8 string like:
+  #   Mа二𐌂
+  #
+  # Returns self when the string contains no "\u" sequence at all, otherwise
+  # a new String.
+  def unescape_utf8
+    return self unless include?("\\u")
+
+    decoded = String.new("")
+    scanner = StringScanner.new(self)
+
+    until scanner.eos?
+      if scanner.scan(/\\u([0-9a-fA-F]{4})/)
+        codepoint = scanner[1].to_i(16)
+
+        if (codepoint & 0xFC00) == 0xD800
+          # High surrogate: only valid when a low surrogate escape follows.
+          # If it does not, the following text is left for the next pass.
+          if scanner.scan(/\\u([dD][c-fC-F][0-9a-fA-F]{2})/)
+            low = scanner[1].to_i(16)
+            codepoint = 0x10000 + ((codepoint & 0x3FF) << 10) + (low & 0x3FF)
+          else
+            codepoint = nil
+          end
+        elsif (codepoint & 0xFC00) == 0xDC00
+          # A low surrogate with no preceding high surrogate is unpaired.
+          codepoint = nil
+        end
+
+        # pack("U") emits the UTF-8 bytes for the code point. Appending raw
+        # integers with << would append whole characters on Ruby 1.9+.
+        decoded << (codepoint ? [codepoint].pack("U") : "?")
+      else
+        # Copy literal text verbatim: a run of non-backslash characters, or
+        # a single backslash that does not start a valid \uXXXX escape.
+        decoded << (scanner.scan(/[^\\]+/) || scanner.getch)
+      end
+    end
+
+    decoded
+  end
+end
diff --git a/lib/json_machine.rb b/lib/json_machine.rb
@@ -1,5 +1,6 @@
 # encoding: UTF-8
 
+require File.join(File.dirname(__FILE__), 'core_ext', 'string')
 require File.join(File.dirname(__FILE__), 'json_machine', 'parser')
 require File.join(File.dirname(__FILE__), 'json_machine', 'encoder')
 

diff --git a/lib/json_machine/parser.rb b/lib/json_machine/parser.rb
@@ -1,4 +1,5 @@
 # encoding: UTF-8
+
 require 'strscan'
 
 module JsonMachine
@@ -117,18 +118,9 @@ def internal_parse(str)
             # grabs the contents of a string between " and ", even escaped strings
             scanner.pos += 1 # don't need the wrapping " char
             current = scanner.scan_until(/\"|\\\".+\"/m)
-            current.gsub!(/\\[\\bfnrt]/) { |match| u if u = UNESCAPE_MAP[$&[1]] }
-            current.gsub!(/\\([\\\/]|u[[:xdigit:]]{4})/) do
-              ustr = $1
-              if ustr[0,1] == 'u'
-                [ustr[1..-1].to_i(16)].pack("U")
-              elsif ustr == '\\'
-                '\\\\'
-              else
-                ustr
-              end
-            end
-            current = current[0,current.size-1]
+            current.gsub!(/\\[\\bfnrt]/) { |match| match if match = UNESCAPE_MAP[$&[1]] }
+            current = current.unescape_utf8
+            current = current[0,current.size-1] if current[current.size-1,1] == "\""
             if @state == :wants_hash_key
               found_hash_key(current)
             else