Fix some frmdump encoding bugs

- Add a pycharset() method to the frm Charset instance to map between MySQL and Python character sets.
- Fix decoding of all table-level attributes to use utf-8 rather than incorrectly assuming the table character set.
- Fix decoding of enum/set labels when the column character set is multibyte. In this case, MySQL uses a hex encoding for the label string, which needs to be decoded before applying any character set decoding.

Fixes #97
abg · Dec 8, 2016 · ed59818 · ed59818
1 parent 6e3618b
commit ed59818
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 10 deletions.
diff --git a/dbsake/core/mysql/frm/binaryfrm.py b/dbsake/core/mysql/frm/binaryfrm.py
@@ -6,6 +6,7 @@
 """
 from __future__ import unicode_literals
 
+import binascii
 import collections
 import errno
 import itertools
@@ -135,13 +136,13 @@ def from_data(cls, data, context):
         if extrasize:
             if extrainfo.tell() < extrasize:
                 connection = extrainfo.bytes_prefix16()
-                connection = connection.decode(charset.name)
+                connection = connection.decode('utf-8')
             if extrainfo.tell() < extrasize:
                 engine = extrainfo.bytes_prefix16()
-                engine = engine.decode(charset.name)
+                engine = engine.decode('utf-8')
             if extrainfo.tell() < extrasize:
                 partition_info = extrainfo.bytes_prefix32()
-                partition_info = partition_info.decode(charset.name)
+                partition_info = partition_info.decode('utf-8')
             extrainfo.skip(2)  # skip null + autopartition flag
 
         if not engine:
@@ -243,7 +244,7 @@ def unpack_column_attributes(*args, **kwargs):
 
 
 def unpack_column_names(names):
-    return tuple(name.decode('utf8') for name in names[1:-2].split(b'\xff'))
+    return tuple(name.decode('utf-8') for name in names[1:-2].split(b'\xff'))
 
 
 def unpack_column_labels(labels):
@@ -253,7 +254,7 @@ def unpack_column_labels(labels):
     Returns a tuple of tuples
     """
     return tuple(
-        tuple(name.decode('utf8') for name in group[1:-1].split(b'\xff'))
+        tuple(name for name in group[1:-1].split(b'\xff'))
         for group in labels[:-1].split(b'\x00')
     )
 
@@ -313,6 +314,13 @@ def unpack_columns(packed_columns, table):
         charset = charsets.lookup(charset_id)
         context.update(subtype_code=subtype_code, charset=charset)
 
+        if context.labels:
+            if charset.name in ('ucs2', 'utf16', 'utf16le', 'utf32'):
+                context.labels = tuple(binascii.unhexlify(val)
+                                       for val in context.labels)
+
+            context.update(labels=tuple(value.decode(charset.pycharset())
+                                        for value in context.labels))
         with defaults.offset(defaults_offset):
             default = mysqltypes.unpack_default(defaults, context)
         comment = comments.read(comment_length).decode('utf-8')

diff --git a/dbsake/core/mysql/frm/charsets.py b/dbsake/core/mysql/frm/charsets.py
@@ -10,8 +10,34 @@
 
 import collections
 
-Charset = collections.namedtuple('Charset',
-                                 'id name collation maxlen is_default')
+_mysql_to_py_charset = {
+    'armscii8': NotImplementedError,
+    'binary':   NotImplementedError,
+    'dec8':     NotImplementedError,
+    'eucjpms':  NotImplementedError,
+    'geostd8':  NotImplementedError,
+    'hp8':      NotImplementedError,
+    'keybcs2':  NotImplementedError,
+    'koi8r':    'koi8-r',
+    'koi8u':    'koi8-u',
+    'macce':    'maccentraleurope',
+    'swe7':     NotImplementedError,
+    'ucs2':     'utf-16-be',
+    'utf8mb4':  'utf-8',
+    'utf16le':  'utf-16-le',
+    'utf16':    'utf-16-be',
+    'utf32':    'utf-32-be',
+}
+
+class Charset(collections.namedtuple('Charset',
+                                     'id name collation maxlen is_default')):
+    def pycharset(self):
+        charset = _mysql_to_py_charset.get(self.name, self.name)
+        if charset is NotImplementedError:
+            raise NotImplementedError("Unsupported character set '%s'" %
+                                      (self.name,))
+        return charset
+
 
 CHARSETS = {
     32: Charset(id=32,

diff --git a/dbsake/core/mysql/frm/mysqltypes.py b/dbsake/core/mysql/frm/mysqltypes.py
@@ -704,7 +704,9 @@ def unpack_type_varchar(defaults, context):
         length = defaults.uint8()
     else:
         length = defaults.uint16()
-    return "'%s'" % defaults.read(length).decode(context.charset.name)
+    data = defaults.read(length)
+
+    return "'%s'" % data.decode(context.charset.pycharset())
 
 
 # This is the 4.1 varchar type, but with trailing whitespace
@@ -715,14 +717,14 @@ def unpack_type_varchar(defaults, context):
 def unpack_type_var_string(defaults, context):
     """Unpack a MySQL 4.1 VARCHAR(N) default value"""
     data = defaults.read(context.length)
-    return "'%s'" % data.decode(context.charset.name).rstrip(' ')
+    return "'%s'" % data.decode(context.charset.pycharset()).rstrip(' ')
 
 
 def unpack_type_string(defaults, context):
     """Unpack a CHAR(N) fixed length string"""
     # Trailing spaces are always stripped for CHAR fields
     bytestr = defaults.read(context.length)
-    return "'%s'" % bytestr.decode(context.charset.name).rstrip(' ')
+    return "'%s'" % bytestr.decode(context.charset.pycharset()).rstrip(' ')
 
 
 # MySQL BIT(m) type