Skip to content

Commit

Permalink
Fix some frmdump encoding bugs
Browse files Browse the repository at this point in the history
- Add pycharset() method to the frm Charset instance to map between MySQL
  and Python character sets

- Fix decoding of all table level attributes to use utf-8 rather than
  incorrectly assuming the table character set.

- Fix decoding of enum/set labels when the column character set is multibyte.
  In this case, MySQL uses a hex encoding for the label string, which needs to
  be decoded before applying any character set decoding.

Fixes #97
  • Loading branch information
abg committed Dec 8, 2016
1 parent 6e3618b commit ed59818
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 10 deletions.
18 changes: 13 additions & 5 deletions dbsake/core/mysql/frm/binaryfrm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""
from __future__ import unicode_literals

import binascii
import collections
import errno
import itertools
Expand Down Expand Up @@ -135,13 +136,13 @@ def from_data(cls, data, context):
if extrasize:
if extrainfo.tell() < extrasize:
connection = extrainfo.bytes_prefix16()
connection = connection.decode(charset.name)
connection = connection.decode('utf-8')
if extrainfo.tell() < extrasize:
engine = extrainfo.bytes_prefix16()
engine = engine.decode(charset.name)
engine = engine.decode('utf-8')
if extrainfo.tell() < extrasize:
partition_info = extrainfo.bytes_prefix32()
partition_info = partition_info.decode(charset.name)
partition_info = partition_info.decode('utf-8')
extrainfo.skip(2) # skip null + autopartition flag

if not engine:
Expand Down Expand Up @@ -243,7 +244,7 @@ def unpack_column_attributes(*args, **kwargs):


def unpack_column_names(names):
return tuple(name.decode('utf8') for name in names[1:-2].split(b'\xff'))
return tuple(name.decode('utf-8') for name in names[1:-2].split(b'\xff'))


def unpack_column_labels(labels):
Expand All @@ -253,7 +254,7 @@ def unpack_column_labels(labels):
Returns a tuple of tuples
"""
return tuple(
tuple(name.decode('utf8') for name in group[1:-1].split(b'\xff'))
tuple(name for name in group[1:-1].split(b'\xff'))
for group in labels[:-1].split(b'\x00')
)

Expand Down Expand Up @@ -313,6 +314,13 @@ def unpack_columns(packed_columns, table):
charset = charsets.lookup(charset_id)
context.update(subtype_code=subtype_code, charset=charset)

if context.labels:
if charset.name in ('ucs2', 'utf16', 'utf16le', 'utf32'):
context.labels = tuple(binascii.unhexlify(val)
for val in context.labels)

context.update(labels=tuple(value.decode(charset.pycharset())
for value in context.labels))
with defaults.offset(defaults_offset):
default = mysqltypes.unpack_default(defaults, context)
comment = comments.read(comment_length).decode('utf-8')
Expand Down
30 changes: 28 additions & 2 deletions dbsake/core/mysql/frm/charsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,34 @@

import collections

Charset = collections.namedtuple('Charset',
'id name collation maxlen is_default')
# Maps a MySQL character set name to the Python codec name used to decode it.
# Names that are absent from this table are handed to Python unchanged (see
# Charset.pycharset).  A value of NotImplementedError marks a character set
# for which pycharset() refuses to return a codec.
_mysql_to_py_charset = {
    'armscii8': NotImplementedError,
    'binary': NotImplementedError,
    'dec8': NotImplementedError,
    'eucjpms': NotImplementedError,
    'geostd8': NotImplementedError,
    'hp8': NotImplementedError,
    'keybcs2': NotImplementedError,
    'koi8r': 'koi8-r',
    'koi8u': 'koi8-u',
    'macce': 'maccentraleurope',
    'swe7': NotImplementedError,
    'ucs2': 'utf-16-be',
    # 'utf8mb3' is the name newer MySQL releases use for the 3-byte 'utf8'
    # character set; Python has no codec registered under that name.
    'utf8mb3': 'utf-8',
    'utf8mb4': 'utf-8',
    'utf16le': 'utf-16-le',
    'utf16': 'utf-16-be',
    'utf32': 'utf-32-be',
}


class Charset(collections.namedtuple('Charset',
                                     'id name collation maxlen is_default')):
    """A MySQL character set / collation record.

    Fields: ``id``, ``name``, ``collation``, ``maxlen``, ``is_default``.
    """

    def pycharset(self):
        """Return the Python codec name to use for this character set.

        The MySQL name is translated through ``_mysql_to_py_charset``; a name
        with no entry in that table is returned as-is.

        :raises NotImplementedError: if the table marks this character set
                                     as unsupported
        """
        charset = _mysql_to_py_charset.get(self.name, self.name)
        if charset is NotImplementedError:
            raise NotImplementedError("Unsupported character set '%s'" %
                                      (self.name,))
        return charset


CHARSETS = {
32: Charset(id=32,
Expand Down
8 changes: 5 additions & 3 deletions dbsake/core/mysql/frm/mysqltypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -704,7 +704,9 @@ def unpack_type_varchar(defaults, context):
length = defaults.uint8()
else:
length = defaults.uint16()
return "'%s'" % defaults.read(length).decode(context.charset.name)
data = defaults.read(length)

return "'%s'" % data.decode(context.charset.pycharset())


# This is the 4.1 varchar type, but with trailing whitespace
Expand All @@ -715,14 +717,14 @@ def unpack_type_varchar(defaults, context):
def unpack_type_var_string(defaults, context):
"""Unpack a MySQL 4.1 VARCHAR(N) default value"""
data = defaults.read(context.length)
return "'%s'" % data.decode(context.charset.name).rstrip(' ')
return "'%s'" % data.decode(context.charset.pycharset()).rstrip(' ')


def unpack_type_string(defaults, context):
"""Unpack a CHAR(N) fixed length string"""
# Trailing spaces are always stripped for CHAR fields
bytestr = defaults.read(context.length)
return "'%s'" % bytestr.decode(context.charset.name).rstrip(' ')
return "'%s'" % bytestr.decode(context.charset.pycharset()).rstrip(' ')


# MySQL BIT(m) type
Expand Down

0 comments on commit ed59818

Please sign in to comment.