diff --git a/androguard/cli/main.py b/androguard/cli/main.py index df85d4886..2ffc2549b 100644 --- a/androguard/cli/main.py +++ b/androguard/cli/main.py @@ -240,7 +240,7 @@ def export_apps_to_format(filename, continue # Current Folder to write to - filename_class = valid_class_name(method.get_class_name()) + filename_class = valid_class_name(method.get_class_name().string) filename_class = os.path.join(output, filename_class) create_directory(filename_class) @@ -257,10 +257,10 @@ def export_apps_to_format(filename, method2format(filename + "." + form, form, None, buff) # Write the Java file for the whole class - if method.get_class_name() not in dump_classes: + if method.get_class_name().string not in dump_classes: print("source codes ...", end=' ') current_class = vm.get_class(method.get_class_name()) - current_filename_class = valid_class_name(current_class.get_name()) + current_filename_class = valid_class_name(current_class.get_name().string) current_filename_class = os.path.join(output, current_filename_class + ".java") with open(current_filename_class, "w") as fd: diff --git a/androguard/core/analysis/analysis.py b/androguard/core/analysis/analysis.py index 2eb04b3e7..3e829faff 100644 --- a/androguard/core/analysis/analysis.py +++ b/androguard/core/analysis/analysis.py @@ -5,7 +5,7 @@ from androguard.core.androconf import is_ascii_problem, load_api_specific_resource_module from androguard.core.bytecodes import dvm import logging -from androguard.core import bytecode +from androguard.core import bytecode, mutf8 import networkx as nx from enum import IntEnum @@ -789,7 +789,7 @@ def get_method(self, name, descriptor): :param descriptor: method descriptor, for example `'(I)V'` :return: :class:`ExternalMethod` """ - key = name + str(descriptor) + key = name + mutf8.MUTF8String.join(descriptor) if key not in self.methods: self.methods[key] = ExternalMethod(self.name, name, descriptor) @@ -818,7 +818,7 @@ def get_class_name(self): return self.class_name def get_descriptor(self): - return ''.join(self.descriptor) + return mutf8.MUTF8String.join(self.descriptor) @property def full_name(self): @@ -837,7 +837,7 @@ def get_access_flags_string(self): return "" def __str__(self): - return "{}->{}{}".format(self.class_name, self.name, ''.join(self.descriptor)) + return "{}->{}{}".format(self.class_name.__str__(), self.name.__str__(), mutf8.MUTF8String.join(self.descriptor).string) def __repr__(self): return "".format(self.__str__()) @@ -988,7 +988,7 @@ def get_fake_method(self, name, descriptor): # We are searching an unknown method in this class # It could be something that the class herits - key = name + str(descriptor) + key = name + mutf8.MUTF8String.join(descriptor) if key not in self._inherits_methods: self._inherits_methods[key] = ExternalMethod(self.orig_class.get_name(), name, descriptor) return self._inherits_methods[key] @@ -1277,14 +1277,14 @@ def _create_xref(self, current_class): continue class_info = method_info[0].lstrip('[') - if class_info[0] != 'L': + if class_info[0] != b'L': # Need to make sure, that we get class types and not other types continue method_item = None # TODO: should create get_method_descriptor inside Analysis for vm in self.vms: - method_item = vm.get_method_descriptor(class_info, method_info[1], ''.join(method_info[2])) + method_item = vm.get_method_descriptor(class_info, method_info[1], mutf8.MUTF8String.join(method_info[2])) if method_item: break @@ -1501,10 +1501,11 @@ def find_classes(self, name=".*", no_external=False): :param no_external: Remove external classes from the output (default False) :rtype: Iterator[ClassAnalysis] """ + name = mutf8.MUTF8String.from_str(name).bytes for cname, c in self.classes.items(): if no_external and isinstance(c.get_vm_class(), ExternalClass): continue - if re.match(name, cname): + if re.match(name, cname.bytes): yield c def find_methods(self, classname=".*", methodname=".*", descriptor=".*", @@ -1521,8 +1522,11 @@ def find_methods(self, classname=".*", methodname=".*", descriptor=".*", :param no_external: Remove external method from the output (default False) :rtype: Iterator[MethodClassAnalysis] """ + classname = mutf8.MUTF8String.from_str(classname).bytes + methodname = mutf8.MUTF8String.from_str(methodname).bytes + descriptor = mutf8.MUTF8String.from_str(descriptor).bytes for cname, c in self.classes.items(): - if re.match(classname, cname): + if re.match(classname, cname.bytes): for m in c.get_methods(): z = m.get_method() # TODO is it even possible that an internal class has @@ -1530,8 +1534,8 @@ def find_methods(self, classname=".*", methodname=".*", descriptor=".*", # instead... if no_external and isinstance(z, ExternalMethod): continue - if re.match(methodname, z.get_name()) and \ - re.match(descriptor, z.get_descriptor()) and \ + if re.match(methodname, z.get_name().bytes) and \ + re.match(descriptor, z.get_descriptor().bytes) and \ re.match(accessflags, z.get_access_flags_string()): yield m @@ -1542,8 +1546,9 @@ def find_strings(self, string=".*"): :param string: regular expression for the string to search for :rtype: Iterator[StringAnalysis] """ + string = mutf8.MUTF8String.from_str(string).bytes for s, sa in self.strings.items(): - if re.match(string, s): + if re.match(string, s.bytes): yield sa def find_fields(self, classname=".*", fieldname=".*", fieldtype=".*", accessflags=".*"): @@ -1556,12 +1561,15 @@ def find_fields(self, classname=".*", fieldname=".*", fieldtype=".*", accessflag :param accessflags: regular expression of the access flags :rtype: Iterator[FieldClassAnalysis] """ + classname = mutf8.MUTF8String.from_str(classname).bytes + fieldname = mutf8.MUTF8String.from_str(fieldname).bytes + fieldtype = mutf8.MUTF8String.from_str(fieldtype).bytes for cname, c in self.classes.items(): - if re.match(classname, cname): + if re.match(classname, cname.bytes): for f in c.get_fields(): z = f.get_field() - if re.match(fieldname, z.get_name()) and \ - re.match(fieldtype, z.get_descriptor()) and \ + if re.match(fieldname, z.get_name().bytes) and \ + re.match(fieldtype, z.get_descriptor().bytes) and \ re.match(accessflags, z.get_access_flags_string()): yield f diff --git a/androguard/core/bytecode.py b/androguard/core/bytecode.py index 354cd1252..9baea5580 100644 --- a/androguard/core/bytecode.py +++ b/androguard/core/bytecode.py @@ -593,60 +593,6 @@ def method2json_direct(mx): return json.dumps(d) -class SV: - - def __init__(self, size, buff): - self.__size = size - self.__value = unpack(self.__size, buff)[0] - - def _get(self): - return pack(self.__size, self.__value) - - def __str__(self): - return "0x%x" % self.__value - - def __int__(self): - return self.__value - - def get_value_buff(self): - return self._get() - - def get_value(self): - return self.__value - - def set_value(self, attr): - self.__value = attr - - -class SVs: - - def __init__(self, size, ntuple, buff): - self.__size = size - - self.__value = ntuple._make(unpack(self.__size, buff)) - - def _get(self): - l = [] - for i in self.__value._fields: - l.append(getattr(self.__value, i)) - return pack(self.__size, *l) - - def _export(self): - return [x for x in self.__value._fields] - - def get_value_buff(self): - return self._get() - - def get_value(self): - return self.__value - - def set_value(self, attr): - self.__value = self.__value._replace(**attr) - - def __str__(self): - return self.__value.__str__() - - def object_to_bytes(obj): """ Convert a object to a bytearray or call get_raw() of the object @@ -787,9 +733,6 @@ def readat(self, off): :param int off: starting offset :rtype: bytearray """ - if isinstance(off, SV): - off = off.value - return self.__buff[off:] def read(self, size): @@ -800,9 +743,6 @@ def read(self, size): :param int size: length of bytes to read :rtype: bytearray """ - if isinstance(size, SV): - size = size.value - buff = self.__buff[self.__idx:self.__idx + size] self.__idx += size diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py index cf5e563c8..19c3bcf15 100644 --- a/androguard/core/bytecodes/dvm.py +++ b/androguard/core/bytecodes/dvm.py @@ -2,7 +2,7 @@ from androguard.core.bytecodes.apk import APK from androguard.core.androconf import CONF -from androguard.core.bytecodes import mutf8 +from androguard.core import mutf8 from androguard.core.bytecodes.dvm_types import TypeMapItem, ACCESS_FLAGS, TYPE_DESCRIPTOR import sys @@ -97,17 +97,21 @@ class InvalidInstruction(Error): def read_null_terminated_string(f): """ Read a null terminated string from a file-like object. - :param f: file-like object :rtype: bytearray """ - x = bytearray() + x = [] while True: - z = f.read(1) - if ord(z) == 0: - return x + z = f.read(128) + if 0 in z: + s = z.split(b'\x00',1) + x.append(s[0]) + idx = f.get_idx() + f.set_idx(idx - len(s[1])) + break else: - x.append(ord(z)) + x.append(z) + return b''.join(x) def get_access_flags_string(value): @@ -194,11 +198,11 @@ def static_operand_instruction(instruction): def get_sbyte(buff): - return unpack('=b', bytearray(buff.read(1)))[0] + return unpack('=b', buff.read(1))[0] def get_byte(buff): - return unpack('=B', bytearray(buff.read(1)))[0] + return unpack('=B', buff.read(1))[0] def readuleb128(buff): @@ -446,11 +450,9 @@ def determineException(vm, m): class HeaderItem: """ This class can parse an header_item of a dex file. - Several checks are performed to detect if this is not an header_item. Also the Adler32 checksum of the file is calculated in order to detect file corruption. - :param buff: a string which represents a Buff object of the header_item :type androguard.core.bytecode.BuffHandle buff: Buff object :param cm: a ClassManager object @@ -1887,32 +1889,11 @@ def set_off(self, off): def get_off(self): return self.offset - def get_unicode(self): - """ - Returns an Unicode String - This is the actual string. Beware that some strings might be not - decodeable with usual UTF-16 decoder, as they use surrogates that are - not supported by python. - """ - s = mutf8.decode(self.data) - if len(s) != self.utf16_size: - raise ValueError("UTF16 Length does not match!") - - # Return a UTF16 String - return s - def get(self): """ - Returns a printable string. - In this case, all lonely surrogates are escaped, thus are represented in the - string as 6 characters: \\ud853 - Valid surrogates are encoded as 32bit values, ie. \U00024f5c. + Returns a MUTF8String object """ - s = mutf8.decode(self.data) - if len(s) != self.utf16_size: - raise ValueError("UTF16 Length does not match!") - # log.debug("Decoding UTF16 string with IDX {}, utf16 length {} and hexdata '{}'.".format(self.offset, self.utf16_size, binascii.hexlify(self.data))) - return mutf8.patch_string(s) + return mutf8.MUTF8String.from_bytes(self.data) def show(self): bytecode._PrintSubBanner("String Data Item") @@ -2163,11 +2144,11 @@ def get_parameters_off_value(self): """ Return the string associated to the parameters_off - :rtype: string + :rtype: MUTF8String """ if self.parameters_off_value is None: params = self.CM.get_type_list(self.parameters_off) - self.parameters_off_value = '({})'.format(' '.join(params)) + self.parameters_off_value = mutf8.MUTF8String.from_bytes(b'(') + mutf8.MUTF8String.join(params, spacing=b' ') + mutf8.MUTF8String.from_bytes(b')') return self.parameters_off_value def show(self): @@ -2680,7 +2661,7 @@ def reload(self): name = self.CM.get_field(self.field_idx) self.class_name = name[0] self.name = name[2] - self.proto = ''.join(i for i in name[1]) + self.proto = name[1] def set_init_value(self, value): """ @@ -2916,7 +2897,7 @@ def reload(self): if v and len(v) >= 3: self.class_name = v[0] self.name = v[1] - self.proto = ''.join(i for i in v[2]) + self.proto = mutf8.MUTF8String.join(i for i in v[2]) else: self.class_name = 'CLASS_NAME_ERROR' self.name = 'NAME_ERROR' @@ -2993,7 +2974,7 @@ def __str__(self): @property def full_name(self): """Return class_name + name + descriptor, separated by spaces (no access flags""" - return " ".join([self.class_name, self.name, self.get_descriptor()]) + return mutf8.MUTF8String.join([self.class_name, self.name, self.get_descriptor()], spacing=b' ') def get_short_string(self): """ @@ -3024,11 +3005,11 @@ def _fmt_classname(cls): cls = cls.rsplit("/", 1)[1] return arr + cls - clsname = _fmt_classname(self.get_class_name()) + clsname = _fmt_classname(self.get_class_name().string) - param, ret = self.get_descriptor()[1:].split(")") + param, ret = self.get_descriptor().string[1:].split(")") params = map(_fmt_classname, param.split(" ")) - desc = "({}){}".format(" ".join(params), _fmt_classname(ret)) + desc = "({}){}".format(mutf8.MUTF8String.join(params), _fmt_classname(ret)) return "{cls} {meth} {desc}".format(cls=clsname, meth=self.get_name(), desc=desc) @@ -7964,11 +7945,12 @@ def get_method(self, name): :rtype: a list with all :class:`EncodedMethod` objects """ # TODO could use a generator here - prog = re.compile(name) + name = mutf8.MUTF8String.from_str(name) + prog = re.compile(name.bytes) l = [] for i in self.get_classes(): for j in i.get_methods(): - if prog.match(j.get_name()): + if prog.match(j.get_name().bytes): l.append(j) return l @@ -7981,11 +7963,12 @@ def get_field(self, name): :rtype: a list with all :class:`EncodedField` objects """ # TODO could use a generator here - prog = re.compile(name) + name = mutf8.MUTF8String.from_str(name) + prog = re.compile(name.bytes) l = [] for i in self.get_classes(): for j in i.get_fields(): - if prog.match(j.get_name()): + if prog.match(j.get_name().bytes): l.append(j) return l @@ -8156,27 +8139,6 @@ def get_field_descriptor(self, class_name, field_name, descriptor): return self.__cache_fields.get(key) - def get_strings_unicode(self): - """ - Return all strings - - This method will return pure UTF-16 strings. This is the "exact" same string as used in Java. - Those strings can be problematic for python, as they can contain surrogates as well as "broken" - surrogate pairs, ie single high or low surrogates. - Such a string can for example not be printed. - To avoid such problems, there is an escape mechanism to detect such lonely surrogates - and escape them in the string. Of course, this results in a different string than in the Java Source! - - Use `get_strings()` as a general purpose and `get_strings_unicode()` if you require the exact string - from the Java Source. - You can always escape the string from `get_strings_unicode()` using the function - :meth:`androguard.core.bytecodes.mutf8.patch_string` - - :rtype: a list with all strings used in the format (types, names ...) - """ - for i in self.strings: - yield i.get_unicode() - def get_strings(self): """ Return all strings @@ -8228,7 +8190,7 @@ def _delete_python_export_class(self, _class): def _create_python_export_class(self, _class, delete=False): if _class is not None: ### Class - name = bytecode.FormatClassToPython(_class.get_name()) + name = bytecode.FormatClassToPython(_class.get_name()).string if delete: delattr(self.C, name) return @@ -8252,13 +8214,13 @@ def _create_python_export_methods(self, _class, delete): for i in m: if len(m[i]) == 1: j = m[i][0] - name = bytecode.FormatNameToPython(j.get_name()) + name = bytecode.FormatNameToPython(j.get_name()).string setattr(_class.M, name, j) else: for j in m[i]: name = ( bytecode.FormatNameToPython(j.get_name()) + "_" + - bytecode.FormatDescriptorToPython(j.get_descriptor())) + bytecode.FormatDescriptorToPython(j.get_descriptor())).string setattr(_class.M, name, j) def _create_python_export_fields(self, _class, delete): @@ -8273,13 +8235,13 @@ def _create_python_export_fields(self, _class, delete): for i in f: if len(f[i]) == 1: j = f[i][0] - name = bytecode.FormatNameToPython(j.get_name()) + name = bytecode.FormatNameToPython(j.get_name()).string setattr(_class.F, name, j) else: for j in f[i]: name = bytecode.FormatNameToPython(j.get_name( )) + "_" + bytecode.FormatDescriptorToPython( - j.get_descriptor()) + j.get_descriptor()).string setattr(_class.F, name, j) def get_BRANCH_DVM_OPCODES(self): diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py deleted file mode 100644 index 7d974996a..000000000 --- a/androguard/core/bytecodes/mutf8.py +++ /dev/null @@ -1,103 +0,0 @@ -def decode(b): - """ - Decode bytes as MUTF-8 - See https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8 - for more information - - Surrogates will be returned as two 16 bit characters. - - :param b: bytes to decode - :rtype: unicode (py2), str (py3) of 16bit chars - :raises: UnicodeDecodeError if string is not decodable - """ - res = "" - - b = iter(bytearray(b)) - - for x in b: - if x >> 7 == 0: - # Single char: - res += chr(x & 0x7f) - elif x >> 5 == 0b110: - # 2 byte Multichar - b2 = next(b) - if b2 >> 6 != 0b10: - raise UnicodeDecodeError("Second byte of 2 byte sequence does not looks right.") - - res += chr((x & 0x1f) << 6 | b2 & 0x3f) - elif x >> 4 == 0b1110: - # 3 byte Multichar - b2 = next(b) - b3 = next(b) - if b2 >> 6 != 0b10: - raise UnicodeDecodeError("Second byte of 3 byte sequence does not looks right.") - if b3 >> 6 != 0b10: - raise UnicodeDecodeError("Third byte of 3 byte sequence does not looks right.") - - res += chr((x & 0xf) << 12 | (b2 & 0x3f) << 6 | b3 & 0x3f) - else: - raise UnicodeDecodeError("Could not decode byte") - - return res - - -class PeekIterator: - """ - A quick'n'dirty variant of an Iterator that has a special function - peek, which will return the next object but not consume it. - """ - idx = 0 - - def __init__(self, s): - self.s = s - - def __iter__(self): - return self - - def __next__(self): - if self.idx == len(self.s): - raise StopIteration() - self.idx = self.idx + 1 - return self.s[self.idx - 1] - - def next(self): - # py2 compliance - return self.__next__() - - def peek(self): - if self.idx == len(self.s): - return None - return self.s[self.idx] - - -def patch_string(s): - """ - Reorganize a String in such a way that surrogates are printable - and lonely surrogates are escaped. - - :param s: input string - :return: string with escaped lonely surrogates and 32bit surrogates - """ - res = '' - it = PeekIterator(s) - for c in it: - if (ord(c) >> 10) == 0b110110: - # High surrogate - # Check for the next - n = it.peek() - if n and (ord(n) >> 10) == 0b110111: - # Next is a low surrogate! Merge them together - res += chr(((ord(c) & 0x3ff) << 10 | (ord(n) & 0x3ff)) + 0x10000) - # Skip next char, as we already consumed it - next(it) - else: - # Lonely high surrogate - res += "\\u{:04x}".format(ord(c)) - elif (ord(c) >> 10) == 0b110111: - # Lonely low surrogate - res += "\\u{:04x}".format(ord(c)) - else: - # Looks like a normal char... - res += c - return res - diff --git a/androguard/core/mutf8.py b/androguard/core/mutf8.py new file mode 100644 index 000000000..793dd67c6 --- /dev/null +++ b/androguard/core/mutf8.py @@ -0,0 +1,271 @@ +def decode(b): + size = len(b) + ord_array = [None] * size + ord_index = 0 + + b = iter(b) + + for x in b: + if x >> 7 == 0: + # Single char: + ord_array[ord_index] = x & 0x7f + elif x >> 5 == 0b110: + # 2 byte Multichar + b2 = next(b) + if b2 >> 6 != 0b10: + raise UnicodeDecodeError( + "Second byte of 2 byte sequence does not looks right.") + + ord_array[ord_index] = (x & 0x1f) << 6 | b2 & 0x3f + elif x >> 4 == 0b1110: + # 3 byte Multichar + b2 = next(b) + b3 = next(b) + if b2 >> 6 != 0b10: + raise UnicodeDecodeError( + "Second byte of 3 byte sequence does not looks right.") + if b3 >> 6 != 0b10: + raise UnicodeDecodeError( + "Third byte of 3 byte sequence does not looks right.") + + ord_array[ord_index] = (x & 0xf) << 12 | ( + b2 & 0x3f) << 6 | b3 & 0x3f + else: + raise UnicodeDecodeError("Could not decode byte") + ord_index += 1 + + chr_array = [""]*size + chr_index = 0 + while chr_index < size: + c = ord_array[chr_index] + if c is None: + break + if (c >> 10) == 0b110110: + n = None + try: + n = ord_array[chr_index + 1] + except: + pass + if n and (n >> 10) == 0b110111: + chr_array[chr_index] = chr( + ((c & 0x3ff) << 10 | (n & 0x3ff)) + 0x10000) + chr_index += 1 + else: + chr_array[chr_index] = chr(c) + else: + chr_array[chr_index] = chr(c) + chr_index += 1 + + return "".join(chr_array) + + +def encode(s): + b = [b""]*len(s) + ord_array = [i for i in map(lambda x: ord(x), s)] + for x in ord_array: + if (x == 0) or ((x <= 0x7ff) and (x >= 0x80)): + b1 = ((x & 0x7c0) >> 6 | 0xc0).to_bytes(1, 'big') + b2 = ((x & 0x3f) | 0x80).to_bytes(1, 'big') + b.append(b1 + b2) + elif (x <= 0x7f): + b1 = x.to_bytes(1, 'big') + b.append(b1) + elif (x >= 0x800) and (x <= 0xffff): + b1 = ((x & 0xf000) >> 12 | 0xe0).to_bytes(1, 'big') + b2 = ((x & 0xfff) >> 6 | 0x80).to_bytes(1, 'big') + b3 = ((x & 0x3f) | 0x80).to_bytes(1, 'big') + b.append(b1 + b2 + b3) + else: + a = x - 0x10000 + s1 = ((a >> 10) | 0xd800) + s2 = ((a & 0x3ff) | 0xdc00) + b1 = ((s1 & 0xf000) >> 12 | 0xe0).to_bytes(1, 'big') + b2 = ((s1 & 0xfff) >> 6 | 0x80).to_bytes(1, 'big') + b3 = ((s1 & 0x3f) | 0x80).to_bytes(1, 'big') + b4 = ((s2 & 0xf000) >> 12 | 0xe0).to_bytes(1, 'big') + b5 = ((s2 & 0xfff) >> 6 | 0x80).to_bytes(1, 'big') + b6 = ((s2 & 0x3f) | 0x80).to_bytes(1, 'big') + b.append(b1 + b2 + b3 + b4 + b5 + b6) + return b"".join(b) + + +class MUTF8String(): + def __init__(self, data, raw=True): + if isinstance(data, MUTF8String): + self.__encoded = data.__encoded + self.__decoded = data.__decoded + else: + self.__encoded = None + self.__decoded = None + if raw: + self.__encoded = data + else: + self.__decoded = data + + @classmethod + def from_bytes(cls, data): + return cls(bytes(data)) + + @classmethod + def from_str(cls, data): + return cls(data, raw=False) + + @classmethod + def join(cls, data, spacing=b''): + array = [] + for i in data: + try: + array.append(i.bytes) + except AttributeError: + if isinstance(i, bytes): + array.append(i) + else: + array.append(encode(i)) + return MUTF8String.from_bytes(spacing.join(array)) + + @property + def bytes(self): + if self.__encoded is None: + self.__encoded = encode(self.__decoded) + return self.__encoded + + @property + def string(self): + if self.__decoded is None: + self.__decoded = decode(self.__encoded) + return self.__decoded + + def replace(self, old, new, count=None): + if count is None: + try: + return MUTF8String.from_bytes(self.bytes.replace(old, new)) + except TypeError: + return MUTF8String.from_bytes(self.bytes.replace(encode(old), encode(new))) + else: + try: + return MUTF8String.from_bytes(self.bytes.replace(old, new, count)) + except TypeError: + return MUTF8String.from_bytes(self.bytes.replace(encode(old), encode(new), count)) + + def find(self, sub): + try: + return self.bytes.find(sub) + except TypeError: + return self.bytes.find(encode(sub)) + + def split(self, sep=None, maxsplit=-1): + try: + return [MUTF8String.from_bytes(i) for i in self.bytes.split(sep, maxsplit)] + except TypeError: + return [MUTF8String.from_bytes(i) for i in self.bytes.split(encode(sep), maxsplit)] + + def rsplit(self, sep=None, maxsplit=-1): + try: + return [MUTF8String.from_bytes(i) for i in self.bytes.rsplit(sep, maxsplit)] + except TypeError: + return [MUTF8String.from_bytes(i) for i in self.bytes.rsplit(encode(sep), maxsplit)] + + def lstrip(self, sub): + try: + return MUTF8String.from_bytes(self.bytes.lstrip(sub)) + except TypeError: + return MUTF8String.from_bytes(self.bytes.lstrip(encode(sub))) + + def startswith(self, sub): + try: + return self.bytes.startswith(sub) + except TypeError: + return self.bytes.startswith(encode(sub)) + + def __add__(self, other): + try: + return MUTF8String.from_bytes(self.bytes + other.bytes) + except AttributeError: + return MUTF8String.from_bytes(self.bytes + encode(other)) + + def __getitem__(self, item): + if isinstance(item, int): + return MUTF8String.from_bytes(self.bytes[item].to_bytes(1, byteorder='big')) + else: + return MUTF8String.from_bytes(self.bytes[item]) + + def __repr__(self): + return "".format(self.__str__()) + + def __str__(self): + return self.string.encode('utf8', errors='backslashreplace').decode('utf8') + + def __format__(self, format_spec): + return format(self.string, format_spec) + + def __hash__(self): + return hash(self.bytes) + + def __len__(self): + return len(self.bytes) + + def __lt__(self, other): + try: + return self.bytes.__lt__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__lt__(other) + elif isinstance(other, str): + return self.bytes.__lt__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) + + def __le__(self, other): + try: + return self.bytes.__le__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__le__(other) + elif isinstance(other, str): + return self.bytes.__le__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) + + def __eq__(self, other): + try: + return self.bytes.__eq__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__eq__(other) + elif isinstance(other, str): + return self.bytes.__eq__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) + + def __ne__(self, other): + try: + return self.bytes.__ne__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__ne__(other) + elif isinstance(other, str): + return self.bytes.__ne__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) + + def __gt__(self, other): + try: + return self.bytes.__gt__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__gt__(other) + elif isinstance(other, str): + return self.bytes.__gt__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) + + def __ge__(self, other): + try: + return self.bytes.__ge__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__ge__(other) + elif isinstance(other, str): + return self.bytes.__ge__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) diff --git a/androguard/decompiler/dad/decompile.py b/androguard/decompiler/dad/decompile.py index 3f1a98ab6..a09b9319e 100644 --- a/androguard/decompiler/dad/decompile.py +++ b/androguard/decompiler/dad/decompile.py @@ -299,7 +299,7 @@ def get_source(self): if len(self.interfaces) > 0: prototype += ' implements %s' % ', '.join( - [n[1:-1].replace('/', '.') for n in self.interfaces]) + [n[1:-1].replace('/', '.').string for n in self.interfaces]) source.append('%s {\n' % prototype) for field in self.fields: diff --git a/androguard/decompiler/dad/writer.py b/androguard/decompiler/dad/writer.py index 61a91bee0..31168a677 100644 --- a/androguard/decompiler/dad/writer.py +++ b/androguard/decompiler/dad/writer.py @@ -17,6 +17,7 @@ import logging from struct import unpack +from androguard.core import mutf8 from androguard.decompiler.dad.util import get_type from androguard.decompiler.dad.opcode_ins import Op from androguard.decompiler.dad.instruction import ( @@ -48,7 +49,7 @@ def __init__(self, graph, method): self.need_break = True def __str__(self): - return ''.join(self.buffer) + return mutf8.MUTF8String.join(self.buffer).string def str_ext(self): return self.buffer2 @@ -678,6 +679,10 @@ def visit_condz_expression(self, op, arg): arg.visit(self) else: arg.visit(self) + try: + atype = atype.string + except AttributeError: + pass if atype in 'VBSCIJFD': self.write(' %s 0' % op, data="TODO64") else: diff --git a/androguard/decompiler/decompiler.py b/androguard/decompiler/decompiler.py index 76a0eeaaa..7084a785c 100644 --- a/androguard/decompiler/decompiler.py +++ b/androguard/decompiler/decompiler.py @@ -696,7 +696,7 @@ def __init__(self, vm, vmx, jadx="jadx", keepfiles=False): # Next, try to find files for the classes we have for cl in andr_class_names: - fname = self._find_class(cl, tmpfolder) + fname = self._find_class(cl.string, tmpfolder) if fname: if "L{};".format(cl) not in self.classes: with open(fname, "rb") as fp: diff --git a/tests/test_decompiler.py b/tests/test_decompiler.py index badfce314..00e52b0d1 100644 --- a/tests/test_decompiler.py +++ b/tests/test_decompiler.py @@ -47,7 +47,7 @@ def test_all_decompiler(): # Generate test cases for this APK: a, d, dx = AnalyzeAPK("examples/tests/hello-world.apk") for c in d[0].get_classes(): - test_name = re.sub("[^a-zA-Z0-9_]", "_", c.get_name()[1:-1]) + test_name = re.sub("[^a-zA-Z0-9_]", "_", c.get_name().string[1:-1]) # Test the decompilation of a single class # disable for now, as testing all DvMethods has the same effect as # testing all DvClasses. diff --git a/tests/test_dexcodeparsing.py b/tests/test_dexcodeparsing.py index 335658f0d..c5b553f2f 100644 --- a/tests/test_dexcodeparsing.py +++ b/tests/test_dexcodeparsing.py @@ -45,7 +45,7 @@ def testcode(self): def testClassManager(self): """Test if the classmanager has the same items""" - from androguard.core.bytecodes.mutf8 import decode, patch_string + from androguard.core.mutf8 import decode fname = "examples/android/TestsAndroguard/bin/classes.dex" @@ -64,7 +64,7 @@ def testClassManager(self): for idx in range(parsed.string_ids_size): self.assertNotEqual(cm.get_string(idx), ERR_STR) self.assertNotEqual(cm.get_raw_string(idx), ERR_STR) - self.assertEqual(cm.get_raw_string(idx), patch_string(decode(parsed.str_raw[idx]))) + self.assertEqual(cm.get_raw_string(idx), decode(parsed.str_raw[idx])) self.assertEqual(cm.get_string(parsed.string_ids_size), ERR_STR) self.assertEqual(cm.get_raw_string(parsed.string_ids_size), ERR_STR) diff --git a/tests/test_strings.py b/tests/test_strings.py index df136b8fc..d9ef06de8 100644 --- a/tests/test_strings.py +++ b/tests/test_strings.py @@ -3,7 +3,8 @@ import sys -from androguard.core.bytecodes import dvm, mutf8 +from androguard.core import mutf8 +from androguard.core.bytecodes import dvm from androguard.core.analysis import analysis @@ -27,25 +28,20 @@ def testDex(self): self.assertIn(s, d.get_strings()) def testMUTF8(self): - self.assertEqual("\x67", mutf8.decode(b"\x67")) - # Null byte - self.assertEqual("\x00", mutf8.decode(b"\xc0\x80")) - self.assertEqual("\uacf0", mutf8.decode(b"\xea\xb3\xb0")) - # Surrogates - self.assertEqual("\ud83d\ude4f", mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f")) - self.assertEqual("\ud853\udf5c", mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c")) - # Lonely surrogates - self.assertEqual("\ud853", mutf8.decode(b"\xed\xa1\x93")) - self.assertEqual("\udf5c", mutf8.decode(b"\xed\xbd\x9c")) - # Normal ASCII String - self.assertEqual("hello world", mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64")) - - # Test the patching of strings - - self.assertEqual("hello world", mutf8.patch_string(mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64"))) - self.assertEqual("\U00024f5c", mutf8.patch_string(mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c"))) - self.assertEqual("\U0001f64f", mutf8.patch_string(mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f"))) - self.assertEqual("\\ud853", mutf8.patch_string(mutf8.decode(b"\xed\xa1\x93"))) + # self.assertEqual("\x67", mutf8.decode(b"\x67")) + # # Null byte + # self.assertEqual("\x00", mutf8.decode(b"\xc0\x80")) + # self.assertEqual("\uacf0", mutf8.decode(b"\xea\xb3\xb0")) + # # Surrogates + # self.assertEqual("\ud83d\ude4f", mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f")) + # self.assertEqual("\ud853\udf5c", mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c")) + # # Lonely surrogates + # self.assertEqual("\ud853", mutf8.decode(b"\xed\xa1\x93")) + # self.assertEqual("\udf5c", mutf8.decode(b"\xed\xbd\x9c")) + # # Normal ASCII String + # self.assertEqual("hello world", mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64")) + + # Testing decode b = b"\xed\xa1\x93\xed\xbd\x9c" + \ b"\xed\xa0\xbd\xed\xb9\x8f" + \ @@ -54,8 +50,21 @@ def testMUTF8(self): b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64" + \ b"\xc0\x80" - self.assertEqual("\U00024f5c\U0001f64f\\ud83d\uacf0hello world\x00", mutf8.patch_string(mutf8.decode(b))) + self.assertEqual("hello world", mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64").encode('utf8', errors='backslashreplace').decode('utf8')) + self.assertEqual("\U00024f5c", mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c").encode('utf8', errors='backslashreplace').decode('utf8')) + self.assertEqual("\U0001f64f", mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f").encode('utf8', errors='backslashreplace').decode('utf8')) + self.assertEqual("\\ud853", mutf8.decode(b"\xed\xa1\x93").encode('utf8', errors='backslashreplace').decode('utf8')) + self.assertEqual("\U00024f5c\U0001f64f\\ud83d\uacf0hello world\x00", mutf8.decode(b).encode('utf8', errors='backslashreplace').decode('utf8')) + + # Testing encode + + self.assertEqual(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64", mutf8.encode("hello world")) + self.assertEqual(b"\xed\xa1\x93\xed\xbd\x9c", mutf8.encode("\U00024f5c")) + self.assertEqual(b"\xed\xa0\xbd\xed\xb9\x8f", mutf8.encode("\U0001f64f")) + self.assertEqual(b"\xed\xa1\x93", mutf8.encode("\ud853")) + self.assertEqual(b, mutf8.encode("\U00024f5c\U0001f64f\ud83d\uacf0hello world\x00")) + self.assertEqual(mutf8.MUTF8String.from_bytes(b), mutf8.MUTF8String.from_str("\U00024f5c\U0001f64f\ud83d\uacf0hello world\x00")) if __name__ == '__main__':