From 2fcc17ef84b26f3bf4548ce88173ede0ab3e38dc Mon Sep 17 00:00:00 2001 From: Neakxs Date: Thu, 25 Apr 2019 11:42:39 -0700 Subject: [PATCH 01/15] Adding function for MUTF-8 decoding and patching This function combines the functions `decode` and `patch_string` --- androguard/core/bytecodes/mutf8.py | 63 ++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py index 7d974996a..3dca54e79 100644 --- a/androguard/core/bytecodes/mutf8.py +++ b/androguard/core/bytecodes/mutf8.py @@ -41,6 +41,69 @@ def decode(b): return res +def decode_and_patch(b, size): + """ + Decode bytes as MUTF-8 + See https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8 + for more information + + Surrogates will be returned as two 16 bit characters. + + :param b: bytes to decode + :rtype: unicode (py2), str (py3) of 16bit chars + :raises: UnicodeDecodeError if string is not decodable + """ + chr_array = [""] * size + + b = iter(b) + + high_surrogate = None + chr_index = 0 + for x in b: + n = 0 + if x >> 7 == 0: + # Single char: + n = x & 0x7f + elif x >> 5 == 0b110: + # 2 byte Multichar + b2 = next(b) + if b2 >> 6 != 0b10: + raise UnicodeDecodeError("Second byte of 2 byte sequence does not looks right.") + + n = (x & 0x1f) << 6 | b2 & 0x3f + elif x >> 4 == 0b1110: + # 3 byte Multichar + b2 = next(b) + b3 = next(b) + if b2 >> 6 != 0b10: + raise UnicodeDecodeError("Second byte of 3 byte sequence does not looks right.") + if b3 >> 6 != 0b10: + raise UnicodeDecodeError("Third byte of 3 byte sequence does not looks right.") + + n = (x & 0xf) << 12 | (b2 & 0x3f) << 6 | b3 & 0x3f + else: + raise UnicodeDecodeError("Could not decode byte") + if high_surrogate is not None: + c = high_surrogate + if n and (n >> 10) == 0b110111: + chr_array[chr_index] = chr(((c & 0x3ff) << 10 | (n & 0x3ff)) + 0x10000) + chr_index += 1 + else: + chr_array[chr_index] = "\\u{:04x}".format(c) + high_surrogate = None + 
else: + c = n + if (c >> 10) == 0b110110: + high_surrogate = c + elif (c >> 10) == 0b110111: + chr_array[chr_index] = "\\u{:04x}".format(c) + else: + chr_array[chr_index] = chr(c) + chr_index += 1 + + return "".join(chr_array) + + class PeekIterator: """ A quick'n'dirty variant of an Iterator that has a special function From 51f80833c3896bf71e036f846087fdbcf1acbcb3 Mon Sep 17 00:00:00 2001 From: Neakxs Date: Thu, 25 Apr 2019 12:04:22 -0700 Subject: [PATCH 02/15] Replacing `patch_string` calls with `decode_and_patch` Moreover, adding size checking inside the `decode_and_patch` function --- androguard/core/bytecodes/dvm.py | 5 +---- androguard/core/bytecodes/mutf8.py | 5 +++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py index 0c0859cfe..66152b184 100644 --- a/androguard/core/bytecodes/dvm.py +++ b/androguard/core/bytecodes/dvm.py @@ -1908,11 +1908,8 @@ def get(self): string as 6 characters: \\ud853 Valid surrogates are encoded as 32bit values, ie. \U00024f5c. 
""" - s = mutf8.decode(self.data) - if len(s) != self.utf16_size: - raise ValueError("UTF16 Length does not match!") # log.debug("Decoding UTF16 string with IDX {}, utf16 length {} and hexdata '{}'.".format(self.offset, self.utf16_size, binascii.hexlify(self.data))) - return mutf8.patch_string(s) + return mutf8.decode_and_patch(self.data, self.utf16_size) def show(self): bytecode._PrintSubBanner("String Data Item") diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py index 3dca54e79..f351b6c3b 100644 --- a/androguard/core/bytecodes/mutf8.py +++ b/androguard/core/bytecodes/mutf8.py @@ -56,6 +56,7 @@ def decode_and_patch(b, size): chr_array = [""] * size b = iter(b) + decoded_size = 0 high_surrogate = None chr_index = 0 @@ -100,6 +101,10 @@ def decode_and_patch(b, size): else: chr_array[chr_index] = chr(c) chr_index += 1 + decoded_size += 1 + + if decoded_size != size: + raise ValueError("UTF16 Length does not match!") return "".join(chr_array) From 26557ef66e16225816bdb1802be42a52a59bc24a Mon Sep 17 00:00:00 2001 From: Neakxs Date: Thu, 25 Apr 2019 14:12:41 -0700 Subject: [PATCH 03/15] Adding mutf8 tests and fixing bugs --- androguard/core/bytecodes/mutf8.py | 58 +++++++++++++----------------- tests/test_strings.py | 6 ++++ 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py index f351b6c3b..97c5ddf42 100644 --- a/androguard/core/bytecodes/mutf8.py +++ b/androguard/core/bytecodes/mutf8.py @@ -42,36 +42,22 @@ def decode(b): def decode_and_patch(b, size): - """ - Decode bytes as MUTF-8 - See https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8 - for more information - - Surrogates will be returned as two 16 bit characters. 
- - :param b: bytes to decode - :rtype: unicode (py2), str (py3) of 16bit chars - :raises: UnicodeDecodeError if string is not decodable - """ - chr_array = [""] * size + ord_array = [None] * size + ord_index = 0 b = iter(b) - decoded_size = 0 - high_surrogate = None - chr_index = 0 for x in b: - n = 0 if x >> 7 == 0: # Single char: - n = x & 0x7f + ord_array[ord_index] = x & 0x7f elif x >> 5 == 0b110: # 2 byte Multichar b2 = next(b) if b2 >> 6 != 0b10: raise UnicodeDecodeError("Second byte of 2 byte sequence does not looks right.") - n = (x & 0x1f) << 6 | b2 & 0x3f + ord_array[ord_index] = (x & 0x1f) << 6 | b2 & 0x3f elif x >> 4 == 0b1110: # 3 byte Multichar b2 = next(b) @@ -81,30 +67,34 @@ def decode_and_patch(b, size): if b3 >> 6 != 0b10: raise UnicodeDecodeError("Third byte of 3 byte sequence does not looks right.") - n = (x & 0xf) << 12 | (b2 & 0x3f) << 6 | b3 & 0x3f + ord_array[ord_index] = (x & 0xf) << 12 | (b2 & 0x3f) << 6 | b3 & 0x3f else: raise UnicodeDecodeError("Could not decode byte") - if high_surrogate is not None: - c = high_surrogate + ord_index += 1 + + if ord_index != size: + raise ValueError("UTF16 Length does not match! 
{}".format(ord_index)) + + chr_array = [""]*size + chr_index = 0 + while chr_index < size: + c = ord_array[chr_index] + if (c >> 10) == 0b110110: + n = None + try: + n = ord_array[chr_index + 1] + except: + pass if n and (n >> 10) == 0b110111: chr_array[chr_index] = chr(((c & 0x3ff) << 10 | (n & 0x3ff)) + 0x10000) chr_index += 1 else: chr_array[chr_index] = "\\u{:04x}".format(c) - high_surrogate = None + elif (c >> 10) == 0b110111: + chr_array[chr_index] = "\\u{:04x}".format(c) else: - c = n - if (c >> 10) == 0b110110: - high_surrogate = c - elif (c >> 10) == 0b110111: - chr_array[chr_index] = "\\u{:04x}".format(c) - else: - chr_array[chr_index] = chr(c) - chr_index += 1 - decoded_size += 1 - - if decoded_size != size: - raise ValueError("UTF16 Length does not match!") + chr_array[chr_index] = chr(c) + chr_index += 1 return "".join(chr_array) diff --git a/tests/test_strings.py b/tests/test_strings.py index df136b8fc..48877d969 100644 --- a/tests/test_strings.py +++ b/tests/test_strings.py @@ -56,6 +56,12 @@ def testMUTF8(self): self.assertEqual("\U00024f5c\U0001f64f\\ud83d\uacf0hello world\x00", mutf8.patch_string(mutf8.decode(b))) + self.assertEqual("hello world", mutf8.decode_and_patch(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64", 11)) + self.assertEqual("\U00024f5c", mutf8.decode_and_patch(b"\xed\xa1\x93\xed\xbd\x9c",2)) + self.assertEqual("\U0001f64f", mutf8.decode_and_patch(b"\xed\xa0\xbd\xed\xb9\x8f",2)) + self.assertEqual("\\ud853", mutf8.decode_and_patch(b"\xed\xa1\x93", 1)) + self.assertEqual("\U00024f5c\U0001f64f\\ud83d\uacf0hello world\x00", mutf8.decode_and_patch(b, 18)) + if __name__ == '__main__': From e0f9625602fbf0d7522e3422f15b5cfd49024ac0 Mon Sep 17 00:00:00 2001 From: Neakxs Date: Sat, 27 Apr 2019 08:30:01 -0700 Subject: [PATCH 04/15] Improving `read_null_terminated_string` performance --- androguard/core/bytecodes/dvm.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/androguard/core/bytecodes/dvm.py 
b/androguard/core/bytecodes/dvm.py index 66152b184..f439ce921 100644 --- a/androguard/core/bytecodes/dvm.py +++ b/androguard/core/bytecodes/dvm.py @@ -101,13 +101,18 @@ def read_null_terminated_string(f): :param f: file-like object :rtype: bytearray """ - x = bytearray() + x = [] while True: - z = f.read(1) - if ord(z) == 0: - return x + z = f.read(128) + if 0 in z: + s = z.split(b'\x00',1) + x.append(s[0]) + idx = f.get_idx() + f.set_idx(idx - len(s[1])) + break else: - x.append(ord(z)) + x.append(z) + return b''.join(x) def get_access_flags_string(value): From 80a26ca2d547b185b6fb0f433b0cede8ce5eb2b6 Mon Sep 17 00:00:00 2001 From: Neakxs Date: Sat, 27 Apr 2019 09:10:31 -0700 Subject: [PATCH 05/15] Removing duplicate object creation --- androguard/core/bytecodes/dvm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py index f439ce921..5ae3f2af8 100644 --- a/androguard/core/bytecodes/dvm.py +++ b/androguard/core/bytecodes/dvm.py @@ -199,11 +199,11 @@ def static_operand_instruction(instruction): def get_sbyte(buff): - return unpack('=b', bytearray(buff.read(1)))[0] + return unpack('=b', buff.read(1))[0] def get_byte(buff): - return unpack('=B', bytearray(buff.read(1)))[0] + return unpack('=B', buff.read(1))[0] def readuleb128(buff): From 5d9e496695860a86ac973bbf3c93e83b50e499c4 Mon Sep 17 00:00:00 2001 From: Neakxs Date: Sun, 28 Apr 2019 04:56:26 -0700 Subject: [PATCH 06/15] Removing SV and SVs classes --- androguard/core/bytecode.py | 60 ------------------------------------- 1 file changed, 60 deletions(-) diff --git a/androguard/core/bytecode.py b/androguard/core/bytecode.py index 91a943ff6..7691b0f25 100644 --- a/androguard/core/bytecode.py +++ b/androguard/core/bytecode.py @@ -593,60 +593,6 @@ def method2json_direct(mx): return json.dumps(d) -class SV: - - def __init__(self, size, buff): - self.__size = size - self.__value = unpack(self.__size, buff)[0] - - def _get(self): - 
return pack(self.__size, self.__value) - - def __str__(self): - return "0x%x" % self.__value - - def __int__(self): - return self.__value - - def get_value_buff(self): - return self._get() - - def get_value(self): - return self.__value - - def set_value(self, attr): - self.__value = attr - - -class SVs: - - def __init__(self, size, ntuple, buff): - self.__size = size - - self.__value = ntuple._make(unpack(self.__size, buff)) - - def _get(self): - l = [] - for i in self.__value._fields: - l.append(getattr(self.__value, i)) - return pack(self.__size, *l) - - def _export(self): - return [x for x in self.__value._fields] - - def get_value_buff(self): - return self._get() - - def get_value(self): - return self.__value - - def set_value(self, attr): - self.__value = self.__value._replace(**attr) - - def __str__(self): - return self.__value.__str__() - - def object_to_bytes(obj): """ Convert a object to a bytearray or call get_raw() of the object @@ -787,9 +733,6 @@ def readat(self, off): :param int off: starting offset :rtype: bytearray """ - if isinstance(off, SV): - off = off.value - return self.__buff[off:] def read(self, size): @@ -800,9 +743,6 @@ def read(self, size): :param int size: length of bytes to read :rtype: bytearray """ - if isinstance(size, SV): - size = size.value - buff = self.__buff[self.__idx:self.__idx + size] self.__idx += size From b0b91abb0c2f78e6367edb8e5da22c728b64d396 Mon Sep 17 00:00:00 2001 From: Neakxs Date: Wed, 1 May 2019 12:54:24 -0700 Subject: [PATCH 07/15] Adding MUTF8String class for dexparser Include `decode` and `encode` functions Warning : `decode` doesn't return a printable string --- androguard/core/bytecodes/mutf8.py | 256 ++++++++++++++++++++--------- 1 file changed, 175 insertions(+), 81 deletions(-) diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py index 7d974996a..c79a2987e 100644 --- a/androguard/core/bytecodes/mutf8.py +++ b/androguard/core/bytecodes/mutf8.py @@ -1,103 +1,197 @@ def 
decode(b): - """ - Decode bytes as MUTF-8 - See https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8 - for more information + size = len(b) + ord_array = [None] * size + ord_index = 0 - Surrogates will be returned as two 16 bit characters. - - :param b: bytes to decode - :rtype: unicode (py2), str (py3) of 16bit chars - :raises: UnicodeDecodeError if string is not decodable - """ - res = "" - - b = iter(bytearray(b)) + b = iter(b) for x in b: if x >> 7 == 0: # Single char: - res += chr(x & 0x7f) + ord_array[ord_index] = x & 0x7f elif x >> 5 == 0b110: # 2 byte Multichar b2 = next(b) if b2 >> 6 != 0b10: - raise UnicodeDecodeError("Second byte of 2 byte sequence does not looks right.") + raise UnicodeDecodeError( + "Second byte of 2 byte sequence does not looks right.") - res += chr((x & 0x1f) << 6 | b2 & 0x3f) + ord_array[ord_index] = (x & 0x1f) << 6 | b2 & 0x3f elif x >> 4 == 0b1110: # 3 byte Multichar b2 = next(b) b3 = next(b) if b2 >> 6 != 0b10: - raise UnicodeDecodeError("Second byte of 3 byte sequence does not looks right.") + raise UnicodeDecodeError( + "Second byte of 3 byte sequence does not looks right.") if b3 >> 6 != 0b10: - raise UnicodeDecodeError("Third byte of 3 byte sequence does not looks right.") + raise UnicodeDecodeError( + "Third byte of 3 byte sequence does not looks right.") - res += chr((x & 0xf) << 12 | (b2 & 0x3f) << 6 | b3 & 0x3f) + ord_array[ord_index] = (x & 0xf) << 12 | ( + b2 & 0x3f) << 6 | b3 & 0x3f else: raise UnicodeDecodeError("Could not decode byte") - - return res - - -class PeekIterator: - """ - A quick'n'dirty variant of an Iterator that has a special function - peek, which will return the next object but not consume it. 
- """ - idx = 0 - - def __init__(self, s): - self.s = s - - def __iter__(self): - return self - - def __next__(self): - if self.idx == len(self.s): - raise StopIteration() - self.idx = self.idx + 1 - return self.s[self.idx - 1] - - def next(self): - # py2 compliance - return self.__next__() - - def peek(self): - if self.idx == len(self.s): - return None - return self.s[self.idx] - - -def patch_string(s): - """ - Reorganize a String in such a way that surrogates are printable - and lonely surrogates are escaped. - - :param s: input string - :return: string with escaped lonely surrogates and 32bit surrogates - """ - res = '' - it = PeekIterator(s) - for c in it: - if (ord(c) >> 10) == 0b110110: - # High surrogate - # Check for the next - n = it.peek() - if n and (ord(n) >> 10) == 0b110111: - # Next is a low surrogate! Merge them together - res += chr(((ord(c) & 0x3ff) << 10 | (ord(n) & 0x3ff)) + 0x10000) - # Skip next char, as we already consumed it - next(it) + ord_index += 1 + + chr_array = [""]*size + chr_index = 0 + while chr_index < size: + c = ord_array[chr_index] + if c is None: + break + if (c >> 10) == 0b110110: + n = None + try: + n = ord_array[chr_index + 1] + except: + pass + if n and (n >> 10) == 0b110111: + chr_array[chr_index] = chr( + ((c & 0x3ff) << 10 | (n & 0x3ff)) + 0x10000) + chr_index += 1 else: - # Lonely high surrogate - res += "\\u{:04x}".format(ord(c)) - elif (ord(c) >> 10) == 0b110111: - # Lonely low surrogate - res += "\\u{:04x}".format(ord(c)) + chr_array[chr_index] = chr(c) else: - # Looks like a normal char... 
- res += c - return res - + chr_array[chr_index] = chr(c) + chr_index += 1 + + return "".join(chr_array) + + +def encode(s): + b = [b""]*len(s) + ord_array = [i for i in map(lambda x: ord(x), s)] + for x in ord_array: + if (x == 0) or ((x <= 0x7ff) and (x >= 0x80)): + b1 = ((x & 0x7c0) >> 6 | 0xc0).to_bytes(1, 'big') + b2 = ((x & 0x3f) | 0x80).to_bytes(1, 'big') + b.append(b1 + b2) + elif (x <= 0x7f): + b1 = x.to_bytes(1, 'big') + b.append(b1) + elif (x >= 0x800) and (x <= 0xffff): + b1 = ((x & 0xf000) >> 12 | 0xe0).to_bytes(1, 'big') + b2 = ((x & 0xfff) >> 6 | 0x80).to_bytes(1, 'big') + b3 = ((x & 0x3f) | 0x80).to_bytes(1, 'big') + b.append(b1 + b2 + b3) + else: + a = x - 0x10000 + s1 = ((a >> 10) | 0xd800) + s2 = ((a & 0x3ff) | 0xdc00) + b1 = ((s1 & 0xf000) >> 12 | 0xe0).to_bytes(1, 'big') + b2 = ((s1 & 0xfff) >> 6 | 0x80).to_bytes(1, 'big') + b3 = ((s1 & 0x3f) | 0x80).to_bytes(1, 'big') + b4 = ((s2 & 0xf000) >> 12 | 0xe0).to_bytes(1, 'big') + b5 = ((s2 & 0xfff) >> 6 | 0x80).to_bytes(1, 'big') + b6 = ((s2 & 0x3f) | 0x80).to_bytes(1, 'big') + b.append(b1 + b2 + b3 + b4 + b5 + b6) + return b"".join(b) + + +class MUTF8String(): + def __init__(self, data, raw=True): + self.__encoded = None + self.__decoded = None + if raw: + self.__encoded = data + else: + self.__decoded = data + + @classmethod + def from_bytes(cls, data): + return cls(data) + + @classmethod + def from_str(cls, data): + return cls(data, raw=False) + + @property + def bytes(self): + if self.__encoded is None: + self.__encoded = encode(self.__decoded) + return self.__encoded + + @property + def string(self): + if self.__decoded is None: + self.__decoded = decode(self.__encoded) + return self.__decoded + + def __repr__(self): + return "".format(self.__str__()) + + def __str__(self): + return self.string.encode('utf8', errors='backslashreplace').decode('utf8') + + def __hash__(self): + return self.bytes.__hash__() + + def __len__(self): + return self.bytes.__len__() + + def __lt__(self, other): + try: + 
return self.bytes.__lt__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__lt__(other) + elif isinstance(other, str): + return self.bytes.__lt__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) + + def __le__(self, other): + try: + return self.bytes.__le__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__le__(other) + elif isinstance(other, str): + return self.bytes.__le__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) + + def __eq__(self, other): + try: + return self.bytes.__eq__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__eq__(other) + elif isinstance(other, str): + return self.bytes.__eq__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) + + def __ne__(self, other): + try: + return self.bytes.__ne__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__ne__(other) + elif isinstance(other, str): + return self.bytes.__ne__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) + + def __gt__(self, other): + try: + return self.bytes.__gt__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__gt__(other) + elif isinstance(other, str): + return self.bytes.__gt__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) + + def __ge__(self, other): + try: + return self.bytes.__ge__(other.bytes) + except AttributeError: + if isinstance(other, bytes): + return self.bytes.__ge__(other) + elif isinstance(other, str): + return self.bytes.__ge__(MUTF8String.from_str(other).bytes) + else: + raise TypeError('{} is not supported'.format(type(other))) From 1d4531be97141b6a0f8dcadea555b3bf2e9d6346 Mon Sep 17 00:00:00 
2001 From: Neakxs Date: Wed, 1 May 2019 12:56:24 -0700 Subject: [PATCH 08/15] Changing tests for new mutf8 package --- tests/test_strings.py | 48 +++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/tests/test_strings.py b/tests/test_strings.py index df136b8fc..4a2224979 100644 --- a/tests/test_strings.py +++ b/tests/test_strings.py @@ -27,25 +27,20 @@ def testDex(self): self.assertIn(s, d.get_strings()) def testMUTF8(self): - self.assertEqual("\x67", mutf8.decode(b"\x67")) - # Null byte - self.assertEqual("\x00", mutf8.decode(b"\xc0\x80")) - self.assertEqual("\uacf0", mutf8.decode(b"\xea\xb3\xb0")) - # Surrogates - self.assertEqual("\ud83d\ude4f", mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f")) - self.assertEqual("\ud853\udf5c", mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c")) - # Lonely surrogates - self.assertEqual("\ud853", mutf8.decode(b"\xed\xa1\x93")) - self.assertEqual("\udf5c", mutf8.decode(b"\xed\xbd\x9c")) - # Normal ASCII String - self.assertEqual("hello world", mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64")) - - # Test the patching of strings - - self.assertEqual("hello world", mutf8.patch_string(mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64"))) - self.assertEqual("\U00024f5c", mutf8.patch_string(mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c"))) - self.assertEqual("\U0001f64f", mutf8.patch_string(mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f"))) - self.assertEqual("\\ud853", mutf8.patch_string(mutf8.decode(b"\xed\xa1\x93"))) + # self.assertEqual("\x67", mutf8.decode(b"\x67")) + # # Null byte + # self.assertEqual("\x00", mutf8.decode(b"\xc0\x80")) + # self.assertEqual("\uacf0", mutf8.decode(b"\xea\xb3\xb0")) + # # Surrogates + # self.assertEqual("\ud83d\ude4f", mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f")) + # self.assertEqual("\ud853\udf5c", mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c")) + # # Lonely surrogates + # self.assertEqual("\ud853", mutf8.decode(b"\xed\xa1\x93")) + # 
self.assertEqual("\udf5c", mutf8.decode(b"\xed\xbd\x9c")) + # # Normal ASCII String + # self.assertEqual("hello world", mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64")) + + # Testing decode b = b"\xed\xa1\x93\xed\xbd\x9c" + \ b"\xed\xa0\xbd\xed\xb9\x8f" + \ @@ -54,8 +49,21 @@ def testMUTF8(self): b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64" + \ b"\xc0\x80" - self.assertEqual("\U00024f5c\U0001f64f\\ud83d\uacf0hello world\x00", mutf8.patch_string(mutf8.decode(b))) + self.assertEqual("hello world", mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64").encode('utf8', errors='backslashreplace').decode('utf8')) + self.assertEqual("\U00024f5c", mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c").encode('utf8', errors='backslashreplace').decode('utf8')) + self.assertEqual("\U0001f64f", mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f").encode('utf8', errors='backslashreplace').decode('utf8')) + self.assertEqual("\\ud853", mutf8.decode(b"\xed\xa1\x93").encode('utf8', errors='backslashreplace').decode('utf8')) + self.assertEqual("\U00024f5c\U0001f64f\\ud83d\uacf0hello world\x00", mutf8.decode(b).encode('utf8', errors='backslashreplace').decode('utf8')) + + # Testing encode + + self.assertEqual(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64", mutf8.encode("hello world")) + self.assertEqual(b"\xed\xa1\x93\xed\xbd\x9c", mutf8.encode("\U00024f5c")) + self.assertEqual(b"\xed\xa0\xbd\xed\xb9\x8f", mutf8.encode("\U0001f64f")) + self.assertEqual(b"\xed\xa1\x93", mutf8.encode("\ud853")) + self.assertEqual(b, mutf8.encode("\U00024f5c\U0001f64f\ud83d\uacf0hello world\x00")) + self.assertEqual(mutf8.MUTF8String.from_bytes(b), mutf8.MUTF8String.from_str("\U00024f5c\U0001f64f\ud83d\uacf0hello world\x00")) if __name__ == '__main__': From 6a6f2a97454bc2f8ee749d148510244d2bbc43bb Mon Sep 17 00:00:00 2001 From: Neakxs Date: Wed, 1 May 2019 12:57:43 -0700 Subject: [PATCH 09/15] Applying MUTF8String to dexparser --- androguard/core/analysis/analysis.py | 36 +++++++++------ 
androguard/core/bytecode.py | 1 + androguard/core/bytecodes/dvm.py | 65 +++++++++++++------------- androguard/core/bytecodes/mutf8.py | 69 ++++++++++++++++++++++++---- androguard/decompiler/dad/writer.py | 3 +- tests/test_dexcodeparsing.py | 4 +- 6 files changed, 121 insertions(+), 57 deletions(-) diff --git a/androguard/core/analysis/analysis.py b/androguard/core/analysis/analysis.py index 9988a4570..52193ef5e 100644 --- a/androguard/core/analysis/analysis.py +++ b/androguard/core/analysis/analysis.py @@ -3,7 +3,7 @@ import time import warnings from androguard.core.androconf import is_ascii_problem, load_api_specific_resource_module -from androguard.core.bytecodes import dvm +from androguard.core.bytecodes import dvm, mutf8 import logging from androguard.core import bytecode import networkx as nx @@ -789,7 +789,7 @@ def get_method(self, name, descriptor): :param descriptor: method descriptor, for example `'(I)V'` :return: :class:`ExternalMethod` """ - key = name + str(descriptor) + key = name + mutf8.MUTF8String.join(descriptor) if key not in self.methods: self.methods[key] = ExternalMethod(self.name, name, descriptor) @@ -818,7 +818,7 @@ def get_class_name(self): return self.class_name def get_descriptor(self): - return ''.join(self.descriptor) + return mutf8.MUTF8String.join(self.descriptor) @property def full_name(self): @@ -837,7 +837,7 @@ def get_access_flags_string(self): return "" def __str__(self): - return "{}->{}{}".format(self.class_name, self.name, ''.join(self.descriptor)) + return "{}->{}{}".format(self.class_name.__str__(), self.name.__str__(), mutf8.MUTF8String.join(self.descriptor).string) def __repr__(self): return "".format(self.__str__()) @@ -986,7 +986,7 @@ def get_fake_method(self, name, descriptor): # We are searching an unknown method in this class # It could be something that the class herits - key = name + str(descriptor) + key = name + mutf8.MUTF8String.join(descriptor) if key not in self._inherits_methods: self._inherits_methods[key] = 
ExternalMethod(self.orig_class.get_name(), name, descriptor) return self._inherits_methods[key] @@ -1269,7 +1269,7 @@ def _create_xref(self, current_class): method_item = None # TODO: should create get_method_descriptor inside Analysis for vm in self.vms: - method_item = vm.get_method_descriptor(method_info[0], method_info[1], ''.join(method_info[2])) + method_item = vm.get_method_descriptor(method_info[0], method_info[1], mutf8.MUTF8String.join(method_info[2])) if method_item: break @@ -1485,10 +1485,11 @@ def find_classes(self, name=".*", no_external=False): :param no_external: Remove external classes from the output (default False) :rtype: generator of `ClassAnalysis` """ + name = mutf8.MUTF8String.from_str(name).bytes for cname, c in self.classes.items(): if no_external and isinstance(c.get_vm_class(), ExternalClass): continue - if re.match(name, cname): + if re.match(name, cname.bytes): yield c def find_methods(self, classname=".*", methodname=".*", descriptor=".*", @@ -1505,8 +1506,11 @@ def find_methods(self, classname=".*", methodname=".*", descriptor=".*", :param no_external: Remove external method from the output (default False) :rtype: generator of `MethodClassAnalysis` """ + classname = mutf8.MUTF8String.from_str(classname).bytes + methodname = mutf8.MUTF8String.from_str(methodname).bytes + descriptor = mutf8.MUTF8String.from_str(descriptor).bytes for cname, c in self.classes.items(): - if re.match(classname, cname): + if re.match(classname, cname.bytes): for m in c.get_methods(): z = m.get_method() # TODO is it even possible that an internal class has @@ -1514,8 +1518,8 @@ def find_methods(self, classname=".*", methodname=".*", descriptor=".*", # instead... 
if no_external and isinstance(z, ExternalMethod): continue - if re.match(methodname, z.get_name()) and \ - re.match(descriptor, z.get_descriptor()) and \ + if re.match(methodname, z.get_name().bytes) and \ + re.match(descriptor, z.get_descriptor().bytes) and \ re.match(accessflags, z.get_access_flags_string()): yield m @@ -1526,8 +1530,9 @@ def find_strings(self, string=".*"): :param string: regular expression for the string to search for :rtype: generator of `StringAnalysis` """ + string = mutf8.MUTF8String.from_str(string).bytes for s, sa in self.strings.items(): - if re.match(string, s): + if re.match(string, s.bytes): yield sa def find_fields(self, classname=".*", fieldname=".*", fieldtype=".*", accessflags=".*"): @@ -1540,12 +1545,15 @@ def find_fields(self, classname=".*", fieldname=".*", fieldtype=".*", accessflag :param accessflags: regular expression of the access flags :rtype: generator of `FieldClassAnalysis` """ + classname = mutf8.MUTF8String.from_str(classname).bytes + fieldname = mutf8.MUTF8String.from_str(fieldname).bytes + fieldtype = mutf8.MUTF8String.from_str(fieldtype).bytes for cname, c in self.classes.items(): - if re.match(classname, cname): + if re.match(classname, cname.bytes): for f in c.get_fields(): z = f.get_field() - if re.match(fieldname, z.get_name()) and \ - re.match(fieldtype, z.get_descriptor()) and \ + if re.match(fieldname, z.get_name().bytes) and \ + re.match(fieldtype, z.get_descriptor().bytes) and \ re.match(accessflags, z.get_access_flags_string()): yield f diff --git a/androguard/core/bytecode.py b/androguard/core/bytecode.py index 91a943ff6..d1ffba44b 100644 --- a/androguard/core/bytecode.py +++ b/androguard/core/bytecode.py @@ -885,6 +885,7 @@ def FormatClassToPython(i): :rtype: str """ i = i[:-1] + print(i) i = i.replace("/", "_") i = i.replace("$", "_") diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py index 0c0859cfe..a6c68d389 100644 --- a/androguard/core/bytecodes/dvm.py +++ 
b/androguard/core/bytecodes/dvm.py @@ -446,11 +446,9 @@ def determineException(vm, m): class HeaderItem: """ This class can parse an header_item of a dex file. - Several checks are performed to detect if this is not an header_item. Also the Adler32 checksum of the file is calculated in order to detect file corruption. - :param buff: a string which represents a Buff object of the header_item :type androguard.core.bytecode.BuffHandle buff: Buff object :param cm: a ClassManager object @@ -1887,19 +1885,19 @@ def set_off(self, off): def get_off(self): return self.offset - def get_unicode(self): - """ - Returns an Unicode String - This is the actual string. Beware that some strings might be not - decodeable with usual UTF-16 decoder, as they use surrogates that are - not supported by python. - """ - s = mutf8.decode(self.data) - if len(s) != self.utf16_size: - raise ValueError("UTF16 Length does not match!") + # def get_unicode(self): + # """ + # Returns an Unicode String + # This is the actual string. Beware that some strings might be not + # decodeable with usual UTF-16 decoder, as they use surrogates that are + # not supported by python. + # """ + # s = mutf8.decode(self.data) + # if len(s) != self.utf16_size: + # raise ValueError("UTF16 Length does not match!") - # Return a UTF16 String - return s + # # Return a UTF16 String + # return s def get(self): """ @@ -1908,11 +1906,12 @@ def get(self): string as 6 characters: \\ud853 Valid surrogates are encoded as 32bit values, ie. \U00024f5c. 
""" - s = mutf8.decode(self.data) - if len(s) != self.utf16_size: - raise ValueError("UTF16 Length does not match!") - # log.debug("Decoding UTF16 string with IDX {}, utf16 length {} and hexdata '{}'.".format(self.offset, self.utf16_size, binascii.hexlify(self.data))) - return mutf8.patch_string(s) + return mutf8.MUTF8String.from_bytes(self.data) + # s = mutf8.decode(self.data) + # if len(s) != self.utf16_size: + # raise ValueError("UTF16 Length does not match!") + # # log.debug("Decoding UTF16 string with IDX {}, utf16 length {} and hexdata '{}'.".format(self.offset, self.utf16_size, binascii.hexlify(self.data))) + # return mutf8.patch_string(s) def show(self): bytecode._PrintSubBanner("String Data Item") @@ -2167,7 +2166,7 @@ def get_parameters_off_value(self): """ if self.parameters_off_value is None: params = self.CM.get_type_list(self.parameters_off) - self.parameters_off_value = '({})'.format(' '.join(params)) + self.parameters_off_value = mutf8.MUTF8String.from_bytes(b'(') + mutf8.MUTF8String.join(params, spacing=b' ') + mutf8.MUTF8String.from_bytes(b')') return self.parameters_off_value def show(self): @@ -2680,7 +2679,7 @@ def reload(self): name = self.CM.get_field(self.field_idx) self.class_name = name[0] self.name = name[2] - self.proto = ''.join(i for i in name[1]) + self.proto = name[1] def set_init_value(self, value): """ @@ -2916,7 +2915,7 @@ def reload(self): if v and len(v) >= 3: self.class_name = v[0] self.name = v[1] - self.proto = ''.join(i for i in v[2]) + self.proto = mutf8.MUTF8String.join(i for i in v[2]) else: self.class_name = 'CLASS_NAME_ERROR' self.name = 'NAME_ERROR' @@ -2993,7 +2992,7 @@ def __str__(self): @property def full_name(self): """Return class_name + name + descriptor, separated by spaces (no access flags""" - return " ".join([self.class_name, self.name, self.get_descriptor()]) + return mutf8.MUTF8String.join([self.class_name, self.name, self.get_descriptor()], spacing=b' ') def get_short_string(self): """ @@ -7948,11 +7947,12 
@@ def get_method(self, name): :rtype: a list with all :class:`EncodedMethod` objects """ # TODO could use a generator here - prog = re.compile(name) + name = mutf8.MUTF8String.from_str(name) + prog = re.compile(name.bytes) l = [] for i in self.get_classes(): for j in i.get_methods(): - if prog.match(j.get_name()): + if prog.match(j.get_name().bytes): l.append(j) return l @@ -7965,11 +7965,12 @@ def get_field(self, name): :rtype: a list with all :class:`EncodedField` objects """ # TODO could use a generator here - prog = re.compile(name) + name = mutf8.MUTF8String.from_str(name) + prog = re.compile(name.bytes) l = [] for i in self.get_classes(): for j in i.get_fields(): - if prog.match(j.get_name()): + if prog.match(j.get_name().bytes): l.append(j) return l @@ -8212,7 +8213,7 @@ def _delete_python_export_class(self, _class): def _create_python_export_class(self, _class, delete=False): if _class is not None: ### Class - name = bytecode.FormatClassToPython(_class.get_name()) + name = bytecode.FormatClassToPython(_class.get_name()).string if delete: delattr(self.C, name) return @@ -8236,13 +8237,13 @@ def _create_python_export_methods(self, _class, delete): for i in m: if len(m[i]) == 1: j = m[i][0] - name = bytecode.FormatNameToPython(j.get_name()) + name = bytecode.FormatNameToPython(j.get_name()).string setattr(_class.M, name, j) else: for j in m[i]: name = ( bytecode.FormatNameToPython(j.get_name()) + "_" + - bytecode.FormatDescriptorToPython(j.get_descriptor())) + bytecode.FormatDescriptorToPython(j.get_descriptor())).string setattr(_class.M, name, j) def _create_python_export_fields(self, _class, delete): @@ -8257,13 +8258,13 @@ def _create_python_export_fields(self, _class, delete): for i in f: if len(f[i]) == 1: j = f[i][0] - name = bytecode.FormatNameToPython(j.get_name()) + name = bytecode.FormatNameToPython(j.get_name()).string setattr(_class.F, name, j) else: for j in f[i]: name = bytecode.FormatNameToPython(j.get_name( )) + "_" + 
bytecode.FormatDescriptorToPython( - j.get_descriptor()) + j.get_descriptor()).string setattr(_class.F, name, j) def get_BRANCH_DVM_OPCODES(self): diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py index c79a2987e..21ea1122e 100644 --- a/androguard/core/bytecodes/mutf8.py +++ b/androguard/core/bytecodes/mutf8.py @@ -91,21 +91,38 @@ def encode(s): class MUTF8String(): def __init__(self, data, raw=True): - self.__encoded = None - self.__decoded = None - if raw: - self.__encoded = data + if isinstance(data, MUTF8String): + self.__encoded = data.__encoded + self.__decoded = data.__decoded else: - self.__decoded = data + self.__encoded = None + self.__decoded = None + if raw: + self.__encoded = data + else: + self.__decoded = data @classmethod def from_bytes(cls, data): - return cls(data) + return cls(bytes(data)) @classmethod def from_str(cls, data): return cls(data, raw=False) + @classmethod + def join(cls, data, spacing=b''): + array = [] + for i in data: + try: + array.append(i.bytes) + except AttributeError: + if isinstance(i, bytes): + array.append(i) + else: + array.append(encode(i)) + return MUTF8String.from_bytes(spacing.join(array)) + @property def bytes(self): if self.__encoded is None: @@ -118,17 +135,53 @@ def string(self): self.__decoded = decode(self.__encoded) return self.__decoded + def replace(self, old, new): + try: + return MUTF8String.from_bytes(self.bytes.replace(old, new)) + except TypeError: + return MUTF8String.from_bytes(self.bytes.replace(encode(old), encode(new))) + + def find(self, sub): + try: + return self.bytes.find(sub) + except TypeError: + return self.bytes.find(encode(sub)) + + def split(self, sub): + try: + return self.bytes.split(sub) + except TypeError: + return self.bytes.split(encode(sub)) + + def startswith(self, sub): + try: + return self.bytes.startswith(sub) + except TypeError: + return self.bytes.startswith(encode(sub)) + + def __add__(self, other): + try: + return 
MUTF8String.from_bytes(self.bytes + other.bytes) + except AttributeError: + return MUTF8String.from_bytes(self.bytes + encode(other)) + + def __getitem__(self, item): + return MUTF8String.from_bytes(self.bytes[item]) + def __repr__(self): return "".format(self.__str__()) def __str__(self): return self.string.encode('utf8', errors='backslashreplace').decode('utf8') + def __format__(self, format_spec): + return format(self.string, format_spec) + def __hash__(self): - return self.bytes.__hash__() + return hash(self.bytes) def __len__(self): - return self.bytes.__len__() + return len(self.bytes) def __lt__(self, other): try: diff --git a/androguard/decompiler/dad/writer.py b/androguard/decompiler/dad/writer.py index 61a91bee0..1fd2d3e84 100644 --- a/androguard/decompiler/dad/writer.py +++ b/androguard/decompiler/dad/writer.py @@ -17,6 +17,7 @@ import logging from struct import unpack +from androguard.core.bytecodes import mutf8 from androguard.decompiler.dad.util import get_type from androguard.decompiler.dad.opcode_ins import Op from androguard.decompiler.dad.instruction import ( @@ -48,7 +49,7 @@ def __init__(self, graph, method): self.need_break = True def __str__(self): - return ''.join(self.buffer) + return mutf8.MUTF8String.join(self.buffer).string def str_ext(self): return self.buffer2 diff --git a/tests/test_dexcodeparsing.py b/tests/test_dexcodeparsing.py index 335658f0d..5f5c9cac2 100644 --- a/tests/test_dexcodeparsing.py +++ b/tests/test_dexcodeparsing.py @@ -45,7 +45,7 @@ def testcode(self): def testClassManager(self): """Test if the classmanager has the same items""" - from androguard.core.bytecodes.mutf8 import decode, patch_string + from androguard.core.bytecodes.mutf8 import decode fname = "examples/android/TestsAndroguard/bin/classes.dex" @@ -64,7 +64,7 @@ def testClassManager(self): for idx in range(parsed.string_ids_size): self.assertNotEqual(cm.get_string(idx), ERR_STR) self.assertNotEqual(cm.get_raw_string(idx), ERR_STR) - 
self.assertEqual(cm.get_raw_string(idx), patch_string(decode(parsed.str_raw[idx]))) + self.assertEqual(cm.get_raw_string(idx), decode(parsed.str_raw[idx])) self.assertEqual(cm.get_string(parsed.string_ids_size), ERR_STR) self.assertEqual(cm.get_raw_string(parsed.string_ids_size), ERR_STR) From 8623899959ee94636fe5296a227945eae58ef9b6 Mon Sep 17 00:00:00 2001 From: Neakxs Date: Thu, 2 May 2019 12:26:57 -0700 Subject: [PATCH 10/15] Applying patch of #684 and fixing bugs Patch concern `read_null_terminated_string` improvement --- androguard/core/bytecode.py | 1 - androguard/core/bytecodes/dvm.py | 20 ++++++++++++-------- androguard/decompiler/decompiler.py | 2 +- tests/test_decompiler.py | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/androguard/core/bytecode.py b/androguard/core/bytecode.py index d1ffba44b..91a943ff6 100644 --- a/androguard/core/bytecode.py +++ b/androguard/core/bytecode.py @@ -885,7 +885,6 @@ def FormatClassToPython(i): :rtype: str """ i = i[:-1] - print(i) i = i.replace("/", "_") i = i.replace("$", "_") diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py index a6c68d389..31ea96b57 100644 --- a/androguard/core/bytecodes/dvm.py +++ b/androguard/core/bytecodes/dvm.py @@ -97,17 +97,21 @@ class InvalidInstruction(Error): def read_null_terminated_string(f): """ Read a null terminated string from a file-like object. 
- :param f: file-like object :rtype: bytearray """ - x = bytearray() + x = [] while True: - z = f.read(1) - if ord(z) == 0: - return x + z = f.read(128) + if 0 in z: + s = z.split(b'\x00',1) + x.append(s[0]) + idx = f.get_idx() + f.set_idx(idx - len(s[1])) + break else: - x.append(ord(z)) + x.append(z) + return b''.join(x) def get_access_flags_string(value): @@ -194,11 +198,11 @@ def static_operand_instruction(instruction): def get_sbyte(buff): - return unpack('=b', bytearray(buff.read(1)))[0] + return unpack('=b', buff.read(1))[0] def get_byte(buff): - return unpack('=B', bytearray(buff.read(1)))[0] + return unpack('=B', buff.read(1))[0] def readuleb128(buff): diff --git a/androguard/decompiler/decompiler.py b/androguard/decompiler/decompiler.py index 76a0eeaaa..de34bd843 100644 --- a/androguard/decompiler/decompiler.py +++ b/androguard/decompiler/decompiler.py @@ -740,7 +740,7 @@ def _find_class(self, clname, basefolder): return res # Check the whole supplied name - fname = os.path.join(basefolder, clname.replace("/", os.sep) + ".java") + fname = os.path.join(basefolder, (clname.replace("/", os.sep) + ".java").string) if not os.path.isfile(fname): return None return fname diff --git a/tests/test_decompiler.py b/tests/test_decompiler.py index badfce314..00e52b0d1 100644 --- a/tests/test_decompiler.py +++ b/tests/test_decompiler.py @@ -47,7 +47,7 @@ def test_all_decompiler(): # Generate test cases for this APK: a, d, dx = AnalyzeAPK("examples/tests/hello-world.apk") for c in d[0].get_classes(): - test_name = re.sub("[^a-zA-Z0-9_]", "_", c.get_name()[1:-1]) + test_name = re.sub("[^a-zA-Z0-9_]", "_", c.get_name().string[1:-1]) # Test the decompilation of a single class # disable for now, as testing all DvMethods has the same effect as # testing all DvClasses. 
From 2acf45dfd161a4953cdf1197170deb068ebc5f62 Mon Sep 17 00:00:00 2001 From: Neakxs Date: Thu, 2 May 2019 13:25:24 -0700 Subject: [PATCH 11/15] Improving MUTF8String methods --- androguard/core/bytecodes/mutf8.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py index 21ea1122e..2817b6685 100644 --- a/androguard/core/bytecodes/mutf8.py +++ b/androguard/core/bytecodes/mutf8.py @@ -135,11 +135,17 @@ def string(self): self.__decoded = decode(self.__encoded) return self.__decoded - def replace(self, old, new): - try: - return MUTF8String.from_bytes(self.bytes.replace(old, new)) - except TypeError: - return MUTF8String.from_bytes(self.bytes.replace(encode(old), encode(new))) + def replace(self, old, new, count=None): + if count is None: + try: + return MUTF8String.from_bytes(self.bytes.replace(old, new)) + except TypeError: + return MUTF8String.from_bytes(self.bytes.replace(encode(old), encode(new))) + else: + try: + return MUTF8String.from_bytes(self.bytes.replace(old, new, count)) + except TypeError: + return MUTF8String.from_bytes(self.bytes.replace(encode(old), encode(new), count)) def find(self, sub): try: @@ -147,11 +153,17 @@ def find(self, sub): except TypeError: return self.bytes.find(encode(sub)) - def split(self, sub): + def split(self, sep=None, maxsplit=-1): + try: + return [MUTF8String.from_bytes(i) for i in self.bytes.split(sep, maxsplit)] + except TypeError: + return [MUTF8String.from_bytes(i) for i in self.bytes.split(encode(sep), maxsplit)] + + def rsplit(self, sep=None, maxsplit=-1): try: - return self.bytes.split(sub) + return [MUTF8String.from_bytes(i) for i in self.bytes.rsplit(sep, maxsplit)] except TypeError: - return self.bytes.split(encode(sub)) + return [MUTF8String.from_bytes(i) for i in self.bytes.rsplit(encode(sep), maxsplit)] def startswith(self, sub): try: From 14d6b0ae6b73673c4d4408ec1d99e105e713465e Mon Sep 17 00:00:00 
2001 From: Neakxs Date: Thu, 2 May 2019 13:27:42 -0700 Subject: [PATCH 12/15] Changing DAD strings with MUTF8String equivalents --- androguard/decompiler/dad/decompile.py | 2 +- androguard/decompiler/dad/writer.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/androguard/decompiler/dad/decompile.py b/androguard/decompiler/dad/decompile.py index 3f1a98ab6..a09b9319e 100644 --- a/androguard/decompiler/dad/decompile.py +++ b/androguard/decompiler/dad/decompile.py @@ -299,7 +299,7 @@ def get_source(self): if len(self.interfaces) > 0: prototype += ' implements %s' % ', '.join( - [n[1:-1].replace('/', '.') for n in self.interfaces]) + [n[1:-1].replace('/', '.').string for n in self.interfaces]) source.append('%s {\n' % prototype) for field in self.fields: diff --git a/androguard/decompiler/dad/writer.py b/androguard/decompiler/dad/writer.py index 1fd2d3e84..83055f80f 100644 --- a/androguard/decompiler/dad/writer.py +++ b/androguard/decompiler/dad/writer.py @@ -679,6 +679,10 @@ def visit_condz_expression(self, op, arg): arg.visit(self) else: arg.visit(self) + try: + atype = atype.string + except AttributeError: + pass if atype in 'VBSCIJFD': self.write(' %s 0' % op, data="TODO64") else: From ad1fffeeb8a87737cb2c0acfa699b484f1ee4422 Mon Sep 17 00:00:00 2001 From: Neakxs Date: Thu, 2 May 2019 14:52:59 -0700 Subject: [PATCH 13/15] Fixing decompiler bugs with MUTF8String --- androguard/cli/main.py | 6 +++--- androguard/core/bytecodes/dvm.py | 6 +++--- androguard/core/bytecodes/mutf8.py | 6 ++++++ androguard/decompiler/decompiler.py | 4 ++-- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/androguard/cli/main.py b/androguard/cli/main.py index df85d4886..2ffc2549b 100644 --- a/androguard/cli/main.py +++ b/androguard/cli/main.py @@ -240,7 +240,7 @@ def export_apps_to_format(filename, continue # Current Folder to write to - filename_class = valid_class_name(method.get_class_name()) + filename_class = 
valid_class_name(method.get_class_name().string) filename_class = os.path.join(output, filename_class) create_directory(filename_class) @@ -257,10 +257,10 @@ def export_apps_to_format(filename, method2format(filename + "." + form, form, None, buff) # Write the Java file for the whole class - if method.get_class_name() not in dump_classes: + if method.get_class_name().string not in dump_classes: print("source codes ...", end=' ') current_class = vm.get_class(method.get_class_name()) - current_filename_class = valid_class_name(current_class.get_name()) + current_filename_class = valid_class_name(current_class.get_name().string) current_filename_class = os.path.join(output, current_filename_class + ".java") with open(current_filename_class, "w") as fd: diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py index 31ea96b57..8ed1b2c05 100644 --- a/androguard/core/bytecodes/dvm.py +++ b/androguard/core/bytecodes/dvm.py @@ -3027,11 +3027,11 @@ def _fmt_classname(cls): cls = cls.rsplit("/", 1)[1] return arr + cls - clsname = _fmt_classname(self.get_class_name()) + clsname = _fmt_classname(self.get_class_name().string) - param, ret = self.get_descriptor()[1:].split(")") + param, ret = self.get_descriptor().string[1:].split(")") params = map(_fmt_classname, param.split(" ")) - desc = "({}){}".format(" ".join(params), _fmt_classname(ret)) + desc = "({}){}".format(mutf8.MUTF8String.join(params), _fmt_classname(ret)) return "{cls} {meth} {desc}".format(cls=clsname, meth=self.get_name(), desc=desc) diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py index 2817b6685..81726d759 100644 --- a/androguard/core/bytecodes/mutf8.py +++ b/androguard/core/bytecodes/mutf8.py @@ -165,6 +165,12 @@ def rsplit(self, sep=None, maxsplit=-1): except TypeError: return [MUTF8String.from_bytes(i) for i in self.bytes.rsplit(encode(sep), maxsplit)] + def lstrip(self, sub): + try: + return MUTF8String.from_bytes(self.bytes.lstrip(sub)) + except 
TypeError: + return MUTF8String.from_bytes(self.bytes.lstrip(encode(sub))) + def startswith(self, sub): try: return self.bytes.startswith(sub) diff --git a/androguard/decompiler/decompiler.py b/androguard/decompiler/decompiler.py index de34bd843..7084a785c 100644 --- a/androguard/decompiler/decompiler.py +++ b/androguard/decompiler/decompiler.py @@ -696,7 +696,7 @@ def __init__(self, vm, vmx, jadx="jadx", keepfiles=False): # Next, try to find files for the classes we have for cl in andr_class_names: - fname = self._find_class(cl, tmpfolder) + fname = self._find_class(cl.string, tmpfolder) if fname: if "L{};".format(cl) not in self.classes: with open(fname, "rb") as fp: @@ -740,7 +740,7 @@ def _find_class(self, clname, basefolder): return res # Check the whole supplied name - fname = os.path.join(basefolder, (clname.replace("/", os.sep) + ".java").string) + fname = os.path.join(basefolder, clname.replace("/", os.sep) + ".java") if not os.path.isfile(fname): return None return fname From 46be25270ea7569ed77e8c7acffdd0d6116742ec Mon Sep 17 00:00:00 2001 From: Neakxs Date: Wed, 8 May 2019 14:21:36 -0700 Subject: [PATCH 14/15] Removing unicode related methods --- androguard/core/bytecodes/dvm.py | 47 ++------------------------------ 1 file changed, 2 insertions(+), 45 deletions(-) diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py index 8ed1b2c05..3e769c333 100644 --- a/androguard/core/bytecodes/dvm.py +++ b/androguard/core/bytecodes/dvm.py @@ -1889,33 +1889,11 @@ def set_off(self, off): def get_off(self): return self.offset - # def get_unicode(self): - # """ - # Returns an Unicode String - # This is the actual string. Beware that some strings might be not - # decodeable with usual UTF-16 decoder, as they use surrogates that are - # not supported by python. 
- # """ - # s = mutf8.decode(self.data) - # if len(s) != self.utf16_size: - # raise ValueError("UTF16 Length does not match!") - - # # Return a UTF16 String - # return s - def get(self): """ - Returns a printable string. - In this case, all lonely surrogates are escaped, thus are represented in the - string as 6 characters: \\ud853 - Valid surrogates are encoded as 32bit values, ie. \U00024f5c. + Returns a MUTF8String object """ return mutf8.MUTF8String.from_bytes(self.data) - # s = mutf8.decode(self.data) - # if len(s) != self.utf16_size: - # raise ValueError("UTF16 Length does not match!") - # # log.debug("Decoding UTF16 string with IDX {}, utf16 length {} and hexdata '{}'.".format(self.offset, self.utf16_size, binascii.hexlify(self.data))) - # return mutf8.patch_string(s) def show(self): bytecode._PrintSubBanner("String Data Item") @@ -2166,7 +2144,7 @@ def get_parameters_off_value(self): """ Return the string associated to the parameters_off - :rtype: string + :rtype: MUTF8String """ if self.parameters_off_value is None: params = self.CM.get_type_list(self.parameters_off) @@ -8145,27 +8123,6 @@ def get_field_descriptor(self, class_name, field_name, descriptor): return self.__cache_fields.get(key) - def get_strings_unicode(self): - """ - Return all strings - - This method will return pure UTF-16 strings. This is the "exact" same string as used in Java. - Those strings can be problematic for python, as they can contain surrogates as well as "broken" - surrogate pairs, ie single high or low surrogates. - Such a string can for example not be printed. - To avoid such problems, there is an escape mechanism to detect such lonely surrogates - and escape them in the string. Of course, this results in a different string than in the Java Source! - - Use `get_strings()` as a general purpose and `get_strings_unicode()` if you require the exact string - from the Java Source. 
- You can always escape the string from `get_strings_unicode()` using the function - :meth:`androguard.core.bytecodes.mutf8.patch_string` - - :rtype: a list with all strings used in the format (types, names ...) - """ - for i in self.strings: - yield i.get_unicode() - def get_strings(self): """ Return all strings From 50ccf777cc4870bd3eeb0f2a2ea4e2599a078051 Mon Sep 17 00:00:00 2001 From: Neakxs Date: Thu, 9 May 2019 12:20:57 -0700 Subject: [PATCH 15/15] Changing mutf8 package location --- androguard/core/analysis/analysis.py | 4 ++-- androguard/core/bytecodes/dvm.py | 2 +- androguard/core/{bytecodes => }/mutf8.py | 0 androguard/decompiler/dad/writer.py | 2 +- tests/test_dexcodeparsing.py | 2 +- tests/test_strings.py | 3 ++- 6 files changed, 7 insertions(+), 6 deletions(-) rename androguard/core/{bytecodes => }/mutf8.py (100%) diff --git a/androguard/core/analysis/analysis.py b/androguard/core/analysis/analysis.py index 52193ef5e..d29591928 100644 --- a/androguard/core/analysis/analysis.py +++ b/androguard/core/analysis/analysis.py @@ -3,9 +3,9 @@ import time import warnings from androguard.core.androconf import is_ascii_problem, load_api_specific_resource_module -from androguard.core.bytecodes import dvm, mutf8 +from androguard.core.bytecodes import dvm import logging -from androguard.core import bytecode +from androguard.core import bytecode, mutf8 import networkx as nx from enum import IntEnum diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py index 3e769c333..d4e41d7d0 100644 --- a/androguard/core/bytecodes/dvm.py +++ b/androguard/core/bytecodes/dvm.py @@ -2,7 +2,7 @@ from androguard.core.bytecodes.apk import APK from androguard.core.androconf import CONF -from androguard.core.bytecodes import mutf8 +from androguard.core import mutf8 from androguard.core.bytecodes.dvm_types import TypeMapItem, ACCESS_FLAGS, TYPE_DESCRIPTOR import sys diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/mutf8.py similarity index 100% 
rename from androguard/core/bytecodes/mutf8.py rename to androguard/core/mutf8.py diff --git a/androguard/decompiler/dad/writer.py b/androguard/decompiler/dad/writer.py index 83055f80f..31168a677 100644 --- a/androguard/decompiler/dad/writer.py +++ b/androguard/decompiler/dad/writer.py @@ -17,7 +17,7 @@ import logging from struct import unpack -from androguard.core.bytecodes import mutf8 +from androguard.core import mutf8 from androguard.decompiler.dad.util import get_type from androguard.decompiler.dad.opcode_ins import Op from androguard.decompiler.dad.instruction import ( diff --git a/tests/test_dexcodeparsing.py b/tests/test_dexcodeparsing.py index 5f5c9cac2..c5b553f2f 100644 --- a/tests/test_dexcodeparsing.py +++ b/tests/test_dexcodeparsing.py @@ -45,7 +45,7 @@ def testcode(self): def testClassManager(self): """Test if the classmanager has the same items""" - from androguard.core.bytecodes.mutf8 import decode + from androguard.core.mutf8 import decode fname = "examples/android/TestsAndroguard/bin/classes.dex" diff --git a/tests/test_strings.py b/tests/test_strings.py index 4a2224979..d9ef06de8 100644 --- a/tests/test_strings.py +++ b/tests/test_strings.py @@ -3,7 +3,8 @@ import sys -from androguard.core.bytecodes import dvm, mutf8 +from androguard.core import mutf8 +from androguard.core.bytecodes import dvm from androguard.core.analysis import analysis