From 2fcc17ef84b26f3bf4548ce88173ede0ab3e38dc Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Thu, 25 Apr 2019 11:42:39 -0700
Subject: [PATCH 01/15] Adding function for MUTF-8 decoding and patching

This function combines the `decode` and `patch_string` functions.
---
 androguard/core/bytecodes/mutf8.py | 63 ++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py
index 7d974996a..3dca54e79 100644
--- a/androguard/core/bytecodes/mutf8.py
+++ b/androguard/core/bytecodes/mutf8.py
@@ -41,6 +41,69 @@ def decode(b):
     return res
 
 
+def decode_and_patch(b, size):
+    """
+    Decode bytes as MUTF-8
+    See https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
+    for more information
+
+    Surrogates will be returned as two 16 bit characters.
+
+    :param b: bytes to decode
+    :rtype: unicode (py2), str (py3) of 16bit chars
+    :raises: UnicodeDecodeError if string is not decodable
+    """
+    chr_array = [""] * size
+
+    b = iter(b)
+
+    high_surrogate = None
+    chr_index = 0
+    for x in b:
+        n = 0
+        if x >> 7 == 0:
+            # Single char:
+            n = x & 0x7f
+        elif x >> 5 == 0b110:
+            # 2 byte Multichar
+            b2 = next(b)
+            if b2 >> 6 != 0b10:
+                raise UnicodeDecodeError("Second byte of 2 byte sequence does not looks right.")
+
+            n = (x & 0x1f) << 6 | b2 & 0x3f
+        elif x >> 4 == 0b1110:
+            # 3 byte Multichar
+            b2 = next(b)
+            b3 = next(b)
+            if b2 >> 6 != 0b10:
+                raise UnicodeDecodeError("Second byte of 3 byte sequence does not looks right.")
+            if b3 >> 6 != 0b10:
+                raise UnicodeDecodeError("Third byte of 3 byte sequence does not looks right.")
+
+            n = (x & 0xf) << 12 | (b2 & 0x3f) << 6 | b3 & 0x3f
+        else:
+            raise UnicodeDecodeError("Could not decode byte")
+        if high_surrogate is not None:
+            c = high_surrogate
+            if n and (n >> 10) == 0b110111:
+                chr_array[chr_index] = chr(((c & 0x3ff) << 10 | (n & 0x3ff)) + 0x10000)
+                chr_index += 1
+            else:
+                chr_array[chr_index] = "\\u{:04x}".format(c)
+            high_surrogate = None
+        else:
+            c = n
+            if (c >> 10) == 0b110110:
+                high_surrogate = c
+            elif (c >> 10) == 0b110111:
+                chr_array[chr_index] = "\\u{:04x}".format(c)
+            else:
+                chr_array[chr_index] = chr(c)
+            chr_index += 1
+
+    return "".join(chr_array)
+
+
 class PeekIterator:
     """
     A quick'n'dirty variant of an Iterator that has a special function

From 51f80833c3896bf71e036f846087fdbcf1acbcb3 Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Thu, 25 Apr 2019 12:04:22 -0700
Subject: [PATCH 02/15] Replacing `patch_string` calls with `decode_and_patch`

Also adds a size check inside the `decode_and_patch` function.
---
 androguard/core/bytecodes/dvm.py   | 5 +----
 androguard/core/bytecodes/mutf8.py | 5 +++++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py
index 0c0859cfe..66152b184 100644
--- a/androguard/core/bytecodes/dvm.py
+++ b/androguard/core/bytecodes/dvm.py
@@ -1908,11 +1908,8 @@ def get(self):
         string as 6 characters: \\ud853
         Valid surrogates are encoded as 32bit values, ie. \U00024f5c.
         """
-        s = mutf8.decode(self.data)
-        if len(s) != self.utf16_size:
-            raise ValueError("UTF16 Length does not match!")
         # log.debug("Decoding UTF16 string with IDX {}, utf16 length {} and hexdata '{}'.".format(self.offset, self.utf16_size, binascii.hexlify(self.data)))
-        return mutf8.patch_string(s)
+        return mutf8.decode_and_patch(self.data, self.utf16_size)
 
     def show(self):
         bytecode._PrintSubBanner("String Data Item")
diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py
index 3dca54e79..f351b6c3b 100644
--- a/androguard/core/bytecodes/mutf8.py
+++ b/androguard/core/bytecodes/mutf8.py
@@ -56,6 +56,7 @@ def decode_and_patch(b, size):
     chr_array = [""] * size
 
     b = iter(b)
+    decoded_size = 0
 
     high_surrogate = None
     chr_index = 0
@@ -100,6 +101,10 @@ def decode_and_patch(b, size):
             else:
                 chr_array[chr_index] = chr(c)
             chr_index += 1
+        decoded_size += 1
+
+    if decoded_size != size:
+        raise ValueError("UTF16 Length does not match!")
 
     return "".join(chr_array)
 

From 26557ef66e16225816bdb1802be42a52a59bc24a Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Thu, 25 Apr 2019 14:12:41 -0700
Subject: [PATCH 03/15] Adding mutf8 tests and fixing bugs

---
 androguard/core/bytecodes/mutf8.py | 58 +++++++++++++-----------------
 tests/test_strings.py              |  6 ++++
 2 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py
index f351b6c3b..97c5ddf42 100644
--- a/androguard/core/bytecodes/mutf8.py
+++ b/androguard/core/bytecodes/mutf8.py
@@ -42,36 +42,22 @@ def decode(b):
 
 
 def decode_and_patch(b, size):
-    """
-    Decode bytes as MUTF-8
-    See https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
-    for more information
-
-    Surrogates will be returned as two 16 bit characters.
-
-    :param b: bytes to decode
-    :rtype: unicode (py2), str (py3) of 16bit chars
-    :raises: UnicodeDecodeError if string is not decodable
-    """
-    chr_array = [""] * size
+    ord_array = [None] * size
+    ord_index = 0
 
     b = iter(b)
-    decoded_size = 0
 
-    high_surrogate = None
-    chr_index = 0
     for x in b:
-        n = 0
         if x >> 7 == 0:
             # Single char:
-            n = x & 0x7f
+            ord_array[ord_index] = x & 0x7f
         elif x >> 5 == 0b110:
             # 2 byte Multichar
             b2 = next(b)
             if b2 >> 6 != 0b10:
                 raise UnicodeDecodeError("Second byte of 2 byte sequence does not looks right.")
 
-            n = (x & 0x1f) << 6 | b2 & 0x3f
+            ord_array[ord_index] = (x & 0x1f) << 6 | b2 & 0x3f
         elif x >> 4 == 0b1110:
             # 3 byte Multichar
             b2 = next(b)
@@ -81,30 +67,34 @@ def decode_and_patch(b, size):
             if b3 >> 6 != 0b10:
                 raise UnicodeDecodeError("Third byte of 3 byte sequence does not looks right.")
 
-            n = (x & 0xf) << 12 | (b2 & 0x3f) << 6 | b3 & 0x3f
+            ord_array[ord_index] = (x & 0xf) << 12 | (b2 & 0x3f) << 6 | b3 & 0x3f
         else:
             raise UnicodeDecodeError("Could not decode byte")
-        if high_surrogate is not None:
-            c = high_surrogate
+        ord_index += 1
+
+    if ord_index != size:
+        raise ValueError("UTF16 Length does not match! {}".format(ord_index))
+
+    chr_array = [""]*size
+    chr_index = 0
+    while chr_index < size:
+        c = ord_array[chr_index]
+        if (c >> 10) == 0b110110:
+            n = None
+            try:
+                n = ord_array[chr_index + 1]
+            except:
+                pass
             if n and (n >> 10) == 0b110111:
                 chr_array[chr_index] = chr(((c & 0x3ff) << 10 | (n & 0x3ff)) + 0x10000)
                 chr_index += 1
             else:
                 chr_array[chr_index] = "\\u{:04x}".format(c)
-            high_surrogate = None
+        elif (c >> 10) == 0b110111:
+            chr_array[chr_index] = "\\u{:04x}".format(c)
         else:
-            c = n
-            if (c >> 10) == 0b110110:
-                high_surrogate = c
-            elif (c >> 10) == 0b110111:
-                chr_array[chr_index] = "\\u{:04x}".format(c)
-            else:
-                chr_array[chr_index] = chr(c)
-            chr_index += 1
-        decoded_size += 1
-
-    if decoded_size != size:
-        raise ValueError("UTF16 Length does not match!")
+            chr_array[chr_index] = chr(c)
+        chr_index += 1
 
     return "".join(chr_array)
 
diff --git a/tests/test_strings.py b/tests/test_strings.py
index df136b8fc..48877d969 100644
--- a/tests/test_strings.py
+++ b/tests/test_strings.py
@@ -56,6 +56,12 @@ def testMUTF8(self):
 
         self.assertEqual("\U00024f5c\U0001f64f\\ud83d\uacf0hello world\x00", mutf8.patch_string(mutf8.decode(b)))
 
+        self.assertEqual("hello world", mutf8.decode_and_patch(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64", 11))
+        self.assertEqual("\U00024f5c", mutf8.decode_and_patch(b"\xed\xa1\x93\xed\xbd\x9c",2))
+        self.assertEqual("\U0001f64f", mutf8.decode_and_patch(b"\xed\xa0\xbd\xed\xb9\x8f",2))
+        self.assertEqual("\\ud853", mutf8.decode_and_patch(b"\xed\xa1\x93", 1))
+        self.assertEqual("\U00024f5c\U0001f64f\\ud83d\uacf0hello world\x00", mutf8.decode_and_patch(b, 18))
+
 
 
 if __name__ == '__main__':

From e0f9625602fbf0d7522e3422f15b5cfd49024ac0 Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Sat, 27 Apr 2019 08:30:01 -0700
Subject: [PATCH 04/15] Improving `read_null_terminated_string` performance

---
 androguard/core/bytecodes/dvm.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py
index 66152b184..f439ce921 100644
--- a/androguard/core/bytecodes/dvm.py
+++ b/androguard/core/bytecodes/dvm.py
@@ -101,13 +101,18 @@ def read_null_terminated_string(f):
     :param f: file-like object
     :rtype: bytearray
     """
-    x = bytearray()
+    x = []
     while True:
-        z = f.read(1)
-        if ord(z) == 0:
-            return x
+        z = f.read(128)
+        if 0 in z:
+            s = z.split(b'\x00',1)
+            x.append(s[0])
+            idx = f.get_idx()
+            f.set_idx(idx - len(s[1]))
+            break
         else:
-            x.append(ord(z))
+            x.append(z)
+    return b''.join(x)
 
 
 def get_access_flags_string(value):

From 80a26ca2d547b185b6fb0f433b0cede8ce5eb2b6 Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Sat, 27 Apr 2019 09:10:31 -0700
Subject: [PATCH 05/15] Removing duplicate object creation

---
 androguard/core/bytecodes/dvm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py
index f439ce921..5ae3f2af8 100644
--- a/androguard/core/bytecodes/dvm.py
+++ b/androguard/core/bytecodes/dvm.py
@@ -199,11 +199,11 @@ def static_operand_instruction(instruction):
 
 
 def get_sbyte(buff):
-    return unpack('=b', bytearray(buff.read(1)))[0]
+    return unpack('=b', buff.read(1))[0]
 
 
 def get_byte(buff):
-    return unpack('=B', bytearray(buff.read(1)))[0]
+    return unpack('=B', buff.read(1))[0]
 
 
 def readuleb128(buff):

From 5d9e496695860a86ac973bbf3c93e83b50e499c4 Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Sun, 28 Apr 2019 04:56:26 -0700
Subject: [PATCH 06/15] Removing SV and SVs classes

---
 androguard/core/bytecode.py | 60 -------------------------------------
 1 file changed, 60 deletions(-)

diff --git a/androguard/core/bytecode.py b/androguard/core/bytecode.py
index 91a943ff6..7691b0f25 100644
--- a/androguard/core/bytecode.py
+++ b/androguard/core/bytecode.py
@@ -593,60 +593,6 @@ def method2json_direct(mx):
     return json.dumps(d)
 
 
-class SV:
-
-    def __init__(self, size, buff):
-        self.__size = size
-        self.__value = unpack(self.__size, buff)[0]
-
-    def _get(self):
-        return pack(self.__size, self.__value)
-
-    def __str__(self):
-        return "0x%x" % self.__value
-
-    def __int__(self):
-        return self.__value
-
-    def get_value_buff(self):
-        return self._get()
-
-    def get_value(self):
-        return self.__value
-
-    def set_value(self, attr):
-        self.__value = attr
-
-
-class SVs:
-
-    def __init__(self, size, ntuple, buff):
-        self.__size = size
-
-        self.__value = ntuple._make(unpack(self.__size, buff))
-
-    def _get(self):
-        l = []
-        for i in self.__value._fields:
-            l.append(getattr(self.__value, i))
-        return pack(self.__size, *l)
-
-    def _export(self):
-        return [x for x in self.__value._fields]
-
-    def get_value_buff(self):
-        return self._get()
-
-    def get_value(self):
-        return self.__value
-
-    def set_value(self, attr):
-        self.__value = self.__value._replace(**attr)
-
-    def __str__(self):
-        return self.__value.__str__()
-
-
 def object_to_bytes(obj):
     """
     Convert a object to a bytearray or call get_raw() of the object
@@ -787,9 +733,6 @@ def readat(self, off):
         :param int off: starting offset
         :rtype: bytearray
         """
-        if isinstance(off, SV):
-            off = off.value
-
         return self.__buff[off:]
 
     def read(self, size):
@@ -800,9 +743,6 @@ def read(self, size):
         :param int size: length of bytes to read
         :rtype: bytearray
         """
-        if isinstance(size, SV):
-            size = size.value
-
         buff = self.__buff[self.__idx:self.__idx + size]
         self.__idx += size
 

From b0b91abb0c2f78e6367edb8e5da22c728b64d396 Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Wed, 1 May 2019 12:54:24 -0700
Subject: [PATCH 07/15] Adding MUTF8String class for dexparser

Includes the `decode` and `encode` functions.
Warning: `decode` does not return a printable string (lone surrogates are
left unescaped).
---
 androguard/core/bytecodes/mutf8.py | 256 ++++++++++++++++++++---------
 1 file changed, 175 insertions(+), 81 deletions(-)

diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py
index 7d974996a..c79a2987e 100644
--- a/androguard/core/bytecodes/mutf8.py
+++ b/androguard/core/bytecodes/mutf8.py
@@ -1,103 +1,197 @@
 def decode(b):
-    """
-    Decode bytes as MUTF-8
-    See https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
-    for more information
+    size = len(b)
+    ord_array = [None] * size
+    ord_index = 0
 
-    Surrogates will be returned as two 16 bit characters.
-
-    :param b: bytes to decode
-    :rtype: unicode (py2), str (py3) of 16bit chars
-    :raises: UnicodeDecodeError if string is not decodable
-    """
-    res = ""
-
-    b = iter(bytearray(b))
+    b = iter(b)
 
     for x in b:
         if x >> 7 == 0:
             # Single char:
-            res += chr(x & 0x7f)
+            ord_array[ord_index] = x & 0x7f
         elif x >> 5 == 0b110:
             # 2 byte Multichar
             b2 = next(b)
             if b2 >> 6 != 0b10:
-                raise UnicodeDecodeError("Second byte of 2 byte sequence does not looks right.")
+                raise UnicodeDecodeError(
+                    "Second byte of 2 byte sequence does not looks right.")
 
-            res += chr((x & 0x1f) << 6 | b2 & 0x3f)
+            ord_array[ord_index] = (x & 0x1f) << 6 | b2 & 0x3f
         elif x >> 4 == 0b1110:
             # 3 byte Multichar
             b2 = next(b)
             b3 = next(b)
             if b2 >> 6 != 0b10:
-                raise UnicodeDecodeError("Second byte of 3 byte sequence does not looks right.")
+                raise UnicodeDecodeError(
+                    "Second byte of 3 byte sequence does not looks right.")
             if b3 >> 6 != 0b10:
-                raise UnicodeDecodeError("Third byte of 3 byte sequence does not looks right.")
+                raise UnicodeDecodeError(
+                    "Third byte of 3 byte sequence does not looks right.")
 
-            res += chr((x & 0xf) << 12 | (b2 & 0x3f) << 6 | b3 & 0x3f)
+            ord_array[ord_index] = (x & 0xf) << 12 | (
+                b2 & 0x3f) << 6 | b3 & 0x3f
         else:
             raise UnicodeDecodeError("Could not decode byte")
-
-    return res
-
-
-class PeekIterator:
-    """
-    A quick'n'dirty variant of an Iterator that has a special function
-    peek, which will return the next object but not consume it.
-    """
-    idx = 0
-
-    def __init__(self, s):
-        self.s = s
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        if self.idx == len(self.s):
-            raise StopIteration()
-        self.idx = self.idx + 1
-        return self.s[self.idx - 1]
-
-    def next(self):
-        # py2 compliance
-        return self.__next__()
-
-    def peek(self):
-        if self.idx == len(self.s):
-            return None
-        return self.s[self.idx]
-
-
-def patch_string(s):
-    """
-    Reorganize a String in such a way that surrogates are printable
-    and lonely surrogates are escaped.
-
-    :param s: input string
-    :return: string with escaped lonely surrogates and 32bit surrogates
-    """
-    res = ''
-    it = PeekIterator(s)
-    for c in it:
-        if (ord(c) >> 10) == 0b110110:
-            # High surrogate
-            # Check for the next
-            n = it.peek()
-            if n and (ord(n) >> 10) == 0b110111:
-                # Next is a low surrogate! Merge them together
-                res += chr(((ord(c) & 0x3ff) << 10 | (ord(n) & 0x3ff)) + 0x10000)
-                # Skip next char, as we already consumed it
-                next(it)
+        ord_index += 1
+
+    chr_array = [""]*size
+    chr_index = 0
+    while chr_index < size:
+        c = ord_array[chr_index]
+        if c is None:
+            break
+        if (c >> 10) == 0b110110:
+            n = None
+            try:
+                n = ord_array[chr_index + 1]
+            except:
+                pass
+            if n and (n >> 10) == 0b110111:
+                chr_array[chr_index] = chr(
+                    ((c & 0x3ff) << 10 | (n & 0x3ff)) + 0x10000)
+                chr_index += 1
             else:
-                # Lonely high surrogate
-                res += "\\u{:04x}".format(ord(c))
-        elif (ord(c) >> 10) == 0b110111:
-            # Lonely low surrogate
-            res += "\\u{:04x}".format(ord(c))
+                chr_array[chr_index] = chr(c)
         else:
-            # Looks like a normal char...
-            res += c
-    return res
-
+            chr_array[chr_index] = chr(c)
+        chr_index += 1
+
+    return "".join(chr_array)
+
+
+def encode(s):
+    b = [b""]*len(s)
+    ord_array = [i for i in map(lambda x: ord(x), s)]
+    for x in ord_array:
+        if (x == 0) or ((x <= 0x7ff) and (x >= 0x80)):
+            b1 = ((x & 0x7c0) >> 6 | 0xc0).to_bytes(1, 'big')
+            b2 = ((x & 0x3f) | 0x80).to_bytes(1, 'big')
+            b.append(b1 + b2)
+        elif (x <= 0x7f):
+            b1 = x.to_bytes(1, 'big')
+            b.append(b1)
+        elif (x >= 0x800) and (x <= 0xffff):
+            b1 = ((x & 0xf000) >> 12 | 0xe0).to_bytes(1, 'big')
+            b2 = ((x & 0xfff) >> 6 | 0x80).to_bytes(1, 'big')
+            b3 = ((x & 0x3f) | 0x80).to_bytes(1, 'big')
+            b.append(b1 + b2 + b3)
+        else:
+            a = x - 0x10000
+            s1 = ((a >> 10) | 0xd800)
+            s2 = ((a & 0x3ff) | 0xdc00)
+            b1 = ((s1 & 0xf000) >> 12 | 0xe0).to_bytes(1, 'big')
+            b2 = ((s1 & 0xfff) >> 6 | 0x80).to_bytes(1, 'big')
+            b3 = ((s1 & 0x3f) | 0x80).to_bytes(1, 'big')
+            b4 = ((s2 & 0xf000) >> 12 | 0xe0).to_bytes(1, 'big')
+            b5 = ((s2 & 0xfff) >> 6 | 0x80).to_bytes(1, 'big')
+            b6 = ((s2 & 0x3f) | 0x80).to_bytes(1, 'big')
+            b.append(b1 + b2 + b3 + b4 + b5 + b6)
+    return b"".join(b)
+
+
+class MUTF8String():
+    def __init__(self, data, raw=True):
+        self.__encoded = None
+        self.__decoded = None
+        if raw:
+            self.__encoded = data
+        else:
+            self.__decoded = data
+
+    @classmethod
+    def from_bytes(cls, data):
+        return cls(data)
+
+    @classmethod
+    def from_str(cls, data):
+        return cls(data, raw=False)
+
+    @property
+    def bytes(self):
+        if self.__encoded is None:
+            self.__encoded = encode(self.__decoded)
+        return self.__encoded
+
+    @property
+    def string(self):
+        if self.__decoded is None:
+            self.__decoded = decode(self.__encoded)
+        return self.__decoded
+
+    def __repr__(self):
+        return "<mutf8.MUTF8String {}>".format(self.__str__())
+
+    def __str__(self):
+        return self.string.encode('utf8', errors='backslashreplace').decode('utf8')
+
+    def __hash__(self):
+        return self.bytes.__hash__()
+
+    def __len__(self):
+        return self.bytes.__len__()
+
+    def __lt__(self, other):
+        try:
+            return self.bytes.__lt__(other.bytes)
+        except AttributeError:
+            if isinstance(other, bytes):
+                return self.bytes.__lt__(other)
+            elif isinstance(other, str):
+                return self.bytes.__lt__(MUTF8String.from_str(other).bytes)
+            else:
+                raise TypeError('{} is not supported'.format(type(other)))
+
+    def __le__(self, other):
+        try:
+            return self.bytes.__le__(other.bytes)
+        except AttributeError:
+            if isinstance(other, bytes):
+                return self.bytes.__le__(other)
+            elif isinstance(other, str):
+                return self.bytes.__le__(MUTF8String.from_str(other).bytes)
+            else:
+                raise TypeError('{} is not supported'.format(type(other)))
+
+    def __eq__(self, other):
+        try:
+            return self.bytes.__eq__(other.bytes)
+        except AttributeError:
+            if isinstance(other, bytes):
+                return self.bytes.__eq__(other)
+            elif isinstance(other, str):
+                return self.bytes.__eq__(MUTF8String.from_str(other).bytes)
+            else:
+                raise TypeError('{} is not supported'.format(type(other)))
+
+    def __ne__(self, other):
+        try:
+            return self.bytes.__ne__(other.bytes)
+        except AttributeError:
+            if isinstance(other, bytes):
+                return self.bytes.__ne__(other)
+            elif isinstance(other, str):
+                return self.bytes.__ne__(MUTF8String.from_str(other).bytes)
+            else:
+                raise TypeError('{} is not supported'.format(type(other)))
+
+    def __gt__(self, other):
+        try:
+            return self.bytes.__gt__(other.bytes)
+        except AttributeError:
+            if isinstance(other, bytes):
+                return self.bytes.__gt__(other)
+            elif isinstance(other, str):
+                return self.bytes.__gt__(MUTF8String.from_str(other).bytes)
+            else:
+                raise TypeError('{} is not supported'.format(type(other)))
+
+    def __ge__(self, other):
+        try:
+            return self.bytes.__ge__(other.bytes)
+        except AttributeError:
+            if isinstance(other, bytes):
+                return self.bytes.__ge__(other)
+            elif isinstance(other, str):
+                return self.bytes.__ge__(MUTF8String.from_str(other).bytes)
+            else:
+                raise TypeError('{} is not supported'.format(type(other)))

From 1d4531be97141b6a0f8dcadea555b3bf2e9d6346 Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Wed, 1 May 2019 12:56:24 -0700
Subject: [PATCH 08/15] Changing tests for new mutf8 package

---
 tests/test_strings.py | 48 +++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/tests/test_strings.py b/tests/test_strings.py
index df136b8fc..4a2224979 100644
--- a/tests/test_strings.py
+++ b/tests/test_strings.py
@@ -27,25 +27,20 @@ def testDex(self):
                 self.assertIn(s, d.get_strings())
 
     def testMUTF8(self):
-        self.assertEqual("\x67", mutf8.decode(b"\x67"))
-        # Null byte
-        self.assertEqual("\x00", mutf8.decode(b"\xc0\x80"))
-        self.assertEqual("\uacf0", mutf8.decode(b"\xea\xb3\xb0"))
-        # Surrogates
-        self.assertEqual("\ud83d\ude4f", mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f"))
-        self.assertEqual("\ud853\udf5c", mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c"))
-        # Lonely surrogates
-        self.assertEqual("\ud853", mutf8.decode(b"\xed\xa1\x93"))
-        self.assertEqual("\udf5c", mutf8.decode(b"\xed\xbd\x9c"))
-        # Normal ASCII String
-        self.assertEqual("hello world", mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64"))
-
-        # Test the patching of strings
-
-        self.assertEqual("hello world", mutf8.patch_string(mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64")))
-        self.assertEqual("\U00024f5c", mutf8.patch_string(mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c")))
-        self.assertEqual("\U0001f64f", mutf8.patch_string(mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f")))
-        self.assertEqual("\\ud853", mutf8.patch_string(mutf8.decode(b"\xed\xa1\x93")))
+        # self.assertEqual("\x67", mutf8.decode(b"\x67"))
+        # # Null byte
+        # self.assertEqual("\x00", mutf8.decode(b"\xc0\x80"))
+        # self.assertEqual("\uacf0", mutf8.decode(b"\xea\xb3\xb0"))
+        # # Surrogates
+        # self.assertEqual("\ud83d\ude4f", mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f"))
+        # self.assertEqual("\ud853\udf5c", mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c"))
+        # # Lonely surrogates
+        # self.assertEqual("\ud853", mutf8.decode(b"\xed\xa1\x93"))
+        # self.assertEqual("\udf5c", mutf8.decode(b"\xed\xbd\x9c"))
+        # # Normal ASCII String
+        # self.assertEqual("hello world", mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64"))
+
+        # Testing decode
 
         b = b"\xed\xa1\x93\xed\xbd\x9c" + \
             b"\xed\xa0\xbd\xed\xb9\x8f" + \
@@ -54,8 +49,21 @@ def testMUTF8(self):
             b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64" + \
             b"\xc0\x80"
 
-        self.assertEqual("\U00024f5c\U0001f64f\\ud83d\uacf0hello world\x00", mutf8.patch_string(mutf8.decode(b)))
+        self.assertEqual("hello world", mutf8.decode(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64").encode('utf8', errors='backslashreplace').decode('utf8'))
+        self.assertEqual("\U00024f5c", mutf8.decode(b"\xed\xa1\x93\xed\xbd\x9c").encode('utf8', errors='backslashreplace').decode('utf8'))
+        self.assertEqual("\U0001f64f", mutf8.decode(b"\xed\xa0\xbd\xed\xb9\x8f").encode('utf8', errors='backslashreplace').decode('utf8'))
+        self.assertEqual("\\ud853", mutf8.decode(b"\xed\xa1\x93").encode('utf8', errors='backslashreplace').decode('utf8'))
+        self.assertEqual("\U00024f5c\U0001f64f\\ud83d\uacf0hello world\x00", mutf8.decode(b).encode('utf8', errors='backslashreplace').decode('utf8'))
+
+        # Testing encode
+
+        self.assertEqual(b"\x68\x65\x6c\x6c\x6f\x20\x77\x6f\x72\x6c\x64", mutf8.encode("hello world"))
+        self.assertEqual(b"\xed\xa1\x93\xed\xbd\x9c", mutf8.encode("\U00024f5c"))
+        self.assertEqual(b"\xed\xa0\xbd\xed\xb9\x8f", mutf8.encode("\U0001f64f"))
+        self.assertEqual(b"\xed\xa1\x93", mutf8.encode("\ud853"))
+        self.assertEqual(b, mutf8.encode("\U00024f5c\U0001f64f\ud83d\uacf0hello world\x00"))
 
+        self.assertEqual(mutf8.MUTF8String.from_bytes(b), mutf8.MUTF8String.from_str("\U00024f5c\U0001f64f\ud83d\uacf0hello world\x00"))
 
 
 if __name__ == '__main__':

From 6a6f2a97454bc2f8ee749d148510244d2bbc43bb Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Wed, 1 May 2019 12:57:43 -0700
Subject: [PATCH 09/15] Applying MUTF8String to dexparser

---
 androguard/core/analysis/analysis.py | 36 +++++++++------
 androguard/core/bytecode.py          |  1 +
 androguard/core/bytecodes/dvm.py     | 65 +++++++++++++-------------
 androguard/core/bytecodes/mutf8.py   | 69 ++++++++++++++++++++++++----
 androguard/decompiler/dad/writer.py  |  3 +-
 tests/test_dexcodeparsing.py         |  4 +-
 6 files changed, 121 insertions(+), 57 deletions(-)

diff --git a/androguard/core/analysis/analysis.py b/androguard/core/analysis/analysis.py
index 9988a4570..52193ef5e 100644
--- a/androguard/core/analysis/analysis.py
+++ b/androguard/core/analysis/analysis.py
@@ -3,7 +3,7 @@
 import time
 import warnings
 from androguard.core.androconf import is_ascii_problem, load_api_specific_resource_module
-from androguard.core.bytecodes import dvm
+from androguard.core.bytecodes import dvm, mutf8
 import logging
 from androguard.core import bytecode
 import networkx as nx
@@ -789,7 +789,7 @@ def get_method(self, name, descriptor):
         :param descriptor: method descriptor, for example `'(I)V'`
         :return: :class:`ExternalMethod`
         """
-        key = name + str(descriptor)
+        key = name + mutf8.MUTF8String.join(descriptor)
         if key not in self.methods:
             self.methods[key] = ExternalMethod(self.name, name, descriptor)
 
@@ -818,7 +818,7 @@ def get_class_name(self):
         return self.class_name
 
     def get_descriptor(self):
-        return ''.join(self.descriptor)
+        return mutf8.MUTF8String.join(self.descriptor)
 
     @property
     def full_name(self):
@@ -837,7 +837,7 @@ def get_access_flags_string(self):
         return ""
 
     def __str__(self):
-        return "{}->{}{}".format(self.class_name, self.name, ''.join(self.descriptor))
+        return "{}->{}{}".format(self.class_name.__str__(), self.name.__str__(), mutf8.MUTF8String.join(self.descriptor).string)
 
     def __repr__(self):
         return "<analysis.ExternalMethod {}>".format(self.__str__())
@@ -986,7 +986,7 @@ def get_fake_method(self, name, descriptor):
 
         # We are searching an unknown method in this class
         # It could be something that the class herits
-        key = name + str(descriptor)
+        key = name + mutf8.MUTF8String.join(descriptor)
         if key not in self._inherits_methods:
             self._inherits_methods[key] = ExternalMethod(self.orig_class.get_name(), name, descriptor)
         return self._inherits_methods[key]
@@ -1269,7 +1269,7 @@ def _create_xref(self, current_class):
                         method_item = None
                         # TODO: should create get_method_descriptor inside Analysis
                         for vm in self.vms:
-                            method_item = vm.get_method_descriptor(method_info[0], method_info[1], ''.join(method_info[2]))
+                            method_item = vm.get_method_descriptor(method_info[0], method_info[1], mutf8.MUTF8String.join(method_info[2]))
                             if method_item:
                                 break
 
@@ -1485,10 +1485,11 @@ def find_classes(self, name=".*", no_external=False):
         :param no_external: Remove external classes from the output (default False)
         :rtype: generator of `ClassAnalysis`
         """
+        name = mutf8.MUTF8String.from_str(name).bytes
         for cname, c in self.classes.items():
             if no_external and isinstance(c.get_vm_class(), ExternalClass):
                 continue
-            if re.match(name, cname):
+            if re.match(name, cname.bytes):
                 yield c
 
     def find_methods(self, classname=".*", methodname=".*", descriptor=".*",
@@ -1505,8 +1506,11 @@ def find_methods(self, classname=".*", methodname=".*", descriptor=".*",
         :param no_external: Remove external method from the output (default False)
         :rtype: generator of `MethodClassAnalysis`
         """
+        classname = mutf8.MUTF8String.from_str(classname).bytes
+        methodname = mutf8.MUTF8String.from_str(methodname).bytes
+        descriptor = mutf8.MUTF8String.from_str(descriptor).bytes
         for cname, c in self.classes.items():
-            if re.match(classname, cname):
+            if re.match(classname, cname.bytes):
                 for m in c.get_methods():
                     z = m.get_method()
                     # TODO is it even possible that an internal class has
@@ -1514,8 +1518,8 @@ def find_methods(self, classname=".*", methodname=".*", descriptor=".*",
                     # instead...
                     if no_external and isinstance(z, ExternalMethod):
                         continue
-                    if re.match(methodname, z.get_name()) and \
-                       re.match(descriptor, z.get_descriptor()) and \
+                    if re.match(methodname, z.get_name().bytes) and \
+                       re.match(descriptor, z.get_descriptor().bytes) and \
                        re.match(accessflags, z.get_access_flags_string()):
                         yield m
 
@@ -1526,8 +1530,9 @@ def find_strings(self, string=".*"):
         :param string: regular expression for the string to search for
         :rtype: generator of `StringAnalysis`
         """
+        string = mutf8.MUTF8String.from_str(string).bytes
         for s, sa in self.strings.items():
-            if re.match(string, s):
+            if re.match(string, s.bytes):
                 yield sa
 
     def find_fields(self, classname=".*", fieldname=".*", fieldtype=".*", accessflags=".*"):
@@ -1540,12 +1545,15 @@ def find_fields(self, classname=".*", fieldname=".*", fieldtype=".*", accessflag
         :param accessflags: regular expression of the access flags
         :rtype: generator of `FieldClassAnalysis`
         """
+        classname = mutf8.MUTF8String.from_str(classname).bytes
+        fieldname = mutf8.MUTF8String.from_str(fieldname).bytes
+        fieldtype = mutf8.MUTF8String.from_str(fieldtype).bytes
         for cname, c in self.classes.items():
-            if re.match(classname, cname):
+            if re.match(classname, cname.bytes):
                 for f in c.get_fields():
                     z = f.get_field()
-                    if re.match(fieldname, z.get_name()) and \
-                       re.match(fieldtype, z.get_descriptor()) and \
+                    if re.match(fieldname, z.get_name().bytes) and \
+                       re.match(fieldtype, z.get_descriptor().bytes) and \
                        re.match(accessflags, z.get_access_flags_string()):
                         yield f
 
diff --git a/androguard/core/bytecode.py b/androguard/core/bytecode.py
index 91a943ff6..d1ffba44b 100644
--- a/androguard/core/bytecode.py
+++ b/androguard/core/bytecode.py
@@ -885,6 +885,7 @@ def FormatClassToPython(i):
     :rtype: str
     """
     i = i[:-1]
+    print(i)
     i = i.replace("/", "_")
     i = i.replace("$", "_")
 
diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py
index 0c0859cfe..a6c68d389 100644
--- a/androguard/core/bytecodes/dvm.py
+++ b/androguard/core/bytecodes/dvm.py
@@ -446,11 +446,9 @@ def determineException(vm, m):
 class HeaderItem:
     """
     This class can parse an header_item of a dex file.
-
     Several checks are performed to detect if this is not an header_item.
     Also the Adler32 checksum of the file is calculated in order to detect file
     corruption.
-
     :param buff: a string which represents a Buff object of the header_item
     :type androguard.core.bytecode.BuffHandle buff: Buff object
     :param cm: a ClassManager object
@@ -1887,19 +1885,19 @@ def set_off(self, off):
     def get_off(self):
         return self.offset
 
-    def get_unicode(self):
-        """
-        Returns an Unicode String
-        This is the actual string. Beware that some strings might be not
-        decodeable with usual UTF-16 decoder, as they use surrogates that are
-        not supported by python.
-        """
-        s = mutf8.decode(self.data)
-        if len(s) != self.utf16_size:
-            raise ValueError("UTF16 Length does not match!")
+    # def get_unicode(self):
+    #     """
+    #     Returns an Unicode String
+    #     This is the actual string. Beware that some strings might be not
+    #     decodeable with usual UTF-16 decoder, as they use surrogates that are
+    #     not supported by python.
+    #     """
+    #     s = mutf8.decode(self.data)
+    #     if len(s) != self.utf16_size:
+    #         raise ValueError("UTF16 Length does not match!")
 
-        # Return a UTF16 String
-        return s
+    #     # Return a UTF16 String
+    #     return s
 
     def get(self):
         """
@@ -1908,11 +1906,12 @@ def get(self):
         string as 6 characters: \\ud853
         Valid surrogates are encoded as 32bit values, ie. \U00024f5c.
         """
-        s = mutf8.decode(self.data)
-        if len(s) != self.utf16_size:
-            raise ValueError("UTF16 Length does not match!")
-        # log.debug("Decoding UTF16 string with IDX {}, utf16 length {} and hexdata '{}'.".format(self.offset, self.utf16_size, binascii.hexlify(self.data)))
-        return mutf8.patch_string(s)
+        return mutf8.MUTF8String.from_bytes(self.data)
+        # s = mutf8.decode(self.data)
+        # if len(s) != self.utf16_size:
+        #     raise ValueError("UTF16 Length does not match!")
+        # # log.debug("Decoding UTF16 string with IDX {}, utf16 length {} and hexdata '{}'.".format(self.offset, self.utf16_size, binascii.hexlify(self.data)))
+        # return mutf8.patch_string(s)
 
     def show(self):
         bytecode._PrintSubBanner("String Data Item")
@@ -2167,7 +2166,7 @@ def get_parameters_off_value(self):
         """
         if self.parameters_off_value is None:
             params = self.CM.get_type_list(self.parameters_off)
-            self.parameters_off_value = '({})'.format(' '.join(params))
+            self.parameters_off_value = mutf8.MUTF8String.from_bytes(b'(') + mutf8.MUTF8String.join(params, spacing=b' ') + mutf8.MUTF8String.from_bytes(b')')
         return self.parameters_off_value
 
     def show(self):
@@ -2680,7 +2679,7 @@ def reload(self):
         name = self.CM.get_field(self.field_idx)
         self.class_name = name[0]
         self.name = name[2]
-        self.proto = ''.join(i for i in name[1])
+        self.proto = name[1]
 
     def set_init_value(self, value):
         """
@@ -2916,7 +2915,7 @@ def reload(self):
         if v and len(v) >= 3:
             self.class_name = v[0]
             self.name = v[1]
-            self.proto = ''.join(i for i in v[2])
+            self.proto = mutf8.MUTF8String.join(i for i in v[2])
         else:
             self.class_name = 'CLASS_NAME_ERROR'
             self.name = 'NAME_ERROR'
@@ -2993,7 +2992,7 @@ def __str__(self):
     @property
     def full_name(self):
         """Return class_name + name + descriptor, separated by spaces (no access flags"""
-        return " ".join([self.class_name, self.name, self.get_descriptor()])
+        return mutf8.MUTF8String.join([self.class_name, self.name, self.get_descriptor()], spacing=b' ')
 
     def get_short_string(self):
         """
@@ -7948,11 +7947,12 @@ def get_method(self, name):
         :rtype: a list with all :class:`EncodedMethod` objects
         """
         # TODO could use a generator here
-        prog = re.compile(name)
+        name = mutf8.MUTF8String.from_str(name)
+        prog = re.compile(name.bytes)
         l = []
         for i in self.get_classes():
             for j in i.get_methods():
-                if prog.match(j.get_name()):
+                if prog.match(j.get_name().bytes):
                     l.append(j)
         return l
 
@@ -7965,11 +7965,12 @@ def get_field(self, name):
         :rtype: a list with all :class:`EncodedField` objects
         """
         # TODO could use a generator here
-        prog = re.compile(name)
+        name = mutf8.MUTF8String.from_str(name)
+        prog = re.compile(name.bytes)
         l = []
         for i in self.get_classes():
             for j in i.get_fields():
-                if prog.match(j.get_name()):
+                if prog.match(j.get_name().bytes):
                     l.append(j)
         return l
 
@@ -8212,7 +8213,7 @@ def _delete_python_export_class(self, _class):
     def _create_python_export_class(self, _class, delete=False):
         if _class is not None:
             ### Class
-            name = bytecode.FormatClassToPython(_class.get_name())
+            name = bytecode.FormatClassToPython(_class.get_name()).string
             if delete:
                 delattr(self.C, name)
                 return
@@ -8236,13 +8237,13 @@ def _create_python_export_methods(self, _class, delete):
         for i in m:
             if len(m[i]) == 1:
                 j = m[i][0]
-                name = bytecode.FormatNameToPython(j.get_name())
+                name = bytecode.FormatNameToPython(j.get_name()).string
                 setattr(_class.M, name, j)
             else:
                 for j in m[i]:
                     name = (
                         bytecode.FormatNameToPython(j.get_name()) + "_" +
-                        bytecode.FormatDescriptorToPython(j.get_descriptor()))
+                        bytecode.FormatDescriptorToPython(j.get_descriptor())).string
                     setattr(_class.M, name, j)
 
     def _create_python_export_fields(self, _class, delete):
@@ -8257,13 +8258,13 @@ def _create_python_export_fields(self, _class, delete):
         for i in f:
             if len(f[i]) == 1:
                 j = f[i][0]
-                name = bytecode.FormatNameToPython(j.get_name())
+                name = bytecode.FormatNameToPython(j.get_name()).string
                 setattr(_class.F, name, j)
             else:
                 for j in f[i]:
                     name = bytecode.FormatNameToPython(j.get_name(
                     )) + "_" + bytecode.FormatDescriptorToPython(
-                        j.get_descriptor())
+                        j.get_descriptor()).string
                     setattr(_class.F, name, j)
 
     def get_BRANCH_DVM_OPCODES(self):
diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py
index c79a2987e..21ea1122e 100644
--- a/androguard/core/bytecodes/mutf8.py
+++ b/androguard/core/bytecodes/mutf8.py
@@ -91,21 +91,38 @@ def encode(s):
 
 class MUTF8String():
     def __init__(self, data, raw=True):
-        self.__encoded = None
-        self.__decoded = None
-        if raw:
-            self.__encoded = data
+        if isinstance(data, MUTF8String):
+            self.__encoded = data.__encoded
+            self.__decoded = data.__decoded
         else:
-            self.__decoded = data
+            self.__encoded = None
+            self.__decoded = None
+            if raw:
+                self.__encoded = data
+            else:
+                self.__decoded = data
 
     @classmethod
     def from_bytes(cls, data):
-        return cls(data)
+        return cls(bytes(data))
 
     @classmethod
     def from_str(cls, data):
         return cls(data, raw=False)
 
+    @classmethod
+    def join(cls, data, spacing=b''):
+        array = []
+        for i in data:
+            try:
+                array.append(i.bytes)
+            except AttributeError:
+                if isinstance(i, bytes):
+                    array.append(i)
+                else:
+                    array.append(encode(i))
+        return MUTF8String.from_bytes(spacing.join(array))
+
     @property
     def bytes(self):
         if self.__encoded is None:
@@ -118,17 +135,53 @@ def string(self):
             self.__decoded = decode(self.__encoded)
         return self.__decoded
 
+    def replace(self, old, new):
+        try:
+            return MUTF8String.from_bytes(self.bytes.replace(old, new))
+        except TypeError:
+            return MUTF8String.from_bytes(self.bytes.replace(encode(old), encode(new)))
+
+    def find(self, sub):
+        try:
+            return self.bytes.find(sub)
+        except TypeError:
+            return self.bytes.find(encode(sub))
+
+    def split(self, sub):
+        try:
+            return self.bytes.split(sub)
+        except TypeError:
+            return self.bytes.split(encode(sub))
+
+    def startswith(self, sub):
+        try:
+            return self.bytes.startswith(sub)
+        except TypeError:
+            return self.bytes.startswith(encode(sub))
+
+    def __add__(self, other):
+        try:
+            return MUTF8String.from_bytes(self.bytes + other.bytes)
+        except AttributeError:
+            return MUTF8String.from_bytes(self.bytes + encode(other))
+
+    def __getitem__(self, item):
+        return MUTF8String.from_bytes(self.bytes[item])
+
     def __repr__(self):
         return "<mutf8.MUTF8String {}>".format(self.__str__())
 
     def __str__(self):
         return self.string.encode('utf8', errors='backslashreplace').decode('utf8')
 
+    def __format__(self, format_spec):
+        return format(self.string, format_spec)
+
     def __hash__(self):
-        return self.bytes.__hash__()
+        return hash(self.bytes)
 
     def __len__(self):
-        return self.bytes.__len__()
+        return len(self.bytes)
 
     def __lt__(self, other):
         try:
diff --git a/androguard/decompiler/dad/writer.py b/androguard/decompiler/dad/writer.py
index 61a91bee0..1fd2d3e84 100644
--- a/androguard/decompiler/dad/writer.py
+++ b/androguard/decompiler/dad/writer.py
@@ -17,6 +17,7 @@
 
 import logging
 from struct import unpack
+from androguard.core.bytecodes import mutf8
 from androguard.decompiler.dad.util import get_type
 from androguard.decompiler.dad.opcode_ins import Op
 from androguard.decompiler.dad.instruction import (
@@ -48,7 +49,7 @@ def __init__(self, graph, method):
         self.need_break = True
 
     def __str__(self):
-        return ''.join(self.buffer)
+        return mutf8.MUTF8String.join(self.buffer).string
 
     def str_ext(self):
         return self.buffer2
diff --git a/tests/test_dexcodeparsing.py b/tests/test_dexcodeparsing.py
index 335658f0d..5f5c9cac2 100644
--- a/tests/test_dexcodeparsing.py
+++ b/tests/test_dexcodeparsing.py
@@ -45,7 +45,7 @@ def testcode(self):
     def testClassManager(self):
         """Test if the classmanager has the same items"""
 
-        from androguard.core.bytecodes.mutf8 import decode, patch_string
+        from androguard.core.bytecodes.mutf8 import decode
 
         fname = "examples/android/TestsAndroguard/bin/classes.dex"
 
@@ -64,7 +64,7 @@ def testClassManager(self):
         for idx in range(parsed.string_ids_size):
             self.assertNotEqual(cm.get_string(idx), ERR_STR)
             self.assertNotEqual(cm.get_raw_string(idx), ERR_STR)
-            self.assertEqual(cm.get_raw_string(idx), patch_string(decode(parsed.str_raw[idx])))
+            self.assertEqual(cm.get_raw_string(idx), decode(parsed.str_raw[idx]))
 
         self.assertEqual(cm.get_string(parsed.string_ids_size), ERR_STR)
         self.assertEqual(cm.get_raw_string(parsed.string_ids_size), ERR_STR)

From 8623899959ee94636fe5296a227945eae58ef9b6 Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Thu, 2 May 2019 12:26:57 -0700
Subject: [PATCH 10/15] Applying patch of #684 and fixing bugs

The patch concerns the `read_null_terminated_string` improvement.
---
 androguard/core/bytecode.py         |  1 -
 androguard/core/bytecodes/dvm.py    | 20 ++++++++++++--------
 androguard/decompiler/decompiler.py |  2 +-
 tests/test_decompiler.py            |  2 +-
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/androguard/core/bytecode.py b/androguard/core/bytecode.py
index d1ffba44b..91a943ff6 100644
--- a/androguard/core/bytecode.py
+++ b/androguard/core/bytecode.py
@@ -885,7 +885,6 @@ def FormatClassToPython(i):
     :rtype: str
     """
     i = i[:-1]
-    print(i)
     i = i.replace("/", "_")
     i = i.replace("$", "_")
 
diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py
index a6c68d389..31ea96b57 100644
--- a/androguard/core/bytecodes/dvm.py
+++ b/androguard/core/bytecodes/dvm.py
@@ -97,17 +97,21 @@ class InvalidInstruction(Error):
 def read_null_terminated_string(f):
     """
     Read a null terminated string from a file-like object.
-
     :param f: file-like object
     :rtype: bytearray
     """
-    x = bytearray()
+    x = []
     while True:
-        z = f.read(1)
-        if ord(z) == 0:
-            return x
+        z = f.read(128)
+        if 0 in z:
+            s = z.split(b'\x00',1)
+            x.append(s[0])
+            idx = f.get_idx()
+            f.set_idx(idx - len(s[1]))
+            break
         else:
-            x.append(ord(z))
+            x.append(z)
+    return b''.join(x)
 
 
 def get_access_flags_string(value):
@@ -194,11 +198,11 @@ def static_operand_instruction(instruction):
 
 
 def get_sbyte(buff):
-    return unpack('=b', bytearray(buff.read(1)))[0]
+    return unpack('=b', buff.read(1))[0]
 
 
 def get_byte(buff):
-    return unpack('=B', bytearray(buff.read(1)))[0]
+    return unpack('=B', buff.read(1))[0]
 
 
 def readuleb128(buff):
diff --git a/androguard/decompiler/decompiler.py b/androguard/decompiler/decompiler.py
index 76a0eeaaa..de34bd843 100644
--- a/androguard/decompiler/decompiler.py
+++ b/androguard/decompiler/decompiler.py
@@ -740,7 +740,7 @@ def _find_class(self, clname, basefolder):
                     return res
 
         # Check the whole supplied name
-        fname = os.path.join(basefolder, clname.replace("/", os.sep) + ".java")
+        fname = os.path.join(basefolder, (clname.replace("/", os.sep) + ".java").string)
         if not os.path.isfile(fname):
             return None
         return fname
diff --git a/tests/test_decompiler.py b/tests/test_decompiler.py
index badfce314..00e52b0d1 100644
--- a/tests/test_decompiler.py
+++ b/tests/test_decompiler.py
@@ -47,7 +47,7 @@ def test_all_decompiler():
     # Generate test cases for this APK:
     a, d, dx = AnalyzeAPK("examples/tests/hello-world.apk")
     for c in d[0].get_classes():
-        test_name = re.sub("[^a-zA-Z0-9_]", "_", c.get_name()[1:-1])
+        test_name = re.sub("[^a-zA-Z0-9_]", "_", c.get_name().string[1:-1])
         # Test the decompilation of a single class
         # disable for now, as testing all DvMethods has the same effect as
         # testing all DvClasses.

From 2acf45dfd161a4953cdf1197170deb068ebc5f62 Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Thu, 2 May 2019 13:25:24 -0700
Subject: [PATCH 11/15] Improving MUTF8String methods

---
 androguard/core/bytecodes/mutf8.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py
index 21ea1122e..2817b6685 100644
--- a/androguard/core/bytecodes/mutf8.py
+++ b/androguard/core/bytecodes/mutf8.py
@@ -135,11 +135,17 @@ def string(self):
             self.__decoded = decode(self.__encoded)
         return self.__decoded
 
-    def replace(self, old, new):
-        try:
-            return MUTF8String.from_bytes(self.bytes.replace(old, new))
-        except TypeError:
-            return MUTF8String.from_bytes(self.bytes.replace(encode(old), encode(new)))
+    def replace(self, old, new, count=None):
+        if count is None:
+            try:
+                return MUTF8String.from_bytes(self.bytes.replace(old, new))
+            except TypeError:
+                return MUTF8String.from_bytes(self.bytes.replace(encode(old), encode(new)))
+        else:
+            try:
+                return MUTF8String.from_bytes(self.bytes.replace(old, new, count))
+            except TypeError:
+                return MUTF8String.from_bytes(self.bytes.replace(encode(old), encode(new), count))
 
     def find(self, sub):
         try:
@@ -147,11 +153,17 @@ def find(self, sub):
         except TypeError:
             return self.bytes.find(encode(sub))
 
-    def split(self, sub):
+    def split(self, sep=None, maxsplit=-1):
+        try:
+            return [MUTF8String.from_bytes(i) for i in self.bytes.split(sep, maxsplit)]
+        except TypeError:
+            return [MUTF8String.from_bytes(i) for i in self.bytes.split(encode(sep), maxsplit)]
+
+    def rsplit(self, sep=None, maxsplit=-1):
         try:
-            return self.bytes.split(sub)
+            return [MUTF8String.from_bytes(i) for i in self.bytes.rsplit(sep, maxsplit)]
         except TypeError:
-            return self.bytes.split(encode(sub))
+            return [MUTF8String.from_bytes(i) for i in self.bytes.rsplit(encode(sep), maxsplit)]
 
     def startswith(self, sub):
         try:

From 14d6b0ae6b73673c4d4408ec1d99e105e713465e Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Thu, 2 May 2019 13:27:42 -0700
Subject: [PATCH 12/15] Changing DAD strings with MUTF8String equivalents

---
 androguard/decompiler/dad/decompile.py | 2 +-
 androguard/decompiler/dad/writer.py    | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/androguard/decompiler/dad/decompile.py b/androguard/decompiler/dad/decompile.py
index 3f1a98ab6..a09b9319e 100644
--- a/androguard/decompiler/dad/decompile.py
+++ b/androguard/decompiler/dad/decompile.py
@@ -299,7 +299,7 @@ def get_source(self):
 
         if len(self.interfaces) > 0:
             prototype += ' implements %s' % ', '.join(
-                [n[1:-1].replace('/', '.') for n in self.interfaces])
+                [n[1:-1].replace('/', '.').string for n in self.interfaces])
 
         source.append('%s {\n' % prototype)
         for field in self.fields:
diff --git a/androguard/decompiler/dad/writer.py b/androguard/decompiler/dad/writer.py
index 1fd2d3e84..83055f80f 100644
--- a/androguard/decompiler/dad/writer.py
+++ b/androguard/decompiler/dad/writer.py
@@ -679,6 +679,10 @@ def visit_condz_expression(self, op, arg):
             arg.visit(self)
         else:
             arg.visit(self)
+            try:
+                atype = atype.string
+            except AttributeError:
+                pass
             if atype in 'VBSCIJFD':
                 self.write(' %s 0' % op, data="TODO64")
             else:

From ad1fffeeb8a87737cb2c0acfa699b484f1ee4422 Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Thu, 2 May 2019 14:52:59 -0700
Subject: [PATCH 13/15] Fixing decompiler bugs with MUTF8String

---
 androguard/cli/main.py              | 6 +++---
 androguard/core/bytecodes/dvm.py    | 6 +++---
 androguard/core/bytecodes/mutf8.py  | 6 ++++++
 androguard/decompiler/decompiler.py | 4 ++--
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/androguard/cli/main.py b/androguard/cli/main.py
index df85d4886..2ffc2549b 100644
--- a/androguard/cli/main.py
+++ b/androguard/cli/main.py
@@ -240,7 +240,7 @@ def export_apps_to_format(filename,
                     continue
 
             # Current Folder to write to
-            filename_class = valid_class_name(method.get_class_name())
+            filename_class = valid_class_name(method.get_class_name().string)
             filename_class = os.path.join(output, filename_class)
             create_directory(filename_class)
 
@@ -257,10 +257,10 @@ def export_apps_to_format(filename,
                 method2format(filename + "." + form, form, None, buff)
 
             # Write the Java file for the whole class
-            if method.get_class_name() not in dump_classes:
+            if method.get_class_name().string not in dump_classes:
                 print("source codes ...", end=' ')
                 current_class = vm.get_class(method.get_class_name())
-                current_filename_class = valid_class_name(current_class.get_name())
+                current_filename_class = valid_class_name(current_class.get_name().string)
 
                 current_filename_class = os.path.join(output, current_filename_class + ".java")
                 with open(current_filename_class, "w") as fd:
diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py
index 31ea96b57..8ed1b2c05 100644
--- a/androguard/core/bytecodes/dvm.py
+++ b/androguard/core/bytecodes/dvm.py
@@ -3027,11 +3027,11 @@ def _fmt_classname(cls):
                 cls = cls.rsplit("/", 1)[1]
             return arr + cls
 
-        clsname = _fmt_classname(self.get_class_name())
+        clsname = _fmt_classname(self.get_class_name().string)
 
-        param, ret = self.get_descriptor()[1:].split(")")
+        param, ret = self.get_descriptor().string[1:].split(")")
         params = map(_fmt_classname, param.split(" "))
-        desc = "({}){}".format(" ".join(params), _fmt_classname(ret))
+        desc = "({}){}".format(mutf8.MUTF8String.join(params), _fmt_classname(ret))
 
         return "{cls} {meth} {desc}".format(cls=clsname, meth=self.get_name(), desc=desc)
 
diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/bytecodes/mutf8.py
index 2817b6685..81726d759 100644
--- a/androguard/core/bytecodes/mutf8.py
+++ b/androguard/core/bytecodes/mutf8.py
@@ -165,6 +165,12 @@ def rsplit(self, sep=None, maxsplit=-1):
         except TypeError:
             return [MUTF8String.from_bytes(i) for i in self.bytes.rsplit(encode(sep), maxsplit)]
 
+    def lstrip(self, sub):
+        try:
+            return MUTF8String.from_bytes(self.bytes.lstrip(sub))
+        except TypeError:
+            return MUTF8String.from_bytes(self.bytes.lstrip(encode(sub)))
+
     def startswith(self, sub):
         try:
             return self.bytes.startswith(sub)
diff --git a/androguard/decompiler/decompiler.py b/androguard/decompiler/decompiler.py
index de34bd843..7084a785c 100644
--- a/androguard/decompiler/decompiler.py
+++ b/androguard/decompiler/decompiler.py
@@ -696,7 +696,7 @@ def __init__(self, vm, vmx, jadx="jadx", keepfiles=False):
 
         # Next, try to find files for the classes we have
         for cl in andr_class_names:
-            fname = self._find_class(cl, tmpfolder)
+            fname = self._find_class(cl.string, tmpfolder)
             if fname:
                 if "L{};".format(cl) not in self.classes:
                     with open(fname, "rb") as fp:
@@ -740,7 +740,7 @@ def _find_class(self, clname, basefolder):
                     return res
 
         # Check the whole supplied name
-        fname = os.path.join(basefolder, (clname.replace("/", os.sep) + ".java").string)
+        fname = os.path.join(basefolder, clname.replace("/", os.sep) + ".java")
         if not os.path.isfile(fname):
             return None
         return fname

From 46be25270ea7569ed77e8c7acffdd0d6116742ec Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Wed, 8 May 2019 14:21:36 -0700
Subject: [PATCH 14/15] Removing unicode related methods

---
 androguard/core/bytecodes/dvm.py | 47 ++------------------------------
 1 file changed, 2 insertions(+), 45 deletions(-)

diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py
index 8ed1b2c05..3e769c333 100644
--- a/androguard/core/bytecodes/dvm.py
+++ b/androguard/core/bytecodes/dvm.py
@@ -1889,33 +1889,11 @@ def set_off(self, off):
     def get_off(self):
         return self.offset
 
-    # def get_unicode(self):
-    #     """
-    #     Returns an Unicode String
-    #     This is the actual string. Beware that some strings might be not
-    #     decodeable with usual UTF-16 decoder, as they use surrogates that are
-    #     not supported by python.
-    #     """
-    #     s = mutf8.decode(self.data)
-    #     if len(s) != self.utf16_size:
-    #         raise ValueError("UTF16 Length does not match!")
-
-    #     # Return a UTF16 String
-    #     return s
-
     def get(self):
         """
-        Returns a printable string.
-        In this case, all lonely surrogates are escaped, thus are represented in the
-        string as 6 characters: \\ud853
-        Valid surrogates are encoded as 32bit values, ie. \U00024f5c.
+        Returns a MUTF8String object
         """
         return mutf8.MUTF8String.from_bytes(self.data)
-        # s = mutf8.decode(self.data)
-        # if len(s) != self.utf16_size:
-        #     raise ValueError("UTF16 Length does not match!")
-        # # log.debug("Decoding UTF16 string with IDX {}, utf16 length {} and hexdata '{}'.".format(self.offset, self.utf16_size, binascii.hexlify(self.data)))
-        # return mutf8.patch_string(s)
 
     def show(self):
         bytecode._PrintSubBanner("String Data Item")
@@ -2166,7 +2144,7 @@ def get_parameters_off_value(self):
         """
         Return the string associated to the parameters_off
 
-        :rtype: string
+        :rtype: MUTF8String
         """
         if self.parameters_off_value is None:
             params = self.CM.get_type_list(self.parameters_off)
@@ -8145,27 +8123,6 @@ def get_field_descriptor(self, class_name, field_name, descriptor):
 
         return self.__cache_fields.get(key)
 
-    def get_strings_unicode(self):
-        """
-        Return all strings
-
-        This method will return pure UTF-16 strings. This is the "exact" same string as used in Java.
-        Those strings can be problematic for python, as they can contain surrogates as well as "broken"
-        surrogate pairs, ie single high or low surrogates.
-        Such a string can for example not be printed.
-        To avoid such problems, there is an escape mechanism to detect such lonely surrogates
-        and escape them in the string. Of course, this results in a different string than in the Java Source!
-
-        Use `get_strings()` as a general purpose and `get_strings_unicode()` if you require the exact string
-        from the Java Source.
-        You can always escape the string from `get_strings_unicode()` using the function
-        :meth:`androguard.core.bytecodes.mutf8.patch_string`
-
-        :rtype: a list with all strings used in the format (types, names ...)
-        """
-        for i in self.strings:
-            yield i.get_unicode()
-
     def get_strings(self):
         """
         Return all strings

From 50ccf777cc4870bd3eeb0f2a2ea4e2599a078051 Mon Sep 17 00:00:00 2001
From: Neakxs <contact@nlach.fr>
Date: Thu, 9 May 2019 12:20:57 -0700
Subject: [PATCH 15/15] Changing mutf8 package location

---
 androguard/core/analysis/analysis.py     | 4 ++--
 androguard/core/bytecodes/dvm.py         | 2 +-
 androguard/core/{bytecodes => }/mutf8.py | 0
 androguard/decompiler/dad/writer.py      | 2 +-
 tests/test_dexcodeparsing.py             | 2 +-
 tests/test_strings.py                    | 3 ++-
 6 files changed, 7 insertions(+), 6 deletions(-)
 rename androguard/core/{bytecodes => }/mutf8.py (100%)

diff --git a/androguard/core/analysis/analysis.py b/androguard/core/analysis/analysis.py
index 52193ef5e..d29591928 100644
--- a/androguard/core/analysis/analysis.py
+++ b/androguard/core/analysis/analysis.py
@@ -3,9 +3,9 @@
 import time
 import warnings
 from androguard.core.androconf import is_ascii_problem, load_api_specific_resource_module
-from androguard.core.bytecodes import dvm, mutf8
+from androguard.core.bytecodes import dvm
 import logging
-from androguard.core import bytecode
+from androguard.core import bytecode, mutf8
 import networkx as nx
 from enum import IntEnum
 
diff --git a/androguard/core/bytecodes/dvm.py b/androguard/core/bytecodes/dvm.py
index 3e769c333..d4e41d7d0 100644
--- a/androguard/core/bytecodes/dvm.py
+++ b/androguard/core/bytecodes/dvm.py
@@ -2,7 +2,7 @@
 from androguard.core.bytecodes.apk import APK
 from androguard.core.androconf import CONF
 
-from androguard.core.bytecodes import mutf8
+from androguard.core import mutf8
 from androguard.core.bytecodes.dvm_types import TypeMapItem, ACCESS_FLAGS, TYPE_DESCRIPTOR
 
 import sys
diff --git a/androguard/core/bytecodes/mutf8.py b/androguard/core/mutf8.py
similarity index 100%
rename from androguard/core/bytecodes/mutf8.py
rename to androguard/core/mutf8.py
diff --git a/androguard/decompiler/dad/writer.py b/androguard/decompiler/dad/writer.py
index 83055f80f..31168a677 100644
--- a/androguard/decompiler/dad/writer.py
+++ b/androguard/decompiler/dad/writer.py
@@ -17,7 +17,7 @@
 
 import logging
 from struct import unpack
-from androguard.core.bytecodes import mutf8
+from androguard.core import mutf8
 from androguard.decompiler.dad.util import get_type
 from androguard.decompiler.dad.opcode_ins import Op
 from androguard.decompiler.dad.instruction import (
diff --git a/tests/test_dexcodeparsing.py b/tests/test_dexcodeparsing.py
index 5f5c9cac2..c5b553f2f 100644
--- a/tests/test_dexcodeparsing.py
+++ b/tests/test_dexcodeparsing.py
@@ -45,7 +45,7 @@ def testcode(self):
     def testClassManager(self):
         """Test if the classmanager has the same items"""
 
-        from androguard.core.bytecodes.mutf8 import decode
+        from androguard.core.mutf8 import decode
 
         fname = "examples/android/TestsAndroguard/bin/classes.dex"
 
diff --git a/tests/test_strings.py b/tests/test_strings.py
index 4a2224979..d9ef06de8 100644
--- a/tests/test_strings.py
+++ b/tests/test_strings.py
@@ -3,7 +3,8 @@
 
 import sys
 
-from androguard.core.bytecodes import dvm, mutf8
+from androguard.core import  mutf8
+from androguard.core.bytecodes import dvm
 from androguard.core.analysis import analysis