Fix multi-unquicken bug for same CodeItems

The newly introduced CompactDex format deduplicates not only string data but all item types in the data section, including CodeItems. Therefore, if the same CodeItem is linked to more than one method (e.g. simple constructors), the CDex uses the same code offset for all of them, and that code must be decompiled only once. We therefore need to maintain a history of all visited CodeItems so that we can check whether one has already been decompiled. Otherwise, the QuickenData stream gets corrupted, since it is consumed at the wrong offsets. Currently this is achieved with a simple hashset implementation forked from https://github.com/avsej/hashset.c. Hashing the data pointers is good enough for now, since we always operate on the already mapped file (all CodeItem offsets are calculated from the loaded file's virtual address). Signed-off-by: Anestis Bechtsoudis <anestis@census-labs.com>
anestisb · Aug 30, 2018 · 254fb3b · 254fb3b
1 parent 7ad48d6
commit 254fb3b
Show file tree

Hide file tree

Showing 6 changed files with 269 additions and 18 deletions.
diff --git a/src/dex.c b/src/dex.c
@@ -1281,5 +1281,18 @@ void dex_DecodeCDexFields(cdexCode *pCdexCode,
   }
 }
 
+// Resolve the CodeItem that pDexMethod->codeOff points at (relative to the
+// Dex data address) and hand back its instruction array through pCode and
+// its instruction size through codeSize.
+void dex_getCodeItemInfo(const u1 *dexFileBuf, dexMethod *pDexMethod, u2 **pCode, u4 *codeSize) {
+  // CompactDex stores the size in an encoded form, so it has to be decoded
+  if (dex_checkType(dexFileBuf) != kNormalDex) {
+    cdexCode *pCompactItem = (cdexCode *)(dex_getDataAddr(dexFileBuf) + pDexMethod->codeOff);
+    *pCode = pCompactItem->insns;
+    dex_DecodeCDexFields(pCompactItem, codeSize, NULL, NULL, NULL, NULL, true);
+    return;
+  }
+
+  // StandardDex exposes both fields directly in the code item header
+  dexCode *pStandardItem = (dexCode *)(dex_getDataAddr(dexFileBuf) + pDexMethod->codeOff);
+  *pCode = pStandardItem->insns;
+  *codeSize = pStandardItem->insnsSize;
+}
+
 void dex_setDisassemblerStatus(bool status) { enableDisassembler = status; }
 bool dex_getDisassemblerStatus(void) { return enableDisassembler; }
diff --git a/src/dex.h b/src/dex.h
@@ -388,4 +388,7 @@ char *dex_descriptorClassToDot(const char *);
 // decodeOnlyInsrCnt is specified then only the instruction count is decoded.
 void dex_DecodeCDexFields(cdexCode *, u4 *, u2 *, u2 *, u2 *, u2 *, bool);
 
+// Get CodeItem information from a DexMethod: stores the insns pointer in the u2 ** argument and the insns size in the u4 * argument
+void dex_getCodeItemInfo(const u1 *, dexMethod *, u2 **, u4 *);
+
 #endif
diff --git a/src/hashset/hashset.c b/src/hashset/hashset.c
@@ -0,0 +1,133 @@
+/*
+ *     Copyright 2012 Couchbase, Inc.
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ */
+
+#include "hashset.h"
+#include <assert.h>
+
+static const unsigned int prime_1 = 73; /* multiplier applied to a value to pick its first slot */
+static const unsigned int prime_2 = 5009; /* step added to the slot index when probing for the next slot */
+
+/* Build an empty set with 2^3 slots. Returns NULL if either the set header
+ * or its slot array cannot be allocated. */
+hashset_t hashset_create() {
+  hashset_t fresh = calloc(1, sizeof(struct hashset_st));
+  if (!fresh) {
+    return NULL;
+  }
+
+  /* geometry of the initial table */
+  fresh->nbits = 3;
+  fresh->capacity = (size_t)(1 << fresh->nbits);
+  fresh->mask = fresh->capacity - 1;
+
+  /* nothing stored, nothing deleted yet */
+  fresh->nitems = 0;
+  fresh->n_deleted_items = 0;
+
+  fresh->items = calloc(fresh->capacity, sizeof(size_t));
+  if (!fresh->items) {
+    hashset_destroy(fresh);
+    return NULL;
+  }
+  return fresh;
+}
+
+/* Report how many items the set currently holds (deleted slots excluded). */
+size_t hashset_num_items(hashset_t set) {
+  const size_t live_count = set->nitems;
+  return live_count;
+}
+
+/* Release the slot array and the set header. A NULL set is a no-op. */
+void hashset_destroy(hashset_t set) {
+  if (set == NULL) {
+    return;
+  }
+  free(set->items);
+  free(set);
+}
+
+/* Insert item into the slot array without triggering a resize.
+ *
+ * Returns -1 for the reserved values 0 and 1, 0 when the item is already
+ * stored, and 1 when it was newly inserted. */
+static int hashset_add_member(hashset_t set, void *item) {
+  const size_t key = (size_t)item;
+
+  /* 0 marks an empty slot and 1 a deleted one; neither can be stored */
+  if (key <= 1) {
+    return -1;
+  }
+
+  size_t slot = set->mask & (prime_1 * key);
+  for (;;) {
+    const size_t occupant = set->items[slot];
+    if (occupant == 0 || occupant == 1) {
+      break; /* free or reusable slot found */
+    }
+    if (occupant == key) {
+      return 0;
+    }
+    slot = set->mask & (slot + prime_2);
+  }
+
+  /* reusing a deleted slot shrinks the tombstone count */
+  if (set->items[slot] == 1) {
+    set->n_deleted_items--;
+  }
+  set->items[slot] = key;
+  set->nitems++;
+  return 1;
+}
+
+/* Grow the slot array to twice its size once live plus deleted slots reach
+ * 85% of the capacity, re-inserting every stored value into the new array.
+ *
+ * The new array is allocated before any field of the set is touched. If the
+ * allocation fails the process is aborted: the callers have no way to report
+ * the failure, and continuing with a full table would make probing loop
+ * forever. */
+static void maybe_rehash(hashset_t set) {
+  size_t *old_items;
+  size_t *new_items;
+  size_t old_capacity, new_capacity, ii;
+
+  if (set->nitems + set->n_deleted_items < (double)set->capacity * 0.85) {
+    return;
+  }
+
+  /* shift a size_t, not an int, so large tables do not overflow the shift */
+  new_capacity = (size_t)1 << (set->nbits + 1);
+  new_items = calloc(new_capacity, sizeof(size_t));
+  /* assert() gives a diagnostic in debug builds; abort() keeps NDEBUG builds
+   * from dereferencing a NULL slot array */
+  assert(new_items != NULL);
+  if (new_items == NULL) {
+    abort();
+  }
+
+  old_items = set->items;
+  old_capacity = set->capacity;
+  set->nbits++;
+  set->capacity = new_capacity;
+  set->mask = set->capacity - 1;
+  set->items = new_items;
+  set->nitems = 0;
+  set->n_deleted_items = 0;
+  for (ii = 0; ii < old_capacity; ii++) {
+    /* empty (0) and deleted (1) slots are rejected by hashset_add_member */
+    hashset_add_member(set, (void *)old_items[ii]);
+  }
+  free(old_items);
+}
+
+/* Public insert: store the item, then give the table a chance to grow.
+ * The return value is the one produced by hashset_add_member. */
+int hashset_add(hashset_t set, void *item) {
+  const int status = hashset_add_member(set, item);
+
+  maybe_rehash(set);
+  return status;
+}
+
+/* Remove item from the set by turning its slot into a deleted marker.
+ *
+ * Returns 1 if the item was found and removed, 0 otherwise. The reserved
+ * values 0 and 1 can never be stored, so they are reported as not found. */
+int hashset_remove(hashset_t set, void *item) {
+  size_t value = (size_t)item;
+  size_t ii;
+
+  /* Without this guard a value of 1 would match a deleted slot and corrupt
+   * the nitems / n_deleted_items counters. */
+  if (value == 0 || value == 1) {
+    return 0;
+  }
+
+  ii = set->mask & (prime_1 * value);
+  while (set->items[ii] != 0) {
+    if (set->items[ii] == value) {
+      set->items[ii] = 1;
+      set->nitems--;
+      set->n_deleted_items++;
+      return 1;
+    } else {
+      ii = set->mask & (ii + prime_2);
+    }
+  }
+  return 0;
+}
+
+/* Test whether item is stored in the set.
+ *
+ * Returns 1 if the item is present, 0 otherwise. The reserved values 0 and 1
+ * can never be stored, so they are always reported as absent. */
+int hashset_is_member(hashset_t set, void *item) {
+  size_t value = (size_t)item;
+  size_t ii;
+
+  /* Without this guard a value of 1 would match any deleted slot on its
+   * probe path and be reported as a member. */
+  if (value == 0 || value == 1) {
+    return 0;
+  }
+
+  ii = set->mask & (prime_1 * value);
+  while (set->items[ii] != 0) {
+    if (set->items[ii] == value) {
+      return 1;
+    } else {
+      ii = set->mask & (ii + prime_2);
+    }
+  }
+  return 0;
+}
diff --git a/src/hashset/hashset.h b/src/hashset/hashset.h
@@ -0,0 +1,72 @@
+/*
+ *     Copyright 2012 Couchbase, Inc.
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ */
+
+#ifndef HASHSET_H
+#define HASHSET_H 1
+
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct hashset_st {
+  size_t nbits; /* capacity is 1 << nbits */
+  size_t mask; /* capacity - 1; ANDed with a hash to obtain a slot index */
+
+  size_t capacity; /* number of slots in items */
+  size_t *items; /* slot array: 0 = empty, 1 = deleted, anything else is a stored value */
+  size_t nitems; /* number of values currently stored */
+  size_t n_deleted_items; /* number of slots currently marked as deleted */
+};
+
+typedef struct hashset_st *hashset_t; /* handle type used by the whole API; note that it is a pointer */
+
+/* Create a hashset instance; returns NULL if allocation fails. */
+hashset_t hashset_create(void);
+
+/* Destroy a hashset instance and free its storage; a NULL set is accepted. */
+void hashset_destroy(hashset_t set);
+
+size_t hashset_num_items(hashset_t set); /* number of items currently stored in the set */
+
+/* Add an item to the hashset.
+ *
+ * @note 0 and 1 are special values, meaning nil and deleted items; for
+ *       them the function returns -1, indicating an error.
+ *
+ * Returns zero if the item is already in the set and non-zero otherwise.
+ */
+int hashset_add(hashset_t set, void *item);
+
+/* Remove an item from the hashset.
+ *
+ * Returns non-zero if the item was removed and zero if the item did not
+ * exist.
+ */
+int hashset_remove(hashset_t set, void *item);
+
+/* Check whether the item exists in the hashset.
+ *
+ * Returns non-zero if the item exists and zero otherwise.
+ */
+int hashset_is_member(hashset_t set, void *item);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/vdex/vdex_backend_019.c b/src/vdex/vdex_backend_019.c
@@ -22,6 +22,7 @@
 
 #include <sys/mman.h>
 
+#include "../hashset/hashset.h"
 #include "../out_writer.h"
 #include "../utils.h"
 #include "vdex_backend_019.h"
@@ -414,6 +415,13 @@ int vdex_backend_019_process(const char *VdexFileName,
       initCompactOffset(quickenInfoOffTable.data);
     }
 
+    // Make sure to not unquicken the same code item multiple times.
+    hashset_t unquickened_code_items = hashset_create();
+    if (!unquickened_code_items) {
+      LOGMSG(l_ERROR, "Failed to create hashset");
+      return -1;
+    }
+
     // For each class
     log_dis("file #%zu: classDefsSize=%" PRIu32 "\n", dex_file_idx,
             dex_getClassDefsSize(dexFileBuf));
@@ -462,16 +470,27 @@ int vdex_backend_019_process(const char *VdexFileName,
 
         // Skip empty methods
         if (curDexMethod.codeOff == 0) {
-          continue;
+          goto next_dmethod;
         }
 
         if (pRunArgs->unquicken) {
+          // Check if we've already unquickened the code item
+          u2 *pCode = NULL;
+          u4 codeSize = 0;
+          dex_getCodeItemInfo(dexFileBuf, &curDexMethod, &pCode, &codeSize);
+          if (hashset_is_member(unquickened_code_items, (void *)pCode)) {
+            LOGMSG(l_DEBUG, "Already unquickened direct method:%d",
+                   lastIdx + curDexMethod.methodIdx);
+            goto next_dmethod;
+          }
+
+          // Since new code item, add to set
+          hashset_add(unquickened_code_items, (void *)pCode);
+
           // Offset being 0 means not quickened.
           const u4 qOffset = getOffset(lastIdx + curDexMethod.methodIdx);
 
-          // Update lastIdx since followings delta_idx are based on 1st elements idx
-          lastIdx += curDexMethod.methodIdx;
-
+          // Get quickenData for method and decompile
           vdex_data_array_t quickenData;
           memset(&quickenData, 0, sizeof(vdex_data_array_t));
           if (quickenInfo.size != 0 && qOffset != 0u) {
@@ -482,6 +501,10 @@ int vdex_backend_019_process(const char *VdexFileName,
             LOGMSG(l_ERROR, "Failed to decompile Dex file");
             return -1;
           }
+
+        next_dmethod:
+          // Update lastIdx since followings delta_idx are based on 1st elements idx
+          lastIdx += curDexMethod.methodIdx;
         } else {
           vdex_decompiler_019_walk(dexFileBuf, &curDexMethod);
         }
@@ -497,16 +520,27 @@ int vdex_backend_019_process(const char *VdexFileName,
 
         // Skip native or abstract methods
         if (curDexMethod.codeOff == 0) {
-          continue;
+          goto next_vmethod;
         }
 
         if (pRunArgs->unquicken) {
+          // Check if we've already unquickened the code item
+          u2 *pCode = NULL;
+          u4 codeSize = 0;
+          dex_getCodeItemInfo(dexFileBuf, &curDexMethod, &pCode, &codeSize);
+          if (hashset_is_member(unquickened_code_items, (void *)pCode)) {
+            LOGMSG(l_DEBUG, "Already unquickened virtual method:%d",
+                   lastIdx + curDexMethod.methodIdx);
+            goto next_vmethod;
+          }
+
+          // Since new code item, add to set
+          hashset_add(unquickened_code_items, (void *)pCode);
+
           // Offset being 0 means not quickened.
           const u4 qOffset = getOffset(lastIdx + curDexMethod.methodIdx);
 
-          // Update lastIdx since followings delta_idx are based on 1st elements idx
-          lastIdx += curDexMethod.methodIdx;
-
+          // Get quickenData for method and decompile
           vdex_data_array_t quickenData;
           memset(&quickenData, 0, sizeof(vdex_data_array_t));
           if (quickenInfo.size != 0 && qOffset != 0u) {
@@ -517,6 +551,10 @@ int vdex_backend_019_process(const char *VdexFileName,
             LOGMSG(l_ERROR, "Failed to decompile Dex file");
             return -1;
           }
+
+        next_vmethod:
+          // Update lastIdx since followings delta_idx are based on 1st elements idx
+          lastIdx += curDexMethod.methodIdx;
         } else {
           vdex_decompiler_019_walk(dexFileBuf, &curDexMethod);
         }

diff --git a/src/vdex/vdex_decompiler_019.c b/src/vdex/vdex_decompiler_019.c
@@ -107,18 +107,10 @@ bool vdex_decompiler_019_decompile(const u1 *dexFileBuf,
     return true;
   }
 
-  // We have different code items in StandardDex and CompactDex
+  // Get method's CodeItem information
   u2 *pCode = NULL;
   u4 codeSize = 0;
-  if (dex_checkType(dexFileBuf) == kNormalDex) {
-    dexCode *pDexCode = (dexCode *)(dex_getDataAddr(dexFileBuf) + pDexMethod->codeOff);
-    pCode = pDexCode->insns;
-    codeSize = pDexCode->insnsSize;
-  } else {
-    cdexCode *pCdexCode = (cdexCode *)(dex_getDataAddr(dexFileBuf) + pDexMethod->codeOff);
-    pCode = pCdexCode->insns;
-    dex_DecodeCDexFields(pCdexCode, &codeSize, NULL, NULL, NULL, NULL, true);
-  }
+  dex_getCodeItemInfo(dexFileBuf, pDexMethod, &pCode, &codeSize);
 
   u4 startCodeOff = dex_getFirstInstrOff(dexFileBuf, pDexMethod);