ad-freiburg · Buchhold · Jan 12, 2016 · Jan 11, 2016 · Jan 12, 2016 · Jan 12, 2016
diff --git a/src/util/Simple8bCode.h b/src/util/Simple8bCode.h
@@ -1,107 +1,115 @@
-// Copyright 2013, University of Freiburg,
+// Copyright 2015, University of Freiburg,
 // Chair of Algorithms and Data Structures.
-// Author: Björn Buchhold <buchholb>
+// Author: Björn Buchhold <buchholb> and Zhiwei Zhang <zhang>
 
 #pragma once
 #include <stdint.h>
 #include <algorithm>
+#include <assert.h>
 
 namespace ad_utility {
 
-//! Selector mask,
-//! see: Anh & Moffat: "Index compression using 64-bit words."
-static const uint64_t SIMPLE8B_SELECTOR_MASK = 0x000000000000000F;
+  //! Selector mask,
+  //! see: Anh & Moffat: "Index compression using 64-bit words."
+  static const uint64_t SIMPLE8B_SELECTOR_MASK = 0x000000000000000F;
 
-//! Selectors,
-//! see: Anh & Moffat: "Index compression using 64-bit words."
-static const struct {
-    unsigned char _itemWidth;
-    unsigned char _groupSize;
-    unsigned char _wastedBits;
-    uint64_t _mask;
-} SIMPLE8B_SELECTORS[16] = {
-    {0, 240, 60, 0x0000000000000000},  // selector 0
-    {0, 120, 60, 0x0000000000000000},  // selector 1
-    {1,  60,  0, 0x0000000000000001},  // selector 2
-    {2,  30,  0, 0x0000000000000003},  // selector 3
-    {3,  20,  0, 0x0000000000000007},  // selector 4
-    {4,  15,  0, 0x000000000000000F},  // selector 5
-    {5,  12,  0, 0x000000000000001F},  // selector 6
-    {6,  10,  0, 0x000000000000003F},  // selector 7
-    {7,   8,  4, 0x000000000000007F},  // selector 8
-    {8,   7,  4, 0x00000000000000FF},  // selector 9
-    {10,  6,  0, 0x00000000000003FF},  // selector 10
-    {12,  5,  0, 0x0000000000000FFF},  // selector 11
-    {15,  4,  0, 0x0000000000007FFF},  // selector 12
-    {20,  3,  0, 0x00000000000FFFFF},  // selector 13
-    {30,  2,  0, 0x000000003FFFFFFF},  // selector 14
-    {60,  1,  0, 0x0FFFFFFFFFFFFFFF},  // selector 15
-};
+  //! Selectors,
+  //! see: Anh & Moffat: "Index compression using 64-bit words."
+  static const struct {
+      unsigned char _itemWidth;
+      unsigned char _groupSize;
+      unsigned char _wastedBits;
+      uint64_t _mask;
+  } SIMPLE8B_SELECTORS[16] = {
+      {0, 240, 60, 0x0000000000000000},  // selector 0
+      {0, 120, 60, 0x0000000000000000},  // selector 1
+      {1,  60,  0, 0x0000000000000001},  // selector 2
+      {2,  30,  0, 0x0000000000000003},  // selector 3
+      {3,  20,  0, 0x0000000000000007},  // selector 4
+      {4,  15,  0, 0x000000000000000F},  // selector 5
+      {5,  12,  0, 0x000000000000001F},  // selector 6
+      {6,  10,  0, 0x000000000000003F},  // selector 7
+      {7,   8,  4, 0x000000000000007F},  // selector 8
+      {8,   7,  4, 0x00000000000000FF},  // selector 9
+      {10,  6,  0, 0x00000000000003FF},  // selector 10
+      {12,  5,  0, 0x0000000000000FFF},  // selector 11
+      {15,  4,  0, 0x0000000000007FFF},  // selector 12
+      {20,  3,  0, 0x00000000000FFFFF},  // selector 13
+      {30,  2,  0, 0x000000003FFFFFFF},  // selector 14
+      {60,  1,  0, 0x0FFFFFFFFFFFFFFF},  // selector 15
+  };
 
-//! Simple8b Compression Scheme.
-//! See: Anh & Moffat: Index compression using 64-bit words.
-//! Changed the following:
-//! Selectors 0 and 1 are used to encode long streaks of the most frequent
-//! elements. In the original paper, this is 1 since doclists are encoded.
-//! our contexts benefit less from this.
-//! Especially, our frequency encoding benefits from 0's much more (esp. scores)
-//! hence we use those selectors to encode 0's instead of 1's
-class Simple8bCode {
-  public:
+  //! Simple8b Compression Scheme.
+  //! See: Anh & Moffat: Index compression using 64-bit words.
+  //! Changed the following:
+  //! Selectors 0 and 1 are used to encode long streaks of the most frequent
+  //! elements. In the original paper, this is 1 since doclists are encoded.
+  //! our contexts benefit less from this.
+  //! Especially, our frequency encoding benefits from 0's much more (esp. scores)
+  //! hence we use those selectors to encode 0's instead of 1's
+  class Simple8bCode {
+   public:
 
-    //! Encodes a list of elements that can be interpreted as numeric values
-    //! using the Simple8b compression scheme.
-    //! Returns the number of bytes in the encoded array, will always be a
-    //! multiple of 8.
-    //! Requires encoded to be preallocated with sufficient space.
+    // ! Encodes a list of elements that can be interpreted as numeric values
+    // ! using the Simple8b compression scheme.
+    // ! Returns the number of bytes in the encoded array, will always be a
+    // ! multiple of 8.
+    // ! Requires encoded to be preallocated with sufficient space.
     template<typename Numeric>
     static size_t encode(Numeric* plaintext, size_t nofElements,
-        uint64_t* encoded) {
+            uint64_t* encoded) {
       size_t nofElementsEncoded = 0;
       size_t nofCodeWordsDone = 0;
       while (nofElementsEncoded < nofElements) {
+        // it's the lambda.
         size_t itemsLeft = nofElements - nofElementsEncoded;
         // Count the number of consecutive 0's and decide if
         // selectors 0 or 1 can be used
         bool selector0 = true;
         bool selector1 = false;
         for (size_t i = 0; i < std::min<size_t>(240, itemsLeft); ++i) {
           if (plaintext[nofElementsEncoded + i] != 0) {
-            if (i > 120) {
-              selector0 = false;
-              selector1 = true;
-              break;
-            } else {
-              // Not possible to use selector 0 or 1
-              selector0 = false;
-              break;
-            }
+            selector0 = false;
+            if (i > 120) selector1 = true;
+            break;
           }
         }
+        // If there are less than 240 elements left and all the elements are 0,
+        // then we should not use the selector0. Otherwise the words in the 
+        // encoded list maybe represent more words than the actually encoded.
+        // This step is necessary for the optimal decode function, which runs 
+        // more faster than the old decode function.
+        if (itemsLeft < 240 && selector0) {
+          selector0 = false;
+          if (itemsLeft >= 120)  selector1 = true;
+        }
         if (selector0) {
           // Use selector 0 to compress 240 consecutive 0's.
           encoded[nofCodeWordsDone] = 0;
           nofElementsEncoded += 240;
-          nofCodeWordsDone++;
+          ++nofCodeWordsDone;
           continue;
         }
         if (selector1) {
           // Use selector 1 to compress 120 consecutive 0's.
           encoded[nofCodeWordsDone] = 1;
           nofElementsEncoded += 120;
-          nofCodeWordsDone++;
+          ++nofCodeWordsDone;
           continue;
         }
         // Try selectors for as many items per codeword as possible,
         // take the next selector whenever it is not possible.
         for (unsigned char selector = 2; selector < 16; ++selector) {
           uint64_t codeword = selector;
           size_t nofItemsInWord = 0;
-          while (nofItemsInWord < std::min<size_t>(itemsLeft,
-              SIMPLE8B_SELECTORS[selector]._groupSize)) {
+          // Do the check again, it's also necessary for the optimal decode function.
+          if (itemsLeft < SIMPLE8B_SELECTORS[selector]._groupSize) {
+            continue;
+          }
+          while (nofItemsInWord < SIMPLE8B_SELECTORS[selector]._groupSize) {
             // Check that the max value (60 bit) is not exceeded.
             assert(plaintext[nofElementsEncoded + nofItemsInWord]
-                             <= 0x0FFFFFFFFFFFFFFF);
+                   <= 0x0FFFFFFFFFFFFFFF);
             // If an element is too large, break the loop.
             // Later we recognize that not enough elements
             // were written for the specific selector, and hence try the next
@@ -111,15 +119,14 @@ class Simple8bCode {
               break;
             }
             codeword |= (static_cast<uint64_t>(
-                plaintext[nofElementsEncoded + nofItemsInWord])
-                << (4 +  // Selector bits.
-                nofItemsInWord * SIMPLE8B_SELECTORS[selector]._itemWidth));
+                         plaintext[nofElementsEncoded + nofItemsInWord])
+                         << (4 +  // Selector bits.
+                     nofItemsInWord * SIMPLE8B_SELECTORS[selector]._itemWidth));
             ++nofItemsInWord;
           }
           // Check if enough elements have been written to fit the selector
           // or if the loop was left earlier.
-          if (nofItemsInWord == SIMPLE8B_SELECTORS[selector]._groupSize ||
-              nofItemsInWord == itemsLeft) {
+          if (nofItemsInWord == SIMPLE8B_SELECTORS[selector]._groupSize) {
             encoded[nofCodeWordsDone] = codeword;
             nofElementsEncoded += nofItemsInWord;
             ++nofCodeWordsDone;
@@ -130,37 +137,38 @@ class Simple8bCode {
       return sizeof(uint64_t) * nofCodeWordsDone;
     }
 
-    //! Decodes a list of elements using the Simple8b compression scheme.
-    //! Requires decoded to be preallocated with sufficient space,
-    //! i.e. sizeof(Numeric) * (nofElements + 239).
-    //! The overhead is included so that no check for noundaries
-    //! is necessary inside the decoding of a single codeword.
+    // ! Decodes a list of elements using the Simple8b compression scheme.
+    // ! Requires decoded to be preallocated with sufficient space,
+    // ! i.e. sizeof(Numeric) * (nofElements + 239).
+    // ! The overhead is included so that no check for noundaries
+    // ! is necessary inside the decoding of a single codeword.
     template<typename Numeric>
     static void decode(uint64_t* encoded, size_t nofElements,
-        Numeric* decoded) {
+                       Numeric* decoded) {
       size_t nofElementsDone = 0;
       size_t nofCodeWordsDone = 0;
       // Loop over full 64bit codewords
       while (nofElementsDone < nofElements) {
         unsigned char selector = encoded[nofCodeWordsDone]
             & SIMPLE8B_SELECTOR_MASK;
-        if (selector > 1) {
+        if (selector > 1 && encoded[nofCodeWordsDone] != selector) {
           // Case: Usual decompression.
+          // if encoded[nofCodeWordsDone] == selector, then we know that
+          // there are selector consecutive 0's. We don't need to do the
+          // assignment for 0, because the initial assignment of the normal
+          // Numeric type is 0. We can save a lot of time to decompress the
+          // code which constains not only 0.
+          uint64_t word = encoded[nofCodeWordsDone] >> 4;
           for (size_t i = 0; i < SIMPLE8B_SELECTORS[selector]._groupSize;
                ++i) {
-            decoded[nofElementsDone + i] = (encoded[nofCodeWordsDone] >>
-                                            (4 + SIMPLE8B_SELECTORS[selector]._itemWidth * i))
-                                           & SIMPLE8B_SELECTORS[selector]._mask;
-          }
-        } else {
-          // Case: Long sequences of 1's (or 0's) compressed.
-          for (size_t i = 0; i < SIMPLE8B_SELECTORS[selector]._groupSize; ++i) {
-            decoded[nofElementsDone + i] = 0;
+            decoded[nofElementsDone + i] = word
+            & SIMPLE8B_SELECTORS[selector]._mask;
+            word = word >> SIMPLE8B_SELECTORS[selector]._itemWidth;
           }
         }
         nofElementsDone += SIMPLE8B_SELECTORS[selector]._groupSize;
         ++nofCodeWordsDone;
       }
     }
-};
+  };
 }