Speed up decompression by moving the refill check to the end of the l… · apage43/snappy@95a111b

Commit

Speed up decompression by moving the refill check to the end of the l…

…oop.

This seems to work because in most of the branches, the compiler can evaluate
“ip_limit_ - ip” in a more efficient way than reloading ip_limit_ from memory
(either by already having the entire expression in a register, or reconstructing
it from “avail”, or something else). Memory loads, even from L1, are seemingly
costly in the big picture at the current decompression speeds.

Microbenchmarks (64-bit, opt mode):

Westmere (Intel Core i7):

  Benchmark     Time(ns)    CPU(ns) Iterations
  --------------------------------------------
  BM_UFlat/0       74492      74491     187894 1.3GB/s  html      [ +5.9%]
  BM_UFlat/1      712268     712263      19644 940.0MB/s  urls    [ +3.8%]
  BM_UFlat/2       10591      10590    1000000 11.2GB/s  jpg      [ -6.8%]
  BM_UFlat/3       29643      29643     469915 3.0GB/s  pdf       [ +7.9%]
  BM_UFlat/4      304669     304667      45930 1.3GB/s  html4     [ +4.8%]
  BM_UFlat/5       28508      28507     490077 823.1MB/s  cp      [ +4.0%]
  BM_UFlat/6       12415      12415    1000000 856.5MB/s  c       [ +8.6%]
  BM_UFlat/7        3415       3415    4084723 1039.0MB/s  lsp    [+18.0%]
  BM_UFlat/8      979569     979563      14261 1002.5MB/s  xls    [ +5.8%]
  BM_UFlat/9      230150     230148      60934 630.2MB/s  txt1    [ +5.2%]
  BM_UFlat/10     197167     197166      71135 605.5MB/s  txt2    [ +4.7%]
  BM_UFlat/11     607394     607390      23041 670.1MB/s  txt3    [ +5.6%]
  BM_UFlat/12     808502     808496      17316 568.4MB/s  txt4    [ +5.0%]
  BM_UFlat/13     372791     372788      37564 1.3GB/s  bin       [ +3.3%]
  BM_UFlat/14      44541      44541     313969 818.8MB/s  sum     [ +5.7%]
  BM_UFlat/15       4833       4833    2898697 834.1MB/s  man     [ +4.8%]
  BM_UFlat/16      79855      79855     175356 1.4GB/s  pb        [ +4.8%]
  BM_UFlat/17     245845     245843      56838 715.0MB/s  gaviota [ +5.8%]

Clovertown (Intel Core 2):

  Benchmark     Time(ns)    CPU(ns) Iterations
  --------------------------------------------
  BM_UFlat/0      107911     107890     100000 905.1MB/s  html    [ +2.2%]
  BM_UFlat/1     1011237    1011041      10000 662.3MB/s  urls    [ +2.5%]
  BM_UFlat/2       26775      26770     523089 4.4GB/s  jpg       [ +0.0%]
  BM_UFlat/3       48103      48095     290618 1.8GB/s  pdf       [ +3.4%]
  BM_UFlat/4      437724     437644      31937 892.6MB/s  html4   [ +2.1%]
  BM_UFlat/5       39607      39600     358284 592.5MB/s  cp      [ +2.4%]
  BM_UFlat/6       18227      18224     768191 583.5MB/s  c       [ +2.7%]
  BM_UFlat/7        5171       5170    2709437 686.4MB/s  lsp     [ +3.9%]
  BM_UFlat/8     1560291    1559989       8970 629.5MB/s  xls     [ +3.6%]
  BM_UFlat/9      335401     335343      41731 432.5MB/s  txt1    [ +3.0%]
  BM_UFlat/10     287014     286963      48758 416.0MB/s  txt2    [ +2.8%]
  BM_UFlat/11     888522     888356      15752 458.1MB/s  txt3    [ +2.9%]
  BM_UFlat/12    1186600    1186378      10000 387.3MB/s  txt4    [ +3.1%]
  BM_UFlat/13     572295     572188      24468 855.4MB/s  bin     [ +2.1%]
  BM_UFlat/14      64060      64049     218401 569.4MB/s  sum     [ +4.1%]
  BM_UFlat/15       7264       7263    1916168 555.0MB/s  man     [ +1.4%]
  BM_UFlat/16     108853     108836     100000 1039.1MB/s  pb     [ +1.7%]
  BM_UFlat/17     364289     364223      38419 482.6MB/s  gaviota [ +4.9%]

Barcelona (AMD Opteron):

  Benchmark     Time(ns)    CPU(ns) Iterations
  --------------------------------------------
  BM_UFlat/0      103900     103871     100000 940.2MB/s  html    [ +8.3%]
  BM_UFlat/1     1000435    1000107      10000 669.5MB/s  urls    [ +6.6%]
  BM_UFlat/2       24659      24652     567362 4.8GB/s  jpg       [ +0.1%]
  BM_UFlat/3       48206      48193     291121 1.8GB/s  pdf       [ +5.0%]
  BM_UFlat/4      421980     421850      33174 926.0MB/s  html4   [ +7.3%]
  BM_UFlat/5       40368      40357     346994 581.4MB/s  cp      [ +8.7%]
  BM_UFlat/6       19836      19830     708695 536.2MB/s  c       [ +8.0%]
  BM_UFlat/7        6100       6098    2292774 581.9MB/s  lsp     [ +9.0%]
  BM_UFlat/8     1693093    1692514       8261 580.2MB/s  xls     [ +8.0%]
  BM_UFlat/9      365991     365886      38225 396.4MB/s  txt1    [ +7.1%]
  BM_UFlat/10     311330     311238      44950 383.6MB/s  txt2    [ +7.6%]
  BM_UFlat/11     975037     974737      14376 417.5MB/s  txt3    [ +6.9%]
  BM_UFlat/12    1303558    1303175      10000 352.6MB/s  txt4    [ +7.3%]
  BM_UFlat/13     517448     517290      27144 946.2MB/s  bin     [ +5.5%]
  BM_UFlat/14      66537      66518     210352 548.3MB/s  sum     [ +7.5%]
  BM_UFlat/15       7976       7974    1760383 505.6MB/s  man     [ +5.6%]
  BM_UFlat/16     103121     103092     100000 1097.0MB/s  pb     [ +8.7%]
  BM_UFlat/17     391431     391314      35733 449.2MB/s  gaviota [ +6.5%]

R=sanjay


git-svn-id: http://snappy.googlecode.com/svn/trunk@54 03e5f5b5-db94-4691-08a0-1a8bf15f6143

Loading branch information

snappy.mirrorbot@gmail.com committed Dec 5, 2011

1 parent dc9e55f commit 95a111b

snappy.cc

-Original file line number
+Diff line change
@@ Expand Up / @@ -669,20 +669,28 @@ class SnappyDecompressor { @@
       template <class Writer>
       void DecompressAllTags(Writer* writer) {
         const char* ip = ip_;
-        for ( ;; ) {
-          if (ip_limit_ - ip < 5) {
-            ip_ = ip;
-            if (!RefillTag()) return;
-            ip = ip_;
-          }
+        // We could have put this refill fragment only at the beginning of the loop.
+        // However, duplicating it at the end of each branch gives the compiler more
+        // scope to optimize the <ip_limit_ - ip> expression based on the local
+        // context, which overall increases speed.
+        #define MAYBE_REFILL() \
+            if (ip_limit_ - ip < 5) { \
+              ip_ = ip; \
+              if (!RefillTag()) return; \
+              ip = ip_; \
+            }
+        MAYBE_REFILL();
+        for ( ;; ) {
           const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
           if ((c & 0x3) == LITERAL) {
             uint32 literal_length = (c >> 2) + 1;
             if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
               DCHECK_LT(literal_length, 61);
               ip += literal_length;
+              MAYBE_REFILL();
               continue;
             }
             if (PREDICT_FALSE(literal_length >= 61)) {
@@ Expand All / @@ -709,6 +717,7 @@ class SnappyDecompressor { @@
               return;
             }
             ip += literal_length;
+            MAYBE_REFILL();
           } else {
             const uint32 entry = char_table[c];
             const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
@@ Expand All / @@ -722,8 +731,11 @@ class SnappyDecompressor { @@
             if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
               return;
             }
+            MAYBE_REFILL();
           }
         }
+    #undef MAYBE_REFILL
       }
     };
@@ Expand Down @@

0 comments on commit `95a111b`

Please sign in to comment.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Commit

There are no files selected for viewing

0 comments on commit `95a111b`

Commit

There are no files selected for viewing

0 comments on commit 95a111b

0 comments on commit `95a111b`