Skip to content

Commit

Permalink
Proactively skip huffman compression based on sampling where non-comp…
Browse files Browse the repository at this point in the history
…ressibility is suspected
  • Loading branch information
Binh Vo committed Jun 28, 2021
1 parent 88f107b commit 75105ec
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 12 deletions.
16 changes: 16 additions & 0 deletions lib/common/huf.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,14 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);

/* HUF_compress4X_repeat_fastCheck() :
 * Same as HUF_compress4X_repeat(), with one extra argument, `suspectUncompressible`.
 * HUF_compress4X_repeat() behaves like this function called with `suspectUncompressible` == 0.
 * When `suspectUncompressible` is non-zero and `srcSize` is large enough,
 * a sample taken from the beginning of `src` and another one taken from its end
 * are histogrammed before the whole input is scanned.
 * If the highest symbol counts found in the two samples add up to too little,
 * the function returns 0 right away ("probably not compressible enough"),
 * so that the caller can back off quickly to raw format. */
size_t HUF_compress4X_repeat_fastCheck(void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned tableLog,
void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);

/** HUF_buildCTable_wksp() :
* Same as HUF_buildCTable(), but using externally allocated scratch buffer.
* `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
Expand Down Expand Up @@ -318,6 +326,14 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);

/* HUF_compress1X_repeat_fastCheck() :
 * Same as HUF_compress1X_repeat(), with one extra argument, `suspectUncompressible`.
 * HUF_compress1X_repeat() behaves like this function called with `suspectUncompressible` == 0.
 * When `suspectUncompressible` is non-zero and `srcSize` is large enough,
 * a sample taken from the beginning of `src` and another one taken from its end
 * are histogrammed before the whole input is scanned.
 * If the highest symbol counts found in the two samples add up to too little,
 * the function returns 0 right away ("probably not compressible enough"),
 * so that the caller can back off quickly to raw format. */
size_t HUF_compress1X_repeat_fastCheck(void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned tableLog,
void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);

size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */
#ifndef HUF_FORCE_DECOMPRESS_X1
size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */
Expand Down
56 changes: 51 additions & 5 deletions lib/compress/huf_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,9 @@ typedef struct {
} wksps;
} HUF_compress_tables_t;

/* Size, in bytes, of each of the two samples (one at the beginning of the input,
 * one at its end) histogrammed by HUF_compress_internal()
 * when incompressible data is suspected. */
#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
/* The sampling step only runs when
 * srcSize >= SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO.
 * Must be >= 2 : with a smaller ratio, the begin and end samples could overlap. */
#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10

/* HUF_compress_internal() :
* `workSpace_align4` must be aligned on 4-bytes boundaries,
* and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */
Expand All @@ -768,7 +771,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
HUF_nbStreams_e nbStreams,
void* workSpace_align4, size_t wkspSize,
HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
const int bmi2)
const int bmi2, unsigned suspectUncompressible)
{
HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4;
BYTE* const ostart = (BYTE*)dst;
Expand All @@ -795,6 +798,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
nbStreams, oldHufTable, bmi2);
}

/* If uncompressible data is suspected, do a smaller sampling first */
if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
size_t largestTotal = 0;
{ unsigned maxSymbolValueBegin = maxSymbolValue;
CHECK_V_F(largestBegin, HIST_count_wksp (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, workSpace_align4, wkspSize) );
largestTotal += largestBegin;
}
{ unsigned maxSymbolValueEnd = maxSymbolValue;
CHECK_V_F(largestEnd, HIST_count_wksp (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, workSpace_align4, wkspSize) );
largestTotal += largestEnd;
}
if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */
}

/* Scan input and build symbol stats */
{ CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) );
if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
Expand Down Expand Up @@ -860,7 +877,7 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_singleStream,
workSpace, wkspSize,
NULL, NULL, 0, 0 /*bmi2*/);
NULL, NULL, 0, 0 /*bmi2*/, 0);
}

size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
Expand All @@ -872,7 +889,20 @@ size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_singleStream,
workSpace, wkspSize, hufTable,
repeat, preferRepeat, bmi2);
repeat, preferRepeat, bmi2, 0);
}

/* HUF_compress1X_repeat_fastCheck():
 * compress input using a single stream (HUF_singleStream),
 * with `hufTable` and `repeat` handed to HUF_compress_internal()
 * so that an existing huffman compression table can be re-used.
 * Makes the same call as HUF_compress1X_repeat(), except that
 * `suspectUncompressible` is forwarded instead of a constant 0 :
 * a non-zero value lets HUF_compress_internal() sample the input first
 * and @return 0 early when it is probably not compressible enough. */
size_t HUF_compress1X_repeat_fastCheck (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog,
void* workSpace, size_t wkspSize,
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
int bmi2, unsigned suspectUncompressible)
{
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_singleStream,
workSpace, wkspSize, hufTable,
repeat, preferRepeat, bmi2, suspectUncompressible);
}

/* HUF_compress4X_repeat():
Expand All @@ -886,7 +916,7 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_fourStreams,
workSpace, wkspSize,
NULL, NULL, 0, 0 /*bmi2*/);
NULL, NULL, 0, 0 /*bmi2*/, 0);
}

/* HUF_compress4X_repeat():
Expand All @@ -901,7 +931,23 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_fourStreams,
workSpace, wkspSize,
hufTable, repeat, preferRepeat, bmi2);
hufTable, repeat, preferRepeat, bmi2, 0);
}

/* HUF_compress4X_repeat_fastCheck():
 * compress input using 4 streams (HUF_fourStreams),
 * with `hufTable` and `repeat` handed to HUF_compress_internal()
 * so that an existing huffman compression table can be re-used.
 * Makes the same call as HUF_compress4X_repeat(), except that
 * `suspectUncompressible` is forwarded instead of a constant 0 :
 * a non-zero value lets HUF_compress_internal() sample the input first
 * and @return 0 early when it is probably not compressible enough. */
size_t HUF_compress4X_repeat_fastCheck (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog,
void* workSpace, size_t wkspSize,
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
{
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_fourStreams,
workSpace, wkspSize,
hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
}

#ifndef ZSTD_NO_UNUSED_FUNCTIONS
Expand Down
11 changes: 8 additions & 3 deletions lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -2557,6 +2557,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
* compresses both literals and sequences
* Returns compressed size of block, or a zstd error.
*/
/* Literals are flagged as "suspected uncompressible" when the block has no sequence at all,
 * or when (number of literals / number of sequences) reaches this value. */
#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
MEM_STATIC size_t
ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
const ZSTD_entropyCTables_t* prevEntropy,
Expand Down Expand Up @@ -2591,15 +2592,18 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,

/* Compress literals */
{ const BYTE* const literals = seqStorePtr->litStart;
size_t numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart;
size_t numLiterals = seqStorePtr->lit - seqStorePtr->litStart;
unsigned suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
size_t const litSize = (size_t)(seqStorePtr->lit - literals);
size_t const cSize = ZSTD_compressLiterals(
size_t const cSize = ZSTD_compressLiterals_fastCheck(
&prevEntropy->huf, &nextEntropy->huf,
cctxParams->cParams.strategy,
ZSTD_disableLiteralsCompression(cctxParams),
op, dstCapacity,
literals, litSize,
entropyWorkspace, entropyWkspSize,
bmi2);
bmi2, suspectUncompressible);
FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
assert(cSize <= dstCapacity);
op += cSize;
Expand Down Expand Up @@ -6169,7 +6173,8 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
/*-===== Pre-defined compression levels =====-*/

#define ZSTD_MAX_CLEVEL 22
int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
/* Maximum level reported by ZSTD_maxCLevel() when sizeof(void *) is not 8 */
#define ZSTD_MAX_CLEVEL_32 21
/* ZSTD_maxCLevel() :
 * @return : ZSTD_MAX_CLEVEL when pointers are 8 bytes wide, ZSTD_MAX_CLEVEL_32 otherwise */
int ZSTD_maxCLevel(void) { return sizeof(void *) == 8 ? ZSTD_MAX_CLEVEL : ZSTD_MAX_CLEVEL_32; }
/* ZSTD_minCLevel() :
 * @return : the negated value of ZSTD_TARGETLENGTH_MAX, as an int */
int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; }
/* ZSTD_defaultCLevel() :
 * @return : ZSTD_CLEVEL_DEFAULT */
int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; }

Expand Down
21 changes: 17 additions & 4 deletions lib/compress/zstd_compress_literals.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
const void* src, size_t srcSize,
void* entropyWorkspace, size_t entropyWorkspaceSize,
const int bmi2)
{
return ZSTD_compressLiterals_fastCheck(prevHuf, nextHuf, strategy, disableLiteralCompression, dst, dstCapacity,
src, srcSize, entropyWorkspace, entropyWorkspaceSize, bmi2, 0);
}

size_t ZSTD_compressLiterals_fastCheck (ZSTD_hufCTables_t const* prevHuf,
ZSTD_hufCTables_t* nextHuf,
ZSTD_strategy strategy, int disableLiteralCompression,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
void* entropyWorkspace, size_t entropyWorkspaceSize,
const int bmi2,
unsigned suspectUncompressible)
{
size_t const minGain = ZSTD_minGain(srcSize, strategy);
size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
Expand Down Expand Up @@ -102,14 +115,14 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
cLitSize = singleStream ?
HUF_compress1X_repeat(
HUF_compress1X_repeat_fastCheck(
ostart+lhSize, dstCapacity-lhSize, src, srcSize,
HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
(HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) :
HUF_compress4X_repeat(
(HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) :
HUF_compress4X_repeat_fastCheck(
ostart+lhSize, dstCapacity-lhSize, src, srcSize,
HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
(HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2);
(HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible);
if (repeat != HUF_repeat_none) {
/* reused the existing table */
DEBUGLOG(5, "Reusing previous huffman table");
Expand Down
11 changes: 11 additions & 0 deletions lib/compress/zstd_compress_literals.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,15 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
void* entropyWorkspace, size_t entropyWorkspaceSize,
const int bmi2);

/* ZSTD_compressLiterals_fastCheck() :
 * Same as ZSTD_compressLiterals(), with one extra argument, `suspectUncompressible`.
 * ZSTD_compressLiterals() forwards to this function with `suspectUncompressible` == 0.
 * `suspectUncompressible` is passed through to the huffman compressor
 * (HUF_compress1X_repeat_fastCheck() or HUF_compress4X_repeat_fastCheck()) :
 * a non-zero value expresses the suspicion that the literals are not compressible,
 * and allows fast check(s) that back off quickly to raw format if so. */
size_t ZSTD_compressLiterals_fastCheck (ZSTD_hufCTables_t const* prevHuf,
ZSTD_hufCTables_t* nextHuf,
ZSTD_strategy strategy, int disableLiteralCompression,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
void* entropyWorkspace, size_t entropyWorkspaceSize,
const int bmi2,
unsigned suspectUncompressible);

#endif /* ZSTD_COMPRESS_LITERALS_H */

0 comments on commit 75105ec

Please sign in to comment.