Skip to content

Commit

Permalink
Implement support for Intel crc32 instruction (SSE 4.2)
Browse files Browse the repository at this point in the history
This change authored by vadimskipin and submitted via:

    google/leveldb#309

Changes made to support iOS builds and other architectures
without support for SSE 4.2.

db_bench reports original crc32 speed at:

    crc32c : 3.610 micros/op; 1082.0 MB/s (4K per op)

with this change performance has increased to:

    crc32c : 0.843 micros/op; 4633.6 MB/s (4K per op)

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=148694935
  • Loading branch information
pwnall committed Feb 28, 2017
1 parent 95cd743 commit ea175e2
Show file tree
Hide file tree
Showing 6 changed files with 186 additions and 1 deletion.
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -412,3 +412,9 @@ $(SHARED_OUTDIR)/%.o: %.cc

$(SHARED_OUTDIR)/%.o: %.c
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@

$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
$(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@

$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
30 changes: 29 additions & 1 deletion build_detect_platform
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ PLATFORM_SHARED_EXT="so"
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=true
PLATFORM_SSEFLAGS=

MEMCMP_FLAG=
if [ "$CXX" = "g++" ]; then
Expand All @@ -77,6 +78,7 @@ case "$TARGET_OS" in
COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
PLATFORM_LDFLAGS="-lpthread"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;;
Darwin)
PLATFORM=OS_MACOSX
Expand All @@ -85,55 +87,64 @@ case "$TARGET_OS" in
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;;
Linux)
PLATFORM=OS_LINUX
COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
PLATFORM_LDFLAGS="-pthread"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;;
SunOS)
PLATFORM=OS_SOLARIS
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
PLATFORM_LIBS="-lpthread -lrt"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;;
FreeBSD)
PLATFORM=OS_FREEBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
PLATFORM_LIBS="-lpthread"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;;
NetBSD)
PLATFORM=OS_NETBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
PLATFORM_LIBS="-lpthread -lgcc_s"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;;
OpenBSD)
PLATFORM=OS_OPENBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
PLATFORM_LDFLAGS="-pthread"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;;
DragonFly)
PLATFORM=OS_DRAGONFLYBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
PLATFORM_LIBS="-lpthread"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
;;
OS_ANDROID_CROSSCOMPILE)
PLATFORM=OS_ANDROID
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
PLATFORM_LDFLAGS="" # All pthread features are in the Android C library
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
CROSS_COMPILE=true
;;
HP-UX)
PLATFORM=OS_HPUX
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
PLATFORM_LDFLAGS="-pthread"
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
# man ld: +h internal_name
PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
;;
Expand All @@ -142,6 +153,7 @@ case "$TARGET_OS" in
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
PORT_FILE=port/port_posix.cc
PORT_SSE_FILE=port/port_posix_sse.cc
PLATFORM_SHARED_EXT=
PLATFORM_SHARED_LDFLAGS=
PLATFORM_SHARED_CFLAGS=
Expand All @@ -168,7 +180,7 @@ set +f # re-enable globbing

# The sources consist of the portable files, plus the platform-specific port
# file.
echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT

if [ "$CROSS_COMPILE" = "true" ]; then
Expand Down Expand Up @@ -210,6 +222,21 @@ EOF
fi

rm -f $CXXOUTPUT 2>/dev/null

# Test if gcc SSE 4.2 is supported
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null <<EOF
int main() {}
EOF
if [ "$?" = 0 ]; then
PLATFORM_SSEFLAGS="-msse4.2"
fi

rm -f $CXXOUTPUT 2>/dev/null
fi

# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them.
if [ -n "$PLATFORM_SSEFLAGS" ]; then
PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE"
fi

PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
Expand All @@ -222,6 +249,7 @@ echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
Expand Down
6 changes: 6 additions & 0 deletions port/port_example.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,12 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
// The concatenation of all "data[0,n-1]" fragments is the heap profile.
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);

// Extend the CRC to include the first n bytes of buf.
//
// Returns zero if the CRC cannot be extended using acceleration, else returns
// the newly extended CRC value (which may also be zero).
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);

} // namespace port
} // namespace leveldb

Expand Down
2 changes: 2 additions & 0 deletions port/port_posix.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
return false;
}

uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);

} // namespace port
} // namespace leveldb

Expand Down
125 changes: 125 additions & 0 deletions port/port_posix_sse.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// Copyright 2016 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A portable implementation of crc32c, optimized to handle
// four bytes at a time.
//
// In a separate source file to allow this accelerated CRC32C function to be
// compiled with the appropriate compiler flags to enable x86 SSE 4.2
// instructions.

#include <stdint.h>
#include <string.h>
#include "port/port.h"

#if defined(LEVELDB_PLATFORM_POSIX_SSE)

#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__) && defined(__SSE4_2__)
#include <nmmintrin.h>
#include <cpuid.h>
#endif

#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)

namespace leveldb {
namespace port {

#if defined(LEVELDB_PLATFORM_POSIX_SSE)

// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
static inline uint32_t LE_LOAD32(const uint8_t *p) {
// SSE is x86 only, so ensured that |p| is always little-endian.
uint32_t word;
memcpy(&word, p, sizeof(word));
return word;
}

// Used to fetch a naturally-aligned 64-bit word in little endian byte-order
static inline uint64_t LE_LOAD64(const uint8_t *p) {
uint64_t dword;
memcpy(&dword, p, sizeof(dword));
return dword;
}

static inline bool HaveSSE42() {
#if defined(_MSC_VER)
int cpu_info[4];
__cpuid(cpu_info, 1);
return (cpu_info[2] & (1 << 20)) != 0;
#elif defined(__GNUC__)
unsigned int eax, ebx, ecx, edx;
__get_cpuid(1, &eax, &ebx, &ecx, &edx);
return (ecx & (1 << 20)) != 0;
#else
return false;
#endif
}

#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)

// For further improvements see Intel publication at:
// http://download.intel.com/design/intarch/papers/323405.pdf
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) {
#if !defined(LEVELDB_PLATFORM_POSIX_SSE)
return 0;
#else
static bool have = HaveSSE42();
if (!have) {
return 0;
}

const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
const uint8_t *e = p + size;
uint32_t l = crc ^ 0xffffffffu;

#define STEP1 do { \
l = _mm_crc32_u8(l, *p++); \
} while (0)
#define STEP4 do { \
l = _mm_crc32_u32(l, LE_LOAD32(p)); \
p += 4; \
} while (0)
#define STEP8 do { \
l = _mm_crc32_u64(l, LE_LOAD64(p)); \
p += 8; \
} while (0)

if (size > 16) {
// Process unaligned bytes
for (unsigned int i = reinterpret_cast<uintptr_t>(p) % 8; i; --i) {
STEP1;
}

// _mm_crc32_u64 is only available on x64.
#if defined(_M_X64) || defined(__x86_64__)
// Process 8 bytes at a time
while ((e-p) >= 8) {
STEP8;
}
// Process 4 bytes at a time
if ((e-p) >= 4) {
STEP4;
}
#else // !(defined(_M_X64) || defined(__x86_64__))
// Process 4 bytes at a time
while ((e-p) >= 4) {
STEP4;
}
#endif // defined(_M_X64) || defined(__x86_64__)
}
// Process the last few bytes
while (p != e) {
STEP1;
}
#undef STEP8
#undef STEP4
#undef STEP1
return l ^ 0xffffffffu;
#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)
}

} // namespace port
} // namespace leveldb
18 changes: 18 additions & 0 deletions util/crc32c.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include "util/crc32c.h"

#include <stdint.h>

#include "port/port.h"
#include "util/coding.h"

namespace leveldb {
Expand Down Expand Up @@ -283,7 +285,23 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
return DecodeFixed32(reinterpret_cast<const char*>(p));
}

// Determine if the CPU running this program can accelerate the CRC32C
// calculation.
static bool CanAccelerateCRC32C() {
// port::AcceleretedCRC32C returns zero when unable to accelerate.
static const char kTestCRCBuffer[] = "TestCRCBuffer";
static const char kBufSize = sizeof(kTestCRCBuffer) - 1;
static const uint32_t kTestCRCValue = 0xdcbc59fa;

return port::AcceleratedCRC32C(0, kTestCRCBuffer, kBufSize) == kTestCRCValue;
}

uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
static bool accelerate = CanAccelerateCRC32C();
if (accelerate) {
return port::AcceleratedCRC32C(crc, buf, size);
}

const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
const uint8_t *e = p + size;
uint32_t l = crc ^ 0xffffffffu;
Expand Down

0 comments on commit ea175e2

Please sign in to comment.