From 7f3564cca62de49c9f2ea67fcf735921dbebb4d1 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sun, 27 Feb 2022 10:38:17 +0800 Subject: [PATCH] [chore] Support aarch64 target with ldb_toolchain (#8249) --- be/CMakeLists.txt | 1 - be/src/glibc-compatibility/CMakeLists.txt | 9 +- be/src/glibc-compatibility/FastMemcpy.c | 220 ------ be/src/glibc-compatibility/FastMemcpy.h | 694 ------------------ be/src/glibc-compatibility/LICENSE_FastMemcpy | 22 - .../memcpy/memcpy_aarch64.cpp | 245 +++++++ .../memcpy/memcpy_x86_64.cpp | 220 ++++++ be/src/glibc-compatibility/memcpy_wrapper.c | 6 - thirdparty/patches/libhdfs3-master.patch | 24 +- 9 files changed, 492 insertions(+), 949 deletions(-) delete mode 100644 be/src/glibc-compatibility/FastMemcpy.c delete mode 100644 be/src/glibc-compatibility/FastMemcpy.h delete mode 100644 be/src/glibc-compatibility/LICENSE_FastMemcpy create mode 100644 be/src/glibc-compatibility/memcpy/memcpy_aarch64.cpp create mode 100644 be/src/glibc-compatibility/memcpy/memcpy_x86_64.cpp delete mode 100644 be/src/glibc-compatibility/memcpy_wrapper.c diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index dfe85cee4070ec..c56b0225270bad 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -563,7 +563,6 @@ if(ARCH_AARCH64) ${DORIS_DEPENDENCIES} ${WL_START_GROUP} ${COMMON_THIRDPARTY} - ${WL_END_GROUP} ) else() set(DORIS_DEPENDENCIES diff --git a/be/src/glibc-compatibility/CMakeLists.txt b/be/src/glibc-compatibility/CMakeLists.txt index bd2e5200eed2e7..9579c7819f849a 100644 --- a/be/src/glibc-compatibility/CMakeLists.txt +++ b/be/src/glibc-compatibility/CMakeLists.txt @@ -32,13 +32,13 @@ if (GLIBC_COMPATIBILITY) add_headers_and_sources(glibc_compatibility .) add_headers_and_sources(glibc_compatibility musl) if (ARCH_ARM) - # FastMemcpy not support arm, remove it from glibc_compatibility_sources - list (REMOVE_ITEM glibc_compatibility_sources FastMemcpy.c memcpy_wrapper.c) list (APPEND glibc_compatibility_sources musl/aarch64/syscall.s musl/aarch64/longjmp.s) set (musl_arch_include_dir musl/aarch64) + set (MEMCPY_SOURCE memcpy/memcpy_aarch64.cpp) elseif (ARCH_AMD64) list (APPEND glibc_compatibility_sources musl/x86_64/syscall.s musl/x86_64/longjmp.s) set (musl_arch_include_dir musl/x86_64) + set (MEMCPY_SOURCE memcpy/memcpy_x86_64.cpp) else () message (FATAL_ERROR "glibc_compatibility can only be used on x86_64 or aarch64.") endif () @@ -48,8 +48,6 @@ if (GLIBC_COMPATIBILITY) list(APPEND glibc_compatibility_sources musl/getentropy.c) endif() - list(REMOVE_ITEM glibc_compatibility_sources musl/getrandom.c memcpy_wrapper.c) - # Need to omit frame pointers to match the performance of glibc set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer") @@ -57,9 +55,10 @@ if (GLIBC_COMPATIBILITY) # without glibc-compatibility. Also sanitizers might generate getrandom # libcalls. Workaround: Use object file so that linker will always take a # look at its symbol table. + list(REMOVE_ITEM glibc_compatibility_sources musl/getrandom.c) # NOTE(amos): sanitizers might generate memcpy references that are too late to # refer. Let's also extract memcpy definitions explicitly to avoid UNDEF GLIBC 2.14. - add_library(glibc-compatibility-explicit OBJECT musl/getrandom.c memcpy_wrapper.c) + add_library(glibc-compatibility-explicit OBJECT musl/getrandom.c ${MEMCPY_SOURCE}) target_compile_options(glibc-compatibility-explicit PRIVATE -fPIC) add_library(glibc-compatibility STATIC ${glibc_compatibility_sources}) diff --git a/be/src/glibc-compatibility/FastMemcpy.c b/be/src/glibc-compatibility/FastMemcpy.c deleted file mode 100644 index 6c489ff144d445..00000000000000 --- a/be/src/glibc-compatibility/FastMemcpy.c +++ /dev/null @@ -1,220 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9) -// -//===================================================================== -#include -#include -#include -#include - -#if (defined(_WIN32) || defined(WIN32)) -#include -#include -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif -#elif defined(__unix) -#include -#include -#else -#error it can only be compiled under windows or unix -#endif - -#include "FastMemcpy.h" - -unsigned int gettime() -{ - #if (defined(_WIN32) || defined(WIN32)) - return timeGetTime(); - #else - static struct timezone tz={ 0,0 }; - struct timeval time; - gettimeofday(&time,&tz); - return (time.tv_sec * 1000 + time.tv_usec / 1000); - #endif -} - -void sleepms(unsigned int millisec) -{ -#if defined(_WIN32) || defined(WIN32) - Sleep(millisec); -#else - usleep(millisec * 1000); -#endif -} - - -void benchmark(int dstalign, int srcalign, size_t size, int times) -{ - char *DATA1 = (char*)malloc(size + 64); - char *DATA2 = (char*)malloc(size + 64); - size_t LINEAR1 = ((size_t)DATA1); - size_t LINEAR2 = ((size_t)DATA2); - char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1); - char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2); - char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1); - char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3); - unsigned int t1, t2; - int k; - - sleepms(100); - t1 = gettime(); - for (k = times; k > 0; k--) { - memcpy(dst, src, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (k = times; k > 0; k--) { - memcpy_fast(dst, src, size); - } - t2 = gettime() - t2; - - free(DATA1); - free(DATA2); - - printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n", - dstalign? "aligned" : "unalign", - srcalign? "aligned" : "unalign", (int)t2, (int)t1); -} - - -void bench(int copysize, int times) -{ - printf("benchmark(size=%d bytes, times=%d):\n", copysize, times); - benchmark(1, 1, copysize, times); - benchmark(1, 0, copysize, times); - benchmark(0, 1, copysize, times); - benchmark(0, 0, copysize, times); - printf("\n"); -} - - -void random_bench(int maxsize, int times) -{ - static char A[11 * 1024 * 1024 + 2]; - static char B[11 * 1024 * 1024 + 2]; - static int random_offsets[0x10000]; - static int random_sizes[0x8000]; - unsigned int i, p1, p2; - unsigned int t1, t2; - for (i = 0; i < 0x10000; i++) { // generate random offsets - random_offsets[i] = rand() % (10 * 1024 * 1024 + 1); - } - for (i = 0; i < 0x8000; i++) { // generate random sizes - random_sizes[i] = 1 + rand() % maxsize; - } - sleepms(100); - t1 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy(A + offset1, B + offset2, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy_fast(A + offset1, B + offset2, size); - } - t2 = gettime() - t2; - printf("benchmark random access:\n"); - printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1); -} - - -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif - -int main(void) -{ - bench(32, 0x1000000); - bench(64, 0x1000000); - bench(512, 0x800000); - bench(1024, 0x400000); - bench(4096, 0x80000); - bench(8192, 0x40000); - bench(1024 * 1024 * 1, 0x800); - bench(1024 * 1024 * 4, 0x200); - bench(1024 * 1024 * 8, 0x100); - - random_bench(2048, 8000000); - - return 0; -} - - - - -/* -benchmark(size=32 bytes, times=16777216): -result(dst aligned, src aligned): memcpy_fast=78ms memcpy=260 ms -result(dst aligned, src unalign): memcpy_fast=78ms memcpy=250 ms -result(dst unalign, src aligned): memcpy_fast=78ms memcpy=266 ms -result(dst unalign, src unalign): memcpy_fast=78ms memcpy=234 ms - -benchmark(size=64 bytes, times=16777216): -result(dst aligned, src aligned): memcpy_fast=109ms memcpy=281 ms -result(dst aligned, src unalign): memcpy_fast=109ms memcpy=328 ms -result(dst unalign, src aligned): memcpy_fast=109ms memcpy=343 ms -result(dst unalign, src unalign): memcpy_fast=93ms memcpy=344 ms - -benchmark(size=512 bytes, times=8388608): -result(dst aligned, src aligned): memcpy_fast=125ms memcpy=218 ms -result(dst aligned, src unalign): memcpy_fast=156ms memcpy=484 ms -result(dst unalign, src aligned): memcpy_fast=172ms memcpy=546 ms -result(dst unalign, src unalign): memcpy_fast=172ms memcpy=515 ms - -benchmark(size=1024 bytes, times=4194304): -result(dst aligned, src aligned): memcpy_fast=109ms memcpy=172 ms -result(dst aligned, src unalign): memcpy_fast=187ms memcpy=453 ms -result(dst unalign, src aligned): memcpy_fast=172ms memcpy=437 ms -result(dst unalign, src unalign): memcpy_fast=156ms memcpy=452 ms - -benchmark(size=4096 bytes, times=524288): -result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms -result(dst aligned, src unalign): memcpy_fast=109ms memcpy=202 ms -result(dst unalign, src aligned): memcpy_fast=94ms memcpy=203 ms -result(dst unalign, src unalign): memcpy_fast=110ms memcpy=218 ms - -benchmark(size=8192 bytes, times=262144): -result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms -result(dst aligned, src unalign): memcpy_fast=78ms memcpy=202 ms -result(dst unalign, src aligned): memcpy_fast=78ms memcpy=203 ms -result(dst unalign, src unalign): memcpy_fast=94ms memcpy=203 ms - -benchmark(size=1048576 bytes, times=2048): -result(dst aligned, src aligned): memcpy_fast=203ms memcpy=191 ms -result(dst aligned, src unalign): memcpy_fast=219ms memcpy=281 ms -result(dst unalign, src aligned): memcpy_fast=218ms memcpy=328 ms -result(dst unalign, src unalign): memcpy_fast=218ms memcpy=312 ms - -benchmark(size=4194304 bytes, times=512): -result(dst aligned, src aligned): memcpy_fast=312ms memcpy=406 ms -result(dst aligned, src unalign): memcpy_fast=296ms memcpy=421 ms -result(dst unalign, src aligned): memcpy_fast=312ms memcpy=468 ms -result(dst unalign, src unalign): memcpy_fast=297ms memcpy=452 ms - -benchmark(size=8388608 bytes, times=256): -result(dst aligned, src aligned): memcpy_fast=281ms memcpy=452 ms -result(dst aligned, src unalign): memcpy_fast=280ms memcpy=468 ms -result(dst unalign, src aligned): memcpy_fast=298ms memcpy=514 ms -result(dst unalign, src unalign): memcpy_fast=344ms memcpy=472 ms - -benchmark random access: -memcpy_fast=515ms memcpy=1014ms - -*/ - - - - diff --git a/be/src/glibc-compatibility/FastMemcpy.h b/be/src/glibc-compatibility/FastMemcpy.h deleted file mode 100644 index cdccdf8a52478e..00000000000000 --- a/be/src/glibc-compatibility/FastMemcpy.h +++ /dev/null @@ -1,694 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - -typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t; -typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t; -typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t; - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_sse2_16(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -} - -static INLINE void memcpy_sse2_32(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); -} - -static INLINE void memcpy_sse2_64(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); -} - -static INLINE void memcpy_sse2_128(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); - __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); - __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); - __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); - _mm_storeu_si128(((__m128i*)dst) + 4, m4); - _mm_storeu_si128(((__m128i*)dst) + 5, m5); - _mm_storeu_si128(((__m128i*)dst) + 6, m6); - _mm_storeu_si128(((__m128i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -/// Attribute is used to avoid an error with undefined behaviour sanitizer -/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer -/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library. -__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 64: - memcpy_sse2_64(dd - 64, ss - 64); - case 0: - break; - - case 65: - memcpy_sse2_64(dd - 65, ss - 65); - case 1: - dd[-1] = ss[-1]; - break; - - case 66: - memcpy_sse2_64(dd - 66, ss - 66); - case 2: - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 67: - memcpy_sse2_64(dd - 67, ss - 67); - case 3: - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 68: - memcpy_sse2_64(dd - 68, ss - 68); - case 4: - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 69: - memcpy_sse2_64(dd - 69, ss - 69); - case 5: - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 70: - memcpy_sse2_64(dd - 70, ss - 70); - case 6: - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 71: - memcpy_sse2_64(dd - 71, ss - 71); - case 7: - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 72: - memcpy_sse2_64(dd - 72, ss - 72); - case 8: - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 73: - memcpy_sse2_64(dd - 73, ss - 73); - case 9: - *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 74: - memcpy_sse2_64(dd - 74, ss - 74); - case 10: - *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 75: - memcpy_sse2_64(dd - 75, ss - 75); - case 11: - *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 76: - memcpy_sse2_64(dd - 76, ss - 76); - case 12: - *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 77: - memcpy_sse2_64(dd - 77, ss - 77); - case 13: - *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 78: - memcpy_sse2_64(dd - 78, ss - 78); - case 14: - *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 79: - memcpy_sse2_64(dd - 79, ss - 79); - case 15: - *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 80: - memcpy_sse2_64(dd - 80, ss - 80); - case 16: - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 81: - memcpy_sse2_64(dd - 81, ss - 81); - case 17: - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 82: - memcpy_sse2_64(dd - 82, ss - 82); - case 18: - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 83: - memcpy_sse2_64(dd - 83, ss - 83); - case 19: - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 84: - memcpy_sse2_64(dd - 84, ss - 84); - case 20: - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 85: - memcpy_sse2_64(dd - 85, ss - 85); - case 21: - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 86: - memcpy_sse2_64(dd - 86, ss - 86); - case 22: - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 87: - memcpy_sse2_64(dd - 87, ss - 87); - case 23: - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 88: - memcpy_sse2_64(dd - 88, ss - 88); - case 24: - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 89: - memcpy_sse2_64(dd - 89, ss - 89); - case 25: - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 90: - memcpy_sse2_64(dd - 90, ss - 90); - case 26: - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 91: - memcpy_sse2_64(dd - 91, ss - 91); - case 27: - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 92: - memcpy_sse2_64(dd - 92, ss - 92); - case 28: - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 93: - memcpy_sse2_64(dd - 93, ss - 93); - case 29: - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 94: - memcpy_sse2_64(dd - 94, ss - 94); - case 30: - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 95: - memcpy_sse2_64(dd - 95, ss - 95); - case 31: - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 96: - memcpy_sse2_64(dd - 96, ss - 96); - case 32: - memcpy_sse2_32(dd - 32, ss - 32); - break; - - case 97: - memcpy_sse2_64(dd - 97, ss - 97); - case 33: - memcpy_sse2_32(dd - 33, ss - 33); - dd[-1] = ss[-1]; - break; - - case 98: - memcpy_sse2_64(dd - 98, ss - 98); - case 34: - memcpy_sse2_32(dd - 34, ss - 34); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 99: - memcpy_sse2_64(dd - 99, ss - 99); - case 35: - memcpy_sse2_32(dd - 35, ss - 35); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 100: - memcpy_sse2_64(dd - 100, ss - 100); - case 36: - memcpy_sse2_32(dd - 36, ss - 36); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 101: - memcpy_sse2_64(dd - 101, ss - 101); - case 37: - memcpy_sse2_32(dd - 37, ss - 37); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 102: - memcpy_sse2_64(dd - 102, ss - 102); - case 38: - memcpy_sse2_32(dd - 38, ss - 38); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 103: - memcpy_sse2_64(dd - 103, ss - 103); - case 39: - memcpy_sse2_32(dd - 39, ss - 39); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 104: - memcpy_sse2_64(dd - 104, ss - 104); - case 40: - memcpy_sse2_32(dd - 40, ss - 40); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 105: - memcpy_sse2_64(dd - 105, ss - 105); - case 41: - memcpy_sse2_32(dd - 41, ss - 41); - *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 106: - memcpy_sse2_64(dd - 106, ss - 106); - case 42: - memcpy_sse2_32(dd - 42, ss - 42); - *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 107: - memcpy_sse2_64(dd - 107, ss - 107); - case 43: - memcpy_sse2_32(dd - 43, ss - 43); - *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 108: - memcpy_sse2_64(dd - 108, ss - 108); - case 44: - memcpy_sse2_32(dd - 44, ss - 44); - *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 109: - memcpy_sse2_64(dd - 109, ss - 109); - case 45: - memcpy_sse2_32(dd - 45, ss - 45); - *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 110: - memcpy_sse2_64(dd - 110, ss - 110); - case 46: - memcpy_sse2_32(dd - 46, ss - 46); - *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 111: - memcpy_sse2_64(dd - 111, ss - 111); - case 47: - memcpy_sse2_32(dd - 47, ss - 47); - *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 112: - memcpy_sse2_64(dd - 112, ss - 112); - case 48: - memcpy_sse2_32(dd - 48, ss - 48); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 113: - memcpy_sse2_64(dd - 113, ss - 113); - case 49: - memcpy_sse2_32(dd - 49, ss - 49); - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 114: - memcpy_sse2_64(dd - 114, ss - 114); - case 50: - memcpy_sse2_32(dd - 50, ss - 50); - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 115: - memcpy_sse2_64(dd - 115, ss - 115); - case 51: - memcpy_sse2_32(dd - 51, ss - 51); - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 116: - memcpy_sse2_64(dd - 116, ss - 116); - case 52: - memcpy_sse2_32(dd - 52, ss - 52); - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 117: - memcpy_sse2_64(dd - 117, ss - 117); - case 53: - memcpy_sse2_32(dd - 53, ss - 53); - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 118: - memcpy_sse2_64(dd - 118, ss - 118); - case 54: - memcpy_sse2_32(dd - 54, ss - 54); - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 119: - memcpy_sse2_64(dd - 119, ss - 119); - case 55: - memcpy_sse2_32(dd - 55, ss - 55); - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 120: - memcpy_sse2_64(dd - 120, ss - 120); - case 56: - memcpy_sse2_32(dd - 56, ss - 56); - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 121: - memcpy_sse2_64(dd - 121, ss - 121); - case 57: - memcpy_sse2_32(dd - 57, ss - 57); - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 122: - memcpy_sse2_64(dd - 122, ss - 122); - case 58: - memcpy_sse2_32(dd - 58, ss - 58); - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 123: - memcpy_sse2_64(dd - 123, ss - 123); - case 59: - memcpy_sse2_32(dd - 59, ss - 59); - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 124: - memcpy_sse2_64(dd - 124, ss - 124); - case 60: - memcpy_sse2_32(dd - 60, ss - 60); - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 125: - memcpy_sse2_64(dd - 125, ss - 125); - case 61: - memcpy_sse2_32(dd - 61, ss - 61); - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 126: - memcpy_sse2_64(dd - 126, ss - 126); - case 62: - memcpy_sse2_32(dd - 62, ss - 62); - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 127: - memcpy_sse2_64(dd - 127, ss - 127); - case 63: - memcpy_sse2_32(dd - 63, ss - 63); - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 128: - memcpy_sse2_128(dd - 128, ss - 128); - break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L2-cache size - size_t padding; - - // small memory copy - if (size <= 128) { - return memcpy_tiny(dst, src, size); - } - - // align destination to 16 bytes boundary - padding = (16 - (((size_t)dst) & 15)) & 15; - - if (padding > 0) { - __m128i head = _mm_loadu_si128((const __m128i*)src); - _mm_storeu_si128((__m128i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } - - // medium size copy - if (size <= cachesize) { - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_store_si128((((__m128i*)dst) + 0), c0); - _mm_store_si128((((__m128i*)dst) + 1), c1); - _mm_store_si128((((__m128i*)dst) + 2), c2); - _mm_store_si128((((__m128i*)dst) + 3), c3); - _mm_store_si128((((__m128i*)dst) + 4), c4); - _mm_store_si128((((__m128i*)dst) + 5), c5); - _mm_store_si128((((__m128i*)dst) + 6), c6); - _mm_store_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // big memory copy - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 15) == 0) { // source aligned - for (; size >= 128; size -= 128) { - c0 = _mm_load_si128(((const __m128i*)src) + 0); - c1 = _mm_load_si128(((const __m128i*)src) + 1); - c2 = _mm_load_si128(((const __m128i*)src) + 2); - c3 = _mm_load_si128(((const __m128i*)src) + 3); - c4 = _mm_load_si128(((const __m128i*)src) + 4); - c5 = _mm_load_si128(((const __m128i*)src) + 5); - c6 = _mm_load_si128(((const __m128i*)src) + 6); - c7 = _mm_load_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // source unaligned - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - - return destination; -} - - -#endif diff --git a/be/src/glibc-compatibility/LICENSE_FastMemcpy b/be/src/glibc-compatibility/LICENSE_FastMemcpy deleted file mode 100644 index c449da6aa8acfc..00000000000000 --- a/be/src/glibc-compatibility/LICENSE_FastMemcpy +++ /dev/null @@ -1,22 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Linwei - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - diff --git a/be/src/glibc-compatibility/memcpy/memcpy_aarch64.cpp b/be/src/glibc-compatibility/memcpy/memcpy_aarch64.cpp new file mode 100644 index 00000000000000..4a6a5ac96fbf4b --- /dev/null +++ b/be/src/glibc-compatibility/memcpy/memcpy_aarch64.cpp @@ -0,0 +1,245 @@ +#include + +#include + +typedef int64x2_t __m128i; + +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) + +static inline __attribute__((always_inline)) void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +static inline __attribute__((always_inline)) void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +static inline __attribute__((always_inline)) __m128i _mm_loadu_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +static inline __attribute__((always_inline)) __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +/** Custom memcpy implementation for ClickHouse. + * It has the following benefits over using glibc's implementation: + * 1. Avoiding dependency on specific version of glibc's symbol, like memcpy@@GLIBC_2.14 for portability. + * 2. Avoiding indirect call via PLT due to shared linking, that can be less efficient. + * 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis. + * 4. Better results on our performance tests on current CPUs: up to 25% on some queries and up to 0.7%..1% in average across all queries. + * + * Writing our own memcpy is extremely difficult for the following reasons: + * 1. The optimal variant depends on the specific CPU model. + * 2. The optimal variant depends on the distribution of size arguments. + * 3. It depends on the number of threads copying data concurrently. + * 4. It also depends on how the calling code is using the copied data and how the different memcpy calls are related to each other. + * Due to vast range of scenarios it makes proper testing especially difficult. + * When writing our own memcpy there is a risk to overoptimize it + * on non-representative microbenchmarks while making real-world use cases actually worse. + * + * Most of the benchmarks for memcpy on the internet are wrong. + * + * Let's look at the details: + * + * For small size, the order of branches in code is important. + * There are variants with specific order of branches (like here or in glibc) + * or with jump table (in asm code see example from Cosmopolitan libc: + * https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61) + * or with Duff device in C (see https://github.com/skywind3000/FastMemcpy/) + * + * It's also important how to copy uneven sizes. + * Almost every implementation, including this, is using two overlapping movs. + * + * It is important to disable -ftree-loop-distribute-patterns when compiling memcpy implementation, + * otherwise the compiler can replace internal loops to a call to memcpy that will lead to infinite recursion. + * + * For larger sizes it's important to choose the instructions used: + * - SSE or AVX or AVX-512; + * - rep movsb; + * Performance will depend on the size threshold, on the CPU model, on the "erms" flag + * ("Enhansed Rep MovS" - it indicates that performance of "rep movsb" is decent for large sizes) + * https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy + * + * Using AVX-512 can be bad due to throttling. + * Using AVX can be bad if most code is using SSE due to switching penalty + * (it also depends on the usage of "vzeroupper" instruction). + * But in some cases AVX gives a win. + * + * It also depends on how many times the loop will be unrolled. + * We are unrolling the loop 8 times (by the number of available registers), but it not always the best. + * + * It also depends on the usage of aligned or unaligned loads/stores. + * We are using unaligned loads and aligned stores. + * + * It also depends on the usage of prefetch instructions. It makes sense on some Intel CPUs but can slow down performance on AMD. + * Setting up correct offset for prefetching is non-obvious. + * + * Non-temporary (cache bypassing) stores can be used for very large sizes (more than a half of L3 cache). + * But the exact threshold is unclear - when doing memcpy from multiple threads the optimal threshold can be lower, + * because L3 cache is shared (and L2 cache is partially shared). + * + * Very large size of memcpy typically indicates suboptimal (not cache friendly) algorithms in code or unrealistic scenarios, + * so we don't pay attention to using non-temporary stores. + * + * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most benefitial, + * even comparing to non-temporary aligned unrolled stores even with the most wide registers. + * + * memcpy can be written in asm, C or C++. The latter can also use inline asm. + * The asm implementation can be better to make sure that compiler won't make the code worse, + * to ensure the order of branches, the code layout, the usage of all required registers. + * But if it is located in separate translation unit, inlining will not be possible + * (inline asm can be used to overcome this limitation). + * Sometimes C or C++ code can be further optimized by compiler. + * For example, clang is capable replacing SSE intrinsics to AVX code if -mavx is used. + * + * Please note that compiler can replace plain code to memcpy and vice versa. + * - memcpy with compile-time known small size is replaced to simple instructions without a call to memcpy; + * it is controlled by -fbuiltin-memcpy and can be manually ensured by calling __builtin_memcpy. + * This is often used to implement unaligned load/store without undefined behaviour in C++. + * - a loop with copying bytes can be recognized and replaced by a call to memcpy; + * it is controlled by -ftree-loop-distribute-patterns. + * - also note that a loop with copying bytes can be unrolled, peeled and vectorized that will give you + * inline code somewhat similar to a decent implementation of memcpy. + * + * This description is up to date as of Mar 2021. + * + * How to test the memcpy implementation for performance: + * 1. Test on real production workload. + * 2. For synthetic test, see utils/memcpy-bench, but make sure you will do the best to exhaust the wide range of scenarios. + * + * TODO: Add self-tuning memcpy with bayesian bandits algorithm for large sizes. + * See https://habr.com/en/company/yandex/blog/457612/ + */ + + +static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size) +{ + /// We will use pointer arithmetic, so char pointer will be used. + /// Note that __restrict makes sense (otherwise compiler will reload data from memory + /// instead of using the value of registers due to possible aliasing). + char * __restrict dst = reinterpret_cast(dst_); + const char * __restrict src = reinterpret_cast(src_); + + /// Standard memcpy returns the original value of dst. It is rarely used but we have to do it. + /// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly + /// for inlining and removing this single instruction. + void * ret = dst; + +tail: + /// Small sizes and tails after the loop for large sizes. + /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application. + /// This order of branches is from the disassembly of glibc's code. + /// We copy chunks of possibly uneven size with two overlapping movs. + /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3]. + if (size <= 16) + { + if (size >= 8) + { + /// Chunks of 8..16 bytes. + __builtin_memcpy(dst + size - 8, src + size - 8, 8); + __builtin_memcpy(dst, src, 8); + } + else if (size >= 4) + { + /// Chunks of 4..7 bytes. + __builtin_memcpy(dst + size - 4, src + size - 4, 4); + __builtin_memcpy(dst, src, 4); + } + else if (size >= 2) + { + /// Chunks of 2..3 bytes. + __builtin_memcpy(dst + size - 2, src + size - 2, 2); + __builtin_memcpy(dst, src, 2); + } + else if (size >= 1) + { + /// A single byte. + *dst = *src; + } + /// No bytes remaining. + } + else + { + /// Medium and large sizes. + if (size <= 128) + { + /// Medium size, not enough for full loop unrolling. + + /// We will copy the last 16 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast(src + size - 16))); + + /// Then we will copy every 16 bytes from the beginning in a loop. + /// The last loop iteration will possibly overwrite some part of already copied last 16 bytes. + /// This is Ok, similar to the code for small sizes above. + while (size > 16) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast(src))); + dst += 16; + src += 16; + size -= 16; + } + } + else + { + /// Large size with fully unrolled loop. + + /// Align destination to 16 bytes boundary. + size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + /// If not aligned - we will copy first 16 bytes with unaligned stores. + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + /// Aligned unrolled copy. We will use half of available SSE registers. + /// It's not possible to have both src and dst aligned. + /// So, we will use aligned stores and unaligned loads. + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); + dst += 128; + + size -= 128; + } + + /// The latest remaining 0..127 bytes will be processed as usual. + goto tail; + } + } + + return ret; +} + +extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size) +{ + return inline_memcpy(dst, src, size); +} diff --git a/be/src/glibc-compatibility/memcpy/memcpy_x86_64.cpp b/be/src/glibc-compatibility/memcpy/memcpy_x86_64.cpp new file mode 100644 index 00000000000000..50deabf892f31b --- /dev/null +++ b/be/src/glibc-compatibility/memcpy/memcpy_x86_64.cpp @@ -0,0 +1,220 @@ +#include + +#include + +/** Custom memcpy implementation for ClickHouse. + * It has the following benefits over using glibc's implementation: + * 1. Avoiding dependency on specific version of glibc's symbol, like memcpy@@GLIBC_2.14 for portability. + * 2. Avoiding indirect call via PLT due to shared linking, that can be less efficient. + * 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis. + * 4. Better results on our performance tests on current CPUs: up to 25% on some queries and up to 0.7%..1% in average across all queries. + * + * Writing our own memcpy is extremely difficult for the following reasons: + * 1. The optimal variant depends on the specific CPU model. + * 2. The optimal variant depends on the distribution of size arguments. + * 3. It depends on the number of threads copying data concurrently. + * 4. It also depends on how the calling code is using the copied data and how the different memcpy calls are related to each other. + * Due to vast range of scenarios it makes proper testing especially difficult. + * When writing our own memcpy there is a risk to overoptimize it + * on non-representative microbenchmarks while making real-world use cases actually worse. + * + * Most of the benchmarks for memcpy on the internet are wrong. + * + * Let's look at the details: + * + * For small size, the order of branches in code is important. + * There are variants with specific order of branches (like here or in glibc) + * or with jump table (in asm code see example from Cosmopolitan libc: + * https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61) + * or with Duff device in C (see https://github.com/skywind3000/FastMemcpy/) + * + * It's also important how to copy uneven sizes. + * Almost every implementation, including this, is using two overlapping movs. + * + * It is important to disable -ftree-loop-distribute-patterns when compiling memcpy implementation, + * otherwise the compiler can replace internal loops to a call to memcpy that will lead to infinite recursion. + * + * For larger sizes it's important to choose the instructions used: + * - SSE or AVX or AVX-512; + * - rep movsb; + * Performance will depend on the size threshold, on the CPU model, on the "erms" flag + * ("Enhansed Rep MovS" - it indicates that performance of "rep movsb" is decent for large sizes) + * https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy + * + * Using AVX-512 can be bad due to throttling. + * Using AVX can be bad if most code is using SSE due to switching penalty + * (it also depends on the usage of "vzeroupper" instruction). + * But in some cases AVX gives a win. + * + * It also depends on how many times the loop will be unrolled. + * We are unrolling the loop 8 times (by the number of available registers), but it not always the best. + * + * It also depends on the usage of aligned or unaligned loads/stores. + * We are using unaligned loads and aligned stores. + * + * It also depends on the usage of prefetch instructions. It makes sense on some Intel CPUs but can slow down performance on AMD. + * Setting up correct offset for prefetching is non-obvious. + * + * Non-temporary (cache bypassing) stores can be used for very large sizes (more than a half of L3 cache). + * But the exact threshold is unclear - when doing memcpy from multiple threads the optimal threshold can be lower, + * because L3 cache is shared (and L2 cache is partially shared). + * + * Very large size of memcpy typically indicates suboptimal (not cache friendly) algorithms in code or unrealistic scenarios, + * so we don't pay attention to using non-temporary stores. + * + * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most benefitial, + * even comparing to non-temporary aligned unrolled stores even with the most wide registers. + * + * memcpy can be written in asm, C or C++. The latter can also use inline asm. + * The asm implementation can be better to make sure that compiler won't make the code worse, + * to ensure the order of branches, the code layout, the usage of all required registers. + * But if it is located in separate translation unit, inlining will not be possible + * (inline asm can be used to overcome this limitation). + * Sometimes C or C++ code can be further optimized by compiler. + * For example, clang is capable replacing SSE intrinsics to AVX code if -mavx is used. + * + * Please note that compiler can replace plain code to memcpy and vice versa. + * - memcpy with compile-time known small size is replaced to simple instructions without a call to memcpy; + * it is controlled by -fbuiltin-memcpy and can be manually ensured by calling __builtin_memcpy. + * This is often used to implement unaligned load/store without undefined behaviour in C++. + * - a loop with copying bytes can be recognized and replaced by a call to memcpy; + * it is controlled by -ftree-loop-distribute-patterns. + * - also note that a loop with copying bytes can be unrolled, peeled and vectorized that will give you + * inline code somewhat similar to a decent implementation of memcpy. + * + * This description is up to date as of Mar 2021. + * + * How to test the memcpy implementation for performance: + * 1. Test on real production workload. + * 2. For synthetic test, see utils/memcpy-bench, but make sure you will do the best to exhaust the wide range of scenarios. + * + * TODO: Add self-tuning memcpy with bayesian bandits algorithm for large sizes. + * See https://habr.com/en/company/yandex/blog/457612/ + */ + + +static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size) +{ + /// We will use pointer arithmetic, so char pointer will be used. + /// Note that __restrict makes sense (otherwise compiler will reload data from memory + /// instead of using the value of registers due to possible aliasing). + char * __restrict dst = reinterpret_cast(dst_); + const char * __restrict src = reinterpret_cast(src_); + + /// Standard memcpy returns the original value of dst. It is rarely used but we have to do it. + /// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly + /// for inlining and removing this single instruction. + void * ret = dst; + +tail: + /// Small sizes and tails after the loop for large sizes. + /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application. + /// This order of branches is from the disassembly of glibc's code. + /// We copy chunks of possibly uneven size with two overlapping movs. + /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3]. + if (size <= 16) + { + if (size >= 8) + { + /// Chunks of 8..16 bytes. + __builtin_memcpy(dst + size - 8, src + size - 8, 8); + __builtin_memcpy(dst, src, 8); + } + else if (size >= 4) + { + /// Chunks of 4..7 bytes. + __builtin_memcpy(dst + size - 4, src + size - 4, 4); + __builtin_memcpy(dst, src, 4); + } + else if (size >= 2) + { + /// Chunks of 2..3 bytes. + __builtin_memcpy(dst + size - 2, src + size - 2, 2); + __builtin_memcpy(dst, src, 2); + } + else if (size >= 1) + { + /// A single byte. + *dst = *src; + } + /// No bytes remaining. + } + else + { + /// Medium and large sizes. + if (size <= 128) + { + /// Medium size, not enough for full loop unrolling. + + /// We will copy the last 16 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast(src + size - 16))); + + /// Then we will copy every 16 bytes from the beginning in a loop. + /// The last loop iteration will possibly overwrite some part of already copied last 16 bytes. + /// This is Ok, similar to the code for small sizes above. + while (size > 16) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast(src))); + dst += 16; + src += 16; + size -= 16; + } + } + else + { + /// Large size with fully unrolled loop. + + /// Align destination to 16 bytes boundary. + size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + /// If not aligned - we will copy first 16 bytes with unaligned stores. + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + /// Aligned unrolled copy. We will use half of available SSE registers. + /// It's not possible to have both src and dst aligned. + /// So, we will use aligned stores and unaligned loads. + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); + dst += 128; + + size -= 128; + } + + /// The latest remaining 0..127 bytes will be processed as usual. + goto tail; + } + } + + return ret; +} + +extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size) +{ + return inline_memcpy(dst, src, size); +} diff --git a/be/src/glibc-compatibility/memcpy_wrapper.c b/be/src/glibc-compatibility/memcpy_wrapper.c deleted file mode 100644 index 1f57345980ad52..00000000000000 --- a/be/src/glibc-compatibility/memcpy_wrapper.c +++ /dev/null @@ -1,6 +0,0 @@ -#include "FastMemcpy.h" - -void * memcpy(void * __restrict destination, const void * __restrict source, size_t size) -{ - return memcpy_fast(destination, source, size); -} diff --git a/thirdparty/patches/libhdfs3-master.patch b/thirdparty/patches/libhdfs3-master.patch index febf1c4ac9a557..6c4eb2bfd70e45 100644 --- a/thirdparty/patches/libhdfs3-master.patch +++ b/thirdparty/patches/libhdfs3-master.patch @@ -1,4 +1,4 @@ -diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +diff -uprN a/src/CMakeLists.txt b/src/CMakeLists.txt --- a/src/CMakeLists.txt 2021-09-23 22:03:55.000000000 +0800 +++ b/src/CMakeLists.txt 2022-01-18 00:58:22.411061469 +0800 @@ -46,7 +46,7 @@ SET(HEADER @@ -114,3 +114,25 @@ index 9aa9d7b..53893a1 100644 if (output) { free(output); +diff -uprN a/src/CMakeLists.txt b/src/CMakeLists.txt +--- a/bootstrap 2022-02-26 16:12:06.065389096 +0800 ++++ b/bootstrap 2022-02-26 16:11:45.989378097 +0800 +@@ -111,11 +111,17 @@ if [[ ! -x ${cmake} ]]; then + die "cannot found cmake" + fi + ++arch=$(uname -i) ++if [[ $arch == arm* ]] || [[ $arch = aarch64 ]]; then ++ CMAKE_EXTRA_FLAGS="-DENABLE_SSE=0" ++fi ++ + # Configure + ${cmake} -DENABLE_DEBUG=${enable_build} -DCMAKE_INSTALL_PREFIX=${prefix_dirs} \ + -DCMAKE_C_COMPILER=${c_compiler} -DCMAKE_CXX_COMPILER=${cxx_compiler} \ + -DCMAKE_PREFIX_PATH=${dependency_dir} -DENABLE_BOOST=${enable_boost} \ + -DENABLE_COVERAGE=${enable_coverage} -DENABLE_LIBCPP=${enable_clang_lib} ${source_dir} \ ++ $CMAKE_EXTRA_FLAGS \ + || die "failed to configure the project" + + echo 'bootstrap success. Run "make" to build.' +