From 7f3564cca62de49c9f2ea67fcf735921dbebb4d1 Mon Sep 17 00:00:00 2001
From: Amos Bird <amosbird@gmail.com>
Date: Sun, 27 Feb 2022 10:38:17 +0800
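Subject: [PATCH] [chore] Support aarch64 target with ldb_toolchain (#8249)

Remove the FastMemcpy sources from be/src/glibc-compatibility
(FastMemcpy.c, FastMemcpy.h, LICENSE_FastMemcpy, memcpy_wrapper.c);
FastMemcpy.c and memcpy_wrapper.c were previously dropped from the
source list on ARM. Add per-architecture sources
memcpy/memcpy_x86_64.cpp and memcpy/memcpy_aarch64.cpp instead.
CMakeLists.txt picks one through MEMCPY_SOURCE and compiles it into
the glibc-compatibility-explicit object library.

Also remove a ${WL_END_GROUP} entry from be/CMakeLists.txt and update
thirdparty/patches/libhdfs3-master.patch.
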
---
 be/CMakeLists.txt                             |   1 -
 be/src/glibc-compatibility/CMakeLists.txt     |   9 +-
 be/src/glibc-compatibility/FastMemcpy.c       | 220 ------
 be/src/glibc-compatibility/FastMemcpy.h       | 694 ------------------
 be/src/glibc-compatibility/LICENSE_FastMemcpy |  22 -
 .../memcpy/memcpy_aarch64.cpp                 | 245 +++++++
 .../memcpy/memcpy_x86_64.cpp                  | 220 ++++++
 be/src/glibc-compatibility/memcpy_wrapper.c   |   6 -
 thirdparty/patches/libhdfs3-master.patch      |  24 +-
 9 files changed, 492 insertions(+), 949 deletions(-)
 delete mode 100644 be/src/glibc-compatibility/FastMemcpy.c
 delete mode 100644 be/src/glibc-compatibility/FastMemcpy.h
 delete mode 100644 be/src/glibc-compatibility/LICENSE_FastMemcpy
 create mode 100644 be/src/glibc-compatibility/memcpy/memcpy_aarch64.cpp
 create mode 100644 be/src/glibc-compatibility/memcpy/memcpy_x86_64.cpp
 delete mode 100644 be/src/glibc-compatibility/memcpy_wrapper.c

diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index dfe85cee4070ec..c56b0225270bad 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -563,7 +563,6 @@ if(ARCH_AARCH64)
         ${DORIS_DEPENDENCIES}
         ${WL_START_GROUP}
         ${COMMON_THIRDPARTY}
-        ${WL_END_GROUP}
     )
 else()
     set(DORIS_DEPENDENCIES
diff --git a/be/src/glibc-compatibility/CMakeLists.txt b/be/src/glibc-compatibility/CMakeLists.txt
index bd2e5200eed2e7..9579c7819f849a 100644
--- a/be/src/glibc-compatibility/CMakeLists.txt
+++ b/be/src/glibc-compatibility/CMakeLists.txt
@@ -32,13 +32,13 @@ if (GLIBC_COMPATIBILITY)
     add_headers_and_sources(glibc_compatibility .)
     add_headers_and_sources(glibc_compatibility musl)
     if (ARCH_ARM)
-        # FastMemcpy not support arm, remove it from glibc_compatibility_sources
-        list (REMOVE_ITEM glibc_compatibility_sources FastMemcpy.c memcpy_wrapper.c)
         list (APPEND glibc_compatibility_sources musl/aarch64/syscall.s musl/aarch64/longjmp.s)
         set (musl_arch_include_dir musl/aarch64)
+        set (MEMCPY_SOURCE memcpy/memcpy_aarch64.cpp)
     elseif (ARCH_AMD64)
         list (APPEND glibc_compatibility_sources musl/x86_64/syscall.s musl/x86_64/longjmp.s)
         set (musl_arch_include_dir musl/x86_64)
+        set (MEMCPY_SOURCE memcpy/memcpy_x86_64.cpp)
     else ()
         message (FATAL_ERROR "glibc_compatibility can only be used on x86_64 or aarch64.")
     endif ()
@@ -48,8 +48,6 @@ if (GLIBC_COMPATIBILITY)
         list(APPEND glibc_compatibility_sources musl/getentropy.c)
     endif()
 
-    list(REMOVE_ITEM glibc_compatibility_sources musl/getrandom.c memcpy_wrapper.c)
-
     # Need to omit frame pointers to match the performance of glibc
     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer")
 
@@ -57,9 +55,10 @@ if (GLIBC_COMPATIBILITY)
     # without glibc-compatibility. Also sanitizers might generate getrandom
     # libcalls. Workaround: Use object file so that linker will always take a
     # look at its symbol table.
+    list(REMOVE_ITEM glibc_compatibility_sources musl/getrandom.c)
     # NOTE(amos): sanitizers might generate memcpy references that are too late to
     # refer. Let's also extract memcpy definitions explicitly to avoid UNDEF GLIBC 2.14.
-    add_library(glibc-compatibility-explicit OBJECT musl/getrandom.c memcpy_wrapper.c)
+    add_library(glibc-compatibility-explicit OBJECT musl/getrandom.c ${MEMCPY_SOURCE})
     target_compile_options(glibc-compatibility-explicit PRIVATE -fPIC)
     add_library(glibc-compatibility STATIC ${glibc_compatibility_sources})
 
diff --git a/be/src/glibc-compatibility/FastMemcpy.c b/be/src/glibc-compatibility/FastMemcpy.c
deleted file mode 100644
index 6c489ff144d445..00000000000000
--- a/be/src/glibc-compatibility/FastMemcpy.c
+++ /dev/null
@@ -1,220 +0,0 @@
-//=====================================================================
-//
-// FastMemcpy.c - skywind3000@163.com, 2015
-//
-// feature:
-// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9)
-//
-//=====================================================================
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-
-#if (defined(_WIN32) || defined(WIN32))
-#include <windows.h>
-#include <mmsystem.h>
-#ifdef _MSC_VER
-#pragma comment(lib, "winmm.lib")
-#endif
-#elif defined(__unix)
-#include <sys/time.h>
-#include <unistd.h>
-#else
-#error it can only be compiled under windows or unix
-#endif
-
-#include "FastMemcpy.h"
-
-unsigned int gettime()
-{
-	#if (defined(_WIN32) || defined(WIN32))
-	return timeGetTime();
-	#else
-	static struct timezone tz={ 0,0 };
-	struct timeval time;
-	gettimeofday(&time,&tz);
-	return (time.tv_sec * 1000 + time.tv_usec / 1000);
-	#endif
-}
-
-void sleepms(unsigned int millisec)
-{
-#if defined(_WIN32) || defined(WIN32)
-	Sleep(millisec);
-#else
-	usleep(millisec * 1000);
-#endif
-}
-
-
-void benchmark(int dstalign, int srcalign, size_t size, int times)
-{
-	char *DATA1 = (char*)malloc(size + 64);
-	char *DATA2 = (char*)malloc(size + 64);
-	size_t LINEAR1 = ((size_t)DATA1);
-	size_t LINEAR2 = ((size_t)DATA2);
-	char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1);
-	char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2);
-	char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1);
-	char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3);
-	unsigned int t1, t2;
-	int k;
-	
-	sleepms(100);
-	t1 = gettime();
-	for (k = times; k > 0; k--) {
-		memcpy(dst, src, size);
-	}
-	t1 = gettime() - t1;
-	sleepms(100);
-	t2 = gettime();
-	for (k = times; k > 0; k--) {
-		memcpy_fast(dst, src, size);
-	}
-	t2 = gettime() - t2;
-
-	free(DATA1);
-	free(DATA2);
-
-	printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n",  
-		dstalign? "aligned" : "unalign", 
-		srcalign? "aligned" : "unalign", (int)t2, (int)t1);
-}
-
-
-void bench(int copysize, int times)
-{
-	printf("benchmark(size=%d bytes, times=%d):\n", copysize, times);
-	benchmark(1, 1, copysize, times);
-	benchmark(1, 0, copysize, times);
-	benchmark(0, 1, copysize, times);
-	benchmark(0, 0, copysize, times);
-	printf("\n");
-}
-
-
-void random_bench(int maxsize, int times)
-{
-	static char A[11 * 1024 * 1024 + 2];
-	static char B[11 * 1024 * 1024 + 2];
-	static int random_offsets[0x10000];
-	static int random_sizes[0x8000];
-	unsigned int i, p1, p2;
-	unsigned int t1, t2;
-	for (i = 0; i < 0x10000; i++) {	// generate random offsets
-		random_offsets[i] = rand() % (10 * 1024 * 1024 + 1);
-	}
-	for (i = 0; i < 0x8000; i++) {	// generate random sizes
-		random_sizes[i] = 1 + rand() % maxsize;
-	}
-	sleepms(100);
-	t1 = gettime();
-	for (p1 = 0, p2 = 0, i = 0; i < times; i++) {
-		int offset1 = random_offsets[(p1++) & 0xffff];
-		int offset2 = random_offsets[(p1++) & 0xffff];
-		int size = random_sizes[(p2++) & 0x7fff];
-		memcpy(A + offset1, B + offset2, size);
-	}
-	t1 = gettime() - t1;
-	sleepms(100);
-	t2 = gettime();
-	for (p1 = 0, p2 = 0, i = 0; i < times; i++) {
-		int offset1 = random_offsets[(p1++) & 0xffff];
-		int offset2 = random_offsets[(p1++) & 0xffff];
-		int size = random_sizes[(p2++) & 0x7fff];
-		memcpy_fast(A + offset1, B + offset2, size);
-	}
-	t2 = gettime() - t2;
-	printf("benchmark random access:\n");
-	printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1);
-}
-
-
-#ifdef _MSC_VER
-#pragma comment(lib, "winmm.lib")
-#endif
-
-int main(void)
-{
-	bench(32, 0x1000000);
-	bench(64, 0x1000000);
-	bench(512, 0x800000);
-	bench(1024, 0x400000);
-	bench(4096, 0x80000);
-	bench(8192, 0x40000);
-	bench(1024 * 1024 * 1, 0x800);
-	bench(1024 * 1024 * 4, 0x200);
-	bench(1024 * 1024 * 8, 0x100);
-	
-	random_bench(2048, 8000000);
-
-	return 0;
-}
-
-
-
-
-/*
-benchmark(size=32 bytes, times=16777216):
-result(dst aligned, src aligned): memcpy_fast=78ms memcpy=260 ms
-result(dst aligned, src unalign): memcpy_fast=78ms memcpy=250 ms
-result(dst unalign, src aligned): memcpy_fast=78ms memcpy=266 ms
-result(dst unalign, src unalign): memcpy_fast=78ms memcpy=234 ms
-
-benchmark(size=64 bytes, times=16777216):
-result(dst aligned, src aligned): memcpy_fast=109ms memcpy=281 ms
-result(dst aligned, src unalign): memcpy_fast=109ms memcpy=328 ms
-result(dst unalign, src aligned): memcpy_fast=109ms memcpy=343 ms
-result(dst unalign, src unalign): memcpy_fast=93ms memcpy=344 ms
-
-benchmark(size=512 bytes, times=8388608):
-result(dst aligned, src aligned): memcpy_fast=125ms memcpy=218 ms
-result(dst aligned, src unalign): memcpy_fast=156ms memcpy=484 ms
-result(dst unalign, src aligned): memcpy_fast=172ms memcpy=546 ms
-result(dst unalign, src unalign): memcpy_fast=172ms memcpy=515 ms
-
-benchmark(size=1024 bytes, times=4194304):
-result(dst aligned, src aligned): memcpy_fast=109ms memcpy=172 ms
-result(dst aligned, src unalign): memcpy_fast=187ms memcpy=453 ms
-result(dst unalign, src aligned): memcpy_fast=172ms memcpy=437 ms
-result(dst unalign, src unalign): memcpy_fast=156ms memcpy=452 ms
-
-benchmark(size=4096 bytes, times=524288):
-result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms
-result(dst aligned, src unalign): memcpy_fast=109ms memcpy=202 ms
-result(dst unalign, src aligned): memcpy_fast=94ms memcpy=203 ms
-result(dst unalign, src unalign): memcpy_fast=110ms memcpy=218 ms
-
-benchmark(size=8192 bytes, times=262144):
-result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms
-result(dst aligned, src unalign): memcpy_fast=78ms memcpy=202 ms
-result(dst unalign, src aligned): memcpy_fast=78ms memcpy=203 ms
-result(dst unalign, src unalign): memcpy_fast=94ms memcpy=203 ms
-
-benchmark(size=1048576 bytes, times=2048):
-result(dst aligned, src aligned): memcpy_fast=203ms memcpy=191 ms
-result(dst aligned, src unalign): memcpy_fast=219ms memcpy=281 ms
-result(dst unalign, src aligned): memcpy_fast=218ms memcpy=328 ms
-result(dst unalign, src unalign): memcpy_fast=218ms memcpy=312 ms
-
-benchmark(size=4194304 bytes, times=512):
-result(dst aligned, src aligned): memcpy_fast=312ms memcpy=406 ms
-result(dst aligned, src unalign): memcpy_fast=296ms memcpy=421 ms
-result(dst unalign, src aligned): memcpy_fast=312ms memcpy=468 ms
-result(dst unalign, src unalign): memcpy_fast=297ms memcpy=452 ms
-
-benchmark(size=8388608 bytes, times=256):
-result(dst aligned, src aligned): memcpy_fast=281ms memcpy=452 ms
-result(dst aligned, src unalign): memcpy_fast=280ms memcpy=468 ms
-result(dst unalign, src aligned): memcpy_fast=298ms memcpy=514 ms
-result(dst unalign, src unalign): memcpy_fast=344ms memcpy=472 ms
-
-benchmark random access:
-memcpy_fast=515ms memcpy=1014ms
-
-*/
-
-
-
-
diff --git a/be/src/glibc-compatibility/FastMemcpy.h b/be/src/glibc-compatibility/FastMemcpy.h
deleted file mode 100644
index cdccdf8a52478e..00000000000000
--- a/be/src/glibc-compatibility/FastMemcpy.h
+++ /dev/null
@@ -1,694 +0,0 @@
-//=====================================================================
-//
-// FastMemcpy.c - skywind3000@163.com, 2015
-//
-// feature:
-// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
-//
-//=====================================================================
-#ifndef __FAST_MEMCPY_H__
-#define __FAST_MEMCPY_H__
-
-#include <stddef.h>
-#include <stdint.h>
-#include <emmintrin.h>
-
-
-//---------------------------------------------------------------------
-// force inline for compilers
-//---------------------------------------------------------------------
-#ifndef INLINE
-#ifdef __GNUC__
-#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
-    #define INLINE         __inline__ __attribute__((always_inline))
-#else
-    #define INLINE         __inline__
-#endif
-#elif defined(_MSC_VER)
-	#define INLINE __forceinline
-#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
-    #define INLINE __inline
-#else
-    #define INLINE
-#endif
-#endif
-
-typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t;
-typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t;
-typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t;
-
-//---------------------------------------------------------------------
-// fast copy for different sizes
-//---------------------------------------------------------------------
-static INLINE void memcpy_sse2_16(void *dst, const void *src) {
-	__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-	_mm_storeu_si128(((__m128i*)dst) + 0, m0);
-}
-
-static INLINE void memcpy_sse2_32(void *dst, const void *src) {
-	__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-	__m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
-	_mm_storeu_si128(((__m128i*)dst) + 0, m0);
-	_mm_storeu_si128(((__m128i*)dst) + 1, m1);
-}
-
-static INLINE void memcpy_sse2_64(void *dst, const void *src) {
-	__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-	__m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
-	__m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2);
-	__m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3);
-	_mm_storeu_si128(((__m128i*)dst) + 0, m0);
-	_mm_storeu_si128(((__m128i*)dst) + 1, m1);
-	_mm_storeu_si128(((__m128i*)dst) + 2, m2);
-	_mm_storeu_si128(((__m128i*)dst) + 3, m3);
-}
-
-static INLINE void memcpy_sse2_128(void *dst, const void *src) {
-	__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-	__m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
-	__m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2);
-	__m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3);
-	__m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4);
-	__m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5);
-	__m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6);
-	__m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7);
-	_mm_storeu_si128(((__m128i*)dst) + 0, m0);
-	_mm_storeu_si128(((__m128i*)dst) + 1, m1);
-	_mm_storeu_si128(((__m128i*)dst) + 2, m2);
-	_mm_storeu_si128(((__m128i*)dst) + 3, m3);
-	_mm_storeu_si128(((__m128i*)dst) + 4, m4);
-	_mm_storeu_si128(((__m128i*)dst) + 5, m5);
-	_mm_storeu_si128(((__m128i*)dst) + 6, m6);
-	_mm_storeu_si128(((__m128i*)dst) + 7, m7);
-}
-
-
-//---------------------------------------------------------------------
-// tiny memory copy with jump table optimized
-//---------------------------------------------------------------------
-/// Attribute is used to avoid an error with undefined behaviour sanitizer
-/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
-/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
-__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) {
-	unsigned char *dd = ((unsigned char*)dst) + size;
-	const unsigned char *ss = ((const unsigned char*)src) + size;
-
-	switch (size) {
-	case 64:
-		memcpy_sse2_64(dd - 64, ss - 64);
-	case 0:
-		break;
-
-	case 65:
-		memcpy_sse2_64(dd - 65, ss - 65);
-	case 1:
-		dd[-1] = ss[-1];
-		break;
-
-	case 66:
-		memcpy_sse2_64(dd - 66, ss - 66);
-	case 2:
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 67:
-		memcpy_sse2_64(dd - 67, ss - 67);
-	case 3:
-		*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
-		dd[-1] = ss[-1];
-		break;
-
-	case 68:
-		memcpy_sse2_64(dd - 68, ss - 68);
-	case 4:
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 69:
-		memcpy_sse2_64(dd - 69, ss - 69);
-	case 5:
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 70:
-		memcpy_sse2_64(dd - 70, ss - 70);
-	case 6:
-		*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 71:
-		memcpy_sse2_64(dd - 71, ss - 71);
-	case 7:
-		*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 72:
-		memcpy_sse2_64(dd - 72, ss - 72);
-	case 8:
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 73:
-		memcpy_sse2_64(dd - 73, ss - 73);
-	case 9:
-		*((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9));
-		dd[-1] = ss[-1];
-		break;
-
-	case 74:
-		memcpy_sse2_64(dd - 74, ss - 74);
-	case 10:
-		*((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 75:
-		memcpy_sse2_64(dd - 75, ss - 75);
-	case 11:
-		*((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 76:
-		memcpy_sse2_64(dd - 76, ss - 76);
-	case 12:
-		*((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 77:
-		memcpy_sse2_64(dd - 77, ss - 77);
-	case 13:
-		*((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13));
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 78:
-		memcpy_sse2_64(dd - 78, ss - 78);
-	case 14:
-		*((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14));
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 79:
-		memcpy_sse2_64(dd - 79, ss - 79);
-	case 15:
-		*((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15));
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 80:
-		memcpy_sse2_64(dd - 80, ss - 80);
-	case 16:
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 81:
-		memcpy_sse2_64(dd - 81, ss - 81);
-	case 17:
-		memcpy_sse2_16(dd - 17, ss - 17);
-		dd[-1] = ss[-1];
-		break;
-
-	case 82:
-		memcpy_sse2_64(dd - 82, ss - 82);
-	case 18:
-		memcpy_sse2_16(dd - 18, ss - 18);
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 83:
-		memcpy_sse2_64(dd - 83, ss - 83);
-	case 19:
-		memcpy_sse2_16(dd - 19, ss - 19);
-		*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
-		dd[-1] = ss[-1];
-		break;
-
-	case 84:
-		memcpy_sse2_64(dd - 84, ss - 84);
-	case 20:
-		memcpy_sse2_16(dd - 20, ss - 20);
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 85:
-		memcpy_sse2_64(dd - 85, ss - 85);
-	case 21:
-		memcpy_sse2_16(dd - 21, ss - 21);
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 86:
-		memcpy_sse2_64(dd - 86, ss - 86);
-	case 22:
-		memcpy_sse2_16(dd - 22, ss - 22);
-		*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 87:
-		memcpy_sse2_64(dd - 87, ss - 87);
-	case 23:
-		memcpy_sse2_16(dd - 23, ss - 23);
-		*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 88:
-		memcpy_sse2_64(dd - 88, ss - 88);
-	case 24:
-		memcpy_sse2_16(dd - 24, ss - 24);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 89:
-		memcpy_sse2_64(dd - 89, ss - 89);
-	case 25:
-		memcpy_sse2_16(dd - 25, ss - 25);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 90:
-		memcpy_sse2_64(dd - 90, ss - 90);
-	case 26:
-		memcpy_sse2_16(dd - 26, ss - 26);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 91:
-		memcpy_sse2_64(dd - 91, ss - 91);
-	case 27:
-		memcpy_sse2_16(dd - 27, ss - 27);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 92:
-		memcpy_sse2_64(dd - 92, ss - 92);
-	case 28:
-		memcpy_sse2_16(dd - 28, ss - 28);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 93:
-		memcpy_sse2_64(dd - 93, ss - 93);
-	case 29:
-		memcpy_sse2_16(dd - 29, ss - 29);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 94:
-		memcpy_sse2_64(dd - 94, ss - 94);
-	case 30:
-		memcpy_sse2_16(dd - 30, ss - 30);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 95:
-		memcpy_sse2_64(dd - 95, ss - 95);
-	case 31:
-		memcpy_sse2_16(dd - 31, ss - 31);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 96:
-		memcpy_sse2_64(dd - 96, ss - 96);
-	case 32:
-		memcpy_sse2_32(dd - 32, ss - 32);
-		break;
-
-	case 97:
-		memcpy_sse2_64(dd - 97, ss - 97);
-	case 33:
-		memcpy_sse2_32(dd - 33, ss - 33);
-		dd[-1] = ss[-1];
-		break;
-
-	case 98:
-		memcpy_sse2_64(dd - 98, ss - 98);
-	case 34:
-		memcpy_sse2_32(dd - 34, ss - 34);
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 99:
-		memcpy_sse2_64(dd - 99, ss - 99);
-	case 35:
-		memcpy_sse2_32(dd - 35, ss - 35);
-		*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
-		dd[-1] = ss[-1];
-		break;
-
-	case 100:
-		memcpy_sse2_64(dd - 100, ss - 100);
-	case 36:
-		memcpy_sse2_32(dd - 36, ss - 36);
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 101:
-		memcpy_sse2_64(dd - 101, ss - 101);
-	case 37:
-		memcpy_sse2_32(dd - 37, ss - 37);
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 102:
-		memcpy_sse2_64(dd - 102, ss - 102);
-	case 38:
-		memcpy_sse2_32(dd - 38, ss - 38);
-		*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 103:
-		memcpy_sse2_64(dd - 103, ss - 103);
-	case 39:
-		memcpy_sse2_32(dd - 39, ss - 39);
-		*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 104:
-		memcpy_sse2_64(dd - 104, ss - 104);
-	case 40:
-		memcpy_sse2_32(dd - 40, ss - 40);
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 105:
-		memcpy_sse2_64(dd - 105, ss - 105);
-	case 41:
-		memcpy_sse2_32(dd - 41, ss - 41);
-		*((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9));
-		dd[-1] = ss[-1];
-		break;
-
-	case 106:
-		memcpy_sse2_64(dd - 106, ss - 106);
-	case 42:
-		memcpy_sse2_32(dd - 42, ss - 42);
-		*((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 107:
-		memcpy_sse2_64(dd - 107, ss - 107);
-	case 43:
-		memcpy_sse2_32(dd - 43, ss - 43);
-		*((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 108:
-		memcpy_sse2_64(dd - 108, ss - 108);
-	case 44:
-		memcpy_sse2_32(dd - 44, ss - 44);
-		*((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 109:
-		memcpy_sse2_64(dd - 109, ss - 109);
-	case 45:
-		memcpy_sse2_32(dd - 45, ss - 45);
-		*((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13));
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 110:
-		memcpy_sse2_64(dd - 110, ss - 110);
-	case 46:
-		memcpy_sse2_32(dd - 46, ss - 46);
-		*((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14));
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 111:
-		memcpy_sse2_64(dd - 111, ss - 111);
-	case 47:
-		memcpy_sse2_32(dd - 47, ss - 47);
-		*((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15));
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 112:
-		memcpy_sse2_64(dd - 112, ss - 112);
-	case 48:
-		memcpy_sse2_32(dd - 48, ss - 48);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 113:
-		memcpy_sse2_64(dd - 113, ss - 113);
-	case 49:
-		memcpy_sse2_32(dd - 49, ss - 49);
-		memcpy_sse2_16(dd - 17, ss - 17);
-		dd[-1] = ss[-1];
-		break;
-
-	case 114:
-		memcpy_sse2_64(dd - 114, ss - 114);
-	case 50:
-		memcpy_sse2_32(dd - 50, ss - 50);
-		memcpy_sse2_16(dd - 18, ss - 18);
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 115:
-		memcpy_sse2_64(dd - 115, ss - 115);
-	case 51:
-		memcpy_sse2_32(dd - 51, ss - 51);
-		memcpy_sse2_16(dd - 19, ss - 19);
-		*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
-		dd[-1] = ss[-1];
-		break;
-
-	case 116:
-		memcpy_sse2_64(dd - 116, ss - 116);
-	case 52:
-		memcpy_sse2_32(dd - 52, ss - 52);
-		memcpy_sse2_16(dd - 20, ss - 20);
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 117:
-		memcpy_sse2_64(dd - 117, ss - 117);
-	case 53:
-		memcpy_sse2_32(dd - 53, ss - 53);
-		memcpy_sse2_16(dd - 21, ss - 21);
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 118:
-		memcpy_sse2_64(dd - 118, ss - 118);
-	case 54:
-		memcpy_sse2_32(dd - 54, ss - 54);
-		memcpy_sse2_16(dd - 22, ss - 22);
-		*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 119:
-		memcpy_sse2_64(dd - 119, ss - 119);
-	case 55:
-		memcpy_sse2_32(dd - 55, ss - 55);
-		memcpy_sse2_16(dd - 23, ss - 23);
-		*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 120:
-		memcpy_sse2_64(dd - 120, ss - 120);
-	case 56:
-		memcpy_sse2_32(dd - 56, ss - 56);
-		memcpy_sse2_16(dd - 24, ss - 24);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 121:
-		memcpy_sse2_64(dd - 121, ss - 121);
-	case 57:
-		memcpy_sse2_32(dd - 57, ss - 57);
-		memcpy_sse2_16(dd - 25, ss - 25);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 122:
-		memcpy_sse2_64(dd - 122, ss - 122);
-	case 58:
-		memcpy_sse2_32(dd - 58, ss - 58);
-		memcpy_sse2_16(dd - 26, ss - 26);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 123:
-		memcpy_sse2_64(dd - 123, ss - 123);
-	case 59:
-		memcpy_sse2_32(dd - 59, ss - 59);
-		memcpy_sse2_16(dd - 27, ss - 27);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 124:
-		memcpy_sse2_64(dd - 124, ss - 124);
-	case 60:
-		memcpy_sse2_32(dd - 60, ss - 60);
-		memcpy_sse2_16(dd - 28, ss - 28);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 125:
-		memcpy_sse2_64(dd - 125, ss - 125);
-	case 61:
-		memcpy_sse2_32(dd - 61, ss - 61);
-		memcpy_sse2_16(dd - 29, ss - 29);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 126:
-		memcpy_sse2_64(dd - 126, ss - 126);
-	case 62:
-		memcpy_sse2_32(dd - 62, ss - 62);
-		memcpy_sse2_16(dd - 30, ss - 30);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 127:
-		memcpy_sse2_64(dd - 127, ss - 127);
-	case 63:
-		memcpy_sse2_32(dd - 63, ss - 63);
-		memcpy_sse2_16(dd - 31, ss - 31);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 128:
-		memcpy_sse2_128(dd - 128, ss - 128);
-		break;
-	}
-
-	return dst;
-}
-
-
-//---------------------------------------------------------------------
-// main routine
-//---------------------------------------------------------------------
-static void* memcpy_fast(void *destination, const void *source, size_t size)
-{
-	unsigned char *dst = (unsigned char*)destination;
-	const unsigned char *src = (const unsigned char*)source;
-	static size_t cachesize = 0x200000; // L2-cache size
-	size_t padding;
-
-	// small memory copy
-	if (size <= 128) {
-		return memcpy_tiny(dst, src, size);
-	}
-
-	// align destination to 16 bytes boundary
-	padding = (16 - (((size_t)dst) & 15)) & 15;
-
-	if (padding > 0) {
-		__m128i head = _mm_loadu_si128((const __m128i*)src);
-		_mm_storeu_si128((__m128i*)dst, head);
-		dst += padding;
-		src += padding;
-		size -= padding;
-	}
-
-	// medium size copy
-	if (size <= cachesize) {
-		__m128i c0, c1, c2, c3, c4, c5, c6, c7;
-
-		for (; size >= 128; size -= 128) {
-			c0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-			c1 = _mm_loadu_si128(((const __m128i*)src) + 1);
-			c2 = _mm_loadu_si128(((const __m128i*)src) + 2);
-			c3 = _mm_loadu_si128(((const __m128i*)src) + 3);
-			c4 = _mm_loadu_si128(((const __m128i*)src) + 4);
-			c5 = _mm_loadu_si128(((const __m128i*)src) + 5);
-			c6 = _mm_loadu_si128(((const __m128i*)src) + 6);
-			c7 = _mm_loadu_si128(((const __m128i*)src) + 7);
-			_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
-			src += 128;
-			_mm_store_si128((((__m128i*)dst) + 0), c0);
-			_mm_store_si128((((__m128i*)dst) + 1), c1);
-			_mm_store_si128((((__m128i*)dst) + 2), c2);
-			_mm_store_si128((((__m128i*)dst) + 3), c3);
-			_mm_store_si128((((__m128i*)dst) + 4), c4);
-			_mm_store_si128((((__m128i*)dst) + 5), c5);
-			_mm_store_si128((((__m128i*)dst) + 6), c6);
-			_mm_store_si128((((__m128i*)dst) + 7), c7);
-			dst += 128;
-		}
-	}
-	else {		// big memory copy
-		__m128i c0, c1, c2, c3, c4, c5, c6, c7;
-
-		_mm_prefetch((const char*)(src), _MM_HINT_NTA);
-
-		if ((((size_t)src) & 15) == 0) {	// source aligned
-			for (; size >= 128; size -= 128) {
-				c0 = _mm_load_si128(((const __m128i*)src) + 0);
-				c1 = _mm_load_si128(((const __m128i*)src) + 1);
-				c2 = _mm_load_si128(((const __m128i*)src) + 2);
-				c3 = _mm_load_si128(((const __m128i*)src) + 3);
-				c4 = _mm_load_si128(((const __m128i*)src) + 4);
-				c5 = _mm_load_si128(((const __m128i*)src) + 5);
-				c6 = _mm_load_si128(((const __m128i*)src) + 6);
-				c7 = _mm_load_si128(((const __m128i*)src) + 7);
-				_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
-				src += 128;
-				_mm_stream_si128((((__m128i*)dst) + 0), c0);
-				_mm_stream_si128((((__m128i*)dst) + 1), c1);
-				_mm_stream_si128((((__m128i*)dst) + 2), c2);
-				_mm_stream_si128((((__m128i*)dst) + 3), c3);
-				_mm_stream_si128((((__m128i*)dst) + 4), c4);
-				_mm_stream_si128((((__m128i*)dst) + 5), c5);
-				_mm_stream_si128((((__m128i*)dst) + 6), c6);
-				_mm_stream_si128((((__m128i*)dst) + 7), c7);
-				dst += 128;
-			}
-		}
-		else {							// source unaligned
-			for (; size >= 128; size -= 128) {
-				c0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-				c1 = _mm_loadu_si128(((const __m128i*)src) + 1);
-				c2 = _mm_loadu_si128(((const __m128i*)src) + 2);
-				c3 = _mm_loadu_si128(((const __m128i*)src) + 3);
-				c4 = _mm_loadu_si128(((const __m128i*)src) + 4);
-				c5 = _mm_loadu_si128(((const __m128i*)src) + 5);
-				c6 = _mm_loadu_si128(((const __m128i*)src) + 6);
-				c7 = _mm_loadu_si128(((const __m128i*)src) + 7);
-				_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
-				src += 128;
-				_mm_stream_si128((((__m128i*)dst) + 0), c0);
-				_mm_stream_si128((((__m128i*)dst) + 1), c1);
-				_mm_stream_si128((((__m128i*)dst) + 2), c2);
-				_mm_stream_si128((((__m128i*)dst) + 3), c3);
-				_mm_stream_si128((((__m128i*)dst) + 4), c4);
-				_mm_stream_si128((((__m128i*)dst) + 5), c5);
-				_mm_stream_si128((((__m128i*)dst) + 6), c6);
-				_mm_stream_si128((((__m128i*)dst) + 7), c7);
-				dst += 128;
-			}
-		}
-		_mm_sfence();
-	}
-
-	memcpy_tiny(dst, src, size);
-
-	return destination;
-}
-
-
-#endif
diff --git a/be/src/glibc-compatibility/LICENSE_FastMemcpy b/be/src/glibc-compatibility/LICENSE_FastMemcpy
deleted file mode 100644
index c449da6aa8acfc..00000000000000
--- a/be/src/glibc-compatibility/LICENSE_FastMemcpy
+++ /dev/null
@@ -1,22 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2015 Linwei
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
diff --git a/be/src/glibc-compatibility/memcpy/memcpy_aarch64.cpp b/be/src/glibc-compatibility/memcpy/memcpy_aarch64.cpp
new file mode 100644
index 00000000000000..4a6a5ac96fbf4b
--- /dev/null
+++ b/be/src/glibc-compatibility/memcpy/memcpy_aarch64.cpp
@@ -0,0 +1,245 @@
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+typedef int64x2_t __m128i;
+
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+
+static inline __attribute__((always_inline)) void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+static inline __attribute__((always_inline)) void _mm_store_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+static inline __attribute__((always_inline)) __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+static inline __attribute__((always_inline)) __m128i _mm_load_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+/** Custom memcpy implementation for ClickHouse.
+  * It has the following benefits over using glibc's implementation:
+  * 1. Avoiding dependency on specific version of glibc's symbol, like memcpy@@GLIBC_2.14 for portability.
+  * 2. Avoiding indirect call via PLT due to shared linking, that can be less efficient.
+  * 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis.
+  * 4. Better results on our performance tests on current CPUs: up to 25% on some queries and up to 0.7%..1% in average across all queries.
+  *
+  * Writing our own memcpy is extremely difficult for the following reasons:
+  * 1. The optimal variant depends on the specific CPU model.
+  * 2. The optimal variant depends on the distribution of size arguments.
+  * 3. It depends on the number of threads copying data concurrently.
+  * 4. It also depends on how the calling code is using the copied data and how the different memcpy calls are related to each other.
+  * Due to vast range of scenarios it makes proper testing especially difficult.
+  * When writing our own memcpy there is a risk to overoptimize it
+  * on non-representative microbenchmarks while making real-world use cases actually worse.
+  *
+  * Most of the benchmarks for memcpy on the internet are wrong.
+  *
+  * Let's look at the details:
+  *
+  * For small size, the order of branches in code is important.
+  * There are variants with specific order of branches (like here or in glibc)
+  * or with jump table (in asm code see example from Cosmopolitan libc:
+  * https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61)
+  * or with Duff device in C (see https://github.com/skywind3000/FastMemcpy/)
+  *
+  * It's also important how to copy uneven sizes.
+  * Almost every implementation, including this, is using two overlapping movs.
+  *
+  * It is important to disable -ftree-loop-distribute-patterns when compiling memcpy implementation,
+  * otherwise the compiler can replace internal loops to a call to memcpy that will lead to infinite recursion.
+  *
+  * For larger sizes it's important to choose the instructions used:
+  * - SSE or AVX or AVX-512;
+  * - rep movsb;
+  * Performance will depend on the size threshold, on the CPU model, on the "erms" flag
+  * ("Enhanced Rep MovS" - it indicates that performance of "rep movsb" is decent for large sizes)
+  * https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy
+  *
+  * Using AVX-512 can be bad due to throttling.
+  * Using AVX can be bad if most code is using SSE due to switching penalty
+  * (it also depends on the usage of "vzeroupper" instruction).
+  * But in some cases AVX gives a win.
+  *
+  * It also depends on how many times the loop will be unrolled.
+  * We are unrolling the loop 8 times (by the number of available registers), but it is not always the best.
+  *
+  * It also depends on the usage of aligned or unaligned loads/stores.
+  * We are using unaligned loads and aligned stores.
+  *
+  * It also depends on the usage of prefetch instructions. It makes sense on some Intel CPUs but can slow down performance on AMD.
+  * Setting up correct offset for prefetching is non-obvious.
+  *
+  * Non-temporal (cache bypassing) stores can be used for very large sizes (more than a half of L3 cache).
+  * But the exact threshold is unclear - when doing memcpy from multiple threads the optimal threshold can be lower,
+  * because L3 cache is shared (and L2 cache is partially shared).
+  *
+  * Very large size of memcpy typically indicates suboptimal (not cache friendly) algorithms in code or unrealistic scenarios,
+  * so we don't pay attention to using non-temporal stores.
+  *
+  * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most beneficial,
+  * even compared to non-temporal aligned unrolled stores with the widest registers.
+  *
+  * memcpy can be written in asm, C or C++. The latter can also use inline asm.
+  * The asm implementation can be better to make sure that compiler won't make the code worse,
+  * to ensure the order of branches, the code layout, the usage of all required registers.
+  * But if it is located in separate translation unit, inlining will not be possible
+  * (inline asm can be used to overcome this limitation).
+  * Sometimes C or C++ code can be further optimized by compiler.
+  * For example, clang is capable of replacing SSE intrinsics with AVX code if -mavx is used.
+  *
+  * Please note that compiler can replace plain code to memcpy and vice versa.
+  * - memcpy with compile-time known small size is replaced to simple instructions without a call to memcpy;
+  *   it is controlled by -fbuiltin-memcpy and can be manually ensured by calling __builtin_memcpy.
+  *   This is often used to implement unaligned load/store without undefined behaviour in C++.
+  * - a loop with copying bytes can be recognized and replaced by a call to memcpy;
+  *   it is controlled by -ftree-loop-distribute-patterns.
+  * - also note that a loop with copying bytes can be unrolled, peeled and vectorized that will give you
+  *   inline code somewhat similar to a decent implementation of memcpy.
+  *
+  * This description is up to date as of Mar 2021.
+  *
+  * How to test the memcpy implementation for performance:
+  * 1. Test on real production workload.
+  * 2. For synthetic test, see utils/memcpy-bench, but make sure you will do the best to exhaust the wide range of scenarios.
+  *
+  * TODO: Add self-tuning memcpy with bayesian bandits algorithm for large sizes.
+  * See https://habr.com/en/company/yandex/blog/457612/
+  */
+
+
+static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size)
+{
+    /// We will use pointer arithmetic, so char pointer will be used.
+    /// Note that __restrict makes sense (otherwise compiler will reload data from memory
+    /// instead of using the value of registers due to possible aliasing).
+    char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
+    const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);
+
+    /// Standard memcpy returns the original value of dst. It is rarely used but we have to do it.
+    /// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly
+    /// for inlining and removing this single instruction.
+    void * ret = dst;
+
+tail:
+    /// Small sizes and tails after the loop for large sizes.
+    /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
+    /// This order of branches is from the disassembly of glibc's code.
+    /// We copy chunks of possibly uneven size with two overlapping movs.
+    /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
+    if (size <= 16)
+    {
+        if (size >= 8)
+        {
+            /// Chunks of 8..16 bytes.
+            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
+            __builtin_memcpy(dst, src, 8);
+        }
+        else if (size >= 4)
+        {
+            /// Chunks of 4..7 bytes.
+            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
+            __builtin_memcpy(dst, src, 4);
+        }
+        else if (size >= 2)
+        {
+            /// Chunks of 2..3 bytes.
+            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
+            __builtin_memcpy(dst, src, 2);
+        }
+        else if (size >= 1)
+        {
+            /// A single byte.
+            *dst = *src;
+        }
+        /// No bytes remaining.
+    }
+    else
+    {
+        /// Medium and large sizes.
+        if (size <= 128)
+        {
+            /// Medium size, not enough for full loop unrolling.
+
+            /// We will copy the last 16 bytes.
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));
+
+            /// Then we will copy every 16 bytes from the beginning in a loop.
+            /// The last loop iteration will possibly overwrite some part of already copied last 16 bytes.
+            /// This is Ok, similar to the code for small sizes above.
+            while (size > 16)
+            {
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
+        }
+        else
+        {
+            /// Large size with fully unrolled loop.
+
+            /// Align destination to 16 bytes boundary.
+            size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+            /// If not aligned - we will copy first 16 bytes with unaligned stores.
+            if (padding > 0)
+            {
+                __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+                dst += padding;
+                src += padding;
+                size -= padding;
+            }
+
+            /// Aligned unrolled copy. We will use half of available SSE registers.
+            /// It's not possible to have both src and dst aligned.
+            /// So, we will use aligned stores and unaligned loads.
+            __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            while (size >= 128)
+            {
+                c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
+                c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
+                c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
+                c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
+                c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
+                c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
+                c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
+                c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
+                src += 128;
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
+                dst += 128;
+
+                size -= 128;
+            }
+
+            /// The latest remaining 0..127 bytes will be processed as usual.
+            goto tail;
+        }
+    }
+
+    return ret;
+}
+
+extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size)
+{
+    return inline_memcpy(dst, src, size);
+}
diff --git a/be/src/glibc-compatibility/memcpy/memcpy_x86_64.cpp b/be/src/glibc-compatibility/memcpy/memcpy_x86_64.cpp
new file mode 100644
index 00000000000000..50deabf892f31b
--- /dev/null
+++ b/be/src/glibc-compatibility/memcpy/memcpy_x86_64.cpp
@@ -0,0 +1,220 @@
+#include <stddef.h>
+
+#include <emmintrin.h>
+
+/** Custom memcpy implementation for ClickHouse.
+  * It has the following benefits over using glibc's implementation:
+  * 1. Avoiding dependency on specific version of glibc's symbol, like memcpy@@GLIBC_2.14 for portability.
+  * 2. Avoiding indirect call via PLT due to shared linking, that can be less efficient.
+  * 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis.
+  * 4. Better results on our performance tests on current CPUs: up to 25% on some queries and up to 0.7%..1% in average across all queries.
+  *
+  * Writing our own memcpy is extremely difficult for the following reasons:
+  * 1. The optimal variant depends on the specific CPU model.
+  * 2. The optimal variant depends on the distribution of size arguments.
+  * 3. It depends on the number of threads copying data concurrently.
+  * 4. It also depends on how the calling code is using the copied data and how the different memcpy calls are related to each other.
+  * Due to vast range of scenarios it makes proper testing especially difficult.
+  * When writing our own memcpy there is a risk to overoptimize it
+  * on non-representative microbenchmarks while making real-world use cases actually worse.
+  *
+  * Most of the benchmarks for memcpy on the internet are wrong.
+  *
+  * Let's look at the details:
+  *
+  * For small size, the order of branches in code is important.
+  * There are variants with specific order of branches (like here or in glibc)
+  * or with jump table (in asm code see example from Cosmopolitan libc:
+  * https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61)
+  * or with Duff device in C (see https://github.com/skywind3000/FastMemcpy/)
+  *
+  * It's also important how to copy uneven sizes.
+  * Almost every implementation, including this, is using two overlapping movs.
+  *
+  * It is important to disable -ftree-loop-distribute-patterns when compiling memcpy implementation,
+  * otherwise the compiler can replace internal loops to a call to memcpy that will lead to infinite recursion.
+  *
+  * For larger sizes it's important to choose the instructions used:
+  * - SSE or AVX or AVX-512;
+  * - rep movsb;
+  * Performance will depend on the size threshold, on the CPU model, on the "erms" flag
+  * ("Enhanced Rep MovS" - it indicates that performance of "rep movsb" is decent for large sizes)
+  * https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy
+  *
+  * Using AVX-512 can be bad due to throttling.
+  * Using AVX can be bad if most code is using SSE due to switching penalty
+  * (it also depends on the usage of "vzeroupper" instruction).
+  * But in some cases AVX gives a win.
+  *
+  * It also depends on how many times the loop will be unrolled.
+  * We are unrolling the loop 8 times (by the number of available registers), but it is not always the best.
+  *
+  * It also depends on the usage of aligned or unaligned loads/stores.
+  * We are using unaligned loads and aligned stores.
+  *
+  * It also depends on the usage of prefetch instructions. It makes sense on some Intel CPUs but can slow down performance on AMD.
+  * Setting up correct offset for prefetching is non-obvious.
+  *
+  * Non-temporal (cache bypassing) stores can be used for very large sizes (more than a half of L3 cache).
+  * But the exact threshold is unclear - when doing memcpy from multiple threads the optimal threshold can be lower,
+  * because L3 cache is shared (and L2 cache is partially shared).
+  *
+  * Very large size of memcpy typically indicates suboptimal (not cache friendly) algorithms in code or unrealistic scenarios,
+  * so we don't pay attention to using non-temporal stores.
+  *
+  * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most beneficial,
+  * even compared to non-temporal aligned unrolled stores with the widest registers.
+  *
+  * memcpy can be written in asm, C or C++. The latter can also use inline asm.
+  * The asm implementation can be better to make sure that compiler won't make the code worse,
+  * to ensure the order of branches, the code layout, the usage of all required registers.
+  * But if it is located in separate translation unit, inlining will not be possible
+  * (inline asm can be used to overcome this limitation).
+  * Sometimes C or C++ code can be further optimized by compiler.
+  * For example, clang is capable of replacing SSE intrinsics with AVX code if -mavx is used.
+  *
+  * Please note that compiler can replace plain code to memcpy and vice versa.
+  * - memcpy with compile-time known small size is replaced to simple instructions without a call to memcpy;
+  *   it is controlled by -fbuiltin-memcpy and can be manually ensured by calling __builtin_memcpy.
+  *   This is often used to implement unaligned load/store without undefined behaviour in C++.
+  * - a loop with copying bytes can be recognized and replaced by a call to memcpy;
+  *   it is controlled by -ftree-loop-distribute-patterns.
+  * - also note that a loop with copying bytes can be unrolled, peeled and vectorized that will give you
+  *   inline code somewhat similar to a decent implementation of memcpy.
+  *
+  * This description is up to date as of Mar 2021.
+  *
+  * How to test the memcpy implementation for performance:
+  * 1. Test on real production workload.
+  * 2. For synthetic test, see utils/memcpy-bench, but make sure you will do the best to exhaust the wide range of scenarios.
+  *
+  * TODO: Add self-tuning memcpy with bayesian bandits algorithm for large sizes.
+  * See https://habr.com/en/company/yandex/blog/457612/
+  */
+
+
+static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size)
+{
+    /// We will use pointer arithmetic, so char pointer will be used.
+    /// Note that __restrict makes sense (otherwise compiler will reload data from memory
+    /// instead of using the value of registers due to possible aliasing).
+    char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
+    const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);
+
+    /// Standard memcpy returns the original value of dst. It is rarely used but we have to do it.
+    /// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly
+    /// for inlining and removing this single instruction.
+    void * ret = dst;
+
+tail:
+    /// Small sizes and tails after the loop for large sizes.
+    /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
+    /// This order of branches is from the disassembly of glibc's code.
+    /// We copy chunks of possibly uneven size with two overlapping movs.
+    /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
+    if (size <= 16)
+    {
+        if (size >= 8)
+        {
+            /// Chunks of 8..16 bytes.
+            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
+            __builtin_memcpy(dst, src, 8);
+        }
+        else if (size >= 4)
+        {
+            /// Chunks of 4..7 bytes.
+            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
+            __builtin_memcpy(dst, src, 4);
+        }
+        else if (size >= 2)
+        {
+            /// Chunks of 2..3 bytes.
+            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
+            __builtin_memcpy(dst, src, 2);
+        }
+        else if (size >= 1)
+        {
+            /// A single byte.
+            *dst = *src;
+        }
+        /// No bytes remaining.
+    }
+    else
+    {
+        /// Medium and large sizes.
+        if (size <= 128)
+        {
+            /// Medium size, not enough for full loop unrolling.
+
+            /// We will copy the last 16 bytes.
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));
+
+            /// Then we will copy every 16 bytes from the beginning in a loop.
+            /// The last loop iteration will possibly overwrite some part of already copied last 16 bytes.
+            /// This is Ok, similar to the code for small sizes above.
+            while (size > 16)
+            {
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
+        }
+        else
+        {
+            /// Large size with fully unrolled loop.
+
+            /// Align destination to 16 bytes boundary.
+            size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+            /// If not aligned - we will copy first 16 bytes with unaligned stores.
+            if (padding > 0)
+            {
+                __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+                dst += padding;
+                src += padding;
+                size -= padding;
+            }
+
+            /// Aligned unrolled copy. We will use half of available SSE registers.
+            /// It's not possible to have both src and dst aligned.
+            /// So, we will use aligned stores and unaligned loads.
+            __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            while (size >= 128)
+            {
+                c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
+                c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
+                c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
+                c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
+                c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
+                c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
+                c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
+                c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
+                src += 128;
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
+                dst += 128;
+
+                size -= 128;
+            }
+
+            /// The latest remaining 0..127 bytes will be processed as usual.
+            goto tail;
+        }
+    }
+
+    return ret;
+}
+
+extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size)
+{
+    return inline_memcpy(dst, src, size);
+}
diff --git a/be/src/glibc-compatibility/memcpy_wrapper.c b/be/src/glibc-compatibility/memcpy_wrapper.c
deleted file mode 100644
index 1f57345980ad52..00000000000000
--- a/be/src/glibc-compatibility/memcpy_wrapper.c
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "FastMemcpy.h"
-
-void * memcpy(void * __restrict destination, const void * __restrict source, size_t size)
-{
-    return memcpy_fast(destination, source, size);
-}
diff --git a/thirdparty/patches/libhdfs3-master.patch b/thirdparty/patches/libhdfs3-master.patch
index febf1c4ac9a557..6c4eb2bfd70e45 100644
--- a/thirdparty/patches/libhdfs3-master.patch
+++ b/thirdparty/patches/libhdfs3-master.patch
@@ -1,4 +1,4 @@
-diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
+diff -uprN a/src/CMakeLists.txt b/src/CMakeLists.txt
 --- a/src/CMakeLists.txt  2021-09-23 22:03:55.000000000 +0800
 +++ b/src/CMakeLists.txt        2022-01-18 00:58:22.411061469 +0800
 @@ -46,7 +46,7 @@ SET(HEADER
@@ -114,3 +114,25 @@ index 9aa9d7b..53893a1 100644
  
          if (output) {
              free(output);
+diff -uprN a/bootstrap b/bootstrap
+--- a/bootstrap 2022-02-26 16:12:06.065389096 +0800
++++ b/bootstrap 2022-02-26 16:11:45.989378097 +0800
+@@ -111,11 +111,17 @@ if [[ ! -x ${cmake} ]]; then
+     die "cannot found cmake"
+ fi
+
++arch=$(uname -m)  # machine name; "uname -i" is non-portable and often prints "unknown"
++if [[ $arch == arm* ]] || [[ $arch == aarch64 ]]; then  # ARM hosts have no SSE
++    CMAKE_EXTRA_FLAGS="-DENABLE_SSE=0"
++fi
++
+ # Configure
+ ${cmake} -DENABLE_DEBUG=${enable_build} -DCMAKE_INSTALL_PREFIX=${prefix_dirs} \
+     -DCMAKE_C_COMPILER=${c_compiler} -DCMAKE_CXX_COMPILER=${cxx_compiler} \
+     -DCMAKE_PREFIX_PATH=${dependency_dir} -DENABLE_BOOST=${enable_boost} \
+     -DENABLE_COVERAGE=${enable_coverage} -DENABLE_LIBCPP=${enable_clang_lib} ${source_dir} \
++    $CMAKE_EXTRA_FLAGS \
+     || die "failed to configure the project"
+
+ echo 'bootstrap success. Run "make" to build.'
+