From a37c01bd7360d1a3b502eb754156f2feaf8801e3 Mon Sep 17 00:00:00 2001 From: a1i3nj03 <36484251+a1i3nj03@users.noreply.github.com> Date: Sat, 21 Apr 2018 05:40:37 -0400 Subject: [PATCH] Improvemnts that work?! Any issues should be reported, or accepted. --- JHA/cuda_jha_keccak512.cu | 9 +- algos.h | 2 - api.cpp | 5 +- ccminer.cpp | 34 +- ccminer.vcxproj | 2 - ccminer.vcxproj.filters | 2 - cuda_checkhash.cu | 63 +- cuda_helper.h | 2 - cuda_helper_alexis.h | 1 + miner.h | 3 - quark/cuda_bmw512.cu | 34 +- quark/cuda_bmw512_sm3.cuh | 6 +- quark/cuda_jh512.cu | 12 +- quark/cuda_quark.h | 4 +- quark/cuda_quark_blake512.cu | 18 +- quark/cuda_quark_blake512_sp.cuh | 17 +- quark/cuda_quark_groestl512.cu | 19 +- quark/cuda_quark_keccak512.cu | 11 +- quark/cuda_skein512.cu | 13 +- quark/groestl_transf_quad_a1_min3r.cuh | 1 - qubit/qubit_luffa512_alexis.cu | 16 +- res/ccminer.aps | Bin 101972 -> 101944 bytes util.cpp | 3 - x11/cuda_x11_aes_alexis.cuh | 7 +- x11/cuda_x11_cubehash512.cu | 291 +--- x11/cuda_x11_echo.cu | 6 +- x11/cuda_x11_echo_aes.cuh | 1088 ++++++------- x11/cuda_x11_echo_alexis.cu | 19 +- x11/cuda_x11_luffa512.cu | 6 +- x11/cuda_x11_luffa512_Cubehash.cu | 8 +- x11/cuda_x11_shavite512.cu | 9 +- x11/cuda_x11_shavite512_alexis.cu | 8 +- x11/cuda_x11_simd512.cu | 784 +++++++++- x11/cuda_x11_simd512_func.cuh | 1937 ++++++++++++++++-------- x11/cuda_x11_simd512_sm2.cuh | 10 +- x13/cuda_x13_fugue512_alexis.cu | 12 +- x13/cuda_x13_hamsi512.cu | 13 +- x13/cuda_x13_hamsi512_alexis.cu | 6 +- x15/cuda_x14_shabal512.cu | 6 +- x15/cuda_x14_shabal512_alexis.cu | 4 +- x15/cuda_x15_whirlpool.cu | 9 +- x15/cuda_x15_whirlpool_sm3.cu | 15 +- x16/x16s.cu | 221 +-- x16r/cuda_x16_echo512.cu | 7 +- x16r/cuda_x16_fugue512.cu | 7 +- x16r/cuda_x16_shabal512.cu | 7 +- x16r/cuda_x16_simd512_80.cu | 7 +- x16r/x16r.cu | 427 +++--- x17/cuda_x17_sha512.cu | 10 +- 49 files changed, 3021 insertions(+), 2180 deletions(-) diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index 70f3d0b860..a47fec181d 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -1,7 +1,7 @@ #include #include -#include "cuda_helper_alexis.h" +#include "cuda_helper.h" #include "miner.h" // ZR5 @@ -478,11 +478,8 @@ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) } __global__ -void jackpot_keccak512_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -532,7 +529,7 @@ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNoun size_t shared_size = 0; - jackpot_keccak512_gpu_hash << > >(thr_id, threads, startNounce, (uint64_t*)d_hash); + jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/algos.h b/algos.h index 800f602857..014b4cbf1b 100644 --- a/algos.h +++ b/algos.h @@ -60,7 +60,6 @@ enum sha_algos { ALGO_X14, ALGO_X15, ALGO_X16R, - ALGO_X16S, ALGO_X17, ALGO_VANILLA, ALGO_VELTOR, @@ -131,7 +130,6 @@ static const char *algo_names[] = { "x14", "x15", "x16r", - "x16s", "x17", "vanilla", "veltor", diff --git a/api.cpp b/api.cpp index dc57a35534..cd11fe93a2 100644 --- a/api.cpp +++ b/api.cpp @@ -1252,7 +1252,7 @@ static void api() char *wskey = NULL; n = recv(c, &buf[0], SOCK_REC_BUFSZ, 0); - fail = SOCKETFAIL(n) || 
n < 0; + fail = SOCKETFAIL(n); if (fail) buf[0] = '\0'; else if (n > 0 && buf[n-1] == '\n') { @@ -1261,8 +1261,7 @@ static void api() if (n > 0 && buf[n-1] == '\r') buf[n-1] = '\0'; } - else - buf[n] = '\0'; + buf[n] = '\0'; //if (opt_debug && opt_protocol && n > 0) // applog(LOG_DEBUG, "API: recv command: (%d) '%s'+char(%x)", n, buf, buf[n-1]); diff --git a/ccminer.cpp b/ccminer.cpp index 0bb8fa4e01..2dd6727bb4 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -108,7 +108,7 @@ bool use_colors = true; int use_pok = 0; static bool opt_background = false; bool opt_quiet = false; -int opt_maxlograte = 3; +int opt_maxlograte = 5;//3; static int opt_retries = -1; static int opt_fail_pause = 30; int opt_time_limit = -1; @@ -147,7 +147,6 @@ int32_t device_led[MAX_GPUS] = { -1, -1 }; int opt_led_mode = 0; int opt_cudaschedule = -1; static bool opt_keep_clocks = false; -extern "C" volatile int *volatile d_ark = NULL; // un-linked to cmdline scrypt options (useless) int device_batchsize[MAX_GPUS] = { 0 }; @@ -302,7 +301,6 @@ Options:\n\ x14 X14\n\ x15 X15\n\ x16r X16R (Raven)\n\ - x16s X16S\n\ x17 X17\n\ wildkeccak Boolberry\n\ zr5 ZR5 (ZiftrCoin)\n\ @@ -685,25 +683,24 @@ static void calc_network_diff(struct work *work) int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 uint64_t diffone = 0x0000FFFF00000000ull; - + /* double d = (double)0x0000ffff / (double)bits; for (int m=shift; m < 29; m++) d *= 256.0; for (int m=29; m < shift; m++) d /= 256.0; + */ - /* uint32_t d = 0x0000ffff / bits; for (int m = shift; m < 29; m++) d <<= 8; for (int m = 29; m < shift; m++) d >>= 8; - */ + // if (opt_algo == ALGO_DECRED && shift == 28) d *= 256.0; if (opt_debug_diff) -// applog(LOG_DEBUG, "net diff: %u -> shift %u, bits %08x", d, shift, bits); - applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); + applog(LOG_DEBUG, "net diff: %u -> shift %u, bits %08x", d, shift, bits); +// applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); - net_diff = d; -// net_diff = (double)d; + net_diff = (double)d; } /* decode data from getwork (wallets and longpoll pools) */ @@ -1758,7 +1755,6 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) // case ALGO_TIMETRAVEL: // case ALGO_BITCORE: case ALGO_X16R: -// case ALGO_X16S: work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));//(256.0 * opt_difficulty)); break; #if 0 @@ -1785,12 +1781,13 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) sctx->job.clean = 1; //!!! 
return true; } + __host__ extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); void restart_threads(void) { if (opt_debug && !opt_quiet) - applog(LOG_DEBUG,"%s", __FUNCTION__); + applog(LOG_DEBUG, "%s", __FUNCTION__); // restart mining thread IRL for (int i = 0; i < opt_n_threads && work_restart; i++) { @@ -2511,9 +2508,6 @@ static void *miner_thread(void *userdata) rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done); break; #endif - case ALGO_X16S: -// rc = scanhash_x16s(thr_id, &work, max_nonce, &hashes_done); - break; case ALGO_X16R: // try{ rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done); @@ -2656,8 +2650,8 @@ static void *miner_thread(void *userdata) work.submit_nonce_id = 0; nonceptr[0] = work.nonces[0]; - if (work_restart[thr_id].restart) - continue; +// if (work_restart[thr_id].restart) +// continue; if (!submit_work(mythr, &work)) break; nonceptr[0] = curnonce; @@ -2682,8 +2676,8 @@ static void *miner_thread(void *userdata) work.data[22] = 0; } #endif - if (work_restart[thr_id].restart) - continue; +// if (work_restart[thr_id].restart) +// continue; if (!submit_work(mythr, &work)) break; nonceptr[0] = curnonce; @@ -3960,7 +3954,7 @@ int main(int argc, char *argv[]) " `!!!!!!!!!!!!!!'\n" " `\\!!!!!!!!!~\n" "(Credit to http://www.asciiworld.com/-Aliens,128-.html )\n"); - if (!opt_quiet) { + if (!opt_quiet) { const char* arch = is_x64() ? "64-bits" : "32-bits"; #ifdef _MSC_VER printf(" Built with VC++ %d and nVidia CUDA SDK %d.%d %s\n\n", msver(), diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 954816bff6..942edcaa5e 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -269,7 +269,6 @@ - @@ -464,7 +463,6 @@ - diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index d0e8a2c782..5dbb0b0a06 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -611,7 +611,6 @@ - @@ -1017,7 +1016,6 @@ Source Files\CUDA\x13 - diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index 0277732241..2d7fdddf76 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -192,71 +192,10 @@ void cuda_checkhash_32(uint32_t threads, uint32_t startNounce, uint32_t *hash, u } } -cudaError_t MyStreamSynchronize(cudaStream_t stream, uint32_t situation, int thr_id) -{ - cudaError_t result = cudaSuccess; - if (abort_flag) - return result; - if (situation >= 0) - { - if (cudaStreamQuery(stream) == cudaErrorNotReady) - { - while ((work_restart[thr_id].restart == 0) && cudaStreamQuery(stream) == cudaErrorNotReady) - { - usleep((useconds_t)(1000)); - } - if (work_restart[thr_id].restart) - return cudaErrorInvalidDevice; - result = cudaStreamSynchronize(stream); - } - } - else - result = cudaStreamSynchronize(stream); - return result; -} -/* -uint32_t glhf; -__host__ -void chk(int thr_id) -{ - int size = 128; - int* h_val = (int*)malloc(sizeof(int)*size); - bool * h_flag = new bool; - *h_flag = true; - - bool* d_flag; - cudaMalloc(&d_flag, sizeof(bool)); - cudaMemcpy(d_flag, h_flag, 1, cudaMemcpyHostToDevice); - - int* d_val; - cudaMalloc(&d_val, sizeof(int)*size); - - for (int i = 0; i> >(d_flag, d_val, size); - - //--------------sleep for a while -------------------------- - - *h_flag = false; - cudaMemcpy(d_flag, h_flag, 1, cudaMemcpyHostToDevice); - - glhf = 0 - cudaMemcpy(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); - // -} -*/ __host__ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash) { cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); -// if (MyStreamSynchronize(NULL, (uint32_t)1, thr_id) == 
cudaErrorInvalidDevice) -// return 0; const uint32_t threadsperblock = 512; @@ -272,7 +211,7 @@ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uin } cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); -// cudaThreadSynchronize(); + cudaThreadSynchronize(); cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); return h_resNonces[thr_id][0]; diff --git a/cuda_helper.h b/cuda_helper.h index cc498af635..a3885d6f09 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -200,8 +200,6 @@ do { \ } \ } while (0) -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, uint32_t situation, int thr_id); - /*********************************************************************/ #if !defined(__CUDA_ARCH__) || defined(_WIN64) #define USE_XOR_ASM_OPTS 0 diff --git a/cuda_helper_alexis.h b/cuda_helper_alexis.h index affa560c7b..2a6e50cbc8 100644 --- a/cuda_helper_alexis.h +++ b/cuda_helper_alexis.h @@ -536,6 +536,7 @@ static __device__ __forceinline__ uint2 operator* (const uint2 a,const uint2 b){ : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; } + // uint2 ROR/ROL methods __device__ __forceinline__ uint2 ROR2(const uint2 a, const uint32_t offset){ diff --git a/miner.h b/miner.h index ab51d5c0e6..a883a3380c 100644 --- a/miner.h +++ b/miner.h @@ -331,7 +331,6 @@ extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsig extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -397,7 +396,6 @@ extern void free_x13(int thr_id); extern void free_x14(int thr_id); extern void free_x15(int thr_id); extern void free_x16r(int thr_id); -extern void free_x16s(int thr_id); extern void free_x17(int thr_id); extern void free_zr5(int thr_id); //extern void free_sha256d(int thr_id); @@ -944,7 +942,6 @@ void x13hash(void *output, const void *input); void x14hash(void *output, const void *input); void x15hash(void *output, const void *input); void x16r_hash(void *output, const void *input); -void x16s_hash(void *output, const void *input); void x17hash(void *output, const void *input); void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize); void zr5hash(void *output, const void *input); diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index e8faac4a91..9622d60343 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -8,7 +8,7 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) -//#include "cuda_bmw512_sm3.cuh" +#include "cuda_bmw512_sm3.cuh" #ifdef __INTELLISENSE__ /* just for vstudio code colors */ @@ -324,10 +324,9 @@ __launch_bounds__(64, 8) #endif void quark_bmw512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { //uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); @@ -393,11 +392,8 @@ void quark_bmw512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) } __global__ __launch_bounds__(256, 2) -void quark_bmw512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -467,16 +463,16 @@ void quark_bmw512_cpu_setBlock_80(void *pdata) __host__ void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) // , int order) -{ +{ const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); -// int dev_id = device_map[thr_id]; + int dev_id = device_map[thr_id]; -// if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - quark_bmw512_gpu_hash_80 << > >(thr_id, threads, startNounce, (uint64_t*)d_hash); -// else -// quark_bmw512_gpu_hash_80_30<<>>(threads, startNounce, (uint64_t*)d_hash); + if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) + quark_bmw512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_hash); + else + quark_bmw512_gpu_hash_80_30<<>>(threads, startNounce, (uint64_t*)d_hash); } __host__ @@ -484,7 +480,7 @@ void quark_bmw512_cpu_init(int thr_id, uint32_t threads) { cuda_get_arch(thr_id); } - + __host__ void quark_bmw512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) { @@ -492,9 +488,9 @@ void quark_bmw512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); -// int dev_id = device_map[thr_id]; -// if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - quark_bmw512_gpu_hash_64 << > >(thr_id, threads, (uint64_t*)d_hash); -// else -// quark_bmw512_gpu_hash_64_30<<>>(threads, (uint64_t*)d_hash); + int dev_id = device_map[((uintptr_t)thr_id) & 15]; + if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) + quark_bmw512_gpu_hash_64<<>>(thr_id, threads, (uint64_t*)d_hash); + else + quark_bmw512_gpu_hash_64_30<<>>(thr_id, threads, (uint64_t*)d_hash); } diff --git a/quark/cuda_bmw512_sm3.cuh b/quark/cuda_bmw512_sm3.cuh index faa314e4f7..057e04031b 100644 --- a/quark/cuda_bmw512_sm3.cuh +++ b/quark/cuda_bmw512_sm3.cuh @@ -157,8 +157,10 @@ void Compression512_30(uint64_t *msg, uint64_t *hash) } __global__ -void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint64_t *g_hash) +void quark_bmw512_gpu_hash_64_30(int *thr_id, uint32_t threads, uint64_t *g_hash) { + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -265,7 +267,7 @@ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_ } #else /* stripped stubs for other archs */ -__global__ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint64_t *g_hash) {} +__global__ void quark_bmw512_gpu_hash_64_30(int *thr_id, uint32_t threads, uint64_t *g_hash) {} __global__ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} #endif diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu index 8af03d5201..a46e2e0633 100644 --- a/quark/cuda_jh512.cu +++ b/quark/cuda_jh512.cu @@ -279,10 +279,9 @@ __global__ //__launch_bounds__(256,2) void quark_jh512_gpu_hash_64(int *thr_id, 
const uint32_t threads, uint32_t* g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { //const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); @@ -414,7 +413,7 @@ void jh512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint3 AS_UINT4(&Hash[12]) = AS_UINT4(&x[7][0]); } } - + __host__ void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) { @@ -433,11 +432,8 @@ __constant__ static uint32_t c_JHState[32]; __constant__ static uint32_t c_Message[4]; __global__ -void jh512_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash) +void jh512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -489,7 +485,7 @@ void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - jh512_gpu_hash_80 << > > (thr_id, threads, startNounce, d_hash); + jh512_gpu_hash_80 <<>> (threads, startNounce, d_hash); } extern "C" { diff --git a/quark/cuda_quark.h b/quark/cuda_quark.h index 2a70e949a7..14f473b6aa 100644 --- a/quark/cuda_quark.h +++ b/quark/cuda_quark.h @@ -28,8 +28,8 @@ extern void quark_jh512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_h extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); extern void quark_compactTest_cpu_free(int thr_id); extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order); + uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order); extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, int order); + uint32_t *d_nonces1, uint32_t *nrm1, int order); extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index b68aa21564..5532a85d61 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -118,12 +118,11 @@ void quark_blake512_compress(uint64_t *h, const uint64_t *block, const uint8_t ( __global__ __launch_bounds__(256, 4) void quark_blake512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; #if !defined(SP_KERNEL) || __CUDA_ARCH__ < 500 uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - #if USE_SHUFFLE const uint32_t warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke @@ -188,12 +187,9 @@ void quark_blake512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) } __global__ __launch_bounds__(256,4) -void 
quark_blake512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHash) +void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) { //#if !defined(SP_KERNEL) || __CUDA_ARCH__ < 500 -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -242,9 +238,9 @@ void quark_blake512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun __host__ void quark_blake512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_outputHash) -{ +{ #ifdef SP_KERNEL - int dev_id = device_map[((uint64_t)thr_id)&15]; + int dev_id = device_map[((uintptr_t)thr_id) & 15]; if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) quark_blake512_cpu_hash_64_sp(thr_id, threads, d_outputHash); else @@ -260,11 +256,11 @@ void quark_blake512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_outpu __host__ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) -{ +{ #ifdef SP_KERNEL int dev_id = device_map[thr_id]; if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) - quark_blake512_cpu_hash_80_sp(thr_id, threads, startNounce, d_outputHash); + quark_blake512_cpu_hash_80_sp(threads, startNounce, d_outputHash); else #endif { @@ -272,7 +268,7 @@ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_blake512_gpu_hash_80 << > >(thr_id, threads, startNounce, d_outputHash); + quark_blake512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); } } diff --git a/quark/cuda_quark_blake512_sp.cuh b/quark/cuda_quark_blake512_sp.cuh index f85ef1d319..a17178362f 100644 --- a/quark/cuda_quark_blake512_sp.cuh +++ b/quark/cuda_quark_blake512_sp.cuh @@ -5,7 +5,6 @@ #include "miner.h" // Should stay outside the ifdef on WIN64 (wtf) -#include "cuda_helper_alexis.h" #include "cuda_vector_uint2x4.h" __constant__ static uint2 c_PaddedM[16]; __constant__ static uint2x4 c_Hostprecalc[4]; @@ -93,11 +92,10 @@ __launch_bounds__(256, 1) #endif void quark_blake512_gpu_hash_64_sp(int *thr_id, uint32_t threads, uint2* g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { // const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); @@ -338,11 +336,8 @@ void quark_blake512_gpu_hash_64_sp(int *thr_id, uint32_t threads, uint2* g_hash) __global__ __launch_bounds__(128, 8) -void quark_blake512_gpu_hash_80_sp(int thr_id, uint32_t threads, uint32_t startNounce, uint2 *outputHash) +void quark_blake512_gpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint2 *outputHash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -654,8 +649,8 @@ __host__ void quark_blake512_cpu_setBlock_80_sp(int thr_id, uint64_t *pdata) #else // __CUDA_ARCH__ < 500 __host__ void quark_blake512_cpu_setBlock_80_sp(int thr_id, uint64_t *pdata) {} -__global__ void quark_blake512_gpu_hash_64_sp(int*, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) {} -__global__ void quark_blake512_gpu_hash_80_sp(int*, uint32_t startNounce, uint2 *outputHash) {} +__global__ void quark_blake512_gpu_hash_64_sp(uint32_t, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) {} +__global__ void quark_blake512_gpu_hash_80_sp(uint32_t, uint32_t startNounce, uint2 *outputHash) {} #endif __host__ @@ -668,10 +663,10 @@ void quark_blake512_cpu_hash_64_sp(int *thr_id, uint32_t threads, uint32_t *d_ou } __host__ -void quark_blake512_cpu_hash_80_sp(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) +void quark_blake512_cpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { const uint32_t threadsperblock = 64; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - quark_blake512_gpu_hash_80_sp << > >(thr_id, threads, startNounce, (uint2*)d_outputHash); + quark_blake512_gpu_hash_80_sp <<>>(threads, startNounce, (uint2*)d_outputHash); } diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index 5b5d4ddaba..61f6029953 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -38,10 +38,8 @@ __global__ __launch_bounds__(TPB, THF) //const uint32_t startNounce, void quark_groestl512_gpu_hash_64_quad_a1_min3r(int *thr_id, const uint32_t threads, uint4* g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; - - #if __CUDA_ARCH__ >= 300 // BEWARE : 4-WAY CODE (one hash need 4 threads) const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); // >> 2; // done on cpu @@ -189,7 +187,7 @@ void quark_groestl512_gpu_hash_64_quad_a1_min3r(int *thr_id, const uint32_t thre } __global__ __launch_bounds__(TPB, THF) -void quark_groestl512_gpu_hash_64_quad(const uint32_t threads, const uint32_t startNounce, uint32_t * g_hash, uint32_t * __restrict g_nonceVector) +void quark_groestl512_gpu_hash_64_quad(int *thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t * g_hash, uint32_t * __restrict g_nonceVector) { //! 
fixme please #if 0 // __CUDA_ARCH__ >= 300 @@ -263,7 +261,7 @@ void quark_groestl512_cpu_free(int thr_id) // if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) // quark_groestl512_sm20_free(thr_id); } - + __host__ void quark_groestl512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) { @@ -302,15 +300,10 @@ void groestl512_setBlock_80(int thr_id, uint32_t *endiandata) } __global__ __launch_bounds__(TPB, THF) -void groestl512_gpu_hash_80_quad_a1_min3r(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint4* g_hash) +void groestl512_gpu_hash_80_quad_a1_min3r(const uint32_t threads, const uint32_t startNounce, uint4* g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - #if __CUDA_ARCH__ >= 300 // BEWARE : 4-WAY CODE (one hash need 4 threads) - - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); // >> 2; // done on cpu if (thread < threads) @@ -437,7 +430,7 @@ void groestl512_gpu_hash_80_quad(const uint32_t threads, const uint32_t startNou __host__ void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) -{ +{ // int dev_id = device_map[thr_id]; // if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) { @@ -447,7 +440,7 @@ void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uin dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); //! setup only for x16r(s?) - groestl512_gpu_hash_80_quad_a1_min3r << > > ( thr_id, threads << 2, startNounce, (uint4*)d_hash); + groestl512_gpu_hash_80_quad_a1_min3r <<>> (threads << 2, startNounce, (uint4*)d_hash); // groestl512_gpu_hash_80_quad<< > > (threads, startNounce, d_hash); /* diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu index 78ca86d154..993e95660f 100644 --- a/quark/cuda_quark_keccak512.cu +++ b/quark/cuda_quark_keccak512.cu @@ -98,11 +98,9 @@ static void keccak_block(uint2 *s) __global__ void quark_keccak512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { //uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); @@ -201,10 +199,9 @@ static void keccak_block_v30(uint64_t *s, const uint32_t *in) __global__ void quark_keccak512_gpu_hash_64_v30(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { //uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); @@ -239,7 +236,7 @@ void quark_keccak512_gpu_hash_64_v30(int *thr_id, uint32_t threads, uint64_t *g_ outpHash[i] = hash[i]; } } - + __host__ void quark_keccak512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) { @@ -248,7 +245,7 @@ void quark_keccak512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - int dev_id = device_map[((uint64_t)thr_id) & 15]; + int dev_id = device_map[((uintptr_t)thr_id) & 15]; if (device_sm[dev_id] >= 320) quark_keccak512_gpu_hash_64 << > >(thr_id, threads, (uint64_t*)d_hash); diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index 0d30cd3db7..90e8f47d24 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -468,7 +468,7 @@ __launch_bounds__(TPB50, 5) #endif void quark_skein512_gpu_hash_64(int *thr_id, const uint32_t threads, uint64_t* __restrict__ g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -758,13 +758,13 @@ void quark_skein512_gpu_hash_64(int *thr_id, const uint32_t threads, uint64_t* _ #undef h7 } } - + __host__ //void quark_skein512_cpu_hash_64(int thr_id,uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_hash) void quark_skein512_cpu_hash_64(int *thr_id, const uint32_t threads, uint32_t *d_hash) { uint32_t tpb = TPB52; - int dev_id = device_map[((uint64_t)thr_id) & 15]; + int dev_id = device_map[((uintptr_t)thr_id) & 15]; if (device_sm[dev_id] <= 500) tpb = TPB50; const dim3 grid((threads + tpb-1)/tpb); @@ -782,11 +782,8 @@ __launch_bounds__(TPB52, 3) #else __launch_bounds__(TPB50, 5) #endif -void skein512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *output64) +void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *output64) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -957,7 +954,7 @@ void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, ui const dim3 block(tpb); // hash function is cut in 2 parts to reduce kernel size - skein512_gpu_hash_80 << < grid, block >> > (thr_id, threads, startNounce, (uint64_t*)d_hash); + skein512_gpu_hash_80 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash); } __host__ diff --git a/quark/groestl_transf_quad_a1_min3r.cuh b/quark/groestl_transf_quad_a1_min3r.cuh index 6ee9e735e1..754bdd069d 100644 --- a/quark/groestl_transf_quad_a1_min3r.cuh +++ b/quark/groestl_transf_quad_a1_min3r.cuh @@ -100,7 +100,6 @@ other[i].__X = (__byte_perm(other[i].__X, 0, 0x1032) & -(threadIdx.x & 1)) | (in input[i].__X = __shfl((int)input[i].__X, n ^ (3 & -(n < 1 || n > 2)), 4);\ input[i].__X = __shfl((int)input[i].__X, n ^ (3 & -(n >= 1 && n <= 2)), 4);\ -input[i].__X = __shfl((int)input[i].__X, n ^ (3 & ((n >= 1 && n <= 2) | ((n >= 1 && n <= 2)<<1), 4);\ */ //input[i].__X = (__byte_perm(input[i].__X, 0, 0x1032) & (-(threadIdx.x & 1) | (-(threadIdx.x & 1) & input[i].__X)); //other[i].__X = (__byte_perm(other[i].__X, 0, 0x1032) & (-(threadIdx.x & 1) | (-(threadIdx.x & 1) & input[i].__X)); diff --git a/qubit/qubit_luffa512_alexis.cu b/qubit/qubit_luffa512_alexis.cu index 0cc6ed6f90..bccd10e4c1 100644 --- a/qubit/qubit_luffa512_alexis.cu +++ 
b/qubit/qubit_luffa512_alexis.cu @@ -3,7 +3,7 @@ */ #include -#include "cuda_helper_alexis.h" +#include "cuda_helper.h" #include "cuda_vectors_alexis.h" static unsigned char PaddedMessage[128]; @@ -621,11 +621,8 @@ static void rnd512_nullhash(uint32_t *const __restrict__ state){ __global__ __launch_bounds__(256, 4) -void qubit_luffa512_gpu_hash_80_alexis(int thr_id, const uint32_t threads,const uint32_t startNounce, uint32_t *outputHash) +void qubit_luffa512_gpu_hash_80_alexis(const uint32_t threads,const uint32_t startNounce, uint32_t *outputHash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -678,7 +675,7 @@ void qubit_luffa512_cpu_hash_80_alexis(int thr_id, uint32_t threads, uint32_t st dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - qubit_luffa512_gpu_hash_80_alexis << > > (thr_id, threads, startNounce, d_outputHash); + qubit_luffa512_gpu_hash_80_alexis<<>> (threads, startNounce, d_outputHash); } //#if __CUDA_ARCH__ == 500 @@ -689,12 +686,11 @@ __global__ __launch_bounds__(384,2) void x11_luffa512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *g_hash) { - - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t statebuffer[8]; - + if (thread < threads) { uint32_t statechainv[40] = { @@ -833,7 +829,7 @@ void qubit_luffa512_cpu_setBlock_80_alexis(void *pdata) CUDA_SAFE_CALL(cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); qubit_cpu_precalc(); } - + __host__ void x11_luffa512_cpu_hash_64_alexis(int *thr_id, uint32_t threads,uint32_t *d_hash) { diff --git a/res/ccminer.aps b/res/ccminer.aps index 61a3609f2e98a6037de86f6d9b6af3cc158475cb..fb2f903b74b7bc5582392b2f9b6189b4682d9c26 100644 GIT binary patch delta 258 zcmcaIhi%6kwh0Q13KJETfI*MJ5QL}KcQGDiW?1#ojisF!N*OX4au`w=iWy=U5*Z8`a)JCj24jYz>A78u zM;UibH|=Ju!CoNsV7eiQS}{EkL={Xg1W`YxPXtjZ p(>H=Bo9P#U6#ECDc_Cel)AhR;MW^%hF>> 1; - - const uint32_t even = (threadIdx.x & 1); - - if (thread < threads){ - uint32_t *Hash = (uint32_t*)&g_hash[8 * thread + 2 * even]; - - uint32_t x[16]; - - if (even == 0){ - x[0] = 0x2AEA2A61; x[1] = 0x50F494D4; x[2] = 0x2D538B8B; x[3] = 0x4167D83E; - x[4] = 0x4D42C787; x[5] = 0xA647A8B3; x[6] = 0x97CF0BEF; x[7] = 0x825B4537; - x[8] = 0xFCD398D9; x[9] = 0x148FE485; x[10] = 0x1B017BEF; x[11] = 0xB6444532; - x[12] = 0xD65C8A2B; x[13] = 0xA5A70E75; x[14] = 0xB1C62456; x[15] = 0xBC796576; - } - else{ - x[0] = 0x3FEE2313; x[1] = 0xC701CF8C; x[2] = 0xCC39968E; x[3] = 0x50AC5695; - x[4] = 0xEEF864D2; x[5] = 0xF22090C4; x[6] = 0xD0E5CD33; x[7] = 0xA23911AE; - x[8] = 0x6A536159; x[9] = 0x2FF5781C; x[10] = 0x91FA7934; x[11] = 0x0DBADEA9; - x[12] = 0x1921C8F7; x[13] = 0xE7989AF1; x[14] = 0x7795D246; x[15] = 0xD43E3B44; - } - *(uint4*)&x[0] ^= __ldg((uint4*)&Hash[0]); - rrounds(x); - - *(uint4*)&x[0] ^= __ldg((uint4*)&Hash[8]); - - rrounds(x); - - if (!even) - x[0] ^= 0x80; - - rrounds(x); - /* "the integer 1 is xored into the last state word x_11111" */ - if (even) - x[15] ^= 1; - -#pragma unroll 10 - for (int i = 0; i < 10; ++i) - rrounds(x); - - *(uint4*)&Hash[0] = *(uint4*)&x[0]; - *(uint4*)&Hash[8] = *(uint4*)&x[4]; - // g_hash[thread + (2*even+0) * threads] = *(uint2*)&x[ 0]; - // g_hash[thread + 
(2*even+1) * threads] = *(uint2*)&x[ 2]; - } -} -__host__ -void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){ - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((2 * threads + TPB - 1) / TPB); - dim3 block(TPB); - - x11_cubehash512_gpu_hash_64 << > >(threads, (uint64_t*)d_hash); - -} - -#else - -#define TPB 768 - -__device__ __forceinline__ -static void rrounds(uint32_t *x){ -#pragma unroll 2 - for (int r = 0; r < 16; r++) { - /* "add x_0jklm into x_1jklmn modulo 2^32 rotate x_0jklm upwards by 7 bits" */ - x[16] = x[16] + x[0]; x[0] = ROTL32(x[0], 7); x[17] = x[17] + x[1]; x[1] = ROTL32(x[1], 7); - x[18] = x[18] + x[2]; x[2] = ROTL32(x[2], 7); x[19] = x[19] + x[3]; x[3] = ROTL32(x[3], 7); - x[20] = x[20] + x[4]; x[4] = ROTL32(x[4], 7); x[21] = x[21] + x[5]; x[5] = ROTL32(x[5], 7); - x[22] = x[22] + x[6]; x[6] = ROTL32(x[6], 7); x[23] = x[23] + x[7]; x[7] = ROTL32(x[7], 7); - x[24] = x[24] + x[8]; x[8] = ROTL32(x[8], 7); x[25] = x[25] + x[9]; x[9] = ROTL32(x[9], 7); - x[26] = x[26] + x[10]; x[10] = ROTL32(x[10], 7); x[27] = x[27] + x[11]; x[11] = ROTL32(x[11], 7); - x[28] = x[28] + x[12]; x[12] = ROTL32(x[12], 7); x[29] = x[29] + x[13]; x[13] = ROTL32(x[13], 7); - x[30] = x[30] + x[14]; x[14] = ROTL32(x[14], 7); x[31] = x[31] + x[15]; x[15] = ROTL32(x[15], 7); - /* "swap x_00klm with x_01klm" */ - SWAP(x[0], x[8]); x[0] ^= x[16]; x[8] ^= x[24]; SWAP(x[1], x[9]); x[1] ^= x[17]; x[9] ^= x[25]; - SWAP(x[2], x[10]); x[2] ^= x[18]; x[10] ^= x[26]; SWAP(x[3], x[11]); x[3] ^= x[19]; x[11] ^= x[27]; - SWAP(x[4], x[12]); x[4] ^= x[20]; x[12] ^= x[28]; SWAP(x[5], x[13]); x[5] ^= x[21]; x[13] ^= x[29]; - SWAP(x[6], x[14]); x[6] ^= x[22]; x[14] ^= x[30]; SWAP(x[7], x[15]); x[7] ^= x[23]; x[15] ^= x[31]; - /* "swap x_1jk0m with x_1jk1m" */ - SWAP(x[16], x[18]); SWAP(x[17], x[19]); SWAP(x[20], x[22]); SWAP(x[21], x[23]); SWAP(x[24], x[26]); SWAP(x[25], x[27]); SWAP(x[28], x[30]); SWAP(x[29], x[31]); - /* "add x_0jklm into x_1jklm modulo 2^32 rotate x_0jklm upwards by 11 bits" */ - x[16] = x[16] + x[0]; x[0] = ROTL32(x[0], 11); x[17] = x[17] + x[1]; x[1] = ROTL32(x[1], 11); - x[18] = x[18] + x[2]; x[2] = ROTL32(x[2], 11); x[19] = x[19] + x[3]; x[3] = ROTL32(x[3], 11); - x[20] = x[20] + x[4]; x[4] = ROTL32(x[4], 11); x[21] = x[21] + x[5]; x[5] = ROTL32(x[5], 11); - x[22] = x[22] + x[6]; x[6] = ROTL32(x[6], 11); x[23] = x[23] + x[7]; x[7] = ROTL32(x[7], 11); - x[24] = x[24] + x[8]; x[8] = ROTL32(x[8], 11); x[25] = x[25] + x[9]; x[9] = ROTL32(x[9], 11); - x[26] = x[26] + x[10]; x[10] = ROTL32(x[10], 11); x[27] = x[27] + x[11]; x[11] = ROTL32(x[11], 11); - x[28] = x[28] + x[12]; x[12] = ROTL32(x[12], 11); x[29] = x[29] + x[13]; x[13] = ROTL32(x[13], 11); - x[30] = x[30] + x[14]; x[14] = ROTL32(x[14], 11); x[31] = x[31] + x[15]; x[15] = ROTL32(x[15], 11); - /* "swap x_0j0lm with x_0j1lm" */ - SWAP(x[0], x[4]); x[0] ^= x[16]; x[4] ^= x[20]; SWAP(x[1], x[5]); x[1] ^= x[17]; x[5] ^= x[21]; - SWAP(x[2], x[6]); x[2] ^= x[18]; x[6] ^= x[22]; SWAP(x[3], x[7]); x[3] ^= x[19]; x[7] ^= x[23]; - SWAP(x[8], x[12]); x[8] ^= x[24]; x[12] ^= x[28]; SWAP(x[9], x[13]); x[9] ^= x[25]; x[13] ^= x[29]; - SWAP(x[10], x[14]); x[10] ^= x[26]; x[14] ^= x[30]; SWAP(x[11], x[15]); x[11] ^= x[27]; x[15] ^= x[31]; - /* "swap x_1jkl0 with x_1jkl1" */ - SWAP(x[16], x[17]); SWAP(x[18], x[19]); SWAP(x[20], x[21]); SWAP(x[22], x[23]); SWAP(x[24], x[25]); SWAP(x[26], x[27]); SWAP(x[28], x[29]); SWAP(x[30], x[31]); - } -} - -/***************************************************/ -// GPU Hash Function 
-__global__ __launch_bounds__(TPB) -void x11_cubehash512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) -{ - - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) - return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - - if (thread < threads){ - - uint32_t *Hash = (uint32_t*)&g_hash[8 * thread]; - - uint32_t x[32] = { - 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, - 0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695, - 0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537, - 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE, - 0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532, - 0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, - 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576, - 0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44 - }; - - // erste Hälfte des Hashes (32 bytes) - //Update32(x, (const BitSequence*)Hash); - *(uint2x4*)&x[0] ^= __ldg4((uint2x4*)&Hash[0]); - - rrounds(x); - - // zweite Hälfte des Hashes (32 bytes) - // Update32(x, (const BitSequence*)(Hash+8)); - *(uint2x4*)&x[0] ^= __ldg4((uint2x4*)&Hash[8]); - - rrounds(x); - - // Padding Block - x[0] ^= 0x80; - rrounds(x); - - // Final(x, (BitSequence*)Hash); - x[31] ^= 1; - - /* "the state is then transformed invertibly through 10r identical rounds" */ -#pragma unroll 10 - for (int i = 0; i < 10; ++i) - rrounds(x); - - /* "output the first h/8 bytes of the state" */ - *(uint2x4*)&Hash[0] = *(uint2x4*)&x[0]; - *(uint2x4*)&Hash[8] = *(uint2x4*)&x[8]; - } -} - - -__host__ -void x11_cubehash512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash){ - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + TPB - 1) / TPB); - dim3 block(TPB); - - x11_cubehash512_gpu_hash_64 << > >(thr_id, threads, (uint64_t*)d_hash); - -} -#endif - -// - __device__ __constant__ static const uint32_t c_IV_512[32] = { 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, @@ -456,6 +214,48 @@ static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) /***************************************************/ +__global__ +void x11_cubehash512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + //uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + int hashPosition = thread;//nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + + uint32_t x[2][2][2][2][2]; + Init(x); + + Update32(x, &Hash[0]); + Update32(x, &Hash[8]); + + // Padding Block + uint32_t last[8]; + last[0] = 0x80; + #pragma unroll 7 + for (int i=1; i < 8; i++) last[i] = 0; + Update32(x, last); + + Final(x, Hash); + } +} + +__host__ +void x11_cubehash512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 0; + + x11_cubehash512_gpu_hash_64 << > >(thr_id, threads, (uint64_t*)d_hash); +} __host__ void x11_cubehash512_cpu_init(int thr_id, uint32_t threads) { } @@ -476,11 +276,8 @@ void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata) } __global__ -void cubehash512_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash) +void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -521,7 +318,7 @@ void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const ui dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - cubehash512_gpu_hash_80 << > > (thr_id, threads, startNounce, (uint64_t*)d_hash); + cubehash512_gpu_hash_80 <<>> (threads, startNounce, (uint64_t*) d_hash); } #endif \ No newline at end of file diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index 3cd9f4685c..6ce3d8e993 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -299,13 +299,13 @@ void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g cuda_echo_round(sharedMemory, Hash); } } -/* + __host__ -void x11_echo512_cpu_init(int thr_id, uint32_t threads) +void X11_echo512_cpu_init(int thr_id, uint32_t threads) { aes_cpu_init(thr_id); } -*/ + __host__ void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { diff --git a/x11/cuda_x11_echo_aes.cuh b/x11/cuda_x11_echo_aes.cuh index 00ea3aa7b2..79499c201c 100644 --- a/x11/cuda_x11_echo_aes.cuh +++ b/x11/cuda_x11_echo_aes.cuh @@ -1,3 +1,4 @@ +#if 1 #include "miner.h" #include "cuda_vectors_alexis.h" @@ -5,9 +6,9 @@ #define AESx(x) (x ##UL) /* SPH_C32(x) */ //#define DEVICE_DIRECT_CONSTANTS -//#ifndef DEF_OINTMENT + #ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ __align__(64) uint32_t d_AES0[256] = { +__constant__ __align__(64) uint32_t d_AES0[256] = { #else static const uint32_t h_AES0[256] = { #endif @@ -78,149 +79,7 @@ static const uint32_t h_AES0[256] = { }; #ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ __align__(64) uint32_t d_AES1[256] = { -#else -static const uint32_t h_AES1[256] = { -#endif - AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), - AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), - AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), - AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), - AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), - AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), - AESx(0xADAD41EC), AESx(0xD4D4B367), 
AESx(0xA2A25FFD), AESx(0xAFAF45EA), - AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), - AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), - AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), - AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), - AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), - AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), - AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), - AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), - AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), - AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), - AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), - AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), - AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), - AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), - AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), - AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), - AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), - AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), - AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), - AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), - AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), - AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), - AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), - AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), - AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), - AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), - AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), - AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), - AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), - AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), - AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), - AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), - AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), - AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), - AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), - AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), - AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), - AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), - AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), - AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), - AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), - AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), - AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), - AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), - AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), - AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), - AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), - AESx(0x6161C2A3), AESx(0x35356A5F), 
AESx(0x5757AEF9), AESx(0xB9B969D0), - AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), - AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), - AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), - AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), - AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), - AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), - AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), - AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), - AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) -}; - -#ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ __align__(64) uint32_t d_AES2[256] = { -#else -static const uint32_t h_AES2[256] = { -#endif - AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), - AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), - AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), - AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), - AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), - AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), - AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), - AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), - AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), - AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), - AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), - AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), - AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), - AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), - AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), - AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), - AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), - AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), - AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), - AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), - AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), - AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), - AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), - AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), - AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), - AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), - AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), - AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), - AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), - AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), - AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), - AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), - AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), - AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), - AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), - AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), - AESx(0x60C0A060), AESx(0x81199881), 
AESx(0x4F9ED14F), AESx(0xDCA37FDC), - AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), - AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), - AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), - AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), - AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), - AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), - AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), - AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), - AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), - AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), - AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), - AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), - AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), - AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), - AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), - AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), - AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), - AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), - AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), - AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), - AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), - AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), - AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), - AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), - AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), - AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), - AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) -}; - -#ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ __align__(64) uint32_t d_AES3[256] = { +__constant__ __align__(64) uint32_t d_AES3[256] = { #else static const uint32_t h_AES3[256] = { #endif @@ -290,9 +149,8 @@ static const uint32_t h_AES3[256] = { AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) }; - #ifndef DEVICE_DIRECT_CONSTANTS -static __device__ uint32_t d_AES0[256] = { +__device__ uint32_t d_AES0[256] = { 0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC, 0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B, 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, @@ -311,182 +169,36 @@ static __device__ uint32_t d_AES0[256] = { 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C }; -static __device__ uint32_t d_AES1[256] = { - - AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), - AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), - AESx(0x30306050), AESx(0x01010203), 
AESx(0x6767CEA9), AESx(0x2B2B567D), - AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), - AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), - AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), - AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), - AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), - AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), - AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), - AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), - AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), - AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), - AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), - AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), - AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), - AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), - AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), - AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), - AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), - AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), - AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), - AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), - AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), - AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), - AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), - AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), - AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), - AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), - AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), - AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), - AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), - AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), - AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), - AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), - AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), - AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), - AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), - AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), - AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), - AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), - AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), - AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), - AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), - AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), - AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), - AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), - AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), - AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), - AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), - AESx(0xE8E8CB23), AESx(0xDDDDA17C), 
AESx(0x7474E89C), AESx(0x1F1F3E21), - AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), - AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), - AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), - AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), - AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), - AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), - AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), - AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), - AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), - AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), - AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), - AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), - AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +__device__ uint32_t d_AES3[256] = { + 0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5, 0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676, + 0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0, 0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0, + 0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC, 0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515, + 0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A, 0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575, + 0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0, 0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484, + 0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B, 0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF, + 0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585, 0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8, + 0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5, 0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2, + 0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717, 0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373, + 0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888, 0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB, + 0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C, 0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979, + 0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9, 0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808, + 0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6, 0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 
0x0F858A8A, + 0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E, 0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E, + 0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494, 0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF, + 0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868, 0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616 }; +/* +static __constant__ __align__(64) uint32_t d_AES0[256]; +static __constant__ __align__(64) uint32_t d_AES3[256]; +*/ -static __device__ uint32_t d_AES2[256] = { +static void aes_cpu_init(int thr_id) +{ + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES0, + h_AES0, + sizeof(h_AES0), + 0, cudaMemcpyHostToDevice)); - AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), - AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), - AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), - AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), - AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), - AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), - AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), - AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), - AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), - AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), - AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), - AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), - AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), - AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), - AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), - AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), - AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), - AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), - AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), - AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), - AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), - AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), - AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), - AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), - AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), - AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), - AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), - AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), - AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), - AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), - AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), - AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), - AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), - AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), - AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), - AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), 
AESx(0x73E69573), - AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), - AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), - AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), - AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), - AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), - AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), - AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), - AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), - AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), - AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), - AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), - AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), - AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), - AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), - AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), - AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), - AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), - AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), - AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), - AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), - AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), - AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), - AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), - AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), - AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), - AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), - AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), - AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) -}; - -static __device__ uint32_t d_AES3[256] = { - 0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5, 0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676, - 0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0, 0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0, - 0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC, 0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515, - 0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A, 0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575, - 0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0, 0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484, - 0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B, 0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF, - 0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585, 0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8, - 0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 
0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5, 0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2, - 0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717, 0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373, - 0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888, 0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB, - 0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C, 0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979, - 0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9, 0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808, - 0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6, 0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A, - 0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E, 0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E, - 0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494, 0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF, - 0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868, 0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616 -}; -/* -static __constant__ __align__(64) uint32_t d_AES0[256]; -static __constant__ __align__(64) uint32_t d_AES3[256]; -*/ - -static void aes_cpu_init(int thr_id) -{ - CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES0, - h_AES0, - sizeof(h_AES0), - 0, cudaMemcpyHostToDevice)); - /* - CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES1, - h_AES1, - sizeof(h_AES1), - 0, cudaMemcpyHostToDevice)); - - CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES2, - h_AES2, - sizeof(h_AES2), - 0, cudaMemcpyHostToDevice)); - */ CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES3, h_AES3, sizeof(h_AES3), @@ -723,303 +435,443 @@ static void KEY_EXPAND_ELT(const uint32_t *sharedMemory, uint32_t *k){ k[3] = y0; } - - -__device__ __forceinline__ -static void aes_round(const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ - - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = __ldg(&d_AES1[__byte_perm(x0, 0, 0x4441)]); - y2 = __ldg(&d_AES2[__byte_perm(x0, 0, 0x4442)]); - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - - y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); - y0 ^= __ldg(&d_AES1[__byte_perm(x1, 0, 0x4441)]); - y3 ^= __ldg(&d_AES2[__byte_perm(x1, 0, 0x4442)]); - y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - - y0 ^= k0; - - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= __ldg(&d_AES1[__byte_perm(x2, 0, 0x4441)]); - y0 ^= __ldg(&d_AES2[__byte_perm(x2, 0, 0x4442)]); - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); - - y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); - y2 ^= __ldg(&d_AES1[__byte_perm(x3, 0, 0x4441)]); - y1 ^= __ldg(&d_AES2[__byte_perm(x3, 0, 0x4442)]); - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); -} - -__device__ __forceinline__ -static void aes_round_LDG(const uint32_t x0, const uint32_t x1, const uint32_t x2, const 
uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ - - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = __ldg(&d_AES1[__byte_perm(x0, 0, 0x4441)]); - y2 = __ldg(&d_AES2[__byte_perm(x0, 0, 0x4442)]); - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - - y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); - y0 ^= __ldg(&d_AES1[__byte_perm(x1, 0, 0x4441)]); - y3 ^= __ldg(&d_AES2[__byte_perm(x1, 0, 0x4442)]); - y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - - y0 ^= k0; - - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= __ldg(&d_AES1[__byte_perm(x2, 0, 0x4441)]); - y0 ^= __ldg(&d_AES2[__byte_perm(x2, 0, 0x4442)]); - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); - - y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); - y2 ^= __ldg(&d_AES1[__byte_perm(x3, 0, 0x4441)]); - y1 ^= __ldg(&d_AES2[__byte_perm(x3, 0, 0x4442)]); - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); -} - -__device__ __forceinline__ -static void aes_round(const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ - - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = __ldg(&d_AES1[__byte_perm(x0, 0, 0x4441)]); - y2 = __ldg(&d_AES2[__byte_perm(x0, 0, 0x4442)]); - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - - y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); - y0 ^= __ldg(&d_AES1[__byte_perm(x1, 0, 0x4441)]); - y3 ^= __ldg(&d_AES2[__byte_perm(x1, 0, 0x4442)]); - y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= __ldg(&d_AES1[__byte_perm(x2, 0, 0x4441)]); - y0 ^= __ldg(&d_AES2[__byte_perm(x2, 0, 0x4442)]); - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); - - y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); - y2 ^= __ldg(&d_AES1[__byte_perm(x3, 0, 0x4441)]); - y1 ^= __ldg(&d_AES2[__byte_perm(x3, 0, 0x4442)]); - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); -} - -__device__ __forceinline__ -static void aes_round_LDG(const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ - - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = __ldg(&d_AES1[__byte_perm(x0, 0, 0x4441)]); - y2 = __ldg(&d_AES2[__byte_perm(x0, 0, 0x4442)]); - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - - y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); - y0 ^= __ldg(&d_AES1[__byte_perm(x1, 0, 0x4441)]); - y3 ^= __ldg(&d_AES2[__byte_perm(x1, 0, 0x4442)]); - y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= __ldg(&d_AES1[__byte_perm(x2, 0, 0x4441)]); - y0 ^= __ldg(&d_AES2[__byte_perm(x2, 0, 0x4442)]); - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); - - y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); - y2 ^= __ldg(&d_AES1[__byte_perm(x3, 0, 0x4441)]); - y1 ^= __ldg(&d_AES2[__byte_perm(x3, 0, 0x4442)]); - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); -} - -__device__ __forceinline__ -static void AES_2ROUND(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0){ - - uint32_t y0, y1, y2, y3; - - aes_round(x0, x1, x2, x3, k0, y0, y1, y2, y3); - - aes_round(y0, y1, y2, y3, x0, x1, x2, x3); - - // hier werden wir ein carry brauchen (oder auch nicht) - k0++; -} - -__device__ __forceinline__ -static void AES_2ROUND_LDG(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0){ - - uint32_t y0, y1, y2, y3; - - aes_round_LDG(x0, x1, x2, x3, k0, y0, y1, y2, y3); - - aes_round_LDG(y0, 
y1, y2, y3, x0, x1, x2, x3); - - // hier werden wir ein carry brauchen (oder auch nicht) - k0++; -} - -__device__ __forceinline__ -static void AES_ROUND_NOKEY(uint4* x){ - - uint32_t y0, y1, y2, y3; - aes_round(x->x, x->y, x->z, x->w, y0, y1, y2, y3); - - x->x = y0; - x->y = y1; - x->z = y2; - x->w = y3; - -} -__device__ __forceinline__ -static void KEY_EXPAND_ELT(uint32_t *k){ - - uint32_t y0, y1, y2, y3; - aes_round(k[0], k[1], k[2], k[3], y0, y1, y2, y3); - - k[0] = y1; - k[1] = y2; - k[2] = y3; - k[3] = y0; -} - -__device__ __forceinline__ -void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256]) -{ - /* each thread startup will fill a uint32 */ - if (threadIdx.x<256){ - uint32_t temp = __ldg(&d_AES0[threadIdx.x]); - sharedMemory[0][threadIdx.x] = temp; - sharedMemory[1][threadIdx.x] = ROL8(temp); - sharedMemory[2][threadIdx.x] = ROL16(temp); -#ifdef INTENSIVE_GMF -#else - sharedMemory[3][threadIdx.x] = ROR8(temp); -#endif - } -} - -__device__ __forceinline__ -void aes_gpu_init256(uint32_t sharedMemory[4][256]) -{ - /* each thread startup will fill a uint32 */ - uint32_t temp = __ldg(&d_AES0[threadIdx.x]); - sharedMemory[0][threadIdx.x] = temp; - sharedMemory[1][threadIdx.x] = ROL8(temp); - sharedMemory[2][threadIdx.x] = ROL16(temp); -#ifdef INTENSIVE_GMF -#else - sharedMemory[3][threadIdx.x] = ROR8(temp); -#endif -} - -__device__ __forceinline__ -void aes_gpu_init128(uint32_t sharedMemory[4][256]) -{ - /* each thread startup will fill 2 uint32 */ - uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); - - sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; - sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; - sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); - sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); - sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); - sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y); -#ifdef INTENSIVE_GMF -#else - sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); - sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); -#endif -} - -__device__ __forceinline__ -void aes_gpu_init_lt_256(uint32_t sharedMemory[4][256]) -{ - if (threadIdx.x<128){ - /* each thread startup will fill 2 uint32 */ - uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); - - sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; - sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; - sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); - sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); - sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); - sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y); -#ifdef INTENSIVE_GMF #else - sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); - sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); -#endif - } -} - +#include "miner.h" +#include "cuda_vectors_alexis.h" -__device__ __forceinline__ -static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ +/* AES Helper for inline-usage from SPH */ +#define AESx(x) (x ##UL) /* SPH_C32(x) */ - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; - y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); +//#define DEVICE_DIRECT_CONSTANTS - y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; - y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; - y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; -#ifdef INTENSIVE_GMF - y2 ^= 
__ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES0[256] = { #else - y2 ^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)]; +static const uint32_t h_AES0[256] = { #endif - - y0 ^= k0; - - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; - y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); - - y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; - y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; - y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); + AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), + AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), + AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), + AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), + AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), + AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), + AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), + AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), + AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), + AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), + AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), + AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), + AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), + AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), + AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), + AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), + AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), + AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), + AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), + AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), + AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), + AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), + AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), + AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), + AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), + AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), + AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), + AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), + AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), + AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), + AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), + AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), + AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), + AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), + AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), + AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), + AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), + AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), + AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), 
AESx(0x3C141428), + AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), + AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES1[256] = { +#else +static const uint32_t h_AES1[256] = { +#endif + AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), 
AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), + AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES2[256] = { +#else +static const uint32_t h_AES2[256] = { +#endif + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), 
AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), 
AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES3[256] = { +#else +static const uint32_t h_AES3[256] = { +#endif + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), + AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), 
AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), + AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + +#ifndef DEVICE_DIRECT_CONSTANTS +static __constant__ __align__(64) uint32_t d_AES0[256]; +static __constant__ __align__(64) uint32_t d_AES1[256]; +static __constant__ __align__(64) uint32_t d_AES2[256]; +static __constant__ __align__(64) uint32_t d_AES3[256]; + +static void aes_cpu_init(int thr_id) +{ + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES0, + h_AES0, + sizeof(h_AES0), + 0, cudaMemcpyHostToDevice)); + + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES1, + h_AES1, + sizeof(h_AES1), + 0, cudaMemcpyHostToDevice)); + + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES2, + h_AES2, + sizeof(h_AES2), + 0, cudaMemcpyHostToDevice)); + + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES3, + h_AES3, + sizeof(h_AES3), + 0, cudaMemcpyHostToDevice)); } +#else +static void aes_cpu_init(int thr_id) {} +#endif __device__ __forceinline__ -static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ +void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x<256){ + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + 
sharedMemory[0][threadIdx.x] = temp; + sharedMemory[1][threadIdx.x] = ROL8(temp); + sharedMemory[2][threadIdx.x] = ROL16(temp); + sharedMemory[3][threadIdx.x] = ROR8(temp); + } +} - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; - y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); +__device__ __forceinline__ +void aes_gpu_init(uint32_t *sharedMemory) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; + sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; + } +} - y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; - y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; - y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; - y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); +/* tried with 3 xor.b32 asm, not faster */ +#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d)); + +__device__ +static void aes_round( +const uint32_t *sharedMemory, +uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0, +uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 y0 ^= k0; - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; - y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + y3 = xor4_32( + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 +} - y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); - y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; - y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +__device__ +static void aes_round( +const uint32_t *sharedMemory, +uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, +uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 + + y3 = xor4_32( + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + 
sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 } __device__ __forceinline__ -static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ +static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ - y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]; + y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); -#ifdef INTENSIVE_GMF - y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); -#else y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; -#endif y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; +#ifdef INTENSIVE_GMF y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); +#else + y2 ^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)]; +#endif - y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)]; + y0 ^= k0; + + y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); @@ -1031,14 +883,42 @@ static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, co } __device__ __forceinline__ -static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ +static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ + /* + y0 = xor4_32( + sharedMemory[0][__byte_perm(x0, 0, 0x4440)], + sharedMemory[1][__byte_perm(x1, 0, 0x4441)], + sharedMemory[2][__byte_perm(x2, 0, 0x4442)], + sharedMemory[3][__byte_perm(x3, 0, 0x4443)]); + + y1 = xor4_32( + sharedMemory[0][__byte_perm(x1, 0, 0x4440)], + sharedMemory[1][__byte_perm(x2, 0, 0x4441)], + sharedMemory[2][__byte_perm(x3, 0, 0x4442)], + sharedMemory[3][__byte_perm(x0, 0, 0x4443)]); + y2 = xor4_32( + sharedMemory[0][__byte_perm(x2, 0, 0x4440)], + sharedMemory[1][__byte_perm(x3, 0, 0x4441)], + sharedMemory[2][__byte_perm(x0, 0, 0x4442)], + sharedMemory[3][__byte_perm(x1, 0, 0x4443)]); // ^k2 + + y3 = xor4_32( + sharedMemory[0][__byte_perm(x3, 0, 0x4440)], + sharedMemory[1][__byte_perm(x0, 0, 0x4441)], + sharedMemory[2][__byte_perm(x1, 0, 0x4442)], + sharedMemory[3][__byte_perm(x2, 0, 0x4443)]); // ^k3 + */ y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]; y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); +#ifdef INTENSIVE_GMF y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); +#else + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; +#endif y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); @@ -1052,34 +932,9 @@ static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0 y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); -} - -__device__ __forceinline__ -static 
void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0){ - - uint32_t y0, y1, y2, y3; - - aes_round(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3); - - aes_round(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3); - - // hier werden wir ein carry brauchen (oder auch nicht) - k0++; -} -__device__ __forceinline__ -static void AES_2ROUND_LDG(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0){ - - uint32_t y0, y1, y2, y3; - - aes_round_LDG(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3); - - aes_round_LDG(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3); - - // hier werden wir ein carry brauchen (oder auch nicht) - k0++; } - +//! only bad people write code like this (ALL_CAPS non macro) __device__ __forceinline__ static void AES_ROUND_NOKEY(const uint32_t sharedMemory[4][256], uint4* x){ @@ -1103,3 +958,4 @@ static void KEY_EXPAND_ELT(const uint32_t sharedMemory[4][256], uint32_t *k){ k[2] = y3; k[3] = y0; } +#endif \ No newline at end of file diff --git a/x11/cuda_x11_echo_alexis.cu b/x11/cuda_x11_echo_alexis.cu index bff362b28a..84fe885c3b 100644 --- a/x11/cuda_x11_echo_alexis.cu +++ b/x11/cuda_x11_echo_alexis.cu @@ -2,14 +2,13 @@ Based on Tanguy Pruvot's repo Provos Alexis - 2016 */ + //#include "cuda_helper.h" -#include "miner.h" #include "cuda_helper_alexis.h" #include "cuda_vectors_alexis.h" #define INTENSIVE_GMF -//#include "cuda_x11_aes_alexis.cuh" -#include "../x11/cuda_x11_echo_aes.cuh" +#include "cuda_x11_aes_alexis.cuh" __device__ static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0){ @@ -269,12 +268,6 @@ static void x11_echo512_gpu_hash_64_final_alexis(uint32_t threads, uint64_t *g_h } } -__host__ -void X11_shavite512_cpu_init(int thr_id, uint32_t threads) -{ - aes_cpu_init(thr_id); -} - __host__ void x11_echo512_cpu_hash_64_final_alexis(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target) { @@ -286,11 +279,10 @@ void x11_echo512_cpu_hash_64_final_alexis(int thr_id, uint32_t threads, uint32_t x11_echo512_gpu_hash_64_final_alexis<<>>(threads, (uint64_t*)d_hash,d_resNonce,target); } - __global__ __launch_bounds__(128, 5) /* will force 80 registers */ static void x11_echo512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; __shared__ uint32_t sharedMemory[4][256]; @@ -446,14 +438,15 @@ static void x11_echo512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32 *(uint2x4*)&Hash[ 0] = *(uint2x4*)&hash[ 0] ^ *(uint2x4*)&W[ 0]; *(uint2x4*)&Hash[ 8] = *(uint2x4*)&hash[ 8] ^ *(uint2x4*)&W[ 8]; } -} +} __host__ void x11_echo512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_hash){ const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); + x11_echo512_gpu_hash_64_alexis << > >(thr_id, threads, d_hash); -// x11_echo512_gpu_hash_64_alexis << > >((int*)((uint64_t)d_ark | (thr_id & 15)), threads, d_hash); } diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu index 2e71d77586..e8ac522865 100644 --- a/x11/cuda_x11_luffa512.cu +++ b/x11/cuda_x11_luffa512.cu @@ -335,7 +335,7 @@ void finalization512(hashState *state, uint32_t *b) /***************************************************/ // Die Hash-Funktion 
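A pattern recurs throughout this patch: the echo and shavite kernels above (and several kernels below) now receive an int *thr_id pointer and return early when a per-GPU bit in a shared flag word is set, with the packed value cast through uintptr_t instead of uint64_t. The sketch that follows is only a minimal illustration of that idiom under assumed names (work_aborted, the flag-word layout, example_kernel) and assumed semantics; it is not code from this repository.

#include <stdint.h>

// Hypothetical helper mirroring the guard used by the patched kernels.
// Assumption: the caller packs a 16-byte-aligned pointer to a flag word
// together with a 4-bit GPU index in the low bits of one pointer-sized value.
__device__ __forceinline__ bool work_aborted(const int *thr_id)
{
	const uintptr_t packed = (uintptr_t)thr_id;
	const int *flags = (const int *)(packed & ~(uintptr_t)15); // aligned flag word
	const unsigned gpu = (unsigned)(packed & 15);              // per-GPU bit index
	return ((*flags) >> gpu) & 1;                              // bit set -> skip work
}

// Hypothetical kernel showing where the guard sits relative to the usual
// thread-index bounds check in the hashing kernels.
__global__ void example_kernel(int *thr_id, uint32_t threads, uint32_t *g_hash)
{
	if (work_aborted(thr_id))
		return;                    // work flagged as done/cancelled for this GPU (assumed semantics)
	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads)
		g_hash[thread] ^= thread;  // stand-in for the real hash round
}

Casting through uintptr_t, as the hunks above now do, keeps the bit-twiddling valid on both 32-bit and 64-bit hosts, which appears to be the reason the earlier uint64_t casts were replaced.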
-__global__ void x11_luffa512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ void x11_luffa512_gpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -364,7 +364,7 @@ void x11_luffa512_cpu_init(int thr_id, uint32_t threads) CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice)); } -__host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_luffa512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 256; @@ -375,7 +375,7 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; - x11_luffa512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_luffa512_gpu_hash_64 << > >(thr_id, threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_luffa512_Cubehash.cu b/x11/cuda_x11_luffa512_Cubehash.cu index cab0062443..7b55308950 100644 --- a/x11/cuda_x11_luffa512_Cubehash.cu +++ b/x11/cuda_x11_luffa512_Cubehash.cu @@ -732,7 +732,7 @@ __global__ #if __CUDA_ARCH__ > 500 __launch_bounds__(256, 4) #endif -void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) +void x11_luffaCubehash512_gpu_hash_64(int *thr_id, uint32_t threads, uint32_t *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -817,15 +817,15 @@ void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) } __host__ -void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order) +void x11_luffaCubehash512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x11_luffaCubehash512_gpu_hash_64 <<>> (threads, d_hash); - MyStreamSynchronize(NULL, order, thr_id); + x11_luffaCubehash512_gpu_hash_64 <<>> (thr_id, threads, d_hash); + MyStreamSynchronize(NULL, order, ((uintptr_t)thr_id) & 15); } // Setup diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index 0c774c4a63..4da808c218 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -534,12 +534,9 @@ __global__ __launch_bounds__(TPB, 2) #else #error "Not set up for this" #endif -void x11_shavite512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - -#if TPB == 128 + #if TPB == 128 aes_gpu_init_128(sharedMemory); #elif TPB == 384 //! 
todo, fix naming and sharedMemory @@ -607,7 +604,7 @@ void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x11_shavite512_gpu_hash_80 << > >(thr_id, threads, startNounce, (uint64_t*)d_outputHash); + x11_shavite512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_outputHash); } __host__ diff --git a/x11/cuda_x11_shavite512_alexis.cu b/x11/cuda_x11_shavite512_alexis.cu index b67aaee900..b366f4a708 100644 --- a/x11/cuda_x11_shavite512_alexis.cu +++ b/x11/cuda_x11_shavite512_alexis.cu @@ -188,7 +188,7 @@ static void round_4_8_12(const uint32_t sharedMemory[4][256], uint32_t* r, uint4 __global__ __launch_bounds__(TPB,2) /* 64 registers with 128,8 - 72 regs with 128,7 */ void x11_shavite512_gpu_hash_64_alexis(int *thr_id, const uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; __shared__ uint32_t sharedMemory[4][256]; @@ -507,13 +507,13 @@ void x11_shavite512_gpu_hash_64_alexis(int *thr_id, const uint32_t threads, uint *(uint2x4*)&Hash[ 4] = *(uint2x4*)&state[ 8] ^ *(uint2x4*)&p[ 0]; } } - + __host__ void x11_shavite512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_hash) { - dim3 grid((threads + TPB-1)/TPB); + dim3 grid((threads + TPB - 1) / TPB); dim3 block(TPB); // note: 128 threads minimum are required to init the shared memory array - x11_shavite512_gpu_hash_64_alexis << > >(thr_id, threads, (uint64_t*)d_hash); + x11_shavite512_gpu_hash_64_alexis<<>>(thr_id, threads, (uint64_t*)d_hash); } diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 44ec52ebb4..7da95c490f 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -1,113 +1,751 @@ /*************************************************************************************************** -* SIMD512 SM3+ CUDA IMPLEMENTATION (require cuda_x11_simd512_func.cuh) -* Uses Alexis78 simd modifications -*/ + * SIMD512 SM3+ CUDA IMPLEMENTATION (require cuda_x11_simd512_func.cuh) + */ #include "miner.h" #include "cuda_helper_alexis.h" -#include "cuda_vectors_alexis.h" + +#define TPB 128 + +uint32_t *d_state[MAX_GPUS]; +uint4 *d_temp4[MAX_GPUS]; + +// texture bound to d_temp4[thr_id], for read access in Compaction kernel +texture texRef1D_128; + +#define DEVICE_DIRECT_CONSTANTS + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ uint8_t c_perm[8][8] = { +#else +__constant__ uint8_t c_perm[8][8]; +const uint8_t h_perm[8][8] = { +#endif + { 2, 3, 6, 7, 0, 1, 4, 5 }, + { 6, 7, 2, 3, 4, 5, 0, 1 }, + { 7, 6, 5, 4, 3, 2, 1, 0 }, + { 1, 0, 3, 2, 5, 4, 7, 6 }, + { 0, 1, 4, 5, 6, 7, 2, 3 }, + { 6, 7, 2, 3, 0, 1, 4, 5 }, + { 6, 7, 0, 1, 4, 5, 2, 3 }, + { 4, 5, 2, 3, 6, 7, 0, 1 } +}; + +/* used in cuda_x11_simd512_func.cuh (SIMD_Compress2) */ +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ uint32_t c_IV_512[32] = { +#else +__constant__ uint32_t c_IV_512[32]; +const uint32_t h_IV_512[32] = { +#endif + 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, + 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, + 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, + 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ short 
c_FFT128_8_16_Twiddle[128] = { +#else +__constant__ short c_FFT128_8_16_Twiddle[128]; +static const short h_FFT128_8_16_Twiddle[128] = { +#endif + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, + 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, + 1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2,-123, 17,-111, + 1,-118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123,-122, + 1, 116, 92,-122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, + 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, + 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ short c_FFT256_2_128_Twiddle[128] = { +#else +__constant__ short c_FFT256_2_128_Twiddle[128]; +static const short h_FFT256_2_128_Twiddle[128] = { +#endif + 1, 41,-118, 45, 46, 87, -31, 14, + 60,-110, 116,-127, -67, 80, -61, 69, + 2, 82, 21, 90, 92, -83, -62, 28, + 120, 37, -25, 3, 123, -97,-122,-119, + 4, -93, 42, -77, -73, 91,-124, 56, + -17, 74, -50, 6, -11, 63, 13, 19, + 8, 71, 84, 103, 111, -75, 9, 112, + -34,-109,-100, 12, -22, 126, 26, 38, + 16,-115, -89, -51, -35, 107, 18, -33, + -68, 39, 57, 24, -44, -5, 52, 76, + 32, 27, 79,-102, -70, -43, 36, -66, + 121, 78, 114, 48, -88, -10, 104,-105, + 64, 54, -99, 53, 117, -86, 72, 125, + -15,-101, -29, 96, 81, -20, -49, 47, + 128, 108, 59, 106, -23, 85,-113, -7, + -30, 55, -58, -65, -95, -40, -98, 94 +}; + +/************* the round function ****************/ +#define IF(x, y, z) (((y ^ z) & x) ^ z) +#define MAJ(x, y, z) ((z &y) | ((z|y) & x)) + +#include "cuda_x11_simd512_sm2.cuh" +#include "cuda_x11_simd512_func.cuh" #ifdef __INTELLISENSE__ /* just for vstudio code colors */ #define __CUDA_ARCH__ 500 #endif -#define TPB50_1 128 -#define TPB50_2 128 -#define TPB52_1 128 -#define TPB52_2 128 +#if __CUDA_ARCH__ >= 300 -static uint4 *d_temp4[MAX_GPUS]; -#include "cuda_x11_simd512_func.cuh" +/********************* Message expansion ************************/ -__global__ -#if __CUDA_ARCH__ > 500 -__launch_bounds__(TPB52_2, 1) -#else -__launch_bounds__(TPB50_2, 4) +/* + * Reduce modulo 257; result is in [-127; 383] + * REDUCE(x) := (x&255) - (x>>8) + */ +#define REDUCE(x) \ + (((x)&255) - ((x)>>8)) + +/* + * Reduce from [-127; 383] to [-128; 128] + * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 + */ +#define EXTRA_REDUCE_S(x) \ + ((x)<=128 ? (x) : (x)-257) + +/* + * Reduce modulo 257; result is in [-128; 128] + */ +#define REDUCE_FULL_S(x) \ + EXTRA_REDUCE_S(REDUCE(x)) + +// Parallelization: +// +// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64) +// and 1 time 16-fach parallel (in FFT_128_full) +// +// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations + +/** + * FFT_8 using w=4 as 8th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. 
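The BUTTERFLY macro in FFT_8 below multiplies the difference lane by the twiddle factor with a plain shift: since w = 4 = 2^2, multiplying by 4^n is a left shift by 2*n, and REDUCE / EXTRA_REDUCE_S later pull the result back into the balanced range. A small self-contained check of that arithmetic, an editorial illustration using the same formulas, not part of the patch:

#include <assert.h>

/* Same formulas as the REDUCE / EXTRA_REDUCE_S macros used in this file. */
#define REDUCE(x)         (((x) & 255) - ((x) >> 8))
#define EXTRA_REDUCE_S(x) ((x) <= 128 ? (x) : (x) - 257)

int main(void)
{
	/* REDUCE(x) is congruent to x mod 257 because 256 = -1 (mod 257):
	   x = 256*hi + lo  implies  x = lo - hi (mod 257). */
	assert(REDUCE(300) == 43 && 300 % 257 == 43);

	/* One butterfly with w = 4: (u - v) << (2*n) is (u - v) * 4^n,
	   brought back into [-128, 128] by REDUCE / EXTRA_REDUCE_S. */
	int u = 100, v = 7, n = 3;
	int t = (u - v) << (2 * n);               /* 93 * 64 = 5952 */
	assert(EXTRA_REDUCE_S(REDUCE(t)) == 41);  /* 5952 mod 257 = 41 */
	return 0;
}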
+ */ +__device__ __forceinline__ +void FFT_8(int *y, int stripe) +{ +#define X(i) y[stripe*i] + +#define DO_REDUCE(i) \ + X(i) = REDUCE(X(i)) + +#define DO_REDUCE_FULL_S(i) \ +do { \ + X(i) = REDUCE(X(i)); \ + X(i) = EXTRA_REDUCE_S(X(i)); \ +} while(0) + +#define BUTTERFLY(i,j,n) \ +do { \ + int u= X(i); \ + int v= X(j); \ + X(i) = u+v; \ + X(j) = (u-v) << (2*n); \ +} while(0) + + BUTTERFLY(0, 4, 0); + BUTTERFLY(1, 5, 1); + BUTTERFLY(2, 6, 2); + BUTTERFLY(3, 7, 3); + + DO_REDUCE(6); + DO_REDUCE(7); + + BUTTERFLY(0, 2, 0); + BUTTERFLY(4, 6, 0); + BUTTERFLY(1, 3, 2); + BUTTERFLY(5, 7, 2); + + DO_REDUCE(7); + + BUTTERFLY(0, 1, 0); + BUTTERFLY(2, 3, 0); + BUTTERFLY(4, 5, 0); + BUTTERFLY(6, 7, 0); + + DO_REDUCE_FULL_S(0); + DO_REDUCE_FULL_S(1); + DO_REDUCE_FULL_S(2); + DO_REDUCE_FULL_S(3); + DO_REDUCE_FULL_S(4); + DO_REDUCE_FULL_S(5); + DO_REDUCE_FULL_S(6); + DO_REDUCE_FULL_S(7); + +#undef X +#undef DO_REDUCE +#undef DO_REDUCE_FULL_S +#undef BUTTERFLY +} + +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ < 300 + #define __shfl(var, srcLane, width) (uint32_t)(var) + // #error __shfl() not supported by SM 2.x +#endif #endif -static void x11_simd512_gpu_compress_64_maxwell(int *thr_id, uint32_t threads, uint32_t *g_hash, const uint4 *const __restrict__ g_fft4) + +/** + * FFT_16 using w=2 as 16th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. + */ +__device__ __forceinline__ +void FFT_16(int *y) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) - return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - const uint32_t thr_offset = thread << 6; // thr_id * 128 (je zwei elemente) - uint32_t IV[32]; - if (thread < threads){ +#define DO_REDUCE_FULL_S(i) \ + do { \ + y[i] = REDUCE(y[i]); \ + y[i] = EXTRA_REDUCE_S(y[i]); \ + } while(0) - uint32_t *Hash = &g_hash[thread << 4]; - // Compression1(Hash, thread, g_fft4, g_state); - uint32_t A[32]; + int u,v; - *(uint2x4*)&IV[0] = *(uint2x4*)&c_IV_512[0]; - *(uint2x4*)&IV[8] = *(uint2x4*)&c_IV_512[8]; - *(uint2x4*)&IV[16] = *(uint2x4*)&c_IV_512[16]; - *(uint2x4*)&IV[24] = *(uint2x4*)&c_IV_512[24]; + // BUTTERFLY(0, 8, 0); + // BUTTERFLY(1, 9, 1); + // BUTTERFLY(2, 10, 2); + // BUTTERFLY(3, 11, 3); + // BUTTERFLY(4, 12, 4); + // BUTTERFLY(5, 13, 5); + // BUTTERFLY(6, 14, 6); + // BUTTERFLY(7, 15, 7); + { + u = y[0]; // 0..7 + v = y[1]; // 8..15 + y[0] = u+v; + y[1] = (u-v) << (threadIdx.x&7); + } - *(uint2x4*)&A[0] = __ldg4((uint2x4*)&Hash[0]); - *(uint2x4*)&A[8] = __ldg4((uint2x4*)&Hash[8]); + // DO_REDUCE(11); + // DO_REDUCE(12); + // DO_REDUCE(13); + // DO_REDUCE(14); + // DO_REDUCE(15); + if ((threadIdx.x&7) >=3) y[1] = REDUCE(y[1]); // 11...15 -#pragma unroll 16 - for (uint32_t i = 0; i<16; i++) - A[i] = A[i] ^ IV[i]; + // BUTTERFLY( 0, 4, 0); + // BUTTERFLY( 1, 5, 2); + // BUTTERFLY( 2, 6, 4); + // BUTTERFLY( 3, 7, 6); + { + u = __shfl((int)y[0], (threadIdx.x&3),8); // 0,1,2,3 0,1,2,3 + v = __shfl((int)y[0],4+(threadIdx.x&3),8); // 4,5,6,7 4,5,6,7 + y[0] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); + } + + // BUTTERFLY( 8, 12, 0); + // BUTTERFLY( 9, 13, 2); + // BUTTERFLY(10, 14, 4); + // BUTTERFLY(11, 15, 6); + { + u = __shfl((int)y[1], (threadIdx.x&3),8); // 8,9,10,11 8,9,10,11 + v = __shfl((int)y[1],4+(threadIdx.x&3),8); // 12,13,14,15 12,13,14,15 + y[1] = ((threadIdx.x&7) < 4) ? 
(u+v) : ((u-v) << (2*(threadIdx.x&3))); + } + + // DO_REDUCE(5); + // DO_REDUCE(7); + // DO_REDUCE(13); + // DO_REDUCE(15); + if ((threadIdx.x&1) && (threadIdx.x&7) >= 4) { + y[0] = REDUCE(y[0]); // 5, 7 + y[1] = REDUCE(y[1]); // 13, 15 + } + + // BUTTERFLY( 0, 2, 0); + // BUTTERFLY( 1, 3, 4); + // BUTTERFLY( 4, 6, 0); + // BUTTERFLY( 5, 7, 4); + { + u = __shfl((int)y[0], (threadIdx.x&5),8); // 0,1,0,1 4,5,4,5 + v = __shfl((int)y[0],2+(threadIdx.x&5),8); // 2,3,2,3 6,7,6,7 + y[0] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); + } + + // BUTTERFLY( 8, 10, 0); + // BUTTERFLY( 9, 11, 4); + // BUTTERFLY(12, 14, 0); + // BUTTERFLY(13, 15, 4); + { + u = __shfl((int)y[1], (threadIdx.x&5),8); // 8,9,8,9 12,13,12,13 + v = __shfl((int)y[1],2+(threadIdx.x&5),8); // 10,11,10,11 14,15,14,15 + y[1] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); + } + + // BUTTERFLY( 0, 1, 0); + // BUTTERFLY( 2, 3, 0); + // BUTTERFLY( 4, 5, 0); + // BUTTERFLY( 6, 7, 0); + { + u = __shfl((int)y[0], (threadIdx.x&6),8); // 0,0,2,2 4,4,6,6 + v = __shfl((int)y[0],1+(threadIdx.x&6),8); // 1,1,3,3 5,5,7,7 + y[0] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); + } + + // BUTTERFLY( 8, 9, 0); + // BUTTERFLY(10, 11, 0); + // BUTTERFLY(12, 13, 0); + // BUTTERFLY(14, 15, 0); + { + u = __shfl((int)y[1], (threadIdx.x&6),8); // 8,8,10,10 12,12,14,14 + v = __shfl((int)y[1],1+(threadIdx.x&6),8); // 9,9,11,11 13,13,15,15 + y[1] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); + } + + DO_REDUCE_FULL_S( 0); // 0...7 + DO_REDUCE_FULL_S( 1); // 8...15 + +#undef DO_REDUCE_FULL_S +} + +__device__ __forceinline__ +void FFT_128_full(int y[128]) +{ + int i; + + FFT_8(y+0,2); // eight parallel FFT8's + FFT_8(y+1,2); // eight parallel FFT8's #pragma unroll 16 - for (uint32_t i = 16; i<32; i++) - A[i] = IV[i]; + for (i=0; i<16; i++) + /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]); + +#pragma unroll 8 + for (i=0; i<8; i++) + FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads +} - Round8(A, thr_offset, g_fft4); +__device__ __forceinline__ +void FFT_256_halfzero(int y[256]) +{ + /* + * FFT_256 using w=41 as 256th root of unity. + * Decimation in frequency (DIF) NTT. + * Output data is in revbin_permuted order. + * In place. 
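The root w = 41 named in this comment also explains the c_FFT256_2_128_Twiddle table declared earlier in the file: its first eight entries match consecutive powers of 41 reduced to the balanced range [-128, 128]. A minimal host-side check that reproduces that first row, an editorial sketch only; the constant table in the patch is authoritative:

#include <stdio.h>

/* Balanced representative of x mod 257, i.e. the value in [-128, 128]. */
static int balanced257(int x)
{
	int r = x % 257;          /* C truncation keeps r in (-257, 257) */
	if (r >  128) r -= 257;
	if (r < -128) r += 257;
	return r;
}

int main(void)
{
	int w = 1;
	for (int i = 0; i < 8; i++) {
		printf("%d ", w);     /* prints: 1 41 -118 45 46 87 -31 14 */
		w = balanced257(w * 41);
	}
	printf("\n");
	return 0;
}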
+ */ + const int tmp = y[15]; - STEP8_IF(&IV[0], 32, 4, 13, &A[0], &A[8], &A[16], &A[24]); - STEP8_IF(&IV[8], 33, 13, 10, &A[24], &A[0], &A[8], &A[16]); - STEP8_IF(&IV[16], 34, 10, 25, &A[16], &A[24], &A[0], &A[8]); - STEP8_IF(&IV[24], 35, 25, 4, &A[8], &A[16], &A[24], &A[0]); +#pragma unroll 8 + for (int i=0; i<8; i++) + y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]); +#pragma unroll 8 + for (int i=8; i<16; i++) + y[16+i] = 0; -#pragma unroll 32 - for (uint32_t i = 0; i<32; i++){ - IV[i] = A[i]; - } + /* handle X^255 with an additional butterfly */ + if ((threadIdx.x&7) == 7) + { + y[15] = REDUCE(tmp + 1); + y[31] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); + } - A[0] ^= 512; + FFT_128_full(y); + FFT_128_full(y+16); +} - Round8_0_final(A, 3, 23, 17, 27); - Round8_1_final(A, 28, 19, 22, 7); - Round8_2_final(A, 29, 9, 15, 5); - Round8_3_final(A, 4, 13, 10, 25); - STEP8_IF(&IV[0], 32, 4, 13, &A[0], &A[8], &A[16], &A[24]); - STEP8_IF(&IV[8], 33, 13, 10, &A[24], &A[0], &A[8], &A[16]); - STEP8_IF(&IV[16], 34, 10, 25, &A[16], &A[24], &A[0], &A[8]); - STEP8_IF(&IV[24], 35, 25, 4, &A[8], &A[16], &A[24], &A[0]); +/***************************************************/ - *(uint2x4*)&Hash[0] = *(uint2x4*)&A[0]; - *(uint2x4*)&Hash[8] = *(uint2x4*)&A[8]; +__device__ __forceinline__ +void Expansion(const uint32_t *data, uint4 *g_temp4) +{ + /* Message Expansion using Number Theoretical Transform similar to FFT */ + int expanded[32]; +#pragma unroll 4 + for (int i=0; i < 4; i++) { + expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; + expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; + } +#pragma unroll 8 + for (int i=8; i < 16; i++) + expanded[i] = 0; + + FFT_256_halfzero(expanded); + + // store w matrices in global memory + +#define mul_185(x) ( (x)*185 ) +#define mul_233(x) ( (x)*233 ) + + uint4 vec0; + int P, Q, P1, Q1, P2, Q2; + bool even = (threadIdx.x & 1) == 0; + +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 + + // 2 6 0 4 + + P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? 
Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + g_temp4[threadIdx.x&7] = vec0; + +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + + // 6 2 4 0 + + P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + g_temp4[8+(threadIdx.x&7)] = vec0; + +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + + // 7 5 3 1 + + bool hi = (threadIdx.x&7)>=4; + + P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? 
Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + g_temp4[16+(threadIdx.x&7)] = vec0; + +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 + + // 1 3 5 7 + + bool lo = (threadIdx.x&7)<4; + + P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + g_temp4[24+(threadIdx.x&7)] = vec0; + +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 + +//{ 8, 72, 40, 104, 24, 88, 56, 120 }, { 9, 73, 41, 105, 25, 89, 57, 121 }, +//{ 4, 68, 36, 100, 20, 84, 52, 116 }, { 5, 69, 37, 101, 21, 85, 53, 117 }, +//{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 }, +//{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 }, + + bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5 + + P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? 
Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + + g_temp4[32+(threadIdx.x&7)] = vec0; + +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 + + P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + + g_temp4[40+(threadIdx.x&7)] = vec0; + +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 + + // sel markiert threads 2,3,4,5 + + int t; + t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? 
Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + + g_temp4[48+(threadIdx.x&7)] = vec0; + +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 + + // sel markiert threads 2,3,4,5 + + t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? 
Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + + g_temp4[56+(threadIdx.x&7)] = vec0; + +#undef mul_185 +#undef mul_233 +} + +/***************************************************/ + +__global__ __launch_bounds__(TPB, 4) +void x11_simd512_gpu_expand_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_temp4) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + int threadBloc = (blockDim.x * blockIdx.x + threadIdx.x) / 8; + if (threadBloc < threads) + { + int hashPosition = threadBloc * 16; + uint32_t *inpHash = &g_hash[hashPosition]; + + // Read hash per 8 threads + uint32_t Hash[2]; + int ndx = threadIdx.x & 7; + Hash[0] = inpHash[ndx]; + Hash[1] = inpHash[ndx + 8]; + + // Puffer für expandierte Nachricht + uint4 *temp4 = &g_temp4[hashPosition * 4]; + + Expansion(Hash, temp4); + } +} + +__global__ __launch_bounds__(TPB, 1) +void x11_simd512_gpu_compress1_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *Hash = &g_hash[thread * 16]; + Compression1(Hash, thread, g_fft4, g_state); + } +} + +__global__ __launch_bounds__(TPB, 1) +void x11_simd512_gpu_compress2_64(int *thr_id, uint32_t threads, uint4 *g_fft4, uint32_t *g_state) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + Compression2(thread, g_fft4, g_state); + } +} + +__global__ __launch_bounds__(TPB, 2) +void x11_simd512_gpu_compress_64_maxwell(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *Hash = &g_hash[thread * 16]; + Compression1(Hash, thread, g_fft4, g_state); + Compression2(thread, g_fft4, g_state); + } +} + +__global__ __launch_bounds__(TPB, 2) +void x11_simd512_gpu_final_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *Hash = &g_hash[thread * 16]; + Final(Hash, thread, g_fft4, g_state); } } +#else +__global__ void x11_simd512_gpu_expand_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_temp4) {} +__global__ void x11_simd512_gpu_compress1_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} +__global__ void x11_simd512_gpu_compress2_64(int *thr_id, uint32_t threads, uint4 *g_fft4, uint32_t *g_state) {} +__global__ void x11_simd512_gpu_compress_64_maxwell(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} +__global__ void x11_simd512_gpu_final_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} +#endif /* SM3+ */ + __host__ -int x11_simd512_cpu_init(int thr_id, uint32_t threads){ - return cudaMalloc(&d_temp4[thr_id], 64 * sizeof(uint4)*threads); +int x11_simd512_cpu_init(int thr_id, uint32_t threads) +{ + int dev_id = device_map[thr_id]; + // cuda_get_arch(thr_id); // 
should be already done! + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { + x11_simd512_cpu_init_sm2(thr_id); + return 0; + } + //2097152 +#if 0 + if (threads > 2097152) + { + CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 32 * sizeof(uint4)*(threads >> 1)), (int)err); /* todo: prevent -i 21 */ + CUDA_CALL_OR_RET_X(cudaMalloc((&d_temp4[thr_id]) + 32 * (threads >> 1), 32 * sizeof(uint4)*(threads >> 1)), (int)err); /* todo: prevent -i 21 */ + } + else +#endif + CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64 * sizeof(uint4)*threads), (int)err); /* todo: prevent -i 21 */ + CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32 * sizeof(int)*threads), (int)err); + +#ifndef DEVICE_DIRECT_CONSTANTS + cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); + + cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice); +#endif + + // Texture for 128-Bit Zugriffe + cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc(); + texRef1D_128.normalized = 0; + texRef1D_128.filterMode = cudaFilterModePoint; + texRef1D_128.addressMode[0] = cudaAddressModeClamp; + + CUDA_CALL_OR_RET_X(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads), (int) err); + + return 0; } __host__ -void x11_simd512_cpu_free(int thr_id){ - cudaFree(d_temp4[thr_id]); +void x11_simd512_cpu_free(int thr_id) +{ + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) { + cudaFree(d_temp4[thr_id]); + cudaFree(d_state[thr_id]); + } } - + __host__ -void x11_simd512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash){ +void x11_simd512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) +{ + const uint32_t threadsperblock = TPB; + int dev_id = device_map[((uintptr_t)thr_id) & 15]; + //2097152 + dim3 block(threadsperblock); + dim3 grid((threads + threadsperblock-1) / threadsperblock); + dim3 gridX8(grid.x * 8); - int dev_id = device_map[((uint64_t)thr_id) & 15]; + x11_simd512_gpu_expand_64 << > > (thr_id, threads, d_hash, d_temp4[((uintptr_t)thr_id) & 15]); - uint32_t tpb = TPB52_1; - if (device_sm[dev_id] <= 500) tpb = TPB50_1; - const dim3 grid1((8 * threads + tpb - 1) / tpb); - const dim3 block1(tpb); + if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) { + x11_simd512_gpu_compress_64_maxwell << < grid, block >> > (thr_id, threads, d_hash, d_temp4[((uintptr_t)thr_id) & 15], d_state[((uintptr_t)thr_id) & 15]); + } else { + x11_simd512_gpu_compress1_64 << < grid, block >> > (thr_id, threads, d_hash, d_temp4[((uintptr_t)thr_id) & 15], d_state[((uintptr_t)thr_id) & 15]); + x11_simd512_gpu_compress2_64 << < grid, block >> > (thr_id, threads, d_temp4[((uintptr_t)thr_id) & 15], d_state[((uintptr_t)thr_id) & 15]); + } - tpb = TPB52_2; - if (device_sm[dev_id] <= 500) tpb = TPB50_2; - const dim3 grid2((threads + tpb - 1) / tpb); - const dim3 block2(tpb); + x11_simd512_gpu_final_64 << > > (thr_id, threads, d_hash, 
d_temp4[((uintptr_t)thr_id) & 15], d_state[((uintptr_t)thr_id) & 15]); - x11_simd512_gpu_expand_64 << > > (thr_id, threads, d_hash, d_temp4[(uint64_t)thr_id & 15]); - x11_simd512_gpu_compress_64_maxwell << < grid2, block2 >> > (thr_id, threads, d_hash, d_temp4[(uint64_t)thr_id & 15]); +// MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_simd512_func.cuh b/x11/cuda_x11_simd512_func.cuh index 5470ae9c8e..f61eaa4f59 100644 --- a/x11/cuda_x11_simd512_func.cuh +++ b/x11/cuda_x11_simd512_func.cuh @@ -1,659 +1,1396 @@ - -static __constant__ const uint8_t c_perm[8][8] = { - { 2, 3, 6, 7, 0, 1, 4, 5 }, { 6, 7, 2, 3, 4, 5, 0, 1 }, { 7, 6, 5, 4, 3, 2, 1, 0 }, { 1, 0, 3, 2, 5, 4, 7, 6 }, - { 0, 1, 4, 5, 6, 7, 2, 3 }, { 6, 7, 2, 3, 0, 1, 4, 5 }, { 6, 7, 0, 1, 4, 5, 2, 3 }, { 4, 5, 2, 3, 6, 7, 0, 1 } -}; - -static __constant__ const uint32_t c_IV_512[32] = { - 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, - 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, - 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, - 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 -}; - -static __constant__ const int16_t c_FFT128_8_16_Twiddle[128] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, - 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, 1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2, -123, 17, -111, - 1, -118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123, -122, 1, 116, 92, -122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, - 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 -}; - -static __constant__ const int16_t c_FFT256_2_128_Twiddle[128] = { - 1, 41, -118, 45, 46, 87, -31, 14, 60, -110, 116, -127, -67, 80, -61, 69, 2, 82, 21, 90, 92, -83, -62, 28, 120, 37, -25, 3, 123, -97, -122, -119, - 4, -93, 42, -77, -73, 91, -124, 56, -17, 74, -50, 6, -11, 63, 13, 19, 8, 71, 84, 103, 111, -75, 9, 112, -34, -109, -100, 12, -22, 126, 26, 38, - 16, -115, -89, -51, -35, 107, 18, -33, -68, 39, 57, 24, -44, -5, 52, 76, 32, 27, 79, -102, -70, -43, 36, -66, 121, 78, 114, 48, -88, -10, 104, -105, - 64, 54, -99, 53, 117, -86, 72, 125, -15, -101, -29, 96, 81, -20, -49, 47, 128, 108, 59, 106, -23, 85, -113, -7, -30, 55, -58, -65, -95, -40, -98, 94 -}; +#define SIMD_FUNCTIONS_CUH -__device__ __forceinline__ -static uint32_t IF(uint32_t x, uint32_t y, uint32_t z){ - /* - #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 - uint32_t result; - asm("lop3.b32 %0, %1, %2, %3, 0xCA;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); // x=F0, y=CC, z=AA // 0xCA = ((CC⊻AA)∧F0)⊻AA - return result; - #else - */ return (((y ^ z) & x) ^ z); -// #endif +__device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(int j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + 
w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for(int j=0; j<8; j++) { + A[j] = R[j]; + } } - - -__device__ __forceinline__ -static uint32_t MAJ(const uint32_t x, const uint32_t y, const uint32_t z){ - -#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 - uint32_t result; - asm("lop3.b32 %0, %1, %2, %3, 0xE8;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); // x=AA, y=CC, z=F0 // 0xCA = ((CC⊻AA)∧F0)⊻AA - return result; -#else - return ((z &y) | ((z | y) & x)); -#endif +__device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \ - ((x)%7) == 1 ? 6 : \ - ((x)%7) == 2 ? 2 : \ - ((x)%7) == 3 ? 3 : \ - ((x)%7) == 4 ? 5 : \ - ((x)%7) == 5 ? 
7 : 4 ) - -__device__ __forceinline__ -static void STEP8_IF(const uint32_t *w, const uint32_t i, const uint32_t r, const uint32_t s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) +__device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) { + uint32_t temp; uint32_t R[8]; - #pragma unroll 8 - for (int j = 0; j<8; j++) + for (int j = 0; j<8; j++) { R[j] = ROTL32(A[j], r); - - uint32_t W[8]; - *(uint2x4*)&W[0] = *(uint2x4*)&w[0]; + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; #pragma unroll 8 - for (int j = 0; j<8; j++) - D[j] += W[j] + IF(A[j], B[j], C[j]); + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; #pragma unroll 8 - for (int j = 0; j<8; j++) - D[j] = R[j^p8_xor(i)] + ROTL32(D[j], s); + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; #pragma unroll 8 - for (int j = 0; j<8; j++) + for (int j = 0; j<8; j++) { A[j] = R[j]; + } } - -__device__ __forceinline__ -static void STEP8_MAJ(const uint32_t *w, const uint32_t i, const uint32_t r, const uint32_t s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) +__device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) { + uint32_t temp; uint32_t R[8]; - - uint32_t W[8]; - *(uint2x4*)&W[0] = *(uint2x4*)&w[0]; - #pragma unroll 8 - for (int j = 0; j<8; j++) + for (int j = 0; j<8; j++) { R[j] = ROTL32(A[j], r); - + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; 
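Each hand-unrolled STEP8_IF_k / STEP8_MAJ_k in this header differs only in which rotated lane R[j ^ d] is added into D[j]; the XOR distance d follows the p8_xor(k) pattern of the macro removed just above (1, 6, 2, 3, 5, 7, 4 for k mod 7 = 0..6). A compact generic form, equivalent to the unrolled bodies and shown only to make that pattern explicit; this is an editorial sketch that assumes the ROTL32, IF and MAJ definitions already present in these files:

__device__ __forceinline__
static void STEP8_generic(const uint32_t *w, const int step, const int r, const int s,
	const bool use_if, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
{
	const uint8_t xdist[7] = { 1, 6, 2, 3, 5, 7, 4 };  /* p8_xor(step % 7) */
	uint32_t R[8];
	#pragma unroll 8
	for (int j = 0; j < 8; j++)
		R[j] = ROTL32(A[j], r);
	#pragma unroll 8
	for (int j = 0; j < 8; j++) {
		const uint32_t f = use_if ? IF(A[j], B[j], C[j]) : MAJ(A[j], B[j], C[j]);
		D[j] = ROTL32(D[j] + w[j] + f, s) + R[j ^ xdist[step % 7]];
	}
	#pragma unroll 8
	for (int j = 0; j < 8; j++)
		A[j] = R[j];
}

The unrolled variants presumably trade this table lookup and the use_if branch for straight-line code, which is the usual reason for this kind of expansion in register-bound CUDA kernels.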
#pragma unroll 8 - for (int j = 0; j<8; j++) - D[j] += W[j] + MAJ(A[j], B[j], C[j]); + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; #pragma unroll 8 - for (int j = 0; j<8; j++) - D[j] = R[j^p8_xor(i)] + ROTL32(D[j], s); + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; #pragma unroll 8 - for (int j = 0; j<8; j++) + for (int j = 0; j<8; j++) { A[j] = R[j]; + } } - -static __constant__ uint32_t d_cw[4][8][8] = { - 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, - 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, - 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, - 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3, - 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, - 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, - 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, - 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80, - 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, - 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, - 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, - 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 
0x949A6B66, 0x303DCFC3, 0x5932A6CE, - 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, - 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, - 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, - 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D - -}; - -__device__ __forceinline__ -static void Round8_0_final(uint32_t* A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ - - STEP8_IF(d_cw[0][0], 0, r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF(d_cw[0][1], 1, s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF(d_cw[0][2], 2, t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF(d_cw[0][3], 3, u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ(d_cw[0][4], 4, r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ(d_cw[0][5], 5, s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ(d_cw[0][6], 6, t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ(d_cw[0][7], 7, u, r, &A[8], &A[16], &A[24], A); +__device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -__device__ __forceinline__ -static void Round8_1_final(uint32_t* A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ - - STEP8_IF(d_cw[1][0], 8, r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF(d_cw[1][1], 9, s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF(d_cw[1][2], 10, t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF(d_cw[1][3], 11, u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ(d_cw[1][4], 12, r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ(d_cw[1][5], 13, s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ(d_cw[1][6], 14, t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ(d_cw[1][7], 15, u, r, &A[8], &A[16], &A[24], A); +__device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + 
temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -__device__ __forceinline__ -static void Round8_2_final(uint32_t* A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ - - STEP8_IF(d_cw[2][0], 16, r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF(d_cw[2][1], 17, s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF(d_cw[2][2], 18, t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF(d_cw[2][3], 19, u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ(d_cw[2][4], 20, r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ(d_cw[2][5], 21, s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ(d_cw[2][6], 22, t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ(d_cw[2][7], 23, u, r, &A[8], &A[16], &A[24], A); +__device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -__device__ __forceinline__ -static void Round8_3_final(uint32_t* A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ - - STEP8_IF(d_cw[3][0], 24, r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF(d_cw[3][1], 25, s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF(d_cw[3][2], 26, t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF(d_cw[3][3], 27, u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ(d_cw[3][4], 28, r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ(d_cw[3][5], 29, s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ(d_cw[3][6], 30, t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ(d_cw[3][7], 31, u, r, &A[8], &A[16], &A[24], A); +__device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + 
D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } +__device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ -//#define expanded_vector(x) __ldg(&g_fft4[x]) -static __device__ __forceinline__ void expanded_vector(uint32_t* w, const uint4* ptr){ - asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(w[0]), "=r"(w[1]), "=r"(w[2]), "=r"(w[3]) : __LDG_PTR(ptr)); + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -__device__ __forceinline__ -static void Round8(uint32_t* A, const uint32_t thr_offset, const uint4 *const __restrict__ g_fft4) { - - uint32_t w[8]; - uint32_t tmp = thr_offset; - - uint32_t r = 3, s = 23, t = 17, u = 27; - - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 0, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 1, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 2, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 3, u, r, &A[8], &A[16], &A[24], A); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 4, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 5, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 6, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 7, u, r, &A[8], &A[16], &A[24], A); - - r = 28; s = 19; t = 22; u = 7; - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 8, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 9, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 10, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 11, u, r, &A[8], &A[16], &A[24], A); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 12, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 13, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], 
&g_fft4[tmp++]); - STEP8_MAJ(w, 14, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 15, u, r, &A[8], &A[16], &A[24], A); - - r = 29; s = 9; t = 15; u = 5; - - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 16, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 17, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 18, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 19, u, r, &A[8], &A[16], &A[24], A); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 20, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 21, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 22, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 23, u, r, &A[8], &A[16], &A[24], A); - - r = 4; s = 13; t = 10; u = 25; - - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 24, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 25, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 26, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 27, u, r, &A[8], &A[16], &A[24], A); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 28, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 29, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 30, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 31, u, r, &A[8], &A[16], &A[24], A); - +__device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -/********************* Message expansion ************************/ - -/* -* Reduce modulo 257; result is in [-127; 383] -* REDUCE(x) := 
(x&255) - (x>>8) -*/ -#define REDUCE(x) \ - (((x)&255) - ((x)>>8)) - -/* -* Reduce from [-127; 383] to [-128; 128] -* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 -*/ -#define EXTRA_REDUCE_S(x) \ - ((x)<=128 ? (x) : (x)-257) - -/* -* Reduce modulo 257; result is in [-128; 128] -*/ -#define REDUCE_FULL_S(x) \ - EXTRA_REDUCE_S(REDUCE(x)) - -// Parallelization: -// -// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64) -// and 1 time 16-fach parallel (in FFT_128_full) -// -// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations - -/** -* FFT_8 using w=4 as 8th root of unity -* Unrolled decimation in frequency (DIF) radix-2 NTT. -* Output data is in revbin_permuted order. -*/ -__device__ __forceinline__ -static void FFT_8(int *y, const uint8_t stripe){ - -#define BUTTERFLY(i,j,n) \ -do { \ - int u= y[stripe*i]; \ - int v= y[stripe*j]; \ - y[stripe*i] = u+v; \ - y[stripe*j] = (u-v) << (n<<1); \ -} while(0) - - BUTTERFLY(0, 4, 0); - BUTTERFLY(1, 5, 1); - BUTTERFLY(2, 6, 2); - BUTTERFLY(3, 7, 3); - - y[stripe * 6] = REDUCE(y[stripe * 6]); - y[stripe * 7] = REDUCE(y[stripe * 7]); - - BUTTERFLY(0, 2, 0); - BUTTERFLY(4, 6, 0); - BUTTERFLY(1, 3, 2); - BUTTERFLY(5, 7, 2); - - y[stripe * 7] = REDUCE(y[stripe * 7]); - - BUTTERFLY(0, 1, 0); - BUTTERFLY(2, 3, 0); - BUTTERFLY(4, 5, 0); - BUTTERFLY(6, 7, 0); - - y[0] = REDUCE(y[0]); - y[stripe] = REDUCE(y[stripe]); - y[stripe << 1] = REDUCE(y[stripe << 1]); - y[stripe * 3] = REDUCE(y[stripe * 3]); - y[stripe << 2] = REDUCE(y[stripe << 2]); - y[stripe * 5] = REDUCE(y[stripe * 5]); - y[stripe * 6] = REDUCE(y[stripe * 6]); - y[stripe * 7] = REDUCE(y[stripe * 7]); - - y[0] = EXTRA_REDUCE_S(y[0]); - y[stripe] = EXTRA_REDUCE_S(y[stripe]); - y[stripe << 1] = EXTRA_REDUCE_S(y[stripe << 1]); - y[stripe * 3] = EXTRA_REDUCE_S(y[stripe * 3]); - y[stripe << 2] = EXTRA_REDUCE_S(y[stripe << 2]); - y[stripe * 5] = EXTRA_REDUCE_S(y[stripe * 5]); - y[stripe * 6] = EXTRA_REDUCE_S(y[stripe * 6]); - y[stripe * 7] = EXTRA_REDUCE_S(y[stripe * 7]); - -#undef BUTTERFLY +__device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -/** -* FFT_16 using w=2 as 16th root of unity -* Unrolled decimation in frequency (DIF) radix-2 NTT. -* Output data is in revbin_permuted order. 
-*/ -__device__ __forceinline__ -static void FFT_16(int *y){ - -#define DO_REDUCE_FULL_S(i) \ - do { \ - y[i] = REDUCE(y[i]); \ - y[i] = EXTRA_REDUCE_S(y[i]); \ - } while(0) - - int u, v; - - const uint8_t thr = threadIdx.x & 7; - - u = y[0]; // 0..7 - v = y[1]; // 8..15 - y[0] = u + v; - y[1] = (u - v) << (thr); - - if ((thr) >= 3) y[1] = REDUCE(y[1]); // 11...15 - - u = __shfl(y[0], (threadIdx.x & 3), 8); // 0,1,2,3 0,1,2,3 - v = __shfl(y[0], 4 + (threadIdx.x & 3), 8); // 4,5,6,7 4,5,6,7 - y[0] = ((thr) < 4) ? (u + v) : ((u - v) << ((threadIdx.x & 3) << 1)); - - u = __shfl(y[1], (threadIdx.x & 3), 8); // 8,9,10,11 8,9,10,11 - v = __shfl(y[1], 4 + (threadIdx.x & 3), 8); // 12,13,14,15 12,13,14,15 - y[1] = ((thr) < 4) ? (u + v) : ((u - v) << ((threadIdx.x & 3) << 1)); - - if ((threadIdx.x & 1) && (thr >= 4)) { - y[0] = REDUCE(y[0]); // 5, 7 - y[1] = REDUCE(y[1]); // 13, 15 +__device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + 
D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], 
C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void 
STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] 
+ w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) 
+ R[2]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ 
__forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} - u = __shfl(y[0], (threadIdx.x & 5), 8); // 0,1,0,1 4,5,4,5 - v = __shfl(y[0], 2 + (threadIdx.x & 5), 8); // 2,3,2,3 6,7,6,7 - y[0] = ((threadIdx.x & 3) < 2) ? (u + v) : ((u - v) << ((threadIdx.x & 1) << 2)); +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw0[8][8] = { +#else +static __constant__ uint32_t d_cw0[8][8]; +static const uint32_t h_cw0[8][8] = { +#endif + 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, + 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, + 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, + 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, + 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, + 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, + 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, + 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 +}; - u = __shfl(y[1], (threadIdx.x & 5), 8); // 8,9,8,9 12,13,12,13 - v = __shfl(y[1], 2 + (threadIdx.x & 5), 8); // 10,11,10,11 14,15,14,15 - y[1] = ((threadIdx.x & 3) < 2) ? 
(u + v) : ((u - v) << ((threadIdx.x & 1) << 2)); +__device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_3(d_cw0[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_4(d_cw0[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_5(d_cw0[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); +} - u = __shfl(y[0], (threadIdx.x & 6), 8); // 0,0,2,2 4,4,6,6 - v = __shfl(y[0], 1 + (threadIdx.x & 6), 8); // 1,1,3,3 5,5,7,7 - y[0] = ((threadIdx.x & 1) < 1) ? (u + v) : (u - v); +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw1[8][8] = { +#else +static __constant__ uint32_t d_cw1[8][8]; +static const uint32_t h_cw1[8][8] = { +#endif + 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, + 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, + 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, + 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, + 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, + 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, + 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, + 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 +}; - u = __shfl(y[1], (threadIdx.x & 6), 8); // 8,8,10,10 12,12,14,14 - v = __shfl(y[1], 1 + (threadIdx.x & 6), 8); // 9,9,11,11 13,13,15,15 - y[1] = ((threadIdx.x & 1) < 1) ? 
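When DEVICE_DIRECT_CONSTANTS is not defined, the d_cw* tables are declared empty in constant memory and the initializers live in host-side h_cw* arrays, so they have to be uploaded once per device before the final-round kernels run. A minimal sketch of that upload, assuming it is called from the existing per-GPU init path (the function name below is illustrative, and d_cw2/d_cw3 are the matching tables declared further down):

// Sketch: one-time upload of the final-round message words to constant memory.
// x11_simd512_upload_cw() is a hypothetical name for wherever the init code does this.
__host__ void x11_simd512_upload_cw()
{
#ifndef DEVICE_DIRECT_CONSTANTS
	cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
#endif
}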
(u + v) : (u - v); +__device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_11(d_cw1[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_12(d_cw1[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_13(d_cw1[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); +} - DO_REDUCE_FULL_S(0); // 0...7 - DO_REDUCE_FULL_S(1); // 8...15 +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw2[8][8] = { +#else +static __constant__ uint32_t d_cw2[8][8]; +static const uint32_t h_cw2[8][8] = { +#endif + 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, + 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, + 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, + 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, + 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, + 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, + 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, + 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE +}; -#undef DO_REDUCE_FULL_S +__device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_19(d_cw2[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_20(d_cw2[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_21(d_cw2[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); } -/***************************************************/ -#if __CUDA_ARCH__ > 500 -__global__ __launch_bounds__(TPB52_1, 9) +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw3[8][8] = { #else -__global__ __launch_bounds__(TPB50_1, 9) +static __constant__ uint32_t d_cw3[8][8]; +static const uint32_t h_cw3[8][8] = { #endif -static void x11_simd512_gpu_expand_64(int *thr_id, uint32_t threads, const uint32_t* __restrict__ g_hash, uint4 *g_temp4) -{ - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) - return; - const uint32_t threadBloc = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; - const uint8_t thr = (threadIdx.x & 7); - /* Message Expansion using Number Theoretical Transform similar to FFT */ - int expanded[32]; + 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, + 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, + 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, + 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, + 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, + 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, + 0x975568AB, 
0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, + 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D +}; - uint4 vec0; - int P, Q, P1, Q1, P2, Q2; +__device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_27(d_cw3[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_28(d_cw3[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_29(d_cw3[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_30(d_cw3[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A); +} - const bool even = (threadIdx.x & 1) == 0; - const bool hi = (thr) >= 4; - const bool lo = (thr)<4; - const bool sel = ((threadIdx.x + 2) & 7) >= 4; // 2,3,4,5 +#if __CUDA_ARCH__ < 350 +#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) +#else +//#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) +#define expanded_vector(x) __ldg(&g_fft4[x]) +#endif - if (threadBloc < threads){ +__device__ __forceinline__ void Round8_0(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 0 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_0(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_1(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_2(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_3(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_4(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_5(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_6(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_7(w, u, r, &A[8], &A[16], &A[24], A); - const uint32_t hashPosition = threadBloc << 4; - const uint32_t *inpHash = &g_hash[hashPosition]; +} +__device__ __forceinline__ void Round8_1(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 16 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + 
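Each STEP8 consumes eight 32-bit message words, which the Round8_* helpers fetch as two consecutive uint4 vectors through the expanded_vector() macro above (a texture fetch before sm_35, __ldg afterwards). The repeated load-and-unpack pairs boil down to the small helper sketched below; it is not part of the patch, just the same pattern factored out:

// Sketch: fetch one 8-word group (two uint4) of the NTT-expanded message.
// expanded_vector() and the g_fft4 layout are taken from the code above.
__device__ __forceinline__
void fetch_w8(uint32_t w[8], int &tmp, const uint4 *g_fft4)
{
	const uint4 hv1 = expanded_vector(tmp++);   // words 0..3
	const uint4 hv2 = expanded_vector(tmp++);   // words 4..7
	w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
	w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
}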
hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_8(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_9(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_10(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_11(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_12(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_13(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_14(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_15(w, u, r, &A[8], &A[16], &A[24], A); - const uint32_t data0 = __ldg(&inpHash[thr]); - const uint32_t data1 = __ldg(&inpHash[thr + 8]); - // Puffer fur expandierte Nachricht - uint4 *temp4 = &g_temp4[hashPosition << 2]; +} +__device__ __forceinline__ void Round8_2(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 32 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_16(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_17(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_18(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_19(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_20(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_21(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = 
expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_22(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_23(w, u, r, &A[8], &A[16], &A[24], A); -#pragma unroll 4 - for (uint32_t i = 0; i < 4; i++) { - expanded[i] = bfe(__byte_perm(__shfl(data0, i << 1, 8), __shfl(data0, (i << 1) + 1, 8), thr), 0, 8); - } -#pragma unroll 4 - for (uint32_t i = 0; i < 4; i++) { - expanded[4 + i] = bfe(__byte_perm(__shfl(data1, i << 1, 8), __shfl(data1, (i << 1) + 1, 8), thr), 0, 8); - } -#pragma unroll 8 - for (uint32_t i = 8; i < 16; i++) { - expanded[i] = 0; - } - /* - * FFT_256 using w=41 as 256th root of unity. Decimation in frequency (DIF) NTT. Output data is in revbin_permuted order. In place. - */ -#pragma unroll 8 - for (uint32_t i = 0; i<8; i++) - expanded[16 + i] = REDUCE(expanded[i] * c_FFT256_2_128_Twiddle[8 * i + (thr)]); -#pragma unroll 8 - for (uint32_t i = 24; i < 32; i++) { - expanded[i] = 0; - } - /* handle X^255 with an additional butterfly */ - if (thr == 7){ - expanded[15] = 1; - expanded[31] = REDUCE((-1) * c_FFT256_2_128_Twiddle[127]); - } +} +__device__ __forceinline__ void Round8_3(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 48 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_24(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_25(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_26(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_27(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_28(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_29(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_30(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_31(w, u, r, &A[8], &A[16], &A[24], A); - // FFT_128_full(expanded); - FFT_8(expanded, 2); // eight parallel FFT8's - FFT_8(&expanded[16], 2); // eight parallel FFT8's - FFT_8(&expanded[1], 2); // eight parallel FFT8's - FFT_8(&expanded[17], 2); // eight parallel FFT8's -#pragma unroll 16 - for (uint32_t i = 0; 
i<16; i++){ - expanded[i] = REDUCE(expanded[i] * c_FFT128_8_16_Twiddle[i * 8 + (thr)]); - expanded[i + 16] = REDUCE(expanded[i + 16] * c_FFT128_8_16_Twiddle[i * 8 + (thr)]); - } +} +__device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, const uint32_t *M, uint4 *g_fft4) { + int i; + const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) #pragma unroll 8 - for (uint32_t i = 0; i<8; i++){ - FFT_16(expanded + (i << 1)); // eight sequential FFT16's, each one executed in parallel by 8 threads - FFT_16(expanded + 16 + (i << 1)); // eight sequential FFT16's, each one executed in parallel by 8 threads - } - - // store w matrices in global memory - P1 = expanded[0]; P2 = __shfl(expanded[2], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); - P1 = expanded[8]; P2 = __shfl(expanded[10], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); - P1 = expanded[4]; P2 = __shfl(expanded[6], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); - P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); - temp4[thr] = vec0; - - P1 = expanded[1]; P2 = __shfl(expanded[3], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); - P1 = expanded[9]; P2 = __shfl(expanded[11], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); - P1 = expanded[5]; P2 = __shfl(expanded[7], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); - P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); - temp4[8 + (thr)] = vec0; - - P1 = hi ? expanded[1] : expanded[0]; P2 = __shfl(hi ? expanded[3] : expanded[2], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = hi ? expanded[17] : expanded[16]; Q2 = __shfl(hi ? expanded[19] : expanded[18], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); - P1 = hi ? expanded[9] : expanded[8]; P2 = __shfl(hi ? expanded[11] : expanded[10], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = hi ? expanded[25] : expanded[24]; Q2 = __shfl(hi ? expanded[27] : expanded[26], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); - P1 = hi ? 
expanded[5] : expanded[4]; P2 = __shfl(hi ? expanded[7] : expanded[6], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = hi ? expanded[21] : expanded[20]; Q2 = __shfl(hi ? expanded[23] : expanded[22], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); - P1 = hi ? expanded[13] : expanded[12]; P2 = __shfl(hi ? expanded[15] : expanded[14], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = hi ? expanded[29] : expanded[28]; Q2 = __shfl(hi ? expanded[31] : expanded[30], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); - temp4[16 + (thr)] = vec0; - - P1 = lo ? expanded[1] : expanded[0]; P2 = __shfl(lo ? expanded[3] : expanded[2], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = lo ? expanded[17] : expanded[16]; Q2 = __shfl(lo ? expanded[19] : expanded[18], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); - P1 = lo ? expanded[9] : expanded[8]; P2 = __shfl(lo ? expanded[11] : expanded[10], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = lo ? expanded[25] : expanded[24]; Q2 = __shfl(lo ? expanded[27] : expanded[26], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); - P1 = lo ? expanded[5] : expanded[4]; P2 = __shfl(lo ? expanded[7] : expanded[6], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = lo ? expanded[21] : expanded[20]; Q2 = __shfl(lo ? expanded[23] : expanded[22], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); - P1 = lo ? expanded[13] : expanded[12]; P2 = __shfl(lo ? expanded[15] : expanded[14], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = lo ? expanded[29] : expanded[28]; Q2 = __shfl(lo ? expanded[31] : expanded[30], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); - temp4[24 + (thr)] = vec0; - - P1 = sel ? expanded[0] : expanded[1]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[2] : expanded[3]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); - P1 = sel ? expanded[8] : expanded[9]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[10] : expanded[11]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); - P1 = sel ? expanded[4] : expanded[5]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[6] : expanded[7]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); - P1 = sel ? expanded[12] : expanded[13]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[14] : expanded[15]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); - - temp4[32 + thr] = vec0; - - P1 = sel ? expanded[1] : expanded[0]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[3] : expanded[2]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? 
Q1 : Q2; - vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); - P1 = sel ? expanded[9] : expanded[8]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[11] : expanded[10]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); - P1 = sel ? expanded[5] : expanded[4]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[7] : expanded[6]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); - P1 = sel ? expanded[13] : expanded[12]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[15] : expanded[14]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); - - temp4[40 + thr] = vec0; + for(i=0; i<8; i++) { + A[i] ^= M[i]; + (&A[8])[i] ^= M[8+i]; + } + Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4); + Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4); +} - uint32_t t; - t = __shfl(expanded[17], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[16]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[19], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[18]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); - t = __shfl(expanded[25], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[24]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[27], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[26]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); - t = __shfl(expanded[21], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[20]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[23], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[22]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); - t = __shfl(expanded[29], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[28]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[31], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[30]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? 
Q1 : Q2; - vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); +__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) { + uint32_t A[32]; + int i; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = c_IV_512[i]; + uint32_t buffer[16]; +#pragma unroll 16 + for (i=0; i < 16; i++) buffer[i] = hashval[i]; + SIMD_Compress1(A, texture_id, buffer, g_fft4); + uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; +} - temp4[48 + thr] = vec0; +__device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, uint4 *g_fft4) { + uint32_t IV[4][8]; + int i; + const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) +#pragma unroll 8 + for(i=0; i<8; i++) { + IV[0][i] = c_IV_512[i]; + IV[1][i] = c_IV_512[8+i]; + IV[2][i] = c_IV_512[16+i]; + IV[3][i] = c_IV_512[24+i]; + } + Round8_2(A, thr_offset, 29, 9, 15, 5, g_fft4); + Round8_3(A, thr_offset, 4, 13, 10, 25, g_fft4); + STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); + STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); + STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); + STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); +} - t = __shfl(expanded[16], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[17] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[18], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[19] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); - t = __shfl(expanded[24], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[25] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[26], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[27] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); - t = __shfl(expanded[20], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[21] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[22], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[23] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); - t = __shfl(expanded[28], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[29] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[30], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[31] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? 
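Compression1 and SIMD_Compress2 park the 32-word SIMD state in global memory between launches using a block-strided layout: word i of every thread in a block is stored contiguously, so the per-word loads and stores coalesce. The indexing reduces to the sketch below, with names mirroring the code above:

// Sketch: block-strided addressing for the spilled 32-word state.
// Word i of a thread lives at base[threadIdx.x + blockDim.x * i],
// so a warp touching word i reads or writes one contiguous segment.
__device__ __forceinline__
uint32_t* simd_state_base(uint32_t *g_state)
{
	return &g_state[blockIdx.x * (blockDim.x * 32)];   // 32 words per thread, grouped per block
}

__device__ __forceinline__
void simd_state_store(uint32_t *g_state, const uint32_t A[32])
{
	uint32_t *state = simd_state_base(g_state);
#pragma unroll 32
	for (int i = 0; i < 32; i++)
		state[threadIdx.x + blockDim.x * i] = A[i];
}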
Q1 : Q2; - vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); +__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, uint32_t *g_state) { + uint32_t A[32]; + int i; + uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; + SIMD_Compress2(A, texture_id, g_fft4); +#pragma unroll 32 + for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; +} - temp4[56 + thr] = vec0; +__device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t *M) { + uint32_t IV[4][8]; + int i; +#pragma unroll 8 + for(i=0; i<8; i++) { + IV[0][i] = A[i]; + IV[1][i] = (&A[8])[i]; + IV[2][i] = (&A[16])[i]; + IV[3][i] = (&A[24])[i]; + } +#pragma unroll 8 + for(i=0; i<8; i++) { + A[i] ^= M[i]; + (&A[8])[i] ^= M[8+i]; } + Round8_0_final(A, 3, 23, 17, 27); + Round8_1_final(A, 28, 19, 22, 7); + Round8_2_final(A, 29, 9, 15, 5); + Round8_3_final(A, 4, 13, 10, 25); + STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); + STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); + STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); + STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); +} + +__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) { + uint32_t A[32]; + int i; + uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; + uint32_t buffer[16]; + buffer[0] = 512; +#pragma unroll 15 + for (i=1; i < 16; i++) buffer[i] = 0; + SIMD_Compress_Final(A, buffer); +#pragma unroll 16 + for (i=0; i < 16; i++) + hashval[i] = A[i]; } diff --git a/x11/cuda_x11_simd512_sm2.cuh b/x11/cuda_x11_simd512_sm2.cuh index 7abbac163e..34041e4bc1 100644 --- a/x11/cuda_x11_simd512_sm2.cuh +++ b/x11/cuda_x11_simd512_sm2.cuh @@ -532,7 +532,7 @@ void SIMDHash(const uint32_t *data, uint32_t *hashval) /***************************************************/ __global__ -void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x11_simd512_gpu_hash_64_sm2(int *thr_id, const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -547,7 +547,7 @@ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNou } #else -__global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} +__global__ void x11_simd512_gpu_hash_64_sm2(int *thr_id, const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} #endif /* __CUDA_ARCH__ < 300 */ __host__ @@ -561,7 +561,7 @@ static void x11_simd512_cpu_init_sm2(int thr_id) } __host__ -static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +static void x11_simd512_cpu_hash_64_sm2(int *thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const int threadsperblock = 256; @@ -570,6 +570,6 @@ static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t s size_t shared_size = 0; - x11_simd512_gpu_hash_64_sm2<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + x11_simd512_gpu_hash_64_sm2 << > 
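Final() runs one last compression over a block that carries only the message length: the chained x11 input is 64 bytes, i.e. 512 bits, so buffer[0] = 512 and the other fifteen words are zero, which matches SIMD-512's convention of folding the bit length into the final block. As a tiny sketch:

// Sketch: the length-only block consumed by SIMD_Compress_Final via Final() above.
__device__ __forceinline__
void simd_length_block(uint32_t buffer[16])
{
	buffer[0] = 512;        // message length in bits (64-byte chained hash)
#pragma unroll 15
	for (int i = 1; i < 16; i++)
		buffer[i] = 0;
}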
>(thr_id, threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + MyStreamSynchronize(NULL, order, ((uintptr_t)thr_id) & 15); } diff --git a/x13/cuda_x13_fugue512_alexis.cu b/x13/cuda_x13_fugue512_alexis.cu index e35cd48ad0..e5fae9ec48 100644 --- a/x13/cuda_x13_fugue512_alexis.cu +++ b/x13/cuda_x13_fugue512_alexis.cu @@ -245,11 +245,11 @@ static void SMIX_LDG(const uint32_t shared[4][256], uint32_t &x0,uint32_t &x1,ui __global__ __launch_bounds__(256,3) void x13_fugue512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; __shared__ uint32_t shared[4][256]; - // if(threadIdx.x<256){ +// if(threadIdx.x<256){ const uint32_t tmp = mixtab0[threadIdx.x]; shared[0][threadIdx.x] = tmp; shared[1][threadIdx.x] = ROR8(tmp); @@ -405,18 +405,18 @@ void x13_fugue512_gpu_hash_64_final_alexis(uint32_t threads,const uint32_t* __re resNonce[1] = tmp; } } -} +} __host__ -void x13_fugue512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_hash){ - +void x13_fugue512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_hash) +{ const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x13_fugue512_gpu_hash_64_alexis << > >(thr_id, threads, (uint64_t*)d_hash); + x13_fugue512_gpu_hash_64_alexis<<>>(thr_id, threads, (uint64_t*)d_hash); } __host__ diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu index ae796d2ce5..30216e40e9 100644 --- a/x13/cuda_x13_hamsi512.cu +++ b/x13/cuda_x13_hamsi512.cu @@ -318,7 +318,7 @@ static const uint32_t T512[64][16] = { }; __global__ -void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x13_hamsi512_gpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -416,14 +416,14 @@ void x13_hamsi512_cpu_init(int thr_id, uint32_t threads) } __host__ -void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x13_hamsi512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x13_hamsi512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x13_hamsi512_gpu_hash_64<<>>(thr_id, threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } @@ -436,11 +436,8 @@ void x16_hamsi512_setBlock_80(void *pdata) } __global__ -void x16_hamsi512_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -551,5 +548,5 @@ void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_ dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - x16_hamsi512_gpu_hash_80 << > > (thr_id, 
threads, startNounce, (uint64_t*)d_hash); + x16_hamsi512_gpu_hash_80 <<>> (threads, startNounce, (uint64_t*)d_hash); } diff --git a/x13/cuda_x13_hamsi512_alexis.cu b/x13/cuda_x13_hamsi512_alexis.cu index f6d0f249df..e0641e22cc 100644 --- a/x13/cuda_x13_hamsi512_alexis.cu +++ b/x13/cuda_x13_hamsi512_alexis.cu @@ -177,7 +177,7 @@ static __constant__ const uint32_t d_T512[1024] = { __global__ __launch_bounds__(384,2) void x13_hamsi512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -288,7 +288,7 @@ void x13_hamsi512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *g_ } } -__host__ +__host__ void x13_hamsi512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_hash) { const uint32_t threadsperblock = 384; @@ -296,6 +296,6 @@ void x13_hamsi512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_ dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x13_hamsi512_gpu_hash_64_alexis << > >(thr_id, threads, d_hash); + x13_hamsi512_gpu_hash_64_alexis<<>>(thr_id, threads, d_hash); } diff --git a/x15/cuda_x14_shabal512.cu b/x15/cuda_x14_shabal512.cu index 43c5ebf1ca..fec59deac7 100644 --- a/x15/cuda_x14_shabal512.cu +++ b/x15/cuda_x14_shabal512.cu @@ -361,7 +361,7 @@ static const uint32_t d_C512[] = { /***************************************************/ // GPU Hash Function -__global__ void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ void x14_shabal512_gpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { __syncthreads(); @@ -458,7 +458,7 @@ __host__ void x14_shabal512_cpu_init(int thr_id, uint32_t threads) } // #include -__host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x14_shabal512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 256; @@ -470,6 +470,6 @@ __host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - x14_shabal512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x14_shabal512_gpu_hash_64<<>>(thr_id, threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x15/cuda_x14_shabal512_alexis.cu b/x15/cuda_x14_shabal512_alexis.cu index 84d36234be..69ef2a5dff 100644 --- a/x15/cuda_x14_shabal512_alexis.cu +++ b/x15/cuda_x14_shabal512_alexis.cu @@ -105,7 +105,7 @@ void ROTATE(uint32_t* A){ __global__ __launch_bounds__(384,3) void x14_shabal512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -177,7 +177,7 @@ __host__ void x14_shabal512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, ui dim3 grid((threads + 
threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x14_shabal512_gpu_hash_64_alexis << > >(thr_id, threads, d_hash); + x14_shabal512_gpu_hash_64_alexis<<>>(thr_id, threads, d_hash); } __global__ __launch_bounds__(512,2) diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu index 827f8ecbf7..79d67f0f37 100644 --- a/x15/cuda_x15_whirlpool.cu +++ b/x15/cuda_x15_whirlpool.cu @@ -41,8 +41,9 @@ extern "C" { #include } -#include "cuda_helper_alexis.h" -#include "cuda_vectors_alexis.h" +#include +#include +#include #define xor3x(a,b,c) (a^b^c) @@ -620,7 +621,7 @@ __global__ __launch_bounds__(TPB64,2) void x15_whirlpool_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; __shared__ uint2 sharedMemory[7][256]; @@ -735,7 +736,7 @@ static void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_ x15_whirlpool_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash); } -*/ +*/ __host__ void x15_whirlpool_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) { diff --git a/x15/cuda_x15_whirlpool_sm3.cu b/x15/cuda_x15_whirlpool_sm3.cu index 1251f5cdc8..f99c7afca1 100644 --- a/x15/cuda_x15_whirlpool_sm3.cu +++ b/x15/cuda_x15_whirlpool_sm3.cu @@ -2000,11 +2000,8 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int __global__ -void oldwhirlpool_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab) +void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - __shared__ uint64_t sharedMemory[2048]; if (threadIdx.x < 256) { @@ -2100,7 +2097,7 @@ void oldwhirlpool_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t } __global__ -void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x15_whirlpool_gpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { __shared__ uint64_t sharedMemory[2048]; @@ -2309,12 +2306,12 @@ void whirlpool512_free_sm3(int thr_id) } __host__ -void whirlpool512_hash_64_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void whirlpool512_hash_64_sm3(int *thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 block(threadsperblock); - x15_whirlpool_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x15_whirlpool_gpu_hash_64 <<>> (thr_id, threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } @@ -2347,7 +2344,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce, if (threads < 256) applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!"); - oldwhirlpool_gpu_hash_80 << > >(thr_id, threads, startNonce, d_outputHash, 1); + oldwhirlpool_gpu_hash_80<<>>(threads, startNonce, d_outputHash, 1); } extern void whirl_midstate(void *state, const void *input); @@ -2420,5 +2417,5 @@ void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t if (threads < 256) applog(LOG_WARNING, "whirlpool requires a minimum of 
256 threads to fetch constant tables!"); - oldwhirlpool_gpu_hash_80 << > > (thr_id, threads, startNonce, d_outputHash, 1); + oldwhirlpool_gpu_hash_80 <<>> (threads, startNonce, d_outputHash, 1); } diff --git a/x16/x16s.cu b/x16/x16s.cu index 5c555ad0c9..36aeacbc21 100644 --- a/x16/x16s.cu +++ b/x16/x16s.cu @@ -31,10 +31,7 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" -//#include "cuda_x16.h" - -#include "../x16r/cuda_x16r.h" // todo, re-unify these like core ccminer is. - +#include "cuda_x16.h" static uint32_t *d_hash[MAX_GPUS]; @@ -237,15 +234,9 @@ static bool use_compat_kernels[MAX_GPUS] = { 0 }; //#define _DEBUG #define _DEBUG_PREFIX "x16s-" #include "cuda_debug.cuh" -/* -static int algo80_tests[HASH_FUNC_COUNT] = { 0 }; -static int algo64_tests[HASH_FUNC_COUNT] = { 0 }; -static int algo80_fails[HASH_FUNC_COUNT] = { 0 }; -*/ + extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { - return -1; -#if 0 uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -270,30 +261,6 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput); - gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); - if (throughput2intensity(throughput) > 21) gpulog(LOG_INFO, thr_id, "SIMD throws error on malloc call, TBD if there is a fix"); - - quark_groestl512_cpu_init(thr_id, throughput); - // quark_blake512_cpu_init(thr_id, throughput); - // quark_bmw512_cpu_init(thr_id, throughput); - // quark_skein512_cpu_init(thr_id, throughput); - quark_jh512_cpu_init(thr_id, throughput); - quark_keccak512_cpu_init(thr_id, throughput); - // x11_shavite512_cpu_init(thr_id, throughput); - if (x11_simd512_cpu_init(thr_id, throughput)) - { - applog(LOG_WARNING, "SIMD was unable to initialize :( exiting..."); - exit(-1); - }// 64 - x16_echo512_cuda_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); - x16_fugue512_cpu_init(thr_id, throughput); - // x14_shabal512_cpu_init(thr_id, throughput); - x15_whirlpool_cpu_init(thr_id, throughput, 0); - x16_whirlpool512_init(thr_id, throughput); - x17_sha512_cpu_init(thr_id, throughput); - /* quark_blake512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); @@ -312,7 +279,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, x15_whirlpool_cpu_init(thr_id, throughput, 0); x16_whirlpool512_init(thr_id, throughput); x17_sha512_cpu_init(thr_id, throughput); - */ + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); cuda_check_cpu_init(thr_id, throughput); @@ -346,70 +313,57 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; switch (algo80) { - case BLAKE: - //! low impact, can do a lot to optimize quark_blake512 - quark_blake512_cpu_setBlock_80(thr_id, endiandata); - break; - case BMW: - //! low impact, painfully optimize quark_bmw512 - quark_bmw512_cpu_setBlock_80(endiandata); - break; - case GROESTL: - //! second most used algo historically - groestl512_setBlock_80(thr_id, endiandata); - break; - case JH: - //! average use, optimization tbd - jh512_setBlock_80(thr_id, endiandata); - break; - case KECCAK: - //! 
low impact - keccak512_setBlock_80(thr_id, endiandata); - break; - case SKEIN: - //! very low impact - skein512_cpu_setBlock_80((void*)endiandata); - break; - case LUFFA: - //! moderate impact (more than shavite) - qubit_luffa512_cpu_setBlock_80_alexis((void*)endiandata); - break; - case CUBEHASH: - //! moderate impact (more than shavite) - cubehash512_setBlock_80(thr_id, endiandata); - break; - case SHAVITE: - //! has been optimized fairly well - x11_shavite512_setBlock_80((void*)endiandata); - break; - case SIMD: - //! high impact optimization. -i > 21 causes error. - x16_simd512_setBlock_80((void*)endiandata); - break; - case ECHO: - //! high impact needs more optimizations - x16_echo512_setBlock_80((void*)endiandata); - break; - case HAMSI: - //! ***highest impact*** - x16_hamsi512_setBlock_80((void*)endiandata); - break; - case FUGUE: - //! very high impact! - x16_fugue512_setBlock_80((void*)pdata); - break; - case SHABAL: - //! very low impact. - x16_shabal512_setBlock_80((void*)endiandata); - break; - case WHIRLPOOL: - //! moderate impact (more than shavite by a bit) - x16_whirlpool512_setBlock_80((void*)endiandata); - break; - case SHA512: - //! second lowest impact. - x16_sha512_setBlock_80(endiandata); - break; + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + x11_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + x16_hamsi512_setBlock_80((void*)endiandata); + break; + case FUGUE: + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + x16_sha512_setBlock_80(endiandata); + break; + default: { + return -1; + } } int warn = 0; @@ -419,13 +373,13 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, // Hash with CUDA - switch (algo80) { + switch (algo80) { case BLAKE: quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; TRACE("blake80:"); break; case BMW: - quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("bmw80 :"); break; case GROESTL: @@ -441,11 +395,11 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, TRACE("kecck80:"); break; case SKEIN: - skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; TRACE("skein80:"); break; case LUFFA: - qubit_luffa512_cpu_hash_80_alexis(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("luffa80:"); break; case CUBEHASH: @@ -453,7 +407,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t 
max_nonce, TRACE("cube 80:"); break; case SHAVITE: - x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("shavite:"); break; case SIMD: @@ -493,67 +447,70 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, switch (algo64) { case BLAKE: - quark_blake512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("blake :"); break; case BMW: - quark_bmw512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("bmw :"); break; case GROESTL: - quark_groestl512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("groestl:"); break; case JH: - quark_jh512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("jh512 :"); break; case KECCAK: - quark_keccak512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("keccak :"); break; case SKEIN: - quark_skein512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("skein :"); break; case LUFFA: - x11_luffa512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("luffa :"); break; case CUBEHASH: - x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("cube :"); break; case SHAVITE: - x11_shavite512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("shavite:"); break; case SIMD: - x11_simd512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("simd :"); break; case ECHO: - x11_echo512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; TRACE("echo :"); break; case HAMSI: - x13_hamsi512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("hamsi :"); break; case FUGUE: - x13_fugue512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("fugue :"); break; case SHABAL: - x14_shabal512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("shabal :"); break; case WHIRLPOOL: - x15_whirlpool_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("shabal :"); 
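For reference, the order decoding that drives both the x16s and the x16r dispatch above boils down to walking the top 16 nibbles of the 8 bytes that follow the block version word. A minimal host-side sketch under that assumption follows (decode_x16_order is an illustrative name, not a function in this tree; prevblock stands for &pdata[1] as used in the diff):

#include <stdint.h>

// Illustrative helper (not in the patch): recover the 16-round order the same
// way the diff does, from the 8 bytes that follow the block version word.
static void decode_x16_order(const uint32_t *prevblock /* &pdata[1] */, uint8_t order[16])
{
	const uint64_t nibbles = *(const uint64_t*)prevblock; // little-endian load, as in x16r.cu
	for (int i = 0; i < 16; i++) {
		// '-' binds tighter than '>>' in C, so the patch's "nibbles >> 60 - (i * 4)"
		// already parses as nibbles >> (60 - 4*i): top nibble first, low nibble last
		order[i] = (uint8_t)((nibbles >> (60 - (i * 4))) & 0x0f);
	}
}

Each entry maps straight onto the BLAKE..SHA512 enum, so the switch statements are effectively a table lookup over these 16 values; the hex-character form used with the printable hashOrder string (elem >= 'A' ? elem - 'A' + 10 : elem - '0') selects the same digits.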
break; case SHA512: - x17_sha512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; TRACE("sha512 :"); break; } @@ -589,26 +546,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, } else { pdata[19] = work->nonces[0] + 1; // cursor } -// gpulog(LOG_INFO, thr_id, "hash found with %s 80 (%s)!", algo_strings[algo80], hashOrder); -#if 0 - gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]); - - algo80_tests[algo80] += work->valid_nonces; - char oks64[128] = { 0 }; - char oks80[128] = { 0 }; - char fails[128] = { 0 }; - for (int a = 0; a < HASH_FUNC_COUNT; a++) { - const char elem = hashOrder[a]; - const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; - if (a > 0) algo64_tests[algo64] += work->valid_nonces; - sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99); - sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99); - sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99); - } - applog(LOG_INFO, "K64: %s", oks64); - applog(LOG_INFO, "K80: %s", oks80); - applog(LOG_ERR, "F80: %s", fails); -#endif + //gpulog(LOG_INFO, thr_id, "hash found with %s 80 (%s)!", algo_strings[algo80], hashOrder); return work->valid_nonces; } else if (vhash[7] > Htarg) { @@ -637,7 +575,6 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, *hashes_done = pdata[19] - first_nonce; return 0; -#endif } // cleanup diff --git a/x16r/cuda_x16_echo512.cu b/x16r/cuda_x16_echo512.cu index 373978a213..bd1139d8df 100644 --- a/x16r/cuda_x16_echo512.cu +++ b/x16r/cuda_x16_echo512.cu @@ -297,11 +297,8 @@ void x16_echo512_setBlock_80(void *endiandata) } __global__ __launch_bounds__(128, 7) /* will force 72 registers */ -void x16_echo512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *g_hash) +void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - __shared__ uint32_t sharedMemory[1024]; // echo_gpu_init(sharedMemory); @@ -331,5 +328,5 @@ void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x16_echo512_gpu_hash_80 << > >(thr_id, threads, startNonce, (uint64_t*)d_hash); + x16_echo512_gpu_hash_80<<>>(threads, startNonce, (uint64_t*)d_hash); } diff --git a/x16r/cuda_x16_fugue512.cu b/x16r/cuda_x16_fugue512.cu index 7c8893f86b..5967087f1e 100644 --- a/x16r/cuda_x16_fugue512.cu +++ b/x16r/cuda_x16_fugue512.cu @@ -306,11 +306,8 @@ void x16_fugue512_setBlock_80(void *pdata) __global__ __launch_bounds__(TPB) -void x16_fugue512_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - __shared__ uint32_t mixtabs[1024]; // load shared mem (with 256 threads) @@ -468,5 +465,5 @@ void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_ dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x16_fugue512_gpu_hash_80 << > > (thr_id, threads, startNonce, (uint64_t*)d_hash); + x16_fugue512_gpu_hash_80 <<>> (threads, startNonce, 
(uint64_t*)d_hash); } diff --git a/x16r/cuda_x16_shabal512.cu b/x16r/cuda_x16_shabal512.cu index c037f205f5..c1d3e66ee0 100644 --- a/x16r/cuda_x16_shabal512.cu +++ b/x16r/cuda_x16_shabal512.cu @@ -241,11 +241,8 @@ void x16_shabal512_setBlock_80(void *pdata) #define TPB_SHABAL 256 __global__ __launch_bounds__(TPB_SHABAL, 2) -void x16_shabal512_gpu_hash_80(int thr_id, uint32_t threads, const uint32_t startNonce, uint32_t *g_hash) +void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t B[] = { @@ -351,5 +348,5 @@ void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32 dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - x16_shabal512_gpu_hash_80 << > >(thr_id, threads, startNonce, d_hash); + x16_shabal512_gpu_hash_80 <<>>(threads, startNonce, d_hash); } diff --git a/x16r/cuda_x16_simd512_80.cu b/x16r/cuda_x16_simd512_80.cu index 76c810c289..c9c1544062 100644 --- a/x16r/cuda_x16_simd512_80.cu +++ b/x16r/cuda_x16_simd512_80.cu @@ -1680,11 +1680,8 @@ void x16_simd512_setBlock_80(void *pdata) #define TPB_SIMD 128 __global__ __launch_bounds__(TPB_SIMD,1) -static void x16_simd512_gpu_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *g_outputhash) +static void x16_simd512_gpu_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_outputhash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -1837,5 +1834,5 @@ void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t const uint32_t tpb = 128; const dim3 grid((threads + tpb - 1) / tpb); const dim3 block(tpb); - x16_simd512_gpu_80 << > > (thr_id, threads, startNonce, (uint64_t*)d_hash); + x16_simd512_gpu_80 <<>> (threads, startNonce, (uint64_t*) d_hash); } diff --git a/x16r/x16r.cu b/x16r/x16r.cu index 0eded40125..55260dc441 100644 --- a/x16r/x16r.cu +++ b/x16r/x16r.cu @@ -1,8 +1,8 @@ /** - * X16R algorithm (X16 with Randomized chain order) - * - * tpruvot 2018 - GPL code - */ +* X16R algorithm (X16 with Randomized chain order) +* +* tpruvot 2018 - GPL code +*/ #include #include @@ -27,7 +27,7 @@ extern "C" { #include "sph/sph_shabal.h" #include "sph/sph_whirlpool.h" #include "sph/sph_sha2.h" -//extern struct work_restart *work_restart; + //extern struct work_restart *work_restart; } #include "miner.h" @@ -35,7 +35,7 @@ extern "C" { #include "cuda_x16r.h" #define GPU_HASH_CHECK_LOG 0 -static uint32_t *d_hash[MAX_GPUS+1]; +static uint32_t *d_hash[MAX_GPUS + 1]; enum Algo { BLAKE = 0, @@ -98,7 +98,7 @@ static void(*pAlgo64[16])(int*, uint32_t, uint32_t*) = x13_fugue512_cpu_hash_64_alexis, x14_shabal512_cpu_hash_64_alexis, x15_whirlpool_cpu_hash_64, - x17_sha512_cpu_hash_64 + x17_sha512_cpu_hash_64 }; static void(*pAlgo80[16])(int, uint32_t, uint32_t, uint32_t*) = { @@ -154,32 +154,32 @@ static void run_x16r_rounds(const uint32_t* prevblock, int thr_id, uint32_t thre pAlgo64[(*(uint64_t*)prevblock >> 60 - (7 * 4)) & 0x0f](thr_id, threads, d_hash, 7); pAlgo64[(*(uint64_t*)prevblock >> 60 - (8 * 4)) & 0x0f](thr_id, threads, d_hash, 8); pAlgo64[(*(uint64_t*)prevblock >> 60 - (9 * 4)) & 0x0f](thr_id, threads, d_hash, 9); - pAlgo64[(*(uint64_t*)prevblock >> 60 - (10* 4)) & 0x0f](thr_id, threads, d_hash,10); - 
pAlgo64[(*(uint64_t*)prevblock >> 60 - (11* 4)) & 0x0f](thr_id, threads, d_hash,11); - pAlgo64[(*(uint64_t*)prevblock >> 60 - (12* 4)) & 0x0f](thr_id, threads, d_hash,12); - pAlgo64[(*(uint64_t*)prevblock >> 60 - (13* 4)) & 0x0f](thr_id, threads, d_hash,13); - pAlgo64[(*(uint64_t*)prevblock >> 60 - (14* 4)) & 0x0f](thr_id, threads, d_hash,14); - pAlgo64[(*(uint64_t*)prevblock >> 60 - (15* 4)) & 0x0f](thr_id, threads, d_hash,15); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (10 * 4)) & 0x0f](thr_id, threads, d_hash, 10); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (11 * 4)) & 0x0f](thr_id, threads, d_hash, 11); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (12 * 4)) & 0x0f](thr_id, threads, d_hash, 12); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (13 * 4)) & 0x0f](thr_id, threads, d_hash, 13); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (14 * 4)) & 0x0f](thr_id, threads, d_hash, 14); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (15 * 4)) & 0x0f](thr_id, threads, d_hash, 15); } #endif static void getAlgoString(const uint32_t* prevblock, char *output) { for (int i = 0; i < 16; i++) { - *output++ = (*(uint64_t*)prevblock >> 60 - (i * 4)) & 0x0f; + *output++ = (*(uint64_t*)prevblock >> 60 - (i * 4)) & 0x0f; } /* char *sptr = output; uint8_t* data = (uint8_t*)prevblock; //if data == 0x123456789abcdef how does it order? for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) { - uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed - uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4; - if (algoDigit >= 10) - sprintf(sptr, "%c", 'A' + (algoDigit - 10)); - else - sprintf(sptr, "%u", (uint32_t) algoDigit); - sptr++; + uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed + uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4; + if (algoDigit >= 10) + sprintf(sptr, "%c", 'A' + (algoDigit - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoDigit); + sptr++; } *sptr = '\0'; */ @@ -207,18 +207,18 @@ extern "C" void x16r_hash(void *output, const void *input) sph_whirlpool_context ctx_whirlpool; sph_sha512_context ctx_sha512; - void *in = (void*) input; + void *in = (void*)input; int size = 80; - uint32_t *in32 = (uint32_t*) input; -// getAlgoString(&in32[1], hashOrder); + uint32_t *in32 = (uint32_t*)input; + // getAlgoString(&in32[1], hashOrder); uint64_t prevblock = *(uint64_t*)&in32[1]; for (int i = 0; i < 16; i++) { -// const char elem = hashOrder[i]; -// const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; -// uint8_t algo = hashOrder[i]; + // const char elem = hashOrder[i]; + // const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + // uint8_t algo = hashOrder[i]; switch ((prevblock >> 60 - (i << 2)) & 0x0f) { case BLAKE: sph_blake512_init(&ctx_blake); @@ -297,14 +297,14 @@ extern "C" void x16r_hash(void *output, const void *input) break; case SHA512: sph_sha512_init(&ctx_sha512); - sph_sha512(&ctx_sha512,(const void*) in, size); + sph_sha512(&ctx_sha512, (const void*)in, size); sph_sha512_close(&ctx_sha512, (void*)output); break; } - in = (void*) output; + in = (void*)output; size = 64; } -// memcpy(output, hash, 32); + // memcpy(output, hash, 32); } void whirlpool_midstate(void *state, const void *input) @@ -340,11 +340,11 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; const int dev_id = device_map[thr_id]; -// int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
20 : 19; -// if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20; + // int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19; + // if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20; uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); -// if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + // if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (init[thr_id]){ throughput = min(throughput, max_nonce - first_nonce); if (throughput == max_nonce - first_nonce) @@ -362,11 +362,11 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); if (throughput2intensity(throughput) > 21) gpulog(LOG_INFO, thr_id, "SIMD throws error on malloc call, TBD if there is a fix"); - +/* quark_groestl512_cpu_init(thr_id, throughput); -// quark_blake512_cpu_init(thr_id, throughput); -// quark_bmw512_cpu_init(thr_id, throughput); -// quark_skein512_cpu_init(thr_id, throughput); + // quark_blake512_cpu_init(thr_id, throughput); + // quark_bmw512_cpu_init(thr_id, throughput); + // quark_skein512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); @@ -384,9 +384,32 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, x15_whirlpool_cpu_init(thr_id, throughput, 0); x16_whirlpool512_init(thr_id, throughput); x17_sha512_cpu_init(thr_id, throughput); +*/ + quark_groestl512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput)) + { + applog(LOG_WARNING, "SIMD was unable to initialize :( exiting..."); + exit(-1); + }// 64 + x16_echo512_cuda_init(thr_id, throughput); + x11_echo512_cuda_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x16_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x16_whirlpool512_init(thr_id, throughput); + x17_sha512_cpu_init(thr_id, throughput); + + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput), 0); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput + 0x10000000), 0); - cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; @@ -402,13 +425,16 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, //testing 0xb, 0xc //6FB7C831F4ED0A52 -// ((uint32_t*)ptarget)[7] = 0x5ac6acf2; + // ((uint32_t*)ptarget)[7] = 0x5ac6acf2; ((uint32_t*)ptarget)[7] = 0x003f; +// ((uint32_t*)ptarget)[7] = 0x123f; +// ((uint32_t*)pdata)[1] = 0xEFCDAB89; +// ((uint32_t*)pdata)[2] = 0x67452301; ((uint32_t*)pdata)[1] = 0xEFCDAB89; - ((uint32_t*)pdata)[2] = 0x67452301; - // *((uint64_t*)&pdata[1]) = 0xaaaaaaaaaaaaaaaa;//0x67452301EFCDAB89;//0x31C8B76F520AEDF4; -// ((uint32_t*)pdata)[1] = 0x99999999; //E4F361B3 -// ((uint32_t*)pdata)[2] = 0x99999999; //427B6D24 + ((uint32_t*)pdata)[2] = 0x67452301; // 8:64,C:64 bad + //*((uint64_t*)&pdata[1]) = 0xffffffffffffffff;//0x67452301EFCDAB89;//0x31C8B76F520AEDF4; + // ((uint32_t*)pdata)[1] = 
0x99999999; //E4F361B3 + // ((uint32_t*)pdata)[2] = 0x99999999; //427B6D24 /* BLAKE = 0, BMW,1 @@ -430,7 +456,7 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, } uint32_t _ALIGN(64) endiandata[20]; - for (int k=0; k < 19; k++) + for (int k = 0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); uint32_t ntime = swab32(pdata[17]); @@ -450,171 +476,172 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, uint8_t algo80 = (*(uint64_t*)&endiandata[1] >> 60) & 0x0f; switch (algo80) { + case BLAKE: + //! low impact, can do a lot to optimize quark_blake512 + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + //! low impact, painfully optimize quark_bmw512 + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + //! second most used algo historically + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + //! average use, optimization tbd + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + //! low impact + keccak512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + //! very low impact + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + //! moderate impact (more than shavite) + qubit_luffa512_cpu_setBlock_80_alexis((void*)endiandata); + break; + case CUBEHASH: + //! moderate impact (more than shavite) + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + //! has been optimized fairly well + x11_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + //! high impact optimization. -i > 21 causes error. + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + //! high impact needs more optimizations + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + //! ***highest impact*** + x16_hamsi512_setBlock_80((void*)endiandata); + break; + case FUGUE: + //! very high impact! + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + //! very low impact. + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + //! moderate impact (more than shavite by a bit) + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + //! second lowest impact. + x16_sha512_setBlock_80(endiandata); + break; + } + + int warn = 0; + // int rowdy = 16; + do { + // Hash with CUDA + /* + switch (algo80) { case BLAKE: - //! low impact, can do a lot to optimize quark_blake512 - quark_blake512_cpu_setBlock_80(thr_id, endiandata); - break; + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis + TRACE("blake80:"); + break; case BMW: - //! low impact, painfully optimize quark_bmw512 - quark_bmw512_cpu_setBlock_80(endiandata); - break; + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 0); // alexis x + TRACE("bmw80 :"); + break; case GROESTL: - //! second most used algo historically - groestl512_setBlock_80(thr_id, endiandata); - break; + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis + TRACE("grstl80:"); + break; case JH: - //! average use, optimization tbd - jh512_setBlock_80(thr_id, endiandata); - break; + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("jh51280:"); + break; case KECCAK: - //! low impact - keccak512_setBlock_80(thr_id, endiandata); - break; + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("kecck80:"); + break; case SKEIN: - //! 
very low impact - skein512_cpu_setBlock_80((void*)endiandata); - break; + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 1); + TRACE("skein80:"); + break; case LUFFA: - //! moderate impact (more than shavite) - qubit_luffa512_cpu_setBlock_80_alexis((void*)endiandata); - break; + qubit_luffa512_cpu_hash_80_alexis(thr_id, throughput, pdata[19], d_hash[thr_id]); + TRACE("luffa80:"); + break; case CUBEHASH: - //! moderate impact (more than shavite) - cubehash512_setBlock_80(thr_id, endiandata); - break; + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("cube 80:"); + break; case SHAVITE: - //! has been optimized fairly well - x11_shavite512_setBlock_80((void*)endiandata); - break; + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 0); + TRACE("shavite:"); + break; case SIMD: - //! high impact optimization. -i > 21 causes error. - x16_simd512_setBlock_80((void*)endiandata); - break; + x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("simd512:"); + break; case ECHO: - //! high impact needs more optimizations - x16_echo512_setBlock_80((void*)endiandata); - break; + x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + TRACE("echo :"); + break; case HAMSI: - //! ***highest impact*** - x16_hamsi512_setBlock_80((void*)endiandata); - break; + x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + TRACE("hamsi :"); + break; case FUGUE: - //! very high impact! - x16_fugue512_setBlock_80((void*)pdata); - break; + x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("fugue :"); + break; case SHABAL: - //! very low impact. - x16_shabal512_setBlock_80((void*)endiandata); - break; + x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("shabal :"); + break; case WHIRLPOOL: - //! moderate impact (more than shavite by a bit) - x16_whirlpool512_setBlock_80((void*)endiandata); - break; + x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("whirl :"); + break; case SHA512: - //! second lowest impact. 
- x16_sha512_setBlock_80(endiandata); - break; - } - - int warn = 0; -// int rowdy = 16; - do { - // Hash with CUDA -/* - switch (algo80) { - case BLAKE: - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis - TRACE("blake80:"); - break; - case BMW: - quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 0); // alexis x - TRACE("bmw80 :"); - break; - case GROESTL: - groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis - TRACE("grstl80:"); - break; - case JH: - jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("jh51280:"); - break; - case KECCAK: - keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("kecck80:"); - break; - case SKEIN: - skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 1); - TRACE("skein80:"); - break; - case LUFFA: - qubit_luffa512_cpu_hash_80_alexis(thr_id, throughput, pdata[19], d_hash[thr_id]); - TRACE("luffa80:"); - break; - case CUBEHASH: - cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("cube 80:"); - break; - case SHAVITE: - x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 0); - TRACE("shavite:"); - break; - case SIMD: - x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("simd512:"); - break; - case ECHO: - x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); - TRACE("echo :"); - break; - case HAMSI: - x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); - TRACE("hamsi :"); - break; - case FUGUE: - x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("fugue :"); - break; - case SHABAL: - x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("shabal :"); - break; - case WHIRLPOOL: - x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("whirl :"); - break; - case SHA512: - x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("sha512 :"); - break; + x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("sha512 :"); + break; } -*/ - if (work_restart[thr_id].restart) return -127; + */ + + if (work_restart[thr_id].restart) return -127; pAlgo80[(*(uint64_t*)&endiandata[1] >> 60 - (0 * 4)) & 0x0f](thr_id, throughput, pdata[19], d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (1 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (2 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (3 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (4 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (5 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (6 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (7 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (8 * 4)) & 
0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (9 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (10 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (11 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (12 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (13 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (14 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (15 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (1 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (2 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (3 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (4 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (5 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (6 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (7 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (8 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (9 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (10 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (11 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (12 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (13 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (14 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (15 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); x13_echo512_cpu_init(thr_id, throughput); -// if (work_restart[thr_id].restart) return -127; + // if (work_restart[thr_id].restart) return -127; -// run_x16r_rounds(&endiandata[1], thr_id, throughput, pdata[19], d_hash[thr_id]); + // run_x16r_rounds(&endiandata[1], thr_id, throughput, pdata[19], d_hash[thr_id]); *hashes_done = pdata[19] - first_nonce + throughput; - + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (work_restart[thr_id].restart) return 
-127; +// if (work_restart[thr_id].restart) return -127; #ifdef _DEBUG uint32_t _ALIGN(64) dhash[8]; be32enc(&endiandata[19], pdata[19]); @@ -639,7 +666,8 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, bn_set_target_ratio(work, vhash, 1); work->valid_nonces++; pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; - } else { + } + else { pdata[19] = work->nonces[0] + 1; // cursor } #if GPU_HASH_CHECK_LOG == 1 @@ -659,9 +687,9 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, } applog(LOG_INFO, "K64: %s", oks64); applog(LOG_INFO, "K80: %s", oks80); - applog(LOG_ERR, "F80: %s", fails); + applog(LOG_ERR, "F80: %s", fails); #endif - if (work_restart[thr_id].restart) return -127; +// if (work_restart[thr_id].restart) return -127; return work->valid_nonces; } else if (vhash[7] > Htarg) { @@ -672,12 +700,13 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, warn++; pdata[19] = work->nonces[0] + 1; continue; - } else { + } + else { if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %X%X", work->nonces[0], algo_strings[algo80], endiandata[2], endiandata[1]); -// work->nonces[0], algo_strings[algo80], hashOrder); + // work->nonces[0], algo_strings[algo80], hashOrder); warn = 0; -// work->data[19] = max_nonce; + // work->data[19] = max_nonce; if (work_restart[thr_id].restart) return -127; return -128; } @@ -720,7 +749,7 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce; - if (work_restart[thr_id].restart) return -127; +// if (work_restart[thr_id].restart) return -127; return 0; } @@ -733,7 +762,7 @@ extern "C" void free_x16r(int thr_id) cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); - cudaFree((void *)d_ark); + cudaFree((void *)&d_ark); quark_blake512_cpu_free(thr_id); quark_groestl512_cpu_free(thr_id); x11_simd512_cpu_free(thr_id); @@ -751,17 +780,12 @@ volatile int h_ark = 0; extern "C" int *_d_ark = NULL; static int q = 0; -static int* skin = NULL; + __host__ void x11_echo512_cuda_init(int thr_id, uint32_t threads) { if (q++) return; cudaMalloc(&d_ark, (size_t)64); - skin = d_ark; - if ((uint64_t)d_ark & 15) - { - d_ark = (int*)((uint64_t)d_ark + ~((uint64_t)d_ark & 15)); - } cudaMemcpyToSymbol(d_ark, (int*)&h_ark, sizeof(int), 0, cudaMemcpyHostToDevice); } __host__ extern void x11_echo512_cpu_init(int thr_id, uint32_t threads) @@ -772,6 +796,7 @@ __host__ extern void x11_echo512_cpu_init(int thr_id, uint32_t threads) } __host__ extern void x13_echo512_cpu_init(int thr_id, uint32_t threads) { - h_ark ^= 1 << thr_id; +// h_ark ^= (1 << thr_id); + h_ark &= ~(1 << thr_id); cudaMemcpyToSymbol(d_ark, (int*)&h_ark, sizeof(int), 0, cudaMemcpyHostToDevice); } diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu index 08e3335025..ce13188223 100644 --- a/x17/cuda_x17_sha512.cu +++ b/x17/cuda_x17_sha512.cu @@ -92,9 +92,8 @@ __global__ /*__launch_bounds__(256, 4)*/ void x17_sha512_gpu_hash_64(int *thr_id, const uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -162,7 +161,7 @@ void x17_sha512_cpu_init(int thr_id, uint32_t threads) { cudaMemcpyToSymbol(c_WB, WB, 80 * 
sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } - + __host__ void x17_sha512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) { @@ -179,9 +178,8 @@ static uint64_t c_PaddedMessage80[10]; __global__ /*__launch_bounds__(256, 4)*/ -void x16_sha512_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -243,7 +241,7 @@ void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - x16_sha512_gpu_hash_80 << > > (thr_id, threads, startNounce, (uint64_t*)d_hash); + x16_sha512_gpu_hash_80 << > > (threads, startNounce, (uint64_t*)d_hash); } __host__
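The recurring "int *thr_id" parameter in the *_alexis 64-byte kernels is not a plain pointer: the host packs the mining-thread index into the low 4 bits of the 16-byte-aligned d_ark allocation, and each kernel strips the tag, reads the shared abort word, and returns early if its bit is set. A minimal sketch of that scheme, assuming only what the diff shows (the names d_flags, make_tagged_id and some_hash_64 are illustrative):

#include <cuda_runtime.h>
#include <stdint.h>

static int *d_flags = NULL; // device-side abort word, one bit per mining thread

// host: allocate the flag word once, then hand kernels a tagged copy of the pointer
static int *make_tagged_id(int thr_id)
{
	if (d_flags == NULL)
		cudaMalloc(&d_flags, 64); // cudaMalloc alignment leaves the low 4 bits free
	return (int*)(((uintptr_t)d_flags) | ((uintptr_t)(thr_id & 15)));
}

__global__ void some_hash_64(int *thr_id, uint32_t threads, uint32_t *g_hash)
{
	// device: strip the tag to recover the flag word and this thread's bit index
	const int *flags = (const int*)(((uintptr_t)thr_id) & ~(uintptr_t)15);
	const int  bit   = (int)(((uintptr_t)thr_id) & 15);
	if (*flags & (1 << bit))
		return; // abort requested for this mining thread: skip the whole launch
	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads)
		g_hash[thread] ^= 0; // placeholder for the real 64-byte hash round
}

The host side recovers the index the same way where it still needs it (e.g. MyStreamSynchronize(NULL, order, ((uintptr_t)thr_id) & 15)), and the x13_echo512_cpu_init change clears the thread's bit with h_ark &= ~(1 << thr_id) instead of toggling it, which makes the reset idempotent.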
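The split Compression1 / Compression2 / Final path in cuda_x11_simd512.cu keeps the 32-word SIMD state in global memory between stages, and the only subtle part is the layout: each block owns a 32 * blockDim.x window of g_state, and word i of a thread's state sits at threadIdx.x + blockDim.x * i so adjacent threads touch adjacent addresses. A short sketch of just that spill/reload, with illustrative names (spill_state and reload_state are not in the patch):

__device__ __forceinline__ void spill_state(const uint32_t A[32], uint32_t *g_state)
{
	// each block owns 32 * blockDim.x words; word i of this thread's state is
	// interleaved so that consecutive threads write consecutive addresses
	uint32_t *state = &g_state[blockIdx.x * (blockDim.x * 32)];
	#pragma unroll 32
	for (int i = 0; i < 32; i++)
		state[threadIdx.x + blockDim.x * i] = A[i];
}

__device__ __forceinline__ void reload_state(uint32_t A[32], const uint32_t *g_state)
{
	const uint32_t *state = &g_state[blockIdx.x * (blockDim.x * 32)];
	#pragma unroll 32
	for (int i = 0; i < 32; i++)
		A[i] = state[threadIdx.x + blockDim.x * i];
}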