From a37c01bd7360d1a3b502eb754156f2feaf8801e3 Mon Sep 17 00:00:00 2001 From: a1i3nj03 <36484251+a1i3nj03@users.noreply.github.com> Date: Sat, 21 Apr 2018 05:40:37 -0400 Subject: [PATCH] Improvemnts that work?! Any issues should be reported, or accepted. --- JHA/cuda_jha_keccak512.cu | 9 +- algos.h | 2 - api.cpp | 5 +- ccminer.cpp | 34 +- ccminer.vcxproj | 2 - ccminer.vcxproj.filters | 2 - cuda_checkhash.cu | 63 +- cuda_helper.h | 2 - cuda_helper_alexis.h | 1 + miner.h | 3 - quark/cuda_bmw512.cu | 34 +- quark/cuda_bmw512_sm3.cuh | 6 +- quark/cuda_jh512.cu | 12 +- quark/cuda_quark.h | 4 +- quark/cuda_quark_blake512.cu | 18 +- quark/cuda_quark_blake512_sp.cuh | 17 +- quark/cuda_quark_groestl512.cu | 19 +- quark/cuda_quark_keccak512.cu | 11 +- quark/cuda_skein512.cu | 13 +- quark/groestl_transf_quad_a1_min3r.cuh | 1 - qubit/qubit_luffa512_alexis.cu | 16 +- res/ccminer.aps | Bin 101972 -> 101944 bytes util.cpp | 3 - x11/cuda_x11_aes_alexis.cuh | 7 +- x11/cuda_x11_cubehash512.cu | 291 +--- x11/cuda_x11_echo.cu | 6 +- x11/cuda_x11_echo_aes.cuh | 1088 ++++++------- x11/cuda_x11_echo_alexis.cu | 19 +- x11/cuda_x11_luffa512.cu | 6 +- x11/cuda_x11_luffa512_Cubehash.cu | 8 +- x11/cuda_x11_shavite512.cu | 9 +- x11/cuda_x11_shavite512_alexis.cu | 8 +- x11/cuda_x11_simd512.cu | 784 +++++++++- x11/cuda_x11_simd512_func.cuh | 1937 ++++++++++++++++-------- x11/cuda_x11_simd512_sm2.cuh | 10 +- x13/cuda_x13_fugue512_alexis.cu | 12 +- x13/cuda_x13_hamsi512.cu | 13 +- x13/cuda_x13_hamsi512_alexis.cu | 6 +- x15/cuda_x14_shabal512.cu | 6 +- x15/cuda_x14_shabal512_alexis.cu | 4 +- x15/cuda_x15_whirlpool.cu | 9 +- x15/cuda_x15_whirlpool_sm3.cu | 15 +- x16/x16s.cu | 221 +-- x16r/cuda_x16_echo512.cu | 7 +- x16r/cuda_x16_fugue512.cu | 7 +- x16r/cuda_x16_shabal512.cu | 7 +- x16r/cuda_x16_simd512_80.cu | 7 +- x16r/x16r.cu | 427 +++--- x17/cuda_x17_sha512.cu | 10 +- 49 files changed, 3021 insertions(+), 2180 deletions(-) diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index 70f3d0b860..a47fec181d 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -1,7 +1,7 @@ #include #include -#include "cuda_helper_alexis.h" +#include "cuda_helper.h" #include "miner.h" // ZR5 @@ -478,11 +478,8 @@ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) } __global__ -void jackpot_keccak512_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -532,7 +529,7 @@ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNoun size_t shared_size = 0; - jackpot_keccak512_gpu_hash << > >(thr_id, threads, startNounce, (uint64_t*)d_hash); + jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/algos.h b/algos.h index 800f602857..014b4cbf1b 100644 --- a/algos.h +++ b/algos.h @@ -60,7 +60,6 @@ enum sha_algos { ALGO_X14, ALGO_X15, ALGO_X16R, - ALGO_X16S, ALGO_X17, ALGO_VANILLA, ALGO_VELTOR, @@ -131,7 +130,6 @@ static const char *algo_names[] = { "x14", "x15", "x16r", - "x16s", "x17", "vanilla", "veltor", diff --git a/api.cpp b/api.cpp index dc57a35534..cd11fe93a2 100644 --- a/api.cpp +++ b/api.cpp @@ -1252,7 +1252,7 @@ static void api() char *wskey = NULL; n = recv(c, &buf[0], SOCK_REC_BUFSZ, 0); - fail = SOCKETFAIL(n) || 
n < 0; + fail = SOCKETFAIL(n); if (fail) buf[0] = '\0'; else if (n > 0 && buf[n-1] == '\n') { @@ -1261,8 +1261,7 @@ static void api() if (n > 0 && buf[n-1] == '\r') buf[n-1] = '\0'; } - else - buf[n] = '\0'; + buf[n] = '\0'; //if (opt_debug && opt_protocol && n > 0) // applog(LOG_DEBUG, "API: recv command: (%d) '%s'+char(%x)", n, buf, buf[n-1]); diff --git a/ccminer.cpp b/ccminer.cpp index 0bb8fa4e01..2dd6727bb4 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -108,7 +108,7 @@ bool use_colors = true; int use_pok = 0; static bool opt_background = false; bool opt_quiet = false; -int opt_maxlograte = 3; +int opt_maxlograte = 5;//3; static int opt_retries = -1; static int opt_fail_pause = 30; int opt_time_limit = -1; @@ -147,7 +147,6 @@ int32_t device_led[MAX_GPUS] = { -1, -1 }; int opt_led_mode = 0; int opt_cudaschedule = -1; static bool opt_keep_clocks = false; -extern "C" volatile int *volatile d_ark = NULL; // un-linked to cmdline scrypt options (useless) int device_batchsize[MAX_GPUS] = { 0 }; @@ -302,7 +301,6 @@ Options:\n\ x14 X14\n\ x15 X15\n\ x16r X16R (Raven)\n\ - x16s X16S\n\ x17 X17\n\ wildkeccak Boolberry\n\ zr5 ZR5 (ZiftrCoin)\n\ @@ -685,25 +683,24 @@ static void calc_network_diff(struct work *work) int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 uint64_t diffone = 0x0000FFFF00000000ull; - + /* double d = (double)0x0000ffff / (double)bits; for (int m=shift; m < 29; m++) d *= 256.0; for (int m=29; m < shift; m++) d /= 256.0; + */ - /* uint32_t d = 0x0000ffff / bits; for (int m = shift; m < 29; m++) d <<= 8; for (int m = 29; m < shift; m++) d >>= 8; - */ + // if (opt_algo == ALGO_DECRED && shift == 28) d *= 256.0; if (opt_debug_diff) -// applog(LOG_DEBUG, "net diff: %u -> shift %u, bits %08x", d, shift, bits); - applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); + applog(LOG_DEBUG, "net diff: %u -> shift %u, bits %08x", d, shift, bits); +// applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); - net_diff = d; -// net_diff = (double)d; + net_diff = (double)d; } /* decode data from getwork (wallets and longpoll pools) */ @@ -1758,7 +1755,6 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) // case ALGO_TIMETRAVEL: // case ALGO_BITCORE: case ALGO_X16R: -// case ALGO_X16S: work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));//(256.0 * opt_difficulty)); break; #if 0 @@ -1785,12 +1781,13 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) sctx->job.clean = 1; //!!! 
return true; } + __host__ extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); void restart_threads(void) { if (opt_debug && !opt_quiet) - applog(LOG_DEBUG,"%s", __FUNCTION__); + applog(LOG_DEBUG, "%s", __FUNCTION__); // restart mining thread IRL for (int i = 0; i < opt_n_threads && work_restart; i++) { @@ -2511,9 +2508,6 @@ static void *miner_thread(void *userdata) rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done); break; #endif - case ALGO_X16S: -// rc = scanhash_x16s(thr_id, &work, max_nonce, &hashes_done); - break; case ALGO_X16R: // try{ rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done); @@ -2656,8 +2650,8 @@ static void *miner_thread(void *userdata) work.submit_nonce_id = 0; nonceptr[0] = work.nonces[0]; - if (work_restart[thr_id].restart) - continue; +// if (work_restart[thr_id].restart) +// continue; if (!submit_work(mythr, &work)) break; nonceptr[0] = curnonce; @@ -2682,8 +2676,8 @@ static void *miner_thread(void *userdata) work.data[22] = 0; } #endif - if (work_restart[thr_id].restart) - continue; +// if (work_restart[thr_id].restart) +// continue; if (!submit_work(mythr, &work)) break; nonceptr[0] = curnonce; @@ -3960,7 +3954,7 @@ int main(int argc, char *argv[]) " `!!!!!!!!!!!!!!'\n" " `\\!!!!!!!!!~\n" "(Credit to http://www.asciiworld.com/-Aliens,128-.html )\n"); - if (!opt_quiet) { + if (!opt_quiet) { const char* arch = is_x64() ? "64-bits" : "32-bits"; #ifdef _MSC_VER printf(" Built with VC++ %d and nVidia CUDA SDK %d.%d %s\n\n", msver(), diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 954816bff6..942edcaa5e 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -269,7 +269,6 @@ - @@ -464,7 +463,6 @@ - diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index d0e8a2c782..5dbb0b0a06 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -611,7 +611,6 @@ - @@ -1017,7 +1016,6 @@ Source Files\CUDA\x13 - diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index 0277732241..2d7fdddf76 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -192,71 +192,10 @@ void cuda_checkhash_32(uint32_t threads, uint32_t startNounce, uint32_t *hash, u } } -cudaError_t MyStreamSynchronize(cudaStream_t stream, uint32_t situation, int thr_id) -{ - cudaError_t result = cudaSuccess; - if (abort_flag) - return result; - if (situation >= 0) - { - if (cudaStreamQuery(stream) == cudaErrorNotReady) - { - while ((work_restart[thr_id].restart == 0) && cudaStreamQuery(stream) == cudaErrorNotReady) - { - usleep((useconds_t)(1000)); - } - if (work_restart[thr_id].restart) - return cudaErrorInvalidDevice; - result = cudaStreamSynchronize(stream); - } - } - else - result = cudaStreamSynchronize(stream); - return result; -} -/* -uint32_t glhf; -__host__ -void chk(int thr_id) -{ - int size = 128; - int* h_val = (int*)malloc(sizeof(int)*size); - bool * h_flag = new bool; - *h_flag = true; - - bool* d_flag; - cudaMalloc(&d_flag, sizeof(bool)); - cudaMemcpy(d_flag, h_flag, 1, cudaMemcpyHostToDevice); - - int* d_val; - cudaMalloc(&d_val, sizeof(int)*size); - - for (int i = 0; i> >(d_flag, d_val, size); - - //--------------sleep for a while -------------------------- - - *h_flag = false; - cudaMemcpy(d_flag, h_flag, 1, cudaMemcpyHostToDevice); - - glhf = 0 - cudaMemcpy(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); - // -} -*/ __host__ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash) { cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); -// if (MyStreamSynchronize(NULL, (uint32_t)1, thr_id) == 
cudaErrorInvalidDevice) -// return 0; const uint32_t threadsperblock = 512; @@ -272,7 +211,7 @@ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uin } cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); -// cudaThreadSynchronize(); + cudaThreadSynchronize(); cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); return h_resNonces[thr_id][0]; diff --git a/cuda_helper.h b/cuda_helper.h index cc498af635..a3885d6f09 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -200,8 +200,6 @@ do { \ } \ } while (0) -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, uint32_t situation, int thr_id); - /*********************************************************************/ #if !defined(__CUDA_ARCH__) || defined(_WIN64) #define USE_XOR_ASM_OPTS 0 diff --git a/cuda_helper_alexis.h b/cuda_helper_alexis.h index affa560c7b..2a6e50cbc8 100644 --- a/cuda_helper_alexis.h +++ b/cuda_helper_alexis.h @@ -536,6 +536,7 @@ static __device__ __forceinline__ uint2 operator* (const uint2 a,const uint2 b){ : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; } + // uint2 ROR/ROL methods __device__ __forceinline__ uint2 ROR2(const uint2 a, const uint32_t offset){ diff --git a/miner.h b/miner.h index ab51d5c0e6..a883a3380c 100644 --- a/miner.h +++ b/miner.h @@ -331,7 +331,6 @@ extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsig extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -397,7 +396,6 @@ extern void free_x13(int thr_id); extern void free_x14(int thr_id); extern void free_x15(int thr_id); extern void free_x16r(int thr_id); -extern void free_x16s(int thr_id); extern void free_x17(int thr_id); extern void free_zr5(int thr_id); //extern void free_sha256d(int thr_id); @@ -944,7 +942,6 @@ void x13hash(void *output, const void *input); void x14hash(void *output, const void *input); void x15hash(void *output, const void *input); void x16r_hash(void *output, const void *input); -void x16s_hash(void *output, const void *input); void x17hash(void *output, const void *input); void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize); void zr5hash(void *output, const void *input); diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index e8faac4a91..9622d60343 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -8,7 +8,7 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) -//#include "cuda_bmw512_sm3.cuh" +#include "cuda_bmw512_sm3.cuh" #ifdef __INTELLISENSE__ /* just for vstudio code colors */ @@ -324,10 +324,9 @@ __launch_bounds__(64, 8) #endif void quark_bmw512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { //uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); @@ -393,11 +392,8 @@ void quark_bmw512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) } __global__ __launch_bounds__(256, 2) -void quark_bmw512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -467,16 +463,16 @@ void quark_bmw512_cpu_setBlock_80(void *pdata) __host__ void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) // , int order) -{ +{ const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); -// int dev_id = device_map[thr_id]; + int dev_id = device_map[thr_id]; -// if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - quark_bmw512_gpu_hash_80 << > >(thr_id, threads, startNounce, (uint64_t*)d_hash); -// else -// quark_bmw512_gpu_hash_80_30<<>>(threads, startNounce, (uint64_t*)d_hash); + if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) + quark_bmw512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_hash); + else + quark_bmw512_gpu_hash_80_30<<>>(threads, startNounce, (uint64_t*)d_hash); } __host__ @@ -484,7 +480,7 @@ void quark_bmw512_cpu_init(int thr_id, uint32_t threads) { cuda_get_arch(thr_id); } - + __host__ void quark_bmw512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) { @@ -492,9 +488,9 @@ void quark_bmw512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); -// int dev_id = device_map[thr_id]; -// if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - quark_bmw512_gpu_hash_64 << > >(thr_id, threads, (uint64_t*)d_hash); -// else -// quark_bmw512_gpu_hash_64_30<<>>(threads, (uint64_t*)d_hash); + int dev_id = device_map[((uintptr_t)thr_id) & 15]; + if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) + quark_bmw512_gpu_hash_64<<>>(thr_id, threads, (uint64_t*)d_hash); + else + quark_bmw512_gpu_hash_64_30<<>>(thr_id, threads, (uint64_t*)d_hash); } diff --git a/quark/cuda_bmw512_sm3.cuh b/quark/cuda_bmw512_sm3.cuh index faa314e4f7..057e04031b 100644 --- a/quark/cuda_bmw512_sm3.cuh +++ b/quark/cuda_bmw512_sm3.cuh @@ -157,8 +157,10 @@ void Compression512_30(uint64_t *msg, uint64_t *hash) } __global__ -void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint64_t *g_hash) +void quark_bmw512_gpu_hash_64_30(int *thr_id, uint32_t threads, uint64_t *g_hash) { + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -265,7 +267,7 @@ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_ } #else /* stripped stubs for other archs */ -__global__ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint64_t *g_hash) {} +__global__ void quark_bmw512_gpu_hash_64_30(int *thr_id, uint32_t threads, uint64_t *g_hash) {} __global__ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} #endif diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu index 8af03d5201..a46e2e0633 100644 --- a/quark/cuda_jh512.cu +++ b/quark/cuda_jh512.cu @@ -279,10 +279,9 @@ __global__ //__launch_bounds__(256,2) void quark_jh512_gpu_hash_64(int *thr_id, 
const uint32_t threads, uint32_t* g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { //const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); @@ -414,7 +413,7 @@ void jh512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint3 AS_UINT4(&Hash[12]) = AS_UINT4(&x[7][0]); } } - + __host__ void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) { @@ -433,11 +432,8 @@ __constant__ static uint32_t c_JHState[32]; __constant__ static uint32_t c_Message[4]; __global__ -void jh512_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash) +void jh512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -489,7 +485,7 @@ void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - jh512_gpu_hash_80 << > > (thr_id, threads, startNounce, d_hash); + jh512_gpu_hash_80 <<>> (threads, startNounce, d_hash); } extern "C" { diff --git a/quark/cuda_quark.h b/quark/cuda_quark.h index 2a70e949a7..14f473b6aa 100644 --- a/quark/cuda_quark.h +++ b/quark/cuda_quark.h @@ -28,8 +28,8 @@ extern void quark_jh512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_h extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); extern void quark_compactTest_cpu_free(int thr_id); extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order); + uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order); extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, int order); + uint32_t *d_nonces1, uint32_t *nrm1, int order); extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index b68aa21564..5532a85d61 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -118,12 +118,11 @@ void quark_blake512_compress(uint64_t *h, const uint64_t *block, const uint8_t ( __global__ __launch_bounds__(256, 4) void quark_blake512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; #if !defined(SP_KERNEL) || __CUDA_ARCH__ < 500 uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - #if USE_SHUFFLE const uint32_t warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke @@ -188,12 +187,9 @@ void quark_blake512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) } __global__ __launch_bounds__(256,4) -void 
quark_blake512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHash) +void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) { //#if !defined(SP_KERNEL) || __CUDA_ARCH__ < 500 -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -242,9 +238,9 @@ void quark_blake512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun __host__ void quark_blake512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_outputHash) -{ +{ #ifdef SP_KERNEL - int dev_id = device_map[((uint64_t)thr_id)&15]; + int dev_id = device_map[((uintptr_t)thr_id) & 15]; if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) quark_blake512_cpu_hash_64_sp(thr_id, threads, d_outputHash); else @@ -260,11 +256,11 @@ void quark_blake512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_outpu __host__ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) -{ +{ #ifdef SP_KERNEL int dev_id = device_map[thr_id]; if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) - quark_blake512_cpu_hash_80_sp(thr_id, threads, startNounce, d_outputHash); + quark_blake512_cpu_hash_80_sp(threads, startNounce, d_outputHash); else #endif { @@ -272,7 +268,7 @@ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_blake512_gpu_hash_80 << > >(thr_id, threads, startNounce, d_outputHash); + quark_blake512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); } } diff --git a/quark/cuda_quark_blake512_sp.cuh b/quark/cuda_quark_blake512_sp.cuh index f85ef1d319..a17178362f 100644 --- a/quark/cuda_quark_blake512_sp.cuh +++ b/quark/cuda_quark_blake512_sp.cuh @@ -5,7 +5,6 @@ #include "miner.h" // Should stay outside the ifdef on WIN64 (wtf) -#include "cuda_helper_alexis.h" #include "cuda_vector_uint2x4.h" __constant__ static uint2 c_PaddedM[16]; __constant__ static uint2x4 c_Hostprecalc[4]; @@ -93,11 +92,10 @@ __launch_bounds__(256, 1) #endif void quark_blake512_gpu_hash_64_sp(int *thr_id, uint32_t threads, uint2* g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { // const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); @@ -338,11 +336,8 @@ void quark_blake512_gpu_hash_64_sp(int *thr_id, uint32_t threads, uint2* g_hash) __global__ __launch_bounds__(128, 8) -void quark_blake512_gpu_hash_80_sp(int thr_id, uint32_t threads, uint32_t startNounce, uint2 *outputHash) +void quark_blake512_gpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint2 *outputHash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -654,8 +649,8 @@ __host__ void quark_blake512_cpu_setBlock_80_sp(int thr_id, uint64_t *pdata) #else // __CUDA_ARCH__ < 500 __host__ void quark_blake512_cpu_setBlock_80_sp(int thr_id, uint64_t *pdata) {} -__global__ void quark_blake512_gpu_hash_64_sp(int*, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) {} -__global__ void quark_blake512_gpu_hash_80_sp(int*, uint32_t startNounce, uint2 *outputHash) {} +__global__ void quark_blake512_gpu_hash_64_sp(uint32_t, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) {} +__global__ void quark_blake512_gpu_hash_80_sp(uint32_t, uint32_t startNounce, uint2 *outputHash) {} #endif __host__ @@ -668,10 +663,10 @@ void quark_blake512_cpu_hash_64_sp(int *thr_id, uint32_t threads, uint32_t *d_ou } __host__ -void quark_blake512_cpu_hash_80_sp(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) +void quark_blake512_cpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { const uint32_t threadsperblock = 64; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - quark_blake512_gpu_hash_80_sp << > >(thr_id, threads, startNounce, (uint2*)d_outputHash); + quark_blake512_gpu_hash_80_sp <<>>(threads, startNounce, (uint2*)d_outputHash); } diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index 5b5d4ddaba..61f6029953 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -38,10 +38,8 @@ __global__ __launch_bounds__(TPB, THF) //const uint32_t startNounce, void quark_groestl512_gpu_hash_64_quad_a1_min3r(int *thr_id, const uint32_t threads, uint4* g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; - - #if __CUDA_ARCH__ >= 300 // BEWARE : 4-WAY CODE (one hash need 4 threads) const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); // >> 2; // done on cpu @@ -189,7 +187,7 @@ void quark_groestl512_gpu_hash_64_quad_a1_min3r(int *thr_id, const uint32_t thre } __global__ __launch_bounds__(TPB, THF) -void quark_groestl512_gpu_hash_64_quad(const uint32_t threads, const uint32_t startNounce, uint32_t * g_hash, uint32_t * __restrict g_nonceVector) +void quark_groestl512_gpu_hash_64_quad(int *thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t * g_hash, uint32_t * __restrict g_nonceVector) { //! 
fixme please #if 0 // __CUDA_ARCH__ >= 300 @@ -263,7 +261,7 @@ void quark_groestl512_cpu_free(int thr_id) // if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) // quark_groestl512_sm20_free(thr_id); } - + __host__ void quark_groestl512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) { @@ -302,15 +300,10 @@ void groestl512_setBlock_80(int thr_id, uint32_t *endiandata) } __global__ __launch_bounds__(TPB, THF) -void groestl512_gpu_hash_80_quad_a1_min3r(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint4* g_hash) +void groestl512_gpu_hash_80_quad_a1_min3r(const uint32_t threads, const uint32_t startNounce, uint4* g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - #if __CUDA_ARCH__ >= 300 // BEWARE : 4-WAY CODE (one hash need 4 threads) - - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); // >> 2; // done on cpu if (thread < threads) @@ -437,7 +430,7 @@ void groestl512_gpu_hash_80_quad(const uint32_t threads, const uint32_t startNou __host__ void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) -{ +{ // int dev_id = device_map[thr_id]; // if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) { @@ -447,7 +440,7 @@ void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uin dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); //! setup only for x16r(s?) - groestl512_gpu_hash_80_quad_a1_min3r << > > ( thr_id, threads << 2, startNounce, (uint4*)d_hash); + groestl512_gpu_hash_80_quad_a1_min3r <<>> (threads << 2, startNounce, (uint4*)d_hash); // groestl512_gpu_hash_80_quad<< > > (threads, startNounce, d_hash); /* diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu index 78ca86d154..993e95660f 100644 --- a/quark/cuda_quark_keccak512.cu +++ b/quark/cuda_quark_keccak512.cu @@ -98,11 +98,9 @@ static void keccak_block(uint2 *s) __global__ void quark_keccak512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { //uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); @@ -201,10 +199,9 @@ static void keccak_block_v30(uint64_t *s, const uint32_t *in) __global__ void quark_keccak512_gpu_hash_64_v30(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { //uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); @@ -239,7 +236,7 @@ void quark_keccak512_gpu_hash_64_v30(int *thr_id, uint32_t threads, uint64_t *g_ outpHash[i] = hash[i]; } } - + __host__ void quark_keccak512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) { @@ -248,7 +245,7 @@ void quark_keccak512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - int dev_id = device_map[((uint64_t)thr_id) & 15]; + int dev_id = device_map[((uintptr_t)thr_id) & 15]; if (device_sm[dev_id] >= 320) quark_keccak512_gpu_hash_64 << > >(thr_id, threads, (uint64_t*)d_hash); diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index 0d30cd3db7..90e8f47d24 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -468,7 +468,7 @@ __launch_bounds__(TPB50, 5) #endif void quark_skein512_gpu_hash_64(int *thr_id, const uint32_t threads, uint64_t* __restrict__ g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -758,13 +758,13 @@ void quark_skein512_gpu_hash_64(int *thr_id, const uint32_t threads, uint64_t* _ #undef h7 } } - + __host__ //void quark_skein512_cpu_hash_64(int thr_id,uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_hash) void quark_skein512_cpu_hash_64(int *thr_id, const uint32_t threads, uint32_t *d_hash) { uint32_t tpb = TPB52; - int dev_id = device_map[((uint64_t)thr_id) & 15]; + int dev_id = device_map[((uintptr_t)thr_id) & 15]; if (device_sm[dev_id] <= 500) tpb = TPB50; const dim3 grid((threads + tpb-1)/tpb); @@ -782,11 +782,8 @@ __launch_bounds__(TPB52, 3) #else __launch_bounds__(TPB50, 5) #endif -void skein512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *output64) +void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *output64) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -957,7 +954,7 @@ void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, ui const dim3 block(tpb); // hash function is cut in 2 parts to reduce kernel size - skein512_gpu_hash_80 << < grid, block >> > (thr_id, threads, startNounce, (uint64_t*)d_hash); + skein512_gpu_hash_80 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash); } __host__ diff --git a/quark/groestl_transf_quad_a1_min3r.cuh b/quark/groestl_transf_quad_a1_min3r.cuh index 6ee9e735e1..754bdd069d 100644 --- a/quark/groestl_transf_quad_a1_min3r.cuh +++ b/quark/groestl_transf_quad_a1_min3r.cuh @@ -100,7 +100,6 @@ other[i].__X = (__byte_perm(other[i].__X, 0, 0x1032) & -(threadIdx.x & 1)) | (in input[i].__X = __shfl((int)input[i].__X, n ^ (3 & -(n < 1 || n > 2)), 4);\ input[i].__X = __shfl((int)input[i].__X, n ^ (3 & -(n >= 1 && n <= 2)), 4);\ -input[i].__X = __shfl((int)input[i].__X, n ^ (3 & ((n >= 1 && n <= 2) | ((n >= 1 && n <= 2)<<1), 4);\ */ //input[i].__X = (__byte_perm(input[i].__X, 0, 0x1032) & (-(threadIdx.x & 1) | (-(threadIdx.x & 1) & input[i].__X)); //other[i].__X = (__byte_perm(other[i].__X, 0, 0x1032) & (-(threadIdx.x & 1) | (-(threadIdx.x & 1) & input[i].__X)); diff --git a/qubit/qubit_luffa512_alexis.cu b/qubit/qubit_luffa512_alexis.cu index 0cc6ed6f90..bccd10e4c1 100644 --- a/qubit/qubit_luffa512_alexis.cu +++ 
b/qubit/qubit_luffa512_alexis.cu @@ -3,7 +3,7 @@ */ #include -#include "cuda_helper_alexis.h" +#include "cuda_helper.h" #include "cuda_vectors_alexis.h" static unsigned char PaddedMessage[128]; @@ -621,11 +621,8 @@ static void rnd512_nullhash(uint32_t *const __restrict__ state){ __global__ __launch_bounds__(256, 4) -void qubit_luffa512_gpu_hash_80_alexis(int thr_id, const uint32_t threads,const uint32_t startNounce, uint32_t *outputHash) +void qubit_luffa512_gpu_hash_80_alexis(const uint32_t threads,const uint32_t startNounce, uint32_t *outputHash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -678,7 +675,7 @@ void qubit_luffa512_cpu_hash_80_alexis(int thr_id, uint32_t threads, uint32_t st dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - qubit_luffa512_gpu_hash_80_alexis << > > (thr_id, threads, startNounce, d_outputHash); + qubit_luffa512_gpu_hash_80_alexis<<>> (threads, startNounce, d_outputHash); } //#if __CUDA_ARCH__ == 500 @@ -689,12 +686,11 @@ __global__ __launch_bounds__(384,2) void x11_luffa512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *g_hash) { - - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t statebuffer[8]; - + if (thread < threads) { uint32_t statechainv[40] = { @@ -833,7 +829,7 @@ void qubit_luffa512_cpu_setBlock_80_alexis(void *pdata) CUDA_SAFE_CALL(cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); qubit_cpu_precalc(); } - + __host__ void x11_luffa512_cpu_hash_64_alexis(int *thr_id, uint32_t threads,uint32_t *d_hash) { diff --git a/res/ccminer.aps b/res/ccminer.aps index 61a3609f2e98a6037de86f6d9b6af3cc158475cb..fb2f903b74b7bc5582392b2f9b6189b4682d9c26 100644 GIT binary patch delta 258 zcmcaIhi%6kwh0Q13KJETfI*MJ5QL}KcQGDiW?1#ojisF!N*OX4au`w=iWy=U5*Z8`a)JCj24jYz>A78u zM;UibH|=Ju!CoNsV7eiQS}{EkL={Xg1W`YxPXtjZ p(>H=Bo9P#U6#ECDc_Cel)AhR;MW^%hF>> 1; - - const uint32_t even = (threadIdx.x & 1); - - if (thread < threads){ - uint32_t *Hash = (uint32_t*)&g_hash[8 * thread + 2 * even]; - - uint32_t x[16]; - - if (even == 0){ - x[0] = 0x2AEA2A61; x[1] = 0x50F494D4; x[2] = 0x2D538B8B; x[3] = 0x4167D83E; - x[4] = 0x4D42C787; x[5] = 0xA647A8B3; x[6] = 0x97CF0BEF; x[7] = 0x825B4537; - x[8] = 0xFCD398D9; x[9] = 0x148FE485; x[10] = 0x1B017BEF; x[11] = 0xB6444532; - x[12] = 0xD65C8A2B; x[13] = 0xA5A70E75; x[14] = 0xB1C62456; x[15] = 0xBC796576; - } - else{ - x[0] = 0x3FEE2313; x[1] = 0xC701CF8C; x[2] = 0xCC39968E; x[3] = 0x50AC5695; - x[4] = 0xEEF864D2; x[5] = 0xF22090C4; x[6] = 0xD0E5CD33; x[7] = 0xA23911AE; - x[8] = 0x6A536159; x[9] = 0x2FF5781C; x[10] = 0x91FA7934; x[11] = 0x0DBADEA9; - x[12] = 0x1921C8F7; x[13] = 0xE7989AF1; x[14] = 0x7795D246; x[15] = 0xD43E3B44; - } - *(uint4*)&x[0] ^= __ldg((uint4*)&Hash[0]); - rrounds(x); - - *(uint4*)&x[0] ^= __ldg((uint4*)&Hash[8]); - - rrounds(x); - - if (!even) - x[0] ^= 0x80; - - rrounds(x); - /* "the integer 1 is xored into the last state word x_11111" */ - if (even) - x[15] ^= 1; - -#pragma unroll 10 - for (int i = 0; i < 10; ++i) - rrounds(x); - - *(uint4*)&Hash[0] = *(uint4*)&x[0]; - *(uint4*)&Hash[8] = *(uint4*)&x[4]; - // g_hash[thread + (2*even+0) * threads] = *(uint2*)&x[ 0]; - // g_hash[thread + 
(2*even+1) * threads] = *(uint2*)&x[ 2]; - } -} -__host__ -void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){ - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((2 * threads + TPB - 1) / TPB); - dim3 block(TPB); - - x11_cubehash512_gpu_hash_64 << > >(threads, (uint64_t*)d_hash); - -} - -#else - -#define TPB 768 - -__device__ __forceinline__ -static void rrounds(uint32_t *x){ -#pragma unroll 2 - for (int r = 0; r < 16; r++) { - /* "add x_0jklm into x_1jklmn modulo 2^32 rotate x_0jklm upwards by 7 bits" */ - x[16] = x[16] + x[0]; x[0] = ROTL32(x[0], 7); x[17] = x[17] + x[1]; x[1] = ROTL32(x[1], 7); - x[18] = x[18] + x[2]; x[2] = ROTL32(x[2], 7); x[19] = x[19] + x[3]; x[3] = ROTL32(x[3], 7); - x[20] = x[20] + x[4]; x[4] = ROTL32(x[4], 7); x[21] = x[21] + x[5]; x[5] = ROTL32(x[5], 7); - x[22] = x[22] + x[6]; x[6] = ROTL32(x[6], 7); x[23] = x[23] + x[7]; x[7] = ROTL32(x[7], 7); - x[24] = x[24] + x[8]; x[8] = ROTL32(x[8], 7); x[25] = x[25] + x[9]; x[9] = ROTL32(x[9], 7); - x[26] = x[26] + x[10]; x[10] = ROTL32(x[10], 7); x[27] = x[27] + x[11]; x[11] = ROTL32(x[11], 7); - x[28] = x[28] + x[12]; x[12] = ROTL32(x[12], 7); x[29] = x[29] + x[13]; x[13] = ROTL32(x[13], 7); - x[30] = x[30] + x[14]; x[14] = ROTL32(x[14], 7); x[31] = x[31] + x[15]; x[15] = ROTL32(x[15], 7); - /* "swap x_00klm with x_01klm" */ - SWAP(x[0], x[8]); x[0] ^= x[16]; x[8] ^= x[24]; SWAP(x[1], x[9]); x[1] ^= x[17]; x[9] ^= x[25]; - SWAP(x[2], x[10]); x[2] ^= x[18]; x[10] ^= x[26]; SWAP(x[3], x[11]); x[3] ^= x[19]; x[11] ^= x[27]; - SWAP(x[4], x[12]); x[4] ^= x[20]; x[12] ^= x[28]; SWAP(x[5], x[13]); x[5] ^= x[21]; x[13] ^= x[29]; - SWAP(x[6], x[14]); x[6] ^= x[22]; x[14] ^= x[30]; SWAP(x[7], x[15]); x[7] ^= x[23]; x[15] ^= x[31]; - /* "swap x_1jk0m with x_1jk1m" */ - SWAP(x[16], x[18]); SWAP(x[17], x[19]); SWAP(x[20], x[22]); SWAP(x[21], x[23]); SWAP(x[24], x[26]); SWAP(x[25], x[27]); SWAP(x[28], x[30]); SWAP(x[29], x[31]); - /* "add x_0jklm into x_1jklm modulo 2^32 rotate x_0jklm upwards by 11 bits" */ - x[16] = x[16] + x[0]; x[0] = ROTL32(x[0], 11); x[17] = x[17] + x[1]; x[1] = ROTL32(x[1], 11); - x[18] = x[18] + x[2]; x[2] = ROTL32(x[2], 11); x[19] = x[19] + x[3]; x[3] = ROTL32(x[3], 11); - x[20] = x[20] + x[4]; x[4] = ROTL32(x[4], 11); x[21] = x[21] + x[5]; x[5] = ROTL32(x[5], 11); - x[22] = x[22] + x[6]; x[6] = ROTL32(x[6], 11); x[23] = x[23] + x[7]; x[7] = ROTL32(x[7], 11); - x[24] = x[24] + x[8]; x[8] = ROTL32(x[8], 11); x[25] = x[25] + x[9]; x[9] = ROTL32(x[9], 11); - x[26] = x[26] + x[10]; x[10] = ROTL32(x[10], 11); x[27] = x[27] + x[11]; x[11] = ROTL32(x[11], 11); - x[28] = x[28] + x[12]; x[12] = ROTL32(x[12], 11); x[29] = x[29] + x[13]; x[13] = ROTL32(x[13], 11); - x[30] = x[30] + x[14]; x[14] = ROTL32(x[14], 11); x[31] = x[31] + x[15]; x[15] = ROTL32(x[15], 11); - /* "swap x_0j0lm with x_0j1lm" */ - SWAP(x[0], x[4]); x[0] ^= x[16]; x[4] ^= x[20]; SWAP(x[1], x[5]); x[1] ^= x[17]; x[5] ^= x[21]; - SWAP(x[2], x[6]); x[2] ^= x[18]; x[6] ^= x[22]; SWAP(x[3], x[7]); x[3] ^= x[19]; x[7] ^= x[23]; - SWAP(x[8], x[12]); x[8] ^= x[24]; x[12] ^= x[28]; SWAP(x[9], x[13]); x[9] ^= x[25]; x[13] ^= x[29]; - SWAP(x[10], x[14]); x[10] ^= x[26]; x[14] ^= x[30]; SWAP(x[11], x[15]); x[11] ^= x[27]; x[15] ^= x[31]; - /* "swap x_1jkl0 with x_1jkl1" */ - SWAP(x[16], x[17]); SWAP(x[18], x[19]); SWAP(x[20], x[21]); SWAP(x[22], x[23]); SWAP(x[24], x[25]); SWAP(x[26], x[27]); SWAP(x[28], x[29]); SWAP(x[30], x[31]); - } -} - -/***************************************************/ -// GPU Hash Function 
-__global__ __launch_bounds__(TPB) -void x11_cubehash512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) -{ - - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) - return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - - if (thread < threads){ - - uint32_t *Hash = (uint32_t*)&g_hash[8 * thread]; - - uint32_t x[32] = { - 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, - 0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695, - 0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537, - 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE, - 0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532, - 0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, - 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576, - 0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44 - }; - - // erste Hälfte des Hashes (32 bytes) - //Update32(x, (const BitSequence*)Hash); - *(uint2x4*)&x[0] ^= __ldg4((uint2x4*)&Hash[0]); - - rrounds(x); - - // zweite Hälfte des Hashes (32 bytes) - // Update32(x, (const BitSequence*)(Hash+8)); - *(uint2x4*)&x[0] ^= __ldg4((uint2x4*)&Hash[8]); - - rrounds(x); - - // Padding Block - x[0] ^= 0x80; - rrounds(x); - - // Final(x, (BitSequence*)Hash); - x[31] ^= 1; - - /* "the state is then transformed invertibly through 10r identical rounds" */ -#pragma unroll 10 - for (int i = 0; i < 10; ++i) - rrounds(x); - - /* "output the first h/8 bytes of the state" */ - *(uint2x4*)&Hash[0] = *(uint2x4*)&x[0]; - *(uint2x4*)&Hash[8] = *(uint2x4*)&x[8]; - } -} - - -__host__ -void x11_cubehash512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash){ - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + TPB - 1) / TPB); - dim3 block(TPB); - - x11_cubehash512_gpu_hash_64 << > >(thr_id, threads, (uint64_t*)d_hash); - -} -#endif - -// - __device__ __constant__ static const uint32_t c_IV_512[32] = { 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, @@ -456,6 +214,48 @@ static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) /***************************************************/ +__global__ +void x11_cubehash512_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + //uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + int hashPosition = thread;//nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + + uint32_t x[2][2][2][2][2]; + Init(x); + + Update32(x, &Hash[0]); + Update32(x, &Hash[8]); + + // Padding Block + uint32_t last[8]; + last[0] = 0x80; + #pragma unroll 7 + for (int i=1; i < 8; i++) last[i] = 0; + Update32(x, last); + + Final(x, Hash); + } +} + +__host__ +void x11_cubehash512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 0; + + x11_cubehash512_gpu_hash_64 << > >(thr_id, threads, (uint64_t*)d_hash); +} __host__ void x11_cubehash512_cpu_init(int thr_id, uint32_t threads) { } @@ -476,11 +276,8 @@ void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata) } __global__ -void cubehash512_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash) +void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -521,7 +318,7 @@ void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const ui dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - cubehash512_gpu_hash_80 << > > (thr_id, threads, startNounce, (uint64_t*)d_hash); + cubehash512_gpu_hash_80 <<>> (threads, startNounce, (uint64_t*) d_hash); } #endif \ No newline at end of file diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index 3cd9f4685c..6ce3d8e993 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -299,13 +299,13 @@ void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g cuda_echo_round(sharedMemory, Hash); } } -/* + __host__ -void x11_echo512_cpu_init(int thr_id, uint32_t threads) +void X11_echo512_cpu_init(int thr_id, uint32_t threads) { aes_cpu_init(thr_id); } -*/ + __host__ void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { diff --git a/x11/cuda_x11_echo_aes.cuh b/x11/cuda_x11_echo_aes.cuh index 00ea3aa7b2..79499c201c 100644 --- a/x11/cuda_x11_echo_aes.cuh +++ b/x11/cuda_x11_echo_aes.cuh @@ -1,3 +1,4 @@ +#if 1 #include "miner.h" #include "cuda_vectors_alexis.h" @@ -5,9 +6,9 @@ #define AESx(x) (x ##UL) /* SPH_C32(x) */ //#define DEVICE_DIRECT_CONSTANTS -//#ifndef DEF_OINTMENT + #ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ __align__(64) uint32_t d_AES0[256] = { +__constant__ __align__(64) uint32_t d_AES0[256] = { #else static const uint32_t h_AES0[256] = { #endif @@ -78,149 +79,7 @@ static const uint32_t h_AES0[256] = { }; #ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ __align__(64) uint32_t d_AES1[256] = { -#else -static const uint32_t h_AES1[256] = { -#endif - AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), - AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), - AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), - AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), - AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), - AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), - AESx(0xADAD41EC), AESx(0xD4D4B367), 
AESx(0xA2A25FFD), AESx(0xAFAF45EA), - AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), - AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), - AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), - AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), - AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), - AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), - AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), - AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), - AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), - AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), - AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), - AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), - AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), - AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), - AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), - AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), - AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), - AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), - AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), - AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), - AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), - AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), - AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), - AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), - AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), - AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), - AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), - AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), - AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), - AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), - AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), - AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), - AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), - AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), - AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), - AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), - AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), - AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), - AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), - AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), - AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), - AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), - AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), - AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), - AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), - AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), - AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), - AESx(0x6161C2A3), AESx(0x35356A5F), 
AESx(0x5757AEF9), AESx(0xB9B969D0), - AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), - AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), - AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), - AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), - AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), - AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), - AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), - AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), - AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) -}; - -#ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ __align__(64) uint32_t d_AES2[256] = { -#else -static const uint32_t h_AES2[256] = { -#endif - AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), - AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), - AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), - AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), - AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), - AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), - AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), - AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), - AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), - AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), - AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), - AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), - AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), - AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), - AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), - AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), - AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), - AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), - AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), - AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), - AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), - AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), - AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), - AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), - AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), - AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), - AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), - AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), - AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), - AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), - AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), - AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), - AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), - AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), - AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), - AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), - AESx(0x60C0A060), AESx(0x81199881), 
AESx(0x4F9ED14F), AESx(0xDCA37FDC), - AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), - AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), - AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), - AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), - AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), - AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), - AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), - AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), - AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), - AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), - AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), - AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), - AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), - AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), - AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), - AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), - AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), - AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), - AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), - AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), - AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), - AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), - AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), - AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), - AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), - AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), - AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) -}; - -#ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ __align__(64) uint32_t d_AES3[256] = { +__constant__ __align__(64) uint32_t d_AES3[256] = { #else static const uint32_t h_AES3[256] = { #endif @@ -290,9 +149,8 @@ static const uint32_t h_AES3[256] = { AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) }; - #ifndef DEVICE_DIRECT_CONSTANTS -static __device__ uint32_t d_AES0[256] = { +__device__ uint32_t d_AES0[256] = { 0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC, 0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B, 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, @@ -311,182 +169,36 @@ static __device__ uint32_t d_AES0[256] = { 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C }; -static __device__ uint32_t d_AES1[256] = { - - AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), - AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), - AESx(0x30306050), AESx(0x01010203), 
AESx(0x6767CEA9), AESx(0x2B2B567D), - AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), - AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), - AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), - AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), - AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), - AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), - AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), - AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), - AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), - AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), - AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), - AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), - AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), - AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), - AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), - AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), - AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), - AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), - AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), - AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), - AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), - AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), - AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), - AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), - AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), - AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), - AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), - AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), - AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), - AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), - AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), - AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), - AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), - AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), - AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), - AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), - AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), - AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), - AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), - AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), - AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), - AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), - AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), - AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), - AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), - AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), - AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), - AESx(0xE8E8CB23), AESx(0xDDDDA17C), 
AESx(0x7474E89C), AESx(0x1F1F3E21), - AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), - AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), - AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), - AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), - AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), - AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), - AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), - AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), - AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), - AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), - AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), - AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), - AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +__device__ uint32_t d_AES3[256] = { + 0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5, 0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676, + 0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0, 0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0, + 0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC, 0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515, + 0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A, 0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575, + 0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0, 0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484, + 0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B, 0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF, + 0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585, 0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8, + 0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5, 0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2, + 0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717, 0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373, + 0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888, 0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB, + 0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C, 0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979, + 0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9, 0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808, + 0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6, 0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 
0x0F858A8A, + 0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E, 0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E, + 0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494, 0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF, + 0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868, 0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616 }; +/* +static __constant__ __align__(64) uint32_t d_AES0[256]; +static __constant__ __align__(64) uint32_t d_AES3[256]; +*/ -static __device__ uint32_t d_AES2[256] = { +static void aes_cpu_init(int thr_id) +{ + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES0, + h_AES0, + sizeof(h_AES0), + 0, cudaMemcpyHostToDevice)); - AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), - AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), - AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), - AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), - AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), - AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), - AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), - AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), - AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), - AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), - AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), - AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), - AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), - AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), - AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), - AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), - AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), - AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), - AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), - AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), - AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), - AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), - AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), - AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), - AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), - AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), - AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), - AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), - AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), - AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), - AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), - AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), - AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), - AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), - AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), - AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), 
AESx(0x73E69573), - AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), - AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), - AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), - AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), - AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), - AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), - AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), - AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), - AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), - AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), - AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), - AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), - AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), - AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), - AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), - AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), - AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), - AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), - AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), - AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), - AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), - AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), - AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), - AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), - AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), - AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), - AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), - AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) -}; - -static __device__ uint32_t d_AES3[256] = { - 0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5, 0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676, - 0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0, 0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0, - 0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC, 0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515, - 0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A, 0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575, - 0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0, 0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484, - 0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B, 0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF, - 0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585, 0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8, - 0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 
0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5, 0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2, - 0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717, 0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373, - 0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888, 0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB, - 0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C, 0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979, - 0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9, 0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808, - 0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6, 0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A, - 0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E, 0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E, - 0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494, 0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF, - 0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868, 0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616 -}; -/* -static __constant__ __align__(64) uint32_t d_AES0[256]; -static __constant__ __align__(64) uint32_t d_AES3[256]; -*/ - -static void aes_cpu_init(int thr_id) -{ - CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES0, - h_AES0, - sizeof(h_AES0), - 0, cudaMemcpyHostToDevice)); - /* - CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES1, - h_AES1, - sizeof(h_AES1), - 0, cudaMemcpyHostToDevice)); - - CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES2, - h_AES2, - sizeof(h_AES2), - 0, cudaMemcpyHostToDevice)); - */ CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES3, h_AES3, sizeof(h_AES3), @@ -723,303 +435,443 @@ static void KEY_EXPAND_ELT(const uint32_t *sharedMemory, uint32_t *k){ k[3] = y0; } - - -__device__ __forceinline__ -static void aes_round(const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ - - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = __ldg(&d_AES1[__byte_perm(x0, 0, 0x4441)]); - y2 = __ldg(&d_AES2[__byte_perm(x0, 0, 0x4442)]); - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - - y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); - y0 ^= __ldg(&d_AES1[__byte_perm(x1, 0, 0x4441)]); - y3 ^= __ldg(&d_AES2[__byte_perm(x1, 0, 0x4442)]); - y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - - y0 ^= k0; - - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= __ldg(&d_AES1[__byte_perm(x2, 0, 0x4441)]); - y0 ^= __ldg(&d_AES2[__byte_perm(x2, 0, 0x4442)]); - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); - - y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); - y2 ^= __ldg(&d_AES1[__byte_perm(x3, 0, 0x4441)]); - y1 ^= __ldg(&d_AES2[__byte_perm(x3, 0, 0x4442)]); - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); -} - -__device__ __forceinline__ -static void aes_round_LDG(const uint32_t x0, const uint32_t x1, const uint32_t x2, const 
uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ - - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = __ldg(&d_AES1[__byte_perm(x0, 0, 0x4441)]); - y2 = __ldg(&d_AES2[__byte_perm(x0, 0, 0x4442)]); - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - - y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); - y0 ^= __ldg(&d_AES1[__byte_perm(x1, 0, 0x4441)]); - y3 ^= __ldg(&d_AES2[__byte_perm(x1, 0, 0x4442)]); - y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - - y0 ^= k0; - - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= __ldg(&d_AES1[__byte_perm(x2, 0, 0x4441)]); - y0 ^= __ldg(&d_AES2[__byte_perm(x2, 0, 0x4442)]); - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); - - y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); - y2 ^= __ldg(&d_AES1[__byte_perm(x3, 0, 0x4441)]); - y1 ^= __ldg(&d_AES2[__byte_perm(x3, 0, 0x4442)]); - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); -} - -__device__ __forceinline__ -static void aes_round(const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ - - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = __ldg(&d_AES1[__byte_perm(x0, 0, 0x4441)]); - y2 = __ldg(&d_AES2[__byte_perm(x0, 0, 0x4442)]); - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - - y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); - y0 ^= __ldg(&d_AES1[__byte_perm(x1, 0, 0x4441)]); - y3 ^= __ldg(&d_AES2[__byte_perm(x1, 0, 0x4442)]); - y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= __ldg(&d_AES1[__byte_perm(x2, 0, 0x4441)]); - y0 ^= __ldg(&d_AES2[__byte_perm(x2, 0, 0x4442)]); - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); - - y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); - y2 ^= __ldg(&d_AES1[__byte_perm(x3, 0, 0x4441)]); - y1 ^= __ldg(&d_AES2[__byte_perm(x3, 0, 0x4442)]); - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); -} - -__device__ __forceinline__ -static void aes_round_LDG(const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ - - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = __ldg(&d_AES1[__byte_perm(x0, 0, 0x4441)]); - y2 = __ldg(&d_AES2[__byte_perm(x0, 0, 0x4442)]); - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - - y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); - y0 ^= __ldg(&d_AES1[__byte_perm(x1, 0, 0x4441)]); - y3 ^= __ldg(&d_AES2[__byte_perm(x1, 0, 0x4442)]); - y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= __ldg(&d_AES1[__byte_perm(x2, 0, 0x4441)]); - y0 ^= __ldg(&d_AES2[__byte_perm(x2, 0, 0x4442)]); - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); - - y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); - y2 ^= __ldg(&d_AES1[__byte_perm(x3, 0, 0x4441)]); - y1 ^= __ldg(&d_AES2[__byte_perm(x3, 0, 0x4442)]); - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); -} - -__device__ __forceinline__ -static void AES_2ROUND(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0){ - - uint32_t y0, y1, y2, y3; - - aes_round(x0, x1, x2, x3, k0, y0, y1, y2, y3); - - aes_round(y0, y1, y2, y3, x0, x1, x2, x3); - - // hier werden wir ein carry brauchen (oder auch nicht) - k0++; -} - -__device__ __forceinline__ -static void AES_2ROUND_LDG(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0){ - - uint32_t y0, y1, y2, y3; - - aes_round_LDG(x0, x1, x2, x3, k0, y0, y1, y2, y3); - - aes_round_LDG(y0, 
y1, y2, y3, x0, x1, x2, x3); - - // hier werden wir ein carry brauchen (oder auch nicht) - k0++; -} - -__device__ __forceinline__ -static void AES_ROUND_NOKEY(uint4* x){ - - uint32_t y0, y1, y2, y3; - aes_round(x->x, x->y, x->z, x->w, y0, y1, y2, y3); - - x->x = y0; - x->y = y1; - x->z = y2; - x->w = y3; - -} -__device__ __forceinline__ -static void KEY_EXPAND_ELT(uint32_t *k){ - - uint32_t y0, y1, y2, y3; - aes_round(k[0], k[1], k[2], k[3], y0, y1, y2, y3); - - k[0] = y1; - k[1] = y2; - k[2] = y3; - k[3] = y0; -} - -__device__ __forceinline__ -void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256]) -{ - /* each thread startup will fill a uint32 */ - if (threadIdx.x<256){ - uint32_t temp = __ldg(&d_AES0[threadIdx.x]); - sharedMemory[0][threadIdx.x] = temp; - sharedMemory[1][threadIdx.x] = ROL8(temp); - sharedMemory[2][threadIdx.x] = ROL16(temp); -#ifdef INTENSIVE_GMF -#else - sharedMemory[3][threadIdx.x] = ROR8(temp); -#endif - } -} - -__device__ __forceinline__ -void aes_gpu_init256(uint32_t sharedMemory[4][256]) -{ - /* each thread startup will fill a uint32 */ - uint32_t temp = __ldg(&d_AES0[threadIdx.x]); - sharedMemory[0][threadIdx.x] = temp; - sharedMemory[1][threadIdx.x] = ROL8(temp); - sharedMemory[2][threadIdx.x] = ROL16(temp); -#ifdef INTENSIVE_GMF -#else - sharedMemory[3][threadIdx.x] = ROR8(temp); -#endif -} - -__device__ __forceinline__ -void aes_gpu_init128(uint32_t sharedMemory[4][256]) -{ - /* each thread startup will fill 2 uint32 */ - uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); - - sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; - sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; - sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); - sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); - sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); - sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y); -#ifdef INTENSIVE_GMF -#else - sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); - sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); -#endif -} - -__device__ __forceinline__ -void aes_gpu_init_lt_256(uint32_t sharedMemory[4][256]) -{ - if (threadIdx.x<128){ - /* each thread startup will fill 2 uint32 */ - uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); - - sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; - sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; - sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); - sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); - sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); - sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y); -#ifdef INTENSIVE_GMF #else - sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); - sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); -#endif - } -} - +#include "miner.h" +#include "cuda_vectors_alexis.h" -__device__ __forceinline__ -static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ +/* AES Helper for inline-usage from SPH */ +#define AESx(x) (x ##UL) /* SPH_C32(x) */ - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; - y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); +//#define DEVICE_DIRECT_CONSTANTS - y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; - y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; - y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; -#ifdef INTENSIVE_GMF - y2 ^= 
__ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES0[256] = { #else - y2 ^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)]; +static const uint32_t h_AES0[256] = { #endif - - y0 ^= k0; - - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; - y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); - - y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; - y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; - y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); + AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), + AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), + AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), + AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), + AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), + AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), + AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), + AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), + AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), + AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), + AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), + AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), + AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), + AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), + AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), + AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), + AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), + AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), + AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), + AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), + AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), + AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), + AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), + AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), + AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), + AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), + AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), + AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), + AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), + AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), + AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), + AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), + AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), + AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), + AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), + AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), + AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), + AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), + AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), 
AESx(0x3C141428), + AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), + AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES1[256] = { +#else +static const uint32_t h_AES1[256] = { +#endif + AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), 
AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), + AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES2[256] = { +#else +static const uint32_t h_AES2[256] = { +#endif + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), 
AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), 
AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES3[256] = { +#else +static const uint32_t h_AES3[256] = { +#endif + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), + AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), 
AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), + AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + +#ifndef DEVICE_DIRECT_CONSTANTS +static __constant__ __align__(64) uint32_t d_AES0[256]; +static __constant__ __align__(64) uint32_t d_AES1[256]; +static __constant__ __align__(64) uint32_t d_AES2[256]; +static __constant__ __align__(64) uint32_t d_AES3[256]; + +static void aes_cpu_init(int thr_id) +{ + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES0, + h_AES0, + sizeof(h_AES0), + 0, cudaMemcpyHostToDevice)); + + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES1, + h_AES1, + sizeof(h_AES1), + 0, cudaMemcpyHostToDevice)); + + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES2, + h_AES2, + sizeof(h_AES2), + 0, cudaMemcpyHostToDevice)); + + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(d_AES3, + h_AES3, + sizeof(h_AES3), + 0, cudaMemcpyHostToDevice)); } +#else +static void aes_cpu_init(int thr_id) {} +#endif __device__ __forceinline__ -static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ +void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x<256){ + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + 
sharedMemory[0][threadIdx.x] = temp; + sharedMemory[1][threadIdx.x] = ROL8(temp); + sharedMemory[2][threadIdx.x] = ROL16(temp); + sharedMemory[3][threadIdx.x] = ROR8(temp); + } +} - y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); - y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; - y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; - y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); +__device__ __forceinline__ +void aes_gpu_init(uint32_t *sharedMemory) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; + sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; + } +} - y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; - y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; - y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; - y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); +/* tried with 3 xor.b32 asm, not faster */ +#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d)); + +__device__ +static void aes_round( +const uint32_t *sharedMemory, +uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0, +uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 y0 ^= k0; - y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); - y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; - y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; - y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + y3 = xor4_32( + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 +} - y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); - y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; - y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; - y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +__device__ +static void aes_round( +const uint32_t *sharedMemory, +uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, +uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 + + y3 = xor4_32( + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + 
sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 } __device__ __forceinline__ -static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ +static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ - y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]; + y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); -#ifdef INTENSIVE_GMF - y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); -#else y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; -#endif y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; +#ifdef INTENSIVE_GMF y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); +#else + y2 ^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)]; +#endif - y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)]; + y0 ^= k0; + + y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); @@ -1031,14 +883,42 @@ static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, co } __device__ __forceinline__ -static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ +static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){ + /* + y0 = xor4_32( + sharedMemory[0][__byte_perm(x0, 0, 0x4440)], + sharedMemory[1][__byte_perm(x1, 0, 0x4441)], + sharedMemory[2][__byte_perm(x2, 0, 0x4442)], + sharedMemory[3][__byte_perm(x3, 0, 0x4443)]); + + y1 = xor4_32( + sharedMemory[0][__byte_perm(x1, 0, 0x4440)], + sharedMemory[1][__byte_perm(x2, 0, 0x4441)], + sharedMemory[2][__byte_perm(x3, 0, 0x4442)], + sharedMemory[3][__byte_perm(x0, 0, 0x4443)]); + y2 = xor4_32( + sharedMemory[0][__byte_perm(x2, 0, 0x4440)], + sharedMemory[1][__byte_perm(x3, 0, 0x4441)], + sharedMemory[2][__byte_perm(x0, 0, 0x4442)], + sharedMemory[3][__byte_perm(x1, 0, 0x4443)]); // ^k2 + + y3 = xor4_32( + sharedMemory[0][__byte_perm(x3, 0, 0x4440)], + sharedMemory[1][__byte_perm(x0, 0, 0x4441)], + sharedMemory[2][__byte_perm(x1, 0, 0x4442)], + sharedMemory[3][__byte_perm(x2, 0, 0x4443)]); // ^k3 + */ y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]; y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); +#ifdef INTENSIVE_GMF y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); +#else + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; +#endif y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); @@ -1052,34 +932,9 @@ static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0 y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); -} - -__device__ __forceinline__ -static 
void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0){ - - uint32_t y0, y1, y2, y3; - - aes_round(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3); - - aes_round(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3); - - // hier werden wir ein carry brauchen (oder auch nicht) - k0++; -} -__device__ __forceinline__ -static void AES_2ROUND_LDG(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0){ - - uint32_t y0, y1, y2, y3; - - aes_round_LDG(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3); - - aes_round_LDG(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3); - - // hier werden wir ein carry brauchen (oder auch nicht) - k0++; } - +//! only bad people write code like this (ALL_CAPS non macro) __device__ __forceinline__ static void AES_ROUND_NOKEY(const uint32_t sharedMemory[4][256], uint4* x){ @@ -1103,3 +958,4 @@ static void KEY_EXPAND_ELT(const uint32_t sharedMemory[4][256], uint32_t *k){ k[2] = y3; k[3] = y0; } +#endif \ No newline at end of file diff --git a/x11/cuda_x11_echo_alexis.cu b/x11/cuda_x11_echo_alexis.cu index bff362b28a..84fe885c3b 100644 --- a/x11/cuda_x11_echo_alexis.cu +++ b/x11/cuda_x11_echo_alexis.cu @@ -2,14 +2,13 @@ Based on Tanguy Pruvot's repo Provos Alexis - 2016 */ + //#include "cuda_helper.h" -#include "miner.h" #include "cuda_helper_alexis.h" #include "cuda_vectors_alexis.h" #define INTENSIVE_GMF -//#include "cuda_x11_aes_alexis.cuh" -#include "../x11/cuda_x11_echo_aes.cuh" +#include "cuda_x11_aes_alexis.cuh" __device__ static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0){ @@ -269,12 +268,6 @@ static void x11_echo512_gpu_hash_64_final_alexis(uint32_t threads, uint64_t *g_h } } -__host__ -void X11_shavite512_cpu_init(int thr_id, uint32_t threads) -{ - aes_cpu_init(thr_id); -} - __host__ void x11_echo512_cpu_hash_64_final_alexis(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target) { @@ -286,11 +279,10 @@ void x11_echo512_cpu_hash_64_final_alexis(int thr_id, uint32_t threads, uint32_t x11_echo512_gpu_hash_64_final_alexis<<>>(threads, (uint64_t*)d_hash,d_resNonce,target); } - __global__ __launch_bounds__(128, 5) /* will force 80 registers */ static void x11_echo512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; __shared__ uint32_t sharedMemory[4][256]; @@ -446,14 +438,15 @@ static void x11_echo512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32 *(uint2x4*)&Hash[ 0] = *(uint2x4*)&hash[ 0] ^ *(uint2x4*)&W[ 0]; *(uint2x4*)&Hash[ 8] = *(uint2x4*)&hash[ 8] ^ *(uint2x4*)&W[ 8]; } -} +} __host__ void x11_echo512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_hash){ const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); + x11_echo512_gpu_hash_64_alexis << > >(thr_id, threads, d_hash); -// x11_echo512_gpu_hash_64_alexis << > >((int*)((uint64_t)d_ark | (thr_id & 15)), threads, d_hash); } diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu index 2e71d77586..e8ac522865 100644 --- a/x11/cuda_x11_luffa512.cu +++ b/x11/cuda_x11_luffa512.cu @@ -335,7 +335,7 @@ void finalization512(hashState *state, uint32_t *b) /***************************************************/ // Die Hash-Funktion 
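A pattern recurs throughout this patch: the echo and shavite kernels above (and several kernels below) now receive an int *thr_id pointer and return early when a per-GPU bit in a shared flag word is set, with the packed value cast through uintptr_t instead of uint64_t. The sketch that follows is only a minimal illustration of that idiom under assumed names (work_aborted, the flag-word layout, example_kernel) and assumed semantics; it is not code from this repository.

#include <stdint.h>

// Hypothetical helper mirroring the guard used by the patched kernels.
// Assumption: the caller packs a 16-byte-aligned pointer to a flag word
// together with a 4-bit GPU index in the low bits of one pointer-sized value.
__device__ __forceinline__ bool work_aborted(const int *thr_id)
{
	const uintptr_t packed = (uintptr_t)thr_id;
	const int *flags = (const int *)(packed & ~(uintptr_t)15); // aligned flag word
	const unsigned gpu = (unsigned)(packed & 15);              // per-GPU bit index
	return ((*flags) >> gpu) & 1;                              // bit set -> skip work
}

// Hypothetical kernel showing where the guard sits relative to the usual
// thread-index bounds check in the hashing kernels.
__global__ void example_kernel(int *thr_id, uint32_t threads, uint32_t *g_hash)
{
	if (work_aborted(thr_id))
		return;                    // work flagged as done/cancelled for this GPU (assumed semantics)
	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads)
		g_hash[thread] ^= thread;  // stand-in for the real hash round
}

Casting through uintptr_t, as the hunks above now do, keeps the bit-twiddling valid on both 32-bit and 64-bit hosts, which appears to be the reason the earlier uint64_t casts were replaced.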
-__global__ void x11_luffa512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ void x11_luffa512_gpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -364,7 +364,7 @@ void x11_luffa512_cpu_init(int thr_id, uint32_t threads) CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice)); } -__host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_luffa512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 256; @@ -375,7 +375,7 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; - x11_luffa512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_luffa512_gpu_hash_64 << > >(thr_id, threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_luffa512_Cubehash.cu b/x11/cuda_x11_luffa512_Cubehash.cu index cab0062443..7b55308950 100644 --- a/x11/cuda_x11_luffa512_Cubehash.cu +++ b/x11/cuda_x11_luffa512_Cubehash.cu @@ -732,7 +732,7 @@ __global__ #if __CUDA_ARCH__ > 500 __launch_bounds__(256, 4) #endif -void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) +void x11_luffaCubehash512_gpu_hash_64(int *thr_id, uint32_t threads, uint32_t *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -817,15 +817,15 @@ void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) } __host__ -void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order) +void x11_luffaCubehash512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x11_luffaCubehash512_gpu_hash_64 <<>> (threads, d_hash); - MyStreamSynchronize(NULL, order, thr_id); + x11_luffaCubehash512_gpu_hash_64 <<>> (thr_id, threads, d_hash); + MyStreamSynchronize(NULL, order, ((uintptr_t)thr_id) & 15); } // Setup diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index 0c774c4a63..4da808c218 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -534,12 +534,9 @@ __global__ __launch_bounds__(TPB, 2) #else #error "Not set up for this" #endif -void x11_shavite512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - -#if TPB == 128 + #if TPB == 128 aes_gpu_init_128(sharedMemory); #elif TPB == 384 //! 
todo, fix naming and sharedMemory @@ -607,7 +604,7 @@ void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x11_shavite512_gpu_hash_80 << > >(thr_id, threads, startNounce, (uint64_t*)d_outputHash); + x11_shavite512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_outputHash); } __host__ diff --git a/x11/cuda_x11_shavite512_alexis.cu b/x11/cuda_x11_shavite512_alexis.cu index b67aaee900..b366f4a708 100644 --- a/x11/cuda_x11_shavite512_alexis.cu +++ b/x11/cuda_x11_shavite512_alexis.cu @@ -188,7 +188,7 @@ static void round_4_8_12(const uint32_t sharedMemory[4][256], uint32_t* r, uint4 __global__ __launch_bounds__(TPB,2) /* 64 registers with 128,8 - 72 regs with 128,7 */ void x11_shavite512_gpu_hash_64_alexis(int *thr_id, const uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; __shared__ uint32_t sharedMemory[4][256]; @@ -507,13 +507,13 @@ void x11_shavite512_gpu_hash_64_alexis(int *thr_id, const uint32_t threads, uint *(uint2x4*)&Hash[ 4] = *(uint2x4*)&state[ 8] ^ *(uint2x4*)&p[ 0]; } } - + __host__ void x11_shavite512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_hash) { - dim3 grid((threads + TPB-1)/TPB); + dim3 grid((threads + TPB - 1) / TPB); dim3 block(TPB); // note: 128 threads minimum are required to init the shared memory array - x11_shavite512_gpu_hash_64_alexis << > >(thr_id, threads, (uint64_t*)d_hash); + x11_shavite512_gpu_hash_64_alexis<<>>(thr_id, threads, (uint64_t*)d_hash); } diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 44ec52ebb4..7da95c490f 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -1,113 +1,751 @@ /*************************************************************************************************** -* SIMD512 SM3+ CUDA IMPLEMENTATION (require cuda_x11_simd512_func.cuh) -* Uses Alexis78 simd modifications -*/ + * SIMD512 SM3+ CUDA IMPLEMENTATION (require cuda_x11_simd512_func.cuh) + */ #include "miner.h" #include "cuda_helper_alexis.h" -#include "cuda_vectors_alexis.h" + +#define TPB 128 + +uint32_t *d_state[MAX_GPUS]; +uint4 *d_temp4[MAX_GPUS]; + +// texture bound to d_temp4[thr_id], for read access in Compaction kernel +texture texRef1D_128; + +#define DEVICE_DIRECT_CONSTANTS + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ uint8_t c_perm[8][8] = { +#else +__constant__ uint8_t c_perm[8][8]; +const uint8_t h_perm[8][8] = { +#endif + { 2, 3, 6, 7, 0, 1, 4, 5 }, + { 6, 7, 2, 3, 4, 5, 0, 1 }, + { 7, 6, 5, 4, 3, 2, 1, 0 }, + { 1, 0, 3, 2, 5, 4, 7, 6 }, + { 0, 1, 4, 5, 6, 7, 2, 3 }, + { 6, 7, 2, 3, 0, 1, 4, 5 }, + { 6, 7, 0, 1, 4, 5, 2, 3 }, + { 4, 5, 2, 3, 6, 7, 0, 1 } +}; + +/* used in cuda_x11_simd512_func.cuh (SIMD_Compress2) */ +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ uint32_t c_IV_512[32] = { +#else +__constant__ uint32_t c_IV_512[32]; +const uint32_t h_IV_512[32] = { +#endif + 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, + 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, + 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, + 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ short 
c_FFT128_8_16_Twiddle[128] = { +#else +__constant__ short c_FFT128_8_16_Twiddle[128]; +static const short h_FFT128_8_16_Twiddle[128] = { +#endif + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, + 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, + 1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2,-123, 17,-111, + 1,-118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123,-122, + 1, 116, 92,-122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, + 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, + 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ short c_FFT256_2_128_Twiddle[128] = { +#else +__constant__ short c_FFT256_2_128_Twiddle[128]; +static const short h_FFT256_2_128_Twiddle[128] = { +#endif + 1, 41,-118, 45, 46, 87, -31, 14, + 60,-110, 116,-127, -67, 80, -61, 69, + 2, 82, 21, 90, 92, -83, -62, 28, + 120, 37, -25, 3, 123, -97,-122,-119, + 4, -93, 42, -77, -73, 91,-124, 56, + -17, 74, -50, 6, -11, 63, 13, 19, + 8, 71, 84, 103, 111, -75, 9, 112, + -34,-109,-100, 12, -22, 126, 26, 38, + 16,-115, -89, -51, -35, 107, 18, -33, + -68, 39, 57, 24, -44, -5, 52, 76, + 32, 27, 79,-102, -70, -43, 36, -66, + 121, 78, 114, 48, -88, -10, 104,-105, + 64, 54, -99, 53, 117, -86, 72, 125, + -15,-101, -29, 96, 81, -20, -49, 47, + 128, 108, 59, 106, -23, 85,-113, -7, + -30, 55, -58, -65, -95, -40, -98, 94 +}; + +/************* the round function ****************/ +#define IF(x, y, z) (((y ^ z) & x) ^ z) +#define MAJ(x, y, z) ((z &y) | ((z|y) & x)) + +#include "cuda_x11_simd512_sm2.cuh" +#include "cuda_x11_simd512_func.cuh" #ifdef __INTELLISENSE__ /* just for vstudio code colors */ #define __CUDA_ARCH__ 500 #endif -#define TPB50_1 128 -#define TPB50_2 128 -#define TPB52_1 128 -#define TPB52_2 128 +#if __CUDA_ARCH__ >= 300 -static uint4 *d_temp4[MAX_GPUS]; -#include "cuda_x11_simd512_func.cuh" +/********************* Message expansion ************************/ -__global__ -#if __CUDA_ARCH__ > 500 -__launch_bounds__(TPB52_2, 1) -#else -__launch_bounds__(TPB50_2, 4) +/* + * Reduce modulo 257; result is in [-127; 383] + * REDUCE(x) := (x&255) - (x>>8) + */ +#define REDUCE(x) \ + (((x)&255) - ((x)>>8)) + +/* + * Reduce from [-127; 383] to [-128; 128] + * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 + */ +#define EXTRA_REDUCE_S(x) \ + ((x)<=128 ? (x) : (x)-257) + +/* + * Reduce modulo 257; result is in [-128; 128] + */ +#define REDUCE_FULL_S(x) \ + EXTRA_REDUCE_S(REDUCE(x)) + +// Parallelization: +// +// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64) +// and 1 time 16-fach parallel (in FFT_128_full) +// +// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations + +/** + * FFT_8 using w=4 as 8th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. 
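The BUTTERFLY macro in FFT_8 below multiplies the difference lane by the twiddle factor with a plain shift: since w = 4 = 2^2, multiplying by 4^n is a left shift by 2*n, and REDUCE / EXTRA_REDUCE_S later pull the result back into the balanced range. A small self-contained check of that arithmetic, an editorial illustration using the same formulas, not part of the patch:

#include <assert.h>

/* Same formulas as the REDUCE / EXTRA_REDUCE_S macros used in this file. */
#define REDUCE(x)         (((x) & 255) - ((x) >> 8))
#define EXTRA_REDUCE_S(x) ((x) <= 128 ? (x) : (x) - 257)

int main(void)
{
	/* REDUCE(x) is congruent to x mod 257 because 256 = -1 (mod 257):
	   x = 256*hi + lo  implies  x = lo - hi (mod 257). */
	assert(REDUCE(300) == 43 && 300 % 257 == 43);

	/* One butterfly with w = 4: (u - v) << (2*n) is (u - v) * 4^n,
	   brought back into [-128, 128] by REDUCE / EXTRA_REDUCE_S. */
	int u = 100, v = 7, n = 3;
	int t = (u - v) << (2 * n);               /* 93 * 64 = 5952 */
	assert(EXTRA_REDUCE_S(REDUCE(t)) == 41);  /* 5952 mod 257 = 41 */
	return 0;
}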
+ */ +__device__ __forceinline__ +void FFT_8(int *y, int stripe) +{ +#define X(i) y[stripe*i] + +#define DO_REDUCE(i) \ + X(i) = REDUCE(X(i)) + +#define DO_REDUCE_FULL_S(i) \ +do { \ + X(i) = REDUCE(X(i)); \ + X(i) = EXTRA_REDUCE_S(X(i)); \ +} while(0) + +#define BUTTERFLY(i,j,n) \ +do { \ + int u= X(i); \ + int v= X(j); \ + X(i) = u+v; \ + X(j) = (u-v) << (2*n); \ +} while(0) + + BUTTERFLY(0, 4, 0); + BUTTERFLY(1, 5, 1); + BUTTERFLY(2, 6, 2); + BUTTERFLY(3, 7, 3); + + DO_REDUCE(6); + DO_REDUCE(7); + + BUTTERFLY(0, 2, 0); + BUTTERFLY(4, 6, 0); + BUTTERFLY(1, 3, 2); + BUTTERFLY(5, 7, 2); + + DO_REDUCE(7); + + BUTTERFLY(0, 1, 0); + BUTTERFLY(2, 3, 0); + BUTTERFLY(4, 5, 0); + BUTTERFLY(6, 7, 0); + + DO_REDUCE_FULL_S(0); + DO_REDUCE_FULL_S(1); + DO_REDUCE_FULL_S(2); + DO_REDUCE_FULL_S(3); + DO_REDUCE_FULL_S(4); + DO_REDUCE_FULL_S(5); + DO_REDUCE_FULL_S(6); + DO_REDUCE_FULL_S(7); + +#undef X +#undef DO_REDUCE +#undef DO_REDUCE_FULL_S +#undef BUTTERFLY +} + +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ < 300 + #define __shfl(var, srcLane, width) (uint32_t)(var) + // #error __shfl() not supported by SM 2.x +#endif #endif -static void x11_simd512_gpu_compress_64_maxwell(int *thr_id, uint32_t threads, uint32_t *g_hash, const uint4 *const __restrict__ g_fft4) + +/** + * FFT_16 using w=2 as 16th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. + */ +__device__ __forceinline__ +void FFT_16(int *y) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) - return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - const uint32_t thr_offset = thread << 6; // thr_id * 128 (je zwei elemente) - uint32_t IV[32]; - if (thread < threads){ +#define DO_REDUCE_FULL_S(i) \ + do { \ + y[i] = REDUCE(y[i]); \ + y[i] = EXTRA_REDUCE_S(y[i]); \ + } while(0) - uint32_t *Hash = &g_hash[thread << 4]; - // Compression1(Hash, thread, g_fft4, g_state); - uint32_t A[32]; + int u,v; - *(uint2x4*)&IV[0] = *(uint2x4*)&c_IV_512[0]; - *(uint2x4*)&IV[8] = *(uint2x4*)&c_IV_512[8]; - *(uint2x4*)&IV[16] = *(uint2x4*)&c_IV_512[16]; - *(uint2x4*)&IV[24] = *(uint2x4*)&c_IV_512[24]; + // BUTTERFLY(0, 8, 0); + // BUTTERFLY(1, 9, 1); + // BUTTERFLY(2, 10, 2); + // BUTTERFLY(3, 11, 3); + // BUTTERFLY(4, 12, 4); + // BUTTERFLY(5, 13, 5); + // BUTTERFLY(6, 14, 6); + // BUTTERFLY(7, 15, 7); + { + u = y[0]; // 0..7 + v = y[1]; // 8..15 + y[0] = u+v; + y[1] = (u-v) << (threadIdx.x&7); + } - *(uint2x4*)&A[0] = __ldg4((uint2x4*)&Hash[0]); - *(uint2x4*)&A[8] = __ldg4((uint2x4*)&Hash[8]); + // DO_REDUCE(11); + // DO_REDUCE(12); + // DO_REDUCE(13); + // DO_REDUCE(14); + // DO_REDUCE(15); + if ((threadIdx.x&7) >=3) y[1] = REDUCE(y[1]); // 11...15 -#pragma unroll 16 - for (uint32_t i = 0; i<16; i++) - A[i] = A[i] ^ IV[i]; + // BUTTERFLY( 0, 4, 0); + // BUTTERFLY( 1, 5, 2); + // BUTTERFLY( 2, 6, 4); + // BUTTERFLY( 3, 7, 6); + { + u = __shfl((int)y[0], (threadIdx.x&3),8); // 0,1,2,3 0,1,2,3 + v = __shfl((int)y[0],4+(threadIdx.x&3),8); // 4,5,6,7 4,5,6,7 + y[0] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); + } + + // BUTTERFLY( 8, 12, 0); + // BUTTERFLY( 9, 13, 2); + // BUTTERFLY(10, 14, 4); + // BUTTERFLY(11, 15, 6); + { + u = __shfl((int)y[1], (threadIdx.x&3),8); // 8,9,10,11 8,9,10,11 + v = __shfl((int)y[1],4+(threadIdx.x&3),8); // 12,13,14,15 12,13,14,15 + y[1] = ((threadIdx.x&7) < 4) ? 
(u+v) : ((u-v) << (2*(threadIdx.x&3))); + } + + // DO_REDUCE(5); + // DO_REDUCE(7); + // DO_REDUCE(13); + // DO_REDUCE(15); + if ((threadIdx.x&1) && (threadIdx.x&7) >= 4) { + y[0] = REDUCE(y[0]); // 5, 7 + y[1] = REDUCE(y[1]); // 13, 15 + } + + // BUTTERFLY( 0, 2, 0); + // BUTTERFLY( 1, 3, 4); + // BUTTERFLY( 4, 6, 0); + // BUTTERFLY( 5, 7, 4); + { + u = __shfl((int)y[0], (threadIdx.x&5),8); // 0,1,0,1 4,5,4,5 + v = __shfl((int)y[0],2+(threadIdx.x&5),8); // 2,3,2,3 6,7,6,7 + y[0] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); + } + + // BUTTERFLY( 8, 10, 0); + // BUTTERFLY( 9, 11, 4); + // BUTTERFLY(12, 14, 0); + // BUTTERFLY(13, 15, 4); + { + u = __shfl((int)y[1], (threadIdx.x&5),8); // 8,9,8,9 12,13,12,13 + v = __shfl((int)y[1],2+(threadIdx.x&5),8); // 10,11,10,11 14,15,14,15 + y[1] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); + } + + // BUTTERFLY( 0, 1, 0); + // BUTTERFLY( 2, 3, 0); + // BUTTERFLY( 4, 5, 0); + // BUTTERFLY( 6, 7, 0); + { + u = __shfl((int)y[0], (threadIdx.x&6),8); // 0,0,2,2 4,4,6,6 + v = __shfl((int)y[0],1+(threadIdx.x&6),8); // 1,1,3,3 5,5,7,7 + y[0] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); + } + + // BUTTERFLY( 8, 9, 0); + // BUTTERFLY(10, 11, 0); + // BUTTERFLY(12, 13, 0); + // BUTTERFLY(14, 15, 0); + { + u = __shfl((int)y[1], (threadIdx.x&6),8); // 8,8,10,10 12,12,14,14 + v = __shfl((int)y[1],1+(threadIdx.x&6),8); // 9,9,11,11 13,13,15,15 + y[1] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); + } + + DO_REDUCE_FULL_S( 0); // 0...7 + DO_REDUCE_FULL_S( 1); // 8...15 + +#undef DO_REDUCE_FULL_S +} + +__device__ __forceinline__ +void FFT_128_full(int y[128]) +{ + int i; + + FFT_8(y+0,2); // eight parallel FFT8's + FFT_8(y+1,2); // eight parallel FFT8's #pragma unroll 16 - for (uint32_t i = 16; i<32; i++) - A[i] = IV[i]; + for (i=0; i<16; i++) + /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]); + +#pragma unroll 8 + for (i=0; i<8; i++) + FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads +} - Round8(A, thr_offset, g_fft4); +__device__ __forceinline__ +void FFT_256_halfzero(int y[256]) +{ + /* + * FFT_256 using w=41 as 256th root of unity. + * Decimation in frequency (DIF) NTT. + * Output data is in revbin_permuted order. + * In place. 
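The root w = 41 named in this comment also explains the c_FFT256_2_128_Twiddle table declared earlier in the file: its first eight entries match consecutive powers of 41 reduced to the balanced range [-128, 128]. A minimal host-side check that reproduces that first row, an editorial sketch only; the constant table in the patch is authoritative:

#include <stdio.h>

/* Balanced representative of x mod 257, i.e. the value in [-128, 128]. */
static int balanced257(int x)
{
	int r = x % 257;          /* C truncation keeps r in (-257, 257) */
	if (r >  128) r -= 257;
	if (r < -128) r += 257;
	return r;
}

int main(void)
{
	int w = 1;
	for (int i = 0; i < 8; i++) {
		printf("%d ", w);     /* prints: 1 41 -118 45 46 87 -31 14 */
		w = balanced257(w * 41);
	}
	printf("\n");
	return 0;
}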
+ */ + const int tmp = y[15]; - STEP8_IF(&IV[0], 32, 4, 13, &A[0], &A[8], &A[16], &A[24]); - STEP8_IF(&IV[8], 33, 13, 10, &A[24], &A[0], &A[8], &A[16]); - STEP8_IF(&IV[16], 34, 10, 25, &A[16], &A[24], &A[0], &A[8]); - STEP8_IF(&IV[24], 35, 25, 4, &A[8], &A[16], &A[24], &A[0]); +#pragma unroll 8 + for (int i=0; i<8; i++) + y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]); +#pragma unroll 8 + for (int i=8; i<16; i++) + y[16+i] = 0; -#pragma unroll 32 - for (uint32_t i = 0; i<32; i++){ - IV[i] = A[i]; - } + /* handle X^255 with an additional butterfly */ + if ((threadIdx.x&7) == 7) + { + y[15] = REDUCE(tmp + 1); + y[31] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); + } - A[0] ^= 512; + FFT_128_full(y); + FFT_128_full(y+16); +} - Round8_0_final(A, 3, 23, 17, 27); - Round8_1_final(A, 28, 19, 22, 7); - Round8_2_final(A, 29, 9, 15, 5); - Round8_3_final(A, 4, 13, 10, 25); - STEP8_IF(&IV[0], 32, 4, 13, &A[0], &A[8], &A[16], &A[24]); - STEP8_IF(&IV[8], 33, 13, 10, &A[24], &A[0], &A[8], &A[16]); - STEP8_IF(&IV[16], 34, 10, 25, &A[16], &A[24], &A[0], &A[8]); - STEP8_IF(&IV[24], 35, 25, 4, &A[8], &A[16], &A[24], &A[0]); +/***************************************************/ - *(uint2x4*)&Hash[0] = *(uint2x4*)&A[0]; - *(uint2x4*)&Hash[8] = *(uint2x4*)&A[8]; +__device__ __forceinline__ +void Expansion(const uint32_t *data, uint4 *g_temp4) +{ + /* Message Expansion using Number Theoretical Transform similar to FFT */ + int expanded[32]; +#pragma unroll 4 + for (int i=0; i < 4; i++) { + expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; + expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; + } +#pragma unroll 8 + for (int i=8; i < 16; i++) + expanded[i] = 0; + + FFT_256_halfzero(expanded); + + // store w matrices in global memory + +#define mul_185(x) ( (x)*185 ) +#define mul_233(x) ( (x)*233 ) + + uint4 vec0; + int P, Q, P1, Q1, P2, Q2; + bool even = (threadIdx.x & 1) == 0; + +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 + + // 2 6 0 4 + + P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? 
Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + g_temp4[threadIdx.x&7] = vec0; + +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + + // 6 2 4 0 + + P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + g_temp4[8+(threadIdx.x&7)] = vec0; + +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + + // 7 5 3 1 + + bool hi = (threadIdx.x&7)>=4; + + P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? 
Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + g_temp4[16+(threadIdx.x&7)] = vec0; + +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 + + // 1 3 5 7 + + bool lo = (threadIdx.x&7)<4; + + P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + g_temp4[24+(threadIdx.x&7)] = vec0; + +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 + +//{ 8, 72, 40, 104, 24, 88, 56, 120 }, { 9, 73, 41, 105, 25, 89, 57, 121 }, +//{ 4, 68, 36, 100, 20, 84, 52, 116 }, { 5, 69, 37, 101, 21, 85, 53, 117 }, +//{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 }, +//{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 }, + + bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5 + + P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? 
Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + + g_temp4[32+(threadIdx.x&7)] = vec0; + +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 + + P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + + g_temp4[40+(threadIdx.x&7)] = vec0; + +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 + + // sel markiert threads 2,3,4,5 + + int t; + t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? 
Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + + g_temp4[48+(threadIdx.x&7)] = vec0; + +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 + + // sel markiert threads 2,3,4,5 + + t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? 
Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + + g_temp4[56+(threadIdx.x&7)] = vec0; + +#undef mul_185 +#undef mul_233 +} + +/***************************************************/ + +__global__ __launch_bounds__(TPB, 4) +void x11_simd512_gpu_expand_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_temp4) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + int threadBloc = (blockDim.x * blockIdx.x + threadIdx.x) / 8; + if (threadBloc < threads) + { + int hashPosition = threadBloc * 16; + uint32_t *inpHash = &g_hash[hashPosition]; + + // Read hash per 8 threads + uint32_t Hash[2]; + int ndx = threadIdx.x & 7; + Hash[0] = inpHash[ndx]; + Hash[1] = inpHash[ndx + 8]; + + // Puffer für expandierte Nachricht + uint4 *temp4 = &g_temp4[hashPosition * 4]; + + Expansion(Hash, temp4); + } +} + +__global__ __launch_bounds__(TPB, 1) +void x11_simd512_gpu_compress1_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *Hash = &g_hash[thread * 16]; + Compression1(Hash, thread, g_fft4, g_state); + } +} + +__global__ __launch_bounds__(TPB, 1) +void x11_simd512_gpu_compress2_64(int *thr_id, uint32_t threads, uint4 *g_fft4, uint32_t *g_state) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + Compression2(thread, g_fft4, g_state); + } +} + +__global__ __launch_bounds__(TPB, 2) +void x11_simd512_gpu_compress_64_maxwell(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *Hash = &g_hash[thread * 16]; + Compression1(Hash, thread, g_fft4, g_state); + Compression2(thread, g_fft4, g_state); + } +} + +__global__ __launch_bounds__(TPB, 2) +void x11_simd512_gpu_final_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) +{ + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) + return; + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *Hash = &g_hash[thread * 16]; + Final(Hash, thread, g_fft4, g_state); } } +#else +__global__ void x11_simd512_gpu_expand_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_temp4) {} +__global__ void x11_simd512_gpu_compress1_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} +__global__ void x11_simd512_gpu_compress2_64(int *thr_id, uint32_t threads, uint4 *g_fft4, uint32_t *g_state) {} +__global__ void x11_simd512_gpu_compress_64_maxwell(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} +__global__ void x11_simd512_gpu_final_64(int *thr_id, uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} +#endif /* SM3+ */ + __host__ -int x11_simd512_cpu_init(int thr_id, uint32_t threads){ - return cudaMalloc(&d_temp4[thr_id], 64 * sizeof(uint4)*threads); +int x11_simd512_cpu_init(int thr_id, uint32_t threads) +{ + int dev_id = device_map[thr_id]; + // cuda_get_arch(thr_id); // 
should be already done! + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { + x11_simd512_cpu_init_sm2(thr_id); + return 0; + } + //2097152 +#if 0 + if (threads > 2097152) + { + CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 32 * sizeof(uint4)*(threads >> 1)), (int)err); /* todo: prevent -i 21 */ + CUDA_CALL_OR_RET_X(cudaMalloc((&d_temp4[thr_id]) + 32 * (threads >> 1), 32 * sizeof(uint4)*(threads >> 1)), (int)err); /* todo: prevent -i 21 */ + } + else +#endif + CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64 * sizeof(uint4)*threads), (int)err); /* todo: prevent -i 21 */ + CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32 * sizeof(int)*threads), (int)err); + +#ifndef DEVICE_DIRECT_CONSTANTS + cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); + + cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice); +#endif + + // Texture for 128-Bit Zugriffe + cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc(); + texRef1D_128.normalized = 0; + texRef1D_128.filterMode = cudaFilterModePoint; + texRef1D_128.addressMode[0] = cudaAddressModeClamp; + + CUDA_CALL_OR_RET_X(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads), (int) err); + + return 0; } __host__ -void x11_simd512_cpu_free(int thr_id){ - cudaFree(d_temp4[thr_id]); +void x11_simd512_cpu_free(int thr_id) +{ + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) { + cudaFree(d_temp4[thr_id]); + cudaFree(d_state[thr_id]); + } } - + __host__ -void x11_simd512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash){ +void x11_simd512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) +{ + const uint32_t threadsperblock = TPB; + int dev_id = device_map[((uintptr_t)thr_id) & 15]; + //2097152 + dim3 block(threadsperblock); + dim3 grid((threads + threadsperblock-1) / threadsperblock); + dim3 gridX8(grid.x * 8); - int dev_id = device_map[((uint64_t)thr_id) & 15]; + x11_simd512_gpu_expand_64 << > > (thr_id, threads, d_hash, d_temp4[((uintptr_t)thr_id) & 15]); - uint32_t tpb = TPB52_1; - if (device_sm[dev_id] <= 500) tpb = TPB50_1; - const dim3 grid1((8 * threads + tpb - 1) / tpb); - const dim3 block1(tpb); + if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) { + x11_simd512_gpu_compress_64_maxwell << < grid, block >> > (thr_id, threads, d_hash, d_temp4[((uintptr_t)thr_id) & 15], d_state[((uintptr_t)thr_id) & 15]); + } else { + x11_simd512_gpu_compress1_64 << < grid, block >> > (thr_id, threads, d_hash, d_temp4[((uintptr_t)thr_id) & 15], d_state[((uintptr_t)thr_id) & 15]); + x11_simd512_gpu_compress2_64 << < grid, block >> > (thr_id, threads, d_temp4[((uintptr_t)thr_id) & 15], d_state[((uintptr_t)thr_id) & 15]); + } - tpb = TPB52_2; - if (device_sm[dev_id] <= 500) tpb = TPB50_2; - const dim3 grid2((threads + tpb - 1) / tpb); - const dim3 block2(tpb); + x11_simd512_gpu_final_64 << > > (thr_id, threads, d_hash, 
d_temp4[((uintptr_t)thr_id) & 15], d_state[((uintptr_t)thr_id) & 15]); - x11_simd512_gpu_expand_64 << > > (thr_id, threads, d_hash, d_temp4[(uint64_t)thr_id & 15]); - x11_simd512_gpu_compress_64_maxwell << < grid2, block2 >> > (thr_id, threads, d_hash, d_temp4[(uint64_t)thr_id & 15]); +// MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_simd512_func.cuh b/x11/cuda_x11_simd512_func.cuh index 5470ae9c8e..f61eaa4f59 100644 --- a/x11/cuda_x11_simd512_func.cuh +++ b/x11/cuda_x11_simd512_func.cuh @@ -1,659 +1,1396 @@ - -static __constant__ const uint8_t c_perm[8][8] = { - { 2, 3, 6, 7, 0, 1, 4, 5 }, { 6, 7, 2, 3, 4, 5, 0, 1 }, { 7, 6, 5, 4, 3, 2, 1, 0 }, { 1, 0, 3, 2, 5, 4, 7, 6 }, - { 0, 1, 4, 5, 6, 7, 2, 3 }, { 6, 7, 2, 3, 0, 1, 4, 5 }, { 6, 7, 0, 1, 4, 5, 2, 3 }, { 4, 5, 2, 3, 6, 7, 0, 1 } -}; - -static __constant__ const uint32_t c_IV_512[32] = { - 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, - 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, - 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, - 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 -}; - -static __constant__ const int16_t c_FFT128_8_16_Twiddle[128] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, - 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, 1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2, -123, 17, -111, - 1, -118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123, -122, 1, 116, 92, -122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, - 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 -}; - -static __constant__ const int16_t c_FFT256_2_128_Twiddle[128] = { - 1, 41, -118, 45, 46, 87, -31, 14, 60, -110, 116, -127, -67, 80, -61, 69, 2, 82, 21, 90, 92, -83, -62, 28, 120, 37, -25, 3, 123, -97, -122, -119, - 4, -93, 42, -77, -73, 91, -124, 56, -17, 74, -50, 6, -11, 63, 13, 19, 8, 71, 84, 103, 111, -75, 9, 112, -34, -109, -100, 12, -22, 126, 26, 38, - 16, -115, -89, -51, -35, 107, 18, -33, -68, 39, 57, 24, -44, -5, 52, 76, 32, 27, 79, -102, -70, -43, 36, -66, 121, 78, 114, 48, -88, -10, 104, -105, - 64, 54, -99, 53, 117, -86, 72, 125, -15, -101, -29, 96, 81, -20, -49, 47, 128, 108, 59, 106, -23, 85, -113, -7, -30, 55, -58, -65, -95, -40, -98, 94 -}; +#define SIMD_FUNCTIONS_CUH -__device__ __forceinline__ -static uint32_t IF(uint32_t x, uint32_t y, uint32_t z){ - /* - #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 - uint32_t result; - asm("lop3.b32 %0, %1, %2, %3, 0xCA;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); // x=F0, y=CC, z=AA // 0xCA = ((CC⊻AA)∧F0)⊻AA - return result; - #else - */ return (((y ^ z) & x) ^ z); -// #endif +__device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(int j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + 
w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for(int j=0; j<8; j++) { + A[j] = R[j]; + } } - - -__device__ __forceinline__ -static uint32_t MAJ(const uint32_t x, const uint32_t y, const uint32_t z){ - -#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 - uint32_t result; - asm("lop3.b32 %0, %1, %2, %3, 0xE8;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); // x=AA, y=CC, z=F0 // 0xCA = ((CC⊻AA)∧F0)⊻AA - return result; -#else - return ((z &y) | ((z | y) & x)); -#endif +__device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \ - ((x)%7) == 1 ? 6 : \ - ((x)%7) == 2 ? 2 : \ - ((x)%7) == 3 ? 3 : \ - ((x)%7) == 4 ? 5 : \ - ((x)%7) == 5 ? 
7 : 4 ) - -__device__ __forceinline__ -static void STEP8_IF(const uint32_t *w, const uint32_t i, const uint32_t r, const uint32_t s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) +__device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) { + uint32_t temp; uint32_t R[8]; - #pragma unroll 8 - for (int j = 0; j<8; j++) + for (int j = 0; j<8; j++) { R[j] = ROTL32(A[j], r); - - uint32_t W[8]; - *(uint2x4*)&W[0] = *(uint2x4*)&w[0]; + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; #pragma unroll 8 - for (int j = 0; j<8; j++) - D[j] += W[j] + IF(A[j], B[j], C[j]); + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; #pragma unroll 8 - for (int j = 0; j<8; j++) - D[j] = R[j^p8_xor(i)] + ROTL32(D[j], s); + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; #pragma unroll 8 - for (int j = 0; j<8; j++) + for (int j = 0; j<8; j++) { A[j] = R[j]; + } } - -__device__ __forceinline__ -static void STEP8_MAJ(const uint32_t *w, const uint32_t i, const uint32_t r, const uint32_t s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) +__device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) { + uint32_t temp; uint32_t R[8]; - - uint32_t W[8]; - *(uint2x4*)&W[0] = *(uint2x4*)&w[0]; - #pragma unroll 8 - for (int j = 0; j<8; j++) + for (int j = 0; j<8; j++) { R[j] = ROTL32(A[j], r); - + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; 
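Each hand-unrolled STEP8_IF_k / STEP8_MAJ_k in this header differs only in which rotated lane R[j ^ d] is added into D[j]; the XOR distance d follows the p8_xor(k) pattern of the macro removed just above (1, 6, 2, 3, 5, 7, 4 for k mod 7 = 0..6). A compact generic form, equivalent to the unrolled bodies and shown only to make that pattern explicit; this is an editorial sketch that assumes the ROTL32, IF and MAJ definitions already present in these files:

__device__ __forceinline__
static void STEP8_generic(const uint32_t *w, const int step, const int r, const int s,
	const bool use_if, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
{
	const uint8_t xdist[7] = { 1, 6, 2, 3, 5, 7, 4 };  /* p8_xor(step % 7) */
	uint32_t R[8];
	#pragma unroll 8
	for (int j = 0; j < 8; j++)
		R[j] = ROTL32(A[j], r);
	#pragma unroll 8
	for (int j = 0; j < 8; j++) {
		const uint32_t f = use_if ? IF(A[j], B[j], C[j]) : MAJ(A[j], B[j], C[j]);
		D[j] = ROTL32(D[j] + w[j] + f, s) + R[j ^ xdist[step % 7]];
	}
	#pragma unroll 8
	for (int j = 0; j < 8; j++)
		A[j] = R[j];
}

The unrolled variants presumably trade this table lookup and the use_if branch for straight-line code, which is the usual reason for this kind of expansion in register-bound CUDA kernels.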
#pragma unroll 8 - for (int j = 0; j<8; j++) - D[j] += W[j] + MAJ(A[j], B[j], C[j]); + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; #pragma unroll 8 - for (int j = 0; j<8; j++) - D[j] = R[j^p8_xor(i)] + ROTL32(D[j], s); + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; #pragma unroll 8 - for (int j = 0; j<8; j++) + for (int j = 0; j<8; j++) { A[j] = R[j]; + } } - -static __constant__ uint32_t d_cw[4][8][8] = { - 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, - 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, - 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, - 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3, - 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, - 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, - 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, - 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80, - 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, - 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, - 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, - 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 
0x949A6B66, 0x303DCFC3, 0x5932A6CE, - 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, - 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, - 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, - 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D - -}; - -__device__ __forceinline__ -static void Round8_0_final(uint32_t* A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ - - STEP8_IF(d_cw[0][0], 0, r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF(d_cw[0][1], 1, s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF(d_cw[0][2], 2, t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF(d_cw[0][3], 3, u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ(d_cw[0][4], 4, r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ(d_cw[0][5], 5, s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ(d_cw[0][6], 6, t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ(d_cw[0][7], 7, u, r, &A[8], &A[16], &A[24], A); +__device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -__device__ __forceinline__ -static void Round8_1_final(uint32_t* A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ - - STEP8_IF(d_cw[1][0], 8, r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF(d_cw[1][1], 9, s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF(d_cw[1][2], 10, t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF(d_cw[1][3], 11, u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ(d_cw[1][4], 12, r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ(d_cw[1][5], 13, s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ(d_cw[1][6], 14, t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ(d_cw[1][7], 15, u, r, &A[8], &A[16], &A[24], A); +__device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + 
temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -__device__ __forceinline__ -static void Round8_2_final(uint32_t* A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ - - STEP8_IF(d_cw[2][0], 16, r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF(d_cw[2][1], 17, s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF(d_cw[2][2], 18, t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF(d_cw[2][3], 19, u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ(d_cw[2][4], 20, r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ(d_cw[2][5], 21, s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ(d_cw[2][6], 22, t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ(d_cw[2][7], 23, u, r, &A[8], &A[16], &A[24], A); +__device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -__device__ __forceinline__ -static void Round8_3_final(uint32_t* A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ - - STEP8_IF(d_cw[3][0], 24, r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF(d_cw[3][1], 25, s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF(d_cw[3][2], 26, t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF(d_cw[3][3], 27, u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ(d_cw[3][4], 28, r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ(d_cw[3][5], 29, s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ(d_cw[3][6], 30, t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ(d_cw[3][7], 31, u, r, &A[8], &A[16], &A[24], A); +__device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + 
D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } +__device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ -//#define expanded_vector(x) __ldg(&g_fft4[x]) -static __device__ __forceinline__ void expanded_vector(uint32_t* w, const uint4* ptr){ - asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(w[0]), "=r"(w[1]), "=r"(w[2]), "=r"(w[3]) : __LDG_PTR(ptr)); + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -__device__ __forceinline__ -static void Round8(uint32_t* A, const uint32_t thr_offset, const uint4 *const __restrict__ g_fft4) { - - uint32_t w[8]; - uint32_t tmp = thr_offset; - - uint32_t r = 3, s = 23, t = 17, u = 27; - - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 0, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 1, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 2, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 3, u, r, &A[8], &A[16], &A[24], A); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 4, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 5, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 6, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 7, u, r, &A[8], &A[16], &A[24], A); - - r = 28; s = 19; t = 22; u = 7; - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 8, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 9, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 10, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 11, u, r, &A[8], &A[16], &A[24], A); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 12, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 13, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], 
&g_fft4[tmp++]); - STEP8_MAJ(w, 14, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 15, u, r, &A[8], &A[16], &A[24], A); - - r = 29; s = 9; t = 15; u = 5; - - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 16, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 17, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 18, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 19, u, r, &A[8], &A[16], &A[24], A); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 20, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 21, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 22, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 23, u, r, &A[8], &A[16], &A[24], A); - - r = 4; s = 13; t = 10; u = 25; - - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 24, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 25, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 26, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_IF(w, 27, u, r, &A[8], &A[16], &A[24], A); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 28, r, s, A, &A[8], &A[16], &A[24]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 29, s, t, &A[24], A, &A[8], &A[16]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 30, t, u, &A[16], &A[24], A, &A[8]); - expanded_vector(&w[0], &g_fft4[tmp++]); - expanded_vector(&w[4], &g_fft4[tmp++]); - STEP8_MAJ(w, 31, u, r, &A[8], &A[16], &A[24], A); - +__device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -/********************* Message expansion ************************/ - -/* -* Reduce modulo 257; result is in [-127; 383] -* REDUCE(x) := 
(x&255) - (x>>8) -*/ -#define REDUCE(x) \ - (((x)&255) - ((x)>>8)) - -/* -* Reduce from [-127; 383] to [-128; 128] -* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 -*/ -#define EXTRA_REDUCE_S(x) \ - ((x)<=128 ? (x) : (x)-257) - -/* -* Reduce modulo 257; result is in [-128; 128] -*/ -#define REDUCE_FULL_S(x) \ - EXTRA_REDUCE_S(REDUCE(x)) - -// Parallelization: -// -// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64) -// and 1 time 16-fach parallel (in FFT_128_full) -// -// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations - -/** -* FFT_8 using w=4 as 8th root of unity -* Unrolled decimation in frequency (DIF) radix-2 NTT. -* Output data is in revbin_permuted order. -*/ -__device__ __forceinline__ -static void FFT_8(int *y, const uint8_t stripe){ - -#define BUTTERFLY(i,j,n) \ -do { \ - int u= y[stripe*i]; \ - int v= y[stripe*j]; \ - y[stripe*i] = u+v; \ - y[stripe*j] = (u-v) << (n<<1); \ -} while(0) - - BUTTERFLY(0, 4, 0); - BUTTERFLY(1, 5, 1); - BUTTERFLY(2, 6, 2); - BUTTERFLY(3, 7, 3); - - y[stripe * 6] = REDUCE(y[stripe * 6]); - y[stripe * 7] = REDUCE(y[stripe * 7]); - - BUTTERFLY(0, 2, 0); - BUTTERFLY(4, 6, 0); - BUTTERFLY(1, 3, 2); - BUTTERFLY(5, 7, 2); - - y[stripe * 7] = REDUCE(y[stripe * 7]); - - BUTTERFLY(0, 1, 0); - BUTTERFLY(2, 3, 0); - BUTTERFLY(4, 5, 0); - BUTTERFLY(6, 7, 0); - - y[0] = REDUCE(y[0]); - y[stripe] = REDUCE(y[stripe]); - y[stripe << 1] = REDUCE(y[stripe << 1]); - y[stripe * 3] = REDUCE(y[stripe * 3]); - y[stripe << 2] = REDUCE(y[stripe << 2]); - y[stripe * 5] = REDUCE(y[stripe * 5]); - y[stripe * 6] = REDUCE(y[stripe * 6]); - y[stripe * 7] = REDUCE(y[stripe * 7]); - - y[0] = EXTRA_REDUCE_S(y[0]); - y[stripe] = EXTRA_REDUCE_S(y[stripe]); - y[stripe << 1] = EXTRA_REDUCE_S(y[stripe << 1]); - y[stripe * 3] = EXTRA_REDUCE_S(y[stripe * 3]); - y[stripe << 2] = EXTRA_REDUCE_S(y[stripe << 2]); - y[stripe * 5] = EXTRA_REDUCE_S(y[stripe * 5]); - y[stripe * 6] = EXTRA_REDUCE_S(y[stripe * 6]); - y[stripe * 7] = EXTRA_REDUCE_S(y[stripe * 7]); - -#undef BUTTERFLY +__device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } } - -/** -* FFT_16 using w=2 as 16th root of unity -* Unrolled decimation in frequency (DIF) radix-2 NTT. -* Output data is in revbin_permuted order. 
-*/ -__device__ __forceinline__ -static void FFT_16(int *y){ - -#define DO_REDUCE_FULL_S(i) \ - do { \ - y[i] = REDUCE(y[i]); \ - y[i] = EXTRA_REDUCE_S(y[i]); \ - } while(0) - - int u, v; - - const uint8_t thr = threadIdx.x & 7; - - u = y[0]; // 0..7 - v = y[1]; // 8..15 - y[0] = u + v; - y[1] = (u - v) << (thr); - - if ((thr) >= 3) y[1] = REDUCE(y[1]); // 11...15 - - u = __shfl(y[0], (threadIdx.x & 3), 8); // 0,1,2,3 0,1,2,3 - v = __shfl(y[0], 4 + (threadIdx.x & 3), 8); // 4,5,6,7 4,5,6,7 - y[0] = ((thr) < 4) ? (u + v) : ((u - v) << ((threadIdx.x & 3) << 1)); - - u = __shfl(y[1], (threadIdx.x & 3), 8); // 8,9,10,11 8,9,10,11 - v = __shfl(y[1], 4 + (threadIdx.x & 3), 8); // 12,13,14,15 12,13,14,15 - y[1] = ((thr) < 4) ? (u + v) : ((u - v) << ((threadIdx.x & 3) << 1)); - - if ((threadIdx.x & 1) && (thr >= 4)) { - y[0] = REDUCE(y[0]); // 5, 7 - y[1] = REDUCE(y[1]); // 13, 15 +__device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + 
D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], 
C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void 
STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] 
+ w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) 
+ R[2]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ 
__forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} - u = __shfl(y[0], (threadIdx.x & 5), 8); // 0,1,0,1 4,5,4,5 - v = __shfl(y[0], 2 + (threadIdx.x & 5), 8); // 2,3,2,3 6,7,6,7 - y[0] = ((threadIdx.x & 3) < 2) ? (u + v) : ((u - v) << ((threadIdx.x & 1) << 2)); +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw0[8][8] = { +#else +static __constant__ uint32_t d_cw0[8][8]; +static const uint32_t h_cw0[8][8] = { +#endif + 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, + 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, + 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, + 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, + 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, + 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, + 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, + 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 +}; - u = __shfl(y[1], (threadIdx.x & 5), 8); // 8,9,8,9 12,13,12,13 - v = __shfl(y[1], 2 + (threadIdx.x & 5), 8); // 10,11,10,11 14,15,14,15 - y[1] = ((threadIdx.x & 3) < 2) ? 
(u + v) : ((u - v) << ((threadIdx.x & 1) << 2)); +__device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_3(d_cw0[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_4(d_cw0[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_5(d_cw0[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); +} - u = __shfl(y[0], (threadIdx.x & 6), 8); // 0,0,2,2 4,4,6,6 - v = __shfl(y[0], 1 + (threadIdx.x & 6), 8); // 1,1,3,3 5,5,7,7 - y[0] = ((threadIdx.x & 1) < 1) ? (u + v) : (u - v); +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw1[8][8] = { +#else +static __constant__ uint32_t d_cw1[8][8]; +static const uint32_t h_cw1[8][8] = { +#endif + 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, + 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, + 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, + 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, + 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, + 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, + 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, + 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 +}; - u = __shfl(y[1], (threadIdx.x & 6), 8); // 8,8,10,10 12,12,14,14 - v = __shfl(y[1], 1 + (threadIdx.x & 6), 8); // 9,9,11,11 13,13,15,15 - y[1] = ((threadIdx.x & 1) < 1) ? 
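When DEVICE_DIRECT_CONSTANTS is not defined, the d_cw* tables are declared empty in constant memory and the initializers live in host-side h_cw* arrays, so they have to be uploaded once per device before the final-round kernels run. A minimal sketch of that upload, assuming it is called from the existing per-GPU init path (the function name below is illustrative, and d_cw2/d_cw3 are the matching tables declared further down):

// Sketch: one-time upload of the final-round message words to constant memory.
// x11_simd512_upload_cw() is a hypothetical name for wherever the init code does this.
__host__ void x11_simd512_upload_cw()
{
#ifndef DEVICE_DIRECT_CONSTANTS
	cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
#endif
}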
(u + v) : (u - v); +__device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_11(d_cw1[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_12(d_cw1[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_13(d_cw1[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); +} - DO_REDUCE_FULL_S(0); // 0...7 - DO_REDUCE_FULL_S(1); // 8...15 +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw2[8][8] = { +#else +static __constant__ uint32_t d_cw2[8][8]; +static const uint32_t h_cw2[8][8] = { +#endif + 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, + 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, + 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, + 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, + 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, + 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, + 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, + 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE +}; -#undef DO_REDUCE_FULL_S +__device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_19(d_cw2[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_20(d_cw2[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_21(d_cw2[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); } -/***************************************************/ -#if __CUDA_ARCH__ > 500 -__global__ __launch_bounds__(TPB52_1, 9) +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw3[8][8] = { #else -__global__ __launch_bounds__(TPB50_1, 9) +static __constant__ uint32_t d_cw3[8][8]; +static const uint32_t h_cw3[8][8] = { #endif -static void x11_simd512_gpu_expand_64(int *thr_id, uint32_t threads, const uint32_t* __restrict__ g_hash, uint4 *g_temp4) -{ - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) - return; - const uint32_t threadBloc = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; - const uint8_t thr = (threadIdx.x & 7); - /* Message Expansion using Number Theoretical Transform similar to FFT */ - int expanded[32]; + 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, + 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, + 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, + 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, + 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, + 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, + 0x975568AB, 
0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, + 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D +}; - uint4 vec0; - int P, Q, P1, Q1, P2, Q2; +__device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_27(d_cw3[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_28(d_cw3[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_29(d_cw3[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_30(d_cw3[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A); +} - const bool even = (threadIdx.x & 1) == 0; - const bool hi = (thr) >= 4; - const bool lo = (thr)<4; - const bool sel = ((threadIdx.x + 2) & 7) >= 4; // 2,3,4,5 +#if __CUDA_ARCH__ < 350 +#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) +#else +//#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) +#define expanded_vector(x) __ldg(&g_fft4[x]) +#endif - if (threadBloc < threads){ +__device__ __forceinline__ void Round8_0(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 0 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_0(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_1(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_2(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_3(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_4(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_5(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_6(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_7(w, u, r, &A[8], &A[16], &A[24], A); - const uint32_t hashPosition = threadBloc << 4; - const uint32_t *inpHash = &g_hash[hashPosition]; +} +__device__ __forceinline__ void Round8_1(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 16 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + 
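Each STEP8 consumes eight 32-bit message words, which the Round8_* helpers fetch as two consecutive uint4 vectors through the expanded_vector() macro above (a texture fetch before sm_35, __ldg afterwards). The repeated load-and-unpack pairs boil down to the small helper sketched below; it is not part of the patch, just the same pattern factored out:

// Sketch: fetch one 8-word group (two uint4) of the NTT-expanded message.
// expanded_vector() and the g_fft4 layout are taken from the code above.
__device__ __forceinline__
void fetch_w8(uint32_t w[8], int &tmp, const uint4 *g_fft4)
{
	const uint4 hv1 = expanded_vector(tmp++);   // words 0..3
	const uint4 hv2 = expanded_vector(tmp++);   // words 4..7
	w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
	w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
}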
hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_8(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_9(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_10(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_11(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_12(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_13(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_14(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_15(w, u, r, &A[8], &A[16], &A[24], A); - const uint32_t data0 = __ldg(&inpHash[thr]); - const uint32_t data1 = __ldg(&inpHash[thr + 8]); - // Puffer fur expandierte Nachricht - uint4 *temp4 = &g_temp4[hashPosition << 2]; +} +__device__ __forceinline__ void Round8_2(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 32 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_16(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_17(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_18(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_19(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_20(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_21(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = 
expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_22(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_23(w, u, r, &A[8], &A[16], &A[24], A); -#pragma unroll 4 - for (uint32_t i = 0; i < 4; i++) { - expanded[i] = bfe(__byte_perm(__shfl(data0, i << 1, 8), __shfl(data0, (i << 1) + 1, 8), thr), 0, 8); - } -#pragma unroll 4 - for (uint32_t i = 0; i < 4; i++) { - expanded[4 + i] = bfe(__byte_perm(__shfl(data1, i << 1, 8), __shfl(data1, (i << 1) + 1, 8), thr), 0, 8); - } -#pragma unroll 8 - for (uint32_t i = 8; i < 16; i++) { - expanded[i] = 0; - } - /* - * FFT_256 using w=41 as 256th root of unity. Decimation in frequency (DIF) NTT. Output data is in revbin_permuted order. In place. - */ -#pragma unroll 8 - for (uint32_t i = 0; i<8; i++) - expanded[16 + i] = REDUCE(expanded[i] * c_FFT256_2_128_Twiddle[8 * i + (thr)]); -#pragma unroll 8 - for (uint32_t i = 24; i < 32; i++) { - expanded[i] = 0; - } - /* handle X^255 with an additional butterfly */ - if (thr == 7){ - expanded[15] = 1; - expanded[31] = REDUCE((-1) * c_FFT256_2_128_Twiddle[127]); - } +} +__device__ __forceinline__ void Round8_3(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 48 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_24(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_25(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_26(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_27(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_28(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_29(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_30(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_31(w, u, r, &A[8], &A[16], &A[24], A); - // FFT_128_full(expanded); - FFT_8(expanded, 2); // eight parallel FFT8's - FFT_8(&expanded[16], 2); // eight parallel FFT8's - FFT_8(&expanded[1], 2); // eight parallel FFT8's - FFT_8(&expanded[17], 2); // eight parallel FFT8's -#pragma unroll 16 - for (uint32_t i = 0; 
i<16; i++){ - expanded[i] = REDUCE(expanded[i] * c_FFT128_8_16_Twiddle[i * 8 + (thr)]); - expanded[i + 16] = REDUCE(expanded[i + 16] * c_FFT128_8_16_Twiddle[i * 8 + (thr)]); - } +} +__device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, const uint32_t *M, uint4 *g_fft4) { + int i; + const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) #pragma unroll 8 - for (uint32_t i = 0; i<8; i++){ - FFT_16(expanded + (i << 1)); // eight sequential FFT16's, each one executed in parallel by 8 threads - FFT_16(expanded + 16 + (i << 1)); // eight sequential FFT16's, each one executed in parallel by 8 threads - } - - // store w matrices in global memory - P1 = expanded[0]; P2 = __shfl(expanded[2], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); - P1 = expanded[8]; P2 = __shfl(expanded[10], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); - P1 = expanded[4]; P2 = __shfl(expanded[6], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); - P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); - temp4[thr] = vec0; - - P1 = expanded[1]; P2 = __shfl(expanded[3], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); - P1 = expanded[9]; P2 = __shfl(expanded[11], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); - P1 = expanded[5]; P2 = __shfl(expanded[7], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); - P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; - Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); - temp4[8 + (thr)] = vec0; - - P1 = hi ? expanded[1] : expanded[0]; P2 = __shfl(hi ? expanded[3] : expanded[2], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = hi ? expanded[17] : expanded[16]; Q2 = __shfl(hi ? expanded[19] : expanded[18], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); - P1 = hi ? expanded[9] : expanded[8]; P2 = __shfl(hi ? expanded[11] : expanded[10], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = hi ? expanded[25] : expanded[24]; Q2 = __shfl(hi ? expanded[27] : expanded[26], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); - P1 = hi ? 
expanded[5] : expanded[4]; P2 = __shfl(hi ? expanded[7] : expanded[6], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = hi ? expanded[21] : expanded[20]; Q2 = __shfl(hi ? expanded[23] : expanded[22], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); - P1 = hi ? expanded[13] : expanded[12]; P2 = __shfl(hi ? expanded[15] : expanded[14], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = hi ? expanded[29] : expanded[28]; Q2 = __shfl(hi ? expanded[31] : expanded[30], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); - temp4[16 + (thr)] = vec0; - - P1 = lo ? expanded[1] : expanded[0]; P2 = __shfl(lo ? expanded[3] : expanded[2], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = lo ? expanded[17] : expanded[16]; Q2 = __shfl(lo ? expanded[19] : expanded[18], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); - P1 = lo ? expanded[9] : expanded[8]; P2 = __shfl(lo ? expanded[11] : expanded[10], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = lo ? expanded[25] : expanded[24]; Q2 = __shfl(lo ? expanded[27] : expanded[26], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); - P1 = lo ? expanded[5] : expanded[4]; P2 = __shfl(lo ? expanded[7] : expanded[6], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = lo ? expanded[21] : expanded[20]; Q2 = __shfl(lo ? expanded[23] : expanded[22], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); - P1 = lo ? expanded[13] : expanded[12]; P2 = __shfl(lo ? expanded[15] : expanded[14], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; - Q1 = lo ? expanded[29] : expanded[28]; Q2 = __shfl(lo ? expanded[31] : expanded[30], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); - temp4[24 + (thr)] = vec0; - - P1 = sel ? expanded[0] : expanded[1]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[2] : expanded[3]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); - P1 = sel ? expanded[8] : expanded[9]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[10] : expanded[11]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); - P1 = sel ? expanded[4] : expanded[5]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[6] : expanded[7]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); - P1 = sel ? expanded[12] : expanded[13]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[14] : expanded[15]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); - - temp4[32 + thr] = vec0; - - P1 = sel ? expanded[1] : expanded[0]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[3] : expanded[2]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? 
Q1 : Q2; - vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); - P1 = sel ? expanded[9] : expanded[8]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[11] : expanded[10]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); - P1 = sel ? expanded[5] : expanded[4]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[7] : expanded[6]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); - P1 = sel ? expanded[13] : expanded[12]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - Q2 = sel ? expanded[15] : expanded[14]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); - - temp4[40 + thr] = vec0; + for(i=0; i<8; i++) { + A[i] ^= M[i]; + (&A[8])[i] ^= M[8+i]; + } + Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4); + Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4); +} - uint32_t t; - t = __shfl(expanded[17], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[16]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[19], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[18]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); - t = __shfl(expanded[25], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[24]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[27], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[26]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); - t = __shfl(expanded[21], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[20]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[23], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[22]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); - t = __shfl(expanded[29], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[28]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[31], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[30]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? 
Q1 : Q2; - vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); +__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) { + uint32_t A[32]; + int i; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = c_IV_512[i]; + uint32_t buffer[16]; +#pragma unroll 16 + for (i=0; i < 16; i++) buffer[i] = hashval[i]; + SIMD_Compress1(A, texture_id, buffer, g_fft4); + uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; +} - temp4[48 + thr] = vec0; +__device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, uint4 *g_fft4) { + uint32_t IV[4][8]; + int i; + const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) +#pragma unroll 8 + for(i=0; i<8; i++) { + IV[0][i] = c_IV_512[i]; + IV[1][i] = c_IV_512[8+i]; + IV[2][i] = c_IV_512[16+i]; + IV[3][i] = c_IV_512[24+i]; + } + Round8_2(A, thr_offset, 29, 9, 15, 5, g_fft4); + Round8_3(A, thr_offset, 4, 13, 10, 25, g_fft4); + STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); + STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); + STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); + STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); +} - t = __shfl(expanded[16], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[17] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[18], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[19] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); - t = __shfl(expanded[24], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[25] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[26], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[27] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); - t = __shfl(expanded[20], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[21] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[22], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[23] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? Q1 : Q2; - vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); - t = __shfl(expanded[28], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[29] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); - t = __shfl(expanded[30], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[31] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); - P = even ? P1 : P2; Q = even ? 
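Compression1 and SIMD_Compress2 park the 32-word SIMD state in global memory between launches using a block-strided layout: word i of every thread in a block is stored contiguously, so the per-word loads and stores coalesce. The indexing reduces to the sketch below, with names mirroring the code above:

// Sketch: block-strided addressing for the spilled 32-word state.
// Word i of a thread lives at base[threadIdx.x + blockDim.x * i],
// so a warp touching word i reads or writes one contiguous segment.
__device__ __forceinline__
uint32_t* simd_state_base(uint32_t *g_state)
{
	return &g_state[blockIdx.x * (blockDim.x * 32)];   // 32 words per thread, grouped per block
}

__device__ __forceinline__
void simd_state_store(uint32_t *g_state, const uint32_t A[32])
{
	uint32_t *state = simd_state_base(g_state);
#pragma unroll 32
	for (int i = 0; i < 32; i++)
		state[threadIdx.x + blockDim.x * i] = A[i];
}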
Q1 : Q2; - vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); +__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, uint32_t *g_state) { + uint32_t A[32]; + int i; + uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; + SIMD_Compress2(A, texture_id, g_fft4); +#pragma unroll 32 + for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; +} - temp4[56 + thr] = vec0; +__device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t *M) { + uint32_t IV[4][8]; + int i; +#pragma unroll 8 + for(i=0; i<8; i++) { + IV[0][i] = A[i]; + IV[1][i] = (&A[8])[i]; + IV[2][i] = (&A[16])[i]; + IV[3][i] = (&A[24])[i]; + } +#pragma unroll 8 + for(i=0; i<8; i++) { + A[i] ^= M[i]; + (&A[8])[i] ^= M[8+i]; } + Round8_0_final(A, 3, 23, 17, 27); + Round8_1_final(A, 28, 19, 22, 7); + Round8_2_final(A, 29, 9, 15, 5); + Round8_3_final(A, 4, 13, 10, 25); + STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); + STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); + STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); + STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); +} + +__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) { + uint32_t A[32]; + int i; + uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; + uint32_t buffer[16]; + buffer[0] = 512; +#pragma unroll 15 + for (i=1; i < 16; i++) buffer[i] = 0; + SIMD_Compress_Final(A, buffer); +#pragma unroll 16 + for (i=0; i < 16; i++) + hashval[i] = A[i]; } diff --git a/x11/cuda_x11_simd512_sm2.cuh b/x11/cuda_x11_simd512_sm2.cuh index 7abbac163e..34041e4bc1 100644 --- a/x11/cuda_x11_simd512_sm2.cuh +++ b/x11/cuda_x11_simd512_sm2.cuh @@ -532,7 +532,7 @@ void SIMDHash(const uint32_t *data, uint32_t *hashval) /***************************************************/ __global__ -void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x11_simd512_gpu_hash_64_sm2(int *thr_id, const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -547,7 +547,7 @@ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNou } #else -__global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} +__global__ void x11_simd512_gpu_hash_64_sm2(int *thr_id, const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} #endif /* __CUDA_ARCH__ < 300 */ __host__ @@ -561,7 +561,7 @@ static void x11_simd512_cpu_init_sm2(int thr_id) } __host__ -static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +static void x11_simd512_cpu_hash_64_sm2(int *thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const int threadsperblock = 256; @@ -570,6 +570,6 @@ static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t s size_t shared_size = 0; - x11_simd512_gpu_hash_64_sm2<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + x11_simd512_gpu_hash_64_sm2 << > 
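Final() runs one last compression over a block that carries only the message length: the chained x11 input is 64 bytes, i.e. 512 bits, so buffer[0] = 512 and the other fifteen words are zero, which matches SIMD-512's convention of folding the bit length into the final block. As a tiny sketch:

// Sketch: the length-only block consumed by SIMD_Compress_Final via Final() above.
__device__ __forceinline__
void simd_length_block(uint32_t buffer[16])
{
	buffer[0] = 512;        // message length in bits (64-byte chained hash)
#pragma unroll 15
	for (int i = 1; i < 16; i++)
		buffer[i] = 0;
}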
>(thr_id, threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + MyStreamSynchronize(NULL, order, ((uintptr_t)thr_id) & 15); } diff --git a/x13/cuda_x13_fugue512_alexis.cu b/x13/cuda_x13_fugue512_alexis.cu index e35cd48ad0..e5fae9ec48 100644 --- a/x13/cuda_x13_fugue512_alexis.cu +++ b/x13/cuda_x13_fugue512_alexis.cu @@ -245,11 +245,11 @@ static void SMIX_LDG(const uint32_t shared[4][256], uint32_t &x0,uint32_t &x1,ui __global__ __launch_bounds__(256,3) void x13_fugue512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; __shared__ uint32_t shared[4][256]; - // if(threadIdx.x<256){ +// if(threadIdx.x<256){ const uint32_t tmp = mixtab0[threadIdx.x]; shared[0][threadIdx.x] = tmp; shared[1][threadIdx.x] = ROR8(tmp); @@ -405,18 +405,18 @@ void x13_fugue512_gpu_hash_64_final_alexis(uint32_t threads,const uint32_t* __re resNonce[1] = tmp; } } -} +} __host__ -void x13_fugue512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_hash){ - +void x13_fugue512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_hash) +{ const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x13_fugue512_gpu_hash_64_alexis << > >(thr_id, threads, (uint64_t*)d_hash); + x13_fugue512_gpu_hash_64_alexis<<>>(thr_id, threads, (uint64_t*)d_hash); } __host__ diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu index ae796d2ce5..30216e40e9 100644 --- a/x13/cuda_x13_hamsi512.cu +++ b/x13/cuda_x13_hamsi512.cu @@ -318,7 +318,7 @@ static const uint32_t T512[64][16] = { }; __global__ -void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x13_hamsi512_gpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -416,14 +416,14 @@ void x13_hamsi512_cpu_init(int thr_id, uint32_t threads) } __host__ -void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x13_hamsi512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x13_hamsi512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x13_hamsi512_gpu_hash_64<<>>(thr_id, threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } @@ -436,11 +436,8 @@ void x16_hamsi512_setBlock_80(void *pdata) } __global__ -void x16_hamsi512_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -551,5 +548,5 @@ void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_ dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - x16_hamsi512_gpu_hash_80 << > > (thr_id, 
threads, startNounce, (uint64_t*)d_hash); + x16_hamsi512_gpu_hash_80 <<>> (threads, startNounce, (uint64_t*)d_hash); } diff --git a/x13/cuda_x13_hamsi512_alexis.cu b/x13/cuda_x13_hamsi512_alexis.cu index f6d0f249df..e0641e22cc 100644 --- a/x13/cuda_x13_hamsi512_alexis.cu +++ b/x13/cuda_x13_hamsi512_alexis.cu @@ -177,7 +177,7 @@ static __constant__ const uint32_t d_T512[1024] = { __global__ __launch_bounds__(384,2) void x13_hamsi512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -288,7 +288,7 @@ void x13_hamsi512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *g_ } } -__host__ +__host__ void x13_hamsi512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_hash) { const uint32_t threadsperblock = 384; @@ -296,6 +296,6 @@ void x13_hamsi512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *d_ dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x13_hamsi512_gpu_hash_64_alexis << > >(thr_id, threads, d_hash); + x13_hamsi512_gpu_hash_64_alexis<<>>(thr_id, threads, d_hash); } diff --git a/x15/cuda_x14_shabal512.cu b/x15/cuda_x14_shabal512.cu index 43c5ebf1ca..fec59deac7 100644 --- a/x15/cuda_x14_shabal512.cu +++ b/x15/cuda_x14_shabal512.cu @@ -361,7 +361,7 @@ static const uint32_t d_C512[] = { /***************************************************/ // GPU Hash Function -__global__ void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ void x14_shabal512_gpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { __syncthreads(); @@ -458,7 +458,7 @@ __host__ void x14_shabal512_cpu_init(int thr_id, uint32_t threads) } // #include -__host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x14_shabal512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 256; @@ -470,6 +470,6 @@ __host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - x14_shabal512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x14_shabal512_gpu_hash_64<<>>(thr_id, threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x15/cuda_x14_shabal512_alexis.cu b/x15/cuda_x14_shabal512_alexis.cu index 84d36234be..69ef2a5dff 100644 --- a/x15/cuda_x14_shabal512_alexis.cu +++ b/x15/cuda_x14_shabal512_alexis.cu @@ -105,7 +105,7 @@ void ROTATE(uint32_t* A){ __global__ __launch_bounds__(384,3) void x14_shabal512_gpu_hash_64_alexis(int *thr_id, uint32_t threads, uint32_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -177,7 +177,7 @@ __host__ void x14_shabal512_cpu_hash_64_alexis(int *thr_id, uint32_t threads, ui dim3 grid((threads + 
threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x14_shabal512_gpu_hash_64_alexis << > >(thr_id, threads, d_hash); + x14_shabal512_gpu_hash_64_alexis<<>>(thr_id, threads, d_hash); } __global__ __launch_bounds__(512,2) diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu index 827f8ecbf7..79d67f0f37 100644 --- a/x15/cuda_x15_whirlpool.cu +++ b/x15/cuda_x15_whirlpool.cu @@ -41,8 +41,9 @@ extern "C" { #include } -#include "cuda_helper_alexis.h" -#include "cuda_vectors_alexis.h" +#include +#include +#include #define xor3x(a,b,c) (a^b^c) @@ -620,7 +621,7 @@ __global__ __launch_bounds__(TPB64,2) void x15_whirlpool_gpu_hash_64(int *thr_id, uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; __shared__ uint2 sharedMemory[7][256]; @@ -735,7 +736,7 @@ static void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_ x15_whirlpool_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash); } -*/ +*/ __host__ void x15_whirlpool_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) { diff --git a/x15/cuda_x15_whirlpool_sm3.cu b/x15/cuda_x15_whirlpool_sm3.cu index 1251f5cdc8..f99c7afca1 100644 --- a/x15/cuda_x15_whirlpool_sm3.cu +++ b/x15/cuda_x15_whirlpool_sm3.cu @@ -2000,11 +2000,8 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int __global__ -void oldwhirlpool_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab) +void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - __shared__ uint64_t sharedMemory[2048]; if (threadIdx.x < 256) { @@ -2100,7 +2097,7 @@ void oldwhirlpool_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t } __global__ -void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x15_whirlpool_gpu_hash_64(int *thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { __shared__ uint64_t sharedMemory[2048]; @@ -2309,12 +2306,12 @@ void whirlpool512_free_sm3(int thr_id) } __host__ -void whirlpool512_hash_64_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void whirlpool512_hash_64_sm3(int *thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 block(threadsperblock); - x15_whirlpool_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x15_whirlpool_gpu_hash_64 <<>> (thr_id, threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } @@ -2347,7 +2344,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce, if (threads < 256) applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!"); - oldwhirlpool_gpu_hash_80 << > >(thr_id, threads, startNonce, d_outputHash, 1); + oldwhirlpool_gpu_hash_80<<>>(threads, startNonce, d_outputHash, 1); } extern void whirl_midstate(void *state, const void *input); @@ -2420,5 +2417,5 @@ void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t if (threads < 256) applog(LOG_WARNING, "whirlpool requires a minimum of 
256 threads to fetch constant tables!"); - oldwhirlpool_gpu_hash_80 << > > (thr_id, threads, startNonce, d_outputHash, 1); + oldwhirlpool_gpu_hash_80 <<>> (threads, startNonce, d_outputHash, 1); } diff --git a/x16/x16s.cu b/x16/x16s.cu index 5c555ad0c9..36aeacbc21 100644 --- a/x16/x16s.cu +++ b/x16/x16s.cu @@ -31,10 +31,7 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" -//#include "cuda_x16.h" - -#include "../x16r/cuda_x16r.h" // todo, re-unify these like core ccminer is. - +#include "cuda_x16.h" static uint32_t *d_hash[MAX_GPUS]; @@ -237,15 +234,9 @@ static bool use_compat_kernels[MAX_GPUS] = { 0 }; //#define _DEBUG #define _DEBUG_PREFIX "x16s-" #include "cuda_debug.cuh" -/* -static int algo80_tests[HASH_FUNC_COUNT] = { 0 }; -static int algo64_tests[HASH_FUNC_COUNT] = { 0 }; -static int algo80_fails[HASH_FUNC_COUNT] = { 0 }; -*/ + extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { - return -1; -#if 0 uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -270,30 +261,6 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput); - gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); - if (throughput2intensity(throughput) > 21) gpulog(LOG_INFO, thr_id, "SIMD throws error on malloc call, TBD if there is a fix"); - - quark_groestl512_cpu_init(thr_id, throughput); - // quark_blake512_cpu_init(thr_id, throughput); - // quark_bmw512_cpu_init(thr_id, throughput); - // quark_skein512_cpu_init(thr_id, throughput); - quark_jh512_cpu_init(thr_id, throughput); - quark_keccak512_cpu_init(thr_id, throughput); - // x11_shavite512_cpu_init(thr_id, throughput); - if (x11_simd512_cpu_init(thr_id, throughput)) - { - applog(LOG_WARNING, "SIMD was unable to initialize :( exiting..."); - exit(-1); - }// 64 - x16_echo512_cuda_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); - x16_fugue512_cpu_init(thr_id, throughput); - // x14_shabal512_cpu_init(thr_id, throughput); - x15_whirlpool_cpu_init(thr_id, throughput, 0); - x16_whirlpool512_init(thr_id, throughput); - x17_sha512_cpu_init(thr_id, throughput); - /* quark_blake512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); @@ -312,7 +279,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, x15_whirlpool_cpu_init(thr_id, throughput, 0); x16_whirlpool512_init(thr_id, throughput); x17_sha512_cpu_init(thr_id, throughput); - */ + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); cuda_check_cpu_init(thr_id, throughput); @@ -346,70 +313,57 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; switch (algo80) { - case BLAKE: - //! low impact, can do a lot to optimize quark_blake512 - quark_blake512_cpu_setBlock_80(thr_id, endiandata); - break; - case BMW: - //! low impact, painfully optimize quark_bmw512 - quark_bmw512_cpu_setBlock_80(endiandata); - break; - case GROESTL: - //! second most used algo historically - groestl512_setBlock_80(thr_id, endiandata); - break; - case JH: - //! average use, optimization tbd - jh512_setBlock_80(thr_id, endiandata); - break; - case KECCAK: - //! 
low impact - keccak512_setBlock_80(thr_id, endiandata); - break; - case SKEIN: - //! very low impact - skein512_cpu_setBlock_80((void*)endiandata); - break; - case LUFFA: - //! moderate impact (more than shavite) - qubit_luffa512_cpu_setBlock_80_alexis((void*)endiandata); - break; - case CUBEHASH: - //! moderate impact (more than shavite) - cubehash512_setBlock_80(thr_id, endiandata); - break; - case SHAVITE: - //! has been optimized fairly well - x11_shavite512_setBlock_80((void*)endiandata); - break; - case SIMD: - //! high impact optimization. -i > 21 causes error. - x16_simd512_setBlock_80((void*)endiandata); - break; - case ECHO: - //! high impact needs more optimizations - x16_echo512_setBlock_80((void*)endiandata); - break; - case HAMSI: - //! ***highest impact*** - x16_hamsi512_setBlock_80((void*)endiandata); - break; - case FUGUE: - //! very high impact! - x16_fugue512_setBlock_80((void*)pdata); - break; - case SHABAL: - //! very low impact. - x16_shabal512_setBlock_80((void*)endiandata); - break; - case WHIRLPOOL: - //! moderate impact (more than shavite by a bit) - x16_whirlpool512_setBlock_80((void*)endiandata); - break; - case SHA512: - //! second lowest impact. - x16_sha512_setBlock_80(endiandata); - break; + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + x11_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + x16_hamsi512_setBlock_80((void*)endiandata); + break; + case FUGUE: + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + x16_sha512_setBlock_80(endiandata); + break; + default: { + return -1; + } } int warn = 0; @@ -419,13 +373,13 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, // Hash with CUDA - switch (algo80) { + switch (algo80) { case BLAKE: quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; TRACE("blake80:"); break; case BMW: - quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("bmw80 :"); break; case GROESTL: @@ -441,11 +395,11 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, TRACE("kecck80:"); break; case SKEIN: - skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; TRACE("skein80:"); break; case LUFFA: - qubit_luffa512_cpu_hash_80_alexis(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("luffa80:"); break; case CUBEHASH: @@ -453,7 +407,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t 
max_nonce, TRACE("cube 80:"); break; case SHAVITE: - x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("shavite:"); break; case SIMD: @@ -493,67 +447,70 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, switch (algo64) { case BLAKE: - quark_blake512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("blake :"); break; case BMW: - quark_bmw512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("bmw :"); break; case GROESTL: - quark_groestl512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("groestl:"); break; case JH: - quark_jh512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("jh512 :"); break; case KECCAK: - quark_keccak512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("keccak :"); break; case SKEIN: - quark_skein512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("skein :"); break; case LUFFA: - x11_luffa512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("luffa :"); break; case CUBEHASH: - x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("cube :"); break; case SHAVITE: - x11_shavite512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("shavite:"); break; case SIMD: - x11_simd512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("simd :"); break; case ECHO: - x11_echo512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; TRACE("echo :"); break; case HAMSI: - x13_hamsi512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("hamsi :"); break; case FUGUE: - x13_fugue512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("fugue :"); break; case SHABAL: - x14_shabal512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("shabal :"); break; case WHIRLPOOL: - x15_whirlpool_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("shabal :"); 
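For reference, the order decoding that drives both the x16s and the x16r dispatch above boils down to walking the top 16 nibbles of the 8 bytes that follow the block version word. A minimal host-side sketch under that assumption follows (decode_x16_order is an illustrative name, not a function in this tree; prevblock stands for &pdata[1] as used in the diff):

#include <stdint.h>

// Illustrative helper (not in the patch): recover the 16-round order the same
// way the diff does, from the 8 bytes that follow the block version word.
static void decode_x16_order(const uint32_t *prevblock /* &pdata[1] */, uint8_t order[16])
{
	const uint64_t nibbles = *(const uint64_t*)prevblock; // little-endian load, as in x16r.cu
	for (int i = 0; i < 16; i++) {
		// '-' binds tighter than '>>' in C, so the patch's "nibbles >> 60 - (i * 4)"
		// already parses as nibbles >> (60 - 4*i): top nibble first, low nibble last
		order[i] = (uint8_t)((nibbles >> (60 - (i * 4))) & 0x0f);
	}
}

Each entry maps straight onto the BLAKE..SHA512 enum, so the switch statements are effectively a table lookup over these 16 values; the hex-character form used with the printable hashOrder string (elem >= 'A' ? elem - 'A' + 10 : elem - '0') selects the same digits.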
break; case SHA512: - x17_sha512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; TRACE("sha512 :"); break; } @@ -589,26 +546,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, } else { pdata[19] = work->nonces[0] + 1; // cursor } -// gpulog(LOG_INFO, thr_id, "hash found with %s 80 (%s)!", algo_strings[algo80], hashOrder); -#if 0 - gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]); - - algo80_tests[algo80] += work->valid_nonces; - char oks64[128] = { 0 }; - char oks80[128] = { 0 }; - char fails[128] = { 0 }; - for (int a = 0; a < HASH_FUNC_COUNT; a++) { - const char elem = hashOrder[a]; - const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; - if (a > 0) algo64_tests[algo64] += work->valid_nonces; - sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99); - sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99); - sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99); - } - applog(LOG_INFO, "K64: %s", oks64); - applog(LOG_INFO, "K80: %s", oks80); - applog(LOG_ERR, "F80: %s", fails); -#endif + //gpulog(LOG_INFO, thr_id, "hash found with %s 80 (%s)!", algo_strings[algo80], hashOrder); return work->valid_nonces; } else if (vhash[7] > Htarg) { @@ -637,7 +575,6 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, *hashes_done = pdata[19] - first_nonce; return 0; -#endif } // cleanup diff --git a/x16r/cuda_x16_echo512.cu b/x16r/cuda_x16_echo512.cu index 373978a213..bd1139d8df 100644 --- a/x16r/cuda_x16_echo512.cu +++ b/x16r/cuda_x16_echo512.cu @@ -297,11 +297,8 @@ void x16_echo512_setBlock_80(void *endiandata) } __global__ __launch_bounds__(128, 7) /* will force 72 registers */ -void x16_echo512_gpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *g_hash) +void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - __shared__ uint32_t sharedMemory[1024]; // echo_gpu_init(sharedMemory); @@ -331,5 +328,5 @@ void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x16_echo512_gpu_hash_80 << > >(thr_id, threads, startNonce, (uint64_t*)d_hash); + x16_echo512_gpu_hash_80<<>>(threads, startNonce, (uint64_t*)d_hash); } diff --git a/x16r/cuda_x16_fugue512.cu b/x16r/cuda_x16_fugue512.cu index 7c8893f86b..5967087f1e 100644 --- a/x16r/cuda_x16_fugue512.cu +++ b/x16r/cuda_x16_fugue512.cu @@ -306,11 +306,8 @@ void x16_fugue512_setBlock_80(void *pdata) __global__ __launch_bounds__(TPB) -void x16_fugue512_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - __shared__ uint32_t mixtabs[1024]; // load shared mem (with 256 threads) @@ -468,5 +465,5 @@ void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_ dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x16_fugue512_gpu_hash_80 << > > (thr_id, threads, startNonce, (uint64_t*)d_hash); + x16_fugue512_gpu_hash_80 <<>> (threads, startNonce, 
(uint64_t*)d_hash); } diff --git a/x16r/cuda_x16_shabal512.cu b/x16r/cuda_x16_shabal512.cu index c037f205f5..c1d3e66ee0 100644 --- a/x16r/cuda_x16_shabal512.cu +++ b/x16r/cuda_x16_shabal512.cu @@ -241,11 +241,8 @@ void x16_shabal512_setBlock_80(void *pdata) #define TPB_SHABAL 256 __global__ __launch_bounds__(TPB_SHABAL, 2) -void x16_shabal512_gpu_hash_80(int thr_id, uint32_t threads, const uint32_t startNonce, uint32_t *g_hash) +void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t B[] = { @@ -351,5 +348,5 @@ void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32 dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - x16_shabal512_gpu_hash_80 << > >(thr_id, threads, startNonce, d_hash); + x16_shabal512_gpu_hash_80 <<>>(threads, startNonce, d_hash); } diff --git a/x16r/cuda_x16_simd512_80.cu b/x16r/cuda_x16_simd512_80.cu index 76c810c289..c9c1544062 100644 --- a/x16r/cuda_x16_simd512_80.cu +++ b/x16r/cuda_x16_simd512_80.cu @@ -1680,11 +1680,8 @@ void x16_simd512_setBlock_80(void *pdata) #define TPB_SIMD 128 __global__ __launch_bounds__(TPB_SIMD,1) -static void x16_simd512_gpu_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *g_outputhash) +static void x16_simd512_gpu_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_outputhash) { -// if (*(int*)((uint64_t)thr_id & ~15) & (1 << ((uint64_t)thr_id & 15))) -// return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -1837,5 +1834,5 @@ void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t const uint32_t tpb = 128; const dim3 grid((threads + tpb - 1) / tpb); const dim3 block(tpb); - x16_simd512_gpu_80 << > > (thr_id, threads, startNonce, (uint64_t*)d_hash); + x16_simd512_gpu_80 <<>> (threads, startNonce, (uint64_t*) d_hash); } diff --git a/x16r/x16r.cu b/x16r/x16r.cu index 0eded40125..55260dc441 100644 --- a/x16r/x16r.cu +++ b/x16r/x16r.cu @@ -1,8 +1,8 @@ /** - * X16R algorithm (X16 with Randomized chain order) - * - * tpruvot 2018 - GPL code - */ +* X16R algorithm (X16 with Randomized chain order) +* +* tpruvot 2018 - GPL code +*/ #include #include @@ -27,7 +27,7 @@ extern "C" { #include "sph/sph_shabal.h" #include "sph/sph_whirlpool.h" #include "sph/sph_sha2.h" -//extern struct work_restart *work_restart; + //extern struct work_restart *work_restart; } #include "miner.h" @@ -35,7 +35,7 @@ extern "C" { #include "cuda_x16r.h" #define GPU_HASH_CHECK_LOG 0 -static uint32_t *d_hash[MAX_GPUS+1]; +static uint32_t *d_hash[MAX_GPUS + 1]; enum Algo { BLAKE = 0, @@ -98,7 +98,7 @@ static void(*pAlgo64[16])(int*, uint32_t, uint32_t*) = x13_fugue512_cpu_hash_64_alexis, x14_shabal512_cpu_hash_64_alexis, x15_whirlpool_cpu_hash_64, - x17_sha512_cpu_hash_64 + x17_sha512_cpu_hash_64 }; static void(*pAlgo80[16])(int, uint32_t, uint32_t, uint32_t*) = { @@ -154,32 +154,32 @@ static void run_x16r_rounds(const uint32_t* prevblock, int thr_id, uint32_t thre pAlgo64[(*(uint64_t*)prevblock >> 60 - (7 * 4)) & 0x0f](thr_id, threads, d_hash, 7); pAlgo64[(*(uint64_t*)prevblock >> 60 - (8 * 4)) & 0x0f](thr_id, threads, d_hash, 8); pAlgo64[(*(uint64_t*)prevblock >> 60 - (9 * 4)) & 0x0f](thr_id, threads, d_hash, 9); - pAlgo64[(*(uint64_t*)prevblock >> 60 - (10* 4)) & 0x0f](thr_id, threads, d_hash,10); - 
pAlgo64[(*(uint64_t*)prevblock >> 60 - (11* 4)) & 0x0f](thr_id, threads, d_hash,11); - pAlgo64[(*(uint64_t*)prevblock >> 60 - (12* 4)) & 0x0f](thr_id, threads, d_hash,12); - pAlgo64[(*(uint64_t*)prevblock >> 60 - (13* 4)) & 0x0f](thr_id, threads, d_hash,13); - pAlgo64[(*(uint64_t*)prevblock >> 60 - (14* 4)) & 0x0f](thr_id, threads, d_hash,14); - pAlgo64[(*(uint64_t*)prevblock >> 60 - (15* 4)) & 0x0f](thr_id, threads, d_hash,15); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (10 * 4)) & 0x0f](thr_id, threads, d_hash, 10); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (11 * 4)) & 0x0f](thr_id, threads, d_hash, 11); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (12 * 4)) & 0x0f](thr_id, threads, d_hash, 12); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (13 * 4)) & 0x0f](thr_id, threads, d_hash, 13); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (14 * 4)) & 0x0f](thr_id, threads, d_hash, 14); + pAlgo64[(*(uint64_t*)prevblock >> 60 - (15 * 4)) & 0x0f](thr_id, threads, d_hash, 15); } #endif static void getAlgoString(const uint32_t* prevblock, char *output) { for (int i = 0; i < 16; i++) { - *output++ = (*(uint64_t*)prevblock >> 60 - (i * 4)) & 0x0f; + *output++ = (*(uint64_t*)prevblock >> 60 - (i * 4)) & 0x0f; } /* char *sptr = output; uint8_t* data = (uint8_t*)prevblock; //if data == 0x123456789abcdef how does it order? for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) { - uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed - uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4; - if (algoDigit >= 10) - sprintf(sptr, "%c", 'A' + (algoDigit - 10)); - else - sprintf(sptr, "%u", (uint32_t) algoDigit); - sptr++; + uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed + uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4; + if (algoDigit >= 10) + sprintf(sptr, "%c", 'A' + (algoDigit - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoDigit); + sptr++; } *sptr = '\0'; */ @@ -207,18 +207,18 @@ extern "C" void x16r_hash(void *output, const void *input) sph_whirlpool_context ctx_whirlpool; sph_sha512_context ctx_sha512; - void *in = (void*) input; + void *in = (void*)input; int size = 80; - uint32_t *in32 = (uint32_t*) input; -// getAlgoString(&in32[1], hashOrder); + uint32_t *in32 = (uint32_t*)input; + // getAlgoString(&in32[1], hashOrder); uint64_t prevblock = *(uint64_t*)&in32[1]; for (int i = 0; i < 16; i++) { -// const char elem = hashOrder[i]; -// const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; -// uint8_t algo = hashOrder[i]; + // const char elem = hashOrder[i]; + // const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + // uint8_t algo = hashOrder[i]; switch ((prevblock >> 60 - (i << 2)) & 0x0f) { case BLAKE: sph_blake512_init(&ctx_blake); @@ -297,14 +297,14 @@ extern "C" void x16r_hash(void *output, const void *input) break; case SHA512: sph_sha512_init(&ctx_sha512); - sph_sha512(&ctx_sha512,(const void*) in, size); + sph_sha512(&ctx_sha512, (const void*)in, size); sph_sha512_close(&ctx_sha512, (void*)output); break; } - in = (void*) output; + in = (void*)output; size = 64; } -// memcpy(output, hash, 32); + // memcpy(output, hash, 32); } void whirlpool_midstate(void *state, const void *input) @@ -340,11 +340,11 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; const int dev_id = device_map[thr_id]; -// int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
20 : 19; -// if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20; + // int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19; + // if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20; uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); -// if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + // if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (init[thr_id]){ throughput = min(throughput, max_nonce - first_nonce); if (throughput == max_nonce - first_nonce) @@ -362,11 +362,11 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); if (throughput2intensity(throughput) > 21) gpulog(LOG_INFO, thr_id, "SIMD throws error on malloc call, TBD if there is a fix"); - +/* quark_groestl512_cpu_init(thr_id, throughput); -// quark_blake512_cpu_init(thr_id, throughput); -// quark_bmw512_cpu_init(thr_id, throughput); -// quark_skein512_cpu_init(thr_id, throughput); + // quark_blake512_cpu_init(thr_id, throughput); + // quark_bmw512_cpu_init(thr_id, throughput); + // quark_skein512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); @@ -384,9 +384,32 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, x15_whirlpool_cpu_init(thr_id, throughput, 0); x16_whirlpool512_init(thr_id, throughput); x17_sha512_cpu_init(thr_id, throughput); +*/ + quark_groestl512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput)) + { + applog(LOG_WARNING, "SIMD was unable to initialize :( exiting..."); + exit(-1); + }// 64 + x16_echo512_cuda_init(thr_id, throughput); + x11_echo512_cuda_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x16_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x16_whirlpool512_init(thr_id, throughput); + x17_sha512_cpu_init(thr_id, throughput); + + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput), 0); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput + 0x10000000), 0); - cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; @@ -402,13 +425,16 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, //testing 0xb, 0xc //6FB7C831F4ED0A52 -// ((uint32_t*)ptarget)[7] = 0x5ac6acf2; + // ((uint32_t*)ptarget)[7] = 0x5ac6acf2; ((uint32_t*)ptarget)[7] = 0x003f; +// ((uint32_t*)ptarget)[7] = 0x123f; +// ((uint32_t*)pdata)[1] = 0xEFCDAB89; +// ((uint32_t*)pdata)[2] = 0x67452301; ((uint32_t*)pdata)[1] = 0xEFCDAB89; - ((uint32_t*)pdata)[2] = 0x67452301; - // *((uint64_t*)&pdata[1]) = 0xaaaaaaaaaaaaaaaa;//0x67452301EFCDAB89;//0x31C8B76F520AEDF4; -// ((uint32_t*)pdata)[1] = 0x99999999; //E4F361B3 -// ((uint32_t*)pdata)[2] = 0x99999999; //427B6D24 + ((uint32_t*)pdata)[2] = 0x67452301; // 8:64,C:64 bad + //*((uint64_t*)&pdata[1]) = 0xffffffffffffffff;//0x67452301EFCDAB89;//0x31C8B76F520AEDF4; + // ((uint32_t*)pdata)[1] = 
0x99999999; //E4F361B3 + // ((uint32_t*)pdata)[2] = 0x99999999; //427B6D24 /* BLAKE = 0, BMW,1 @@ -430,7 +456,7 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, } uint32_t _ALIGN(64) endiandata[20]; - for (int k=0; k < 19; k++) + for (int k = 0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); uint32_t ntime = swab32(pdata[17]); @@ -450,171 +476,172 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, uint8_t algo80 = (*(uint64_t*)&endiandata[1] >> 60) & 0x0f; switch (algo80) { + case BLAKE: + //! low impact, can do a lot to optimize quark_blake512 + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + //! low impact, painfully optimize quark_bmw512 + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + //! second most used algo historically + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + //! average use, optimization tbd + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + //! low impact + keccak512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + //! very low impact + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + //! moderate impact (more than shavite) + qubit_luffa512_cpu_setBlock_80_alexis((void*)endiandata); + break; + case CUBEHASH: + //! moderate impact (more than shavite) + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + //! has been optimized fairly well + x11_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + //! high impact optimization. -i > 21 causes error. + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + //! high impact needs more optimizations + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + //! ***highest impact*** + x16_hamsi512_setBlock_80((void*)endiandata); + break; + case FUGUE: + //! very high impact! + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + //! very low impact. + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + //! moderate impact (more than shavite by a bit) + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + //! second lowest impact. + x16_sha512_setBlock_80(endiandata); + break; + } + + int warn = 0; + // int rowdy = 16; + do { + // Hash with CUDA + /* + switch (algo80) { case BLAKE: - //! low impact, can do a lot to optimize quark_blake512 - quark_blake512_cpu_setBlock_80(thr_id, endiandata); - break; + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis + TRACE("blake80:"); + break; case BMW: - //! low impact, painfully optimize quark_bmw512 - quark_bmw512_cpu_setBlock_80(endiandata); - break; + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 0); // alexis x + TRACE("bmw80 :"); + break; case GROESTL: - //! second most used algo historically - groestl512_setBlock_80(thr_id, endiandata); - break; + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis + TRACE("grstl80:"); + break; case JH: - //! average use, optimization tbd - jh512_setBlock_80(thr_id, endiandata); - break; + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("jh51280:"); + break; case KECCAK: - //! low impact - keccak512_setBlock_80(thr_id, endiandata); - break; + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("kecck80:"); + break; case SKEIN: - //! 
very low impact - skein512_cpu_setBlock_80((void*)endiandata); - break; + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 1); + TRACE("skein80:"); + break; case LUFFA: - //! moderate impact (more than shavite) - qubit_luffa512_cpu_setBlock_80_alexis((void*)endiandata); - break; + qubit_luffa512_cpu_hash_80_alexis(thr_id, throughput, pdata[19], d_hash[thr_id]); + TRACE("luffa80:"); + break; case CUBEHASH: - //! moderate impact (more than shavite) - cubehash512_setBlock_80(thr_id, endiandata); - break; + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("cube 80:"); + break; case SHAVITE: - //! has been optimized fairly well - x11_shavite512_setBlock_80((void*)endiandata); - break; + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 0); + TRACE("shavite:"); + break; case SIMD: - //! high impact optimization. -i > 21 causes error. - x16_simd512_setBlock_80((void*)endiandata); - break; + x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("simd512:"); + break; case ECHO: - //! high impact needs more optimizations - x16_echo512_setBlock_80((void*)endiandata); - break; + x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + TRACE("echo :"); + break; case HAMSI: - //! ***highest impact*** - x16_hamsi512_setBlock_80((void*)endiandata); - break; + x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + TRACE("hamsi :"); + break; case FUGUE: - //! very high impact! - x16_fugue512_setBlock_80((void*)pdata); - break; + x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("fugue :"); + break; case SHABAL: - //! very low impact. - x16_shabal512_setBlock_80((void*)endiandata); - break; + x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("shabal :"); + break; case WHIRLPOOL: - //! moderate impact (more than shavite by a bit) - x16_whirlpool512_setBlock_80((void*)endiandata); - break; + x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("whirl :"); + break; case SHA512: - //! second lowest impact. 
- x16_sha512_setBlock_80(endiandata); - break; - } - - int warn = 0; -// int rowdy = 16; - do { - // Hash with CUDA -/* - switch (algo80) { - case BLAKE: - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis - TRACE("blake80:"); - break; - case BMW: - quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 0); // alexis x - TRACE("bmw80 :"); - break; - case GROESTL: - groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis - TRACE("grstl80:"); - break; - case JH: - jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("jh51280:"); - break; - case KECCAK: - keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("kecck80:"); - break; - case SKEIN: - skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 1); - TRACE("skein80:"); - break; - case LUFFA: - qubit_luffa512_cpu_hash_80_alexis(thr_id, throughput, pdata[19], d_hash[thr_id]); - TRACE("luffa80:"); - break; - case CUBEHASH: - cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("cube 80:"); - break; - case SHAVITE: - x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // , 0); - TRACE("shavite:"); - break; - case SIMD: - x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("simd512:"); - break; - case ECHO: - x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); - TRACE("echo :"); - break; - case HAMSI: - x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); - TRACE("hamsi :"); - break; - case FUGUE: - x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("fugue :"); - break; - case SHABAL: - x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("shabal :"); - break; - case WHIRLPOOL: - x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("whirl :"); - break; - case SHA512: - x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x - TRACE("sha512 :"); - break; + x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // alexis x + TRACE("sha512 :"); + break; } -*/ - if (work_restart[thr_id].restart) return -127; + */ + + if (work_restart[thr_id].restart) return -127; pAlgo80[(*(uint64_t*)&endiandata[1] >> 60 - (0 * 4)) & 0x0f](thr_id, throughput, pdata[19], d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (1 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (2 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (3 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (4 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (5 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (6 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (7 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (8 * 4)) & 
0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (9 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (10 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (11 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (12 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (13 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (14 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); - pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (15 * 4)) & 0x0f]((int*)((uint64_t)d_ark | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (1 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (2 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (3 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (4 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (5 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (6 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (7 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (8 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (9 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (10 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (11 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (12 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (13 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (14 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); + pAlgo64[(*(uint64_t*)&endiandata[1] >> 60 - (15 * 4)) & 0x0f]((int*)(((uintptr_t)d_ark) | (thr_id & 15)), throughput, d_hash[thr_id]); x13_echo512_cpu_init(thr_id, throughput); -// if (work_restart[thr_id].restart) return -127; + // if (work_restart[thr_id].restart) return -127; -// run_x16r_rounds(&endiandata[1], thr_id, throughput, pdata[19], d_hash[thr_id]); + // run_x16r_rounds(&endiandata[1], thr_id, throughput, pdata[19], d_hash[thr_id]); *hashes_done = pdata[19] - first_nonce + throughput; - + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (work_restart[thr_id].restart) return 
-127; +// if (work_restart[thr_id].restart) return -127; #ifdef _DEBUG uint32_t _ALIGN(64) dhash[8]; be32enc(&endiandata[19], pdata[19]); @@ -639,7 +666,8 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, bn_set_target_ratio(work, vhash, 1); work->valid_nonces++; pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; - } else { + } + else { pdata[19] = work->nonces[0] + 1; // cursor } #if GPU_HASH_CHECK_LOG == 1 @@ -659,9 +687,9 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, } applog(LOG_INFO, "K64: %s", oks64); applog(LOG_INFO, "K80: %s", oks80); - applog(LOG_ERR, "F80: %s", fails); + applog(LOG_ERR, "F80: %s", fails); #endif - if (work_restart[thr_id].restart) return -127; +// if (work_restart[thr_id].restart) return -127; return work->valid_nonces; } else if (vhash[7] > Htarg) { @@ -672,12 +700,13 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, warn++; pdata[19] = work->nonces[0] + 1; continue; - } else { + } + else { if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %X%X", work->nonces[0], algo_strings[algo80], endiandata[2], endiandata[1]); -// work->nonces[0], algo_strings[algo80], hashOrder); + // work->nonces[0], algo_strings[algo80], hashOrder); warn = 0; -// work->data[19] = max_nonce; + // work->data[19] = max_nonce; if (work_restart[thr_id].restart) return -127; return -128; } @@ -720,7 +749,7 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce; - if (work_restart[thr_id].restart) return -127; +// if (work_restart[thr_id].restart) return -127; return 0; } @@ -733,7 +762,7 @@ extern "C" void free_x16r(int thr_id) cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); - cudaFree((void *)d_ark); + cudaFree((void *)&d_ark); quark_blake512_cpu_free(thr_id); quark_groestl512_cpu_free(thr_id); x11_simd512_cpu_free(thr_id); @@ -751,17 +780,12 @@ volatile int h_ark = 0; extern "C" int *_d_ark = NULL; static int q = 0; -static int* skin = NULL; + __host__ void x11_echo512_cuda_init(int thr_id, uint32_t threads) { if (q++) return; cudaMalloc(&d_ark, (size_t)64); - skin = d_ark; - if ((uint64_t)d_ark & 15) - { - d_ark = (int*)((uint64_t)d_ark + ~((uint64_t)d_ark & 15)); - } cudaMemcpyToSymbol(d_ark, (int*)&h_ark, sizeof(int), 0, cudaMemcpyHostToDevice); } __host__ extern void x11_echo512_cpu_init(int thr_id, uint32_t threads) @@ -772,6 +796,7 @@ __host__ extern void x11_echo512_cpu_init(int thr_id, uint32_t threads) } __host__ extern void x13_echo512_cpu_init(int thr_id, uint32_t threads) { - h_ark ^= 1 << thr_id; +// h_ark ^= (1 << thr_id); + h_ark &= ~(1 << thr_id); cudaMemcpyToSymbol(d_ark, (int*)&h_ark, sizeof(int), 0, cudaMemcpyHostToDevice); } diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu index 08e3335025..ce13188223 100644 --- a/x17/cuda_x17_sha512.cu +++ b/x17/cuda_x17_sha512.cu @@ -92,9 +92,8 @@ __global__ /*__launch_bounds__(256, 4)*/ void x17_sha512_gpu_hash_64(int *thr_id, const uint32_t threads, uint64_t *g_hash) { - if ((*(int*)(((uint64_t)thr_id) & ~15ULL)) & (1 << (((uint64_t)thr_id) & 15))) + if ((*(int*)(((uintptr_t)thr_id) & ~15ULL)) & (1 << (((uintptr_t)thr_id) & 15))) return; - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -162,7 +161,7 @@ void x17_sha512_cpu_init(int thr_id, uint32_t threads) { cudaMemcpyToSymbol(c_WB, WB, 80 * 
sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } - + __host__ void x17_sha512_cpu_hash_64(int *thr_id, uint32_t threads, uint32_t *d_hash) { @@ -179,9 +178,8 @@ static uint64_t c_PaddedMessage80[10]; __global__ /*__launch_bounds__(256, 4)*/ -void x16_sha512_gpu_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -243,7 +241,7 @@ void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - x16_sha512_gpu_hash_80 << > > (thr_id, threads, startNounce, (uint64_t*)d_hash); + x16_sha512_gpu_hash_80 << > > (threads, startNounce, (uint64_t*)d_hash); } __host__
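The recurring "int *thr_id" parameter in the *_alexis 64-byte kernels is not a plain pointer: the host packs the mining-thread index into the low 4 bits of the 16-byte-aligned d_ark allocation, and each kernel strips the tag, reads the shared abort word, and returns early if its bit is set. A minimal sketch of that scheme, assuming only what the diff shows (the names d_flags, make_tagged_id and some_hash_64 are illustrative):

#include <cuda_runtime.h>
#include <stdint.h>

static int *d_flags = NULL; // device-side abort word, one bit per mining thread

// host: allocate the flag word once, then hand kernels a tagged copy of the pointer
static int *make_tagged_id(int thr_id)
{
	if (d_flags == NULL)
		cudaMalloc(&d_flags, 64); // cudaMalloc alignment leaves the low 4 bits free
	return (int*)(((uintptr_t)d_flags) | ((uintptr_t)(thr_id & 15)));
}

__global__ void some_hash_64(int *thr_id, uint32_t threads, uint32_t *g_hash)
{
	// device: strip the tag to recover the flag word and this thread's bit index
	const int *flags = (const int*)(((uintptr_t)thr_id) & ~(uintptr_t)15);
	const int  bit   = (int)(((uintptr_t)thr_id) & 15);
	if (*flags & (1 << bit))
		return; // abort requested for this mining thread: skip the whole launch
	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads)
		g_hash[thread] ^= 0; // placeholder for the real 64-byte hash round
}

The host side recovers the index the same way where it still needs it (e.g. MyStreamSynchronize(NULL, order, ((uintptr_t)thr_id) & 15)), and the x13_echo512_cpu_init change clears the thread's bit with h_ark &= ~(1 << thr_id) instead of toggling it, which makes the reset idempotent.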
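The split Compression1 / Compression2 / Final path in cuda_x11_simd512.cu keeps the 32-word SIMD state in global memory between stages, and the only subtle part is the layout: each block owns a 32 * blockDim.x window of g_state, and word i of a thread's state sits at threadIdx.x + blockDim.x * i so adjacent threads touch adjacent addresses. A short sketch of just that spill/reload, with illustrative names (spill_state and reload_state are not in the patch):

__device__ __forceinline__ void spill_state(const uint32_t A[32], uint32_t *g_state)
{
	// each block owns 32 * blockDim.x words; word i of this thread's state is
	// interleaved so that consecutive threads write consecutive addresses
	uint32_t *state = &g_state[blockIdx.x * (blockDim.x * 32)];
	#pragma unroll 32
	for (int i = 0; i < 32; i++)
		state[threadIdx.x + blockDim.x * i] = A[i];
}

__device__ __forceinline__ void reload_state(uint32_t A[32], const uint32_t *g_state)
{
	const uint32_t *state = &g_state[blockIdx.x * (blockDim.x * 32)];
	#pragma unroll 32
	for (int i = 0; i < 32; i++)
		A[i] = state[threadIdx.x + blockDim.x * i];
}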