diff --git a/Algo256/blake2s.cu b/Algo256/blake2s.cu index 0b4bbe0428..29d28f4916 100644 --- a/Algo256/blake2s.cu +++ b/Algo256/blake2s.cu @@ -34,18 +34,6 @@ uint32_t ROL16(const uint32_t a) { #define ROL16(u) (u << 16) #endif -__device__ __forceinline__ -uint32_t xor3x(uint32_t a, uint32_t b, uint32_t c) -{ - uint32_t result; -#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 - asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA -#else - result = a^b^c; -#endif - return result; -} - static const uint32_t blake2s_IV[8] = { 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL @@ -562,4 +550,3 @@ extern "C" void free_blake2s(int thr_id) cudaDeviceSynchronize(); } - diff --git a/Algo256/cuda_keccak256.cu b/Algo256/cuda_keccak256.cu index 7e87bb2860..9ae67c6046 100644 --- a/Algo256/cuda_keccak256.cu +++ b/Algo256/cuda_keccak256.cu @@ -32,18 +32,6 @@ __constant__ uint2 keccak_round_constants[24] = { { 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 }, { 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 } }; -__device__ __forceinline__ -uint2 xor3x(const uint2 a,const uint2 b,const uint2 c) { - uint2 result; -#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 - asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA - asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA -#else - result = a^b^c; -#endif - return result; -} - __device__ __forceinline__ uint2 chi(const uint2 a,const uint2 b,const uint2 c) { // keccak chi uint2 result; diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu index cbeb660e85..5c514e26c5 100644 --- a/Algo256/cuda_skein256.cu +++ b/Algo256/cuda_skein256.cu @@ -1,3 +1,4 @@ +#if 0 #include #include "cuda_helper.h" @@ -311,3 +312,4 @@ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t 
startNounce, ui MyStreamSynchronize(NULL, order, thr_id); } +#endif \ No newline at end of file diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu index 1388a54ade..88800cf7ce 100644 --- a/Algo256/keccak256.cu +++ b/Algo256/keccak256.cu @@ -1,3 +1,4 @@ +#if 0 /* * Keccak 256 * @@ -173,3 +174,4 @@ extern "C" void free_keccak256(int thr_id) cudaDeviceSynchronize(); init[thr_id] = false; } +#endif \ No newline at end of file diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index 9192a8a5cd..a47fec181d 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -530,7 +530,7 @@ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNoun size_t shared_size = 0; jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); - MyStreamSynchronize(NULL, order, thr_id); + //MyStreamSynchronize(NULL, order, thr_id); } @@ -591,7 +591,7 @@ void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, dim3 block(threadsperblock); zr5_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); - MyStreamSynchronize(NULL, 0, thr_id); + //MyStreamSynchronize(NULL, 0, thr_id); } /* required for the second hash part of zr5 */ @@ -662,5 +662,5 @@ void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNoun cudaMemcpyToSymbol(d_OriginalData, pdata, sizeof(d_OriginalData), 0, cudaMemcpyHostToDevice); zr5_keccak512_gpu_hash_pok<<>>(threads, startNounce, d_hash, d_poks, version); - MyStreamSynchronize(NULL, 10, thr_id); + //MyStreamSynchronize(NULL, 10, thr_id); } diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index 78e5bd02e7..e87e0ef6b4 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -1,3 +1,4 @@ +#if 0 extern "C" { #include "sph/sph_keccak.h" @@ -293,3 +294,4 @@ extern "C" void free_jackpot(int thr_id) init[thr_id] = false; } +#endif \ No newline at end of file diff --git a/JHA/jha.cu b/JHA/jha.cu index ec7895c10d..a50b24d0df 100644 --- a/JHA/jha.cu +++ 
b/JHA/jha.cu @@ -1,3 +1,4 @@ +#if 0 /** * JHA v8 algorithm - compatible implementation * @author tpruvot@github 05-2017 @@ -263,3 +264,4 @@ extern "C" void free_jha(int thr_id) cudaDeviceSynchronize(); init[thr_id] = false; } +#endif \ No newline at end of file diff --git a/Makefile.am b/Makefile.am index d7d2a0bb58..e82b72d2a5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -67,18 +67,17 @@ ccminer_SOURCES = elist.h miner.h compat.h \ sph/ripemd.c sph/sph_sha2.c \ polytimos.cu \ lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \ - qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ + qubit/qubit.cu qubit/qubit_luffa512.cu qubit/qubit_luffa512_alexis.cu qubit/deep.cu qubit/luffa.cu \ tribus/tribus.cu tribus/cuda_echo512_final.cu \ - x11/x11.cu x12/x12.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ - x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ + x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ + x11/cuda_x11_shavite512.cu x11/cuda_x11_shavite512_alexis.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu x11/cuda_x11_echo_alexis.cu \ x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \ - x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ + x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu x13/cuda_x13_fugue512_alexis.cu \ x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \ - x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ + x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x14_shabal512_alexis.cu x15/cuda_x15_whirlpool.cu \ x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \ - x16/x16r.cu x16/x16s.cu x16/cuda_x16_echo512.cu x16/cuda_x16_fugue512.cu \ - x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \ - x16/cuda_x16_echo512_64.cu \ + x16r/x16r.cu x16r/cuda_x16_echo512.cu x16r/cuda_x16_fugue512.cu \ + x16r/cuda_x16_shabal512.cu x16r/cuda_x16_simd512_80.cu 
\ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ x11/phi.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu @@ -115,7 +114,7 @@ endif ccminer_LDADD += -lcuda nvcc_ARCH := -#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\" +nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\" nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\" nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\" #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" diff --git a/README.md b/README.md index 5bcb50572c..2608670795 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,16 @@ # ccminer +Download here: https://github.com/a1i3nj03/a1_min3r/releases + +use like this: ccminer -o stratum+tcp://pool.ominousnetwork.com:3636 - mywalletisbetterthanurs.TR_FTW -p password123 + +Younger better faster + +I R broke'd, send me (a1i3nj03) RVN coinz @ RP6cmcZNE9g5oEakzCb88DgVJSLVBYJwnu PLZ + +(Always) Forget to add dev fee :sadparrot: + + Based on Christian Buchner's & Christian H.'s CUDA project, no more active on github since 2014. Check the [README.txt](README.txt) for the additions @@ -8,12 +19,12 @@ BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot) A part of the recent algos were originally written by [djm34](https://github.com/djm34) and [alexis78](https://github.com/alexis78) -This variant was tested and built on Linux (ubuntu server 14.04, 16.04, Fedora 22 to 25) -It is also built for Windows 7 to 10 with VStudio 2013, to stay compatible with Windows 7 and Vista. +This variant may be built on Linux someday (ubuntu server 14.04, 16.04, Fedora 22 to 25) +It is built for Windows 7 to 10 with VStudio 2013, to stay compatible with Windows 7 and Vista. -Note that the x86 releases are generally faster than x64 ones on Windows, but that tend to change with the recent drivers. 
+Note that the x86 releases are generally NOT faster than x64 ones on Windows, that tends to change with the recent drivers. -The recommended CUDA Toolkit version was the [6.5.19](http://developer.download.nvidia.com/compute/cuda/6_5/rel/installers/cuda_6.5.19_windows_general_64.exe), but some light algos could be faster with the version 7.5 and 8.0 (like lbry, decred and skein). +The recommended CUDA Toolkit version was the [6.5.19](http://developer.download.nvidia.com/compute/cuda/6_5/rel/installers/cuda_6.5.19_windows_general_64.exe), but some light algos could be faster with the version 7.5 and 8.0 (like lbry, decred and skein). CUDA 9.0 was used, but 9.1 should be even better. About source code dependencies ------------------------------ diff --git a/README.txt b/README.txt index 59a2cec18a..74d38d14b0 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.2.5 (Apr 2018) "x12, x16r and x16s algos" +ccminer 2.2.5 (Feb 2018) "x16r algo" --------------------------------------------------------------- *************************************************************** @@ -120,12 +120,9 @@ its command line interface and options. tribus use to mine Denarius x11evo use to mine Revolver x11 use to mine DarkCoin - x12 use to mine GalaxyCash - x13 use to mine X13 - x14 use to mine X14 + x14 use to mine X14Coin x15 use to mine Halcyon x16r use to mine Raven - x16s use to mine Pigeon and Eden x17 use to mine X17 vanilla use to mine Vanilla (Blake256) veltor use to mine VeltorCoin @@ -281,13 +278,10 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< - Apr. 02nd 2018 v2.2.5 - New x16r algo for Raven - New x16s algo for Pigeon and Eden - New x12 algo for Galaxycash - Equihash (SIMT) sync issues for the Volta generation + Feb. 2017 v2.2.5 + New x16r algo - Jan. 04th 2018 v2.2.4 + Jan. 
04th 2017 v2.2.4 Improve lyra2v2 Higher keccak default intensity Drop SM 2.x support by default, for CUDA 9 and more recent diff --git a/algos.h b/algos.h index ed0ff83021..014b4cbf1b 100644 --- a/algos.h +++ b/algos.h @@ -56,12 +56,10 @@ enum sha_algos { ALGO_BITCORE, ALGO_X11EVO, ALGO_X11, - ALGO_X12, ALGO_X13, ALGO_X14, ALGO_X15, ALGO_X16R, - ALGO_X16S, ALGO_X17, ALGO_VANILLA, ALGO_VELTOR, @@ -128,12 +126,10 @@ static const char *algo_names[] = { "bitcore", "x11evo", "x11", - "x12", "x13", "x14", "x15", "x16r", - "x16s", "x17", "vanilla", "veltor", diff --git a/api.cpp b/api.cpp index 9014f3a78d..cd11fe93a2 100644 --- a/api.cpp +++ b/api.cpp @@ -1252,7 +1252,7 @@ static void api() char *wskey = NULL; n = recv(c, &buf[0], SOCK_REC_BUFSZ, 0); - fail = SOCKETFAIL(n) || n < 0; + fail = SOCKETFAIL(n); if (fail) buf[0] = '\0'; else if (n > 0 && buf[n-1] == '\n') { @@ -1261,7 +1261,7 @@ static void api() if (n > 0 && buf[n-1] == '\r') buf[n-1] = '\0'; } - else buf[n] = '\0'; + buf[n] = '\0'; //if (opt_debug && opt_protocol && n > 0) // applog(LOG_DEBUG, "API: recv command: (%d) '%s'+char(%x)", n, buf, buf[n-1]); @@ -1348,6 +1348,6 @@ void api_set_throughput(int thr_id, uint32_t throughput) if (cgpu->throughput != throughput) cgpu->throughput = throughput; } // to display in bench results - if (opt_benchmark) - bench_set_throughput(thr_id, throughput); +// if (opt_benchmark) +// bench_set_throughput(thr_id, throughput); } diff --git a/bench.cpp b/bench.cpp index eeeee60a99..569c85310d 100644 --- a/bench.cpp +++ b/bench.cpp @@ -1,3 +1,4 @@ +#if 0 /** * Made to benchmark and test algo switch * @@ -99,12 +100,10 @@ void algo_free_all(int thr_id) free_wildkeccak(thr_id); free_x11evo(thr_id); free_x11(thr_id); - free_x12(thr_id); free_x13(thr_id); free_x14(thr_id); free_x15(thr_id); free_x16r(thr_id); - free_x16s(thr_id); free_x17(thr_id); free_zr5(thr_id); free_scrypt(thr_id); @@ -240,3 +239,4 @@ void bench_display_results() } } } +#endif \ No newline at end of file diff 
--git a/ccminer.cpp b/ccminer.cpp index 87cd26ce40..6d1a8ba150 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -2,6 +2,8 @@ * Copyright 2010 Jeff Garzik * Copyright 2012-2014 pooler * Copyright 2014-2017 tpruvot + * Copyright 2018 brianmct + * Copyright 2018 a1i3nj03 * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -45,6 +47,7 @@ #include "sia/sia-rpc.h" #include "crypto/xmr-rpc.h" #include "equi/equihash.h" +#include "donate.h" #include @@ -55,7 +58,7 @@ BOOL WINAPI ConsoleHandler(DWORD); #endif -#define PROGRAM_NAME "ccminer" +#define PROGRAM_NAME "a1_min3r" #define LP_SCANTIME 60 #define HEAVYCOIN_BLKHDR_SZ 84 #define MNR_BLKHDR_SZ 80 @@ -105,17 +108,18 @@ bool use_colors = true; int use_pok = 0; static bool opt_background = false; bool opt_quiet = false; -int opt_maxlograte = 3; +int opt_maxlograte = 5;//3; static int opt_retries = -1; static int opt_fail_pause = 30; int opt_time_limit = -1; int opt_shares_limit = -1; time_t firstwork_time = 0; +time_t dev_timestamp; int opt_timeout = 300; // curl -int opt_scantime = 10; +int opt_scantime = 10;//10; static json_t *opt_config; static const bool opt_time = true; -volatile enum sha_algos opt_algo = ALGO_AUTO; +volatile enum sha_algos opt_algo = ALGO_X16R; // ALGO_AUTO; int opt_n_threads = 0; int gpu_threads = 1; int64_t opt_affinity = -1L; @@ -163,6 +167,7 @@ char *jane_params = NULL; struct pool_infos pools[MAX_POOLS] = { 0 }; int num_pools = 1; volatile int cur_pooln = 0; +volatile int prev_pooln = 0; bool opt_pool_failover = true; volatile bool pool_on_hold = false; volatile bool pool_is_switching = false; @@ -232,6 +237,8 @@ int opt_api_mcast_port = 4068; bool opt_stratum_stats = false; +//double dev_donate_percent = MIN_DEV_DONATE_PERCENT; + static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ @@ -290,12 +297,10 @@ Options:\n\ whirlpool Whirlpool algo\n\ x11evo Permuted x11 
(Revolver)\n\ x11 X11 (DarkCoin)\n\ - x12 X12 (GalaxyCash)\n\ x13 X13 (MaruCoin)\n\ x14 X14\n\ x15 X15\n\ x16r X16R (Raven)\n\ - x16s X16S\n\ x17 X17\n\ wildkeccak Boolberry\n\ zr5 ZR5 (ZiftrCoin)\n\ @@ -369,6 +374,7 @@ Options:\n\ -B, --background run the miner in the background\n\ --benchmark run in offline benchmark mode\n\ --cputest debug hashes from cpu algorithms\n\ + --donate percentage of time to donate to dev\n\ -c, --config=FILE load a JSON-format configuration file\n\ -V, --version display version information and exit\n\ -h, --help display this help text and exit\n\ @@ -400,6 +406,7 @@ struct option options[] = { { "cpu-priority", 1, NULL, 1021 }, { "cuda-schedule", 1, NULL, 1025 }, { "debug", 0, NULL, 'D' }, + { "donate", 1, NULL, 1081 }, { "help", 0, NULL, 'h' }, { "intensity", 1, NULL, 'i' }, { "ndevs", 0, NULL, 'n' }, @@ -575,9 +582,11 @@ void get_currentalgo(char* buf, int sz) void format_hashrate(double hashrate, char *output) { +#if 0 if (opt_algo == ALGO_EQUIHASH) format_hashrate_unit(hashrate, output, "Sol/s"); else +#endif format_hashrate_unit(hashrate, output, "H/s"); } @@ -661,6 +670,7 @@ static void calc_network_diff(struct work *work) // sample for diff 43.281 : 1c05ea29 // todo: endian reversed on longpoll could be zr5 specific... uint32_t nbits = have_longpoll ? 
work->data[18] : swab32(work->data[18]); +#if 0 if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]); if (opt_algo == ALGO_DECRED) nbits = work->data[29]; if (opt_algo == ALGO_SIA) nbits = work->data[11]; // unsure if correct @@ -668,20 +678,29 @@ static void calc_network_diff(struct work *work) net_diff = equi_network_diff(work); return; } - +#endif uint32_t bits = (nbits & 0xffffff); int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 uint64_t diffone = 0x0000FFFF00000000ull; + /* double d = (double)0x0000ffff / (double)bits; for (int m=shift; m < 29; m++) d *= 256.0; for (int m=29; m < shift; m++) d /= 256.0; - if (opt_algo == ALGO_DECRED && shift == 28) d *= 256.0; + */ + + uint32_t d = 0x0000ffff / bits; + + for (int m = shift; m < 29; m++) d <<= 8; + for (int m = 29; m < shift; m++) d >>= 8; + + // if (opt_algo == ALGO_DECRED && shift == 28) d *= 256.0; if (opt_debug_diff) - applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); + applog(LOG_DEBUG, "net diff: %u -> shift %u, bits %08x", d, shift, bits); +// applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); - net_diff = d; + net_diff = (double)d; } /* decode data from getwork (wallets and longpoll pools) */ @@ -690,8 +709,9 @@ static bool work_decode(const json_t *val, struct work *work) int data_size, target_size = sizeof(work->target); int adata_sz, atarget_sz = ARRAY_SIZE(work->target); int i; - +#if 0 switch (opt_algo) { + case ALGO_DECRED: data_size = 192; adata_sz = 180/4; @@ -706,9 +726,10 @@ static bool work_decode(const json_t *val, struct work *work) case ALGO_WILDKECCAK: return rpc2_job_decode(val, work); default: +#endif data_size = 128; adata_sz = data_size / 4; - } +// } if (!jobj_binary(val, "data", work->data, data_size)) { json_t *obj = json_object_get(val, "data"); @@ -729,12 +750,14 @@ static bool work_decode(const json_t *val, struct work *work) applog(LOG_ERR, "JSON invalid target"); return false; } - +#if 0 if (opt_algo == ALGO_HEAVY) { if 
(unlikely(!jobj_binary(val, "maxvote", &work->maxvote, sizeof(work->maxvote)))) { work->maxvote = 2048; } } else work->maxvote = 0; +#endif + work->maxvote = 0; for (i = 0; i < adata_sz; i++) work->data[i] = le32dec(work->data + i); @@ -750,6 +773,7 @@ static bool work_decode(const json_t *val, struct work *work) stratum_diff = work->targetdiff; work->tx_count = use_pok = 0; +#if 0 if (opt_algo == ALGO_ZR5 && work->data[0] & POK_BOOL_MASK) { use_pok = 1; json_t *txs = json_object_get(val, "txs"); @@ -778,10 +802,10 @@ static bool work_decode(const json_t *val, struct work *work) applog(LOG_DEBUG, "block txs: %u, total len: %u", work->tx_count, totlen); } } - +#endif /* use work ntime as job id (solo-mining) */ cbin2hex(work->job_id, (const char*)&work->data[17], 4); - +#if 0 if (opt_algo == ALGO_DECRED) { uint16_t vote; // always keep last bit of votebits @@ -807,7 +831,7 @@ static bool work_decode(const json_t *val, struct work *work) } cbin2hex(work->job_id, (const char*)&work->data[34], 4); } - +#endif return true; } @@ -862,6 +886,13 @@ int share_result(int result, int pooln, double sharediff, const char *reason) suppl, s, flag, solved); if (reason) { applog(LOG_WARNING, "reject reason: %s", reason); + +// if (strncasecmp(reason, "job not found", 13) == 0) { +// applog(LOG_WARNING, "trying thread restart"); +// restart_threads(); +// g_work_time = 0; +// } else + if (!check_dups && strncasecmp(reason, "duplicate", 9) == 0) { applog(LOG_WARNING, "enabling duplicates check feature"); check_dups = true; @@ -902,9 +933,10 @@ static bool submit_upstream_work(CURL *curl, struct work *work) } /* discard if a newer block was received */ + pthread_mutex_lock(&g_work_lock); stale_work = work->height && work->height < g_work.height; if (have_stratum && !stale_work && !opt_submit_stale && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) { - pthread_mutex_lock(&g_work_lock); +// pthread_mutex_lock(&g_work_lock); if (strlen(work->job_id + 8)) stale_work = 
strncmp(work->job_id + 8, g_work.job_id + 8, sizeof(g_work.job_id) - 8); if (stale_work) { @@ -916,8 +948,10 @@ static bool submit_upstream_work(CURL *curl, struct work *work) check_stratum_jobs = true; } } - pthread_mutex_unlock(&g_work_lock); +// restart_threads(); +// pthread_mutex_unlock(&g_work_lock); } + pthread_mutex_unlock(&g_work_lock); if (!have_stratum && !stale_work && allow_gbt) { struct work wheight = { 0 }; @@ -930,6 +964,8 @@ static bool submit_upstream_work(CURL *curl, struct work *work) } } +#if 0 + if (!stale_work && opt_algo == ALGO_ZR5 && !have_stratum) { stale_work = (memcmp(&work->data[1], &g_work.data[1], 68)); } @@ -939,13 +975,13 @@ static bool submit_upstream_work(CURL *curl, struct work *work) applog(LOG_WARNING, "stale work detected, discarding"); return true; } - +#endif if (pool->type & POOL_STRATUM) { uint32_t sent = 0; uint32_t ntime, nonce = work->nonces[idnonce]; char *ntimestr, *noncestr, *xnonce2str, *nvotestr; uint16_t nvote = 0; - +#if 0 switch (opt_algo) { case ALGO_BLAKE: case ALGO_BLAKECOIN: @@ -984,11 +1020,12 @@ static bool submit_upstream_work(CURL *curl, struct work *work) be32enc(&nonce, work->data[19]); break; default: +#endif le32enc(&ntime, work->data[17]); le32enc(&nonce, work->data[19]); - } +// } noncestr = bin2hex((const uchar*)(&nonce), 4); - +#if 1 if (check_dups) sent = hashlog_already_submittted(work->job_id, nonce); if (sent > 0) { @@ -1003,17 +1040,18 @@ static bool submit_upstream_work(CURL *curl, struct work *work) restart_threads(); return true; } - +#endif ntimestr = bin2hex((const uchar*)(&ntime), 4); - +#if 0 if (opt_algo == ALGO_DECRED) { xnonce2str = bin2hex((const uchar*)&work->data[36], stratum.xnonce1_size); } else if (opt_algo == ALGO_SIA) { uint16_t high_nonce = swab32(work->data[9]) >> 16; xnonce2str = bin2hex((unsigned char*)(&high_nonce), 2); } else { +#endif xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); - } +// } // store to keep/display the solved ratio/diff stratum.sharediff = 
work->sharediff[idnonce]; @@ -1057,7 +1095,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) /* build hex string */ char *str = NULL; - + /* if (opt_algo == ALGO_ZR5) { data_size = 80; adata_sz = 20; } @@ -1067,7 +1105,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) else if (opt_algo == ALGO_SIA) { return sia_submit(curl, pool, work); } - + */ if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) { for (int i = 0; i < adata_sz; i++) le32enc(work->data + i, work->data[i]); @@ -1241,7 +1279,7 @@ static bool get_upstream_work(CURL *curl, struct work *work) json_t *val; gettimeofday(&tv_start, NULL); - +#if 0 if (opt_algo == ALGO_SIA) { char *sia_header = sia_getheader(curl, pool); if (sia_header) { @@ -1254,7 +1292,7 @@ static bool get_upstream_work(CURL *curl, struct work *work) } return rc; } - +#endif if (opt_debug_threads) applog(LOG_DEBUG, "%s: want_longpoll=%d have_longpoll=%d", __func__, want_longpoll, have_longpoll); @@ -1457,14 +1495,16 @@ bool get_work(struct thr_info *thr, struct work *work) memset(work->data, 0x55, 76); //work->data[17] = swab32((uint32_t)time(NULL)); memset(work->data + 19, 0x00, 52); +#if 0 if (opt_algo == ALGO_DECRED) { memset(&work->data[35], 0x00, 52); } else if (opt_algo == ALGO_LBRY) { work->data[28] = 0x80000000; } else { +#endif work->data[20] = 0x80000000; work->data[31] = 0x00000280; - } +// } memset(work->target, 0x00, sizeof(work->target)); return true; } @@ -1551,8 +1591,9 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) work->pooln = sctx->pooln; /* Generate merkle root */ +#if 0 switch (opt_algo) { - case ALGO_DECRED: + case ALGO_DECRED: case ALGO_EQUIHASH: case ALGO_SIA: // getwork over stratum, no merkle to generate @@ -1572,8 +1613,9 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) break; case ALGO_WHIRLPOOL: default: +#endif sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - } +// } for (i = 0; i < 
sctx->job.merkle_count; i++) { memcpy(merkle_root + 32, sctx->job.merkle[i], 32); @@ -1584,7 +1626,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) #endif sha256d(merkle_root, merkle_root, 64); } - + /* Increment extranonce2 */ for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); @@ -1593,7 +1635,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) work->data[0] = le32dec(sctx->job.version); for (i = 0; i < 8; i++) work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); - +#if 0 if (opt_algo == ALGO_DECRED) { uint16_t vote; for (i = 0; i < 8; i++) // reversed prevhash @@ -1643,7 +1685,9 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) memcpy(&work->data[12], sctx->job.coinbase, 32); // merkle_root work->data[20] = 0x80000000; if (opt_debug) applog_hex(work->data, 80); - } else { + } else +#endif + { for (i = 0; i < 8; i++) work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); work->data[17] = le32dec(sctx->job.ntime); @@ -1654,7 +1698,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) if (opt_showdiff || opt_max_diff > 0.) 
calc_network_diff(work); - +#if 0 switch (opt_algo) { case ALGO_MJOLLNIR: case ALGO_HEAVY: @@ -1672,7 +1716,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) ext[1] = be16dec(sctx->job.nreward); // applog(LOG_DEBUG, "DEBUG: vote=%hx reward=%hx", ext[0], ext[1]); } - +#endif pthread_mutex_unlock(&stratum_work_lock); if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA) { @@ -1690,6 +1734,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) opt_difficulty = 1.; switch (opt_algo) { +#if 0 case ALGO_HMQ1725: case ALGO_JACKPOT: case ALGO_JHA: @@ -1706,12 +1751,13 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_LBRY: case ALGO_LYRA2v2: case ALGO_LYRA2Z: - case ALGO_TIMETRAVEL: - case ALGO_BITCORE: +#endif +// case ALGO_TIMETRAVEL: +// case ALGO_BITCORE: case ALGO_X16R: - case ALGO_X16S: - work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty)); + work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));//(256.0 * opt_difficulty)); break; +#if 0 case ALGO_KECCAK: case ALGO_LYRA2: work_set_target(work, sctx->job.diff / (128.0 * opt_difficulty)); @@ -1721,6 +1767,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) break; default: work_set_target(work, sctx->job.diff / opt_difficulty); +#endif } if (stratum_diff != sctx->job.diff) { @@ -1731,7 +1778,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) snprintf(sdiff, 32, " (%.5f)", work->targetdiff); applog(LOG_WARNING, "Stratum difficulty set to %g%s", stratum_diff, sdiff); } - + sctx->job.clean = 1; //!!! 
return true; } @@ -1739,7 +1786,7 @@ void restart_threads(void) { if (opt_debug && !opt_quiet) applog(LOG_DEBUG,"%s", __FUNCTION__); - + // restart mining thread IRL for (int i = 0; i < opt_n_threads && work_restart; i++) work_restart[i].restart = 1; } @@ -1766,7 +1813,7 @@ static bool wanna_mine(int thr_id) } // Network Difficulty if (opt_max_diff > 0.0 && net_diff > opt_max_diff) { - int next = pool_get_first_valid(cur_pooln+1); + int next = pool_get_first_valid(cur_pooln+1, false); if (num_pools > 1 && pools[next].max_diff != pools[cur_pooln].max_diff && opt_resume_diff <= 0.) conditional_pool_rotate = allow_pool_rotate; if (!thr_id && !conditional_state[thr_id] && !opt_quiet) @@ -1779,7 +1826,7 @@ static bool wanna_mine(int thr_id) } // Network hashrate if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate) { - int next = pool_get_first_valid(cur_pooln+1); + int next = pool_get_first_valid(cur_pooln+1, false); if (pools[next].max_rate != pools[cur_pooln].max_rate && opt_resume_rate <= 0.) 
conditional_pool_rotate = allow_pool_rotate; if (!thr_id && !conditional_state[thr_id] && !opt_quiet) { @@ -1796,7 +1843,24 @@ static bool wanna_mine(int thr_id) conditional_state[thr_id] = (uint8_t) !state; // only one wait message in logs return state; } - +/* +static bool is_dev_time() { + // Add 2 seconds to compensate for connection time + time_t dev_portion = double(DONATE_CYCLE_TIME) + * dev_donate_percent * 0.01 + 2; + if(dev_portion < 12) // No point in bothering with less than 10s + return false; + return (time(NULL) - dev_timestamp) % DONATE_CYCLE_TIME + >= (DONATE_CYCLE_TIME - dev_portion); +} +*/ +/* +void sig_fn(int sig) +{ + applog(LOG_DEBUG,"TID: %i", sig); + return; +} +*/ static void *miner_thread(void *userdata) { struct thr_info *mythr = (struct thr_info *)userdata; @@ -1813,6 +1877,8 @@ static void *miner_thread(void *userdata) bool extrajob = false; char s[16]; int rc = 0; +//#define NEW_WORK 0xdead +// signal(SIGINT, sig_fn); memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized @@ -1877,34 +1943,21 @@ static void *miner_thread(void *userdata) // &work.data[19] int wcmplen = (opt_algo == ALGO_DECRED) ? 
140 : 76; int wcmpoft = 0; - + /* if (opt_algo == ALGO_LBRY) wcmplen = 108; else if (opt_algo == ALGO_SIA) { wcmpoft = (32+16)/4; wcmplen = 32; } - + */ uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); - if (opt_algo == ALGO_WILDKECCAK) { - nonceptr = (uint32_t*) (((char*)work.data) + 1); - wcmpoft = 2; - wcmplen = 32; - } else if (opt_algo == ALGO_CRYPTOLIGHT || opt_algo == ALGO_CRYPTONIGHT) { - nonceptr = (uint32_t*) (((char*)work.data) + 39); - wcmplen = 39; - } else if (opt_algo == ALGO_EQUIHASH) { - nonceptr = &work.data[EQNONCE_OFFSET]; // 27 is pool extranonce (256bits nonce space) - wcmplen = 4+32+32; - } - if (have_stratum) { - uint32_t sleeptime = 0; - if (opt_algo == ALGO_DECRED || opt_algo == ALGO_WILDKECCAK /* getjob */) - work_done = true; // force "regen" hash + uint32_t sleeptime = 0; while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) { - usleep(100*1000); +// usleep(100*1000); + usleep(100); if (sleeptime > 4) { extrajob = true; break; @@ -1918,9 +1971,11 @@ static void *miner_thread(void *userdata) extrajob |= work_done; regen = (nonceptr[0] >= end_nonce); +#if 0 if (opt_algo == ALGO_SIA) { regen = ((nonceptr[1] & 0xFF00) >= 0xF000); } +#endif regen = regen || extrajob; if (regen) { @@ -1928,8 +1983,6 @@ static void *miner_thread(void *userdata) extrajob = false; if (stratum_gen_work(&stratum, &g_work)) g_work_time = time(NULL); - if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) - nonceptr[0] += 0x100000; } } else { uint32_t secs = 0; @@ -1968,36 +2021,14 @@ static void *miner_thread(void *userdata) work.height = g_work.height; //nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr } - + /* if (opt_algo == ALGO_ZR5) { // ignore pok/version header wcmpoft = 1; wcmplen -= 4; } - - if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) { - uint32_t oldpos = nonceptr[0]; - bool nicehash = strstr(pools[cur_pooln].url, "nicehash") != NULL; - if (memcmp(&work.data[wcmpoft], 
&g_work.data[wcmpoft], wcmplen)) { - memcpy(&work, &g_work, sizeof(struct work)); - if (!nicehash) nonceptr[0] = (rand()*4) << 24; - nonceptr[0] &= 0xFF000000u; // nicehash prefix hack - nonceptr[0] |= (0x00FFFFFFu / opt_n_threads) * thr_id; - } - // also check the end, nonce in the middle - else if (memcmp(&work.data[44/4], &g_work.data[0], 76-44)) { - memcpy(&work, &g_work, sizeof(struct work)); - } - if (oldpos & 0xFFFF) { - if (!nicehash) nonceptr[0] = oldpos + 0x1000000u; - else { - uint32_t pfx = nonceptr[0] & 0xFF000000u; - nonceptr[0] = pfx | ((oldpos + 0x8000u) & 0xFFFFFFu); - } - } - } - - else if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) { + */ + if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) { #if 0 if (opt_debug) { for (int n=0; n <= (wcmplen-8); n+=8) { @@ -2009,44 +2040,13 @@ static void *miner_thread(void *userdata) } } #endif + //*** SIGNAL JOB UPDATE ********************************************************************* memcpy(&work, &g_work, sizeof(struct work)); nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr } else nonceptr[0]++; //?? - if (opt_algo == ALGO_DECRED) { - // suprnova job_id check without data/target/height change... - if (check_stratum_jobs && strcmp(work.job_id, g_work.job_id)) { - pthread_mutex_unlock(&g_work_lock); - continue; - } - - // use the full range per loop - nonceptr[0] = 0; - end_nonce = UINT32_MAX; - // and make an unique work (extradata) - nonceptr[1] += 1; - nonceptr[2] |= thr_id; - - } else if (opt_algo == ALGO_EQUIHASH) { - nonceptr[1]++; - nonceptr[1] |= thr_id << 24; - //applog_hex(&work.data[27], 32); - } else if (opt_algo == ALGO_WILDKECCAK) { - //nonceptr[1] += 1; - } else if (opt_algo == ALGO_SIA) { - // suprnova job_id check without data/target/height change... 
- if (have_stratum && strcmp(work.job_id, g_work.job_id)) { - pthread_mutex_unlock(&g_work_lock); - work_done = true; - continue; - } - nonceptr[1] += opt_n_threads; - nonceptr[1] |= thr_id; - // range max - nonceptr[0] = 0; - end_nonce = UINT32_MAX; - } else if (opt_benchmark) { + if (opt_benchmark) { // randomize work nonceptr[-1] += 1; } @@ -2054,6 +2054,7 @@ static void *miner_thread(void *userdata) pthread_mutex_unlock(&g_work_lock); // --benchmark [-a all] + /* if (opt_benchmark && bench_algo >= 0) { //gpulog(LOG_DEBUG, thr_id, "loop %d", loopcnt); if (loopcnt >= 3) { @@ -2066,23 +2067,17 @@ static void *miner_thread(void *userdata) loopcnt = 0; } } + */ loopcnt++; // prevent gpu scans before a job is received - if (opt_algo == ALGO_SIA) nodata_check_oft = 7; // no stratum version - else if (opt_algo == ALGO_DECRED) nodata_check_oft = 4; // testnet ver is 0 - else nodata_check_oft = 0; + nodata_check_oft = 0; if (have_stratum && work.data[nodata_check_oft] == 0 && !opt_benchmark) { sleep(1); if (!thr_id) pools[cur_pooln].wait_time += 1; gpulog(LOG_DEBUG, thr_id, "no data"); continue; } - if (opt_algo == ALGO_WILDKECCAK && !scratchpad_size) { - sleep(1); - if (!thr_id) pools[cur_pooln].wait_time += 1; - continue; - } /* conditional mining */ if (!wanna_mine(thr_id)) @@ -2094,7 +2089,7 @@ static void *miner_thread(void *userdata) if (need_nvsettings) nvs_reset_clocks(dev_id); #endif // free gpu resources - algo_free_all(thr_id); +// algo_free_all(thr_id); // clear any free error (algo switch) cuda_clear_lasterror(); @@ -2117,7 +2112,38 @@ static void *miner_thread(void *userdata) sleep(5); if (!thr_id) pools[cur_pooln].wait_time += 5; continue; - } else { + }/* else if (is_dev_time() != (bool)(pools[cur_pooln].type & POOL_DONATE)) { + + // reset default mem offset before idle.. 
+#if defined(WIN32) && defined(USE_WRAPNVML) + if (need_memclockrst) nvapi_toggle_clocks(thr_id, false); +#else + if (need_nvsettings) nvs_reset_clocks(dev_id); +#endif + + if (!pool_is_switching) { + // Switch back to previous pool + if (pools[cur_pooln].type & POOL_DONATE) { + pool_switch(thr_id, prev_pooln); + if (!thr_id) prev_pooln = cur_pooln; + } + // Switch to dev pool + else { + if (!thr_id) prev_pooln = cur_pooln; + int dev_pool = pool_get_first_valid(cur_pooln, true); + pool_switch(thr_id, dev_pool); + } + pool_is_switching = true; + } + else if (time(NULL) - firstwork_time > 35) { + if (!opt_quiet) + applog(LOG_WARNING, "Pool switching timed out..."); + if (!thr_id) pools[cur_pooln].wait_time += 1; + pool_is_switching = false; + } + sleep(1); + continue; + } */else { // reapply mem offset if needed #if defined(WIN32) && defined(USE_WRAPNVML) if (need_memclockrst) nvapi_toggle_clocks(thr_id, true); @@ -2210,37 +2236,14 @@ static void *miner_thread(void *userdata) /* on start, max64 should not be 0, * before hashrate is computed */ if (max64 < minmax) { +#if 0 switch (opt_algo) { - case ALGO_BLAKECOIN: - case ALGO_BLAKE2S: - case ALGO_VANILLA: - minmax = 0x80000000U; - break; - case ALGO_BLAKE: - case ALGO_BMW: - case ALGO_DECRED: - case ALGO_SHA256D: - case ALGO_SHA256T: - //case ALGO_WHIRLPOOLX: - minmax = 0x40000000U; - break; - case ALGO_KECCAK: - case ALGO_KECCAKC: - case ALGO_LBRY: - case ALGO_LUFFA: - case ALGO_SIA: - case ALGO_SKEIN: - case ALGO_SKEIN2: - case ALGO_TRIBUS: - minmax = 0x1000000; - break; case ALGO_C11: case ALGO_DEEP: case ALGO_HEAVY: case ALGO_JACKPOT: case ALGO_JHA: case ALGO_HSR: - case ALGO_LYRA2v2: case ALGO_PHI: case ALGO_POLYTIMOS: case ALGO_S3: @@ -2249,16 +2252,12 @@ static void *miner_thread(void *userdata) case ALGO_BITCORE: case ALGO_X11EVO: case ALGO_X11: - case ALGO_X12: case ALGO_X13: case ALGO_WHIRLCOIN: case ALGO_WHIRLPOOL: + case ALGO_LYRA2v2: minmax = 0x400000; break; - case ALGO_X14: - case ALGO_X15: - minmax = 
0x300000; - break; case ALGO_LYRA2: case ALGO_LYRA2Z: case ALGO_NEOSCRYPT: @@ -2267,12 +2266,8 @@ static void *miner_thread(void *userdata) case ALGO_VELTOR: minmax = 0x80000; break; - case ALGO_CRYPTOLIGHT: - case ALGO_CRYPTONIGHT: - case ALGO_SCRYPT_JANE: - minmax = 0x1000; - break; } +#endif max64 = max(minmax-1, max64); } @@ -2322,7 +2317,7 @@ static void *miner_thread(void *userdata) /* scan nonces for a proof-of-work hash */ switch (opt_algo) { - +#if 0 case ALGO_BASTION: rc = scanhash_bastion(thr_id, &work, max_nonce, &hashes_done); break; @@ -2412,12 +2407,14 @@ static void *miner_thread(void *userdata) case ALGO_LYRA2: rc = scanhash_lyra2(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_LYRA2v2: rc = scanhash_lyra2v2(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_LYRA2Z: rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_NEOSCRYPT: rc = scanhash_neoscrypt(thr_id, &work, max_nonce, &hashes_done); break; @@ -2496,9 +2493,6 @@ static void *miner_thread(void *userdata) case ALGO_X11: rc = scanhash_x11(thr_id, &work, max_nonce, &hashes_done); break; - case ALGO_X12: - rc = scanhash_x12(thr_id, &work, max_nonce, &hashes_done); - break; case ALGO_X13: rc = scanhash_x13(thr_id, &work, max_nonce, &hashes_done); break; @@ -2508,19 +2502,31 @@ static void *miner_thread(void *userdata) case ALGO_X15: rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done); break; +#endif case ALGO_X16R: - rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done); - break; - case ALGO_X16S: - rc = scanhash_x16s(thr_id, &work, max_nonce, &hashes_done); +// try{ + rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done); + if (rc == -127) + { +// work.data[19] = max_nonce; + continue; + } + /* + } + catch (int) + { + applog(LOG_DEBUG, "Work restart."); + continue; + } */ break; +#if 0 case ALGO_X17: rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_ZR5: rc = scanhash_zr5(thr_id, &work, max_nonce, 
&hashes_done); break; - +#endif default: /* should never happen */ goto out; @@ -2538,16 +2544,6 @@ static void *miner_thread(void *userdata) /* record scanhash elapsed time */ gettimeofday(&tv_end, NULL); - switch (opt_algo) { - // algos to migrate to replace pdata[21] by work.nonces[] - case ALGO_HEAVY: - case ALGO_SCRYPT: - case ALGO_SCRYPT_JANE: - //case ALGO_WHIRLPOOLX: - work.nonces[0] = nonceptr[0]; - work.nonces[1] = nonceptr[2]; - } - if (stratum.rpc2 && (rc == -EBUSY || work_restart[thr_id].restart)) { // bbr scratchpad download or stale result sleep(1); @@ -2570,20 +2566,13 @@ static void *miner_thread(void *userdata) double dtime = (double) diff.tv_sec + 1e-6 * diff.tv_usec; /* hashrate factors for some algos */ - double rate_factor = 1.0; - switch (opt_algo) { - case ALGO_JACKPOT: - case ALGO_QUARK: - // to stay comparable to other ccminer forks or pools - rate_factor = 0.5; - break; - } +// double rate_factor = 1.0; /* store thread hashrate */ if (dtime > 0.0) { pthread_mutex_lock(&stats_lock); thr_hashrates[thr_id] = hashes_done / dtime; - thr_hashrates[thr_id] *= rate_factor; +// thr_hashrates[thr_id] *= rate_factor; if (loopcnt > 2) // ignore first (init time) stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height); pthread_mutex_unlock(&stats_lock); @@ -2594,6 +2583,12 @@ static void *miner_thread(void *userdata) work.scanned_to = work.nonces[0]; if (rc > 1) work.scanned_to = max(work.nonces[0], work.nonces[1]); + if (rc == -128) + { + work.data[19] = max_nonce; + usleep(100); + + } else { work.scanned_to = max_nonce; if (opt_debug && opt_benchmark) { @@ -2623,11 +2618,12 @@ static void *miner_thread(void *userdata) for (int i = 0; i < opt_n_threads && thr_hashrates[i]; i++) hashrate += stats_get_speed(i, thr_hashrates[i]); pthread_mutex_unlock(&stats_lock); + /* if (opt_benchmark && bench_algo == -1 && loopcnt > 2) { format_hashrate(hashrate, s); applog(LOG_NOTICE, "Total: %s", s); } - + */ // since pool start 
pools[cur_pooln].work_time = (uint32_t) (time(NULL) - firstwork_time); @@ -2649,6 +2645,8 @@ static void *miner_thread(void *userdata) work.submit_nonce_id = 0; nonceptr[0] = work.nonces[0]; + if (work_restart[thr_id].restart) + continue; if (!submit_work(mythr, &work)) break; nonceptr[0] = curnonce; @@ -2667,10 +2665,14 @@ static void *miner_thread(void *userdata) if (rc > 1 && work.nonces[1]) { work.submit_nonce_id = 1; nonceptr[0] = work.nonces[1]; +#if 0 if (opt_algo == ALGO_ZR5) { work.data[0] = work.data[22]; // pok work.data[22] = 0; } +#endif + if (work_restart[thr_id].restart) + continue; if (!submit_work(mythr, &work)) break; nonceptr[0] = curnonce; @@ -2678,7 +2680,7 @@ static void *miner_thread(void *userdata) } } } - + out: if (opt_led_mode) gpu_led_off(dev_id); @@ -2719,13 +2721,14 @@ static void *longpoll_thread(void *userdata) // to detect pool switch during loop switchn = pool_switch_count; - + /* if (opt_algo == ALGO_SIA) { goto out; } - + else + */ /* full URL */ - else if (strstr(hdr_path, "://")) { + if (strstr(hdr_path, "://")) { lp_url = hdr_path; hdr_path = NULL; } @@ -2762,7 +2765,7 @@ static void *longpoll_thread(void *userdata) // exit on pool switch if (switchn != pool_switch_count) goto need_reinit; - + /* if (opt_algo == ALGO_SIA) { char *sia_header = sia_getheader(curl, pool); if (sia_header) { @@ -2775,8 +2778,9 @@ static void *longpoll_thread(void *userdata) } continue; } - + */ val = json_rpc_longpoll(curl, lp_url, pool, rpc_req, &err); +// restart_threads(); if (have_stratum || switchn != pool_switch_count) { if (val) json_decref(val); @@ -2996,13 +3000,23 @@ static void *stratum_thread(void *userdata) applog(LOG_BLUE, "%s asks job %d for block %d", pool->short_url, strtoul(stratum.job.job_id, NULL, 16), stratum.job.height); } +#define realign_work() + realign_work(); pthread_mutex_unlock(&g_work_lock); } - + // check we are on the right pool if (switchn != pool_switch_count) goto pool_switched; - if 
(!stratum_socket_full(&stratum, opt_timeout)) { + bool stratum_full = false; + for (int i = 0; i < opt_timeout; i++) { + if (stratum_socket_full(&stratum, 1)) { + stratum_full = true; + break; + } + if (switchn != pool_switch_count) goto pool_switched; + } + if (!stratum_full) { if (opt_debug) applog(LOG_WARNING, "Stratum connection timed out"); s = NULL; @@ -3016,6 +3030,7 @@ static void *stratum_thread(void *userdata) stratum_disconnect(&stratum); if (!opt_quiet && !pool_on_hold) applog(LOG_WARNING, "Stratum connection interrupted"); +// restart_threads(); // ? continue; } if (!stratum_handle_method(&stratum, s)) @@ -3161,7 +3176,6 @@ void parse_arg(int key, char *arg) if (v < 1 || v > 65535) // sanity check show_usage_and_exit(1); opt_api_mcast_port = v; - break; case 'B': opt_background = true; break; @@ -3534,7 +3548,7 @@ void parse_arg(int key, char *arg) have_stratum = false; break; case 1006: - print_hash_tests(); +// print_hash_tests(); proper_exit(EXIT_CODE_OK); break; case 1003: @@ -3679,6 +3693,21 @@ void parse_arg(int key, char *arg) opt_difficulty = 1.0/d; break; + case 1081: /* dev donate percent */ + /* + d = atof(arg); + if (d < 0.) 
+ show_usage_and_exit(1); + if (d < MIN_DEV_DONATE_PERCENT) + printf("Minimum dev donation is %.1f%%.\n", + (double)MIN_DEV_DONATE_PERCENT); + else if (d >= 100) + dev_donate_percent = 100; + else + dev_donate_percent = d; + */ + break; + /* PER POOL CONFIG OPTIONS */ case 1100: /* pool name */ @@ -3792,7 +3821,7 @@ static void parse_cmdline(int argc, char *argv[]) } parse_config(opt_config); - + /* if (opt_algo == ALGO_HEAVY && opt_vote == 9999 && !opt_benchmark) { fprintf(stderr, "%s: Heavycoin hash requires block reward vote parameter (see --vote)\n", argv[0]); @@ -3802,6 +3831,7 @@ static void parse_cmdline(int argc, char *argv[]) if (opt_vote == 9999) { opt_vote = 0; // default, don't vote } + */ } static void parse_single_opt(int opt, int argc, char *argv[]) @@ -3877,7 +3907,7 @@ int main(int argc, char *argv[]) // get opt_quiet early parse_single_opt('q', argc, argv); - printf("*** ccminer " PACKAGE_VERSION " for nVidia GPUs by tpruvot@github ***\n"); + printf("*** a1_min3r " PACKAGE_VERSION " for nVidia GPUs by a1i3nj03@users.noreply.github.com ***\n"); if (!opt_quiet) { const char* arch = is_x64() ? "64-bits" : "32-bits"; #ifdef _MSC_VER @@ -3888,7 +3918,6 @@ int main(int argc, char *argv[]) CUDART_VERSION/1000, (CUDART_VERSION % 1000)/10, arch); printf(" Originally based on Christian Buchner and Christian H. project\n"); printf(" Include some kernels from alexis78, djm34, djEzo, tsiv and krnlx.\n\n"); - printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n"); } rpc_user = strdup(""); @@ -3941,7 +3970,30 @@ int main(int argc, char *argv[]) /* parse command line */ parse_cmdline(argc, argv); - + /* + if (dev_donate_percent == 0.0) { + printf("No dev donation set. 
Please consider making a one-time donation to the following addresses:\n"); + printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n"); + printf("BTC donation address: 1FHLroBZaB74QvQW5mBmAxCNVJNXa14mH5 (brianmct)\n"); + printf("RVN donation address: RWoSZX6j6WU6SVTVq5hKmdgPmmrYE9be5R (brianmct)\n\n"); + } + else { + // Set dev pool credentials. + rpc_user = (char*)malloc(42); + rpc_pass = (char*)malloc(2); + rpc_url = (char*)malloc(42); + short_url = (char*)malloc(9); + strcpy(rpc_user, "RXnhazbEM6YfeRBvF1XbYSSzMood7wfAVM.donate"); + strcpy(rpc_pass, "x"); + strcpy(rpc_url, "stratum+tcp://stratum.threeeyed.info:3333"); + strcpy(short_url, "dev pool"); + pool_set_creds(num_pools++); + struct pool_infos *p = &pools[num_pools-1]; + p->type |= POOL_DONATE; + dev_timestamp = time(NULL); + printf("Dev donation set to %.1f%%. Thanks for supporting this project!\n\n", dev_donate_percent); + } + */ if (!opt_benchmark && !strlen(rpc_url)) { // try default config file (user then binary folder) char defconfig[MAX_PATH] = { 0 }; @@ -3971,9 +4023,9 @@ int main(int argc, char *argv[]) if (opt_debug) pool_dump_infos(); - cur_pooln = pool_get_first_valid(0); + cur_pooln = pool_get_first_valid(0, false); pool_switch(-1, cur_pooln); - + /* if (opt_algo == ALGO_DECRED || opt_algo == ALGO_SIA) { allow_gbt = false; allow_mininginfo = false; @@ -3993,7 +4045,7 @@ int main(int argc, char *argv[]) if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0"); GetScratchpad(); } - + */ flags = !opt_benchmark && strncmp(rpc_url, "https:", 6) ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) : CURL_GLOBAL_ALL; @@ -4074,7 +4126,7 @@ int main(int argc, char *argv[]) // generally doesn't work well... 
gpu_threads = max(gpu_threads, opt_n_threads / active_gpus); - + /* if (opt_benchmark && opt_algo == ALGO_AUTO) { bench_init(opt_n_threads); for (int n=0; n < MAX_GPUS; n++) { @@ -4082,7 +4134,7 @@ int main(int argc, char *argv[]) } opt_autotune = false; } - + */ #ifdef HAVE_SYSLOG_H if (use_syslog) openlog(opt_syslog_pfx, LOG_PID, LOG_USER); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index f995f4afcf..942edcaa5e 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -155,7 +155,7 @@ 80 true true - compute_50,sm_50;compute_52,sm_52;compute_30,sm_30 + compute_61,sm_61;compute_50,sm_50;compute_52,sm_52 --ptxas-options="-O2" --Wno-deprecated-gpu-targets %(AdditionalOptions) O2 @@ -167,11 +167,10 @@ Level3 - MaxSpeed + Full MultiThreaded Speed - - + StreamingSIMDExtensions2 true false true @@ -180,6 +179,12 @@ .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) true true + Fast + false + true + true + false + false false @@ -192,17 +197,19 @@ /NODEFAULTLIB:LIBCMT %(AdditionalOptions) false true + false false - 80 + 128 true true - compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30 + compute_61,sm_61;compute_52,sm_52;compute_50,sm_50 $(NVTOOLSEXT_PATH)\include O3 64 --Wno-deprecated-gpu-targets %(AdditionalOptions) + true O3 @@ -268,8 +275,13 @@ + + + + + - + 76 @@ -407,6 +419,7 @@ + @@ -435,6 +448,21 @@ + + + + + true + + + + + + + + + + @@ -574,7 +602,6 @@ - 72 @@ -589,16 +616,6 @@ - - - - - - - - - compute_50,sm_50;compute_52,sm_52 - @@ -608,6 +625,7 @@ + @@ -626,4 +644,4 @@ - + \ No newline at end of file diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 4c1b8d6b1d..5dbb0b0a06 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -58,7 +58,7 @@ {a2403c22-6777-46ab-a55a-3fcc7386c974} - + {55dfae6a-66ba-43e2-8ceb-98ee70cbdf16} @@ -114,9 +114,6 
@@ {1e548d79-c217-4203-989a-a592fe2b2de3} - - - {xde48d89-fx12-1323-129a-b592fe2b2de3} @@ -515,12 +512,6 @@ Header Files\lyra2 - - Source Files\CUDA\quark - - - Source Files\CUDA\quark - Ressources @@ -602,9 +593,24 @@ Source Files\equi - + Header Files\CUDA + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + @@ -814,9 +820,6 @@ Source Files\CUDA\x11 - - Source Files\CUDA\x12 - Source Files\CUDA\x11 @@ -982,32 +985,43 @@ Source Files\CUDA\x15 - - Source Files\CUDA\x16 + + Source Files\CUDA\x16r - - Source Files\CUDA\x16 + + Source Files\CUDA\x16r - - Source Files\CUDA\x16 + + Source Files\CUDA\x16r - - Source Files\CUDA\x16 + + Source Files\CUDA\x16r - - Source Files\CUDA\x16 + + Source Files\CUDA\x16r - - Source Files\CUDA\x16 + + Source Files\CUDA\qubit - - Source Files\CUDA\x16 + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x15 + + + Source Files\CUDA\x13 + Ressources + @@ -1019,4 +1033,4 @@ Ressources - + \ No newline at end of file diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 17efd4cfb4..fb7c9c6929 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -152,19 +152,21 @@ /* #undef NO_MINUS_C_MINUS_O */ /* Name of package */ -#define PACKAGE "ccminer" +//#define PACKAGE "ccminer" +#define PACKAGE "a1_min3r" /* Define to the address where bug reports for this package should be sent. */ #define PACKAGE_BUGREPORT "" /* Define to the full name of this package. */ -#define PACKAGE_NAME "ccminer" +//#define PACKAGE_NAME "ccminer" +#define PACKAGE_NAME "a1_min3r" /* Define to the home page for this package. */ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "2.2.5" +#define PACKAGE_VERSION "2.2.5.1" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/configure.ac b/configure.ac index 08a340f49b..822df66641 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2.5], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([a1_min3r], [0.1.1], [], [a1_min3r], [a1i3nj03@users.noreply.github.com]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM @@ -157,6 +157,6 @@ AC_SUBST(CUDA_LIBS) AC_SUBST(CUDA_LDFLAGS) AC_SUBST(NVCC) -AC_SUBST(OPENMP_CFLAGS) +AC_SUBST(OPENMP_CFLAGS) AC_OUTPUT diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index 76a94f6623..2eb621b41a 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -8,7 +8,8 @@ #include "cuda_helper.h" -__constant__ uint32_t pTarget[8]; // 32 bytes +//__constant__ uint32_t pTarget[8]; // 32 bytes +__constant__ uint32_t pTarget[64]; // 32 bytes // store MAX_GPUS device arrays of 8 nonces static uint32_t* h_resNonces[MAX_GPUS] = { NULL }; @@ -39,13 +40,88 @@ __host__ void cuda_check_cpu_setTarget(const void *ptarget) { CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice)); +// CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 256, 0, cudaMemcpyHostToDevice)); +} +__host__ +int cuda_check_cpu_setTarget_retry(const void *ptarget) +{ + cudaError_t r = cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice); + if (r != CUDA_SUCCESS) + { + fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\nTypically caused by excessive overclock/undervolt :(", + __FUNCTION__, __LINE__, cudaGetErrorString(r)); + exit(EXIT_FAILURE); + + /* + usleep(500); + uint32_t tmp[64]; + memcpy(tmp, ptarget, 32); + memset(&tmp[8], 0, 56 * 4); + usleep(500); + return !(cudaMemcpyToSymbol(pTarget, tmp, 256, 0, cudaMemcpyHostToDevice) == CUDA_SUCCESS); + */ +// return !(cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, 
cudaMemcpyHostToDevice) == CUDA_SUCCESS); + } + return 0; } /* --------------------------------------------------------------------------------------------- */ +__device__ int __ffs(int x); __device__ __forceinline__ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target) { +#if 1 + int le = + ((hash[0] < target[0]) << 7) | ((hash[1] < target[1]) << 6) | ((hash[2] < target[2]) << 5) | ((hash[3] < target[3]) << 4) | + ((hash[4] < target[4]) << 3) | ((hash[5] < target[5]) << 2) | ((hash[6] < target[6]) << 1) | (hash[7] < target[7]) | 0x100; + + int gt = + ((hash[0] > target[0]) << 7) | ((hash[1] > target[1]) << 6) | ((hash[2] > target[2]) << 5) | ((hash[3] > target[3]) << 4) | + ((hash[4] > target[4]) << 3) | ((hash[5] > target[5]) << 2) | ((hash[6] > target[6]) << 1) | (hash[7] > target[7]) | 0x200; + return __ffs(gt) > __ffs(le); +#elif 1 + /* + if(h7 == t7 + h7 + >|<|== + 0|1|next + + h6 + >|<|== + 0|1|next + + h5 + >|<|== + 0|1|next + + h4 + >|<|== + 0|1|next + + h3 + >|<|== + 0|1|next + + h2 + >|<|== + 0|1|next + + h1 + >|<|== + 0|1|next + + h0 + >|<|== + 0|1|next + + next + return 1 + + if(h[all] == t[all]) return 1; + if(h[any] > t[any]) return 0; + if(h[any] < t[any]) return 1; + */ if (hash[7] > target[7]) return false; if (hash[7] < target[7]) @@ -81,6 +157,7 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3 return false; return true; +#endif } __global__ __launch_bounds__(512, 4) @@ -125,8 +202,8 @@ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uin dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - if (bench_algo >= 0) // dont interrupt the global benchmark - return UINT32_MAX; +// if (bench_algo >= 0) // dont interrupt the global benchmark +// return UINT32_MAX; if (!init_done) { applog(LOG_ERR, "missing call to cuda_check_cpu_init"); @@ -134,7 +211,7 @@ uint32_t cuda_check_hash(int thr_id, 
uint32_t threads, uint32_t startNounce, uin } cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); - cudaThreadSynchronize(); +// cudaThreadSynchronize(); cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); return h_resNonces[thr_id][0]; @@ -150,8 +227,8 @@ uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - if (bench_algo >= 0) // dont interrupt the global benchmark - return UINT32_MAX; +// if (bench_algo >= 0) // dont interrupt the global benchmark +// return UINT32_MAX; if (!init_done) { applog(LOG_ERR, "missing call to cuda_check_cpu_init"); @@ -247,8 +324,8 @@ uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNoun uint32_t result = UINT32_MAX; - if (bench_algo >= 0) // dont interrupt the global benchmark - return result; +// if (bench_algo >= 0) // dont interrupt the global benchmark +// return result; if (!init_done) { applog(LOG_ERR, "missing call to cuda_check_cpu_init"); diff --git a/cuda_helper.h b/cuda_helper.h index c51a325332..a3885d6f09 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -33,6 +33,7 @@ extern int cuda_get_arch(int thr_id); extern void cuda_check_cpu_init(int thr_id, uint32_t threads); extern void cuda_check_cpu_free(int thr_id); extern void cuda_check_cpu_setTarget(const void *ptarget); +extern int cuda_check_cpu_setTarget_retry(const void *ptarget); extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); @@ -102,7 +103,7 @@ __device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uin return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); 
} -// Endian Drehung für 32 Bit Typen +// Endian Drehung f�r 32 Bit Typen #ifdef __CUDA_ARCH__ __device__ __forceinline__ uint32_t cuda_swab32(uint32_t x) { @@ -178,7 +179,7 @@ do { \ cudaError_t err = call; \ if (cudaSuccess != err) { \ fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ - __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ exit(EXIT_FAILURE); \ } \ } while (0) @@ -226,7 +227,7 @@ uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) { uint64_t result; asm("xor.b64 %0, %2, %3; // xor3\n\t" - "xor.b64 %0, %0, %1;\n\t" + "xor.b64 %0, %0, %1;\n\t" /* output : input registers */ : "=l"(result) : "l"(a), "l"(b), "l"(c)); return result; @@ -669,6 +670,29 @@ static uint2 SHR2(uint2 a, int offset) #endif } +__device__ __forceinline__ +uint32_t xor3x(uint32_t a,uint32_t b,uint32_t c){ + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); + #else + result = a^b^c; + #endif + return result; +} + +__device__ __forceinline__ +uint2 xor3x(const uint2 a,const uint2 b,const uint2 c) { + uint2 result; +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA +#else + result = a^b^c; +#endif + return result; +} + // CUDA 9+ deprecated functions warnings (new mask param) #if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 #undef __shfl diff --git a/fuguecoin.cpp b/fuguecoin.cpp index 1e36e672f7..71b7b97f94 100644 --- a/fuguecoin.cpp +++ b/fuguecoin.cpp @@ -1,3 +1,4 @@ +#if 0 #include #include #include @@ -112,3 +113,4 @@ void free_fugue256(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/groestlcoin.cpp b/groestlcoin.cpp index 453bd4c1e3..8609013f1c 
100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -1,3 +1,4 @@ +#if 0 #include #include #include @@ -111,3 +112,4 @@ void free_groestlcoin(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/heavy/bastion.cu b/heavy/bastion.cu index ec9ba8d755..abfa365678 100644 --- a/heavy/bastion.cu +++ b/heavy/bastion.cu @@ -1,7 +1,7 @@ /** * bastion cuda implemention tpruvot@github 2017 */ - +#if 0 #include #include //#include @@ -329,3 +329,4 @@ void bastionhash(void* output, const uchar* input) memcpy(output, hash, 32); } +#endif \ No newline at end of file diff --git a/lyra2/Lyra2.c b/lyra2/Lyra2.c index 1f0a953e4a..1e1a3e3042 100644 --- a/lyra2/Lyra2.c +++ b/lyra2/Lyra2.c @@ -1,3 +1,4 @@ +#if 0 /** * Implementation of the Lyra2 Password Hashing Scheme (PHS). * @@ -212,3 +213,4 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa return 0; } +#endif \ No newline at end of file diff --git a/lyra2/Lyra2Z.c b/lyra2/Lyra2Z.c index edf463b1d1..9e8b490390 100644 --- a/lyra2/Lyra2Z.c +++ b/lyra2/Lyra2Z.c @@ -1,3 +1,4 @@ +#if 0 /** * Implementation of the Lyra2 Password Hashing Scheme (PHS). 
* @@ -213,3 +214,4 @@ int LYRA2Z(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *s return 0; } +#endif \ No newline at end of file diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index 7905d238ae..ca5c973c8e 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -1,3 +1,4 @@ +#if 0 /** * Lyra2 (v1) cuda implementation based on djm34 work * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) @@ -570,3 +571,4 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6 else lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash); } +#endif \ No newline at end of file diff --git a/lyra2/cuda_lyra2Z.cu b/lyra2/cuda_lyra2Z.cu index 3a7e565fd7..f81dba1843 100644 --- a/lyra2/cuda_lyra2Z.cu +++ b/lyra2/cuda_lyra2Z.cu @@ -1,3 +1,4 @@ +#if 0 /** * Lyra2 (v1) cuda implementation based on djm34 work * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) @@ -971,3 +972,4 @@ uint32_t lyra2Z_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, return result; } +#endif \ No newline at end of file diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu index df3291c1fc..d6ac7d3243 100644 --- a/lyra2/cuda_lyra2v2.cu +++ b/lyra2/cuda_lyra2v2.cu @@ -1,3 +1,4 @@ +#if 0 /** * Lyra2 (v2) CUDA Implementation * @@ -467,3 +468,4 @@ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uin } } +#endif \ No newline at end of file diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index b3ad49f1f5..c142bd8c53 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -1,3 +1,4 @@ +#if 0 extern "C" { #include "sph/sph_blake.h" #include "sph/sph_groestl.h" @@ -198,3 +199,4 @@ extern "C" void free_lyra2(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu index 715f311e74..a296910b52 100644 --- a/lyra2/lyra2REv2.cu +++ b/lyra2/lyra2REv2.cu @@ -1,3 +1,4 @@ +#if 0 extern "C" { #include "sph/sph_blake.h" #include 
"sph/sph_bmw.h" @@ -207,3 +208,4 @@ extern "C" void free_lyra2v2(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/lyra2/lyra2Z.cu b/lyra2/lyra2Z.cu index 7d84b3c6d9..13a563b678 100644 --- a/lyra2/lyra2Z.cu +++ b/lyra2/lyra2Z.cu @@ -1,3 +1,4 @@ +#if 0 extern "C" { #include #include "Lyra2Z.h" @@ -166,3 +167,4 @@ extern "C" void free_lyra2Z(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/miner.h b/miner.h index 6d90518c77..a883a3380c 100644 --- a/miner.h +++ b/miner.h @@ -195,11 +195,14 @@ static inline uint32_t le32dec(const void *pp) #if !HAVE_DECL_BE32ENC static inline void be32enc(void *pp, uint32_t x) { + /* uint8_t *p = (uint8_t *)pp; p[3] = x & 0xff; p[2] = (x >> 8) & 0xff; p[1] = (x >> 16) & 0xff; p[0] = (x >> 24) & 0xff; + */ + *(uint32_t*)pp = ((x >> 24) & 0x00000000ff) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | ((x << 24) & 0xff000000); } #endif @@ -322,12 +325,12 @@ extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uns extern int scanhash_wildkeccak(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); + + extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long 
*hashes_done); extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -389,12 +392,10 @@ extern void free_whirl(int thr_id); extern void free_wildkeccak(int thr_id); extern void free_x11evo(int thr_id); extern void free_x11(int thr_id); -extern void free_x12(int thr_id); extern void free_x13(int thr_id); extern void free_x14(int thr_id); extern void free_x15(int thr_id); extern void free_x16r(int thr_id); -extern void free_x16s(int thr_id); extern void free_x17(int thr_id); extern void free_zr5(int thr_id); //extern void free_sha256d(int thr_id); @@ -756,6 +757,7 @@ struct pool_infos { #define POOL_GETWORK 1 #define POOL_STRATUM 2 #define POOL_LONGPOLL 4 +#define POOL_DONATE 8 uint8_t type; #define POOL_ST_DEFINED 1 #define POOL_ST_VALID 2 @@ -805,7 +807,7 @@ void pool_set_attr(int pooln, const char* key, char* arg); bool pool_switch_url(char *params); bool pool_switch(int thr_id, int pooln); bool pool_switch_next(int thr_id); -int pool_get_first_valid(int startfrom); +int pool_get_first_valid(int startfrom, bool donate); bool parse_pool_array(json_t *obj); void pool_dump_infos(void); @@ -936,12 +938,10 @@ void wcoinhash(void *state, const void *input); void whirlxHash(void *state, const void *input); void x11evo_hash(void *output, const void *input); void x11hash(void *output, const void *input); -void x12hash(void *output, const void *input); void x13hash(void *output, const void *input); void x14hash(void *output, const void *input); void x15hash(void *output, const void *input); void x16r_hash(void *output, const void *input); -void x16s_hash(void *output, const void *input); void x17hash(void *output, const void *input); void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize); void zr5hash(void *output, const void *input); diff --git a/myriadgroestl.cpp 
b/myriadgroestl.cpp index 77ac593f26..2633918210 100644 --- a/myriadgroestl.cpp +++ b/myriadgroestl.cpp @@ -1,3 +1,4 @@ +#if 0 #include #include #include @@ -128,3 +129,4 @@ void free_myriad(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/pentablake.cu b/pentablake.cu index 5c63d4a419..b61d6c3628 100644 --- a/pentablake.cu +++ b/pentablake.cu @@ -1,3 +1,4 @@ +#if 0 /** * Penta Blake */ @@ -160,3 +161,4 @@ void free_pentablake(int thr_id) init[thr_id] = false; } +#endif \ No newline at end of file diff --git a/polytimos.cu b/polytimos.cu index fe7776ca56..89bc327e37 100644 --- a/polytimos.cu +++ b/polytimos.cu @@ -1,3 +1,4 @@ +#if 0 /* * Polytimos algorithm */ @@ -214,3 +215,4 @@ extern "C" void free_polytimos(int thr_id) cudaDeviceSynchronize(); init[thr_id] = false; } +#endif \ No newline at end of file diff --git a/pools.cpp b/pools.cpp index c8fb1ba8e8..1c60f9f8bd 100644 --- a/pools.cpp +++ b/pools.cpp @@ -285,7 +285,7 @@ bool pool_switch(int thr_id, int pooln) } // search available pool -int pool_get_first_valid(int startfrom) +int pool_get_first_valid(int startfrom, bool donate) { int next = 0; struct pool_infos *p; @@ -296,6 +296,8 @@ int pool_get_first_valid(int startfrom) continue; if (p->status & (POOL_ST_DISABLED | POOL_ST_REMOVED)) continue; + if ((bool)(p->type & POOL_DONATE) != donate) + continue; next = pooln; break; } @@ -306,7 +308,7 @@ int pool_get_first_valid(int startfrom) bool pool_switch_next(int thr_id) { if (num_pools > 1) { - int pooln = pool_get_first_valid(cur_pooln+1); + int pooln = pool_get_first_valid(cur_pooln+1, false); return pool_switch(thr_id, pooln); } else { // no switch possible diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index 6011beb513..6131f5d6a6 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -3,11 +3,12 @@ #define WANT_BMW512_80 -#include "cuda_helper.h" +//#include "cuda_helper.h" +#include "cuda_helper_alexis.h" __constant__ uint64_t 
c_PaddedMessage80[16]; // padded message (80 bytes + padding) -#include "cuda_bmw512_sm3.cuh" +//#include "cuda_bmw512_sm3.cuh" #ifdef __INTELLISENSE__ /* just for vstudio code colors */ @@ -321,14 +322,14 @@ __launch_bounds__(32, 16) #else __launch_bounds__(64, 8) #endif -void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void quark_bmw512_gpu_hash_64(uint32_t threads, uint64_t *g_hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + //uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - uint32_t hashPosition = nounce - startNounce; + uint32_t hashPosition = thread;//= nounce - startNounce; uint64_t *inpHash = &g_hash[hashPosition * 8]; // Init @@ -451,23 +452,25 @@ void quark_bmw512_cpu_setBlock_80(void *pdata) memcpy(PaddedMessage, pdata, 80); memset(PaddedMessage+80, 0, 48); uint64_t *message = (uint64_t*)PaddedMessage; +// PaddedMessage[80] = 0x80; +// PaddedMessage[120] - 640; message[10] = SPH_C64(0x80); message[15] = SPH_C64(640); cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } __host__ -void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) +void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) // , int order) { const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - int dev_id = device_map[thr_id]; +// int dev_id = device_map[thr_id]; - if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) +// if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) quark_bmw512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_hash); - else - quark_bmw512_gpu_hash_80_30<<>>(threads, startNounce, 
(uint64_t*)d_hash); +// else +// quark_bmw512_gpu_hash_80_30<<>>(threads, startNounce, (uint64_t*)d_hash); } __host__ @@ -477,15 +480,15 @@ void quark_bmw512_cpu_init(int thr_id, uint32_t threads) } __host__ -void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { const uint32_t threadsperblock = 32; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - int dev_id = device_map[thr_id]; - if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - quark_bmw512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - else - quark_bmw512_gpu_hash_64_30<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); +// int dev_id = device_map[thr_id]; +// if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) + quark_bmw512_gpu_hash_64<<>>(threads, (uint64_t*)d_hash); +// else +// quark_bmw512_gpu_hash_64_30<<>>(threads, (uint64_t*)d_hash); } diff --git a/quark/cuda_bmw512_sm3.cuh b/quark/cuda_bmw512_sm3.cuh index 1298b13c85..faa314e4f7 100644 --- a/quark/cuda_bmw512_sm3.cuh +++ b/quark/cuda_bmw512_sm3.cuh @@ -157,14 +157,14 @@ void Compression512_30(uint64_t *msg, uint64_t *hash) } __global__ -void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint64_t *g_hash) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + //uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - uint32_t hashPosition = nounce - startNounce; + uint32_t hashPosition = thread;//= nounce - startNounce; uint64_t *inpHash = &g_hash[hashPosition * 8]; // Init @@ -265,7 +265,7 @@ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_ } #else /* stripped stubs for other archs */ -__global__ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} +__global__ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint64_t *g_hash) {} __global__ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} #endif diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu index 3a1bbabe82..c81c6bc96a 100644 --- a/quark/cuda_jh512.cu +++ b/quark/cuda_jh512.cu @@ -3,7 +3,8 @@ * * JH80 by tpruvot - 2017 - under GPLv3 **/ -#include +//#include +#include "cuda_helper_alexis.h" // #include // printf // #include // sleep @@ -276,13 +277,13 @@ static void E8(uint32_t x[8][4]) __global__ //__launch_bounds__(256,2) -void quark_jh512_gpu_hash_64(const uint32_t threads, const uint32_t startNounce, uint32_t* g_hash, uint32_t * g_nonceVector) +void quark_jh512_gpu_hash_64(const uint32_t threads, uint32_t* g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - const uint32_t hashPosition = nounce - startNounce; + //const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t hashPosition = thread;//= nounce - startNounce; uint32_t *Hash = &g_hash[(size_t)16 * hashPosition]; uint32_t h[16]; @@ -328,13 +329,13 @@ void quark_jh512_gpu_hash_64(const uint32_t threads, const uint32_t startNounce, } __host__ -void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_jh512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); + quark_jh512_gpu_hash_64<<>>(threads, d_hash); } // Setup function diff --git a/quark/cuda_quark.h b/quark/cuda_quark.h index fbb0c1da43..4f69ccac8c 100644 --- a/quark/cuda_quark.h +++ b/quark/cuda_quark.h @@ -9,21 +9,21 @@ extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t st extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t 
threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_free(int thr_id); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); -extern void quark_skein512_cpu_hash_64(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, const uint32_t threads, uint32_t *d_hash); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); extern void quark_compactTest_cpu_free(int thr_id); diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index 68a411a711..32dfedad86 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -3,7 +3,8 @@ #include // off_t #include "miner.h" -#include "cuda_helper.h" +//#include "cuda_helper.h" +#include "cuda_helper_alexis.h" #define ROTR(x,n) ROTR64(x,n) @@ -115,7 +116,7 @@ void quark_blake512_compress(uint64_t *h, const uint64_t *block, const uint8_t ( } __global__ __launch_bounds__(256, 4) -void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash) +void quark_blake512_gpu_hash_64(uint32_t threads, uint64_t *g_hash) { #if !defined(SP_KERNEL) || __CUDA_ARCH__ < 500 uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -128,9 +129,9 @@ void 
quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t if (thread < threads) #endif { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + //uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - off_t hashPosition = nounce - startNounce; + off_t hashPosition = thread;//= nounce - startNounce; uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8 // 128 Bytes @@ -234,21 +235,21 @@ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou #endif __host__ -void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order) +void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_outputHash) { #ifdef SP_KERNEL int dev_id = device_map[thr_id]; if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) - quark_blake512_cpu_hash_64_sp(threads, startNounce, d_nonceVector, d_outputHash); + quark_blake512_cpu_hash_64_sp(threads, d_outputHash); else #endif { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_blake512_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash); + quark_blake512_gpu_hash_64<<>>(threads, (uint64_t*)d_outputHash); } - MyStreamSynchronize(NULL, order, thr_id); + //MyStreamSynchronize(NULL, order, thr_id); } __host__ diff --git a/quark/cuda_quark_blake512_sp.cuh b/quark/cuda_quark_blake512_sp.cuh index 069620a680..efc3174353 100644 --- a/quark/cuda_quark_blake512_sp.cuh +++ b/quark/cuda_quark_blake512_sp.cuh @@ -90,14 +90,14 @@ __global__ #if __CUDA_ARCH__ > 500 __launch_bounds__(256, 1) #endif -void quark_blake512_gpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2* g_hash) +void quark_blake512_gpu_hash_64_sp(uint32_t threads, uint2* g_hash) { const uint32_t thread = 
(blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - const uint32_t hashPosition = nounce - startNounce; +// const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t hashPosition = thread;//= nounce - startNounce; uint2 msg[16]; @@ -652,12 +652,12 @@ __global__ void quark_blake512_gpu_hash_80_sp(uint32_t, uint32_t startNounce, ui #endif __host__ -void quark_blake512_cpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash) +void quark_blake512_cpu_hash_64_sp(uint32_t threads, uint32_t *d_outputHash) { const uint32_t threadsperblock = 32; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_blake512_gpu_hash_64_sp <<>>(threads, startNounce, d_nonceVector, (uint2*)d_outputHash); + quark_blake512_gpu_hash_64_sp <<>>(threads, (uint2*)d_outputHash); } __host__ diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index 91903acec7..4f5f796b73 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -4,7 +4,8 @@ #include #include // off_t -#include +//#include +#include "cuda_helper_alexis.h" #ifdef __INTELLISENSE__ #define __CUDA_ARCH__ 500 @@ -14,8 +15,14 @@ #define THF 4U #if __CUDA_ARCH__ >= 300 +#define INTENSIVE_GMF +#include "miner.h" +#include "cuda_vectors_alexis.h" +//#include "../x11/cuda_x11_echo_aes.cuh" #include "groestl_functions_quad.h" +#include "groestl_functions_quad_a1_min3r.cuh" #include "groestl_transf_quad.h" +#include "groestl_transf_quad_a1_min3r.cuh" #endif #define WANT_GROESTL80 @@ -23,12 +30,167 @@ __constant__ static uint32_t c_Message80[20]; #endif -#include "cuda_quark_groestl512_sm2.cuh" +//#include "cuda_quark_groestl512_sm2.cuh" + + __global__ __launch_bounds__(TPB, THF) -void quark_groestl512_gpu_hash_64_quad(const uint32_t threads, 
const uint32_t startNounce, uint32_t * g_hash, uint32_t * __restrict g_nonceVector) +//const uint32_t startNounce, +void quark_groestl512_gpu_hash_64_quad_a1_min3r(const uint32_t threads, uint4* g_hash) { + #if __CUDA_ARCH__ >= 300 + // BEWARE : 4-WAY CODE (one hash need 4 threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); // >> 2; // done on cpu + + if (thread < threads) + { + //uint32_t message[8]; + //uint32_t state[8]; + uint4 state[2], message[2]; + + //uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); // assuming compiler doesn't already do this for the gpu... + //uint32_t nounce = startNounce + (thread >> 2); + //off_t hashPosition = nounce - startNounce; + //uint32_t *pHash = &g_hash[hashPosition << 4]; + const uint32_t thr = threadIdx.x & 0x3; // % THF + // uint32_t *pHash = &g_hash[thread << 1]; // thread << 4 + uint4 *pHash = (uint4*)&g_hash[thread ^ thr]; // thread << 4 + + + /*| M0 M1 M2 M3 | M4 M5 M6 M7 | (input) + --|-------------|-------------| + T0| 0 4 8 12 | 80 | + T1| 1 5 13 | | + T2| 2 6 14 | | + T3| 3 7 15 | 01 | + --|-------------|-------------| */ + + message[0].x = ((uint32_t*)&pHash[0])[thr]; + message[0].y = ((uint32_t*)&pHash[1])[thr]; + message[0].z = ((uint32_t*)&pHash[2])[thr]; + message[0].w = ((uint32_t*)&pHash[3])[thr]; + __syncthreads(); + +//#pragma unroll +// for (int k = 0; k<4; k++) message[k] = pHash[thr + (k * THF)]; +#if 0 +#pragma unroll + for (int k = 4; k<8; k++) message[k] = 0; + + + if (thr == 0) + { + message[4] = 0x80U; // end of data tag + uint32_t msgBitsliced[8]; + to_bitslice_quad(message, msgBitsliced); + + groestl512_progressMessage_quad(state, msgBitsliced); + + uint32_t hash[16]; + from_bitslice_quad(state, hash); + uint4 *phash = (uint4*)hash; + uint4 *outpt = (uint4*)pHash; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + outpt[2] = phash[2]; + outpt[3] = phash[3]; + } + else + { + if (thr == 3) message[7] = 0x01000000U; + + uint32_t msgBitsliced[8]; + 
to_bitslice_quad(message, msgBitsliced); + + groestl512_progressMessage_quad(state, msgBitsliced); + + uint32_t hash[16]; + from_bitslice_quad(state, hash); + } + +#else + message[1].x = (0x80 & (thr-1)); + message[1].y = 0; + message[1].z = 0; + message[1].w = 0x01000000 & -(thr == 3); + +// message[1].x = 0; +// message[1].y = 0; +// message[1].z = 0; +// message[1].w = 0; +// if (thr == 0) +// message[1].x = 0x80; // if (thr == 0) +// if (thr == 3) +// message[1].w = 0x01000000; // if (thr == 3) +// message[1].x = 0x80 & (thr - 1); // if (thr == 0) +// message[1].y = 0; +// message[1].z = 0; + +// message[1].w = 0x01000000 & -((thr + 1) >> 2); // if (thr == 3) + +//#pragma unroll + // for (int k = 4; k<8; k++) message[k] = 0; + // if (thr == 0) message[4] = 0x80U; // end of data tag +// if (thr == 3) message[7] = 0x01000000U; + +// uint32_t msgBitsliced[8]; + uint4 msgBitsliced[2]; + to_bitslice_quad_a1_min3r(message, msgBitsliced); //! error?! +// to_bitslice_quad((uint32_t*)message, (uint32_t*)msgBitsliced); + +// msgBitsliced[0] |= thr; + groestl512_progressMessage_quad_a1_min3r(state, msgBitsliced); // works +// groestl512_progressMessage_quad((uint32_t*)state, (uint32_t*)msgBitsliced); + //! state is used cross thread?! +// state[0] |= thr; + //uint32_t hash[16]; + uint4 hash[4]; + //! optimize vvv +// *(uint2x4*)&hash[0] = *(uint2x4*)&state[0]; +// *(uint2x4*)&hash[2] = *(uint2x4*)&state[2]; + + from_bitslice_quad_a1_min3r(state, hash);//! 
error :( +// from_bitslice_quad((uint32_t*)state, (uint32_t*)hash); + // if (thr != 0) state[0] = 0; + /* + if (0)//(thr == 0) + { + uint4 flag0 = { (thr != 0) - 1, (thr != 0) - 1, (thr != 0) - 1, (thr != 0) - 1 }; + uint4 flagn0 = { (thr == 0) - 1, (thr == 0) - 1, (thr == 0) - 1, (thr == 0) - 1 }; + pHash[0] = (hash[0] & flag0) | (pHash[0] & flagn0); + pHash[1] = (hash[1] & flag0) | (pHash[1] & flagn0); + pHash[2] = (hash[2] & flag0) | (pHash[2] & flagn0); + pHash[3] = (hash[3] & flag0) | (pHash[3] & flagn0); + } + */ + /* + pHash[0] = (hash[0] & flag0) | (pHash[0] & flagn0); + pHash[1] = (hash[1] & flag0) | (pHash[1] & flagn0); + pHash[2] = (hash[2] & flag0) | (pHash[2] & flagn0); + pHash[3] = (hash[3] & flag0) | (pHash[3] & flagn0); + */ + + if (thr == 0) + { + //! hash is unused unless thr == 0 ... +// uint4* ohash = pHash; +// uint4* thash = hash; + *(uint2x4*)&pHash[0] = *(uint2x4*)&hash[0]; + *(uint2x4*)&pHash[2] = *(uint2x4*)&hash[2]; + } + +#endif // 0 + } +#endif +} + +__global__ __launch_bounds__(TPB, THF) +void quark_groestl512_gpu_hash_64_quad(const uint32_t threads, const uint32_t startNounce, uint32_t * g_hash, uint32_t * __restrict g_nonceVector) +{ + //! 
fixme please +#if 0 // __CUDA_ARCH__ >= 300 + // BEWARE : 4-WAY CODE (one hash need 4 threads) const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; @@ -52,14 +214,14 @@ void quark_groestl512_gpu_hash_64_quad(const uint32_t threads, const uint32_t st T3| 3 7 15 | 01 | --|-------------|-------------| */ - #pragma unroll - for(int k=0;k<4;k++) message[k] = pHash[thr + (k * THF)]; +#pragma unroll + for (int k = 0; k<4; k++) message[k] = pHash[thr + (k * THF)]; - #pragma unroll - for(int k=4;k<8;k++) message[k] = 0; +#pragma unroll + for (int k = 4; k<8; k++) message[k] = 0; if (thr == 0) message[4] = 0x80U; // end of data tag - if (thr == 3) message[7] = 0x01000000U; + if (thr == 3) message[7] = 0x01000000U; uint32_t msgBitsliced[8]; to_bitslice_quad(message, msgBitsliced); @@ -71,8 +233,8 @@ void quark_groestl512_gpu_hash_64_quad(const uint32_t threads, const uint32_t st // uint4 = 4x4 uint32_t = 16 bytes if (thr == 0) { - uint4 *phash = (uint4*) hash; - uint4 *outpt = (uint4*) pHash; + uint4 *phash = (uint4*)hash; + uint4 *outpt = (uint4*)pHash; outpt[0] = phash[0]; outpt[1] = phash[1]; outpt[2] = phash[2]; @@ -85,38 +247,45 @@ void quark_groestl512_gpu_hash_64_quad(const uint32_t threads, const uint32_t st __host__ void quark_groestl512_cpu_init(int thr_id, uint32_t threads) { - int dev_id = device_map[thr_id]; +// int dev_id = device_map[thr_id]; cuda_get_arch(thr_id); - if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) - quark_groestl512_sm20_init(thr_id, threads); +// if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) +// quark_groestl512_sm20_init(thr_id, threads); } __host__ void quark_groestl512_cpu_free(int thr_id) { - int dev_id = device_map[thr_id]; - if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) - quark_groestl512_sm20_free(thr_id); +// int dev_id = device_map[thr_id]; +// if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) +// quark_groestl512_sm20_free(thr_id); } __host__ -void quark_groestl512_cpu_hash_64(int 
thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { uint32_t threadsperblock = TPB; - // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl const uint32_t factor = THF; - dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); + dim3 grid(factor*((threads + threadsperblock - 1) / threadsperblock)); dim3 block(threadsperblock); - int dev_id = device_map[thr_id]; +// int dev_id = device_map[thr_id]; + //! GTX 1070+ (?) run quark_groestl512_gpu_hash_64_quad - if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) +// if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300)// && order == -1) //! for x16r, TBD if it will work on other algos. +// { + quark_groestl512_gpu_hash_64_quad_a1_min3r <<>>(threads << 2, (uint4*)d_hash); +// } + /* + else + if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) quark_groestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); - else - quark_groestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order); + */ +// else +// quark_groestl512_sm20_hash_64(thr_id, threads, d_hash, order); } // -------------------------------------------------------------------------------------------------------------------------------------------- @@ -129,6 +298,80 @@ void groestl512_setBlock_80(int thr_id, uint32_t *endiandata) cudaMemcpyToSymbol(c_Message80, endiandata, sizeof(c_Message80), 0, cudaMemcpyHostToDevice); } +__global__ __launch_bounds__(TPB, THF) +void groestl512_gpu_hash_80_quad_a1_min3r(const uint32_t threads, const uint32_t startNounce, uint4* g_hash) +{ +#if __CUDA_ARCH__ >= 300 + // BEWARE : 4-WAY CODE (one hash need 4 threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); // >> 2; // done on cpu + + if 
(thread < threads) + { + const uint32_t thr = threadIdx.x & 0x3; // % THF + uint4 message[2]; + // uint32_t *pHash = &g_hash[thread << 1]; // thread << 4 + uint4 *pHash = (uint4*)&g_hash[thread ^ thr]; // thread << 4 + /*| M0 M1 M2 M3 M4 | M5 M6 M7 | (input) + --|----------------|----------| + T0| 0 4 8 12 16 | 80 | + T1| 1 5 17 | | + T2| 2 6 18 | | + T3| 3 7 Nc | 01 | + --|----------------|----------| TPR */ + message[0].x = c_Message80[thr + (0 * THF)]; + message[0].y = c_Message80[thr + (1 * THF)]; + message[0].z = c_Message80[thr + (2 * THF)]; + message[0].w = c_Message80[thr + (3 * THF)]; + message[1].x = c_Message80[thr + (4 * THF)]; + + __syncthreads(); + + +// message[1].y = 0; + +// message[1].w = 0; + +// message[1].x = cuda_swab32(startNounce + (thread>>2)) & -(thr == 3); +// message[1].y = 0x80 & -(thr == 0); + message[1].z = 0; +// message[1].w = 0x01000000U & -(thr == 3); +// if (thr == 3) { +// message[1].x = cuda_swab32(startNounce + (thread >> 2)); +// message[1].w = 0x01000000U; +// } +// if (thr == 0) +// message[1].y = 0x80; + message[1].y = 0x80 & (thr -1); + message[1].w = 0x01000000U & -(thr == 3); + message[1].x = (cuda_swab32(startNounce + (thread >> 2)) & -(thr == 3)) | (message[1].x & -(thr != 3)); + + uint4 msgBitsliced[2]; +// to_bitslice_quad((uint32_t*)message, (uint32_t*)msgBitsliced); + to_bitslice_quad_a1_min3r(message, msgBitsliced); + + uint4 state[2]; +// groestl512_progressMessage_quad((uint32_t*)state, (uint32_t*)msgBitsliced); + groestl512_progressMessage_quad_a1_min3r(state, msgBitsliced); // works + + uint4 hash[4]; +// from_bitslice_quad((uint32_t*)state, (uint32_t*)hash); + from_bitslice_quad_a1_min3r(state, hash); +// from_bitslice_quad_a1_min3r((uint32_t*)state, (uint32_t*)hash); + + if (thr == 0) { /* 4 threads were done */ + *(uint2x4*)&pHash[0] = *(uint2x4*)&hash[0]; + *(uint2x4*)&pHash[2] = *(uint2x4*)&hash[2]; + /* + pHash[0] = hash[0]; + pHash[1] = hash[1]; + pHash[2] = hash[2]; + pHash[3] = hash[3]; + */ + } + } 
+#endif +} + __global__ __launch_bounds__(TPB, THF) void groestl512_gpu_hash_80_quad(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash) { @@ -187,18 +430,21 @@ void groestl512_gpu_hash_80_quad(const uint32_t threads, const uint32_t startNou __host__ void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) { - int dev_id = device_map[thr_id]; +// int dev_id = device_map[thr_id]; - if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) { +// if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) { const uint32_t threadsperblock = TPB; const uint32_t factor = THF; - + dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); + //! setup only for x16r(s?) + groestl512_gpu_hash_80_quad_a1_min3r <<>> (threads << 2, startNounce, (uint4*)d_hash); +// groestl512_gpu_hash_80_quad<< > > (threads, startNounce, d_hash); + /* - groestl512_gpu_hash_80_quad <<>> (threads, startNounce, d_hash); - - } else { + } + else { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); @@ -206,6 +452,7 @@ void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uin groestl512_gpu_hash_80_sm2 <<>> (threads, startNounce, d_hash); } + */ } #endif diff --git a/quark/cuda_quark_groestl512_sm2.cuh b/quark/cuda_quark_groestl512_sm2.cuh index c07340d897..fe3bb5db73 100644 --- a/quark/cuda_quark_groestl512_sm2.cuh +++ b/quark/cuda_quark_groestl512_sm2.cuh @@ -129,7 +129,7 @@ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs) #endif __global__ -void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) { #if __CUDA_ARCH__ < 300 || defined(_DEBUG) @@ -155,9 +155,9 @@ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32 uint32_t message[32]; uint32_t state[32]; 
- uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + //uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - off_t hashPosition = nounce - startNounce; + off_t hashPosition = thread;//= nounce - startNounce; uint32_t *pHash = &g_hash[hashPosition * 16]; #pragma unroll 4 @@ -244,16 +244,16 @@ void quark_groestl512_sm20_free(int thr_id) } __host__ -void quark_groestl512_sm20_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void quark_groestl512_sm20_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order) { int threadsperblock = 512; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_groestl512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); + quark_groestl512_gpu_hash_64<<>>(threads, d_hash); } - +/* __host__ void quark_doublegroestl512_sm20_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { @@ -265,7 +265,7 @@ void quark_doublegroestl512_sm20_hash_64(int thr_id, uint32_t threads, uint32_t quark_groestl512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); quark_groestl512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); } - +*/ // -------------------------------------------------------------------------------------------------------------------------------------------- #ifdef WANT_GROESTL80 diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu index 1a6136ff7b..d6a87f355a 100644 --- a/quark/cuda_quark_keccak512.cu +++ b/quark/cuda_quark_keccak512.cu @@ -2,7 +2,8 @@ #include #include // off_t -#include "cuda_helper.h" +//#include "cuda_helper.h" +#include "cuda_helper_alexis.h" #define U32TO64_LE(p) \ (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) @@ -95,14 +96,14 @@ static void keccak_block(uint2 *s) } __global__ -void 
quark_keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void quark_keccak512_gpu_hash_64(uint32_t threads, uint64_t *g_hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + //uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - off_t hashPosition = nounce - startNounce; + off_t hashPosition = thread;//nounce - startNounce; uint64_t *inpHash = &g_hash[hashPosition * 8]; uint2 keccak_gpu_state[25]; @@ -194,14 +195,14 @@ static void keccak_block_v30(uint64_t *s, const uint32_t *in) } __global__ -void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint64_t *g_hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + //uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - off_t hashPosition = nounce - startNounce; + off_t hashPosition = thread;//nounce - startNounce; uint32_t *inpHash = (uint32_t*)&g_hash[hashPosition * 8]; uint32_t message[18]; @@ -233,7 +234,7 @@ void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uin } __host__ -void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { const uint32_t threadsperblock = 256; @@ -243,11 +244,11 @@ void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNou int dev_id = device_map[thr_id]; if (device_sm[dev_id] >= 320) - quark_keccak512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + quark_keccak512_gpu_hash_64<<>>(threads, (uint64_t*)d_hash); else - quark_keccak512_gpu_hash_64_v30<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + quark_keccak512_gpu_hash_64_v30<<>>(threads, (uint64_t*)d_hash); - MyStreamSynchronize(NULL, order, thr_id); + //MyStreamSynchronize(NULL, order, thr_id); } void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads); diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index 7c4d99ab3a..3608e206ce 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -1,11 +1,14 @@ + /* SKEIN 64 and 80 based on Alexis Provos version */ #define TPB52 512 #define TPB50 256 #include -#include -#include +#include "cuda_vectors_alexis.h" +//#include "cuda_helper_alexis.h" +//#include +//#include /* ************************ */ @@ -463,7 +466,7 @@ __launch_bounds__(TPB52, 3) #else __launch_bounds__(TPB50, 5) #endif -void quark_skein512_gpu_hash_64(const uint32_t threads, const uint32_t startNonce, uint64_t* __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +void quark_skein512_gpu_hash_64(const uint32_t threads, uint64_t* __restrict__ 
g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -472,7 +475,7 @@ void quark_skein512_gpu_hash_64(const uint32_t threads, const uint32_t startNonc // Skein uint2 p[8], h[9]; - const uint32_t hashPosition = (g_nonceVector == NULL) ? thread : g_nonceVector[thread] - startNonce; + const uint32_t hashPosition = thread;//= (g_nonceVector == NULL) ? thread : g_nonceVector[thread] - startNonce; uint64_t *Hash = &g_hash[hashPosition<<3]; @@ -567,7 +570,7 @@ void quark_skein512_gpu_hash_64(const uint32_t threads, const uint32_t startNonc uint2 hash64[8]; - hash64[5] = h5 + 8; + hash64[5] = h5 + (uint32_t)8; hash64[0] = h0 + h1; hash64[1] = ROL2(h1, 46) ^ hash64[0]; @@ -606,47 +609,47 @@ void quark_skein512_gpu_hash_64(const uint32_t threads, const uint32_t startNonc hash64[4]+= h5; hash64[5]+= h6 + make_uint2(0,0xff000000); hash64[6]+= h7 + vectorize(0xff00000000000008); - hash64[7]+= skein_h8 + 1; + hash64[7] += skein_h8 + (uint32_t)1; macro3(); hash64[0]+= h2; hash64[1]+= h3; hash64[2]+= h4; hash64[3]+= h5; hash64[4]+= h6; hash64[5]+= h7 + vectorize(0xff00000000000008); - hash64[6]+= skein_h8 + 8; - hash64[7]+= h0 + 2; + hash64[6] += skein_h8 + (uint32_t)8; + hash64[7] += h0 + (uint32_t)2; macro4(); hash64[0] = (hash64[0] + h3); hash64[1] = (hash64[1] + h4); hash64[2] = (hash64[2] + h5); hash64[3] = (hash64[3] + h6); - hash64[4] = (hash64[4] + h7); hash64[5] = (hash64[5] + skein_h8 + 8); + hash64[4] = (hash64[4] + h7); hash64[5] = (hash64[5] + skein_h8 + (uint32_t)8); hash64[6] = (hash64[6] + h0 + make_uint2(0,0xff000000)); - hash64[7] = (hash64[7] + h1 + 3); + hash64[7] = (hash64[7] + h1 + (uint32_t)3); macro3(); hash64[0] = (hash64[0] + h4); hash64[1] = (hash64[1] + h5); hash64[2] = (hash64[2] + h6); hash64[3] = (hash64[3] + h7); hash64[4] = (hash64[4] + skein_h8); hash64[5] = (hash64[5] + h0 + make_uint2(0,0xff000000)); hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008)); - hash64[7] = (hash64[7] + h2 + 4); + hash64[7] = (hash64[7] 
+ h2 + (uint32_t)4); macro4(); hash64[0] = (hash64[0] + h5); hash64[1] = (hash64[1] + h6); hash64[2] = (hash64[2] + h7); hash64[3] = (hash64[3] + skein_h8); hash64[4] = (hash64[4] + h0); hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008)); - hash64[6] = (hash64[6] + h2 + 8); hash64[7] = (hash64[7] + h3 + 5); + hash64[6] = (hash64[6] + h2 + (uint32_t)8); hash64[7] = (hash64[7] + h3 + (uint32_t)5); macro3(); hash64[0] = (hash64[0] + h6); hash64[1] = (hash64[1] + h7); hash64[2] = (hash64[2] + skein_h8); hash64[3] = (hash64[3] + h0); - hash64[4] = (hash64[4] + h1); hash64[5] = (hash64[5] + h2 + 8); + hash64[4] = (hash64[4] + h1); hash64[5] = (hash64[5] + h2 + (uint32_t)8); hash64[6] = (hash64[6] + h3 + make_uint2(0,0xff000000)); - hash64[7] = (hash64[7] + h4 + 6); + hash64[7] = (hash64[7] + h4 + (uint32_t)6); macro4(); hash64[0] = (hash64[0] + h7); hash64[1] = (hash64[1] + skein_h8); hash64[2] = (hash64[2] + h0); hash64[3] = (hash64[3] + h1); hash64[4] = (hash64[4] + h2); hash64[5] = (hash64[5] + h3 + make_uint2(0,0xff000000)); hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008)); - hash64[7] = (hash64[7] + h5 + 7); + hash64[7] = (hash64[7] + h5 + (uint32_t)7); macro3(); hash64[0] = (hash64[0] + skein_h8); hash64[1] = (hash64[1] + h0); hash64[2] = (hash64[2] + h1); hash64[3] = (hash64[3] + h2); hash64[4] = (hash64[4] + h3); hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008)); - hash64[6] = (hash64[6] + h5 + 8); hash64[7] = (hash64[7] + h6 + 8); + hash64[6] = (hash64[6] + h5 + (uint32_t)8); hash64[7] = (hash64[7] + h6 + (uint32_t)8); macro4(); hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h0)); hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h1)); @@ -756,7 +759,7 @@ void quark_skein512_gpu_hash_64(const uint32_t threads, const uint32_t startNonc __host__ //void quark_skein512_cpu_hash_64(int thr_id,uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_hash) -void quark_skein512_cpu_hash_64(int thr_id, const uint32_t 
threads, const uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void quark_skein512_cpu_hash_64(int thr_id, const uint32_t threads, uint32_t *d_hash) { uint32_t tpb = TPB52; int dev_id = device_map[thr_id]; @@ -764,7 +767,7 @@ void quark_skein512_cpu_hash_64(int thr_id, const uint32_t threads, const uint32 if (device_sm[dev_id] <= 500) tpb = TPB50; const dim3 grid((threads + tpb-1)/tpb); const dim3 block(tpb); - quark_skein512_gpu_hash_64 <<>>(threads, startNonce, (uint64_t*)d_hash, d_nonceVector); + quark_skein512_gpu_hash_64 <<>>(threads, (uint64_t*)d_hash); } @@ -939,7 +942,7 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp } __host__ -void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap) +void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) //, int swap) { uint32_t tpb = TPB52; int dev_id = device_map[thr_id]; diff --git a/quark/groestl_transf_quad.h b/quark/groestl_transf_quad.h index 545a5e680c..657f5b585e 100644 --- a/quark/groestl_transf_quad.h +++ b/quark/groestl_transf_quad.h @@ -1,37 +1,37 @@ /* File included in quark/groestl (quark/jha,nist5/X11+) and groest/myriad coins for SM 3+ */ -#define merge8(z,x,y)\ - z=__byte_perm(x, y, 0x5140); \ +#define merge8(z,x,y) {\ + z=__byte_perm(x, y, 0x5140); }; -#define SWAP8(x,y)\ +#define SWAP8(x,y) {\ x=__byte_perm(x, y, 0x5410); \ - y=__byte_perm(x, y, 0x7632); + y=__byte_perm(x, y, 0x7632);}; -#define SWAP4(x,y)\ +#define SWAP4(x,y) {\ t = (y<<4); \ t = (x ^ t); \ t = 0xf0f0f0f0UL & t; \ x = (x ^ t); \ t= t>>4;\ - y= y ^ t; + y= y ^ t;}; -#define SWAP2(x,y)\ +#define SWAP2(x,y) {\ t = (y<<2); \ t = (x ^ t); \ t = 0xccccccccUL & t; \ x = (x ^ t); \ t= t>>2;\ - y= y ^ t; + y= y ^ t;}; -#define SWAP1(x,y)\ +#define SWAP1(x,y) {\ t = (y+y); \ t = (x ^ t); \ t = 0xaaaaaaaaUL & t; \ x = (x ^ t); \ t= t>>1;\ - y= y ^ t; - + y= y ^ t;}; + __device__ __forceinline__ 
void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { diff --git a/quark/nist5.cu b/quark/nist5.cu index 25aff74311..8e989e06cb 100644 --- a/quark/nist5.cu +++ b/quark/nist5.cu @@ -1,3 +1,4 @@ +#if 0 extern "C" { #include "sph/sph_blake.h" @@ -182,3 +183,4 @@ extern "C" void free_nist5(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index 445b1cfebf..4e37835bf7 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -1,3 +1,4 @@ +#if 0 extern "C" { #include "sph/sph_blake.h" @@ -337,3 +338,4 @@ extern "C" void free_quark(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/qubit/deep.cu b/qubit/deep.cu index 0de2a9ce32..1a73bd9960 100644 --- a/qubit/deep.cu +++ b/qubit/deep.cu @@ -1,3 +1,4 @@ +#if 0 /* * deepcoin algorithm * @@ -155,3 +156,4 @@ extern "C" void free_deep(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/qubit/qubit.cu b/qubit/qubit.cu index 9520ea367f..16c0a257c7 100644 --- a/qubit/qubit.cu +++ b/qubit/qubit.cu @@ -1,3 +1,4 @@ +#if 0 /* * qubit algorithm * @@ -173,3 +174,4 @@ extern "C" void free_qubit(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/res/ccminer.rc b/res/ccminer.rc index e031f82678..fa9a8a9b01 100644 --- a/res/ccminer.rc +++ b/res/ccminer.rc @@ -13,7 +13,7 @@ #undef APSTUDIO_READONLY_SYMBOLS ///////////////////////////////////////////////////////////////////////////// -// English (United States) resources +// Anglais (États-Unis) resources #if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU) LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US @@ -88,7 +88,7 @@ BEGIN END END -#endif // English (United States) resources +#endif // Anglais (États-Unis) resources ///////////////////////////////////////////////////////////////////////////// diff --git a/scrypt/salsa_kernel.cu b/scrypt/salsa_kernel.cu index 
9422988faa..9fb3c44435 100644 --- a/scrypt/salsa_kernel.cu +++ b/scrypt/salsa_kernel.cu @@ -240,9 +240,8 @@ inline int _ConvertSMVer2Cores(int major, int minor) { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class - GK104 = 1536 cores / 8 SMs { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class - { 0x50, 128 }, // Maxwell First Generation (SM 5.0) GTX750/750Ti + { 0x50, 128 }, // Maxwell Generation (SM 5.0) GTX750/750Ti { 0x52, 128 }, // Maxwell Second Generation (SM 5.2) GTX980 = 2048 cores / 16 SMs - GTX970 1664 cores / 13 SMs - { 0x61, 128 }, // Pascal GeForce (SM 6.1) { -1, -1 }, }; diff --git a/skein.cu b/skein.cu index 568839f840..fd0919a3c4 100644 --- a/skein.cu +++ b/skein.cu @@ -1,3 +1,4 @@ +#if 0 /** * SKEIN512 80 + SHA256 64 * by tpruvot@github - 2015 @@ -482,3 +483,4 @@ extern "C" void free_skeincoin(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/skein2.cpp b/skein2.cpp index 2d57f2d4a8..fd30b9e5e1 100644 --- a/skein2.cpp +++ b/skein2.cpp @@ -1,3 +1,4 @@ +#if 0 /** * SKEIN512 80 + SKEIN512 64 (Woodcoin) * by tpruvot@github - 2015 @@ -147,3 +148,4 @@ void free_skein2(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/skunk/cuda_skunk.cu b/skunk/cuda_skunk.cu index cd4c269f7a..d4bb0ff283 100644 --- a/skunk/cuda_skunk.cu +++ b/skunk/cuda_skunk.cu @@ -1,3 +1,4 @@ +#if 0 /** * skein + cube + fugue merged kernel, based on krnlx work * @@ -699,3 +700,4 @@ void skunk_cpu_init(int thr_id, uint32_t threads) cuda_get_arch(thr_id); } +#endif \ No newline at end of file diff --git a/skunk/cuda_skunk_streebog.cu b/skunk/cuda_skunk_streebog.cu index 36ec7923c0..4438f1a86a 100644 --- a/skunk/cuda_skunk_streebog.cu +++ b/skunk/cuda_skunk_streebog.cu @@ -1,3 +1,4 @@ +#if 0 /* * Streebog GOST R 34.10-2012 stripped CUDA implementation for final hash * @@ -369,3 +370,4 @@ void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t 
*d_hash, uint32_ skunk_streebog_gpu_final_64 <<< grid, block >>> ((uint64_t*)d_hash, d_resNonce); } +#endif \ No newline at end of file diff --git a/skunk/skunk.cu b/skunk/skunk.cu index c1add50303..4670161a51 100644 --- a/skunk/skunk.cu +++ b/skunk/skunk.cu @@ -1,3 +1,4 @@ +#if 0 /** * Skunk Algo for Signatum * (skein, cube, fugue, gost streebog) @@ -209,3 +210,4 @@ extern "C" void free_skunk(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/stats.cpp b/stats.cpp index b58f997ce0..6eb00bebb1 100644 --- a/stats.cpp +++ b/stats.cpp @@ -69,7 +69,7 @@ double stats_get_speed(int thr_id, double def_speed) while (i != tlastscans.rend() && records < opt_statsavg) { if (!i->second.ignored) if (thr_id == -1 || i->second.thr_id == thr_id) { - if (i->second.hashcount > 1000) { + if (i->second.hashcount > 100000) {//1000) { speed += i->second.hashrate; records++; // applog(LOG_BLUE, "%d %x %.1f", thr_id, i->second.thr_id, i->second.hashrate); diff --git a/tribus/tribus.cu b/tribus/tribus.cu index 4516e7d69c..15fdeb1797 100644 --- a/tribus/tribus.cu +++ b/tribus/tribus.cu @@ -1,3 +1,4 @@ +#if 0 /** * Tribus Algo for Denarius * @@ -182,3 +183,4 @@ extern "C" void free_tribus(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/util.cpp b/util.cpp index dc20c2adca..fca0c70a8e 100644 --- a/util.cpp +++ b/util.cpp @@ -2148,7 +2148,7 @@ void do_gpu_tests(void) opt_tracegpu = false; #endif } - +#if 0 void print_hash_tests(void) { uchar *scratchbuf = NULL; @@ -2314,28 +2314,22 @@ void print_hash_tests(void) printpfx("x11evo", hash); x11hash(&hash[0], &buf[0]); - printpfx("x11", hash); - - x12hash(&hash[0], &buf[0]); - printpfx("x12", hash); + printpfx("X11", hash); x13hash(&hash[0], &buf[0]); - printpfx("x13", hash); + printpfx("X13", hash); x14hash(&hash[0], &buf[0]); - printpfx("x14", hash); + printpfx("X14", hash); x15hash(&hash[0], &buf[0]); - printpfx("x15", hash); + printpfx("X15", hash); 
x16r_hash(&hash[0], &buf[0]); - printpfx("x16r", hash); - - x16s_hash(&hash[0], &buf[0]); - printpfx("x16s", hash); + printpfx("X16r", hash); x17hash(&hash[0], &buf[0]); - printpfx("x17", hash); + printpfx("X17", hash); //memcpy(buf, zrtest, 80); zr5hash(&hash[0], &buf[0]); @@ -2348,3 +2342,4 @@ void print_hash_tests(void) free(scratchbuf); } +#endif \ No newline at end of file diff --git a/x11/bitcore.cu b/x11/bitcore.cu index 78739679c3..5920c7bc52 100644 --- a/x11/bitcore.cu +++ b/x11/bitcore.cu @@ -1,3 +1,4 @@ +#if 0 /** * Timetravel-10 (bitcore) CUDA implementation * by tpruvot@github - May 2017 @@ -437,3 +438,4 @@ extern "C" void free_bitcore(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x11/c11.cu b/x11/c11.cu index 8f8f6663b2..8e9d9a0775 100644 --- a/x11/c11.cu +++ b/x11/c11.cu @@ -1,3 +1,4 @@ +#if 0 extern "C" { #include "sph/sph_blake.h" @@ -267,3 +268,4 @@ extern "C" void free_c11(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x11/cuda_x11.h b/x11/cuda_x11.h index 8552157941..b1698ddad6 100644 --- a/x11/cuda_x11.h +++ b/x11/cuda_x11.h @@ -1,21 +1,21 @@ #include "quark/cuda_quark.h" extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); extern void x11_luffa512_cpu_init(int thr_id, uint32_t threads); -extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads); -extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, 
uint32_t *d_hash, int order); +extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); extern void x11_simd512_cpu_free(int thr_id); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu index f7ce97c4b8..9aa4c625b1 100644 --- a/x11/cuda_x11_cubehash512.cu +++ b/x11/cuda_x11_cubehash512.cu @@ -1,5 +1,7 @@ -#include -#include +//#include +//#include +#include "cuda_helper_alexis.h" +#include "cuda_vectors_alexis.h" #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ @@ -213,14 +215,14 @@ static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) /***************************************************/ __global__ -void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x11_cubehash512_gpu_hash_64(uint32_t threads, uint64_t *g_hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - 
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + //uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; + int hashPosition = thread;//nounce - startNounce; uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; uint32_t x[2][2][2][2][2]; @@ -241,7 +243,7 @@ void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_ } __host__ -void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { const uint32_t threadsperblock = 256; @@ -250,7 +252,7 @@ void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNou size_t shared_size = 0; - x11_cubehash512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_cubehash512_gpu_hash_64<<>>(threads, (uint64_t*)d_hash); } __host__ diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index fa5c4f7885..4d0a991d16 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -1,7 +1,7 @@ #include #include -#include "cuda_helper.h" +#include "cuda_helper_alexis.h" extern __device__ __device_builtin__ void __threadfence_block(void); @@ -315,5 +315,5 @@ void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, dim3 block(threadsperblock); x11_echo512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu index b2272ecba8..2e71d77586 100644 --- a/x11/cuda_x11_luffa512.cu +++ b/x11/cuda_x11_luffa512.cu @@ -376,6 +376,6 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st size_t shared_size = 0; x11_luffa512_gpu_hash_64<<>>(threads, startNounce, 
(uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index cfebf0d8ee..4da808c218 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -1,1348 +1,476 @@ #include // memcpy() -#include "cuda_helper.h" +#include "cuda_helper_alexis.h" extern __device__ __device_builtin__ void __threadfence_block(void); + +//#define TPB 128 +#define TPB 384 -#define TPB 128 +__constant__ uint32_t c_PaddedMessage80[20]; // padded message (80 bytes + padding) -__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) - -#include "cuda_x11_aes.cuh" +//#include "cuda_x11_aes.cuh" +#define INTENSIVE_GMF +#include "../x11/cuda_x11_echo_aes.cuh" __device__ __forceinline__ -static void AES_ROUND_NOKEY( - const uint32_t* __restrict__ sharedMemory, - uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3) -{ - uint32_t y0, y1, y2, y3; - aes_round(sharedMemory, - x0, x1, x2, x3, - y0, y1, y2, y3); - - x0 = y0; - x1 = y1; - x2 = y2; - x3 = y3; +#if HACK_1 +static void round_3_7_11(const uint32_t sharedMemory[4][256], uint32_t* r, uint4 *p, uint4 &x) { +#else +static void round_3_7_11(const uint32_t* __restrict__ sharedMemory, uint32_t* r, uint4 *p, uint4 &x) { +#endif + KEY_EXPAND_ELT(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + x = p[2] ^ *(uint4*)&r[0]; + KEY_EXPAND_ELT(sharedMemory, &r[4]); + *(uint4*)&r[4] ^= *(uint4*)&r[0]; + /* + r[4] ^= r[0]; + r[5] ^= r[1]; + r[6] ^= r[2]; + r[7] ^= r[3]; + */ + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[4]; + /* + x.x ^= r[4]; + x.y ^= r[5]; + x.z ^= r[6]; + x.w ^= r[7]; + */ + KEY_EXPAND_ELT(sharedMemory, &r[8]); + *(uint4*)&r[8] ^= *(uint4*)&r[4]; + /* + r[8] ^= r[4]; + r[9] ^= r[5]; + r[10] ^= r[6]; + r[11] ^= r[7]; + */ + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[8]; + /* + x.x ^= r[8]; + x.y ^= r[9]; + x.z ^= r[10]; + x.w 
^= r[11]; + */ + KEY_EXPAND_ELT(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[8]; + /* + r[12] ^= r[8]; + r[13] ^= r[9]; + r[14] ^= r[10]; + r[15] ^= r[11]; + */ + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[12]; + /* + x.x ^= r[12]; + x.y ^= r[13]; + x.z ^= r[14]; + x.w ^= r[15]; + */ + AES_ROUND_NOKEY(sharedMemory, &x); + p[1] ^= x; + /* + p[1].x ^= x.x; + p[1].y ^= x.y; + p[1].z ^= x.z; + p[1].w ^= x.w; + */ + KEY_EXPAND_ELT(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[0] ^ *(uint4*)&r[16]; + KEY_EXPAND_ELT(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[20]; + KEY_EXPAND_ELT(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[24]; + KEY_EXPAND_ELT(sharedMemory, &r[28]); + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[3] ^= x; } __device__ __forceinline__ -static void KEY_EXPAND_ELT( - const uint32_t* __restrict__ sharedMemory, - uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3) -{ - uint32_t y0, y1, y2, y3; - aes_round(sharedMemory, - k0, k1, k2, k3, - y0, y1, y2, y3); - - k0 = y1; - k1 = y2; - k2 = y3; - k3 = y0; +#if HACK_1 +static void round_4_8_12(const uint32_t sharedMemory[4][256], uint32_t* r, uint4 *p, uint4 &x){ +#else +static void round_4_8_12(const uint32_t* __restrict__ sharedMemory, uint32_t* r, uint4 *p, uint4 &x){ +#endif + *(uint4*)&r[0] ^= *(uint4*)&r[25]; + x = p[1] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &x); + + r[4] ^= r[29]; r[5] ^= r[30]; + r[6] ^= r[31]; r[7] ^= r[0]; + + x ^= *(uint4*)&r[4]; + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[8]; + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[0] ^= x; + *(uint4*)&r[16] ^= 
*(uint4*)&r[9]; + x = p[3] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + x ^= *(uint4*)&r[24]; + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[2] ^= x; } __device__ __forceinline__ -static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, const uint32_t count) +static void c512(const uint32_t* sharedMemory, const uint32_t *state, uint32_t *msg, uint2x4 *Hash, const uint32_t counter) { - uint32_t p0, p1, p2, p3, p4, p5, p6, p7; - uint32_t p8, p9, pA, pB, pC, pD, pE, pF; - uint32_t x0, x1, x2, x3; - uint32_t rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; - uint32_t rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; - uint32_t rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; - uint32_t rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; - const uint32_t counter = count; - - p0 = state[0x0]; - p1 = state[0x1]; - p2 = state[0x2]; - p3 = state[0x3]; - p4 = state[0x4]; - p5 = state[0x5]; - p6 = state[0x6]; - p7 = state[0x7]; - p8 = state[0x8]; - p9 = state[0x9]; - pA = state[0xA]; - pB = state[0xB]; - pC = state[0xC]; - pD = state[0xD]; - pE = state[0xE]; - pF = state[0xF]; + uint4 p[4]; + uint4 x; + uint32_t r[32]; + + *(uint2x4*)&r[0] = *(uint2x4*)&msg[0]; + *(uint2x4*)&r[8] = *(uint2x4*)&msg[8]; + *(uint4*)&r[16] = *(uint4*)&msg[16]; + + *(uint2x4*)&p[0] = *(uint2x4*)&state[0]; + *(uint2x4*)&p[2] = *(uint2x4*)&state[8]; /* round 0 */ - rk00 = msg[0]; - x0 = p4 ^ msg[0]; - rk01 = msg[1]; - x1 = p5 ^ msg[1]; - rk02 = msg[2]; - x2 = p6 ^ msg[2]; - rk03 = msg[3]; - x3 = p7 ^ msg[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk04 = msg[4]; - x0 ^= msg[4]; - rk05 = msg[5]; - x1 ^= msg[5]; - rk06 = msg[6]; - x2 ^= msg[6]; - rk07 = msg[7]; - x3 ^= msg[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk08 = 
msg[8]; - x0 ^= msg[8]; - rk09 = msg[9]; - x1 ^= msg[9]; - rk0A = msg[10]; - x2 ^= msg[10]; - rk0B = msg[11]; - x3 ^= msg[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk0C = msg[12]; - x0 ^= msg[12]; - rk0D = msg[13]; - x1 ^= msg[13]; - rk0E = msg[14]; - x2 ^= msg[14]; - rk0F = msg[15]; - x3 ^= msg[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - if (count == 512) + x = p[1] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[0] ^= x; + // + if (counter == 512) { - rk10 = 0x80U; - x0 = pC ^ 0x80U; - rk11 = 0; - x1 = pD; - rk12 = 0; - x2 = pE; - rk13 = 0; - x3 = pF; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk14 = 0; - rk15 = 0; - rk16 = 0; - rk17 = 0; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk18 = 0; - rk19 = 0; - rk1A = 0; - rk1B = 0x02000000U; - x3 ^= 0x02000000U; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk1C = 0; - rk1D = 0; - rk1E = 0; - rk1F = 0x02000000; - x3 ^= 0x02000000; + *(uint4*)&r[20] = *(uint4*)&msg[20]; + *(uint2x4*)&r[24] = *(uint2x4*)&msg[24]; + x = p[3]; + x.x ^= 0x80; + AES_ROUND_NOKEY(sharedMemory, &x); + AES_ROUND_NOKEY(sharedMemory, &x); + x.w ^= 0x2000000; + AES_ROUND_NOKEY(sharedMemory, &x); + x.w ^= 0x2000000; + } else { - rk10 = msg[16]; - x0 = pC ^ msg[16]; - rk11 = msg[17]; - x1 = pD ^ msg[17]; - rk12 = msg[18]; - x2 = pE ^ msg[18]; - rk13 = msg[19]; - x3 = pF ^ msg[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk14 = msg[20]; - x0 ^= msg[20]; - rk15 = msg[21]; - x1 ^= msg[21]; - rk16 = msg[22]; - x2 ^= msg[22]; - rk17 = msg[23]; - x3 ^= msg[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk18 = msg[24]; - x0 ^= msg[24]; - rk19 = msg[25]; - x1 ^= msg[25]; - rk1A = msg[26]; - x2 ^= msg[26]; - rk1B = msg[27]; - x3 ^= msg[27]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk1C = msg[28]; - x0 ^= msg[28]; - rk1D = msg[29]; - x1 ^= msg[29]; - rk1E = msg[30]; - x2 ^= msg[30]; - rk1F = msg[31]; - x3 ^= msg[31]; + x = p[3] ^ *(uint4*)&r[16]; + + r[0x14] = 0x80; + r[0x15] = 0; r[0x16] = 0; r[0x17] = 0; r[0x18] = 0; r[0x19] = 0; r[0x1a] = 0; + r[0x1b] = 0x2800000; + r[0x1c] = 0; r[0x1d] = 0; r[0x1e] = 0; + r[0x1f] = 0x2000000; + + AES_ROUND_NOKEY(sharedMemory, &x); + + x.x ^= 0x80; + AES_ROUND_NOKEY(sharedMemory, &x); + + x.w ^= 0x02800000; + AES_ROUND_NOKEY(sharedMemory, &x); + + x.w ^= 0x02000000; + // } - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + AES_ROUND_NOKEY(sharedMemory, &x); + p[2] ^= x; // 1 - KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); - rk00 ^= rk1C; - rk01 ^= rk1D; - rk02 ^= rk1E; - rk03 ^= rk1F; - rk00 ^= counter; - rk03 ^= 0xFFFFFFFF; - x0 = p0 ^ rk00; - x1 = p1 ^ rk01; - x2 = p2 ^ rk02; - x3 = p3 ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); - rk04 ^= rk00; - rk05 ^= rk01; - rk06 ^= rk02; - rk07 ^= rk03; - x0 ^= rk04; - x1 ^= rk05; - x2 ^= rk06; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); - rk08 ^= rk04; - rk09 ^= rk05; - rk0A ^= rk06; - rk0B ^= rk07; - x0 ^= rk08; - x1 ^= rk09; - x2 ^= rk0A; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); - rk0C ^= rk08; - rk0D ^= rk09; - rk0E ^= rk0A; - rk0F ^= rk0B; - x0 ^= rk0C; - x1 ^= rk0D; - x2 ^= rk0E; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); - rk10 ^= rk0C; - rk11 ^= rk0D; - rk12 ^= rk0E; - rk13 ^= rk0F; - x0 = p8 ^ rk10; - x1 = p9 ^ rk11; - x2 = pA ^ rk12; - x3 = pB ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - 
KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); - rk14 ^= rk10; - rk15 ^= rk11; - rk16 ^= rk12; - rk17 ^= rk13; - x0 ^= rk14; - x1 ^= rk15; - x2 ^= rk16; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); - rk18 ^= rk14; - rk19 ^= rk15; - rk1A ^= rk16; - rk1B ^= rk17; - x0 ^= rk18; - x1 ^= rk19; - x2 ^= rk1A; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); - rk1C ^= rk18; - rk1D ^= rk19; - rk1E ^= rk1A; - rk1F ^= rk1B; - x0 ^= rk1C; - x1 ^= rk1D; - x2 ^= rk1E; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - - rk00 ^= rk19; - x0 = pC ^ rk00; - rk01 ^= rk1A; - x1 = pD ^ rk01; - rk02 ^= rk1B; - x2 = pE ^ rk02; - rk03 ^= rk1C; - x3 = pF ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk04 ^= rk1D; - x0 ^= rk04; - rk05 ^= rk1E; - x1 ^= rk05; - rk06 ^= rk1F; - x2 ^= rk06; - rk07 ^= rk00; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk08 ^= rk01; - x0 ^= rk08; - rk09 ^= rk02; - x1 ^= rk09; - rk0A ^= rk03; - x2 ^= rk0A; - rk0B ^= rk04; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk0C ^= rk05; - x0 ^= rk0C; - rk0D ^= rk06; - x1 ^= rk0D; - rk0E ^= rk07; - x2 ^= rk0E; - rk0F ^= rk08; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk10 ^= rk09; - x0 = p4 ^ rk10; - rk11 ^= rk0A; - x1 = p5 ^ rk11; - rk12 ^= rk0B; - x2 = p6 ^ rk12; - rk13 ^= rk0C; - x3 = p7 ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk14 ^= rk0D; - x0 ^= rk14; - rk15 ^= rk0E; - x1 ^= rk15; - rk16 ^= rk0F; - x2 ^= rk16; - rk17 ^= rk10; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk18 ^= rk11; - x0 ^= rk18; - rk19 ^= rk12; - x1 ^= rk19; - rk1A ^= rk13; - x2 ^= rk1A; - rk1B ^= rk14; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk1C ^= 
rk15; - x0 ^= rk1C; - rk1D ^= rk16; - x1 ^= rk1D; - rk1E ^= rk17; - x2 ^= rk1E; - rk1F ^= rk18; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + KEY_EXPAND_ELT(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + r[0] ^= counter; // 0x200/0x280 + r[3] ^= 0xFFFFFFFF; + x = p[0] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[4]); + *(uint4*)&r[4] ^= *(uint4*)&r[0]; + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[8]); + *(uint4*)&r[8] ^= *(uint4*)&r[4]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[8]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[3] ^= x; + KEY_EXPAND_ELT(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[1] ^= x; + *(uint4*)&r[0] ^= *(uint4*)&r[25]; + x = p[3] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &x); + + r[4] ^= r[29]; r[5] ^= r[30]; + r[6] ^= r[31]; r[7] ^= r[0]; + + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[2] ^= x; + *(uint4*)&r[16] ^= *(uint4*)&r[9]; + x = p[1] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + x ^= 
*(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &x); + + p[0] ^= x; /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); - rk00 ^= rk1C; - rk01 ^= rk1D; - rk02 ^= rk1E; - rk03 ^= rk1F; - x0 = p8 ^ rk00; - x1 = p9 ^ rk01; - x2 = pA ^ rk02; - x3 = pB ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); - rk04 ^= rk00; - rk05 ^= rk01; - rk06 ^= rk02; - rk07 ^= rk03; - x0 ^= rk04; - x1 ^= rk05; - x2 ^= rk06; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); - rk08 ^= rk04; - rk09 ^= rk05; - rk0A ^= rk06; - rk0B ^= rk07; - x0 ^= rk08; - x1 ^= rk09; - x2 ^= rk0A; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); - rk0C ^= rk08; - rk0D ^= rk09; - rk0E ^= rk0A; - rk0F ^= rk0B; - x0 ^= rk0C; - x1 ^= rk0D; - x2 ^= rk0E; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); - rk10 ^= rk0C; - rk11 ^= rk0D; - rk12 ^= rk0E; - rk13 ^= rk0F; - x0 = p0 ^ rk10; - x1 = p1 ^ rk11; - x2 = p2 ^ rk12; - x3 = p3 ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); - rk14 ^= rk10; - rk15 ^= rk11; - rk16 ^= rk12; - rk17 ^= rk13; - x0 ^= rk14; - x1 ^= rk15; - x2 ^= rk16; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); - rk18 ^= rk14; - rk19 ^= rk15; - rk1A ^= rk16; - rk1B ^= rk17; - x0 ^= rk18; - x1 ^= rk19; - x2 ^= rk1A; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); 
- rk1C ^= rk18; - rk1D ^= rk19; - rk1E ^= rk1A; - rk1F ^= rk1B; - x0 ^= rk1C; - x1 ^= rk1D; - x2 ^= rk1E; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + round_3_7_11(sharedMemory, r, p, x); + /* round 4, 8, 12 */ - rk00 ^= rk19; - x0 = p4 ^ rk00; - rk01 ^= rk1A; - x1 = p5 ^ rk01; - rk02 ^= rk1B; - x2 = p6 ^ rk02; - rk03 ^= rk1C; - x3 = p7 ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk04 ^= rk1D; - x0 ^= rk04; - rk05 ^= rk1E; - x1 ^= rk05; - rk06 ^= rk1F; - x2 ^= rk06; - rk07 ^= rk00; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk08 ^= rk01; - x0 ^= rk08; - rk09 ^= rk02; - x1 ^= rk09; - rk0A ^= rk03; - x2 ^= rk0A; - rk0B ^= rk04; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk0C ^= rk05; - x0 ^= rk0C; - rk0D ^= rk06; - x1 ^= rk0D; - rk0E ^= rk07; - x2 ^= rk0E; - rk0F ^= rk08; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk10 ^= rk09; - x0 = pC ^ rk10; - rk11 ^= rk0A; - x1 = pD ^ rk11; - rk12 ^= rk0B; - x2 = pE ^ rk12; - rk13 ^= rk0C; - x3 = pF ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk14 ^= rk0D; - x0 ^= rk14; - rk15 ^= rk0E; - x1 ^= rk15; - rk16 ^= rk0F; - x2 ^= rk16; - rk17 ^= rk10; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk18 ^= rk11; - x0 ^= rk18; - rk19 ^= rk12; - x1 ^= rk19; - rk1A ^= rk13; - x2 ^= rk1A; - rk1B ^= rk14; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk1C ^= rk15; - x0 ^= rk1C; - rk1D ^= rk16; - x1 ^= rk1D; - rk1E ^= rk17; - x2 ^= rk1E; - rk1F ^= rk18; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + round_4_8_12(sharedMemory, r, p, x); // 2 - KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); - rk00 ^= rk1C; - rk01 ^= rk1D; - rk02 ^= rk1E; - rk03 ^= rk1F; - x0 = p0 ^ rk00; - x1 = p1 ^ rk01; - x2 = p2 ^ rk02; - x3 = p3 ^ rk03; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); - rk04 ^= rk00; - rk05 ^= rk01; - rk06 ^= rk02; - rk07 ^= rk03; - rk07 ^= SPH_T32(~counter); - x0 ^= rk04; - x1 ^= rk05; - x2 ^= rk06; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); - rk08 ^= rk04; - rk09 ^= rk05; - rk0A ^= rk06; - rk0B ^= rk07; - x0 ^= rk08; - x1 ^= rk09; - x2 ^= rk0A; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); - rk0C ^= rk08; - rk0D ^= rk09; - rk0E ^= rk0A; - rk0F ^= rk0B; - x0 ^= rk0C; - x1 ^= rk0D; - x2 ^= rk0E; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); - rk10 ^= rk0C; - rk11 ^= rk0D; - rk12 ^= rk0E; - rk13 ^= rk0F; - x0 = p8 ^ rk10; - x1 = p9 ^ rk11; - x2 = pA ^ rk12; - x3 = pB ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); - rk14 ^= rk10; - rk15 ^= rk11; - rk16 ^= rk12; - rk17 ^= rk13; - x0 ^= rk14; - x1 ^= rk15; - x2 ^= rk16; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); - rk18 ^= rk14; - rk19 ^= rk15; - rk1A ^= rk16; - rk1B ^= rk17; - x0 ^= rk18; - x1 ^= rk19; - x2 ^= rk1A; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); - rk1C ^= rk18; - rk1D ^= rk19; - rk1E ^= rk1A; - rk1F ^= rk1B; - x0 ^= rk1C; - x1 ^= rk1D; - x2 ^= rk1E; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - - rk00 ^= rk19; - x0 = pC ^ rk00; - rk01 ^= rk1A; - x1 = pD ^ rk01; - rk02 ^= rk1B; - x2 = pE ^ rk02; - rk03 ^= rk1C; - x3 = pF ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk04 ^= rk1D; - x0 ^= rk04; - rk05 ^= rk1E; - 
x1 ^= rk05; - rk06 ^= rk1F; - x2 ^= rk06; - rk07 ^= rk00; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk08 ^= rk01; - x0 ^= rk08; - rk09 ^= rk02; - x1 ^= rk09; - rk0A ^= rk03; - x2 ^= rk0A; - rk0B ^= rk04; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk0C ^= rk05; - x0 ^= rk0C; - rk0D ^= rk06; - x1 ^= rk0D; - rk0E ^= rk07; - x2 ^= rk0E; - rk0F ^= rk08; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk10 ^= rk09; - x0 = p4 ^ rk10; - rk11 ^= rk0A; - x1 = p5 ^ rk11; - rk12 ^= rk0B; - x2 = p6 ^ rk12; - rk13 ^= rk0C; - x3 = p7 ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk14 ^= rk0D; - x0 ^= rk14; - rk15 ^= rk0E; - x1 ^= rk15; - rk16 ^= rk0F; - x2 ^= rk16; - rk17 ^= rk10; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk18 ^= rk11; - x0 ^= rk18; - rk19 ^= rk12; - x1 ^= rk19; - rk1A ^= rk13; - x2 ^= rk1A; - rk1B ^= rk14; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk1C ^= rk15; - x0 ^= rk1C; - rk1D ^= rk16; - x1 ^= rk1D; - rk1E ^= rk17; - x2 ^= rk1E; - rk1F ^= rk18; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + KEY_EXPAND_ELT(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + x = p[0] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[4]); + *(uint4*)&r[4] ^= *(uint4*)&r[0]; + r[7] ^= (~counter);// 512/640 + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[8]); + *(uint4*)&r[8] ^= *(uint4*)&r[4]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[8]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[3] ^= x; + KEY_EXPAND_ELT(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &x); + 
KEY_EXPAND_ELT(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[1] ^= x; + + *(uint4*)&r[0] ^= *(uint4*)&r[25]; + x = p[3] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &x); + r[4] ^= r[29]; + r[5] ^= r[30]; + r[6] ^= r[31]; + r[7] ^= r[0]; + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[2] ^= x; + *(uint4*)&r[16] ^= *(uint4*)&r[9]; + x = p[1] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[0] ^= x; /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); - rk00 ^= rk1C; - rk01 ^= rk1D; - rk02 ^= rk1E; - rk03 ^= rk1F; - x0 = p8 ^ rk00; - x1 = p9 ^ rk01; - x2 = pA ^ rk02; - x3 = pB ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); - rk04 ^= rk00; - rk05 ^= rk01; - rk06 ^= rk02; - rk07 ^= rk03; - x0 ^= rk04; - x1 ^= rk05; - x2 ^= rk06; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); - rk08 ^= rk04; - rk09 ^= rk05; - rk0A ^= rk06; - rk0B ^= rk07; - x0 ^= rk08; - x1 ^= rk09; - x2 ^= rk0A; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - 
KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); - rk0C ^= rk08; - rk0D ^= rk09; - rk0E ^= rk0A; - rk0F ^= rk0B; - x0 ^= rk0C; - x1 ^= rk0D; - x2 ^= rk0E; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); - rk10 ^= rk0C; - rk11 ^= rk0D; - rk12 ^= rk0E; - rk13 ^= rk0F; - x0 = p0 ^ rk10; - x1 = p1 ^ rk11; - x2 = p2 ^ rk12; - x3 = p3 ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); - rk14 ^= rk10; - rk15 ^= rk11; - rk16 ^= rk12; - rk17 ^= rk13; - x0 ^= rk14; - x1 ^= rk15; - x2 ^= rk16; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); - rk18 ^= rk14; - rk19 ^= rk15; - rk1A ^= rk16; - rk1B ^= rk17; - x0 ^= rk18; - x1 ^= rk19; - x2 ^= rk1A; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); - rk1C ^= rk18; - rk1D ^= rk19; - rk1E ^= rk1A; - rk1F ^= rk1B; - x0 ^= rk1C; - x1 ^= rk1D; - x2 ^= rk1E; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + round_3_7_11(sharedMemory, r, p, x); /* round 4, 8, 12 */ - rk00 ^= rk19; - x0 = p4 ^ rk00; - rk01 ^= rk1A; - x1 = p5 ^ rk01; - rk02 ^= rk1B; - x2 = p6 ^ rk02; - rk03 ^= rk1C; - x3 = p7 ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk04 ^= rk1D; - x0 ^= rk04; - rk05 ^= rk1E; - x1 ^= rk05; - rk06 ^= rk1F; - x2 ^= rk06; - rk07 ^= rk00; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk08 ^= rk01; - x0 ^= rk08; - rk09 ^= rk02; - x1 ^= rk09; - rk0A ^= rk03; - x2 ^= rk0A; - rk0B ^= rk04; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk0C ^= rk05; - x0 ^= rk0C; - rk0D ^= rk06; - x1 ^= rk0D; - rk0E ^= rk07; - x2 ^= rk0E; - rk0F ^= rk08; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; 
- p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk10 ^= rk09; - x0 = pC ^ rk10; - rk11 ^= rk0A; - x1 = pD ^ rk11; - rk12 ^= rk0B; - x2 = pE ^ rk12; - rk13 ^= rk0C; - x3 = pF ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk14 ^= rk0D; - x0 ^= rk14; - rk15 ^= rk0E; - x1 ^= rk15; - rk16 ^= rk0F; - x2 ^= rk16; - rk17 ^= rk10; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk18 ^= rk11; - x0 ^= rk18; - rk19 ^= rk12; - x1 ^= rk19; - rk1A ^= rk13; - x2 ^= rk1A; - rk1B ^= rk14; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk1C ^= rk15; - x0 ^= rk1C; - rk1D ^= rk16; - x1 ^= rk1D; - rk1E ^= rk17; - x2 ^= rk1E; - rk1F ^= rk18; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + round_4_8_12(sharedMemory, r, p, x); // 3 - KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); - rk00 ^= rk1C; - rk01 ^= rk1D; - rk02 ^= rk1E; - rk03 ^= rk1F; - x0 = p0 ^ rk00; - x1 = p1 ^ rk01; - x2 = p2 ^ rk02; - x3 = p3 ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); - rk04 ^= rk00; - rk05 ^= rk01; - rk06 ^= rk02; - rk07 ^= rk03; - x0 ^= rk04; - x1 ^= rk05; - x2 ^= rk06; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); - rk08 ^= rk04; - rk09 ^= rk05; - rk0A ^= rk06; - rk0B ^= rk07; - x0 ^= rk08; - x1 ^= rk09; - x2 ^= rk0A; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); - rk0C ^= rk08; - rk0D ^= rk09; - rk0E ^= rk0A; - rk0F ^= rk0B; - x0 ^= rk0C; - x1 ^= rk0D; - x2 ^= rk0E; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); - rk10 ^= rk0C; - rk11 ^= rk0D; - rk12 ^= rk0E; - rk13 ^= rk0F; - x0 = p8 ^ rk10; - x1 = p9 ^ rk11; - x2 = pA ^ rk12; - x3 = pB ^ rk13; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); - rk14 ^= rk10; - rk15 ^= rk11; - rk16 ^= rk12; - rk17 ^= rk13; - x0 ^= rk14; - x1 ^= rk15; - x2 ^= rk16; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); - rk18 ^= rk14; - rk19 ^= rk15; - rk1A ^= rk16; - rk1B ^= rk17; - x0 ^= rk18; - x1 ^= rk19; - x2 ^= rk1A; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); - rk1C ^= rk18; - rk1D ^= rk19; - rk1E ^= rk1A; - rk1F ^= rk1B; - rk1E ^= counter; - rk1F ^= 0xFFFFFFFF; - x0 ^= rk1C; - x1 ^= rk1D; - x2 ^= rk1E; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - - rk00 ^= rk19; - x0 = pC ^ rk00; - rk01 ^= rk1A; - x1 = pD ^ rk01; - rk02 ^= rk1B; - x2 = pE ^ rk02; - rk03 ^= rk1C; - x3 = pF ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk04 ^= rk1D; - x0 ^= rk04; - rk05 ^= rk1E; - x1 ^= rk05; - rk06 ^= rk1F; - x2 ^= rk06; - rk07 ^= rk00; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk08 ^= rk01; - x0 ^= rk08; - rk09 ^= rk02; - x1 ^= rk09; - rk0A ^= rk03; - x2 ^= rk0A; - rk0B ^= rk04; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk0C ^= rk05; - x0 ^= rk0C; - rk0D ^= rk06; - x1 ^= rk0D; - rk0E ^= rk07; - x2 ^= rk0E; - rk0F ^= rk08; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk10 ^= rk09; - x0 = p4 ^ rk10; - rk11 ^= rk0A; - x1 = p5 ^ rk11; - rk12 ^= rk0B; - x2 = p6 ^ rk12; - rk13 ^= rk0C; - x3 = p7 ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk14 ^= rk0D; - x0 ^= rk14; - rk15 ^= rk0E; - x1 ^= rk15; - rk16 ^= rk0F; - x2 ^= rk16; - rk17 ^= rk10; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk18 ^= rk11; - x0 ^= rk18; - rk19 ^= rk12; - x1 ^= rk19; - rk1A ^= rk13; - x2 ^= rk1A; - 
rk1B ^= rk14; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk1C ^= rk15; - x0 ^= rk1C; - rk1D ^= rk16; - x1 ^= rk1D; - rk1E ^= rk17; - x2 ^= rk1E; - rk1F ^= rk18; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + KEY_EXPAND_ELT(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + x = p[0] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[4]); + *(uint4*)&r[4] ^= *(uint4*)&r[0]; + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[8]); + *(uint4*)&r[8] ^= *(uint4*)&r[4]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[8]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[3] ^= x; + KEY_EXPAND_ELT(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + r[30] ^= counter; // 512/640 + r[31] ^= 0xFFFFFFFF; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[1] ^= x; + + *(uint4*)&r[0] ^= *(uint4*)&r[25]; + x = p[3] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &x); + r[4] ^= r[29]; + r[5] ^= r[30]; + r[6] ^= r[31]; + r[7] ^= r[0]; + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[2] ^= x; + *(uint4*)&r[16] ^= *(uint4*)&r[9]; + x = p[1] ^ *(uint4*)&r[16]; + 
AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[0] ^= x; /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); - rk00 ^= rk1C; - rk01 ^= rk1D; - rk02 ^= rk1E; - rk03 ^= rk1F; - x0 = p8 ^ rk00; - x1 = p9 ^ rk01; - x2 = pA ^ rk02; - x3 = pB ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); - rk04 ^= rk00; - rk05 ^= rk01; - rk06 ^= rk02; - rk07 ^= rk03; - x0 ^= rk04; - x1 ^= rk05; - x2 ^= rk06; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); - rk08 ^= rk04; - rk09 ^= rk05; - rk0A ^= rk06; - rk0B ^= rk07; - x0 ^= rk08; - x1 ^= rk09; - x2 ^= rk0A; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); - rk0C ^= rk08; - rk0D ^= rk09; - rk0E ^= rk0A; - rk0F ^= rk0B; - x0 ^= rk0C; - x1 ^= rk0D; - x2 ^= rk0E; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); - rk10 ^= rk0C; - rk11 ^= rk0D; - rk12 ^= rk0E; - rk13 ^= rk0F; - x0 = p0 ^ rk10; - x1 = p1 ^ rk11; - x2 = p2 ^ rk12; - x3 = p3 ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); - rk14 ^= rk10; - rk15 ^= rk11; - rk16 ^= rk12; - rk17 ^= rk13; - x0 ^= rk14; - x1 ^= rk15; - x2 ^= rk16; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); - rk18 ^= rk14; - rk19 ^= rk15; - rk1A ^= rk16; - rk1B ^= rk17; - x0 ^= rk18; - x1 ^= rk19; - x2 ^= rk1A; - x3 ^= rk1B; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); - rk1C ^= rk18; - rk1D ^= rk19; - rk1E ^= rk1A; - rk1F ^= rk1B; - x0 ^= rk1C; - x1 ^= rk1D; - x2 ^= rk1E; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + round_3_7_11(sharedMemory, r, p, x); + /* round 4, 8, 12 */ - rk00 ^= rk19; - x0 = p4 ^ rk00; - rk01 ^= rk1A; - x1 = p5 ^ rk01; - rk02 ^= rk1B; - x2 = p6 ^ rk02; - rk03 ^= rk1C; - x3 = p7 ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk04 ^= rk1D; - x0 ^= rk04; - rk05 ^= rk1E; - x1 ^= rk05; - rk06 ^= rk1F; - x2 ^= rk06; - rk07 ^= rk00; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk08 ^= rk01; - x0 ^= rk08; - rk09 ^= rk02; - x1 ^= rk09; - rk0A ^= rk03; - x2 ^= rk0A; - rk0B ^= rk04; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk0C ^= rk05; - x0 ^= rk0C; - rk0D ^= rk06; - x1 ^= rk0D; - rk0E ^= rk07; - x2 ^= rk0E; - rk0F ^= rk08; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk10 ^= rk09; - x0 = pC ^ rk10; - rk11 ^= rk0A; - x1 = pD ^ rk11; - rk12 ^= rk0B; - x2 = pE ^ rk12; - rk13 ^= rk0C; - x3 = pF ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk14 ^= rk0D; - x0 ^= rk14; - rk15 ^= rk0E; - x1 ^= rk15; - rk16 ^= rk0F; - x2 ^= rk16; - rk17 ^= rk10; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk18 ^= rk11; - x0 ^= rk18; - rk19 ^= rk12; - x1 ^= rk19; - rk1A ^= rk13; - x2 ^= rk1A; - rk1B ^= rk14; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk1C ^= rk15; - x0 ^= rk1C; - rk1D ^= rk16; - x1 ^= rk1D; - rk1E ^= rk17; - x2 ^= rk1E; - rk1F ^= rk18; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + round_4_8_12(sharedMemory, r, p, x); /* round 13 */ - KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); - rk00 ^= rk1C; - rk01 ^= 
rk1D; - rk02 ^= rk1E; - rk03 ^= rk1F; - x0 = p0 ^ rk00; - x1 = p1 ^ rk01; - x2 = p2 ^ rk02; - x3 = p3 ^ rk03; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); - rk04 ^= rk00; - rk05 ^= rk01; - rk06 ^= rk02; - rk07 ^= rk03; - x0 ^= rk04; - x1 ^= rk05; - x2 ^= rk06; - x3 ^= rk07; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); - rk08 ^= rk04; - rk09 ^= rk05; - rk0A ^= rk06; - rk0B ^= rk07; - x0 ^= rk08; - x1 ^= rk09; - x2 ^= rk0A; - x3 ^= rk0B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); - rk0C ^= rk08; - rk0D ^= rk09; - rk0E ^= rk0A; - rk0F ^= rk0B; - x0 ^= rk0C; - x1 ^= rk0D; - x2 ^= rk0E; - x3 ^= rk0F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); - rk10 ^= rk0C; - rk11 ^= rk0D; - rk12 ^= rk0E; - rk13 ^= rk0F; - x0 = p8 ^ rk10; - x1 = p9 ^ rk11; - x2 = pA ^ rk12; - x3 = pB ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); - rk14 ^= rk10; - rk15 ^= rk11; - rk16 ^= rk12; - rk17 ^= rk13; - x0 ^= rk14; - x1 ^= rk15; - x2 ^= rk16; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); - rk18 ^= rk14; - rk19 ^= rk15 ^ counter; - rk1A ^= rk16; - rk1B ^= rk17 ^ 0xFFFFFFFF; - x0 ^= rk18; - x1 ^= rk19; - x2 ^= rk1A; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); - rk1C ^= rk18; - rk1D ^= rk19; - rk1E ^= rk1A; - rk1F ^= rk1B; - x0 ^= rk1C; - x1 ^= rk1D; - x2 ^= rk1E; - x3 ^= rk1F; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - state[0x0] ^= p8; - state[0x1] ^= p9; - state[0x2] ^= pA; - state[0x3] ^= pB; - state[0x4] ^= pC; - state[0x5] ^= pD; - 
state[0x6] ^= pE; - state[0x7] ^= pF; - state[0x8] ^= p0; - state[0x9] ^= p1; - state[0xA] ^= p2; - state[0xB] ^= p3; - state[0xC] ^= p4; - state[0xD] ^= p5; - state[0xE] ^= p6; - state[0xF] ^= p7; + KEY_EXPAND_ELT(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + x = p[0] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[4]); + *(uint4*)&r[4] ^= *(uint4*)&r[0]; + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[8]); + *(uint4*)&r[8] ^= *(uint4*)&r[4]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[8]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[3] ^= x; + KEY_EXPAND_ELT(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + r[25] ^= counter; // 512/640 + r[27] ^= 0xFFFFFFFF; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &x); + KEY_EXPAND_ELT(sharedMemory, &r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[1] ^= x; + + Hash[0] = *(uint2x4*)&state[0] ^ *(uint2x4*)&p[2]; + Hash[1] = *(uint2x4*)&state[8] ^ *(uint2x4*)&p[0]; } + __device__ __forceinline__ void shavite_gpu_init(uint32_t *sharedMemory) { /* each thread startup will fill a uint32 */ - if (threadIdx.x < 128) { - sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; - - sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 256] = 
d_AES1[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; - } + aes_gpu_init(sharedMemory); } // GPU Hash -__global__ __launch_bounds__(TPB, 7) /* 64 registers with 128,8 - 72 regs with 128,7 */ +//__global__ __launch_bounds__(TPB, 7) /* 64 registers with 128,8 - 72 regs with 128,7 */ +__global__ __launch_bounds__(TPB, 2) /* 64 registers with 128,8 - 72 regs with 128,7 */ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { __shared__ uint32_t sharedMemory[1024]; @@ -1386,53 +514,70 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t msg[30] = 0; msg[31] = 0x02000000; - c512(sharedMemory, state, msg, 512); - + c512(sharedMemory, state, msg, (uint2x4*)Hash, 512); + /* #pragma unroll 16 for(int i=0;i<16;i++) Hash[i] = state[i]; + */ } } +//__global__ __launch_bounds__(TPB, 7) +__global__ __launch_bounds__(TPB, 2) + + +#if TPB == 128 __global__ __launch_bounds__(TPB, 7) -void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) +#elif TPB == 384 +__global__ __launch_bounds__(TPB, 2) +#else +#error "Not set up for this" +#endif +void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { + #if TPB == 128 + aes_gpu_init_128(sharedMemory); + #elif TPB == 384 + //! todo, fix naming and sharedMemory __shared__ uint32_t sharedMemory[1024]; + aes_gpu_init(sharedMemory); //! 
hack 1 must be using 256 threads + #else + #error "Not set up for this" + #endif +// __threadfence_block(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + // initial state + uint32_t state[16] = { // should be constant instead of being used to store data and being overwritten + SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), + SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), + SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), + SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) + }; - shavite_gpu_init(sharedMemory); - __threadfence_block(); - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { const uint32_t nounce = startNounce + thread; + uint64_t *Hash = &g_hash[thread << 3]; - // initial state - uint32_t state[16] = { - SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), - SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), - SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), - SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) - }; - - uint32_t msg[32]; - - #pragma unroll 32 - for(int i=0;i<32;i++) { - msg[i] = c_PaddedMessage80[i]; - } - msg[19] = cuda_swab32(nounce); - msg[20] = 0x80; - msg[27] = 0x2800000; - msg[31] = 0x2000000; + uint32_t r[20];//32 + *(uint2x4*)&r[0] = *(uint2x4*)&c_PaddedMessage80[0]; + *(uint2x4*)&r[8] = *(uint2x4*)&c_PaddedMessage80[8]; + *(uint4*)&r[16] = *(uint4*)&c_PaddedMessage80[16]; - c512(sharedMemory, state, msg, 640); +// __syncthreads(); - uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - #pragma unroll 16 - for(int i=0;i<16;i++) - outHash[i] = state[i]; + r[19] = cuda_swab32(nounce); + /* + r[20] = 0x80; + r[21] = r[22] = r[23] = r[24] = r[25] = r[26] = r[28] = r[29] = r[30] = 0; + r[27] = 
0x2800000; + r[31] = 0x2000000; + */ + c512(sharedMemory, state, r, (uint2x4*)Hash, 640); } //thread < threads } @@ -1446,19 +591,20 @@ void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun dim3 block(threadsperblock); // note: 128 threads minimum are required to init the shared memory array + //gpulog(LOG_WARNING, thr_id, "x11 shavite512 is not set up for this algo"); x11_shavite512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } __host__ -void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { const uint32_t threadsperblock = TPB; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x11_shavite512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); + x11_shavite512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_outputHash); } __host__ @@ -1472,9 +618,12 @@ void x11_shavite512_setBlock_80(void *pdata) { // Message with Padding // The nonce is at Byte 76. +/* unsigned char PaddedMessage[128]; memcpy(PaddedMessage, pdata, 80); memset(PaddedMessage+80, 0, 48); cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); +*/ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, 20 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); } diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 5495edadf4..df8d0bce7f 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -662,14 +662,22 @@ __host__ int x11_simd512_cpu_init(int thr_id, uint32_t threads) { int dev_id = device_map[thr_id]; - cuda_get_arch(thr_id); + // cuda_get_arch(thr_id); // should be already done! 
if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { x11_simd512_cpu_init_sm2(thr_id); return 0; } - - CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */ - CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err); + //2097152 +#if 0 + if (threads > 2097152) + { + CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 32 * sizeof(uint4)*(threads >> 1)), (int)err); /* todo: prevent -i 21 */ + CUDA_CALL_OR_RET_X(cudaMalloc((&d_temp4[thr_id]) + 32 * (threads >> 1), 32 * sizeof(uint4)*(threads >> 1)), (int)err); /* todo: prevent -i 21 */ + } + else +#endif + CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64 * sizeof(uint4)*threads), (int)err); /* todo: prevent -i 21 */ + CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32 * sizeof(int)*threads), (int)err); #ifndef DEVICE_DIRECT_CONSTANTS cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice); @@ -705,20 +713,15 @@ void x11_simd512_cpu_free(int thr_id) } __host__ -void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { const uint32_t threadsperblock = TPB; int dev_id = device_map[thr_id]; - + //2097152 dim3 block(threadsperblock); dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 gridX8(grid.x * 8); - if (d_nonceVector != NULL || device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { - x11_simd512_cpu_hash_64_sm2(thr_id, threads, startNounce, d_nonceVector, d_hash, order); - return; - } - x11_simd512_gpu_expand_64 <<>> (threads, d_hash, d_temp4[thr_id]); if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) { @@ -730,5 +733,5 @@ void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, x11_simd512_gpu_final_64 <<>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]); - //MyStreamSynchronize(NULL, order, thr_id); +// 
MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_simd512_sm2.cuh b/x11/cuda_x11_simd512_sm2.cuh index 1c5b314364..7abbac163e 100644 --- a/x11/cuda_x11_simd512_sm2.cuh +++ b/x11/cuda_x11_simd512_sm2.cuh @@ -537,9 +537,9 @@ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNou const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + //uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - const int hashPosition = nounce - startNounce; + const int hashPosition = thread;//nounce - startNounce; uint32_t *Hash = (uint32_t*) &g_hash[8 * hashPosition]; SIMDHash(Hash, Hash); diff --git a/x11/fresh.cu b/x11/fresh.cu index f67a54e7c7..8db268b3b4 100644 --- a/x11/fresh.cu +++ b/x11/fresh.cu @@ -1,3 +1,4 @@ +#if 0 /** * Fresh algorithm */ @@ -183,3 +184,4 @@ extern "C" void free_fresh(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x11/phi.cu b/x11/phi.cu index ab1f30833c..6e32d37fcd 100644 --- a/x11/phi.cu +++ b/x11/phi.cu @@ -1,3 +1,4 @@ +#if 0 // // // PHI1612 algo @@ -221,3 +222,4 @@ extern "C" void free_phi(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x11/s3.cu b/x11/s3.cu index 85eaaa6a1c..004e80903f 100644 --- a/x11/s3.cu +++ b/x11/s3.cu @@ -1,3 +1,4 @@ +#if 0 /** * S3 Hash (Also called Triple S - Used by 1Coin) */ @@ -179,3 +180,4 @@ extern "C" void free_s3(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x11/sib.cu b/x11/sib.cu index c437523d03..6afa73bb47 100644 --- a/x11/sib.cu +++ b/x11/sib.cu @@ -1,3 +1,4 @@ +#if 0 extern "C" { #include "sph/sph_blake.h" #include "sph/sph_bmw.h" @@ -249,3 +250,4 @@ extern "C" void free_sib(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x11/timetravel.cu 
b/x11/timetravel.cu index 93c3fd19a8..4e26f531c6 100644 --- a/x11/timetravel.cu +++ b/x11/timetravel.cu @@ -2,7 +2,7 @@ * Timetravel CUDA implementation * by tpruvot@github - March 2017 */ - +#if 0 #include #include #include @@ -552,3 +552,4 @@ extern "C" void free_timetravel(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x11/veltor.cu b/x11/veltor.cu index 7bc1e18dab..c4121468d1 100644 --- a/x11/veltor.cu +++ b/x11/veltor.cu @@ -1,3 +1,4 @@ +#if 0 extern "C" { #include "sph/sph_skein.h" #include "sph/sph_shavite.h" @@ -195,3 +196,4 @@ extern "C" void free_veltor(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x11/x11.cu b/x11/x11.cu index a7f1b601fa..984c0f6096 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -1,3 +1,4 @@ +#if 0 extern "C" { #include "sph/sph_blake.h" #include "sph/sph_bmw.h" @@ -231,3 +232,4 @@ extern "C" void free_x11(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x11/x11evo.cu b/x11/x11evo.cu index 53799f9ba6..41bd50cde8 100644 --- a/x11/x11evo.cu +++ b/x11/x11evo.cu @@ -1,3 +1,4 @@ +#if 0 /** * X11EVO algo implementation * Cuda implementation by tpruvot@github - May 2016 @@ -412,3 +413,4 @@ extern "C" void free_x11evo(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu index 3c21f7f1e5..ddea4165c6 100644 --- a/x13/cuda_x13_hamsi512.cu +++ b/x13/cuda_x13_hamsi512.cu @@ -7,7 +7,8 @@ #include #include -#include "cuda_helper.h" +//#include "cuda_helper.h" +#include "cuda_helper_alexis.h" typedef unsigned char BitSequence; diff --git a/x13/hsr.cu b/x13/hsr.cu index e86444628d..7a56de1db0 100644 --- a/x13/hsr.cu +++ b/x13/hsr.cu @@ -1,3 +1,4 @@ +#if 0 /* * X13 algorithm */ @@ -263,3 +264,4 @@ extern "C" void free_hsr(int thr_id) cudaDeviceSynchronize(); init[thr_id] = false; } +#endif \ No newline at end of file diff --git a/x13/x13.cu b/x13/x13.cu 
index 0f5d88c394..713f685cbf 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -1,3 +1,4 @@ +#if 0 /* * X13 algorithm */ @@ -252,3 +253,4 @@ extern "C" void free_x13(int thr_id) cudaDeviceSynchronize(); init[thr_id] = false; } +#endif \ No newline at end of file diff --git a/x15/cuda_x14_shabal512.cu b/x15/cuda_x14_shabal512.cu index a1d5a8da01..43c5ebf1ca 100644 --- a/x15/cuda_x14_shabal512.cu +++ b/x15/cuda_x14_shabal512.cu @@ -471,5 +471,5 @@ __host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); x14_shabal512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu index a541919711..fbed3759fd 100644 --- a/x15/cuda_x15_whirlpool.cu +++ b/x15/cuda_x15_whirlpool.cu @@ -200,7 +200,7 @@ void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode) CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); - cuda_get_arch(thr_id); +// cuda_get_arch(thr_id); // must be called } __host__ @@ -725,7 +725,7 @@ void x15_whirlpool_gpu_hash_64(uint32_t threads, uint64_t *g_hash) *(uint2x4*)&g_hash[(thread<<3)+ 4] = *(uint2x4*)&hash[ 4]; } } - +/* __host__ static void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { @@ -734,10 +734,13 @@ static void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_ x15_whirlpool_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash); } - +*/ __host__ -void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { - x15_whirlpool_cpu_hash_64(thr_id, threads, d_hash); -} + dim3 grid((threads + TPB64 - 1) / TPB64); + dim3 
block(TPB64); + x15_whirlpool_gpu_hash_64 << <grid, block >> > (threads, (uint64_t*)d_hash); + // x15_whirlpool_cpu_hash_64(thr_id, threads, d_hash); +} diff --git a/x15/cuda_x15_whirlpool_sm3.cu b/x15/cuda_x15_whirlpool_sm3.cu index 3110a694ed..5e3477e3c7 100644 --- a/x15/cuda_x15_whirlpool_sm3.cu +++ b/x15/cuda_x15_whirlpool_sm3.cu @@ -1,3 +1,4 @@ + /** * Whirlpool-512 CUDA implementation. (better for SM 3.0) * @@ -35,7 +36,8 @@ // don't change, used by shared mem fetch! #define threadsperblock 256 -#include +#include "cuda_helper_alexis.h" +//#include #include #include "cuda_whirlpool_tables.cuh" diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu index ae5bc996e4..6ca5bc6058 100644 --- a/x15/whirlpool.cu +++ b/x15/whirlpool.cu @@ -1,3 +1,4 @@ + /* * whirlpool routine */ @@ -69,7 +70,7 @@ void whirl_midstate(void *state, const void *input) } static bool init[MAX_GPUS] = { 0 }; - +#if 0 extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t _ALIGN(128) endiandata[20]; @@ -179,3 +180,4 @@ extern "C" void free_whirl(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file diff --git a/x15/x14.cu b/x15/x14.cu index 4232c6906f..a9356d6dcc 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -1,3 +1,4 @@ +#if 0 /* * X14 algorithm * Added in ccminer by Tanguy Pruvot - 2014 @@ -268,3 +269,4 @@ extern "C" void free_x14(int thr_id) cudaDeviceSynchronize(); init[thr_id] = false; } +#endif \ No newline at end of file diff --git a/x15/x15.cu b/x15/x15.cu index cdfbd81f00..237f9482e3 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -1,3 +1,4 @@ +#if 0 /* * X15 algorithm (CHC, BBC, X15C) * Added in ccminer by Tanguy Pruvot - 2014 @@ -273,3 +274,4 @@ extern "C" void free_x15(int thr_id) cudaDeviceSynchronize(); init[thr_id] = false; } +#endif \ No newline at end of file diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu index a0757d0e5b..e0495a4cce 100644 --- a/x17/cuda_x17_sha512.cu +++ b/x17/cuda_x17_sha512.cu @@ 
-1,37 +1,38 @@ /* - * sha-512 cuda kernel implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2014 djm34 - * 2016 tpruvot - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - */ +* sha-512 cuda kernel implementation. 
+* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2014 djm34 +* 2016 tpruvot +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +*/ #include #define NEED_HASH_512 -#include "cuda_helper.h" +//#include "cuda_helper.h" +#include "cuda_helper_alexis.h" #define SWAP64(u64) cuda_swab64(u64) @@ -61,8 +62,8 @@ static const uint64_t WB[80] = { }; #define BSG5_0(x) xor3(ROTR64(x,28), ROTR64(x,34), ROTR64(x,39)) -#define SSG5_0(x) xor3(ROTR64(x, 1), ROTR64(x ,8), shr_t64(x,7)) -#define SSG5_1(x) xor3(ROTR64(x,19), ROTR64(x,61), shr_t64(x,6)) +#define SSG5_0(x) xor3(ROTR64(x, 1), ROTR64(x ,8), shr_u64(x,7)) +#define SSG5_1(x) xor3(ROTR64(x,19), ROTR64(x,61), shr_u64(x,6)) //#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) #define MAJ(x, y, z) andor(x,y,z) @@ -71,12 +72,12 @@ __device__ __forceinline__ uint64_t Tone(uint64_t* K, uint64_t* r, uint64_t* W, const int a, const int i) { //asm("// TONE \n"); - const uint64_t e = r[(a+4) & 7]; + const uint64_t e = r[(a + 4) & 7]; uint64_t BSG51 = xor3(ROTR64(e, 14), ROTR64(e, 18), ROTR64(e, 41)); - const uint64_t f = r[(a+5) & 7]; - const uint64_t g = r[(a+6) & 7]; + const uint64_t f = r[(a + 5) & 7]; + const uint64_t g = r[(a + 6) & 7]; uint64_t CHl = ((f ^ g) & e) ^ g; // xandx(e, f, g); - return (r[(a+7) & 7] + BSG51 + CHl + K[i] + W[i]); + return (r[(a + 7) & 7] + BSG51 + CHl + K[i] + W[i]); } #define SHA3_STEP(K, r, W, ord, i) { \ @@ -95,25 +96,25 @@ void x17_sha512_gpu_hash_64(const uint32_t threads, uint64_t *g_hash) if (thread < threads) { const uint64_t hashPosition = thread; - uint64_t *pHash = &g_hash[hashPosition*8U]; + uint64_t *pHash = &g_hash[hashPosition * 8U]; uint64_t W[80]; - #pragma unroll - for (int i = 0; i < 8; i ++) { +#pragma unroll + for (int i = 0; i < 8; i++) { W[i] = SWAP64(pHash[i]); } W[8] = 0x8000000000000000; - #pragma unroll 69 +#pragma unroll 69 for (int i = 9; i<78; i++) { W[i] = 0U; } W[15] = 0x0000000000000200; - #pragma unroll 64 - for (int i = 16; i < 80; i ++) { - W[i] = SSG5_1(W[i-2]) + W[i-7]; - W[i] += SSG5_0(W[i-15]) + W[i-16]; 
+#pragma unroll 64 + for (int i = 16; i < 80; i++) { + W[i] = SSG5_1(W[i - 2]) + W[i - 7]; + W[i] += SSG5_0(W[i - 15]) + W[i - 16]; } const uint64_t IV512[8] = { @@ -124,29 +125,29 @@ void x17_sha512_gpu_hash_64(const uint32_t threads, uint64_t *g_hash) }; uint64_t r[8]; - #pragma unroll - for (int i = 0; i < 8; i ++) { +#pragma unroll + for (int i = 0; i < 8; i++) { r[i] = IV512[i]; } #if CUDART_VERSION >= 7050 - #pragma unroll 10 +#pragma unroll 10 #endif for (int i = 0; i < 80; i += 8) { - #pragma unroll +#pragma unroll for (int ord = 0; ord < 8; ord++) { - SHA3_STEP(c_WB, r, W, ord, i+ord); + SHA3_STEP(c_WB, r, W, ord, i + ord); } } - #pragma unroll - for (int u = 0; u < 4; u ++) { +#pragma unroll + for (int u = 0; u < 4; u++) { pHash[u] = SWAP64(r[u] + IV512[u]); } #ifdef NEED_HASH_512 - #pragma unroll - for (int u = 4; u < 8; u ++) { +#pragma unroll + for (int u = 4; u < 8; u++) { pHash[u] = SWAP64(r[u] + IV512[u]); } #endif @@ -156,18 +157,18 @@ __host__ void x17_sha512_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyToSymbol(c_WB, WB, 80*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_WB, WB, 80 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } __host__ -void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) +void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { const uint32_t threadsperblock = 256; - dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - x17_sha512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash); + x17_sha512_gpu_hash_64 << <grid, block >> > (threads, (uint64_t*)d_hash); } __constant__ @@ -181,8 +182,8 @@ void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, u if (thread < threads) { uint64_t W[80]; - #pragma unroll - for (int i = 0; i < 9; i ++) { +#pragma unroll + for (int 
i = 0; i < 9; i++) { W[i] = SWAP64(c_PaddedMessage80[i]); } const uint32_t nonce = startNonce + thread; @@ -191,16 +192,16 @@ void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, u W[9] = cuda_swab64(W[9]); W[10] = 0x8000000000000000; - #pragma unroll +#pragma unroll for (int i = 11; i<15; i++) { W[i] = 0U; } W[15] = 0x0000000000000280; - #pragma unroll 64 - for (int i = 16; i < 80; i ++) { - W[i] = SSG5_1(W[i-2]) + W[i-7]; - W[i] += SSG5_0(W[i-15]) + W[i-16]; +#pragma unroll 64 + for (int i = 16; i < 80; i++) { + W[i] = SSG5_1(W[i - 2]) + W[i - 7]; + W[i] += SSG5_0(W[i - 15]) + W[i - 16]; } const uint64_t IV512[8] = { @@ -211,20 +212,20 @@ void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, u }; uint64_t r[8]; - #pragma unroll +#pragma unroll for (int i = 0; i < 8; i++) { r[i] = IV512[i]; } - #pragma unroll +#pragma unroll for (int i = 0; i < 80; i++) { - SHA3_STEP(c_WB, r, W, i&7, i); + SHA3_STEP(c_WB, r, W, i & 7, i); } const uint64_t hashPosition = thread; uint64_t *pHash = &g_hash[hashPosition << 3]; - #pragma unroll - for (int u = 0; u < 8; u ++) { +#pragma unroll + for (int u = 0; u < 8; u++) { pHash[u] = SWAP64(r[u] + IV512[u]); } } @@ -235,10 +236,10 @@ void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t { const uint32_t threadsperblock = 256; - dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - x16_sha512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash); + x16_sha512_gpu_hash_80 << <grid, block >> > (threads, startNounce, (uint64_t*)d_hash); } __host__ diff --git a/x17/hmq17.cu b/x17/hmq17.cu index 8fdbcdf4ff..2b99467d63 100644 --- a/x17/hmq17.cu +++ b/x17/hmq17.cu @@ -1,3 +1,4 @@ +#if 0 /** * HMQ1725 algorithm * @author tpruvot@github 02-2017 @@ -540,3 +541,4 @@ extern "C" void free_hmq17(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of 
file diff --git a/x17/x17.cu b/x17/x17.cu index 816e5e0634..b3ddbab423 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -1,3 +1,4 @@ +#if 0 /** * X17 algorithm (X15 + sha512 + haval256) */ @@ -302,3 +303,4 @@ extern "C" void free_x17(int thr_id) cudaDeviceSynchronize(); init[thr_id] = false; } +#endif \ No newline at end of file diff --git a/zr5.cu b/zr5.cu index 11140b0fff..afdd11de98 100644 --- a/zr5.cu +++ b/zr5.cu @@ -1,5 +1,5 @@ /* Ziftrcoin ZR5 CUDA Implementation, (c) tpruvot 2015 */ - +#if 0 extern "C" { #include "sph/sph_blake.h" #include "sph/sph_groestl.h" @@ -323,7 +323,7 @@ extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t extern void quark_groestl512_cpu_free(int thr_id); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -512,3 +512,4 @@ extern "C" void free_zr5(int thr_id) cudaDeviceSynchronize(); } +#endif \ No newline at end of file