256 improvements

a1i3nj03 · Apr 28, 2018 · 934fe60 · 934fe60
1 parent aaaa0d4
commit 934fe60
Show file tree

Hide file tree

Showing 13 changed files with 166 additions and 69 deletions.
diff --git a/ccminer.cpp b/ccminer.cpp
@@ -1965,7 +1965,7 @@ void sig_fn(int sig)
 	return;
 }
 */
-
+#define X16R_BLOCKTIME_GUESS 20
 static void *miner_thread(void *userdata)
 {
 	struct thr_info *mythr = (struct thr_info *)userdata;
@@ -2035,7 +2035,10 @@ static void *miner_thread(void *userdata)
 	}
 
 	gpu_led_off(dev_id);
-
+	int wcmplen = (opt_algo == ALGO_DECRED) ? 140 : 76;
+	int wcmpoft = 0;
+	uint32_t *nonceptr = (uint32_t*)(((char*)work.data) + wcmplen);
+	nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr
 	while (!abort_flag) {
 		struct timeval tv_start, tv_end, diff;
 		unsigned long hashes_done;
@@ -2046,16 +2049,16 @@ static void *miner_thread(void *userdata)
 		bool regen = false;
 
 		// &work.data[19]
-		int wcmplen = (opt_algo == ALGO_DECRED) ? 140 : 76;
-		int wcmpoft = 0;
+//		int wcmplen = (opt_algo == ALGO_DECRED) ? 140 : 76;
+//		int wcmpoft = 0;
 		/*
 		if (opt_algo == ALGO_LBRY) wcmplen = 108;
 		else if (opt_algo == ALGO_SIA) {
 			wcmpoft = (32+16)/4;
 			wcmplen = 32;
 		}
 		*/
-		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
+//		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 
 		if (have_stratum) {
 
@@ -2074,7 +2077,7 @@ static void *miner_thread(void *userdata)
 				applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100);
 				*/
 			//nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
-			pthread_mutex_lock(&g_work_lock);
+//			pthread_mutex_lock(&g_work_lock);
 			extrajob |= work_done;
 
 			regen = (nonceptr[0] >= end_nonce);
@@ -2086,13 +2089,17 @@ static void *miner_thread(void *userdata)
 			regen = regen || extrajob;
 
 			if (regen) {
-				gpulog(LOG_BLUE, thr_id, "REGEN");
+//				gpulog(LOG_BLUE, thr_id, "REGEN");
 				work_done = false;
 				extrajob = false;
+				pthread_mutex_lock(&g_work_lock);
 				if (stratum_gen_work(&stratum, &g_work))
 					g_work_time = time(NULL);
 			}
-		} else {
+			else
+				pthread_mutex_lock(&g_work_lock);
+		}
+		else {
 			uint32_t secs = 0;
 			pthread_mutex_lock(&g_work_lock);
 			secs = (uint32_t) (time(NULL) - g_work_time);
@@ -2114,28 +2121,65 @@ static void *miner_thread(void *userdata)
 			}
 		}
 
-		// reset shares id counter on new job
-		if (strcmp(work.job_id, g_work.job_id))
-			stratum.job.shares_count = 0;
-
 		if (!opt_benchmark && (g_work.height != work.height || memcmp(work.target, g_work.target, sizeof(work.target))))
 		{
+			// reset shares id counter on new job
+//			if (strncmp(work.job_id, g_work.job_id, 128))
+			{// compare up to work/g_work.job_id array bounds.
+//				gpulog(LOG_NOTICE, thr_id, "update");
+				stratum.job.shares_count = 0;
+			}
 			if (opt_debug) {
 				uint64_t target64 = g_work.target[7] * 0x100000000ULL + g_work.target[6];
 				applog(LOG_DEBUG, "job %s target change: %llx (%.1f)", g_work.job_id, target64, g_work.targetdiff);
 			}
-			memcpy(work.target, g_work.target, sizeof(work.target));
-			work.targetdiff = g_work.targetdiff;
-			work.height = g_work.height;
+//			memcpy(work.target, g_work.target, sizeof(work.target));
+//			work.targetdiff = g_work.targetdiff;
+//			work.height = g_work.height;
+
+//			uint32_t t = nonceptr[0] + 1;
+			memcpy(&work, &g_work, sizeof(struct work));
+//			memcpy(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen);
+			pthread_mutex_unlock(&g_work_lock);
+			thr_hashrates[thr_id] = (((uint64_t)thr_hashrates[thr_id] + (0x400000 / X16R_BLOCKTIME_GUESS)) >> 1);
+//			nonceptr[0] = t;
+			nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr
+//			gpulog(LOG_NOTICE, thr_id, "job update");
 			//nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr
-		}
+		} 
+		else if (!regen && have_stratum && rc == -127)
+		{
+//			gpulog(LOG_BLUE, thr_id, "REGEN");
+//			if (stratum_gen_work(&stratum, &g_work))
+			{
+//				g_work_time = time(NULL);
+				stratum.job.shares_count = 0;
+
+//				memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen);
+//				strncpy(work.job_id, g_work.job_id, 128);
+//				memcpy(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen);
+//				uint32_t t = nonceptr[0]+1;
+				memcpy(&work, &g_work, sizeof(struct work));
+				pthread_mutex_unlock(&g_work_lock);
+				nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr
+//				nonceptr[0] = t;
+//				gpulog(LOG_NOTICE, thr_id, "job update");
+			}
+		} 
 		/*
-		if (opt_algo == ALGO_ZR5) {
-			// ignore pok/version header
-			wcmpoft = 1;
-			wcmplen -= 4;
+		else if (strncmp(work.job_id, g_work.job_id, 128))
+		{// compare up to work/g_work.job_id array bounds.
+			gpulog(LOG_NOTICE, thr_id, "update");
+			stratum.job.shares_count = 0;
 		}
 		*/
+		else
+		{
+			pthread_mutex_unlock(&g_work_lock);
+			nonceptr[0]++; //??
+		}
+//		gpulog(LOG_BLUE, thr_id, "Nonce: %8.X", nonceptr[0]);
+#if 0
 		if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) {
 			#if 0
 //			if (opt_debug) {
@@ -2151,16 +2195,22 @@ static void *miner_thread(void *userdata)
 			#endif
 			//*** SIGNAL JOB UPDATE *********************************************************************
 			memcpy(&work, &g_work, sizeof(struct work));
+			pthread_mutex_unlock(&g_work_lock);
 			nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr
-		} else
+			gpulog(LOG_NOTICE, thr_id, "job update");
+		}
+		else
+		{
+			pthread_mutex_unlock(&g_work_lock);
+			gpulog(LOG_NOTICE, thr_id, "nonce inc");
 			nonceptr[0]++; //??
-
-//		if (opt_benchmark) {
+		}
+#endif
+		//		if (opt_benchmark) {
 			// randomize work
 //			nonceptr[-1] += 1;
 //		}
-
-		pthread_mutex_unlock(&g_work_lock);
+//		pthread_mutex_unlock(&g_work_lock);
 
 		// --benchmark [-a all]
 		/*
@@ -2181,7 +2231,7 @@ static void *miner_thread(void *userdata)
 
 		// prevent gpu scans before a job is received
 		nodata_check_oft = 0;
-		if (ALGO_X16R && max_nonce == 0 && init_items[thr_id] == 0)
+		if (opt_algo == ALGO_X16R && max_nonce == 0 && init_items[thr_id] == 0)
 		{
 			if (x16r_init(thr_id, -1) != -128)
 				exit(-1);
@@ -2274,7 +2324,12 @@ static void *miner_thread(void *userdata)
 
 		/* adjust max_nonce to meet target scan time */
 		if (have_stratum)
-			max64 = LP_SCANTIME;
+		{
+			if (opt_algo == ALGO_X16R)
+				max64 = X16R_BLOCKTIME_GUESS;
+			else
+				max64 = LP_SCANTIME;
+		}
 		else
 			max64 = max(1, (int64_t) scan_time + g_work_time - time(NULL));
 
@@ -2402,12 +2457,12 @@ static void *miner_thread(void *userdata)
 			max_nonce = (uint32_t) (max64 + start_nonce);
 
 		// todo: keep it rounded to a multiple of 256 ?
-		max_nonce &= ~0xff;
 
 		if (unlikely(start_nonce > max_nonce)) {
 			// should not happen but seen in skein2 benchmark with 2 gpus
 			max_nonce = end_nonce = UINT32_MAX;
 		}
+		max_nonce &= ~0xff;
 
 		work.scanned_from = start_nonce;
 
@@ -2477,11 +2532,12 @@ static void *miner_thread(void *userdata)
 			continue;
 		}
 
+		timeval_subtract(&diff, &tv_end, &tv_start);
 		if (rc > 0 && opt_debug)
 			applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", work.nonces[0], swab32(work.nonces[0]));
 		if (rc > 1 && opt_debug)
 			applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", work.nonces[1], swab32(work.nonces[1]));
-		timeval_subtract(&diff, &tv_end, &tv_start);
+//		timeval_subtract(&diff, &tv_end, &tv_start);
 
 		if (cgpu && diff.tv_sec) { // stop monitoring
 			cgpu->monitor.sampling_flag = false;
@@ -2494,7 +2550,9 @@ static void *miner_thread(void *userdata)
 //			double rate_factor = 1.0;
 
 			/* store thread hashrate */
-			if (dtime > 0.0) {
+			if (dtime < 0.025)
+				thr_hashrates[thr_id] = hashes_done << 6;
+			else if (dtime > 0.0) {
 				pthread_mutex_lock(&stats_lock);
 				thr_hashrates[thr_id] = hashes_done / dtime;
 //				thr_hashrates[thr_id] *= rate_factor;
@@ -2505,14 +2563,15 @@ static void *miner_thread(void *userdata)
 		}
 
 		if (rc > 0)
-			work.scanned_to = work.nonces[0];
-		else if (rc > 1)
+//			work.scanned_to = work.nonces[0];
+//		else if (rc > 1)
 			work.scanned_to = max(work.nonces[0], work.nonces[1]);
 		else if (rc == -127)
 		{
 //			work.data[19] = max_nonce;
 //			if (work_restart[thr_id].restart)
 //				work_done = 1;
+//			gpulog(LOG_NOTICE, thr_id, "Restart thread");
 			continue;
 		}
 		else if (rc == -128)
@@ -2530,8 +2589,8 @@ static void *miner_thread(void *userdata)
 			// prevent low scan ranges on next loop on fast algos (blake)
 			if (nonceptr[0] > UINT32_MAX - (64)) // 64
 				nonceptr[0] = UINT32_MAX;
-			if (work_restart[thr_id].restart)
-				work_done = 1;
+//			if (work_restart[thr_id].restart)
+//				work_done = 1;
 			continue;
 		}
 
@@ -2580,12 +2639,14 @@ static void *miner_thread(void *userdata)
 
 			work.submit_nonce_id = 0;
 			nonceptr[0] = work.nonces[0];
+			/*
 			if (work_restart[thr_id].restart)
 			{
 				work_done = 1;
 				continue;
 			}
-			if (max_nonce - work.scanned_to < (2 << 21))
+			*/
+			if (max_nonce - work.scanned_to < (3 << 21))
 				work_done = 1;
 			if (!submit_work(mythr, &work))
 				break;

diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu
@@ -73,7 +73,7 @@ void quark_groestl512_gpu_hash_64_quad_a1_min3r(const uint32_t threads, uint4* g
 		message[0].y = ((uint32_t*)&pHash[1])[thr];
 		message[0].z = ((uint32_t*)&pHash[2])[thr];
 		message[0].w = ((uint32_t*)&pHash[3])[thr];
-//		__syncthreads();
+		__syncthreads();
 
 //#pragma unroll
 //		for (int k = 0; k<4; k++) message[k] = pHash[thr + (k * THF)];
@@ -333,7 +333,7 @@ void groestl512_gpu_hash_80_quad_a1_min3r(const uint32_t threads, const uint32_t
 		message[0].w = c_Message80[thr + (3 * THF)];
 		message[1].x = c_Message80[thr + (4 * THF)];
 
-//		__syncthreads();
+		__syncthreads();
 
 
 //		message[1].y = 0;

diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu
@@ -486,7 +486,7 @@ void quark_skein512_gpu_hash_64(const uint32_t threads, uint64_t* __restrict__ g
 		uint2x4 *phash = (uint2x4*)Hash;
 		*(uint2x4*)&p[0] = __ldg4(&phash[0]);
 		*(uint2x4*)&p[4] = __ldg4(&phash[1]);
-//		__syncthreads();
+		__syncthreads();
 		h[0] = p[0];	h[1] = p[1];	h[2] = p[2];	h[3] = p[3];
 		h[4] = p[4];	h[5] = p[5];	h[6] = p[6];	h[7] = p[7];
 

diff --git a/qubit/qubit_luffa512_alexis.cu b/qubit/qubit_luffa512_alexis.cu
@@ -710,7 +710,7 @@ void x11_luffa512_gpu_hash_64_alexis(uint32_t threads, uint32_t *g_hash, int *or
 
 		*(uint2x4*)&hash[0] = __ldg4(&Hash[0]);
 		*(uint2x4*)&hash[8] = __ldg4(&Hash[1]);
-//		__syncthreads();
+		__syncthreads();
 		#pragma unroll 8
 		for(int i=0;i<8;i++){
 			statebuffer[i] = cuda_swab32(hash[i]);

diff --git a/x11/cuda_x11_echo_alexis.cu b/x11/cuda_x11_echo_alexis.cu
@@ -303,7 +303,7 @@ static void x11_echo512_gpu_hash_64_alexis(uint32_t threads, uint32_t *g_hash, i
 		*(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0];
 		*(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8];
 
-//		__syncthreads();
+		__syncthreads();
 
 		const uint32_t P[48] = {
 			0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,

diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu
@@ -567,7 +567,7 @@ void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t
 		*(uint2x4*)&r[8] = *(uint2x4*)&c_PaddedMessage80[8];
 		*(uint4*)&r[16] = *(uint4*)&c_PaddedMessage80[16];
 
-//		__syncthreads();
+		__syncthreads();
 
 
 		r[19] = cuda_swab32(nounce);

diff --git a/x11/cuda_x11_shavite512_alexis.cu b/x11/cuda_x11_shavite512_alexis.cu
@@ -213,7 +213,7 @@ void x11_shavite512_gpu_hash_64_alexis(const uint32_t threads, uint64_t *g_hash,
 		// fülle die Nachricht mit 64-byte (vorheriger Hash)
 		*(uint2x4*)&r[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
 		*(uint2x4*)&r[ 8] = __ldg4((uint2x4*)&Hash[ 4]);
-//		__syncthreads();
+		__syncthreads();
 
 		*(uint2x4*)&p[ 0] = *(uint2x4*)&state[ 0];
 		*(uint2x4*)&p[ 2] = *(uint2x4*)&state[ 8];

diff --git a/x13/cuda_x13_fugue512_alexis.cu b/x13/cuda_x13_fugue512_alexis.cu
@@ -280,7 +280,7 @@ void x13_fugue512_gpu_hash_64_alexis(uint32_t threads, uint64_t *g_hash, int *or
 		*(uint2x4*)&Hash[0] = swapvec(__ldg4((uint2x4*)&hash[0]));
 		*(uint2x4*)&Hash[8] = swapvec(__ldg4((uint2x4*)&hash[8]));
 */
-//		__syncthreads();
+		__syncthreads();
 
 		S[ 0] = S[ 1] = S[ 2] = S[ 3] = S[ 4] = S[ 5] = S[ 6] = S[ 7] = S[ 8] = S[ 9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0;
 		*(uint2x4*)&S[20] = *(uint2x4*)&c_S[ 0];

diff --git a/x13/cuda_x13_hamsi512_alexis.cu b/x13/cuda_x13_hamsi512_alexis.cu
@@ -192,7 +192,7 @@ void x13_hamsi512_gpu_hash_64_alexis(uint32_t threads, uint32_t *g_hash, int *or
 		uint8_t h1[64];
 		*(uint2x4*)&h1[ 0] = *(uint2x4*)&Hash[0];
 		*(uint2x4*)&h1[32] = *(uint2x4*)&Hash[8];
-//		__syncthreads();
+		__syncthreads();
 		uint32_t c[16], h[16], m[16];
 		*(uint16*)&c[ 0] = *(uint16*)&c_c[ 0];
 		*(uint16*)&h[ 0] = *(uint16*)&c_c[ 0];

diff --git a/x15/cuda_x15_whirlpool_sm3.cu b/x15/cuda_x15_whirlpool_sm3.cu
@@ -2017,7 +2017,7 @@ void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce
 		#endif
 	}
 	//__threadfence_block(); // ensure shared mem is ready
-//	__syncthreads();
+	__syncthreads();
 
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -2126,7 +2126,7 @@ void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
 		#pragma unroll 8
 		for (i=0; i<8; i++)
 			n[i] = hash[i] = g_hash[hashPosition + i];
-//		__syncthreads();
+		__syncthreads();
 //		#pragma unroll 10
 		for (i=0; i < 10; i++) {
 			uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

diff --git a/x16r/cuda_x16_echo512.cu b/x16r/cuda_x16_echo512.cu
@@ -305,7 +305,7 @@ void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_
 
 //	echo_gpu_init(sharedMemory);
 	aes_gpu_init_128(sharedMemory);
-//	__threadfence_block();
+	__threadfence_block();
 #if 0
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)

diff --git a/x16r/cuda_x16_fugue512.cu b/x16r/cuda_x16_fugue512.cu
@@ -330,7 +330,7 @@ void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce,
 	}
 #endif
 
-//	__syncthreads();
+	__syncthreads();
 
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)