Skip to content

Commit

Permalink
Alexis Cubehash64
Browse files Browse the repository at this point in the history
  • Loading branch information
a1i3nj03 committed Apr 9, 2018
1 parent f224367 commit 1229f45
Showing 1 changed file with 239 additions and 40 deletions.
279 changes: 239 additions & 40 deletions x11/cuda_x11_cubehash512.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,245 @@

#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }

//

#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }

//#define SHUFFLE

#if defined(SHUFFLE)

#define TPB 1024

__device__ __forceinline__
void rrounds(uint32_t *x){
#pragma unroll 16
for (int r = 0; r < 16; ++r) {
/* "add x_0jklm into x_1jklm modulo 2^32 rotate x_0jklm upwards by 7 bits" */
x[8] += x[0]; x[9] += x[1];
x[10] += x[2]; x[11] += x[3];
x[12] += x[4]; x[13] += x[5];
x[14] += x[6]; x[15] += x[7];
x[0] = ROTL32(x[0], 7);
x[1] = ROTL32(x[1], 7);
x[2] = ROTL32(x[2], 7);
x[3] = ROTL32(x[3], 7);
x[4] = ROTL32(x[4], 7);
x[5] = ROTL32(x[5], 7);
x[6] = ROTL32(x[6], 7);
x[7] = ROTL32(x[7], 7);
/* "swap x_00klm with x_01klm" */
SWAP(x[0], x[4]); SWAP(x[1], x[5]);
SWAP(x[2], x[6]); SWAP(x[3], x[7]);

x[0] ^= x[8]; x[4] ^= x[12];
x[1] ^= x[9]; x[5] ^= x[13];
x[2] ^= x[10]; x[6] ^= x[14];
x[3] ^= x[11]; x[7] ^= x[15];
/* "swap x_1jk0m with x_1jk1m" */
SWAP(x[8], x[10]); SWAP(x[9], x[11]);
SWAP(x[12], x[14]); SWAP(x[13], x[15]);
/* "add x_0jklm into x_1jklm modulo 2^32 rotate x_0jklm upwards by 11 bits" */
x[8] += x[0]; x[9] += x[1];
x[10] += x[2]; x[11] += x[3];
x[12] += x[4]; x[13] += x[5];
x[14] += x[6]; x[15] += x[7];
x[0] = ROTL32(x[0], 11);
x[1] = ROTL32(x[1], 11);
x[2] = ROTL32(x[2], 11);
x[3] = ROTL32(x[3], 11);
x[4] = ROTL32(x[4], 11);
x[5] = ROTL32(x[5], 11);
x[6] = ROTL32(x[6], 11);
x[7] = ROTL32(x[7], 11);
/* "swap x_0j0lm with x_0j1lm" */
x[0] = __shfl(x[0], threadIdx.x ^ 1);
x[1] = __shfl(x[1], threadIdx.x ^ 1);
x[2] = __shfl(x[2], threadIdx.x ^ 1);
x[3] = __shfl(x[3], threadIdx.x ^ 1);
x[4] = __shfl(x[4], threadIdx.x ^ 1);
x[5] = __shfl(x[5], threadIdx.x ^ 1);
x[6] = __shfl(x[6], threadIdx.x ^ 1);
x[7] = __shfl(x[7], threadIdx.x ^ 1);

x[0] ^= x[8]; x[1] ^= x[9];
x[2] ^= x[10]; x[3] ^= x[11];
x[4] ^= x[12]; x[5] ^= x[13];
x[6] ^= x[14]; x[7] ^= x[15];
/* "swap x_1jkl0 with x_1jkl1" */
SWAP(x[8], x[9]); SWAP(x[10], x[11]);
SWAP(x[12], x[13]); SWAP(x[14], x[15]);
}
}
__global__ __launch_bounds__(TPB, 1)
void x11_cubehash512_gpu_hash_64(uint32_t threads, uint64_t *g_hash){
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 1;

const uint32_t even = (threadIdx.x & 1);

if (thread < threads){
uint32_t *Hash = (uint32_t*)&g_hash[8 * thread + 2 * even];

uint32_t x[16];

if (even == 0){
x[0] = 0x2AEA2A61; x[1] = 0x50F494D4; x[2] = 0x2D538B8B; x[3] = 0x4167D83E;
x[4] = 0x4D42C787; x[5] = 0xA647A8B3; x[6] = 0x97CF0BEF; x[7] = 0x825B4537;
x[8] = 0xFCD398D9; x[9] = 0x148FE485; x[10] = 0x1B017BEF; x[11] = 0xB6444532;
x[12] = 0xD65C8A2B; x[13] = 0xA5A70E75; x[14] = 0xB1C62456; x[15] = 0xBC796576;
}
else{
x[0] = 0x3FEE2313; x[1] = 0xC701CF8C; x[2] = 0xCC39968E; x[3] = 0x50AC5695;
x[4] = 0xEEF864D2; x[5] = 0xF22090C4; x[6] = 0xD0E5CD33; x[7] = 0xA23911AE;
x[8] = 0x6A536159; x[9] = 0x2FF5781C; x[10] = 0x91FA7934; x[11] = 0x0DBADEA9;
x[12] = 0x1921C8F7; x[13] = 0xE7989AF1; x[14] = 0x7795D246; x[15] = 0xD43E3B44;
}
*(uint4*)&x[0] ^= __ldg((uint4*)&Hash[0]);
rrounds(x);

*(uint4*)&x[0] ^= __ldg((uint4*)&Hash[8]);

rrounds(x);

if (!even)
x[0] ^= 0x80;

rrounds(x);
/* "the integer 1 is xored into the last state word x_11111" */
if (even)
x[15] ^= 1;

#pragma unroll 10
for (int i = 0; i < 10; ++i)
rrounds(x);

*(uint4*)&Hash[0] = *(uint4*)&x[0];
*(uint4*)&Hash[8] = *(uint4*)&x[4];
// g_hash[thread + (2*even+0) * threads] = *(uint2*)&x[ 0];
// g_hash[thread + (2*even+1) * threads] = *(uint2*)&x[ 2];
}
}
__host__
void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){

// berechne wie viele Thread Blocks wir brauchen
dim3 grid((2 * threads + TPB - 1) / TPB);
dim3 block(TPB);

x11_cubehash512_gpu_hash_64 << <grid, block >> >(threads, (uint64_t*)d_hash);

}

#else

#define TPB 768

__device__ __forceinline__
static void rrounds(uint32_t *x){
#pragma unroll 2
for (int r = 0; r < 16; r++) {
/* "add x_0jklm into x_1jklmn modulo 2^32 rotate x_0jklm upwards by 7 bits" */
x[16] = x[16] + x[0]; x[0] = ROTL32(x[0], 7); x[17] = x[17] + x[1]; x[1] = ROTL32(x[1], 7);
x[18] = x[18] + x[2]; x[2] = ROTL32(x[2], 7); x[19] = x[19] + x[3]; x[3] = ROTL32(x[3], 7);
x[20] = x[20] + x[4]; x[4] = ROTL32(x[4], 7); x[21] = x[21] + x[5]; x[5] = ROTL32(x[5], 7);
x[22] = x[22] + x[6]; x[6] = ROTL32(x[6], 7); x[23] = x[23] + x[7]; x[7] = ROTL32(x[7], 7);
x[24] = x[24] + x[8]; x[8] = ROTL32(x[8], 7); x[25] = x[25] + x[9]; x[9] = ROTL32(x[9], 7);
x[26] = x[26] + x[10]; x[10] = ROTL32(x[10], 7); x[27] = x[27] + x[11]; x[11] = ROTL32(x[11], 7);
x[28] = x[28] + x[12]; x[12] = ROTL32(x[12], 7); x[29] = x[29] + x[13]; x[13] = ROTL32(x[13], 7);
x[30] = x[30] + x[14]; x[14] = ROTL32(x[14], 7); x[31] = x[31] + x[15]; x[15] = ROTL32(x[15], 7);
/* "swap x_00klm with x_01klm" */
SWAP(x[0], x[8]); x[0] ^= x[16]; x[8] ^= x[24]; SWAP(x[1], x[9]); x[1] ^= x[17]; x[9] ^= x[25];
SWAP(x[2], x[10]); x[2] ^= x[18]; x[10] ^= x[26]; SWAP(x[3], x[11]); x[3] ^= x[19]; x[11] ^= x[27];
SWAP(x[4], x[12]); x[4] ^= x[20]; x[12] ^= x[28]; SWAP(x[5], x[13]); x[5] ^= x[21]; x[13] ^= x[29];
SWAP(x[6], x[14]); x[6] ^= x[22]; x[14] ^= x[30]; SWAP(x[7], x[15]); x[7] ^= x[23]; x[15] ^= x[31];
/* "swap x_1jk0m with x_1jk1m" */
SWAP(x[16], x[18]); SWAP(x[17], x[19]); SWAP(x[20], x[22]); SWAP(x[21], x[23]); SWAP(x[24], x[26]); SWAP(x[25], x[27]); SWAP(x[28], x[30]); SWAP(x[29], x[31]);
/* "add x_0jklm into x_1jklm modulo 2^32 rotate x_0jklm upwards by 11 bits" */
x[16] = x[16] + x[0]; x[0] = ROTL32(x[0], 11); x[17] = x[17] + x[1]; x[1] = ROTL32(x[1], 11);
x[18] = x[18] + x[2]; x[2] = ROTL32(x[2], 11); x[19] = x[19] + x[3]; x[3] = ROTL32(x[3], 11);
x[20] = x[20] + x[4]; x[4] = ROTL32(x[4], 11); x[21] = x[21] + x[5]; x[5] = ROTL32(x[5], 11);
x[22] = x[22] + x[6]; x[6] = ROTL32(x[6], 11); x[23] = x[23] + x[7]; x[7] = ROTL32(x[7], 11);
x[24] = x[24] + x[8]; x[8] = ROTL32(x[8], 11); x[25] = x[25] + x[9]; x[9] = ROTL32(x[9], 11);
x[26] = x[26] + x[10]; x[10] = ROTL32(x[10], 11); x[27] = x[27] + x[11]; x[11] = ROTL32(x[11], 11);
x[28] = x[28] + x[12]; x[12] = ROTL32(x[12], 11); x[29] = x[29] + x[13]; x[13] = ROTL32(x[13], 11);
x[30] = x[30] + x[14]; x[14] = ROTL32(x[14], 11); x[31] = x[31] + x[15]; x[15] = ROTL32(x[15], 11);
/* "swap x_0j0lm with x_0j1lm" */
SWAP(x[0], x[4]); x[0] ^= x[16]; x[4] ^= x[20]; SWAP(x[1], x[5]); x[1] ^= x[17]; x[5] ^= x[21];
SWAP(x[2], x[6]); x[2] ^= x[18]; x[6] ^= x[22]; SWAP(x[3], x[7]); x[3] ^= x[19]; x[7] ^= x[23];
SWAP(x[8], x[12]); x[8] ^= x[24]; x[12] ^= x[28]; SWAP(x[9], x[13]); x[9] ^= x[25]; x[13] ^= x[29];
SWAP(x[10], x[14]); x[10] ^= x[26]; x[14] ^= x[30]; SWAP(x[11], x[15]); x[11] ^= x[27]; x[15] ^= x[31];
/* "swap x_1jkl0 with x_1jkl1" */
SWAP(x[16], x[17]); SWAP(x[18], x[19]); SWAP(x[20], x[21]); SWAP(x[22], x[23]); SWAP(x[24], x[25]); SWAP(x[26], x[27]); SWAP(x[28], x[29]); SWAP(x[30], x[31]);
}
}

/***************************************************/
// GPU Hash Function
__global__ __launch_bounds__(TPB)
void x11_cubehash512_gpu_hash_64(uint32_t threads, uint64_t *g_hash){

uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

if (thread < threads){

uint32_t *Hash = (uint32_t*)&g_hash[8 * thread];

uint32_t x[32] = {
0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
};

// erste Hälfte des Hashes (32 bytes)
//Update32(x, (const BitSequence*)Hash);
*(uint2x4*)&x[0] ^= __ldg4((uint2x4*)&Hash[0]);

rrounds(x);

// zweite Hälfte des Hashes (32 bytes)
// Update32(x, (const BitSequence*)(Hash+8));
*(uint2x4*)&x[0] ^= __ldg4((uint2x4*)&Hash[8]);

rrounds(x);

// Padding Block
x[0] ^= 0x80;
rrounds(x);

// Final(x, (BitSequence*)Hash);
x[31] ^= 1;

/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 10
for (int i = 0; i < 10; ++i)
rrounds(x);

/* "output the first h/8 bytes of the state" */
*(uint2x4*)&Hash[0] = *(uint2x4*)&x[0];
*(uint2x4*)&Hash[8] = *(uint2x4*)&x[8];
}
}


__host__
void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){

// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + TPB - 1) / TPB);
dim3 block(TPB);

x11_cubehash512_gpu_hash_64 << <grid, block >> >(threads, (uint64_t*)d_hash);

}
#endif

//

__device__ __constant__
static const uint32_t c_IV_512[32] = {
0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
Expand Down Expand Up @@ -214,46 +453,6 @@ static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)

/***************************************************/

__global__
void x11_cubehash512_gpu_hash_64(uint32_t threads, uint64_t *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
//uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

int hashPosition = thread;//nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];

uint32_t x[2][2][2][2][2];
Init(x);

Update32(x, &Hash[0]);
Update32(x, &Hash[8]);

// Padding Block
uint32_t last[8];
last[0] = 0x80;
#pragma unroll 7
for (int i=1; i < 8; i++) last[i] = 0;
Update32(x, last);

Final(x, Hash);
}
}

__host__
void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
{
const uint32_t threadsperblock = 256;

dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);

size_t shared_size = 0;

x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, (uint64_t*)d_hash);
}

__host__
void x11_cubehash512_cpu_init(int thr_id, uint32_t threads) { }
Expand Down

0 comments on commit 1229f45

Please sign in to comment.