/* Distributed under the same license as org.bytedeco.javacpp.
 * (c) kaba
 */
import org.bytedeco.javacpp.BytePointer;
import org.bytedeco.javacpp.IntPointer;
import org.bytedeco.javacpp.LongPointer;
import org.bytedeco.javacpp.PointerPointer;
import org.bytedeco.javacpp.cuda;
/**
*
* @author kaba
*/
public class VectorAddDrv
{
    private static final int N = 1000000;

    public static void main(String[] args) {
        int[] a = new int[N], b = new int[N], c = new int[N]; // arrays in host (heap) memory
        long[] dev_a = {0}, dev_b = {0}, dev_c = {0};         // "pointers" to device memory

        // initialise the driver API and pick the first device
        cuda.cuInit(0);
        int[] cudaDevice = {0};
        cuda.cuDeviceGet(cudaDevice, 0);

        // obtain a context
        cuda.CUctx_st context = new cuda.CUctx_st();
        cuda.cuCtxCreate(context, 0, cudaDevice[0]);
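        // (cuCtxCreate also makes the new context current on the calling thread,
        // so the driver calls below all operate on it)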

        // allocate device memory
        cuda.cuMemAlloc(dev_a, N * Integer.BYTES);
        cuda.cuMemAlloc(dev_b, N * Integer.BYTES);
        cuda.cuMemAlloc(dev_c, N * Integer.BYTES);

        // prepare host arrays
        for (int i = 0; i < N; i++) {
            a[i] = -i;
            b[i] = i * i;
        }
        // deliberately corrupt one input element, so that the result check below
        // is seen to work
        a[N-3] = 1;

        // copy the input arrays: host (heap) -> native memory -> device memory
        IntPointer nat_a = new IntPointer(a);                         // host -> nat
        cuda.cuMemcpyHtoD(dev_a[0], nat_a, a.length * Integer.BYTES); // nat -> dev
        IntPointer nat_b = new IntPointer(b);                         // host -> nat
        cuda.cuMemcpyHtoD(dev_b[0], nat_b, b.length * Integer.BYTES); // nat -> dev
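        // (Java heap arrays cannot be handed to the driver directly, since the GC may
        // move them; JavaCPP therefore copies each array into an off-heap native buffer
        // first, and that buffer is what cuMemcpyHtoD transfers to the device)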

        // prepare the kernel - module and function
        cuda.CUmod_st module = new cuda.CUmod_st();
        cuda.cuModuleLoadData(module, new BytePointer(VECTOR_ADD_PTX));
        cuda.CUfunc_st vector_add = new cuda.CUfunc_st();
        cuda.cuModuleGetFunction(vector_add, module, "vector_add");

        // prepare kernel parameters
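        // (cuLaunchKernel expects an array of pointers to the argument values: each
        // element below is a native pointer whose pointee is one kernel argument,
        // i.e. the int n and the three 64-bit device addresses)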
        PointerPointer kernelParameters = new PointerPointer(
                new IntPointer(new int[]{N}),
                new LongPointer(dev_a),
                new LongPointer(dev_b),
                new LongPointer(dev_c));

        // run the kernel
        cuda.cuLaunchKernel(vector_add,
                (N + 255) / 256, 1, 1,   // grid dimension: ceil(N/256) blocks
                256, 1, 1,               // block dimension: every CUDA GPU supports at least 256 threads per block
                0, null,                 // shared memory size and stream
                kernelParameters, null); // kernel parameters and extra options
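        // (kernel launches are asynchronous: synchronize before reading the results back)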
        cuda.cuCtxSynchronize();

        // copy the output array: device memory -> native memory -> host (heap)
        IntPointer nat_c = new IntPointer(c.length);
        cuda.cuMemcpyDtoH(nat_c, dev_c[0], c.length * Integer.BYTES); // dev -> nat
        nat_c.get(c);                                                 // nat -> host

        // check results
        boolean ok = true;
        for (int i = 0; i < N; i++) {
            if (c[i] != (i * i) - i) {
                ok = false;
                System.out.println("result incorrect at position " + i
                        + ": expected " + ((i * i) - i) + ", but got " + c[i]);
            }
        }
        if (ok)
            System.out.println("calculation succeeded");
    }
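
    /** The driver calls above ignore their CUresult return codes. A minimal helper along
     * these lines (a sketch; it assumes the binding exposes the CUDA_SUCCESS constant)
     * could wrap each call, e.g. {@code check(cuda.cuInit(0));}:
     */
    private static void check(int result) {
        if (result != cuda.CUDA_SUCCESS)
            throw new RuntimeException("CUDA driver call failed with error code " + result);
    }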

    /** PTX module with the CUDA function vector_add, taking a size and three arrays.
     * cf. https://devblogs.nvidia.com/even-easier-introduction-cuda/
     *
     * workflow (there are others):
     * the following CUDA function must be compiled to PTX by hand, and the resulting
     * PTX must then be pasted into the String constant below
     * compile command: nvcc -ptx vector_add.cu -o vector_add.ptx
     * NB: remember to fix the deliberately broken comment-closing mark below before compiling
     * /
extern "C"
__global__ void vector_add(int n, int *a, int *b, int *c)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for( int i = index; i < n; i+= stride )
c[i] = a[i] + b[i];
}
     */
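    // An alternative workflow (untested sketch): load the PTX from a file at run time
    // instead of pasting it here, e.g. cuda.cuModuleLoad(module, "vector_add.ptx"),
    // assuming the binding maps cuModuleLoad(const char *fname) to a String overload.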
    private static final String VECTOR_ADD_PTX =
            "//\n" +
            "// Generated by NVIDIA NVVM Compiler\n" +
            "//\n" +
            "// Compiler Build ID: CL-23083092\n" +
            "// Cuda compilation tools, release 9.1, V9.1.85\n" +
            "// Based on LLVM 3.4svn\n" +
            "//\n" +
            "\n" +
            ".version 6.1\n" +
            ".target sm_30\n" +
            ".address_size 64\n" +
            "\n" +
            " // .globl vector_add\n" +
            "\n" +
            ".visible .entry vector_add(\n" +
            " .param .u32 vector_add_param_0,\n" +
            " .param .u64 vector_add_param_1,\n" +
            " .param .u64 vector_add_param_2,\n" +
            " .param .u64 vector_add_param_3\n" +
            ")\n" +
            "{\n" +
            " .reg .pred %p<3>;\n" +
            " .reg .b32 %r<14>;\n" +
            " .reg .b64 %rd<11>;\n" +
            "\n" +
            "\n" +
            " ld.param.u32 %r5, [vector_add_param_0];\n" +
            " ld.param.u64 %rd4, [vector_add_param_1];\n" +
            " ld.param.u64 %rd5, [vector_add_param_2];\n" +
            " ld.param.u64 %rd6, [vector_add_param_3];\n" +
            " cvta.to.global.u64 %rd1, %rd6;\n" +
            " cvta.to.global.u64 %rd2, %rd5;\n" +
            " cvta.to.global.u64 %rd3, %rd4;\n" +
            " mov.u32 %r6, %ntid.x;\n" +
            " mov.u32 %r7, %ctaid.x;\n" +
            " mov.u32 %r8, %tid.x;\n" +
            " mad.lo.s32 %r13, %r6, %r7, %r8;\n" +
            " mov.u32 %r9, %nctaid.x;\n" +
            " mul.lo.s32 %r2, %r9, %r6;\n" +
            " setp.ge.s32 %p1, %r13, %r5;\n" +
            " @%p1 bra BB0_2;\n" +
            "\n" +
            "BB0_1:\n" +
            " mul.wide.s32 %rd7, %r13, 4;\n" +
            " add.s64 %rd8, %rd3, %rd7;\n" +
            " add.s64 %rd9, %rd2, %rd7;\n" +
            " ld.global.u32 %r10, [%rd9];\n" +
            " ld.global.u32 %r11, [%rd8];\n" +
            " add.s32 %r12, %r10, %r11;\n" +
            " add.s64 %rd10, %rd1, %rd7;\n" +
            " st.global.u32 [%rd10], %r12;\n" +
            " add.s32 %r13, %r13, %r2;\n" +
            " setp.lt.s32 %p2, %r13, %r5;\n" +
            " @%p2 bra BB0_1;\n" +
            "\n" +
            "BB0_2:\n" +
            " ret;\n" +
            "}";
}