In [14]:
%%writefile hello.cu
#include<stdio.h>
// Device kernel: every launched thread prints its own index within its
// block (threadIdx.x). Takes no arguments and writes no memory.
__global__ void mykernel(void)
{
    const unsigned int threadInBlock = threadIdx.x;
    printf("Hello from GPU thread %d\n", threadInBlock);
}
/*
 * Launches mykernel on 2 blocks of 1 thread each, waits for the device to
 * finish, then prints a greeting from the host.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(void)
{
    mykernel<<<2, 1>>>();

    // A kernel launch has no return value; launch errors are only reported
    // through cudaGetLastError(). Without this check a failed launch would
    // silently print nothing.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "mykernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Wait for the kernel to finish before printing from the host, and
    // surface any error raised while it was running.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "mykernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("Hello World with cpu");
    return 0;
}

Overwriting hello.cu


In [15]:
!nvcc -arch=sm_75 hello.cu -o hello


In [16]:
!./hello


Hello from GPU thread 0
Hello from GPU thread 0
Hello World with cpu

In [20]:
%%writefile hello.cu
#include<stdio.h>
// Device kernel: every launched thread prints the index of its block
// (blockIdx.x) followed by its own index inside that block (threadIdx.x).
// NOTE(review): the message labels the second value "thread block", but the
// value printed there is the thread index, not a block index.
__global__ void mykernel(void)
{
    const unsigned int block = blockIdx.x;
    const unsigned int threadInBlock = threadIdx.x;
    printf("Hello from GPU block %d, and thread block %d\n", block, threadInBlock);
}
/*
 * Launches mykernel on 2 blocks of 2 threads each, waits for the device to
 * finish, then prints a greeting from the host.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(void)
{
    mykernel<<<2, 2>>>();

    // A kernel launch has no return value; launch errors are only reported
    // through cudaGetLastError(). Without this check a failed launch would
    // silently print nothing.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "mykernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Wait for the kernel to finish before printing from the host, and
    // surface any error raised while it was running.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "mykernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("Hello World with cpu");
    return 0;
}

Overwriting hello.cu


In [21]:
!nvcc -arch=sm_75 hello.cu -o hello

In [22]:
!./hello

Hello from GPU block 0, and thread block 0
Hello from GPU block 0, and thread block 1
Hello from GPU block 1, and thread block 0
Hello from GPU block 1, and thread block 1
Hello World with cpu

Thread indexing with a global thread ID

In [26]:
%%writefile gidthread.cu
#include<stdio.h>
// Device kernel: every launched thread prints its global index, computed as
// blockIdx.x * blockDim.x + threadIdx.x (1-D grid of 1-D blocks).
__global__ void mykernel(void)
{
    // Index of the first thread of this block within the whole grid.
    const unsigned int blockStart = blockIdx.x * blockDim.x;
    const int globalId = blockStart + threadIdx.x;
    printf("Thread %d\n", globalId);
}
/*
 * Launches mykernel on 3 blocks of 4 threads each (12 threads in total) and
 * waits for the device to finish.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(void)
{
    mykernel<<<3, 4>>>();

    // A kernel launch has no return value; launch errors are only reported
    // through cudaGetLastError(). Without this check a failed launch would
    // silently print nothing.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "mykernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Wait for the kernel to finish and surface any error raised while it
    // was running.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "mykernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}

Overwriting gidthread.cu


In [27]:
!nvcc -arch=sm_75 gidthread.cu -o gidthread

In [28]:
!./gidthread

Thread 8
Thread 9
Thread 10
Thread 11
Thread 0
Thread 1
Thread 2
Thread 3
Thread 4
Thread 5
Thread 6
Thread 7


Fill an array on the GPU so that a[i] = i, using each thread's global index

In [29]:
%%writefile vectorinit.cu
#include<stdio.h>
#include <cuda_runtime.h>

// Number of int elements in the array.
#define N 16

/*
 * Stores each thread's global index into the matching element of A.
 *
 * Expected launch layout: 1-D grid of 1-D blocks. Threads whose global index
 * is >= n do nothing, so the grid may be larger than the array.
 *
 *   A  device pointer to at least n ints
 *   n  number of elements to write (defaults to N, so existing
 *      fill<<<...>>>(ptr) launches keep working)
 */
__global__ void fill(int *A, int n = N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        A[i] = i;
    }
}

// Prints "<what> failed: <reason>" to stderr when err is not cudaSuccess.
// Returns true when the call succeeded.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/*
 * Allocates an N-element int array on the host and on the device, fills the
 * device array with fill, copies it back and prints it.
 *
 * Returns 0 on success, 1 if any allocation, the launch or the copy failed.
 */
int main(void)
{
    const size_t size = N * sizeof(int);
    const int threads = 8;
    const int blocks = (N + threads - 1) / threads;  // ceil(N / threads)
    int *d_a = NULL;
    int status = 1;

    int *h_a = (int *)malloc(size);
    if (h_a == NULL) {
        fprintf(stderr, "malloc of %zu bytes failed\n", size);
        return 1;
    }

    if (cudaOk(cudaMalloc((void **)&d_a, size), "cudaMalloc")) {
        fill<<<blocks, threads>>>(d_a, N);

        // Launch errors are only visible through cudaGetLastError(); the
        // blocking cudaMemcpy then reports errors raised during execution.
        if (cudaOk(cudaGetLastError(), "fill launch") &&
            cudaOk(cudaMemcpy(h_a, d_a, size, cudaMemcpyDeviceToHost),
                   "cudaMemcpy (device to host)")) {
            printf("Array values:\n");
            for (int i = 0; i < N; i++) {
                printf("%d ", h_a[i]);
            }
            printf("\n");
            status = 0;
        }
    }

    // Released on every path; cudaFree(NULL) performs no operation.
    cudaFree(d_a);
    free(h_a);

    return status;
}

Writing vectorinit.cu


In [30]:
!nvcc -arch=sm_75 vectorinit.cu -o vectorinit

In [31]:
!./vectorinit

Array values:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 


In [38]:
%%writefile addvector.cu
#include<stdio.h>
#include <cuda_runtime.h>
// Number of int elements in each vector.
#define N 16

/*
 * Element-wise vector addition: c[i] = a[i] + b[i].
 *
 * Expected launch layout: 1-D grid of 1-D blocks. Threads whose global index
 * is >= n do nothing, so the grid may be larger than the vectors.
 *
 *   a, b  device pointers to at least n input ints
 *   c     device pointer to at least n output ints
 *   n     number of elements to add (defaults to N, so existing
 *         add<<<...>>>(a, b, c) launches keep working)
 */
__global__ void add(const int *a, const int *b, int *c, int n = N)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        c[index] = a[index] + b[index];
    }
}

// Prints "<what> failed: <reason>" to stderr when err is not cudaSuccess.
// Returns true when the call succeeded.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/*
 * Builds two N-element host vectors (a[i] = i, b[i] = 2 * i), printing each
 * value as it is set, adds them on the device with add, and prints the
 * result.
 *
 * Returns 0 on success, 1 if any allocation, copy or the launch failed.
 */
int main(void)
{
    const size_t size = N * sizeof(int);
    const int threads = 8;
    const int blocks = (N + threads - 1) / threads;  // ceil(N / threads)
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    int status = 1;

    int *a = (int *)malloc(size);
    int *b = (int *)malloc(size);
    int *c = (int *)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", size);
        free(a);
        free(b);
        free(c);
        return 1;
    }

    for (int i = 0; i < N; i++) {
        a[i] = i * 1;
        printf("%d\n", a[i]);
    }
    for (int i = 0; i < N; i++) {
        b[i] = i * 2;
        printf("%d\n", b[i]);
    }

    if (cudaOk(cudaMalloc((void **)&d_a, size), "cudaMalloc (d_a)") &&
        cudaOk(cudaMalloc((void **)&d_b, size), "cudaMalloc (d_b)") &&
        cudaOk(cudaMalloc((void **)&d_c, size), "cudaMalloc (d_c)") &&
        cudaOk(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice),
               "cudaMemcpy (a to device)") &&
        cudaOk(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice),
               "cudaMemcpy (b to device)")) {
        add<<<blocks, threads>>>(d_a, d_b, d_c, N);

        // Launch errors are only visible through cudaGetLastError(); the
        // blocking cudaMemcpy then reports errors raised during execution.
        if (cudaOk(cudaGetLastError(), "add launch") &&
            cudaOk(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost),
                   "cudaMemcpy (c to host)")) {
            printf("Array values of C:\n");
            for (int i = 0; i < N; i++) {
                printf("%d ", c[i]);
            }
            status = 0;
        }
    }

    // Cleanup runs on every path; cudaFree(NULL) performs no operation.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    // The host buffers were never released in the original version.
    free(a);
    free(b);
    free(c);

    return status;
}

Overwriting addvector.cu


In [39]:
!nvcc -arch=sm_75 addvector.cu -o addvector

In [40]:
!./addvector

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
Array values of C:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 